Unverified Commit b2ce40c4 authored by Francesc Guasch's avatar Francesc Guasch Committed by GitHub
Browse files

984 nodes (#991)

* wip(cli): use faster machine listing

issue #984

* feature(backend): set process priorities

issue #984

* wip(backend): cache connections and check free memory

Now Virtual Manager connections are cached.
When balancing VMs, check if machine memory fits

issue #984

* wip(backend): ignores disabled nodes, disable when one fails

When a node is disabled, try not to use it.
When a node fails, set it as disabled.
Use process priorities.
Improve MAC address generation on KVM

* fix(KVM): don't reset the password if machine up

issue #955

* wip(backend): remove forkmanager

We handle forks ourselves so we can tweak this more,
i.e. we can allow more processes with more nodes.

issue #984

* test(nodes): check remote remove base

issue #984

* test(nodes): migrate without CD and fix volume names

issue #984

* test(download): give the process time to start

* test(nodes): check base and volatile clones

issue #984

* wip(nodes): cached nodes and allow clone when preparing base

Check that the cached nodes are still there and have not been renamed.
When preparing a remote base, allow cloning the machine.

issue #984

* refactor(KVM): disable nodes and better UUID

creating random uuids sometimes failed

issue #984

* wip(requests): kill request if pid not found

issue #984

* test(volumes): create mock volumes with different names

issue #984

* refactor(nodes): clean remote bases

issue #984

* test(nodes): check volatile clones on remote

* test(backend): properly create mock domains

issue #984

* refactor(KVM): cope with duplicated uuids

issue #984

* refactor(nodes): disable failed remote domains

also cope with duplicated uuids on migration

issue #984

* refactor(nodes): improve balancing failover and volatile

issue #984

* test(pools): check unique uuid on creation

issue #984

* refactor(nodes): fixed remove base on remote down node

issue #984

* refactor(nodes): do not check active if node down

issue #984

* refactor(requests): allow shutdown timeout on refreshing vms

May come in useful one day; handy for testing for now.

issue #984

* refactor(tests): removed debug

* refactor(test): check down domain on disabled/down nodes

issue #984

* refactor(backend): return all known remote ips

In some cases there may be more than one remote IP, probably
when accessing from localhost.

issue #984

* refactor(test): wait more for remote machines down

issue #984

* refactor(backend): use timeout shutdown only if defined

issue #984
parent 6085dc52
......@@ -155,8 +155,8 @@ sub do_start {
my $t_refresh = 0;
my $ravada = Ravada->new( %CONFIG );
Ravada::Request->enforce_limits();
Ravada::Request->refresh_vms();
# Ravada::Request->enforce_limits();
#Ravada::Request->refresh_vms();
for (;;) {
my $t0 = time;
$ravada->process_priority_requests();
......@@ -439,7 +439,8 @@ sub shutdown_domain {
my $down = 0;
my $found = 0;
DOMAIN:
for my $domain ($rvd_back->list_domains) {
for my $domain_data ($rvd_back->list_domains_data) {
my $domain = Ravada::Front::Domain->open($domain_data->{id});
if ((defined $domain_name && $domain->name eq $domain_name)
|| ($hibernated && $domain->is_hibernated)
|| ($DISCONNECTED
......@@ -465,10 +466,11 @@ sub shutdown_domain {
next DOMAIN if _verify_connection($domain);
}
print "Shutting down ".$domain->name.".\n";
eval {
$domain->shutdown(user => Ravada::Utils::user_daemon(), timeout => 300);
$down++;
};
Ravada::Request->shutdown_domain(uid => Ravada::Utils::user_daemon()->id
,id_domain => $domain->id
, timeout => 300
);
$down++;
warn $@ if $@;
}
}
......
......@@ -11,7 +11,6 @@ use DBIx::Connector;
use File::Copy;
use Hash::Util qw(lock_hash);
use Moose;
use Parallel::ForkManager;
use POSIX qw(WNOHANG);
use Time::HiRes qw(gettimeofday tv_interval);
use YAML;
......@@ -1490,10 +1489,10 @@ sub create_domain {
my $user = Ravada::Auth::SQL->search_by_id($id_owner);
$request->status("creating machine") if $request;
if ( $base && $base->volatile_clones
|| $user->is_temporary ) {
$vm = $vm->balance_vm($base);
$request->status("creating machine on ".$vm->name);
if ( $base && $base->is_base ) {
$request->status("balancing") if $request;
$vm = $vm->balance_vm($base) or die "Error: No free nodes available.";
$request->status("creating machine on ".$vm->name) if $request;
}
confess "No vm found, request = ".Dumper(request => $request) if !$vm;
......@@ -1728,10 +1727,13 @@ sub list_domains {
for my $row (@$domains_data) {
my $domain = Ravada::Domain->open($row->{id});
next if !$domain;
next if defined $active && !$domain->is_removed &&
( $domain->is_active && !$active
|| !$domain->is_active && $active );
my $is_active;
$is_active = $domain->is_active;
if ( defined $active && !$domain->is_removed &&
( $is_active && !$active
|| !$is_active && $active )) {
next;
}
next if $user && $domain->id_owner != $user->id;
push @domains,($domain);
......@@ -1958,6 +1960,7 @@ sub process_requests {
." or priority"
if $request_type !~ /^(long|huge|priority|all)$/;
$self->_wait_pids();
$self->_kill_stale_process();
my $sth = $CONNECTOR->dbh->prepare("SELECT id,id_domain FROM requests "
......@@ -1985,7 +1988,7 @@ sub process_requests {
push @reqs,($req);
}
for my $req (@reqs) {
for my $req (sort { $a->priority <=> $b->priority } @reqs) {
next if $req eq 'refresh_vms' && scalar@reqs > 2;
warn "[$request_type] $$ executing request ".$req->id." ".$req->status()." "
......@@ -2048,7 +2051,6 @@ sub _timeout_requests($self) {
sub _kill_requests($self, @requests) {
for my $req (@requests) {
warn "killing request ".$req->id." ".$req->command." pid: ".$req->pid;
$req->status('stopping');
my @procs = $self->_process_sons($req->pid);
if ( @procs) {
......@@ -2059,14 +2061,8 @@ sub _kill_requests($self, @requests) {
warn "sending $signal to $pid $cmd";
kill($signal, $pid);
}
} else {
if ($req->pid == $$) {
warn "I'm not gonna kill myself $$";
} else {
kill(15, $req->pid);
}
$req->status('done');
}
$req->stop();
}
}
......@@ -2166,7 +2162,8 @@ sub _domain_working {
}
}
my $sth = $CONNECTOR->dbh->prepare("SELECT id, status FROM requests "
." WHERE id <> ? AND id_domain=? AND (status <> 'requested' AND status <> 'done')");
." WHERE id <> ? AND id_domain=? "
." AND (status <> 'requested' AND status <> 'done' AND command <> 'set_base_vm')");
$sth->execute($id_request, $id_domain);
my ($id, $status) = $sth->fetchrow;
# warn "CHECKING DOMAIN WORKING "
......@@ -2236,30 +2233,22 @@ sub _execute {
return;
}
if ( $self->_wait_requests($request) ) {
$request->status("requested","Server loaded, queuing request");
return;
}
$self->_wait_pids;
return if !$self->_can_fork($request);
$request->status('working','');
if (!$self->{fork_manager}) {
my $fm = Parallel::ForkManager->new($request->requests_limit('priority'));
$self->{fork_manager} = $fm;
}
$self->{fork_manager}->reap_finished_children;
my $pid = $self->{fork_manager}->start;
my $pid = fork();
die "I can't fork" if !defined $pid;
if ( $pid == 0 ) {
$request->status('working','');
my $t0 = [gettimeofday];
$self->_do_execute_command($sub, $request);
$self->{fork_manager}->finish; # Terminates the child process
my $elapsed = tv_interval($t0,[gettimeofday]);
$request->run_time($elapsed) if !$request->run_time();
exit;
}
$self->_add_pid($pid, $request);
$request->pid($pid);
$self->{fork_manager}->reap_finished_children;
}
sub _do_execute_command {
......@@ -2367,44 +2356,60 @@ sub _cmd_create{
}
sub _wait_requests {
sub _can_fork {
my $self = shift;
my $req = shift or confess "Missing request";
# don't wait for priority requests
return if $req->type eq 'priority';
my $try = 0;
for ( 1 .. $SECONDS_WAIT_CHILDREN ) {
my $type = $req->type;
my $msg;
my $n_pids = scalar(keys %{$self->{pids}->{$type}});
return 1 if $n_pids <= $req->requests_limit();
my $n_pids = $req->count_requests();
my $msg = $req->command
." waiting for processes to finish"
." limit ".$req->requests_limit;
$msg = $req->command
." waiting for processes to finish $n_pids"
." of ".$req->requests_limit;
return if $n_pids < $req->requests_limit();
return 1 if $n_pids > $req->requests_limit + 2;
sleep 1;
warn $msg if $DEBUG;
$req->error($msg);
$req->status('waiting') if $req->status() !~ 'waiting';
return 0;
}
next if $try++;
sub _wait_pids {
my $self = shift;
$req->error($msg);
$req->status('waiting') if $req->status() !~ 'waiting';
for my $type ( keys %{$self->{pids}} ) {
for my $pid ( keys %{$self->{pids}->{$type}}) {
my $kid = waitpid($pid , WNOHANG);
last if $kid <= 0 ;
my $request = Ravada::Request->open($self->{pids}->{$type}->{$kid});
if ($request) {
$request->status('done') if $request->status =~ /working/i;
};
delete $self->{pids}->{$type}->{$kid};
}
}
return 1;
}
sub _set_req_done {
sub _add_pid($self, $pid, $request) {
my $type = $request->type;
$self->{pids}->{$type}->{$pid} = $request->id;
}
sub _delete_pid {
my $self = shift;
my $pid = shift;
my $id_request = $self->{pids}->{$pid};
return if !$id_request;
my $req = Ravada::Request->open($id_request);
$req->status('done') if $req->status =~ /working/i;
for my $type ( keys %{$self->{pids}} ) {
delete $self->{pids}->{$type}->{$pid}
}
}
sub _cmd_remove {
......@@ -2704,6 +2709,7 @@ sub _cmd_shutdown {
die "ERROR: Domain $id_domain is ".$domain2->name." not $name."
if $domain && $domain->name ne $domain2->name;
$domain = $domain2;
die "Unknown domain '$id_domain'\n" if !$domain
}
my $user = Ravada::Auth::SQL->search_by_id( $uid);
......@@ -2975,6 +2981,10 @@ sub _refresh_down_nodes($self, $request = undef ) {
}
sub _refresh_disabled_nodes($self, $request = undef ) {
my @timeout = ();
@timeout = ( timeout => $request->args('timeout_shutdown') )
if defined $request && $request->defined_arg('timeout_shutdown');
my $sth = $CONNECTOR->dbh->prepare(
"SELECT d.id, d.name, vms.name FROM domains d, vms "
." WHERE d.id_vm = vms.id "
......@@ -2983,7 +2993,10 @@ sub _refresh_disabled_nodes($self, $request = undef ) {
);
$sth->execute();
while ( my ($id_domain, $domain_name, $vm_name) = $sth->fetchrow ) {
Ravada::Request->shutdown_domain( id_domain => $id_domain, uid => Ravada::Utils::user_daemon->id);
Ravada::Request->shutdown_domain( id_domain => $id_domain
, uid => Ravada::Utils::user_daemon->id
, @timeout
);
$request->status("Shutting down domain $domain_name in disabled node $vm_name");
}
$sth->finish;
......@@ -3040,8 +3053,10 @@ sub _refresh_volatile_domains($self) {
while ( my ($id_domain, $name, $id_vm) = $sth->fetchrow ) {
my $domain = Ravada::Domain->open(id => $id_domain, _force => 1);
if ( !$domain || $domain->status eq 'down' || !$domain->is_active) {
$domain->_post_shutdown(user => $USER_DAEMON);
$domain->remove($USER_DAEMON);
if ($domain) {
$domain->_post_shutdown(user => $USER_DAEMON);
$domain->remove($USER_DAEMON);
}
my $sth_del = $CONNECTOR->dbh->prepare("DELETE FROM domains WHERE id=?");
$sth_del->execute($id_domain);
$sth_del->finish;
......@@ -3092,17 +3107,6 @@ sub _cmd_cleanup($self, $request) {
$self->_wait_pids();
}
sub _wait_pids($self) {
$self->{fork_manager}->reap_finished_children if $self->{fork_manager};
my $procs = `ps -eo "pid cmd"`;
for my $line (split /\n/, $procs ) {
my ($pid, $cmd) = $line =~ m{\s*(\d+)\s+.*(rvd_back).*defunct};
next if !$pid;
next if $cmd !~ /rvd_back/;
my $kid = waitpid($pid , WNOHANG);
}
}
sub _req_method {
my $self = shift;
my $cmd = shift;
......@@ -3321,9 +3325,11 @@ sub _clean_volatile_machines($self, %args) {
id => $domain->{id}
,_force => 1
);
next if $domain_real->domain && $domain_real->is_active;
$domain_real->_post_shutdown();
$domain_real->remove($USER_DAEMON);
if ($domain_real) {
next if $domain_real->domain && $domain_real->is_active;
$domain_real->_post_shutdown();
$domain_real->remove($USER_DAEMON);
}
$sth_remove->execute($domain->{id});
}
......@@ -3350,7 +3356,7 @@ sub _post_login_locale($self, $request) {
}
sub DESTROY($self) {
$self->{fork_manager}->reap_finished_children if $self->{fork_manager}
$self->_wait_pids();
}
=head2 version
......
......@@ -214,7 +214,6 @@ sub BUILD {
my $name;
$name = $args->{name} if exists $args->{name};
$name = $args->{domain}->get_name if !$name && $args->{domain};
$self->{_name} = $name if $name;
......@@ -335,12 +334,25 @@ sub _search_already_started($self) {
my %started;
while (my ($id) = $sth->fetchrow) {
my $vm = Ravada::VM->open($id);
next if !$vm->is_enabled || !$vm->is_active;
next if !$vm->is_enabled;
my $vm_active;
eval {
$vm_active = $vm->is_active;
};
my $error = $@;
if ($error) {
warn $error;
$vm->enabled(0) if !$vm->is_local;
next;
}
next if !$vm_active;
my $domain;
eval { $domain = $vm->search_domain($self->name) };
if ( $@ ) {
warn $@;
$vm->enabled(0) if !$vm->is_local;
next;
}
next if !$domain;
......@@ -423,6 +435,8 @@ sub _allow_remove($self, $user) {
confess "ERROR: Undefined user" if !defined $user;
return if !$self->is_known(); # already removed
die "ERROR: remove not allowed for user ".$user->name
unless $user->can_remove_machine($self);
......@@ -886,7 +900,6 @@ sub open($class, @args) {
die "ERROR: Domain not found id=$id\n"
if !keys %$row;
if (!$vm && ( $id_vm || ( $self->_data('id_vm') && !$self->is_base) ) ) {
eval {
$vm = Ravada::VM->open(id => ( $id_vm or $self->_data('id_vm') )
......@@ -1290,7 +1303,7 @@ sub _after_remove_domain {
$self->_remove_domain_cascade($user) if !$cascade;
if ($self->is_known && $self->is_base) {
$self->_do_remove_base(@_);
$self->_do_remove_base($user);
$self->_remove_files_base();
}
return if !$self->{_data};
......@@ -1568,13 +1581,16 @@ sub _convert_png {
Makes the domain a regular, non-base virtual machine and removes the base files.
=cut
sub remove_base {
my $self = shift;
return $self->_do_remove_base();
sub remove_base($self, $user) {
return $self->_do_remove_base($user);
}
sub _do_remove_base {
my $self = shift;
sub _do_remove_base($self, $user) {
if ($self->is_base) {
for my $vm ( $self->list_vms ) {
$self->remove_base_vm(vm => $vm, user => $user) if !$vm->is_local;
}
}
$self->is_base(0);
for my $file ($self->list_files_base) {
next if ! -e $file;
......@@ -1815,13 +1831,14 @@ sub _post_shutdown {
}
sub _around_is_active($orig, $self) {
return 0 if $self->is_removed;
if (!$self->_vm) {
return 1 if $self->_data('status') eq 'active';
return 0;
}
if ($self->_vm && $self->_vm->is_active ) {
return 0 if $self->is_removed;
}
my $is_active = 0;
$is_active = $self->$orig() if $self->_vm->is_active;
......@@ -1833,7 +1850,8 @@ sub _around_is_active($orig, $self) {
$status = 'shutdown' if $status eq 'active';
$status = 'active' if $is_active;
$status = 'hibernated' if !$is_active && !$self->is_removed && $self->is_hibernated;
$status = 'hibernated' if !$is_active
&& $self->_vm->is_active && !$self->is_removed && $self->is_hibernated;
$self->_data(status => $status);
$self->needs_restart(0) if $self->needs_restart() && !$is_active;
......@@ -2605,14 +2623,19 @@ sub remote_ip($self) {
." ORDER BY time_req DESC "
);
$sth->execute($self->id);
my @ip;
while ( my ($remote_ip, $iptables_json ) = $sth->fetchrow() ) {
my $iptables = decode_json($iptables_json);
next if $iptables->[4] ne 'ACCEPT';
# TODO check multiple IPs
return $remote_ip;
push @ip,($remote_ip);
}
$sth->finish;
return;
return @ip if wantarray;
for my $ip (@ip) {
return $ip if $ip eq '127.0.0.1';
}
return $ip[0];
}
......@@ -2792,7 +2815,7 @@ sub rsync($self, @args) {
if ($self->is_base) {
push @files_base,($self->list_files_base);
}
$files = [ $self->list_volumes(), @files_base ];
$files = [ $self->list_volumes( device => 'disk'), @files_base ];
}
$request->status("working") if $request;
......@@ -2806,7 +2829,7 @@ sub rsync($self, @args) {
$node->_connect_ssh()
or confess "No Connection to ".$self->_vm->host;
}
my $rsync = File::Rsync->new(update => 1);
my $rsync = File::Rsync->new(update => 1, sparse => 1);
for my $file ( @$files ) {
my ($path) = $file =~ m{(.*)/};
my ($out, $err) = $node->run_command("/bin/mkdir","-p",$path);
......@@ -2954,6 +2977,21 @@ sub set_base_vm($self, %args) {
$request->status("working", "Syncing base volumes to ".$vm->host)
if $request;
$self->migrate($vm, $request);
} else {
if ($vm->is_active) {
for my $file ($self->list_files_base()) {
confess "Error: file has non-valid characters" if $file =~ /[*;&'" ]/;
my ($out, $err);
eval {
my ($out, $err) = $vm->run_command("test -e $file && rm $file");
};
next if $@ && $@ =~ / ssh /i;
$err = $@ if $@;
confess $err if $err;
}
}
my $vm_local = $self->_vm->new( host => 'localhost' );
$self->_set_vm($vm_local, 1);
}
return $self->_set_base_vm_db($vm->id, $value);
}
......@@ -2973,6 +3011,7 @@ Removes a base in a Virtual Machine Manager node.
sub remove_base_vm($self, %args) {
my $user = delete $args{user};
my $vm = delete $args{vm};
$vm = delete $args{node} if !$vm;
confess "ERROR: Unknown arguments ".join(',',sort keys %args).", valid are user and vm."
if keys %args;
......@@ -3239,7 +3278,7 @@ sub needs_restart($self, $value=undef) {
sub _post_change_hardware {
my $self = shift;
$self->info(Ravada::Utils::user_daemon) if $self->is_known();
$self->needs_restart(1) if $self->is_active;
$self->needs_restart(1) if $self->is_known && $self->is_active;
}
=head2 Access restrictions
......
......@@ -177,8 +177,6 @@ sub remove_disks {
for my $file ($self->list_disks( device => 'disk')) {
confess $file if $file =~ /iso$/;
if (! -e $file ) {
warn "WARNING: $file already removed for ".$self->name."\n"
if $0 !~ /.t$/;
next;
}
$self->_vol_remove($file);
......@@ -677,16 +675,30 @@ sub start {
}
$self->_set_spice_ip($set_password);
$self->status('starting');
eval { $self->domain->create() };
my $error = $@;
if ( $error && $error !~ /already running/i ) {
if ( $self->domain->has_managed_save_image ) {
$request->status("removing saved image") if $request;
$self->domain->managed_save_remove();
$self->domain->create();
} else {
die $error;
my $error;
for ( ;; ) {
eval { $self->domain->create() };
$error = $@;
next if $error && $error =~ /libvirt error code: 1, .* pool .* asynchronous/;
last;
}
return if !$error || $error =~ /already running/i;
if ($error =~ /libvirt error code: 38,/) {
if (!$self->_vm->is_local) {
warn "Disabling node ".$self->_vm->name();
$self->_vm->enabled(0);
}
die $error;
sleep 1;
} elsif ( $error =~ /libvirt error code: 9, .*already defined with uuid/) {
die "TODO";
} elsif ( $self->domain->has_managed_save_image ) {
$request->status("removing saved image") if $request;
$self->domain->managed_save_remove();
$self->domain->create();
} else {
die $error;
}
}
......@@ -735,7 +747,9 @@ sub shutdown {
sub _do_shutdown {
my $self = shift;
$self->domain->shutdown();
return if !$self->domain->is_active;
eval { $self->domain->shutdown() };
die $@ if $@ && $@ !~ /libvirt error code: 55,/;
}
......@@ -1004,7 +1018,6 @@ sub _cmd_boot_order($self, $set, $index=undef, $order=1) {
my $this_order = $boot->getAttribute('order');
next if $this_order < $order;
$boot->setAttribute( order => $this_order+1);
warn "[$count] $this_order -> ".($this_order + 1);
next;
}
if (!$set) {
......@@ -1991,9 +2004,18 @@ sub migrate($self, $node, $request=undef) {
my $xml = $self->domain->get_xml_description();
my $doc = XML::LibXML->load_xml(string => $xml);
$self->_check_uuid($doc, $node);
$self->_check_machine($doc);
$dom = $node->vm->define_domain($doc->toString());
for ( ;; ) {
$self->_check_uuid($doc, $node);
eval { $dom = $node->vm->define_domain($doc->toString()) };
my $error = $@;
last if !$error;
die $error if $error !~ /libvirt error code: 9, .*already defined with uuid/;
my $msg = "migrating ".$self->name." ".$error;
warn $msg;
$request->error($msg) if $request;
sleep 1;
}
$self->domain($dom);
}
$self->_set_spice_ip(1,$node->ip);
......
......@@ -40,28 +40,6 @@ our $CONVERT = `which convert`;
chomp $CONVERT;
#######################################3
sub BUILD {
my $self = shift;
my $args = $_[0];