Unverified Commit 12d47403 authored by Francesc Guasch's avatar Francesc Guasch Committed by GitHub
Browse files

Feature #882 dead node (#939)

* wip(nodes): do not try to run iptables code on down nodes

issue #882

* test(nodes): test machine set down if node dies

issue #882

* wip(nodes): check node is down before using machine

- set the machine to down
- do not try to use the node connection if it is down.

issue #882

* test(nodes): domain in deleted node should be down

issue #882

* test(nodes): check operations after node down and up

issue #882

* wip(nodes): do not cache active node if forced

issue #882

* test(nodes): check status in dead nodes

issue #882

* wip(test): check id vm

issue #882

* wip(nodes): connect to local node if remote down

issue #882

* wip(nodes): return cached state from frontend

issue #882

* wip(nodes): shutdown all domains of a node

issue #882

* wip(nodes): make sure the node is still active

issue #882

* refactor(backend): show trace errors when failed

This is useful now that we are in development. This should
be removed once it reaches production level.

issue #882

* test(nodes): we can't enforce same VM

removed this test. Slightly improved remote clones checks

issue #882

* wip(tests): make test disks small

issue #882

* test(nodes): test active clones in disabled node down

issue #882

* feature(nodes): cope with dead nodes

- Do not search machines in disabled or down nodes
- Set machines down in disabled nodes
- Refresh machines status on dead nodes

issue #882

* wip(test): reviewed already started tests

issue #882

* refactor(nodes): removed debug

issue #882

* wip(frontend): show nodes enabled/disabled buttons

issue #882

* wip(frontend): request check vms after adding new node

issue #882

* wip(nodes): fix problems when nodes suddenly down

set node status inactive if found down

issue #882

* wip(nodes): do not use down nodes

issue #882

* wip(nodes): set node inactive if ssh fails

issue #882

* wip(nodes): search domain in node only if active

issue #882

* wip(nodes): previous VM can be used on opening domain

issue #882

* refactor(nodes): speed up balancing and listing

issue #882

* wip(backend): removed warning, probably we should mark the request done

issue #882

* wip(tests): removed unnecessary checks

issue #882

* wip(nodes): fixed wrong cached inactive call

issue #882

* test(nodes): force node to start in

issue #882

* wip(nodes): non-clones go to main node

This way when it is prepared as a base all the disk
volumes are here. It would be more difficult to handle
creation of bases that are in a remote node.

issue #882
parent e9d2f9ac
......@@ -1530,17 +1530,18 @@ sub remove_domain {
lock_hash(%arg);
my $sth = $CONNECTOR->dbh->prepare("SELECT id FROM domains WHERE name = ?");
my $sth = $CONNECTOR->dbh->prepare("SELECT id,vm FROM domains WHERE name = ?");
$sth->execute($name);
my ($id)= $sth->fetchrow;
my ($id,$vm_type)= $sth->fetchrow;
confess "Error: Unknown domain $name" if !$id;
my $user = Ravada::Auth::SQL->search_by_id( $arg{uid});
die "Error: user ".$user->name." can't remove domain $id"
if !$user->can_remove_machine($id);
my $domain = Ravada::Domain->open(id => $id, _force => 1)
my $vm = Ravada::VM->open(type => $vm_type);
my $domain = Ravada::Domain->open(id => $id, _force => 1, id_vm => $vm->id)
or do {
warn "Warning: I can't find domain '$id', maybe already removed.";
$sth = $CONNECTOR->dbh->prepare("DELETE FROM domains where id=?");
......@@ -1558,19 +1559,30 @@ sub remove_domain {
=cut
sub search_domain($self, $name, $import = 0) {
my $sth = $CONNECTOR->dbh->prepare("SELECT id,id_vm "
." FROM domains WHERE name=?");
my $query =
"SELECT d.id, d.id_vm "
." FROM domains d LEFT JOIN vms "
." ON d.id_vm = vms.id "
." WHERE "
." d.name=? "
;
my $sth = $CONNECTOR->dbh->prepare($query);
$sth->execute($name);
my ($id, $id_vm) = $sth->fetchrow();
my ($id, $id_vm ) = $sth->fetchrow();
return if !$id;
if ($id_vm) {
my $vm = Ravada::VM->open($id_vm);
if (!$vm->is_active) {
warn "Don't search domain $name in inactive VM ".$vm->name;
my $vm;
eval { $vm = Ravada::VM->open($id_vm) };
warn $@ if $@;
if ( $vm && !$vm->is_active) {
$vm->disconnect();
} else {
return $vm->search_domain($name);
}
if ($vm && $vm->is_active ) {
my $domain;
eval { $domain = $vm->search_domain($name)};
warn $@ if $@;
return $domain if $domain;
}
}
# for my $vm (@{$self->vm}) {
......@@ -2027,10 +2039,11 @@ sub _domain_working {
if (!$id_domain) {
my $domain_name = $req->defined_arg('name');
return if !$domain_name;
my $domain = $self->search_domain($domain_name) or return;
$id_domain = $domain->id;
my $sth = $CONNECTOR->dbh->prepare("SELECT id FROM domains WHERE name=?");
$sth->execute($domain_name);
($id_domain) = $sth->fetchrow;
if (!$id_domain) {
warn Dumper($req);
# TODO: maybe this request should be marked down because domain already removed
return;
}
}
......@@ -2281,7 +2294,6 @@ sub _cmd_remove {
if !defined $request->args->{uid};
$self->remove_domain(name => $request->args('name'), uid => $request->args('uid'));
}
sub _cmd_pause {
......@@ -2650,6 +2662,10 @@ sub _cmd_refresh_vms($self, $request=undef) {
if ($request && (my $id_recent = $request->done_recently(30))) {
die "Command ".$request->command." run recently by $id_recent.\n";
}
$self->_refresh_disabled_nodes( $request );
$self->_refresh_down_nodes( $request );
my ($active_domain, $active_vm) = $self->_refresh_active_domains($request);
$self->_refresh_down_domains($active_domain, $active_vm);
......@@ -2702,8 +2718,8 @@ sub _refresh_active_domains($self, $request=undef) {
for my $vm ($self->list_vms) {
$request->status('working',"checking active domains on ".$vm->name)
if $request;
next if !$vm->enabled();
if ( !$vm->is_active ) {
if ( !$vm->enabled() || !$vm->is_active ) {
$vm->shutdown_domains();
$active_vm{$vm->id} = 0;
$vm->disconnect();
next;
......@@ -2723,6 +2739,32 @@ sub _refresh_active_domains($self, $request=undef) {
return \%active_domain, \%active_vm;
}
# Re-opens every node registered in the vms table.
#
# The objects returned by Ravada::VM->open are discarded: the call is made
# only for whatever status bookkeeping open() performs.
# NOTE(review): that code is not visible from here -- confirm.
#
# A node that can not be opened must not abort the whole refresh, so each
# open is wrapped in an eval and the error is only reported with warn, the
# same way search_domain() treats a failing Ravada::VM->open.
#
# Arguments: optional request (currently unused)
sub _refresh_down_nodes($self, $request = undef ) {
    my $sth = $CONNECTOR->dbh->prepare(
        "SELECT id FROM vms "
    );
    $sth->execute();
    while ( my ($id) = $sth->fetchrow()) {
        eval { Ravada::VM->open($id) };
        warn $@ if $@;
    }
}
# Requests the shutdown of every domain that is still flagged as active in
# the database while its node is either disabled or not active.
#
# Arguments: optional request. When given, its status is updated with the
# name of each domain that is being shut down.
sub _refresh_disabled_nodes($self, $request = undef ) {
    # OR instead of || : in standard SQL (and in SQLite) || concatenates
    # strings, only MySQL in its default mode reads it as a logical OR.
    my $sth = $CONNECTOR->dbh->prepare(
        "SELECT d.id, d.name, vms.name FROM domains d, vms "
        ." WHERE d.id_vm = vms.id "
        ." AND ( vms.enabled = 0 OR vms.is_active = 0 )"
        ." AND d.status = 'active'"
    );
    $sth->execute();
    while ( my ($id_domain, $domain_name, $vm_name) = $sth->fetchrow ) {
        Ravada::Request->shutdown_domain( id_domain => $id_domain, uid => Ravada::Utils::user_daemon->id);
        # $request defaults to undef, so it has to be checked before use.
        # NOTE(review): _refresh_active_domains calls status('working', $msg)
        # with two arguments -- confirm the single-argument form is intended.
        $request->status("Shutting down domain $domain_name in disabled node $vm_name")
            if $request;
    }
    $sth->finish;
}
sub _refresh_active_domain($self, $vm, $domain, $active_domain) {
return if $domain->is_hibernated();
......
......@@ -195,7 +195,7 @@ around 'set_memory' => \&_around_set_mem;
around 'is_active' => \&_around_is_active;
around 'is_active' => \&_around_is_active;
around 'is_hibernated' => \&_around_is_hibernated;
around 'autostart' => \&_around_autostart;
......@@ -314,6 +314,9 @@ sub _start_preconditions{
}
$self->_balance_vm();
$self->rsync(request => $request) if !$self->is_volatile && !$self->_vm->is_local();
} elsif (!$self->is_local) {
my $vm_local = $self->_vm->new( host => 'localhost' );
$self->_set_vm($vm_local, 1);
}
$self->_check_free_vm_memory();
#TODO: remove them and make it more general now we have nodes
......@@ -330,7 +333,12 @@ sub _search_already_started($self) {
my $vm = Ravada::VM->open($id);
next if !$vm->is_enabled || !$vm->is_active;
my $domain = $vm->search_domain($self->name);
my $domain;
eval { $domain = $vm->search_domain($self->name) };
if ( $@ ) {
warn $@;
next;
}
next if !$domain;
if ( $domain->is_active || $domain->is_hibernated ) {
$self->_set_vm($vm,'force');
......@@ -419,7 +427,7 @@ sub _allow_remove($self, $user) {
&& $self->id_base
&& ($user->can_remove_clones() || $user->can_remove_clone_all())
) {
my $base = $self->open($self->id_base);
my $base = $self->open(id => $self->id_base, id_vm => $self->_vm->id);
return if ($user->can_remove_clone_all() || ($base->id_owner == $user->id));
}
......@@ -773,13 +781,16 @@ sub open($class, @args) {
my $readonly = 0;
my $id_vm;
my $force;
my $vm;
if (scalar @args > 1) {
my %args = @args;
$id = delete $args{id} or confess "ERROR: Missing field id";
$readonly = delete $args{readonly} if exists $args{readonly};
$id_vm = delete $args{id_vm};
$force = delete $args{_force};
$vm = delete $args{vm};
confess "ERROR: id_vm and vm don't match. ".($vm->name." id: ".$vm->id)
if $id_vm && $vm && $vm->id != $id_vm;
confess "ERROR: Unknown fields ".join(",", sort keys %args)
if keys %args;
}
......@@ -797,27 +808,39 @@ sub open($class, @args) {
die "ERROR: Domain not found id=$id\n"
if !keys %$row;
my $vm;
my $vm_local = {};
my $vm_class = "Ravada::VM::".$row->{vm};
bless $vm_local, $vm_class;
if ($id_vm || ( $self->_data('id_vm') && !$self->is_base) ) {
$vm = Ravada::VM->open(id => ( $id_vm or $self->_data('id_vm') )
if (!$vm && ( $id_vm || ( $self->_data('id_vm') && !$self->is_base) ) ) {
eval {
$vm = Ravada::VM->open(id => ( $id_vm or $self->_data('id_vm') )
, readonly => $readonly);
};
if ($@ && $@ =~ /I can't find VM id=/) {
$vm = Ravada::VM->open( type => $self->type );
}
}
if (!$vm || !$vm->is_active) {
my $vm_local;
if (!$vm || !$vm->is_active || !$vm->enabled) {
$vm_local = {};
my $vm_class = "Ravada::VM::".$row->{vm};
bless $vm_local, $vm_class;
$vm = $vm_local->new( );
}
my $domain = $vm->search_domain($row->{name}, $force);
if ( !$domain ) {
return if $vm->is_local;
if (!$vm_local) {
$vm_local = {};
my $vm_class = "Ravada::VM::".$row->{vm};
bless $vm_local, $vm_class;
}
$vm = $vm_local->new();
$domain = $vm->search_domain($row->{name}, $force) or return;
}
if (!$id_vm) {
$domain->_search_already_started();
$domain->_search_already_started() if !$domain->is_base;
$domain->_check_clean_shutdown() if $domain->domain && !$domain->is_active;
}
$domain->_insert_db_extra() if $domain && !$domain->is_known_extra();
......@@ -1131,6 +1154,8 @@ sub _pre_remove_domain($self, $user, @) {
warn $@ if $@;
$self->_allow_remove($user);
$self->_check_active_node();
$self->is_volatile() if $self->is_known || $self->domain;
$self->list_disks() if ($self->is_known && $self->is_known_extra)
|| $self->domain ;
......@@ -1138,6 +1163,20 @@ sub _pre_remove_domain($self, $user, @) {
$self->_remove_iptables() if $self->is_known();
}
# Makes sure the domain is attached to a node that is really active.
#
# The current node is checked with a forced (non cached) is_active call.
# When it is down, the first local node found in list_nodes becomes the
# node of this domain and the internal domain handle is replaced with the
# one that node returns for the same id.
#
# Returns the node the domain ends up attached to.
sub _check_active_node($self) {
    if ( $self->_vm->is_active(1) ) {
        return $self->_vm;
    }

    NODE:
    for my $candidate ( $self->_vm->list_nodes ) {
        if ( $candidate->is_local ) {
            $self->_vm($candidate);
            my $found = $candidate->search_domain_by_id( $self->id );
            $self->domain( $found->domain );
            last NODE;
        }
    }

    return $self->_vm;
}
sub _after_remove_domain {
my $self = shift;
my ($user, $cascade) = @_;
......@@ -1168,8 +1207,9 @@ sub _remove_domain_cascade($self,$user, $cascade = 1) {
while ($sth->fetchrow) {
next if $id == $self->_vm->id;
my $vm = Ravada::VM->open($id);
my $domain = $vm->search_domain($domain_name) or next;
$domain->remove($user, $cascade);
my $domain;
eval { $domain = $vm->search_domain($domain_name) };
$domain->remove($user, $cascade) if $domain;
}
}
......@@ -1588,6 +1628,7 @@ sub _pre_shutdown {
$self->_allow_shutdown(@_);
$self->_vm->connect;
$self->_pre_shutdown_domain();
if ($self->is_paused) {
......@@ -1602,7 +1643,7 @@ sub _post_shutdown {
my %arg = @_;
my $timeout = delete $arg{timeout};
$self->_remove_iptables();
$self->_remove_iptables() if $self->_vm->is_active;
$self->_data(status => 'shutdown')
if $self->is_known && !$self->is_volatile && !$self->is_active;
......@@ -1650,7 +1691,15 @@ sub _post_shutdown {
sub _around_is_active($orig, $self) {
return 0 if $self->is_removed;
my $is_active = $self->$orig();
if (!$self->_vm) {
return 1 if $self->_data('status') eq 'active';
return 0;
}
my $is_active = 0;
$is_active = $self->$orig() if $self->_vm->is_active;
return $is_active if $self->readonly
|| !$self->is_known
|| (defined $self->_data('id_vm') && (defined $self->_vm) && $self->_vm->id != $self->_data('id_vm'));
......@@ -1664,11 +1713,18 @@ sub _around_is_active($orig, $self) {
return $is_active;
}
# Wrapper around is_hibernated.
#
# When the domain has a node and that node is not active, nothing is
# returned (undef in scalar context, empty list in list context) and the
# wrapped method is not called. Otherwise the answer of the wrapped
# method is passed through.
sub _around_is_hibernated($orig, $self) {
    my $node_is_down = $self->_vm && !$self->_vm->is_active;
    return if $node_is_down;

    return $self->$orig();
}
sub _around_shutdown_now {
my $orig = shift;
my $self = shift;
my $user = shift;
$self->_vm->connect;
$self->list_disks;
$self->_pre_shutdown(user => $user);
if ($self->is_active) {
......@@ -1897,6 +1953,8 @@ sub _add_iptable {
sub _delete_ip_rule ($self, $iptables, $vm = $self->_vm) {
return if !$vm->is_active;
my ($s, $d, $filter, $chain, $jump, $extra) = @$iptables;
lock_hash %$extra;
......@@ -1918,7 +1976,8 @@ sub _delete_ip_rule ($self, $iptables, $vm = $self->_vm) {
&& ( $args{dport} eq $extra->{d_port}))
{
$vm->run_command("/sbin/iptables", "-t", $filter, "-D", $chain, $count);
$vm->run_command("/sbin/iptables", "-t", $filter, "-D", $chain, $count)
if $vm->is_active;
$removed++;
$count--;
}
......@@ -2603,7 +2662,7 @@ sub rsync($self, @args) {
." are both local "
if $self->_vm->is_local;
$self->_vm->_connect_ssh()
or confess "No Connection to ".$node->host;
or confess "No Connection to ".$self->_vm->host;
} else {
$node->_connect_ssh()
or confess "No Connection to ".$self->_vm->host;
......@@ -2644,7 +2703,7 @@ sub _rsync_volumes_back($self, $request=undef) {
sub _pre_migrate($self, $node, $request = undef) {
$self->_check_equal_storage_pools($node);
$self->_check_equal_storage_pools($node) if $self->_vm->is_active;
return if !$self->id_base;
......
......@@ -162,7 +162,7 @@ sub remove_disks {
my $id;
eval { $id = $self->id };
return if $@ && $@ =~ /No DB info/i;
die $@ if $@;
confess $@ if $@;
$self->_vm->connect();
for my $file ($self->list_disks) {
......@@ -245,17 +245,17 @@ sub remove {
}
eval { $self->domain->undefine() if $self->domain && !$self->is_removed };
die $@ if $@ && $@ !~ /libvirt error code: 42/;
confess $@ if $@ && $@ !~ /libvirt error code: 42/;
eval { $self->remove_disks() if $self->is_known };
die $@ if $@ && $@ !~ /libvirt error code: 42/;
confess $@ if $@ && $@ !~ /libvirt error code: 42/;
for my $file ( @volumes ) {
$self->_vol_remove($file);
}
eval { $self->_remove_file_image() };
die $@ if $@ && $@ !~ /libvirt error code: 42/;
confess $@ if $@ && $@ !~ /libvirt error code: 42/;
# warn "WARNING: Problem removing file image for ".$self->name." : $@" if $@ && $0 !~ /\.t$/;
# warn "WARNING: Problem removing ".$self->file_base_img." for ".$self->name
......@@ -1724,6 +1724,8 @@ sub migrate($self, $node, $request=undef) {
sub is_removed($self) {
my $is_removed = 0;
return if !$self->_vm->is_active;
eval {
$is_removed = 1 if !$self->domain;
$self->domain->get_xml_description if !$is_removed;
......
......@@ -1052,6 +1052,9 @@ sub add_node($self,%arg) {
my $sth = $CONNECTOR->dbh->prepare($sql);
$sth->execute(map { $arg{$_} } sort keys %arg );
$sth->finish;
my $req = Ravada::Request->refresh_vms( _force => 1 );
return $req->id;
}
=head2 version
......
......@@ -72,7 +72,6 @@ our %VALID_ARG = (
,hybernate=> {uid => 1, id_domain => 1}
,download => {uid => 2, id_iso => 1, id_vm => 2, verbose => 2, delay => 2}
,refresh_storage => { id_vm => 2 }
,refresh_vms => { id_domain => 2 }
,set_base_vm=> {uid => 1, id_vm=> 1, id_domain => 1, value => 2 }
,cleanup => { }
,clone => { uid => 1, id_domain => 1, name => 1, memory => 2 }
......@@ -80,9 +79,9 @@ our %VALID_ARG = (
,add_hardware => {uid => 1, id_domain => 1, name => 1, number => 1}
,remove_hardware => {uid => 1, id_domain => 1, name => 1, index => 1}
,change_max_memory => {uid => 1, id_domain => 1, ram => 1}
,refresh_vms => { }
,enforce_limits => { timeout => 2, _force => 2 }
,refresh_machine => { id_domain => 1 }
,refresh_vms => { _force => 2 }
);
our %CMD_SEND_MESSAGE = map { $_ => 1 }
......@@ -1050,23 +1049,14 @@ sub refresh_vms {
my $class = ref($proto) || $proto;
my $args = _check_args('refresh_vms', @_ );
return if _requested('refresh_vms');
if (!$args->{_force} ) {
return if done_recently(undef,60,'refresh_vms') || _requested('refresh_vms');
}
my $self = {};
bless($self,$class);
_init_connector();
my $sth = $$CONNECTOR->dbh->prepare(
"SELECT id, date_changed FROM requests WHERE command = 'refresh_vms' "
." AND ( date_changed > ? OR status='requested') "
);
my @now = Today_and_Now();
$now[4]-- if $now[4] >1 ;
my $before = "$now[0]-$now[1]-$now[2] $now[3]:$now[4]:$now[5]";
$sth->execute($before);
my ($id, $date) = $sth->fetchrow;
return if $id;
return $self->_new_request(
command => 'refresh_vms'
, args => $args
......
......@@ -24,6 +24,8 @@ use IO::Socket;
use IO::Interface;
use Net::Domain qw(hostfqdn);
use Ravada::Utils;
no warnings "experimental::signatures";
use feature qw(signatures);
......@@ -255,13 +257,18 @@ sub _connect_ssh($self, $disconnect=0) {
$ssh = Net::SSH2->new( timeout => $SSH_TIMEOUT );
my $connect;
for ( 1 .. 3 ) {
$connect = $ssh->connect($self->host);
eval { $connect = $ssh->connect($self->host) };
last if $connect;
warn "RETRYING ssh ".$self->host." ".join(" ",$ssh->error);
sleep 1;
}
$connect = $ssh->connect($self->host) if !$connect;
confess $ssh->error() if !$connect;
if ( !$connect) {
eval { $connect = $ssh->connect($self->host) };
if (!$connect) {
$self->_cached_active(0);
confess $ssh->error();
}
}
$ssh->auth_publickey( 'root'
, "$home/.ssh/id_rsa.pub"
, "$home/.ssh/id_rsa"
......@@ -834,7 +841,6 @@ sub ping($self, $option=undef) {
return 1 if $self->is_local();
warn "trying tcp" if $debug;
my $p = Net::Ping->new('tcp',2);
my $ping_ok;
eval { $ping_ok = $p->ping($self->host) };
......@@ -863,12 +869,22 @@ sub _around_ping($orig, $self, $option=undef) {
=head2 is_active
Returns if the domain is active.
Returns true if the node is active. For remote nodes the active state is
cached for up to 60 seconds. Pass an optional true value to force a real check.
Arguments: optional force mode
if ($node->is_active) {
}
if ($node->is_active(1)) {
}
=cut
sub is_active($self) {
return $self->_do_is_active() if $self->is_local;
sub is_active($self, $force=0) {
return $self->_do_is_active() if $self->is_local || $force;
return $self->_cached_active if time - $self->_cached_active_time < 60;
return $self->_do_is_active();
......@@ -909,12 +925,12 @@ Returns if the domain is enabled.
=cut
sub enabled($self) {
return $self->_data('enabled');
sub enabled($self, $value=undef) {
return $self->_data('enabled', $value);
}
sub is_enabled($self) {
return $self->enabled();
sub is_enabled($self, $value=undef) {
return $self->enabled($value);
}
=head2 remove
......@@ -1061,9 +1077,14 @@ sub balance_vm($self, $base=undef) {
my %vm_list;
for my $vm ($self->list_nodes) {
next if !$vm->enabled();
next if !$vm->is_active || $vm->free_memory < $Ravada::Domain::MIN_FREE_MEMORY;
my $key = scalar($vm->list_domains(active => 1)).".".$vm->free_memory;
next if !$vm->is_active();
my $free_memory = $vm->free_memory;
next if $free_memory < $Ravada::Domain::MIN_FREE_MEMORY;
my $key = $vm->count_domains(status => 'active').".".$free_memory;
$vm_list{$key} = $vm;
last if $key =~ /^[01]+\./; # don't look for other nodes when this one is empty !
}
my @sorted_vm = map { $vm_list{$_} } sort keys %vm_list;
......@@ -1074,6 +1095,34 @@ sub balance_vm($self, $base=undef) {
return;
}
# Returns the number of domains assigned to this node in the domains table.
#
# Arguments: optional list of field => value pairs. Every pair adds an
# equality condition on that column, e.g. count_domains(status => 'active').
# Without arguments all the domains of the node are counted.
#
# The field names are interpolated into the query, only the values are
# bound, so the keys must never come from untrusted input.
sub count_domains($self, %args) {
    my @field = sort keys %args;

    # The condition list is built in one go: appending the filters to a
    # query that already ended in " AND " gave invalid SQL when no filter
    # was passed.
    my $where = join(" AND ", "id_vm = ?", map { "$_ = ?" } @field );
    my $query = "SELECT count(*) FROM domains WHERE $where";

    my $sth = $$CONNECTOR->dbh->prepare($query);
    $sth->execute( $self->id, @args{@field} );
    my ($count) = $sth->fetchrow;
    return $count;
}
# Flags as down every domain of this node that is marked active in the
# database and queues a shutdown_domain request for each of them on behalf
# of the daemon user.
sub shutdown_domains($self) {
    my $sth_inactive
        = $$CONNECTOR->dbh->prepare("UPDATE domains set status='down' WHERE id=?");

    # The node id is bound with a placeholder instead of being pasted into
    # the SQL text, like the rest of the queries in this file.
    my $sth = $$CONNECTOR->dbh->prepare(
        "SELECT id FROM domains "
        ." where status='active'"
        ." AND id_vm = ?"
    );
    $sth->execute( $self->id );
    while ( my ($id_domain) = $sth->fetchrow) {
        # Update the database first so the domain is not reported as
        # active while the shutdown request is waiting to be processed.
        $sth_inactive->execute($id_domain);
        Ravada::Request->shutdown_domain(
            id_domain => $id_domain
            , uid => Ravada::Utils::user_daemon->id
        );
    }
    $sth->finish;
}
1;
......@@ -94,7 +94,8 @@ sub _connect {
if ($self->host eq 'localhost') {
$vm = Sys::Virt->new( address => $con_type.":///system" , readonly => $self->readonly);
} else {
return if $self->readonly;
confess "Error: You can't connect to remote VMs in readonly mode"
if $self->readonly;