From 340dd409e2d04d6b43947f608ea6d3b330525be4 Mon Sep 17 00:00:00 2001 From: Daniel Dehennin Date: Wed, 20 Dec 2017 16:32:13 +0100 Subject: [PATCH] The VMs are not always resumed on reboot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The “libvirt-guests” service can conflict with “onevm-all” and we don't need to wait for each VM to boot during start. * posttemplate/10-libvirt-guests: disable the “libvirt-guests” service. * init/onenode.service (ExecStart): use default wait timeout (60s). (TimeoutStartSec): wait for longer than the default timeout. (ExecReload): just try to resume any remaining VMs. (ExecStop): wait longer for VM to suspend. (TimeoutStopSec): wait for longer than the stop timeout. * scripts/onevm-all: schedule actions in parallel and wait globally for their executions. Ref: #22155 --- init/onenode.service | 10 +- posttemplate/10-libvirt-guests | 9 ++ scripts/onevm-all | 281 +++++++++++++++++++-------------- 3 files changed, 182 insertions(+), 118 deletions(-) create mode 100644 posttemplate/10-libvirt-guests diff --git a/init/onenode.service b/init/onenode.service index ceb5fb8..9a165a6 100644 --- a/init/onenode.service +++ b/init/onenode.service @@ -8,11 +8,15 @@ After=multi-user.target Type=oneshot Environment=CREDS=/var/lib/one/.one/one_auth Environment=ENDPOINT=http://127.0.0.1:2633/RPC2 -TimeoutSec=1min RemainAfterExit=yes Restart=no -ExecStart=/usr/share/eole/sbin/onevm-all -t 20 -w -c ${CREDS} -e ${ENDPOINT} -a "resume" -ExecStop=/usr/share/eole/sbin/onevm-all -t 20 -w -c ${CREDS} -e ${ENDPOINT} -a "suspend" +ExecStart=/usr/share/eole/sbin/onevm-all -c ${CREDS} -e ${ENDPOINT} -a "resume" +# Permit to start remaining VMs at distance by a simple restart +ExecReload=/usr/share/eole/sbin/onevm-all -c ${CREDS} -e ${ENDPOINT} -a "resume" +ExecStop=/usr/share/eole/sbin/onevm-all -w 300 -c ${CREDS} -e ${ENDPOINT} -a "suspend" +# Keep some margin with timeout +TimeoutStartSec=120s +TimeoutStopSec=360s 
[Install] WantedBy=multi-user.target diff --git a/posttemplate/10-libvirt-guests b/posttemplate/10-libvirt-guests new file mode 100644 index 0000000..88eef61 --- /dev/null +++ b/posttemplate/10-libvirt-guests @@ -0,0 +1,9 @@ +#!/bin/sh + +echo "Disable and mask libvirt-guests service" +for action in stop disable mask +do + systemctl ${action} libvirt-guests.service 2> /dev/null +done + +exit 0 diff --git a/scripts/onevm-all b/scripts/onevm-all index e6d7e27..efb12d1 100755 --- a/scripts/onevm-all +++ b/scripts/onevm-all @@ -1,5 +1,8 @@ #!/usr/bin/env ruby +# Do not buffer output +STDOUT.sync = TRUE + ############################################################################## # Environment Configuration ############################################################################## @@ -30,19 +33,30 @@ include OpenNebula MAXWAIT=60 INTERVAL=1 -def _wait(vm, st) - wait = 0 - while vm.status != st - vm.info - if vm.status == 'unkn' - break - end - wait += INTERVAL - sleep(INTERVAL) - if wait >= MAXWAIT - break - end +# List of supported actions +ACTIONS = [ + 'status', # Get the status of all VMs in OpenNebula VM pool + 'suspend', # Suspend all VMs in RUNNING state + 'resume', # Resume all VMs in SUSPENDED or UNKNOWN state +] + + +# Map each action with a target state +EXPECTED_STATUS_MAP = { + 'status' => nil, + 'boot' => 'runn', + 'suspend' => 'susp', + 'resume' => 'runn' +} + +def dump_running_vms_file() + if File.exist?(RUNVMFILE) + running_vms = File.readlines(RUNVMFILE).uniq + else + running_vms = [] end + + return running_vms end def CreoleGet(variable) @@ -54,19 +68,48 @@ def CreoleGet(variable) end end +def _do_wait(vms, action, maxwait) + if maxwait == 0 and action == 'resume' + # User explicitely don't want to wait + vms.clear + return 0 + end + + print "Wait #{maxwait}s for VMs to #{action}" + for try in 0..maxwait + vms.delete_if do |vm| + vm.info + vm.status == EXPECTED_STATUS_MAP[action] + end + break if vms.empty? + print "." 
+ sleep(1) + end + if vms.empty? + puts " OK" + return 0 + else + puts " FAIL" + return -1 + end +end + + # # NAME: _do_suspend # PARAM: OpenNebula::VirtualMachine object # AIM: Suspend a virtual machine # -def _do_suspend(vm, wait) +def _do_suspend(vm) fd = File.open(RUNVMFILE,'a') if vm.status == "runn" - puts("Suspending #{vm.name} ...") + puts("Suspending #{vm.id} - #{vm.name}... ") fd.write("#{vm.id}\n") - vm.suspend - if wait - _wait(vm, "susp") + rc = vm.suspend + if OpenNebula.is_error?(rc) + puts rc.message + else + puts "scheduled" end end fd.close @@ -77,28 +120,13 @@ end # PARAM: OpenNebula::VirtualMachine object # AIM: Resum a suspended virtual machines # -def _do_resume(vm, wait, force=FALSE) - if force - vm.resume +def _do_resume(vm) + print("Resume #{vm.id} - #{vm.name}... ") + rc = vm.resume + if OpenNebula.is_error?(rc) + puts rc.message else - if vm.status == "susp" - puts("Resume on #{vm.name}") - vm.resume - # elsif vm.status == 'save' - # puts("Recover on #{vm.name}") - # # Try to recover VM with retry action - # vm.recover(2) - # vm.resume - elsif vm.status == 'unkn' - puts("Resume on #{vm.name}") - vm.resume - else - return -1 - end - end - - if wait - _wait(vm, "runn") + puts "scheduled" end end @@ -107,65 +135,65 @@ options = {:creds => nil, :action => nil, :endpoint => nil, :timeout => nil} parser = OptionParser.new do|opts| - opts.banner = "Usage: #{File.basename(__FILE__)} [options]" - opts.on('-c', '--creds file', 'Crediential file') do |value| - options[:creds] = value; - end + opts.banner = "Usage: #{File.basename(__FILE__)} [options]" + opts.on('-c', '--creds file', 'Crediential file') do |value| + options[:creds] = value; + end - opts.on('-a', '--action action', 'Action to run') do |value| - options[:action] = value; - end + opts.on('-a', '--action action', 'Action to run') do |value| + options[:action] = value; + end - opts.on('-e', '--end-point url', 'End point URL') do |value| - options[:endpoint] = value; - end + opts.on('-e', 
'--end-point url', 'End point URL') do |value| + options[:endpoint] = value; + end - opts.on('-t', '--timeout timeout', 'Timeout for opennebula connection') do |value| - options[:timeout] = value.to_i; - end + opts.on('-t', '--timeout timeout', 'Timeout for opennebula connection') do |value| + options[:timeout] = value.to_i; + end - opts.on('-w', '--wait', 'Wait for action ends') do |w| - options[:wait] = w - end - - opts.on('-h', '--help', 'Displays Help') do - puts opts - exit - end + opts.on('-w', '--wait timeout', 'Wait for action ends') do |value| + options[:wait] = value.to_i + end + opts.on('-h', '--help', 'Displays Help') do + puts opts + exit + end end parser.parse! # OpenNebula credentials - if not options[:creds] options[:creds] = "/var/lib/one/.one/one_auth" end if not options[:action] - options[:action] = "status" + options[:action] = "status" end if not options[:endpoint] - ip = CreoleGet('adresse_ip_eth0').chomp - options[:endpoint] = "http://#{ip}:2633/RPC2" + ip = CreoleGet('adresse_ip_eth0').chomp + options[:endpoint] = "http://#{ip}:2633/RPC2" end if not options[:timeout] - options[:timeout] = TIMEOUT + options[:timeout] = TIMEOUT end -# Actions -SUPPORTED = ['status', 'boot', 'resume', 'shutdown', 'suspend'] - - -if not SUPPORTED.include?(options[:action]) - puts("Action : #{options[:action]}) is not supported") - exit(-1) +if not options[:wait] + options[:wait] = MAXWAIT end + +if not ACTIONS.include?(options[:action]) + puts("Action : #{options[:action]}) is not supported") + exit(-1) +end + + begin File.readlines(options[:creds]).each do |line| CREDENTIALS = line @@ -175,54 +203,77 @@ rescue exit(-1) end + +exit_code = 0 begin - client = Client.new(CREDENTIALS, options[:endpoint]) + client = Client.new(CREDENTIALS, options[:endpoint]) - vm_pool = VirtualMachinePool.new(client, USERFLAG) + vm_pool = VirtualMachinePool.new(client, USERFLAG) - if File.exist?(RUNVMFILE) - running_vms = File.readlines(RUNVMFILE) - else - running_vms = [] - end - - 
rc = vm_pool.info - cnt = 0 - while OpenNebula.is_error?(rc) - if cnt == options[:timeout] - puts rc.message - exit(-1) - end + # Try to load vm pool infos from OpenNebula until timeout expires rc = vm_pool.info - sleep(1) - cnt += 1 - end - - vm_pool.each do |vm| - case options[:action] - when "status" - puts("#{vm.name}\t#{vm.status}") - when "boot" - puts("DEBUG #{vm.status}") - if vm.status == "unkn" - puts("Booting #{vm.name} ...") - vm.boot - end - when "suspend" - _do_suspend(vm, options[:wait]) - when "resume" - if running_vms.include?("#{vm.id}\n") - _do_resume(vm, options[:wait], TRUE) - end - else - puts("#{vm.name}\t#{vm.status}") + cnt = 0 + while OpenNebula.is_error?(rc) + if cnt == options[:timeout] + puts rc.message + exit(-1) + end + rc = vm_pool.info + sleep(1) + cnt += 1 end - end - if options[:action] == "resume" - File.truncate(RUNVMFILE, 0) if File.exists?(RUNVMFILE) - end + + if options[:action] == "resume" + running_vms = dump_running_vms_file() + running_vms.each do |vmid| + vm = VirtualMachine.new_with_id(vmid, client) + vm.info + _do_resume(vm) + end + + else + vm_pool.each do |vm| + case options[:action] + when "status" + puts "#{vm.name}\t#{vm.status}" + + when "suspend" + _do_suspend(vm) + end + end + + # Update list of suspended VMs + running_vms = dump_running_vms_file() + end + + if options[:action] != 'status' + vms = [] + running_vms.each do |vmid| + vm = VirtualMachine.new_with_id(vmid, client) + vms.push(vm) + end + exit_code = _do_wait(vms, options[:action], options[:wait]) + end + + if options[:action] == "resume" + if vms.empty? + File.truncate(RUNVMFILE, 0) if File.exists?(RUNVMFILE) + else + fd = File.open(RUNVMFILE,'w') + vms.each do |vm| + fd.write("#{vm.id}\n") + end + end + end + rescue Exception => e - puts e.message - exit(-1) + puts e.message + puts e.backtrace + exit(-1) end -exit 0 + +exit(exit_code) + +# Local Variables: +# ruby-indent-level: 4 +# End: