The VMs are not always resumed on reboot

The “libvirt-guests” service can conflict with “onevm-all” and we
don't need to wait for each VM to boot during start.

* posttemplate/10-libvirt-guests: disable the “libvirt-guests” service.

* init/onenode.service (ExecStart): use default wait timeout (60s).
  (TimeoutStartSec): wait for longer than the default timeout.
  (ExecReload): just try to resume any remaining VMs.
  (ExecStop): wait longer for VM to suspend.
  (TimeoutStopSec): wait for longer than the stop timeout.

* scripts/onevm-all: schedule actions in parallel and wait globally
  for their executions.

Ref: #22155
This commit is contained in:
Daniel Dehennin 2017-12-20 16:32:13 +01:00
parent d9e9d2e81c
commit 340dd409e2
3 changed files with 182 additions and 118 deletions

View File

@ -8,11 +8,15 @@ After=multi-user.target
Type=oneshot
Environment=CREDS=/var/lib/one/.one/one_auth
Environment=ENDPOINT=http://127.0.0.1:2633/RPC2
TimeoutSec=1min
RemainAfterExit=yes
Restart=no
ExecStart=/usr/share/eole/sbin/onevm-all -t 20 -w -c ${CREDS} -e ${ENDPOINT} -a "resume"
ExecStop=/usr/share/eole/sbin/onevm-all -t 20 -w -c ${CREDS} -e ${ENDPOINT} -a "suspend"
ExecStart=/usr/share/eole/sbin/onevm-all -c ${CREDS} -e ${ENDPOINT} -a "resume"
# Allow the remaining VMs to be resumed remotely through a simple reload
ExecReload=/usr/share/eole/sbin/onevm-all -c ${CREDS} -e ${ENDPOINT} -a "resume"
ExecStop=/usr/share/eole/sbin/onevm-all -w 300 -c ${CREDS} -e ${ENDPOINT} -a "suspend"
# Keep some margin on top of the wait timeouts
TimeoutStartSec=120s
TimeoutStopSec=360s
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,9 @@
#!/bin/sh
# Stop, disable and mask the libvirt-guests service.
# Error output from systemctl is discarded and the script always exits 0,
# so a failing step never makes this hook fail.
echo "Disable and mask libvirt-guests service"

systemctl stop libvirt-guests.service 2> /dev/null
systemctl disable libvirt-guests.service 2> /dev/null
systemctl mask libvirt-guests.service 2> /dev/null

exit 0

View File

@ -1,5 +1,8 @@
#!/usr/bin/env ruby
# Do not buffer output
STDOUT.sync = TRUE
##############################################################################
# Environment Configuration
##############################################################################
@ -30,19 +33,30 @@ include OpenNebula
MAXWAIT=60
INTERVAL=1
def _wait(vm, st)
wait = 0
while vm.status != st
vm.info
if vm.status == 'unkn'
break
end
wait += INTERVAL
sleep(INTERVAL)
if wait >= MAXWAIT
break
end
# List of supported actions
ACTIONS = [
'status', # Get the status of all VMs in OpenNebula VM pool
'suspend', # Suspend all VMs in RUNNING state
'resume', # Resume all VMs in SUSPENDED or UNKNOWN state
]
# Map each action with a target state
EXPECTED_STATUS_MAP = {
'status' => nil,
'boot' => 'runn',
'suspend' => 'susp',
'resume' => 'runn'
}
#
# NAME: dump_running_vms_file
# PARAM: none
# AIM: Return the distinct lines stored in RUNVMFILE, or an empty list
#      when that file does not exist. Lines are returned as read, so
#      each one keeps its trailing newline.
#
def dump_running_vms_file()
    return [] unless File.exist?(RUNVMFILE)

    File.readlines(RUNVMFILE).uniq
end
def CreoleGet(variable)
@ -54,19 +68,48 @@ def CreoleGet(variable)
end
end
#
# NAME: _do_wait
# PARAM: vms: list of objects answering to `info` and `status`
#        action: key of EXPECTED_STATUS_MAP giving the status to wait for
#        maxwait: maximum number of seconds to wait
# AIM: Poll the VMs once per second until all of them report the status
#      expected for the action, or until maxwait seconds have elapsed.
#      VMs that reached the expected status are removed from `vms`, so
#      on return the list only holds the VMs that did not make it.
# RETURN: 0 when every VM reached the expected status, -1 otherwise
#
def _do_wait(vms, action, maxwait)
    if maxwait == 0 && action == 'resume'
        # User explicitly does not want to wait: report nothing pending
        vms.clear
        return 0
    end

    print "Wait #{maxwait}s for VMs to #{action}"

    expected = EXPECTED_STATUS_MAP[action]
    (0..maxwait).each do |elapsed|
        vms.delete_if do |vm|
            # Refresh the VM before reading its status
            vm.info
            vm.status == expected
        end
        # Stop right after the last poll: sleeping once more without
        # checking again would only delay the failure report.
        break if vms.empty? || elapsed == maxwait

        print "."
        sleep(1)
    end

    if vms.empty?
        puts " OK"
        0
    else
        puts " FAIL"
        -1
    end
end
#
# NAME: _do_suspend
# PARAM: OpenNebula::VirtualMachine object
# AIM: Suspend a virtual machine
#
def _do_suspend(vm, wait)
def _do_suspend(vm)
fd = File.open(RUNVMFILE,'a')
if vm.status == "runn"
puts("Suspending #{vm.name} ...")
puts("Suspending #{vm.id} - #{vm.name}... ")
fd.write("#{vm.id}\n")
vm.suspend
if wait
_wait(vm, "susp")
rc = vm.suspend
if OpenNebula.is_error?(rc)
puts rc.message
else
puts "scheduled"
end
end
fd.close
@ -77,28 +120,13 @@ end
# PARAM: OpenNebula::VirtualMachine object
# AIM: Resume a suspended virtual machine
#
def _do_resume(vm, wait, force=FALSE)
if force
vm.resume
def _do_resume(vm)
print("Resume #{vm.id} - #{vm.name}... ")
rc = vm.resume
if OpenNebula.is_error?(rc)
puts rc.message
else
if vm.status == "susp"
puts("Resume on #{vm.name}")
vm.resume
# elsif vm.status == 'save'
# puts("Recover on #{vm.name}")
# # Try to recover VM with retry action
# vm.recover(2)
# vm.resume
elsif vm.status == 'unkn'
puts("Resume on #{vm.name}")
vm.resume
else
return -1
end
end
if wait
_wait(vm, "runn")
puts "scheduled"
end
end
@ -107,65 +135,65 @@ options = {:creds => nil, :action => nil, :endpoint => nil,
:timeout => nil}
parser = OptionParser.new do|opts|
opts.banner = "Usage: #{File.basename(__FILE__)} [options]"
opts.on('-c', '--creds file', 'Crediential file') do |value|
options[:creds] = value;
end
opts.banner = "Usage: #{File.basename(__FILE__)} [options]"
opts.on('-c', '--creds file', 'Crediential file') do |value|
options[:creds] = value;
end
opts.on('-a', '--action action', 'Action to run') do |value|
options[:action] = value;
end
opts.on('-a', '--action action', 'Action to run') do |value|
options[:action] = value;
end
opts.on('-e', '--end-point url', 'End point URL') do |value|
options[:endpoint] = value;
end
opts.on('-e', '--end-point url', 'End point URL') do |value|
options[:endpoint] = value;
end
opts.on('-t', '--timeout timeout', 'Timeout for opennebula connection') do |value|
options[:timeout] = value.to_i;
end
opts.on('-t', '--timeout timeout', 'Timeout for opennebula connection') do |value|
options[:timeout] = value.to_i;
end
opts.on('-w', '--wait', 'Wait for action ends') do |w|
options[:wait] = w
end
opts.on('-h', '--help', 'Displays Help') do
puts opts
exit
end
opts.on('-w', '--wait timeout', 'Wait for action ends') do |value|
options[:wait] = value.to_i
end
opts.on('-h', '--help', 'Displays Help') do
puts opts
exit
end
end
parser.parse!
# OpenNebula credentials
if not options[:creds]
options[:creds] = "/var/lib/one/.one/one_auth"
end
if not options[:action]
options[:action] = "status"
options[:action] = "status"
end
if not options[:endpoint]
ip = CreoleGet('adresse_ip_eth0').chomp
options[:endpoint] = "http://#{ip}:2633/RPC2"
ip = CreoleGet('adresse_ip_eth0').chomp
options[:endpoint] = "http://#{ip}:2633/RPC2"
end
if not options[:timeout]
options[:timeout] = TIMEOUT
options[:timeout] = TIMEOUT
end
# Actions
SUPPORTED = ['status', 'boot', 'resume', 'shutdown', 'suspend']
if not SUPPORTED.include?(options[:action])
puts("Action : #{options[:action]}) is not supported")
exit(-1)
if not options[:wait]
options[:wait] = MAXWAIT
end
if not ACTIONS.include?(options[:action])
puts("Action : #{options[:action]}) is not supported")
exit(-1)
end
begin
File.readlines(options[:creds]).each do |line|
CREDENTIALS = line
@ -175,54 +203,77 @@ rescue
exit(-1)
end
exit_code = 0
begin
client = Client.new(CREDENTIALS, options[:endpoint])
client = Client.new(CREDENTIALS, options[:endpoint])
vm_pool = VirtualMachinePool.new(client, USERFLAG)
vm_pool = VirtualMachinePool.new(client, USERFLAG)
if File.exist?(RUNVMFILE)
running_vms = File.readlines(RUNVMFILE)
else
running_vms = []
end
rc = vm_pool.info
cnt = 0
while OpenNebula.is_error?(rc)
if cnt == options[:timeout]
puts rc.message
exit(-1)
end
# Try to load vm pool infos from OpenNebula until timeout expires
rc = vm_pool.info
sleep(1)
cnt += 1
end
vm_pool.each do |vm|
case options[:action]
when "status"
puts("#{vm.name}\t#{vm.status}")
when "boot"
puts("DEBUG #{vm.status}")
if vm.status == "unkn"
puts("Booting #{vm.name} ...")
vm.boot
end
when "suspend"
_do_suspend(vm, options[:wait])
when "resume"
if running_vms.include?("#{vm.id}\n")
_do_resume(vm, options[:wait], TRUE)
end
else
puts("#{vm.name}\t#{vm.status}")
cnt = 0
while OpenNebula.is_error?(rc)
if cnt == options[:timeout]
puts rc.message
exit(-1)
end
rc = vm_pool.info
sleep(1)
cnt += 1
end
end
if options[:action] == "resume"
File.truncate(RUNVMFILE, 0) if File.exists?(RUNVMFILE)
end
if options[:action] == "resume"
running_vms = dump_running_vms_file()
running_vms.each do |vmid|
vm = VirtualMachine.new_with_id(vmid, client)
vm.info
_do_resume(vm)
end
else
vm_pool.each do |vm|
case options[:action]
when "status"
puts "#{vm.name}\t#{vm.status}"
when "suspend"
_do_suspend(vm)
end
end
# Update list of suspended VMs
running_vms = dump_running_vms_file()
end
if options[:action] != 'status'
vms = []
running_vms.each do |vmid|
vm = VirtualMachine.new_with_id(vmid, client)
vms.push(vm)
end
exit_code = _do_wait(vms, options[:action], options[:wait])
end
if options[:action] == "resume"
if vms.empty?
File.truncate(RUNVMFILE, 0) if File.exists?(RUNVMFILE)
else
fd = File.open(RUNVMFILE,'w')
vms.each do |vm|
fd.write("#{vm.id}\n")
end
end
end
rescue Exception => e
puts e.message
exit(-1)
puts e.message
puts e.backtrace
exit(-1)
end
exit 0
exit(exit_code)
# Local Variables:
# ruby-indent-level: 4
# End: