The VMs are not always resumed on reboot

The “libvirt-guests” service can conflict with “onevm-all” and we
don't need to wait for each VM to boot during start.

* posttemplate/10-libvirt-guests: disable the “libvirt-guests” service.

* init/onenode.service (ExecStart): use default wait timeout (60s).
  (TimeoutStartSec): wait for longer than the default timeout.
  (ExecReload): just try to resume any remaining VMs.
  (ExecStop): wait longer for VM to suspend.
  (TimeoutStopSec): wait for longer than the stop timeout.

* scripts/onevm-all: schedule actions in parallel and wait globally
  for their executions.

Ref: #22155
This commit is contained in:
Daniel Dehennin 2017-12-20 16:32:13 +01:00
parent d9e9d2e81c
commit 340dd409e2
3 changed files with 182 additions and 118 deletions

View File

@ -8,11 +8,15 @@ After=multi-user.target
Type=oneshot Type=oneshot
Environment=CREDS=/var/lib/one/.one/one_auth Environment=CREDS=/var/lib/one/.one/one_auth
Environment=ENDPOINT=http://127.0.0.1:2633/RPC2 Environment=ENDPOINT=http://127.0.0.1:2633/RPC2
TimeoutSec=1min
RemainAfterExit=yes RemainAfterExit=yes
Restart=no Restart=no
ExecStart=/usr/share/eole/sbin/onevm-all -t 20 -w -c ${CREDS} -e ${ENDPOINT} -a "resume" ExecStart=/usr/share/eole/sbin/onevm-all -c ${CREDS} -e ${ENDPOINT} -a "resume"
ExecStop=/usr/share/eole/sbin/onevm-all -t 20 -w -c ${CREDS} -e ${ENDPOINT} -a "suspend" # Permit to start remaining VMs at distance by a simple restart
ExecReload=/usr/share/eole/sbin/onevm-all -c ${CREDS} -e ${ENDPOINT} -a "resume"
ExecStop=/usr/share/eole/sbin/onevm-all -w 300 -c ${CREDS} -e ${ENDPOINT} -a "suspend"
# Keep some margin with the timeouts
TimeoutStartSec=120s
TimeoutStopSec=360s
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View File

@ -0,0 +1,9 @@
#!/bin/sh
# Stop, disable and mask the libvirt-guests service so that it cannot
# conflict with onevm-all.
#
# Each systemctl call has its stderr discarded and its exit status
# ignored; the script always exits 0.
echo "Disable and mask libvirt-guests service"
systemctl stop libvirt-guests.service 2> /dev/null
systemctl disable libvirt-guests.service 2> /dev/null
systemctl mask libvirt-guests.service 2> /dev/null
exit 0

View File

@ -1,5 +1,8 @@
#!/usr/bin/env ruby #!/usr/bin/env ruby
# Do not buffer output
STDOUT.sync = TRUE
############################################################################## ##############################################################################
# Environment Configuration # Environment Configuration
############################################################################## ##############################################################################
@ -30,19 +33,30 @@ include OpenNebula
MAXWAIT=60 MAXWAIT=60
INTERVAL=1 INTERVAL=1
def _wait(vm, st) # List of supported actions
wait = 0 ACTIONS = [
while vm.status != st 'status', # Get the status of all VMs in OpenNebula VM pool
vm.info 'suspend', # Suspend all VMs in RUNNING state
if vm.status == 'unkn' 'resume', # Resume all VMs in SUSPENDED or UNKNOWN state
break ]
end
wait += INTERVAL
sleep(INTERVAL) # Map each action with a target state
if wait >= MAXWAIT EXPECTED_STATUS_MAP = {
break 'status' => nil,
end 'boot' => 'runn',
'suspend' => 'susp',
'resume' => 'runn'
}
def dump_running_vms_file()
if File.exist?(RUNVMFILE)
running_vms = File.readlines(RUNVMFILE).uniq
else
running_vms = []
end end
return running_vms
end end
def CreoleGet(variable) def CreoleGet(variable)
@ -54,19 +68,48 @@ def CreoleGet(variable)
end end
end end
#
# NAME: _do_wait
# PARAM: vms     - Array of VM objects responding to `info` and `status`;
#                  it is modified in place: every VM that reached the
#                  expected status is removed from it
#        action  - name of the scheduled action, used as key in
#                  EXPECTED_STATUS_MAP to find the target status
#        maxwait - maximum number of seconds to wait
# AIM: Poll all VMs once per second until each one reached the status
#      expected for the action or until maxwait seconds have elapsed
# RETURN: 0 when `vms` is empty at the end, -1 otherwise (the VMs still
#         in `vms` are the ones that did not reach the expected status)
#
def _do_wait(vms, action, maxwait)
    if maxwait == 0 && action == 'resume'
        # User explicitly doesn't want to wait: consider every VM as done
        vms.clear
        return 0
    end

    expected_status = EXPECTED_STATUS_MAP[action]

    print "Wait #{maxwait}s for VMs to #{action}"
    (0..maxwait).each do |elapsed|
        vms.delete_if do |vm|
            # Refresh the VM informations before looking at its status
            vm.info
            vm.status == expected_status
        end
        # Do not sleep after the last poll: nothing would be checked
        # afterwards and it would only lengthen the wait by one second
        break if vms.empty? || elapsed == maxwait
        print "."
        sleep(1)
    end

    if vms.empty?
        puts " OK"
        return 0
    else
        puts " FAIL"
        return -1
    end
end
# #
# NAME: _do_suspend # NAME: _do_suspend
# PARAM: OpenNebula::VirtualMachine object # PARAM: OpenNebula::VirtualMachine object
# AIM: Suspend a virtual machine # AIM: Suspend a virtual machine
# #
def _do_suspend(vm, wait) def _do_suspend(vm)
fd = File.open(RUNVMFILE,'a') fd = File.open(RUNVMFILE,'a')
if vm.status == "runn" if vm.status == "runn"
puts("Suspending #{vm.name} ...") puts("Suspending #{vm.id} - #{vm.name}... ")
fd.write("#{vm.id}\n") fd.write("#{vm.id}\n")
vm.suspend rc = vm.suspend
if wait if OpenNebula.is_error?(rc)
_wait(vm, "susp") puts rc.message
else
puts "scheduled"
end end
end end
fd.close fd.close
@ -77,28 +120,13 @@ end
# PARAM: OpenNebula::VirtualMachine object # PARAM: OpenNebula::VirtualMachine object
# AIM: Resum a suspended virtual machines # AIM: Resum a suspended virtual machines
# #
def _do_resume(vm, wait, force=FALSE) def _do_resume(vm)
if force print("Resume #{vm.id} - #{vm.name}... ")
vm.resume rc = vm.resume
if OpenNebula.is_error?(rc)
puts rc.message
else else
if vm.status == "susp" puts "scheduled"
puts("Resume on #{vm.name}")
vm.resume
# elsif vm.status == 'save'
# puts("Recover on #{vm.name}")
# # Try to recover VM with retry action
# vm.recover(2)
# vm.resume
elsif vm.status == 'unkn'
puts("Resume on #{vm.name}")
vm.resume
else
return -1
end
end
if wait
_wait(vm, "runn")
end end
end end
@ -124,8 +152,8 @@ parser = OptionParser.new do|opts|
options[:timeout] = value.to_i; options[:timeout] = value.to_i;
end end
opts.on('-w', '--wait', 'Wait for action ends') do |w| opts.on('-w', '--wait timeout', 'Wait for action ends') do |value|
options[:wait] = w options[:wait] = value.to_i
end end
opts.on('-h', '--help', 'Displays Help') do opts.on('-h', '--help', 'Displays Help') do
@ -133,13 +161,11 @@ parser = OptionParser.new do|opts|
exit exit
end end
end end
parser.parse! parser.parse!
# OpenNebula credentials # OpenNebula credentials
if not options[:creds] if not options[:creds]
options[:creds] = "/var/lib/one/.one/one_auth" options[:creds] = "/var/lib/one/.one/one_auth"
end end
@ -157,15 +183,17 @@ if not options[:timeout]
options[:timeout] = TIMEOUT options[:timeout] = TIMEOUT
end end
# Actions if not options[:wait]
SUPPORTED = ['status', 'boot', 'resume', 'shutdown', 'suspend'] options[:wait] = MAXWAIT
end
if not SUPPORTED.include?(options[:action]) if not ACTIONS.include?(options[:action])
puts("Action : #{options[:action]}) is not supported") puts("Action : #{options[:action]}) is not supported")
exit(-1) exit(-1)
end end
begin begin
File.readlines(options[:creds]).each do |line| File.readlines(options[:creds]).each do |line|
CREDENTIALS = line CREDENTIALS = line
@ -175,17 +203,14 @@ rescue
exit(-1) exit(-1)
end end
exit_code = 0
begin begin
client = Client.new(CREDENTIALS, options[:endpoint]) client = Client.new(CREDENTIALS, options[:endpoint])
vm_pool = VirtualMachinePool.new(client, USERFLAG) vm_pool = VirtualMachinePool.new(client, USERFLAG)
if File.exist?(RUNVMFILE) # Try to load vm pool infos from OpenNebula until timeout expires
running_vms = File.readlines(RUNVMFILE)
else
running_vms = []
end
rc = vm_pool.info rc = vm_pool.info
cnt = 0 cnt = 0
while OpenNebula.is_error?(rc) while OpenNebula.is_error?(rc)
@ -198,31 +223,57 @@ begin
cnt += 1 cnt += 1
end end
if options[:action] == "resume"
running_vms = dump_running_vms_file()
running_vms.each do |vmid|
vm = VirtualMachine.new_with_id(vmid, client)
vm.info
_do_resume(vm)
end
else
vm_pool.each do |vm| vm_pool.each do |vm|
case options[:action] case options[:action]
when "status" when "status"
puts("#{vm.name}\t#{vm.status}") puts "#{vm.name}\t#{vm.status}"
when "boot"
puts("DEBUG #{vm.status}")
if vm.status == "unkn"
puts("Booting #{vm.name} ...")
vm.boot
end
when "suspend" when "suspend"
_do_suspend(vm, options[:wait]) _do_suspend(vm)
when "resume"
if running_vms.include?("#{vm.id}\n")
_do_resume(vm, options[:wait], TRUE)
end
else
puts("#{vm.name}\t#{vm.status}")
end end
end end
# Update list of suspended VMs
running_vms = dump_running_vms_file()
end
if options[:action] != 'status'
vms = []
running_vms.each do |vmid|
vm = VirtualMachine.new_with_id(vmid, client)
vms.push(vm)
end
exit_code = _do_wait(vms, options[:action], options[:wait])
end
if options[:action] == "resume" if options[:action] == "resume"
if vms.empty?
File.truncate(RUNVMFILE, 0) if File.exists?(RUNVMFILE) File.truncate(RUNVMFILE, 0) if File.exists?(RUNVMFILE)
else
fd = File.open(RUNVMFILE,'w')
vms.each do |vm|
fd.write("#{vm.id}\n")
end end
end
end
rescue Exception => e rescue Exception => e
puts e.message puts e.message
puts e.backtrace
exit(-1) exit(-1)
end end
exit 0
exit(exit_code)
# Local Variables:
# ruby-indent-level: 4
# End: