Browse Source

The VMs are not always resumed on reboot

The “libvirt-guests” service can conflict with “onevm-all” and we
don't need to wait for each VM to boot during start.

* posttemplate/10-libvirt-guests: disable the “libvirt-guests” service.

* init/onenode.service (ExecStart): use default wait timeout (60s).
  (TimeoutStartSec): wait for longer than the default timeout.
  (ExecReload): just try to resume any remaining VMs.
  (ExecStop): wait longer for VMs to suspend.
  (TimeoutStopSec): wait for longer than the stop timeout.

* scripts/onevm-all: schedule actions in parallel and wait globally
  for their executions.

Ref: #22155
Daniel Dehennin 1 year ago
parent
commit
340dd409e2
3 changed files with 179 additions and 115 deletions
  1. 7
    3
      init/onenode.service
  2. 9
    0
      posttemplate/10-libvirt-guests
  3. 163
    112
      scripts/onevm-all

+ 7
- 3
init/onenode.service View File

@@ -8,11 +8,15 @@ After=multi-user.target
8 8
 Type=oneshot
9 9
 Environment=CREDS=/var/lib/one/.one/one_auth
10 10
 Environment=ENDPOINT=http://127.0.0.1:2633/RPC2
11
-TimeoutSec=1min
12 11
 RemainAfterExit=yes
13 12
 Restart=no
14
-ExecStart=/usr/share/eole/sbin/onevm-all -t 20 -w -c ${CREDS} -e ${ENDPOINT} -a "resume"
15
-ExecStop=/usr/share/eole/sbin/onevm-all -t 20 -w -c ${CREDS} -e ${ENDPOINT} -a "suspend"
13
+ExecStart=/usr/share/eole/sbin/onevm-all -c ${CREDS} -e ${ENDPOINT} -a "resume"
14
+# Allow resuming the remaining VMs remotely with a simple reload
15
+ExecReload=/usr/share/eole/sbin/onevm-all -c ${CREDS} -e ${ENDPOINT} -a "resume"
16
+ExecStop=/usr/share/eole/sbin/onevm-all -w 300 -c ${CREDS} -e ${ENDPOINT} -a "suspend"
17
+# Keep some margin above the onevm-all wait timeouts
18
+TimeoutStartSec=120s
19
+TimeoutStopSec=360s
16 20
 
17 21
 [Install]
18 22
 WantedBy=multi-user.target

+ 9
- 0
posttemplate/10-libvirt-guests View File

@@ -0,0 +1,9 @@
1
+#!/bin/sh
2
+
3
+echo "Disable and mask libvirt-guests service"
4
+for action in stop disable mask
5
+do
6
+    systemctl ${action} libvirt-guests.service 2> /dev/null
7
+done
8
+
9
+exit 0

+ 163
- 112
scripts/onevm-all View File

@@ -1,5 +1,8 @@
1 1
 #!/usr/bin/env ruby
2 2
 
3
+# Do not buffer output
4
+STDOUT.sync = TRUE
5
+
3 6
 ##############################################################################
4 7
 # Environment Configuration
5 8
 ##############################################################################
@@ -30,19 +33,30 @@ include OpenNebula
30 33
 MAXWAIT=60
31 34
 INTERVAL=1
32 35
 
33
-def _wait(vm, st)
34
-    wait = 0
35
-    while vm.status != st
36
-        vm.info
37
-        if vm.status == 'unkn'
38
-          break
39
-        end
40
-        wait += INTERVAL
41
-        sleep(INTERVAL)
42
-        if wait >= MAXWAIT
43
-            break
44
-        end
36
+# List of supported actions
37
+ACTIONS = [
38
+  'status',  # Get the status of all VMs in OpenNebula VM pool
39
+  'suspend', # Suspend all VMs in RUNNING state
40
+  'resume',  # Resume all VMs in SUSPENDED or UNKNOWN state
41
+]
42
+
43
+
44
+# Map each action to its target state
45
+EXPECTED_STATUS_MAP = {
46
+  'status'  => nil,
47
+  'boot'    => 'runn',
48
+  'suspend' => 'susp',
49
+  'resume'  => 'runn'
50
+}
51
+
52
+def dump_running_vms_file()
53
+    if File.exist?(RUNVMFILE)
54
+        running_vms = File.readlines(RUNVMFILE).uniq
55
+    else
56
+        running_vms = []
45 57
     end
58
+
59
+    return running_vms
46 60
 end
47 61
 
48 62
 def CreoleGet(variable)
@@ -54,19 +68,48 @@ def CreoleGet(variable)
54 68
     end
55 69
 end
56 70
 
71
+def _do_wait(vms, action, maxwait)
72
+    if maxwait == 0 and action == 'resume'
73
+        # User explicitly does not want to wait
74
+        vms.clear
75
+        return 0
76
+    end
77
+
78
+    print "Wait #{maxwait}s for VMs to #{action}"
79
+    for try in 0..maxwait
80
+        vms.delete_if do |vm|
81
+            vm.info
82
+            vm.status == EXPECTED_STATUS_MAP[action]
83
+        end
84
+        break if vms.empty?
85
+        print "."
86
+        sleep(1)
87
+    end
88
+    if vms.empty?
89
+        puts " OK"
90
+        return 0
91
+    else
92
+        puts " FAIL"
93
+        return -1
94
+    end
95
+end
96
+
97
+
57 98
 #
58 99
 # NAME: _do_suspend
59 100
 # PARAM: OpenNebula::VirtualMachine object
60 101
 # AIM: Suspend a virtual machine
61 102
 #
62
-def _do_suspend(vm, wait)
103
+def _do_suspend(vm)
63 104
     fd = File.open(RUNVMFILE,'a')
64 105
     if vm.status == "runn"
65
-        puts("Suspending #{vm.name} ...")
106
+        puts("Suspending #{vm.id} - #{vm.name}... ")
66 107
         fd.write("#{vm.id}\n")
67
-        vm.suspend
68
-        if wait
69
-            _wait(vm, "susp")
108
+        rc = vm.suspend
109
+        if OpenNebula.is_error?(rc)
110
+            puts rc.message
111
+        else
112
+            puts "scheduled"
70 113
         end
71 114
     end
72 115
     fd.close
@@ -77,28 +120,13 @@ end
77 120
 # PARAM: OpenNebula::VirtualMachine object
78 121
 # AIM: Resume a suspended virtual machine
79 122
 #
80
-def _do_resume(vm, wait, force=FALSE)
81
-    if force
82
-      vm.resume
123
+def _do_resume(vm)
124
+    print("Resume #{vm.id} - #{vm.name}... ")
125
+    rc = vm.resume
126
+    if OpenNebula.is_error?(rc)
127
+        puts rc.message
83 128
     else
84
-      if vm.status == "susp"
85
-        puts("Resume on #{vm.name}")
86
-        vm.resume
87
-        #    elsif vm.status == 'save'
88
-        #      puts("Recover on #{vm.name}")
89
-        #      # Try to recover VM with retry action
90
-        #      vm.recover(2)
91
-        #      vm.resume
92
-      elsif vm.status == 'unkn'
93
-        puts("Resume on #{vm.name}")
94
-        vm.resume
95
-      else
96
-        return -1
97
-      end
98
-    end
99
-
100
-    if wait
101
-      _wait(vm, "runn")
129
+        puts "scheduled"
102 130
     end
103 131
 end
104 132
 
@@ -107,65 +135,65 @@ options = {:creds => nil, :action => nil, :endpoint => nil,
107 135
            :timeout => nil}
108 136
 
109 137
 parser = OptionParser.new do|opts|
110
-  opts.banner = "Usage: #{File.basename(__FILE__)} [options]"
111
-  opts.on('-c', '--creds file', 'Crediential file') do |value|
112
-    options[:creds] = value;
113
-  end
114
-
115
-  opts.on('-a', '--action action', 'Action to run') do |value|
116
-    options[:action] = value;
117
-  end
138
+    opts.banner = "Usage: #{File.basename(__FILE__)} [options]"
139
+    opts.on('-c', '--creds file', 'Crediential file') do |value|
140
+        options[:creds] = value;
141
+    end
118 142
 
119
-  opts.on('-e', '--end-point url', 'End point URL') do |value|
120
-    options[:endpoint] = value;
121
-  end
143
+    opts.on('-a', '--action action', 'Action to run') do |value|
144
+        options[:action] = value;
145
+    end
122 146
 
123
-  opts.on('-t', '--timeout timeout', 'Timeout for opennebula connection') do |value|
124
-      options[:timeout] = value.to_i;
125
-  end
147
+    opts.on('-e', '--end-point url', 'End point URL') do |value|
148
+        options[:endpoint] = value;
149
+    end
126 150
 
127
-  opts.on('-w', '--wait', 'Wait for action ends') do |w|
128
-      options[:wait] = w
129
-  end
151
+    opts.on('-t', '--timeout timeout', 'Timeout for opennebula connection') do |value|
152
+        options[:timeout] = value.to_i;
153
+    end
130 154
 
131
-  opts.on('-h', '--help', 'Displays Help') do
132
-    puts opts
133
-    exit
134
-  end
155
+    opts.on('-w', '--wait timeout', 'Wait for action ends') do |value|
156
+        options[:wait] = value.to_i
157
+    end
135 158
 
159
+    opts.on('-h', '--help', 'Displays Help') do
160
+        puts opts
161
+        exit
162
+    end
136 163
 
137 164
 end
138 165
 
139 166
 parser.parse!
140 167
 
141 168
 # OpenNebula credentials
142
-
143 169
 if not options[:creds]
144 170
     options[:creds] = "/var/lib/one/.one/one_auth"
145 171
 end
146 172
 
147 173
 if not options[:action]
148
-  options[:action] = "status"
174
+    options[:action] = "status"
149 175
 end
150 176
 
151 177
 if not options[:endpoint]
152
-  ip = CreoleGet('adresse_ip_eth0').chomp
153
-  options[:endpoint] = "http://#{ip}:2633/RPC2"
178
+    ip = CreoleGet('adresse_ip_eth0').chomp
179
+    options[:endpoint] = "http://#{ip}:2633/RPC2"
154 180
 end
155 181
 
156 182
 if not options[:timeout]
157
-  options[:timeout] = TIMEOUT
183
+    options[:timeout] = TIMEOUT
158 184
 end
159 185
 
160
-# Actions
161
-SUPPORTED = ['status', 'boot', 'resume', 'shutdown', 'suspend']
186
+if not options[:wait]
187
+    options[:wait] = MAXWAIT
188
+end
162 189
 
163 190
 
164
-if not SUPPORTED.include?(options[:action])
165
-  puts("Action : #{options[:action]}) is not supported")
166
-  exit(-1)
191
+if not ACTIONS.include?(options[:action])
192
+    puts("Action : #{options[:action]}) is not supported")
193
+    exit(-1)
167 194
 end
168 195
 
196
+
169 197
 begin
170 198
     File.readlines(options[:creds]).each do |line|
171 199
         CREDENTIALS = line
@@ -175,54 +203,77 @@ rescue
175 203
     exit(-1)
176 204
 end
177 205
 
178
-begin
179
-  client = Client.new(CREDENTIALS, options[:endpoint])
180 206
 
181
-  vm_pool = VirtualMachinePool.new(client, USERFLAG)
207
+exit_code = 0
208
+begin
209
+    client = Client.new(CREDENTIALS, options[:endpoint])
182 210
 
183
-  if File.exist?(RUNVMFILE)
184
-    running_vms = File.readlines(RUNVMFILE)
185
-  else
186
-    running_vms = []
187
-  end
211
+    vm_pool = VirtualMachinePool.new(client, USERFLAG)
188 212
 
189
-  rc = vm_pool.info
190
-  cnt = 0
191
-  while OpenNebula.is_error?(rc)
192
-    if cnt == options[:timeout]
193
-        puts rc.message
194
-        exit(-1)
195
-    end
213
+    # Try to load vm pool infos from OpenNebula until timeout expires
196 214
     rc = vm_pool.info
197
-    sleep(1)
198
-    cnt += 1
199
-  end
200
-
201
-  vm_pool.each do |vm|
202
-    case options[:action]
203
-    when "status"
204
-      puts("#{vm.name}\t#{vm.status}")
205
-    when "boot"
206
-      puts("DEBUG #{vm.status}")
207
-      if vm.status == "unkn"
208
-        puts("Booting #{vm.name} ...")
209
-        vm.boot
210
-      end
211
-    when "suspend"
212
-        _do_suspend(vm, options[:wait])
213
-    when "resume"
214
-      if running_vms.include?("#{vm.id}\n")
215
-        _do_resume(vm, options[:wait], TRUE)
216
-      end
215
+    cnt = 0
216
+    while OpenNebula.is_error?(rc)
217
+        if cnt == options[:timeout]
218
+            puts rc.message
219
+            exit(-1)
220
+        end
221
+        rc = vm_pool.info
222
+        sleep(1)
223
+        cnt += 1
224
+    end
225
+
226
+    if options[:action] == "resume"
227
+        running_vms = dump_running_vms_file()
228
+        running_vms.each do |vmid|
229
+            vm = VirtualMachine.new_with_id(vmid, client)
230
+            vm.info
231
+            _do_resume(vm)
232
+        end
233
+
217 234
     else
218
-      puts("#{vm.name}\t#{vm.status}")
235
+        vm_pool.each do |vm|
236
+            case options[:action]
237
+            when "status"
238
+                puts "#{vm.name}\t#{vm.status}"
239
+
240
+            when "suspend"
241
+                _do_suspend(vm)
242
+            end
243
+        end
244
+
245
+        # Update list of suspended VMs
246
+        running_vms = dump_running_vms_file()
247
+    end
248
+
249
+    if options[:action] != 'status'
250
+        vms = []
251
+        running_vms.each do |vmid|
252
+            vm = VirtualMachine.new_with_id(vmid, client)
253
+            vms.push(vm)
254
+        end
255
+        exit_code = _do_wait(vms, options[:action], options[:wait])
256
+    end
257
+
258
+    if options[:action] == "resume"
259
+        if vms.empty?
260
+            File.truncate(RUNVMFILE, 0) if File.exists?(RUNVMFILE)
261
+        else
262
+            fd = File.open(RUNVMFILE,'w')
263
+            vms.each do |vm|
264
+                fd.write("#{vm.id}\n")
265
+            end
266
+        end
219 267
     end
220
-  end
221
-  if options[:action] == "resume"
222
-    File.truncate(RUNVMFILE, 0) if File.exists?(RUNVMFILE)
223
-  end
268
+
224 269
 rescue Exception => e
225
-  puts e.message
226
-  exit(-1)
270
+    puts e.message
271
+    puts e.backtrace
272
+    exit(-1)
227 273
 end
228
-exit 0
274
+
275
+exit(exit_code)
276
+
277
+# Local Variables:
278
+# ruby-indent-level: 4
279
+# End: