Manage a pool of nodes for a distributed test infrastructure
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

test_launcher.py 79KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958
  1. # Copyright (C) 2014 OpenStack Foundation
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  12. # implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import logging
  16. import math
  17. import time
  18. import fixtures
  19. import mock
  20. from nodepool import tests
  21. from nodepool import zk
  22. from nodepool.driver.fake import provider as fakeprovider
  23. from nodepool.nodeutils import iterate_timeout
  24. import nodepool.launcher
  25. from kazoo import exceptions as kze
  26. class TestLauncher(tests.DBTestCase):
  27. log = logging.getLogger("nodepool.TestLauncher")
  def test_node_assignment(self):
      '''
      A fulfilled request yields READY nodes allocated to that request.

      Also checks the metadata recorded on each node, that each node can
      be locked without blocking, and the node gauges that are reported.
      '''
      config_path = self.setup_config('node_no_min_ready.yaml')
      self.useBuilder(config_path)
      image = self.waitForImage('fake-provider', 'fake-image')
      self.assertEqual(image.username, 'zuul')

      nodepool.launcher.LOCK_CLEANUP = 1
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()

      request = zk.NodeRequest()
      request.state = zk.REQUESTED
      request.node_types.append('fake-label')
      self.zk.storeNodeRequest(request)

      request = self.waitForNodeRequest(request)
      self.assertEqual(request.state, zk.FULFILLED)
      self.assertNotEqual(request.nodes, [])

      expected_resources = {'cores': 4, 'instances': 1, 'ram': 8192}

      for node_id in request.nodes:
          node = self.zk.getNode(node_id)
          self.assertEqual(node.allocated_to, request.id)
          self.assertEqual(node.state, zk.READY)
          self.assertIsNotNone(node.launcher)
          self.assertEqual(node.cloud, 'fake')
          self.assertEqual(node.region, 'fake-region')
          self.assertEqual(node.az, "az1")
          self.assertEqual(node.username, "zuul")
          self.assertEqual(node.connection_type, 'ssh')
          self.assertEqual(node.connection_port, 22)

          # The node's image id is the upload path plus the upload id.
          upload_path = self.zk._imageUploadPath(
              image.image_name, image.build_id, image.provider_name)
          self.assertEqual(
              node.image_id,
              "{path}/{id}".format(path=upload_path, id=image.id))
          self.assertEqual(node.resources, expected_resources)

          # Presumably a non-blocking lock fails if the launcher still
          # holds the node, so this doubles as an "is unlocked" check.
          self.zk.lockNode(node, blocking=False)
          self.zk.unlockNode(node)

      # The request lock znode must still exist while the request does;
      # once the request is deleted, wait for the lock to be removed too.
      lock_path = self.zk._requestLockPath(request.id)
      self.assertIsNotNone(self.zk.client.exists(lock_path))
      self.zk.deleteNodeRequest(request)
      self.waitForNodeRequestLockDeletion(request.id)

      self.assertReportedStat('nodepool.nodes.ready', value='1', kind='g')
      self.assertReportedStat('nodepool.nodes.building', value='0', kind='g')
      self.assertReportedStat('nodepool.label.fake-label.nodes.ready',
                              value='1', kind='g')

      # Verify that we correctly initialized unused label stats to 0
      for state in ('building', 'testing', 'ready', 'in-use', 'used',
                    'hold', 'deleting', 'failed', 'init', 'aborted'):
          self.assertReportedStat(
              'nodepool.label.fake-label2.nodes.' + state,
              value='0', kind='g')
  def test_node_assignment_order(self):
      """Test that nodes are assigned in the order requested"""
      config_path = self.setup_config('node_many_labels.yaml')
      self.useBuilder(config_path)
      self.waitForImage('fake-provider', 'fake-image')

      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()

      # Wait until a node of every label exists before requesting any.
      for label in ('fake-label1', 'fake-label2',
                    'fake-label3', 'fake-label4'):
          self.waitForNodes(label)

      # Deliberately not in numeric order, so ordering is observable.
      requested_order = ('fake-label3', 'fake-label1',
                         'fake-label4', 'fake-label2')
      request = zk.NodeRequest()
      request.state = zk.REQUESTED
      for label in requested_order:
          request.node_types.append(label)
      self.zk.storeNodeRequest(request)

      request = self.waitForNodeRequest(request)
      self.assertEqual(request.state, zk.FULFILLED)
      self.assertEqual(4, len(request.nodes))

      assigned = [self.zk.getNode(node_id) for node_id in request.nodes]
      self.assertEqual(assigned[0].type, [requested_order[0]])
      self.assertEqual(assigned[1].type, [requested_order[1]])
      self.assertEqual(assigned[2].type, [requested_order[2]])
      self.assertEqual(assigned[3].type, [requested_order[3]])
  def _test_node_assignment_at_quota(self,
                                     config,
                                     max_cores=100,
                                     max_instances=20,
                                     max_ram=1000000):
      '''
      Successful node launch should have unlocked nodes in READY state
      and assigned to the request. This should be run with a quota that
      fits for two nodes.

      :param config: Name of the config fixture handed to setup_config.
      :param max_cores: Core quota returned by the patched fake cloud.
      :param max_instances: Instance quota returned by the patched fake
          cloud.
      :param max_ram: RAM quota returned by the patched fake cloud.
      '''
      # patch the cloud with requested quota
      def fake_get_quota():
          return (max_cores, max_instances, max_ram)
      self.useFixture(fixtures.MockPatchObject(
          fakeprovider.FakeProvider.fake_cloud, '_get_quota',
          fake_get_quota
      ))

      configfile = self.setup_config(config)
      self.useBuilder(configfile)
      self.waitForImage('fake-provider', 'fake-image')

      nodepool.launcher.LOCK_CLEANUP = 1
      pool = self.useNodepool(configfile, watermark_sleep=1)
      pool.start()
      self.wait_for_config(pool)

      client = pool.getProviderManager('fake-provider')._getClient()

      req1 = zk.NodeRequest()
      req1.state = zk.REQUESTED
      req1.node_types.append('fake-label')
      req1.node_types.append('fake-label')
      self.zk.storeNodeRequest(req1)

      self.log.debug("Waiting for 1st request %s", req1.id)
      req1 = self.waitForNodeRequest(req1, (zk.FULFILLED,))
      self.assertEqual(len(req1.nodes), 2)

      # Mark the first request's nodes as in use so they won't be deleted
      # when we pause. Locking them is enough.
      req1_node1 = self.zk.getNode(req1.nodes[0])
      req1_node2 = self.zk.getNode(req1.nodes[1])
      self.zk.lockNode(req1_node1, blocking=False)
      self.zk.lockNode(req1_node2, blocking=False)

      # One of the things we want to test is that if we spawn many
      # node launches at once, we do not deadlock while the request
      # handler pauses for quota. To ensure we test that case,
      # pause server creation until we have accepted all of the node
      # requests we submit. This will ensure that we hold locks on
      # all of the nodes before pausing so that we can validate they
      # are released.
      req2 = zk.NodeRequest()
      req2.state = zk.REQUESTED
      req2.node_types.append('fake-label')
      req2.node_types.append('fake-label')
      self.zk.storeNodeRequest(req2)
      req2 = self.waitForNodeRequest(req2, (zk.PENDING,))

      # At this point, we should have already created two servers for the
      # first request, and the request handler has accepted the second node
      # request but paused waiting for the server count to go below quota.
      # Wait until there is a paused request handler and check if there
      # are exactly two servers
      pool_worker = pool.getPoolWorkers('fake-provider')
      while not pool_worker[0].paused_handler:
          time.sleep(0.1)
      self.assertEqual(len(client._server_list), 2)

      # Mark the first request's nodes as USED, which will get them deleted
      # and allow the second to proceed.
      self.log.debug("Marking first node as used %s", req1.id)
      req1_node1.state = zk.USED
      self.zk.storeNode(req1_node1)
      self.zk.unlockNode(req1_node1)
      self.waitForNodeDeletion(req1_node1)

      # To force the sequential nature of what we're testing, wait for
      # the 2nd request to get a node allocated to it now that we've
      # freed up a node.
      self.log.debug("Waiting for node allocation for 2nd request")
      # Poll with a short sleep between scans instead of re-iterating
      # every node in a tight loop while nothing has changed.
      while not any(n.allocated_to == req2.id
                    for n in self.zk.nodeIterator()):
          time.sleep(0.1)

      self.log.debug("Marking second node as used %s", req1.id)
      req1_node2.state = zk.USED
      self.zk.storeNode(req1_node2)
      self.zk.unlockNode(req1_node2)
      self.waitForNodeDeletion(req1_node2)

      self.log.debug("Deleting 1st request %s", req1.id)
      self.zk.deleteNodeRequest(req1)
      self.waitForNodeRequestLockDeletion(req1.id)

      req2 = self.waitForNodeRequest(req2, (zk.FULFILLED,))
      self.assertEqual(len(req2.nodes), 2)
  def test_node_assignment_at_pool_quota_cores(self):
      """Run the at-quota scenario with 'node_quota_pool_cores.yaml'."""
      config_name = 'node_quota_pool_cores.yaml'
      self._test_node_assignment_at_quota(config=config_name)
  def test_node_assignment_at_pool_quota_instances(self):
      """Run the at-quota scenario with 'node_quota_pool_instances.yaml'."""
      config_name = 'node_quota_pool_instances.yaml'
      self._test_node_assignment_at_quota(config=config_name)
  def test_node_assignment_at_pool_quota_ram(self):
      """Run the at-quota scenario with 'node_quota_pool_ram.yaml'."""
      config_name = 'node_quota_pool_ram.yaml'
      self._test_node_assignment_at_quota(config=config_name)
  def test_node_assignment_at_cloud_cores_quota(self):
      """Run the at-quota scenario with the fake cloud limited to 8 cores."""
      cloud_quota = {
          'max_cores': 8,
          # check that -1 and inf work for no
          # quota
          'max_instances': -1,
          'max_ram': math.inf,
      }
      self._test_node_assignment_at_quota(config='node_quota_cloud.yaml',
                                          **cloud_quota)
  def test_node_assignment_at_cloud_instances_quota(self):
      """Run the at-quota scenario with the fake cloud limited to 2 instances."""
      cloud_quota = {
          'max_cores': math.inf,
          'max_instances': 2,
          'max_ram': math.inf,
      }
      self._test_node_assignment_at_quota(config='node_quota_cloud.yaml',
                                          **cloud_quota)
  def test_node_assignment_at_cloud_ram_quota(self):
      """Run the at-quota scenario with the fake cloud limited to 2 * 8192 RAM."""
      cloud_quota = {
          'max_cores': math.inf,
          'max_instances': math.inf,
          'max_ram': 2 * 8192,
      }
      self._test_node_assignment_at_quota(config='node_quota_cloud.yaml',
                                          **cloud_quota)
  def test_over_quota(self, config='node_quota_cloud.yaml'):
      '''
      This tests what happens when a cloud unexpectedly returns an
      over-quota error.

      :param config: Name of the config fixture handed to setup_config.
      '''
      # Start with an instance quota of 2
      max_cores = math.inf
      max_instances = 2
      max_ram = math.inf

      # patch the cloud with requested quota
      def fake_get_quota():
          return (max_cores, max_instances, max_ram)
      self.useFixture(fixtures.MockPatchObject(
          fakeprovider.FakeProvider.fake_cloud, '_get_quota',
          fake_get_quota
      ))

      configfile = self.setup_config(config)
      self.useBuilder(configfile)
      self.waitForImage('fake-provider', 'fake-image')

      nodepool.launcher.LOCK_CLEANUP = 1
      pool = self.useNodepool(configfile, watermark_sleep=1)
      pool.start()
      self.wait_for_config(pool)

      client = pool.getProviderManager('fake-provider')._getClient()

      # Wait for a single node to be created
      req1 = zk.NodeRequest()
      req1.state = zk.REQUESTED
      req1.node_types.append('fake-label')
      self.log.debug("Adding first request")
      self.zk.storeNodeRequest(req1)
      req1 = self.waitForNodeRequest(req1)
      self.assertEqual(req1.state, zk.FULFILLED)

      # Lock this node so it appears as used and not deleted
      req1_node = self.zk.getNode(req1.nodes[0])
      self.zk.lockNode(req1_node, blocking=False)

      # Now, reduce the quota so the next node unexpectedly
      # (according to nodepool's quota estimate) fails.
      client.max_instances = 1

      # Request a second node; this request should pause the handler.
      req2 = zk.NodeRequest()
      req2.state = zk.REQUESTED
      req2.node_types.append('fake-label')
      self.log.debug("Adding second request")
      self.zk.storeNodeRequest(req2)

      pool_worker = pool.getPoolWorkers('fake-provider')
      while not pool_worker[0].paused_handler:
          time.sleep(0.1)

      # The handler is paused now and the request should be in state PENDING.
      # Pass the accepted states as a tuple, like the other callers in this
      # class, rather than as a bare state value.
      req2 = self.waitForNodeRequest(req2, (zk.PENDING,))
      self.assertEqual(req2.state, zk.PENDING)

      # Now free up the first node
      self.log.debug("Marking first node as used %s", req1.id)
      req1_node.state = zk.USED
      self.zk.storeNode(req1_node)
      self.zk.unlockNode(req1_node)
      self.waitForNodeDeletion(req1_node)

      # After the first node is cleaned up the second request should be
      # able to fulfill now.
      req2 = self.waitForNodeRequest(req2)
      self.assertEqual(req2.state, zk.FULFILLED)

      self.assertEqual(len(client._server_list), 1)
  def test_fail_request_on_launch_failure(self):
      '''
      A request fails when every server creation attempt errors out.
      '''
      config_path = self.setup_config('node_launch_retry.yaml')
      self.useBuilder(config_path)
      self.waitForImage('fake-provider', 'fake-image')

      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()
      self.wait_for_config(launcher)

      # Arm the fake provider manager's failure counter; the assertion
      # below expects it to have been counted down to zero.
      provider_manager = launcher.getProviderManager('fake-provider')
      provider_manager.createServer_fails = 2

      request = zk.NodeRequest()
      request.state = zk.REQUESTED
      request.node_types.append('fake-label')
      self.zk.storeNodeRequest(request)

      request = self.waitForNodeRequest(request)
      self.assertEqual(0, provider_manager.createServer_fails)
      self.assertEqual(request.state, zk.FAILED)
      self.assertNotEqual(request.declined_by, [])
  def test_az_change_recover(self):
      '''
      Nodepool should recover after the cloud's availability zones change.
      '''
      config_path = self.setup_config('node_az_change.yaml')
      self.useBuilder(config_path)
      self.waitForImage('fake-provider', 'fake-image')

      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()
      self.wait_for_config(launcher)

      def request_fake_label():
          # Submit a single 'fake-label' request and wait for its result.
          request = zk.NodeRequest()
          request.state = zk.REQUESTED
          request.node_types.append('fake-label')
          self.zk.storeNodeRequest(request)
          return self.waitForNodeRequest(request)

      first = request_fake_label()
      self.assertEqual(first.state, zk.FULFILLED)

      # now change the azs in the cloud
      new_azs = ['new-az1', 'new-az2']
      fake_cloud = launcher.getProviderManager('fake-provider')._getClient()
      fake_cloud._azs = new_azs

      # Do a second request. This will fail because the cached azs are not
      # available anymore.
      # TODO(tobiash): Ideally we should already be able to already recover
      # this request.
      second = request_fake_label()
      self.assertEqual(second.state, zk.FAILED)

      # Create a third request to test that nodepool successfully recovers
      # from a stale az cache.
      third = request_fake_label()
      self.assertEqual(third.state, zk.FULFILLED)

      recovered_node = self.zk.getNode(third.nodes[0])
      self.assertIn(recovered_node.az, ['new-az1', 'new-az2'])
  def test_fail_minready_request_at_capacity(self):
      '''
      A min-ready request should be declined and fail once the provider
      has already served a regular request.
      '''
      config_path = self.setup_config('node_min_ready_capacity.yaml')
      self.useBuilder(config_path)
      self.waitForImage('fake-provider', 'fake-image')

      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()

      # Get an initial node ready
      initial = zk.NodeRequest()
      initial.state = zk.REQUESTED
      initial.node_types.append("fake-label")
      self.zk.storeNodeRequest(initial)
      initial = self.waitForNodeRequest(initial)
      self.assertEqual(initial.state, zk.FULFILLED)

      # Now simulate a min-ready request; the requestor string is what
      # marks it as min-ready here.
      min_ready = zk.NodeRequest()
      min_ready.state = zk.REQUESTED
      min_ready.node_types.append("fake-label")
      min_ready.requestor = "NodePool:min-ready"
      self.zk.storeNodeRequest(min_ready)
      min_ready = self.waitForNodeRequest(min_ready)

      self.assertEqual(min_ready.state, zk.FAILED)
      self.assertNotEqual(min_ready.declined_by, [])
  def test_invalid_image_fails(self):
      '''
      A request for the label "zorky-zumba" under 'node.yaml' should be
      declined and end up FAILED.
      '''
      launcher = self.useNodepool(self.setup_config('node.yaml'),
                                  watermark_sleep=1)
      launcher.start()

      request = zk.NodeRequest()
      request.state = zk.REQUESTED
      request.node_types.append("zorky-zumba")
      self.zk.storeNodeRequest(request)

      request = self.waitForNodeRequest(request)
      self.assertEqual(request.state, zk.FAILED)
      self.assertNotEqual(request.declined_by, [])
  def test_node(self):
      """Test that an image and node are created"""
      config_path = self.setup_config('node.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()

      built_image = self.waitForImage('fake-provider', 'fake-image')
      self.assertEqual(built_image.username, 'zuul')

      ready_nodes = self.waitForNodes('fake-label')
      self.assertEqual(len(ready_nodes), 1)

      node = ready_nodes[0]
      self.assertEqual(node.provider, 'fake-provider')
      self.assertEqual(node.type, ['fake-label'])
      self.assertEqual(node.username, 'zuul')
      self.assertNotEqual(node.host_keys, [])
      expected_attributes = {'key1': 'value1', 'key2': 'value2'}
      self.assertEqual(node.attributes, expected_attributes)
  def test_node_host_key_checking_false(self):
      """Test that a node has no host keys when host-key-checking is off"""
      config_path = self.setup_config('node-host-key-checking.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()

      built_image = self.waitForImage('fake-provider', 'fake-image')
      self.assertEqual(built_image.username, 'zuul')

      ready_nodes = self.waitForNodes('fake-label')
      self.assertEqual(len(ready_nodes), 1)

      node = ready_nodes[0]
      self.assertEqual(node.provider, 'fake-provider')
      self.assertEqual(node.type, ['fake-label'])
      self.assertEqual(node.username, 'zuul')
      # We have no host_keys because host-key-checking is False.
      self.assertEqual(node.host_keys, [])
  def test_multiple_launcher(self):
      """Test that an image and node are created with 2 launchers"""
      # nodepool-builder needs access to both providers to upload images
      builder_config = self.setup_config('node_two_provider.yaml')
      self.useBuilder(builder_config)

      # Start up the first, then the second launcher, each with its own
      # config.
      for launcher_config in ('node.yaml', 'node_second_provider.yaml'):
          config_path = self.setup_config(launcher_config)
          launcher = self.useNodepool(config_path, watermark_sleep=1)
          launcher.start()

      # Validate we have images in both providers
      for provider_name in ('fake-provider', 'fake-provider2'):
          uploaded = self.waitForImage(provider_name, 'fake-image')
          self.assertEqual(uploaded.username, 'zuul')

      # We don't need to check which provider launched the min-ready, just
      # that one was launched.
      ready_nodes = self.waitForNodes('fake-label', 1)
      self.assertEqual(len(ready_nodes), 1)

      node = ready_nodes[0]
      self.assertEqual(node.type, ['fake-label'])
      self.assertEqual(node.username, 'zuul')
      self.assertNotEqual(node.host_keys, [])
  def test_node_boot_from_volume(self):
      """Test that an image and node are created from a volume"""
      config_path = self.setup_config('node_boot_from_volume.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()

      self.waitForImage('fake-provider', 'fake-image')
      ready_nodes = self.waitForNodes('fake-label')
      self.assertEqual(len(ready_nodes), 1)

      node = ready_nodes[0]
      self.assertEqual(node.provider, 'fake-provider')
      self.assertEqual(node.type, ['fake-label'])
  def test_disabled_label(self):
      """Test that a node is not created with min-ready=0"""
      config_path = self.setup_config('node_disabled_label.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()

      self.waitForImage('fake-provider', 'fake-image')

      # Once the image is available there must be neither requests nor
      # nodes stored in ZooKeeper.
      pending_requests = self.zk.getNodeRequests()
      self.assertEqual([], pending_requests)
      known_nodes = self.zk.getNodes()
      self.assertEqual([], known_nodes)
  def test_node_net_name(self):
      """Test that a node is created with a net name"""
      config_path = self.setup_config('node_net_name.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()

      self.waitForImage('fake-provider', 'fake-image')
      ready_nodes = self.waitForNodes('fake-label')
      self.assertEqual(len(ready_nodes), 1)

      node = ready_nodes[0]
      self.assertEqual(node.provider, 'fake-provider')
      self.assertEqual(node.type, ['fake-label'])
      self.assertEqual(node.username, 'zuul')
  def test_node_security_group(self):
      """Test that an image and node are created with sec_group specified"""
      configfile = self.setup_config('node_security_group.yaml')
      pool = self.useNodepool(configfile, watermark_sleep=1)
      self.useBuilder(configfile)
      pool.start()
      self.waitForImage('fake-provider', 'fake-image')

      nodes = self.waitForNodes('fake-label')
      nodes_def_sg = self.waitForNodes('fake-label2')
      self.assertEqual(len(nodes), 1)
      self.assertEqual(nodes[0].provider, 'fake-provider')
      self.assertEqual(len(nodes_def_sg), 1)
      self.assertEqual(nodes_def_sg[0].provider, 'fake-provider')

      # Look the servers up by id so that a node whose server is missing
      # from the fake cloud fails the test instead of silently skipping
      # the security group assertions.
      client = pool.getProviderManager('fake-provider')._getClient()
      servers_by_id = {server.id: server for server in client._server_list}
      self.assertIn(nodes[0].external_id, servers_by_id)
      self.assertIn(nodes_def_sg[0].external_id, servers_by_id)

      self.assertEqual(
          servers_by_id[nodes[0].external_id].security_groups,
          ['fake-sg'])
      self.assertEqual(
          servers_by_id[nodes_def_sg[0].external_id].security_groups,
          [])
  def test_node_flavor_name(self):
      """Test that a node is created with a flavor name"""
      config_path = self.setup_config('node_flavor_name.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()

      self.waitForImage('fake-provider', 'fake-image')
      ready_nodes = self.waitForNodes('fake-label')
      self.assertEqual(len(ready_nodes), 1)

      node = ready_nodes[0]
      self.assertEqual(node.provider, 'fake-provider')
      self.assertEqual(node.type, ['fake-label'])
  def test_node_vhd_image(self):
      """Test that an image and node are created with the 'node_vhd.yaml' config"""
      config_path = self.setup_config('node_vhd.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()

      self.waitForImage('fake-provider', 'fake-image')
      ready_nodes = self.waitForNodes('fake-label')
      self.assertEqual(len(ready_nodes), 1)

      node = ready_nodes[0]
      self.assertEqual(node.provider, 'fake-provider')
      self.assertEqual(node.type, ['fake-label'])
  def test_node_vhd_and_qcow2(self):
      """Test label provided by vhd and qcow2 images builds"""
      self.useBuilder(self.setup_config('node_vhd_and_qcow2.yaml'))

      expected_formats = (
          ('fake-provider1', "vhd"),
          ('fake-provider2', "qcow2"),
      )
      # Wait for both uploads before checking either of them.
      uploads = [self.waitForImage(provider_name, 'fake-image')
                 for provider_name, _ in expected_formats]

      # We can't guarantee which provider would build the requested
      # nodes, but that doesn't matter so much as guaranteeing that the
      # correct image type is uploaded to the correct provider.
      for upload, (_, image_format) in zip(uploads, expected_formats):
          self.assertEqual(upload.format, image_format)
  def test_dib_upload_fail(self):
      """Test that an image upload failure is contained."""
      config_path = self.setup_config('node_upload_fail.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()

      self.waitForImage('fake-provider2', 'fake-image')
      ready_nodes = self.waitForNodes('fake-label', 2)
      self.assertEqual(len(ready_nodes), 2)

      # No nodes beyond the two ready ones may exist.
      node_count = len(list(self.zk.nodeIterator()))
      self.assertEqual(node_count, 2)

      # Both nodes must come from the provider whose image was awaited.
      for node in ready_nodes:
          self.assertEqual(node.provider, 'fake-provider2')
          self.assertEqual(node.type, ['fake-label'])
          self.assertEqual(node.username, 'zuul')
  def test_node_az(self):
      """Check that an image and a node are created with an az set."""
      config_path = self.setup_config('node_az.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      ready = self.waitForNodes('fake-label')
      self.assertEqual(len(ready), 1)
      first = ready[0]
      self.assertEqual(first.provider, 'fake-provider')
      self.assertEqual(first.az, 'az1')
  def test_node_ipv6(self):
      """Check node addressing both with and without an ipv6 address."""
      config_path = self.setup_config('node_ipv6.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider1', 'fake-image')
      self.waitForImage('fake-provider2', 'fake-image')

      with_v6 = self.waitForNodes('fake-label1')
      without_v6 = self.waitForNodes('fake-label2')
      self.assertEqual(len(with_v6), 1)
      self.assertEqual(len(without_v6), 1)

      # Each entry pairs a node with the (attribute, value) checks that
      # apply to it; the checks run in the order listed.
      expectations = (
          # ipv6 address available
          (with_v6[0], (
              ('provider', 'fake-provider1'),
              ('public_ipv4', 'fake'),
              ('public_ipv6', 'fake_v6'),
              ('interface_ip', 'fake_v6'),
              ('host_id', 'fake_host_id'),
          )),
          # ipv6 address unavailable
          (without_v6[0], (
              ('provider', 'fake-provider2'),
              ('public_ipv4', 'fake'),
              ('public_ipv6', ''),
              ('interface_ip', 'fake'),
              ('host_id', 'fake'),
          )),
      )
      for checked_node, fields in expectations:
          for attribute, wanted in fields:
              self.assertEqual(getattr(checked_node, attribute), wanted)
  def test_node_delete_success(self):
      """Check a node set to DELETING is removed and then replaced."""
      config_path = self.setup_config('node.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      original = self.waitForNodes('fake-label')
      self.assertEqual(len(original), 1)
      doomed = original[0]
      self.assertEqual(zk.READY, doomed.state)
      self.assertEqual('fake-provider', doomed.provider)

      # Flag the node for deletion and wait until it is gone.
      doomed.state = zk.DELETING
      self.zk.storeNode(doomed)
      self.waitForNodeDeletion(doomed)

      # A fresh node should take its place.
      replacements = self.waitForNodes('fake-label')
      self.assertEqual(len(replacements), 1)
      successor = replacements[0]
      self.assertEqual(zk.READY, successor.state)
      self.assertEqual('fake-provider', successor.provider)
      self.assertNotEqual(doomed, successor)
  def test_node_launch_retries(self):
      """Check a request fails once all server creation attempts fail."""
      config_path = self.setup_config('node_launch_retry.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.wait_for_config(launcher)

      # Arm the provider manager with two createServer failures.
      provider_manager = launcher.getProviderManager('fake-provider')
      provider_manager.createServer_fails = 2
      self.waitForImage('fake-provider', 'fake-image')

      request = zk.NodeRequest()
      request.state = zk.REQUESTED
      request.node_types.append('fake-label')
      self.zk.storeNodeRequest(request)

      outcome = self.waitForNodeRequest(request)
      self.assertEqual(outcome.state, zk.FAILED)
      # retries in config is set to 2, so 2 attempts to create a server
      self.assertEqual(0, provider_manager.createServer_fails)
  def test_node_launch_with_broken_znodes(self):
      """Check that launching still works when broken znodes exist."""
      # First broken znode: it has a pool but no type.
      typeless = zk.Node()
      typeless.provider = 'fake-provider'
      typeless.pool = 'main'
      typeless.external_id = 'fakeid'
      typeless.state = zk.READY
      self.zk.storeNode(typeless)

      # Second broken znode: it has a type but no pool.
      poolless = zk.Node()
      poolless.provider = 'fake-provider'
      poolless.type = ['fake-label']
      poolless.external_id = 'fakeid'
      poolless.state = zk.READY
      self.zk.storeNode(poolless)

      config_path = self.setup_config('node_launch_retry.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.wait_for_config(launcher)
      self.waitForImage('fake-provider', 'fake-image')

      # A regular request must still be fulfilled.
      request = zk.NodeRequest()
      request.state = zk.REQUESTED
      request.node_types.append('fake-label')
      self.zk.storeNodeRequest(request)

      outcome = self.waitForNodeRequest(request)
      self.assertEqual(outcome.state, zk.FULFILLED)
  def test_node_launch_retries_with_external_id(self):
      """Check failed launches with an external id do not wedge requests."""
      def submit_request():
          # Store a one-node 'fake-label' request and wait for its result.
          request = zk.NodeRequest()
          request.state = zk.REQUESTED
          request.node_types.append('fake-label')
          self.zk.storeNodeRequest(request)
          return self.waitForNodeRequest(request)

      config_path = self.setup_config('node_launch_retry.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.wait_for_config(launcher)
      provider_manager = launcher.getProviderManager('fake-provider')
      provider_manager.createServer_fails_with_external_id = 2
      self.waitForImage('fake-provider', 'fake-image')

      # Stop the DeletedNodeWorker so that the fake znode used to delete
      # the failed servers is still around when the second node is
      # requested.
      launcher._delete_thread.stop()
      time.sleep(1)

      failed = submit_request()
      self.assertEqual(failed.state, zk.FAILED)
      # retries in config is set to 2, so 2 attempts to create a server
      self.assertEqual(
          0, provider_manager.createServer_fails_with_external_id)

      # Request another node to check if nothing is wedged
      fulfilled = submit_request()
      self.assertEqual(fulfilled.state, zk.FULFILLED)
  def test_node_delete_failure(self):
      """Check a node stays DELETING when the provider delete raises."""
      def fail_delete(self, name):
          raise RuntimeError('Fake Error')

      # Make every FakeProvider.deleteServer call blow up.
      delete_patch = fixtures.MockPatchObject(
          fakeprovider.FakeProvider, 'deleteServer', fail_delete)
      self.useFixture(delete_patch)

      config_path = self.setup_config('node.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      ready = self.waitForNodes('fake-label')
      self.assertEqual(len(ready), 1)
      victim = ready[0]

      self.zk.lockNode(victim, blocking=False)
      provider_manager = launcher.getProviderManager('fake-provider')
      nodepool.launcher.NodeDeleter.delete(
          self.zk, provider_manager, victim)

      # The old node must still exist in the delete state even though
      # the delete itself failed.
      leftover = self.zk.getNode(victim.id)
      self.assertIsNotNone(leftover)
      self.assertEqual(leftover.state, zk.DELETING)

      # A new READY node must be available as well.
      ready = self.waitForNodes('fake-label')
      self.assertEqual(len(ready), 1)
      self.assertEqual(ready[0].provider, 'fake-provider')
  def test_node_delete_error(self):
      """Check a znode in DELETING exists after a delete that sets ERROR.

      ``FakeProvider.deleteServer`` is patched so that it only flips the
      first server's status to 'ERROR'.  After the fulfilled node's znode
      is removed, the test expects exactly one znode, in DELETING state.
      """
      def error_delete(self, name):
          # Set ERROR status instead of deleting the node
          self._getClient()._server_list[0].status = 'ERROR'

      delete_patch = fixtures.MockPatchObject(
          fakeprovider.FakeProvider, 'deleteServer', error_delete)
      self.useFixture(delete_patch)

      config_path = self.setup_config('node_delete_error.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      # Ask for a single node.
      request = zk.NodeRequest()
      request.state = zk.REQUESTED
      request.node_types.append('fake-label')
      self.zk.storeNodeRequest(request)

      self.log.debug("Wait for request")
      outcome = self.waitForNodeRequest(request)
      self.assertEqual(outcome.state, zk.FULFILLED)
      self.assertEqual(len(outcome.nodes), 1)

      # Drop the node record from the database.
      self.log.debug("deleting node %s", outcome.nodes[0])
      record = self.zk.getNode(outcome.nodes[0])
      self.zk.deleteNode(record)

      # Give the cleanup thread time to kick in.
      time.sleep(5)

      node_ids = self.zk.getNodes()
      self.assertEqual(len(node_ids), 1)
      record = self.zk.getNode(node_ids[0])
      self.assertEqual(record.state, zk.DELETING)

      # Clear out the servers left in error.
      provider_manager = launcher.getProviderManager('fake-provider')
      provider_manager._getClient()._server_list.clear()
  def test_leaked_node(self):
      """Check that a leaked instance gets deleted."""
      config_path = self.setup_config('leaked_node.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      self.log.debug("Waiting for initial pool...")
      initial = self.waitForNodes('fake-label')
      self.log.debug("...done waiting for initial pool.")

      # One node built and ready, backed by one provider server.
      self.assertEqual(len(initial), 1)
      provider_manager = launcher.getProviderManager('fake-provider')
      self.assertEqual(len(provider_manager.listNodes()), 1)

      # Remove only the ZooKeeper record; the instance stays behind and
      # is therefore leaked.
      leaked = initial[0]
      self.log.debug("Delete node db record so instance is leaked...")
      self.zk.deleteNode(leaked)
      self.log.debug("...deleted node db so instance is leaked.")

      # Nodepool should bring up a replacement.
      self.log.debug("Waiting for replacement pool...")
      replacements = self.waitForNodes('fake-label')
      self.log.debug("...done waiting for replacement pool.")
      self.assertEqual(len(replacements), 1)

      # The leaked instance should get cleaned up ...
      self.waitForInstanceDeletion(provider_manager, leaked.external_id)

      # ... leaving only one server (the replacement).
      self.assertEqual(len(provider_manager.listNodes()), 1)
  def test_max_ready_age(self):
      """Check a node that exceeded max-ready-age gets deleted."""
      config_path = self.setup_config('node_max_ready_age.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      self.log.debug("Waiting for initial pool...")
      ready = self.waitForNodes('fake-label')
      self.log.debug("...done waiting for initial pool.")

      # Block until the instance behind the first node is cleaned up.
      provider_manager = launcher.getProviderManager('fake-provider')
      self.waitForInstanceDeletion(provider_manager, ready[0].external_id)
  def test_max_hold_age(self):
      """Check a held node that exceeded max-hold-age gets deleted."""
      config_path = self.setup_config('node_max_hold_age.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      self.log.debug("Waiting for initial pool...")
      ready = self.waitForNodes('fake-label')
      self.log.debug("...done waiting for initial pool.")

      held = ready[0]
      self.log.debug("Holding node %s..." % held.id)
      # Put the node on hold while owning its lock.
      self.zk.lockNode(held, blocking=False)
      held.state = zk.HOLD
      held.comment = 'testing'
      self.zk.storeNode(held)
      self.zk.unlockNode(held)

      stored = self.zk.getNode(held.id)
      state_report = "Node %s in state '%s'" % (stored.id, stored.state)
      self.log.debug(state_report)

      # Block until the instance is cleaned up.
      provider_manager = launcher.getProviderManager('fake-provider')
      self.waitForInstanceDeletion(provider_manager, held.external_id)
  def test_hold_expiration_no_default(self):
      """Check a held node is deleted once past its operator-set TTL.

      This variant runs with no max-hold-age set.
      """
      config_path = self.setup_config('node_max_hold_age_no_default.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      self.log.debug("Waiting for initial pool...")
      ready = self.waitForNodes('fake-label')
      self.log.debug("...done waiting for initial pool.")

      held = ready[0]
      self.log.debug("Holding node %s..." % held.id)
      # Put the node on hold with an explicit expiration of 1.
      self.zk.lockNode(held, blocking=False)
      held.state = zk.HOLD
      held.comment = 'testing'
      held.hold_expiration = 1
      self.zk.storeNode(held)
      self.zk.unlockNode(held)

      stored = self.zk.getNode(held.id)
      state_report = "Node %s in state '%s'" % (stored.id, stored.state)
      self.log.debug(state_report)

      # Block until the instance is cleaned up.
      provider_manager = launcher.getProviderManager('fake-provider')
      self.waitForInstanceDeletion(provider_manager, held.external_id)
  def test_hold_expiration_str_type(self):
      """Check a held node is deleted once past its operator-set TTL.

      This variant stores the expiration with a bad type: the string '1'.
      """
      config_path = self.setup_config('node_max_hold_age_no_default.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      self.log.debug("Waiting for initial pool...")
      ready = self.waitForNodes('fake-label')
      self.log.debug("...done waiting for initial pool.")

      held = ready[0]
      self.log.debug("Holding node %s..." % held.id)
      # Put the node on hold; the expiration is deliberately a string.
      self.zk.lockNode(held, blocking=False)
      held.state = zk.HOLD
      held.comment = 'testing'
      held.hold_expiration = '1'
      self.zk.storeNode(held)
      self.zk.unlockNode(held)

      stored = self.zk.getNode(held.id)
      state_report = "Node %s in state '%s'" % (stored.id, stored.state)
      self.log.debug(state_report)

      # Block until the instance is cleaned up.
      provider_manager = launcher.getProviderManager('fake-provider')
      self.waitForInstanceDeletion(provider_manager, held.external_id)
  def test_hold_expiration_bad_type_coercion(self):
      """Check a held node reads back expiration 0 when the type is bad."""
      config_path = self.setup_config('node_max_hold_age_no_default.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      self.log.debug("Waiting for initial pool...")
      ready = self.waitForNodes('fake-label')
      self.log.debug("...done waiting for initial pool.")

      held = ready[0]
      self.log.debug("Holding node %s..." % held.id)
      # Put the node on hold with an expiration that is not a number.
      self.zk.lockNode(held, blocking=False)
      held.state = zk.HOLD
      held.comment = 'testing'
      held.hold_expiration = 'notanumber'
      self.zk.storeNode(held)
      self.zk.unlockNode(held)

      # Reading the node back must yield an expiration of 0.
      stored = self.zk.getNode(held.id)
      state_report = "Node %s in state '%s'" % (stored.id, stored.state)
      self.log.debug(state_report)
      self.assertEqual(stored.hold_expiration, 0)
  def test_hold_expiration_lower_than_default(self):
      """Check a held node is deleted once past its operator-set TTL.

      This variant runs with max-hold-age set in the configuration.  A
      second, control node is held without a custom expiration.
      """
      def hold(target, comment, expiration=None):
          # Move ``target`` to HOLD under its lock, optionally with a
          # custom hold expiration.
          self.zk.lockNode(target, blocking=False)
          target.state = zk.HOLD
          target.comment = comment
          if expiration is not None:
              target.hold_expiration = expiration
          self.zk.storeNode(target)
          self.zk.unlockNode(target)

      config_path = self.setup_config('node_max_hold_age_2.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      self.log.debug("Waiting for initial pool...")
      ready = self.waitForNodes('fake-label', 2)
      self.log.debug("...done waiting for initial pool.")

      custom_node = ready[0]
      # TODO make it a fraction of fixture's max-hold-age
      custom_ttl = 2
      control_node = ready[1]
      self.log.debug("Holding node %s... (default)" % control_node.id)
      self.log.debug(
          "Holding node %s...(%s seconds)" % (custom_node.id, custom_ttl))

      # Hold both nodes, the control one first.
      hold(control_node, 'testing')
      hold(custom_node, 'testing hold_expiration', custom_ttl)

      for held in (control_node, custom_node):
          stored = self.zk.getNode(held.id)
          self.log.debug(
              "Node %s in state '%s'" % (stored.id, stored.state))

      # Block until the custom node's instance is cleaned up.
      provider_manager = launcher.getProviderManager('fake-provider')
      self.waitForInstanceDeletion(
          provider_manager, custom_node.external_id)

      # The control node should still be held at this point ...
      still_held = [
          candidate for candidate in self.zk.nodeIterator()
          if candidate.state == zk.HOLD
      ]
      self.assertTrue(
          any(candidate.id == control_node.id for candidate in still_held),
          still_held)

      # ... and eventually it gets deleted too.
      self.waitForInstanceDeletion(
          provider_manager, control_node.external_id)
  def test_hold_expiration_higher_than_default(self):
      """Check a held node is deleted after max-hold-age seconds.

      This holds even when the operator specifies a larger TTL.
      """
      def hold(target, comment, expiration=None):
          # Move ``target`` to HOLD under its lock, optionally with a
          # custom hold expiration.
          self.zk.lockNode(target, blocking=False)
          target.state = zk.HOLD
          target.comment = comment
          if expiration is not None:
              target.hold_expiration = expiration
          self.zk.storeNode(target)
          self.zk.unlockNode(target)

      config_path = self.setup_config('node_max_hold_age_2.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      self.log.debug("Waiting for initial pool...")
      ready = self.waitForNodes('fake-label', 2)
      self.log.debug("...done waiting for initial pool.")

      custom_node = ready[0]
      # Make hold expiration much larger than max hold age.
      custom_ttl = 180
      control_node = ready[1]
      self.log.debug("Holding node %s... (default)" % control_node.id)
      self.log.debug(
          "Holding node %s...(%s seconds)" % (custom_node.id, custom_ttl))

      # Hold both nodes, the control one first.
      hold(control_node, 'testing')
      hold(custom_node, 'testing hold_expiration', custom_ttl)

      for held in (control_node, custom_node):
          stored = self.zk.getNode(held.id)
          self.log.debug(
              "Node %s in state '%s'" % (stored.id, stored.state))

      # Block until the control node's instance is cleaned up.
      provider_manager = launcher.getProviderManager('fake-provider')
      self.waitForInstanceDeletion(
          provider_manager, control_node.external_id)

      # The custom node should go away as well, possibly a little later
      # than the other one.  That is why its hold time is so much higher
      # than the max hold age: nodepool can be given a few extra seconds
      # to clean it up while still proving that the max hold age is not
      # violated.
      for _ in iterate_timeout(10, Exception, 'assert custom_node is gone'):
          try:
              still_held = [
                  candidate
                  for candidate in self.zk.nodeIterator(cached=False)
                  if candidate.state == zk.HOLD
              ]
              self.assertEqual(0, len(still_held), still_held)
          except AssertionError:
              # node still listed, retry
              continue
          else:
              break
  def test_label_provider(self):
      """Check only providers listed in the label satisfy the request."""
      config_path = self.setup_config('node_label_provider.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      for provider in ('fake-provider', 'fake-provider2'):
          self.waitForImage(provider, 'fake-image')

      ready = self.waitForNodes('fake-label')
      self.assertEqual(len(ready), 1)
      self.assertEqual(ready[0].provider, 'fake-provider2')
  def _create_pending_request(self):
      """Store a PENDING request and a READY node allocated to it.

      Returns a ``(request, node)`` tuple of the two stored objects.
      """
      pending = zk.NodeRequest()
      pending.state = zk.PENDING
      pending.requestor = 'test_nodepool'
      pending.node_types.append('fake-label')
      self.zk.storeNodeRequest(pending)

      # This node is allocated to the request above, yet it has not been
      # assigned within the NodeRequest object itself.
      allocated = zk.Node()
      allocated.state = zk.READY
      allocated.type = 'fake-label'
      allocated.public_ipv4 = 'fake'
      allocated.provider = 'fake-provider'
      allocated.pool = 'main'
      allocated.allocated_to = pending.id
      self.zk.storeNode(allocated)

      return (pending, allocated)
  def test_lost_requests(self):
      """Check a request left pending is reset and satisfied on restart."""
      pending, allocated = self._create_pending_request()

      config_path = self.setup_config('node_lost_requests.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      self.waitForImage('fake-provider', 'fake-image')
      launcher.start()

      fulfilled = self.waitForNodeRequest(pending, (zk.FULFILLED,))

      # The config file has min-ready=0, so the previously assigned node
      # should be reusable.  Finding it on the request shows that the
      # cleanup code reset the 'allocated_to' field.
      self.assertIn(allocated.id, fulfilled.nodes)
  def test_node_deallocation(self):
      """Test an allocated node with a missing request is deallocated.

      A READY node whose ``allocated_to`` is "MISSING" (no request with
      that id is stored by this test) is written to ZooKeeper before the
      launcher starts.  The test then polls the node until its
      ``allocated_to`` field is cleared.
      """
      node = zk.Node()
      node.state = zk.READY
      node.type = 'fake-label'
      node.public_ipv4 = 'fake'
      node.provider = 'fake-provider'
      node.allocated_to = "MISSING"
      self.zk.storeNode(node)

      configfile = self.setup_config('node_lost_requests.yaml')
      pool = self.useNodepool(configfile, watermark_sleep=1)
      self.useBuilder(configfile)
      pool.start()

      # Poll with a deadline rather than an unbounded busy loop: if the
      # node is never deallocated the test fails with a timeout instead
      # of spinning forever.
      for _ in iterate_timeout(30, Exception, 'node to be deallocated'):
          node = self.zk.getNode(node.id)
          if not node.allocated_to:
              break
  def test_multiple_pools(self):
      """Check each label gets one node in its own pool and az."""
      config_path = self.setup_config('multiple_pools.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      first_label_nodes = self.waitForNodes('fake-label1')
      second_label_nodes = self.waitForNodes('fake-label2')

      # (nodes found, expected label, expected az, expected pool)
      expectations = (
          (first_label_nodes, 'fake-label1', 'az1', 'pool1'),
          (second_label_nodes, 'fake-label2', 'az2', 'pool2'),
      )
      for found, label, az, pool_name in expectations:
          self.assertEqual(len(found), 1)
          self.assertEqual(found[0].provider, 'fake-provider')
          self.assertEqual(found[0].type, [label])
          self.assertEqual(found[0].az, az)
          self.assertEqual(found[0].pool, pool_name)
  def test_unmanaged_image(self):
      """Check node launching from unmanaged images."""
      config_path = self.setup_config('node_unmanaged_image.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()
      self.wait_for_config(launcher)

      # Register the images directly with the provider's client.
      provider_manager = launcher.getProviderManager('fake-provider')
      image_names = (
          "fake-image",
          "fake-image-windows",
          "fake-image-windows-port",
      )
      for image_name in image_names:
          provider_manager._client.create_image(name=image_name)

      ready = self.waitForNodes('fake-label')
      self.assertEqual(len(ready), 1)
      self.assertIsNone(ready[0].username)

      # The two winrm labels differ only in the connection port.
      winrm_labels = (
          ('fake-label-windows', 5986),
          ('fake-label-arbitrary-port', 1234),
      )
      for label, port in winrm_labels:
          ready = self.waitForNodes(label)
          self.assertEqual(len(ready), 1)
          self.assertEqual('zuul', ready[0].username)
          self.assertEqual('winrm', ready[0].connection_type)
          self.assertEqual(port, ready[0].connection_port)
          self.assertEqual(ready[0].host_keys, [])
  def test_unmanaged_image_provider_name(self):
      """
      Check node launching from an unmanaged image that is referenced
      by the image name as known by the provider.
      """
      config_path = self.setup_config('unmanaged_image_provider_name.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()
      self.wait_for_config(launcher)

      # Register the image under its provider-side name.
      provider_manager = launcher.getProviderManager('fake-provider')
      provider_manager._client.create_image(name="provider-named-image")

      ready = self.waitForNodes('fake-label')
      self.assertEqual(len(ready), 1)
  def test_unmanaged_image_provider_id(self):
      """
      Check node launching from an unmanaged image that is referenced
      by the image ID as known by the provider.
      """
      config_path = self.setup_config('unmanaged_image_provider_id.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()

      self.log.debug("Waiting for node")
      ready = self.waitForNodes('fake-label')
      self.assertEqual(len(ready), 1)
  def test_paused_gets_declined(self):
      """Check a paused request that later gets declined unpauses."""
      def store_two_node_request():
          # Build and store a request for two 'fake-label' nodes.
          request = zk.NodeRequest()
          request.state = zk.REQUESTED
          request.node_types.append('fake-label')
          request.node_types.append('fake-label')
          self.zk.storeNodeRequest(request)
          return request

      # First config has max-servers set to 2
      config_path = self.setup_config('pause_declined_1.yaml')
      self.useBuilder(config_path)
      self.waitForImage('fake-provider', 'fake-image')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()

      # This request takes up all capacity (2 servers).
      first = self.waitForNodeRequest(store_two_node_request())
      self.assertEqual(first.state, zk.FULFILLED)
      self.assertEqual(len(first.nodes), 2)

      # With 2 nodes in use, another request for two nodes should end up
      # paused.
      second = self.waitForNodeRequest(
          store_two_node_request(), (zk.PENDING,))

      # Second config decreases max-servers to 1
      self.replace_config(config_path, 'pause_declined_2.yaml')

      # The second request asked for 2 nodes, which now exceeds
      # max-servers, so it should be declined and transition to FAILED.
      second = self.waitForNodeRequest(second, (zk.FAILED,))
      self.assertNotEqual(second.declined_by, [])
  def test_node_auto_floating_ip(self):
      """Check that the auto-floating-ip option works fine."""
      config_path = self.setup_config('node_auto_floating_ip.yaml')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      self.useBuilder(config_path)
      launcher.start()

      providers = ('fake-provider1', 'fake-provider2', 'fake-provider3')
      for provider in providers:
          self.waitForImage(provider, 'fake-image')

      labels = ('fake-label1', 'fake-label2', 'fake-label3')
      nodes_by_label = [self.waitForNodes(label) for label in labels]
      for found in nodes_by_label:
          self.assertEqual(1, len(found))

      # (expected provider, expected public ipv4), in label order.
      expectations = (
          # auto-floating-ip: False
          ('fake-provider1', ''),
          # auto-floating-ip: True
          ('fake-provider2', 'fake'),
          # auto-floating-ip: default value
          ('fake-provider3', 'fake'),
      )
      for found, (provider, ipv4) in zip(nodes_by_label, expectations):
          checked_node = found[0]
          self.assertEqual(provider, checked_node.provider)
          self.assertEqual(ipv4, checked_node.public_ipv4)
          self.assertEqual('', checked_node.public_ipv6)
          self.assertEqual('fake', checked_node.interface_ip)
  def test_secure_file(self):
      """Check operation with a secure.conf file."""
      config_path = self.setup_config('secure_file_config.yaml')
      secure_path = self.setup_secure('secure_file_secure.yaml')
      launcher = self.useNodepool(
          config_path, secure_conf=secure_path, watermark_sleep=1)
      self.useBuilder(config_path, securefile=secure_path)
      launcher.start()
      self.wait_for_config(launcher)

      # The disk image must carry the REG_PASSWORD env var.
      diskimage = launcher.config.diskimages['fake-image']
      self.assertIn('REG_PASSWORD', diskimage.env_vars)
      self.assertEqual('secret', diskimage.env_vars['REG_PASSWORD'])

      # Exactly one ZooKeeper server, matching the test's own settings.
      zk_servers = launcher.config.zookeeper_servers
      self.assertEqual(1, len(zk_servers))
      server = zk_servers[next(iter(zk_servers.keys()))]
      self.assertEqual(self.zookeeper_host, server.host)
      self.assertEqual(self.zookeeper_port, server.port)
      self.assertEqual(self.zookeeper_chroot, server.chroot)

      upload = self.waitForImage('fake-provider', 'fake-image')
      self.assertEqual(upload.username, 'zuul')

      ready = self.waitForNodes('fake-label')
      self.assertEqual(len(ready), 1)
      first = ready[0]
      self.assertEqual(first.provider, 'fake-provider')
      self.assertEqual(first.type, ['fake-label'])
      self.assertEqual(first.username, 'zuul')
      self.assertNotEqual(first.host_keys, [])
  def test_provider_removal(self):
      """Test that removing a provider stops the worker thread.

      The launcher starts with a two-provider config and is expected to
      track two pool threads.  After the config is replaced, the number
      of pool threads is expected to drop to one.
      """
      configfile = self.setup_config('launcher_two_provider.yaml')
      self.useBuilder(configfile)
      pool = self.useNodepool(configfile, watermark_sleep=.5)
      pool.start()
      self.waitForNodes('fake-label')
      self.assertEqual(2, len(pool._pool_threads))

      self.replace_config(configfile, 'launcher_two_provider_remove.yaml')

      # Poll for the change instead of sleeping a fixed second: a single
      # assert after a fixed sleep can run before the launcher has
      # applied the new config.
      for _ in iterate_timeout(10, Exception, 'provider removal'):
          try:
              self.assertEqual(1, len(pool._pool_threads))
          except AssertionError:
              # config change not applied yet, retry
              continue
          else:
              break
  def test_failed_provider(self):
      """Test that broken provider doesn't fail node requests.

      Two providers are each filled with one locked node, then two more
      requests are left pending. After fake-provider2 has served one of
      them, fake-provider's request handler is patched to raise from
      ``launch``; the remaining request must still end up FULFILLED and
      must record exactly one decline, by 'fake-provider-main'.
      """
      configfile = self.setup_config('launcher_two_provider_max_1.yaml')
      self.useBuilder(configfile)
      pool = self.useNodepool(configfile, watermark_sleep=.5)
      pool.start()
      self.wait_for_config(pool)

      # Steady state at images available.
      self.waitForImage('fake-provider', 'fake-image')
      self.waitForImage('fake-provider2', 'fake-image')
      # We have now reached steady state and can manipulate the system to
      # test failing cloud behavior.

      # Make two requests so that the next requests are paused.
      # Note we use different provider specific labels here to avoid
      # a race where a single provider fulfills both of these initial
      # requests.

      # fake-provider
      req = zk.NodeRequest()
      req.state = zk.REQUESTED
      req.node_types.append('fake-label2')
      self.zk.storeNodeRequest(req)
      req = self.waitForNodeRequest(req, zk.FULFILLED)

      # fake-provider2
      req = zk.NodeRequest()
      req.state = zk.REQUESTED
      req.node_types.append('fake-label3')
      self.zk.storeNodeRequest(req)
      req = self.waitForNodeRequest(req, zk.FULFILLED)

      nodes = map(pool.zk.getNode, pool.zk.getNodes())
      provider1_first = None
      provider2_first = None
      for node in nodes:
          # Skip ids whose node record could not be fetched.
          if node is None:
              continue
          if node.provider == 'fake-provider2':
              provider2_first = node
          elif node.provider == 'fake-provider':
              provider1_first = node
      # Fail clearly here instead of with an obscure error further down.
      self.assertIsNotNone(provider1_first)
      self.assertIsNotNone(provider2_first)

      # Mark the nodes as being used so they won't be deleted at pause.
      # Locking them is enough.
      self.zk.lockNode(provider1_first, blocking=False)
      self.zk.lockNode(provider2_first, blocking=False)

      # Next two requests will go pending one for each provider.
      req1 = zk.NodeRequest()
      req1.state = zk.REQUESTED
      req1.node_types.append('fake-label')
      self.zk.storeNodeRequest(req1)
      req1 = self.waitForNodeRequest(req1, zk.PENDING)

      req2 = zk.NodeRequest()
      req2.state = zk.REQUESTED
      req2.node_types.append('fake-label')
      self.zk.storeNodeRequest(req2)
      req2 = self.waitForNodeRequest(req2, zk.PENDING)

      # Delete node attached to provider2 this will cause provider2 to
      # fulfill the request it had pending.
      provider2_first.state = zk.DELETING
      self.zk.storeNode(provider2_first)
      self.zk.unlockNode(provider2_first)
      self.waitForNodeDeletion(provider2_first)

      while True:
          # Wait for provider2 node to be created. Also find the request
          # that was not fulfilled. This is the request that fake-provider
          # is pending on.
          req = self.zk.getNodeRequest(req1.id)
          if req.state == zk.FULFILLED:
              final_req = req2
              break
          req = self.zk.getNodeRequest(req2.id)
          if req.state == zk.FULFILLED:
              final_req = req1
              break
          # Pause between polls so this loop does not spin at full speed.
          time.sleep(.1)

      provider2_second = None
      nodes = map(pool.zk.getNode, pool.zk.getNodes())
      for node in nodes:
          if (node and node.provider == 'fake-provider2' and
                  node.state == zk.READY):
              provider2_second = node
              break
      self.assertIsNotNone(provider2_second)

      # Now delete the new node we had provider2 build. At this point,
      # the only provider with any requests is fake-provider.
      provider2_second.state = zk.DELETING
      self.zk.storeNode(provider2_second)

      # Set provider1 runHandler to throw exception to simulate a
      # broken cloud. Note the pool worker instantiates request handlers on
      # demand which is why we have a somewhat convoluted monkey patch here.
      # We must patch deep enough in the request handler that
      # despite being paused fake-provider will still trip over this code.
      pool_worker = pool.getPoolWorkers('fake-provider')[0]
      request_handler = pool_worker.request_handlers[0]

      def raise_KeyError(node):
          raise KeyError('fake-provider')

      request_handler.launch = raise_KeyError

      # Delete instance in fake-provider. This should cause provider2
      # to service the request that was held pending by fake-provider.
      provider1_first.state = zk.DELETING
      self.zk.storeNode(provider1_first)
      self.zk.unlockNode(provider1_first)

      # Request is fulfilled by provider 2
      req = self.waitForNodeRequest(final_req)
      self.assertEqual(req.state, zk.FULFILLED)
      self.assertEqual(1, len(req.declined_by))
      self.assertIn('fake-provider-main', req.declined_by[0])
  def test_disabled_provider(self):
      '''
      A provider disabled via max-servers of 0 must still decline requests.

      A request only fails once every registered launcher appears in its
      declined_by attribute, so even a disabled provider has to attempt
      the request at least once and decline it. The request submitted
      here is therefore expected to end up FAILED.
      '''
      config_path = self.setup_config('disabled_provider.yaml')
      self.useBuilder(config_path)
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()

      request = zk.NodeRequest()
      request.state = zk.REQUESTED
      request.node_types.append('fake-label')
      self.zk.storeNodeRequest(request)

      # Wait for the request to reach its final state.
      finished = self.waitForNodeRequest(request)
      self.assertEqual(finished.state, zk.FAILED)
  def test_provider_wont_wedge(self):
      '''
      A provider must not wedge itself when all of the following hold:

        1. it is at maximum capacity (# registered nodes == max-servers),
        2. none of its current nodes are in use, and
        3. a request arrives for a label it does not yet have available.

      (1) together with (3) would normally pause the provider until
      capacity frees up, but because of (2) that would never happen and
      the provider would stay paused forever.
      '''
      config_path = self.setup_config('wedge_test.yaml')
      self.useBuilder(config_path)
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()

      # The min-ready fake-label1 node fills the provider, whose
      # max-servers is 1.
      ready_nodes = self.waitForNodes('fake-label1')
      self.assertEqual(1, len(ready_nodes))
      idle_node = ready_nodes[0]

      # Ask for fake-label2, which is not yet available.
      request = zk.NodeRequest()
      request.state = zk.REQUESTED
      request.node_types.append('fake-label2')
      self.zk.storeNodeRequest(request)

      # The provider should pause to handle the fake-label2 request. The
      # idle fake-label1 node would never be freed while paused, so the
      # pool worker is expected to notice that and delete it, which then
      # lets the fake-label2 request be fulfilled.
      self.waitForNodeDeletion(idle_node)
      request = self.waitForNodeRequest(request)
      self.assertEqual(request.state, zk.FULFILLED)
  def test_launcher_registers_config_change(self):
      '''
      Launchers register themselves, plus some config info, in ZooKeeper.
      Check that a config change propagates to that registration.
      '''
      config_path = self.setup_config('launcher_reg1.yaml')
      self.useBuilder(config_path)
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()
      self.waitForNodes('fake-label')

      registered = self.zk.getRegisteredLaunchers()
      self.assertEqual(1, len(registered))

      # the fake-label-unused label should not appear
      self.assertEqual({'fake-label'}, registered[0].supported_labels)

      self.replace_config(config_path, 'launcher_reg2.yaml')

      # Re-read the registration until the one additional label shows up.
      expected_labels = {'fake-label', 'fake-label2'}
      while registered[0].supported_labels != expected_labels:
          time.sleep(1)
          registered = self.zk.getRegisteredLaunchers()
  @mock.patch('nodepool.driver.openstack.handler.'
              'OpenStackNodeLauncher._launchNode')
  def test_launchNode_session_expired(self, mock_launch):
      '''
      Test ZK session lost during _launchNode().

      The patched _launchNode raises SessionExpiredError. The request is
      expected to reach FAILED after exactly one launch attempt, and the
      pool's node count for ('fake-provider', 'main') must drop to zero.
      '''
      mock_launch.side_effect = kze.SessionExpiredError()

      # use a config with min-ready of 0
      configfile = self.setup_config('node_launch_retry.yaml')
      self.useBuilder(configfile)
      pool = self.useNodepool(configfile, watermark_sleep=1)
      pool.cleanup_interval = 60
      pool.start()
      self.waitForImage('fake-provider', 'fake-image')

      req = zk.NodeRequest()
      req.state = zk.REQUESTED
      req.node_types.append('fake-label')
      self.zk.storeNodeRequest(req)

      # A session loss during node launch should at least try to set the
      # request state to FAILED (in a non-test scenario, it may actually
      # be missing).
      req = self.waitForNodeRequest(req, states=(zk.FAILED,))
      self.assertEqual(1, mock_launch.call_count)

      # Any znodes created for the request should eventually get deleted.
      # Sleep briefly between checks; sleep(0) made this a tight spin
      # that queried ZooKeeper as fast as possible.
      while self.zk.countPoolNodes('fake-provider', 'main'):
          time.sleep(.1)
  def test_launchNode_delete_error(self):
      '''
      Check that the launcher keeps trying to spawn a node when a delete
      error occurs.
      '''
      failing_cloud = fakeprovider.FakeLaunchAndDeleteFailCloud(
          times_to_fail=1)

      # Every provider client lookup hands back the same failing cloud.
      self.useFixture(fixtures.MockPatchObject(
          fakeprovider.FakeProvider, '_getClient',
          lambda *args, **kwargs: failing_cloud))

      config_path = self.setup_config('node_launch_retry.yaml')
      self.useBuilder(config_path)
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.cleanup_interval = 60
      launcher.start()
      self.waitForImage('fake-provider', 'fake-image')

      request = zk.NodeRequest()
      request.state = zk.REQUESTED
      request.node_types.append('fake-label')
      self.zk.storeNodeRequest(request)
      request = self.waitForNodeRequest(request)

      # The deletion of the node can be delayed so wait for it.
      while not failing_cloud.delete_success:
          time.sleep(0.1)

      self.assertTrue(failing_cloud.launch_success)
      self.assertEqual(failing_cloud.times_to_fail_delete,
                       failing_cloud.times_failed_delete)
      self.assertEqual(failing_cloud.times_to_fail_launch,
                       failing_cloud.times_failed_launch)
      self.assertEqual(request.state, zk.FULFILLED)
      self.assertEqual(len(request.nodes), 1)
  @mock.patch('nodepool.driver.NodeRequestHandler.poll')
  def test_handler_poll_session_expired(self, mock_poll):
      '''
      A ZK session loss during the handler's poll() must remove the
      handler from the pool worker's active request handlers.
      '''
      request = zk.NodeRequest()
      request.state = zk.REQUESTED
      request.node_types.append('fake-label')
      self.zk.storeNodeRequest(request)

      def fail_request_then_expire():
          # Mark the request FAILED so it does not re-enter request
          # handling; that lets the final assert verify it really was
          # removed from request_handlers.
          request.state = zk.FAILED
          # Intentionally ignore that it is already locked.
          self.zk.storeNodeRequest(request)
          raise kze.SessionExpiredError()

      mock_poll.side_effect = fail_request_then_expire

      # use a config with min-ready of 0
      config_path = self.setup_config('node_launch_retry.yaml')
      self.useBuilder(config_path)

      # The image has to exist before the launcher starts, otherwise the
      # request would be declined.
      self.waitForImage('fake-provider', 'fake-image')

      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.cleanup_interval = 60
      launcher.start()

      # Wait for request handling to occur
      while mock_poll.call_count == 0:
          time.sleep(.1)

      # Note: the launcher does not set FAILED here; the mock side effect
      # does. Just verify that.
      request = self.waitForNodeRequest(request)
      self.assertEqual(zk.FAILED, request.state)

      # The session exception raised on the first pass through poll()
      # should have removed the request handler, and the side effect
      # keeps the request from re-entering handling before this check.
      worker = launcher._pool_threads["fake-provider-main"]
      self.assertEqual(0, len(worker.request_handlers))
  def test_exception_causing_decline_of_paused_request(self):
      """
      A paused request that is later declined because of an exception
      (say, one thrown from a provider operation) must unpause the pool
      worker and remove the request handler.
      """
      # First config has max-servers set to 2
      config_path = self.setup_config('pause_declined_1.yaml')
      self.useBuilder(config_path)
      self.waitForImage('fake-provider', 'fake-image')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()

      # Use up all capacity with a single two-node request.
      first = zk.NodeRequest()
      first.state = zk.REQUESTED
      for _ in range(2):
          first.node_types.append('fake-label')
      self.zk.storeNodeRequest(first)
      first = self.waitForNodeRequest(first)
      self.assertEqual(first.state, zk.FULFILLED)
      self.assertEqual(len(first.nodes), 2)

      # With both nodes in use, another two-node request should cause
      # the request to pause.
      second = zk.NodeRequest()
      second.state = zk.REQUESTED
      for _ in range(2):
          second.node_types.append('fake-label')
      self.zk.storeNodeRequest(second)
      second = self.waitForNodeRequest(second, (zk.PENDING,))

      # Force an exception within the run handler.
      worker = launcher.getPoolWorkers('fake-provider')[0]
      while not worker.paused_handler:
          time.sleep(0.1)
      worker.paused_handler.hasProviderQuota = mock.Mock(
          side_effect=Exception('mock exception')
      )

      # The above exception should cause us to fail the paused request.
      second = self.waitForNodeRequest(second, (zk.FAILED,))
      self.assertNotEqual(second.declined_by, [])

      # The exception handling should make sure that we unpause AND
      # remove the request handler.
      while worker.paused_handler:
          time.sleep(0.1)
      self.assertEqual(0, len(worker.request_handlers))
  def test_ignore_provider_quota_false(self):
      '''
      With ignore-provider-quota set to false, a node request made while
      the cloud reports no core quota is expected to end up FAILED.
      '''
      # Set max-cores quota value to 0 to force "out of quota". Note that
      # the fake provider checks the number of instances during server
      # creation to decide if it should throw an over quota exception,
      # but it doesn't check cores.
      self.useFixture(fixtures.MockPatchObject(
          fakeprovider.FakeProvider.fake_cloud, '_get_quota',
          lambda: (0, 20, 1000000)
      ))

      config_path = self.setup_config('ignore_provider_quota_false.yaml')
      self.useBuilder(config_path)
      self.waitForImage('fake-provider', 'fake-image')
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()

      # The provider should decline this request because "it would
      # exceed quota", which makes the request fail.
      self.log.debug("Submitting request with ignore-provider-quota False")
      request = zk.NodeRequest()
      request.state = zk.REQUESTED
      request.node_types.append('fake-label')
      self.zk.storeNodeRequest(request)

      request = self.waitForNodeRequest(request)
      self.assertEqual(request.state, zk.FAILED)
  def test_ignore_provider_quota_true(self):
      '''
      Test that a node request get fulfilled with ignore-provider-quota set
      to true.

      Three requests are exercised in turn: a single-node request that
      must be FULFILLED, a second single-node request that stays PENDING
      while the first node is locked and is FULFILLED once that node has
      been deleted, and a two-node request that must end up FAILED.
      '''
      # Set max-cores quota value to 0 to force "out of quota". Note that
      # the fake provider checks the number of instances during server
      # creation to decide if it should throw an over quota exception,
      # but it doesn't check cores.
      def fake_get_quota():
          return (0, 20, 1000000)

      self.useFixture(fixtures.MockPatchObject(
          fakeprovider.FakeProvider.fake_cloud, '_get_quota',
          fake_get_quota
      ))

      configfile = self.setup_config('ignore_provider_quota_true.yaml')
      self.useBuilder(configfile)
      self.waitForImage('fake-provider', 'fake-image')
      pool = self.useNodepool(configfile, watermark_sleep=1)
      pool.start()

      # Create a request with ignore-provider-quota set to true that should
      # pass regardless of the lack of cloud/provider quota.
      self.replace_config(configfile, 'ignore_provider_quota_true.yaml')

      self.log.debug(
          "Submitting an initial request with ignore-provider-quota True")
      req1 = zk.NodeRequest()
      req1.state = zk.REQUESTED
      req1.node_types.append('fake-label')
      self.zk.storeNodeRequest(req1)
      req1 = self.waitForNodeRequest(req1)
      self.assertEqual(req1.state, zk.FULFILLED)

      # Lock this node so it appears as used and not deleted
      req1_node = self.zk.getNode(req1.nodes[0])
      self.zk.lockNode(req1_node, blocking=False)

      # Request a second node; this request should pause the handler
      # due to the pool set with max-servers: 1
      req2 = zk.NodeRequest()
      req2.state = zk.REQUESTED
      req2.node_types.append('fake-label')
      # The adjacent literals are concatenated, so the first one needs
      # its trailing space to keep the words apart in the log output.
      self.log.debug(
          "Submitting a second request with ignore-provider-quota True "
          "but with a full max-servers quota.")
      self.zk.storeNodeRequest(req2)

      pool_worker = pool.getPoolWorkers('fake-provider')
      while not pool_worker[0].paused_handler:
          time.sleep(0.1)

      # The handler is paused now and the request should be in state PENDING
      req2 = self.waitForNodeRequest(req2, zk.PENDING)
      self.assertEqual(req2.state, zk.PENDING)

      # Now free up the first node
      self.log.debug("Marking first node as used %s", req1.id)
      req1_node.state = zk.USED
      self.zk.storeNode(req1_node)
      self.zk.unlockNode(req1_node)
      self.waitForNodeDeletion(req1_node)

      # After the first node is cleaned up the second request should be
      # able to fulfill now.
      req2 = self.waitForNodeRequest(req2)
      self.assertEqual(req2.state, zk.FULFILLED)

      # Lock this node so it appears as used and not deleted
      req2_node = self.zk.getNode(req2.nodes[0])
      self.zk.lockNode(req2_node, blocking=False)

      # Now free up the second node
      self.log.debug("Marking second node as used %s", req2.id)
      req2_node.state = zk.USED
      self.zk.storeNode(req2_node)
      self.zk.unlockNode(req2_node)
      self.waitForNodeDeletion(req2_node)

      # Request a 2 node set; this request should fail
      # due to the provider only being able to fulfill
      # a single node at a time.
      req3 = zk.NodeRequest()
      req3.state = zk.REQUESTED
      req3.node_types.append('fake-label')
      req3.node_types.append('fake-label')
      self.log.debug(
          "Submitting a third request with ignore-provider-quota True "
          "for a 2-node set which the provider cannot fulfill.")
      self.zk.storeNodeRequest(req3)
      req3 = self.waitForNodeRequest(req3)
      self.assertEqual(req3.state, zk.FAILED)
  def test_request_order(self):
      """Test that requests are handled in sorted order.

      Two requests are stored before the launcher starts. The one stored
      second carries relative_priority 1 (versus 2 for the first) and is
      expected to reach its final state earlier despite its higher id.
      """
      config_path = self.setup_config('node_no_min_ready.yaml')
      self.useBuilder(config_path)
      built_image = self.waitForImage('fake-provider', 'fake-image')
      self.assertEqual(built_image.username, 'zuul')

      stored_first = zk.NodeRequest()
      stored_first.state = zk.REQUESTED
      stored_first.node_types.append('fake-label')
      stored_first.relative_priority = 2
      self.zk.storeNodeRequest(stored_first)

      stored_second = zk.NodeRequest()
      stored_second.state = zk.REQUESTED
      stored_second.node_types.append('fake-label')
      stored_second.relative_priority = 1
      self.zk.storeNodeRequest(stored_second)

      # Only start the launcher once both requests are queued.
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.start()

      stored_second = self.waitForNodeRequest(stored_second)
      self.assertEqual(stored_second.state, zk.FULFILLED)
      stored_first = self.waitForNodeRequest(stored_first)
      self.assertEqual(stored_first.state, zk.FULFILLED)

      # Stored later (higher id), yet finished earlier.
      self.assertTrue(stored_second.id > stored_first.id)
      self.assertTrue(stored_second.state_time < stored_first.state_time)
  def test_empty_node_deleted(self):
      """Test that empty nodes are deleted by the cleanup thread.

      A znode with no data is created at the node path for id "12345";
      the test then waits until that path no longer exists.
      """
      config_path = self.setup_config('node.yaml')

      # Create empty node
      empty_path = "%s" % self.zk._nodePath("12345")
      self.log.debug("node path %s", empty_path)
      zk_client = self.zk.client
      zk_client.create(empty_path, makepath=True)
      self.assertTrue(zk_client.exists(empty_path))

      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.cleanup_interval = .1
      launcher.start()

      # Poll until the path has disappeared.
      while self.zk.client.exists(empty_path):
          time.sleep(.1)
  def test_leaked_port_cleanup(self):
      """Check that ports in DOWN status get cleaned up and reported.

      Expects the provider manager to list two DOWN ports at first, waits
      until it lists none, then checks the downPorts counter stat.
      """
      config_path = self.setup_config('node.yaml')
      self.useBuilder(config_path)
      launcher = self.useNodepool(config_path, watermark_sleep=1)
      launcher.cleanup_interval = 1
      launcher.start()
      self.waitForNodes('fake-label')

      provider_manager = launcher.getProviderManager('fake-provider')
      leaked = provider_manager.listPorts(status='DOWN')
      self.assertEqual(2, len(leaked))
      self.log.debug("Down ports: %s", leaked)

      # Change the port cleanup interval to happen quicker
      provider_manager._port_cleanup_interval_secs = 2
      while provider_manager.listPorts(status='DOWN'):
          time.sleep(1)

      self.assertReportedStat(
          'nodepool.provider.fake-provider.downPorts',
          value='2', kind='c')