Manage a pool of nodes for a distributed test infrastructure

test_launcher.py 74KB

# Copyright (C) 2014 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import math
import time

import fixtures
import mock

from nodepool import tests
from nodepool import zk
from nodepool.driver.fake import provider as fakeprovider
from nodepool.nodeutils import iterate_timeout
import nodepool.launcher

from kazoo import exceptions as kze
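
# Note: tests.DBTestCase supplies the per-test ZooKeeper client (self.zk,
# rooted in a per-test chroot; see self.zookeeper_chroot below) and the
# setup_config/useNodepool/useBuilder/waitFor* helpers used throughout
# this class.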

class TestLauncher(tests.DBTestCase):
    log = logging.getLogger("nodepool.TestLauncher")

    def test_node_assignment(self):
        '''
        Successful node launch should have unlocked nodes in READY state
        and assigned to the request.
        '''
        configfile = self.setup_config('node_no_min_ready.yaml')
        self.useBuilder(configfile)
        image = self.waitForImage('fake-provider', 'fake-image')
        self.assertEqual(image.username, 'zuul')

        nodepool.launcher.LOCK_CLEANUP = 1
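        # (LOCK_CLEANUP is shrunk to one second so the launcher's cleanup
        # thread treats stale request locks as expired almost immediately;
        # the production value is much larger.)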
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)
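
        # A request moves REQUESTED -> PENDING -> FULFILLED (or FAILED);
        # waitForNodeRequest polls ZooKeeper until the request leaves the
        # transitional states.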
        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FULFILLED)
        self.assertNotEqual(req.nodes, [])
        for node_id in req.nodes:
            node = self.zk.getNode(node_id)
            self.assertEqual(node.allocated_to, req.id)
            self.assertEqual(node.state, zk.READY)
            self.assertIsNotNone(node.launcher)
            self.assertEqual(node.cloud, 'fake')
            self.assertEqual(node.region, 'fake-region')
            self.assertEqual(node.az, "az1")
            self.assertEqual(node.username, "zuul")
            self.assertEqual(node.connection_type, 'ssh')
            self.assertEqual(node.connection_port, 22)
            p = "{path}/{id}".format(
                path=self.zk._imageUploadPath(image.image_name,
                                              image.build_id,
                                              image.provider_name),
                id=image.id)
            self.assertEqual(node.image_id, p)
            resources = {
                'cores': 4,
                'instances': 1,
                'ram': 8192,
            }
            self.assertEqual(node.resources, resources)
            self.zk.lockNode(node, blocking=False)
            self.zk.unlockNode(node)

        # The request lock should still exist while the request is live;
        # after the request is deleted, the cleanup thread removes the lock,
        # which waitForNodeRequestLockDeletion verifies.
        self.assertIsNotNone(
            self.zk.client.exists(self.zk._requestLockPath(req.id))
        )
        self.zk.deleteNodeRequest(req)
        self.waitForNodeRequestLockDeletion(req.id)

        self.assertReportedStat('nodepool.nodes.ready', value='1', kind='g')
        self.assertReportedStat('nodepool.nodes.building', value='0', kind='g')
        self.assertReportedStat('nodepool.label.fake-label.nodes.ready',
                                value='1', kind='g')

        # Verify that we correctly initialized unused label stats to 0
        self.assertReportedStat('nodepool.label.fake-label2.nodes.building',
                                value='0', kind='g')
        self.assertReportedStat('nodepool.label.fake-label2.nodes.testing',
                                value='0', kind='g')
        self.assertReportedStat('nodepool.label.fake-label2.nodes.ready',
                                value='0', kind='g')
        self.assertReportedStat('nodepool.label.fake-label2.nodes.in-use',
                                value='0', kind='g')
        self.assertReportedStat('nodepool.label.fake-label2.nodes.used',
                                value='0', kind='g')
        self.assertReportedStat('nodepool.label.fake-label2.nodes.hold',
                                value='0', kind='g')
        self.assertReportedStat('nodepool.label.fake-label2.nodes.deleting',
                                value='0', kind='g')
        self.assertReportedStat('nodepool.label.fake-label2.nodes.failed',
                                value='0', kind='g')
        self.assertReportedStat('nodepool.label.fake-label2.nodes.init',
                                value='0', kind='g')
        self.assertReportedStat('nodepool.label.fake-label2.nodes.aborted',
                                value='0', kind='g')

    def test_node_assignment_order(self):
        """Test that nodes are assigned in the order requested"""
        configfile = self.setup_config('node_many_labels.yaml')
        self.useBuilder(configfile)
        self.waitForImage('fake-provider', 'fake-image')

        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()
        self.waitForNodes('fake-label1')
        self.waitForNodes('fake-label2')
        self.waitForNodes('fake-label3')
        self.waitForNodes('fake-label4')

        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label3')
        req.node_types.append('fake-label1')
        req.node_types.append('fake-label4')
        req.node_types.append('fake-label2')
        self.zk.storeNodeRequest(req)

        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FULFILLED)
        self.assertEqual(4, len(req.nodes))
        nodes = []
        for node_id in req.nodes:
            nodes.append(self.zk.getNode(node_id))
        self.assertEqual(nodes[0].type, ['fake-label3'])
        self.assertEqual(nodes[1].type, ['fake-label1'])
        self.assertEqual(nodes[2].type, ['fake-label4'])
        self.assertEqual(nodes[3].type, ['fake-label2'])

    def _test_node_assignment_at_quota(self,
                                       config,
                                       max_cores=100,
                                       max_instances=20,
                                       max_ram=1000000):
        '''
        Successful node launch should have unlocked nodes in READY state
        and assigned to the request. This should be run with a quota that
        fits for two nodes.
        '''

        # patch the cloud with requested quota
        def fake_get_quota():
            return (max_cores, max_instances, max_ram)
        self.useFixture(fixtures.MockPatchObject(
            fakeprovider.FakeProvider.fake_cloud, '_get_quota',
            fake_get_quota
        ))
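        # The fixture undoes the patch automatically when the test ends,
        # so the fake quota cannot leak into other tests.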

        configfile = self.setup_config(config)
        self.useBuilder(configfile)
        self.waitForImage('fake-provider', 'fake-image')

        nodepool.launcher.LOCK_CLEANUP = 1
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()
        self.wait_for_config(pool)

        client = pool.getProviderManager('fake-provider')._getClient()

        req1 = zk.NodeRequest()
        req1.state = zk.REQUESTED
        req1.node_types.append('fake-label')
        req1.node_types.append('fake-label')
        self.zk.storeNodeRequest(req1)

        self.log.debug("Waiting for 1st request %s", req1.id)
        req1 = self.waitForNodeRequest(req1, (zk.FULFILLED,))
        self.assertEqual(len(req1.nodes), 2)

        # Mark the first request's nodes as in use so they won't be deleted
        # when we pause. Locking them is enough.
        req1_node1 = self.zk.getNode(req1.nodes[0])
        req1_node2 = self.zk.getNode(req1.nodes[1])
        self.zk.lockNode(req1_node1, blocking=False)
        self.zk.lockNode(req1_node2, blocking=False)

        # One of the things we want to test is that if we spawn many
        # node launches at once, we do not deadlock while the request
        # handler pauses for quota. To ensure we test that case,
        # pause server creation until we have accepted all of the node
        # requests we submit. This will ensure that we hold locks on
        # all of the nodes before pausing so that we can validate they
        # are released.
        req2 = zk.NodeRequest()
        req2.state = zk.REQUESTED
        req2.node_types.append('fake-label')
        req2.node_types.append('fake-label')
        self.zk.storeNodeRequest(req2)
        req2 = self.waitForNodeRequest(req2, (zk.PENDING,))

        # At this point, we should have already created two servers for the
        # first request, and the request handler has accepted the second node
        # request but paused waiting for the server count to go below quota.
        # Wait until there is a paused request handler and check if there
        # are exactly two servers.
        pool_worker = pool.getPoolWorkers('fake-provider')
        while not pool_worker[0].paused_handler:
            time.sleep(0.1)
        self.assertEqual(len(client._server_list), 2)

        # Mark the first request's nodes as USED, which will get them deleted
        # and allow the second to proceed.
        self.log.debug("Marking first node as used %s", req1.id)
        req1_node1.state = zk.USED
        self.zk.storeNode(req1_node1)
        self.zk.unlockNode(req1_node1)
        self.waitForNodeDeletion(req1_node1)

        # To force the sequential nature of what we're testing, wait for
        # the 2nd request to get a node allocated to it now that we've
        # freed up a node.
        self.log.debug("Waiting for node allocation for 2nd request")
        done = False
        while not done:
            for n in self.zk.nodeIterator():
                if n.allocated_to == req2.id:
                    done = True
                    break

        self.log.debug("Marking second node as used %s", req1.id)
        req1_node2.state = zk.USED
        self.zk.storeNode(req1_node2)
        self.zk.unlockNode(req1_node2)
        self.waitForNodeDeletion(req1_node2)

        self.log.debug("Deleting 1st request %s", req1.id)
        self.zk.deleteNodeRequest(req1)
        self.waitForNodeRequestLockDeletion(req1.id)

        req2 = self.waitForNodeRequest(req2, (zk.FULFILLED,))
        self.assertEqual(len(req2.nodes), 2)
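
    # The wrappers below run the same scenario with the limit placed on a
    # different quota dimension each time: pool-level cores, instances, and
    # RAM, then cloud-level cores, instances, and RAM.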
    def test_node_assignment_at_pool_quota_cores(self):
        self._test_node_assignment_at_quota(
            config='node_quota_pool_cores.yaml')

    def test_node_assignment_at_pool_quota_instances(self):
        self._test_node_assignment_at_quota(
            config='node_quota_pool_instances.yaml')

    def test_node_assignment_at_pool_quota_ram(self):
        self._test_node_assignment_at_quota(
            config='node_quota_pool_ram.yaml')

    def test_node_assignment_at_cloud_cores_quota(self):
        self._test_node_assignment_at_quota(config='node_quota_cloud.yaml',
                                            max_cores=8,
                                            # check that -1 and inf work for
                                            # no quota
                                            max_instances=-1,
                                            max_ram=math.inf)

    def test_node_assignment_at_cloud_instances_quota(self):
        self._test_node_assignment_at_quota(config='node_quota_cloud.yaml',
                                            max_cores=math.inf,
                                            max_instances=2,
                                            max_ram=math.inf)

    def test_node_assignment_at_cloud_ram_quota(self):
        self._test_node_assignment_at_quota(config='node_quota_cloud.yaml',
                                            max_cores=math.inf,
                                            max_instances=math.inf,
                                            max_ram=2 * 8192)

    def test_over_quota(self, config='node_quota_cloud.yaml'):
        '''
        This tests what happens when a cloud unexpectedly returns an
        over-quota error.
        '''
        # Start with an instance quota of 2
        max_cores = math.inf
        max_instances = 2
        max_ram = math.inf

        # patch the cloud with requested quota
        def fake_get_quota():
            return (max_cores, max_instances, max_ram)
        self.useFixture(fixtures.MockPatchObject(
            fakeprovider.FakeProvider.fake_cloud, '_get_quota',
            fake_get_quota
        ))

        configfile = self.setup_config(config)
        self.useBuilder(configfile)
        self.waitForImage('fake-provider', 'fake-image')

        nodepool.launcher.LOCK_CLEANUP = 1
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()
        self.wait_for_config(pool)

        client = pool.getProviderManager('fake-provider')._getClient()

        # Wait for a single node to be created
        req1 = zk.NodeRequest()
        req1.state = zk.REQUESTED
        req1.node_types.append('fake-label')
        self.log.debug("Adding first request")
        self.zk.storeNodeRequest(req1)
        req1 = self.waitForNodeRequest(req1)
        self.assertEqual(req1.state, zk.FULFILLED)

        # Lock this node so it appears as used and not deleted
        req1_node = self.zk.getNode(req1.nodes[0])
        self.zk.lockNode(req1_node, blocking=False)

        # Now, reduce the quota so the next node unexpectedly
        # (according to nodepool's quota estimate) fails.
        client.max_instances = 1

        # Request a second node; this request should pause the handler.
        req2 = zk.NodeRequest()
        req2.state = zk.REQUESTED
        req2.node_types.append('fake-label')
        self.log.debug("Adding second request")
        self.zk.storeNodeRequest(req2)

        pool_worker = pool.getPoolWorkers('fake-provider')
        while not pool_worker[0].paused_handler:
            time.sleep(0.1)

        # The handler is paused now and the request should be in state PENDING
        req2 = self.waitForNodeRequest(req2, zk.PENDING)
        self.assertEqual(req2.state, zk.PENDING)

        # Now free up the first node
        self.log.debug("Marking first node as used %s", req1.id)
        req1_node.state = zk.USED
        self.zk.storeNode(req1_node)
        self.zk.unlockNode(req1_node)
        self.waitForNodeDeletion(req1_node)

        # After the first node is cleaned up, the second request should
        # now be fulfilled.
        req2 = self.waitForNodeRequest(req2)
        self.assertEqual(req2.state, zk.FULFILLED)
        self.assertEqual(len(client._server_list), 1)

    def test_fail_request_on_launch_failure(self):
        '''
        Test that provider launch error fails the request.
        '''
        configfile = self.setup_config('node_launch_retry.yaml')
        self.useBuilder(configfile)
        self.waitForImage('fake-provider', 'fake-image')

        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()
        self.wait_for_config(pool)
        manager = pool.getProviderManager('fake-provider')
        manager.createServer_fails = 2
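        # The fake provider decrements createServer_fails on each failed
        # launch attempt, so it reaching 0 below shows both failures were
        # actually exercised.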

        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)

        req = self.waitForNodeRequest(req)
        self.assertEqual(0, manager.createServer_fails)
        self.assertEqual(req.state, zk.FAILED)
        self.assertNotEqual(req.declined_by, [])

    def test_az_change_recover(self):
        '''
        Test that nodepool recovers from an az change in the cloud.
        '''
        configfile = self.setup_config('node_az_change.yaml')
        self.useBuilder(configfile)
        self.waitForImage('fake-provider', 'fake-image')

        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()
        self.wait_for_config(pool)

        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)

        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FULFILLED)

        # now change the azs in the cloud
        cloud = pool.getProviderManager('fake-provider')._getClient()
        cloud._azs = ['new-az1', 'new-az2']

        # Do a second request. This will fail because the cached azs are not
        # available anymore.
        # TODO(tobiash): Ideally we should already be able to recover
        # this request.
        req2 = zk.NodeRequest()
        req2.state = zk.REQUESTED
        req2.node_types.append('fake-label')
        self.zk.storeNodeRequest(req2)

        req2 = self.waitForNodeRequest(req2)
        self.assertEqual(req2.state, zk.FAILED)

        # Create a third request to test that nodepool successfully recovers
        # from a stale az cache.
        req3 = zk.NodeRequest()
        req3.state = zk.REQUESTED
        req3.node_types.append('fake-label')
        self.zk.storeNodeRequest(req3)

        req3 = self.waitForNodeRequest(req3)
        self.assertEqual(req3.state, zk.FULFILLED)

        node = self.zk.getNode(req3.nodes[0])
        self.assertIn(node.az, ['new-az1', 'new-az2'])

    def test_fail_minready_request_at_capacity(self):
        '''
        A min-ready request to a provider that is already at capacity should
        be declined.
        '''
        configfile = self.setup_config('node_min_ready_capacity.yaml')
        self.useBuilder(configfile)
        self.waitForImage('fake-provider', 'fake-image')

        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        # Get an initial node ready
        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append("fake-label")
        self.zk.storeNodeRequest(req)
        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FULFILLED)

        # Now simulate a min-ready request
        min_ready_req = zk.NodeRequest()
        min_ready_req.state = zk.REQUESTED
        min_ready_req.node_types.append("fake-label")
        min_ready_req.requestor = "NodePool:min-ready"
        self.zk.storeNodeRequest(min_ready_req)

        min_ready_req = self.waitForNodeRequest(min_ready_req)
        self.assertEqual(min_ready_req.state, zk.FAILED)
        self.assertNotEqual(min_ready_req.declined_by, [])

    def test_invalid_image_fails(self):
        '''
        Test that an invalid image declines and fails the request.
        '''
        configfile = self.setup_config('node.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append("zorky-zumba")
        self.zk.storeNodeRequest(req)

        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FAILED)
        self.assertNotEqual(req.declined_by, [])

    def test_node(self):
        """Test that an image and node are created"""
        configfile = self.setup_config('node.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        image = self.waitForImage('fake-provider', 'fake-image')
        self.assertEqual(image.username, 'zuul')
        nodes = self.waitForNodes('fake-label')

        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].provider, 'fake-provider')
        self.assertEqual(nodes[0].type, ['fake-label'])
        self.assertEqual(nodes[0].username, 'zuul')
        self.assertNotEqual(nodes[0].host_keys, [])
        self.assertEqual(nodes[0].attributes,
                         {'key1': 'value1', 'key2': 'value2'})

    def test_node_host_key_checking_false(self):
        """Test that a node is created without host keys when
        host-key-checking is disabled"""
        configfile = self.setup_config('node-host-key-checking.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        image = self.waitForImage('fake-provider', 'fake-image')
        self.assertEqual(image.username, 'zuul')
        nodes = self.waitForNodes('fake-label')

        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].provider, 'fake-provider')
        self.assertEqual(nodes[0].type, ['fake-label'])
        self.assertEqual(nodes[0].username, 'zuul')
        # We have no host_keys because host-key-checking is False.
        self.assertEqual(nodes[0].host_keys, [])

    def test_multiple_launcher(self):
        """Test that an image and node are created with 2 launchers"""
        # nodepool-builder needs access to both providers to upload images
        configfile = self.setup_config('node_two_provider.yaml')
        self.useBuilder(configfile)

        # Start up first launcher
        configfile1 = self.setup_config('node.yaml')
        pool1 = self.useNodepool(configfile1, watermark_sleep=1)
        pool1.start()

        # Start up second launcher
        configfile2 = self.setup_config('node_second_provider.yaml')
        pool2 = self.useNodepool(configfile2, watermark_sleep=1)
        pool2.start()

        # Validate we have images in both providers
        image1 = self.waitForImage('fake-provider', 'fake-image')
        self.assertEqual(image1.username, 'zuul')
        image2 = self.waitForImage('fake-provider2', 'fake-image')
        self.assertEqual(image2.username, 'zuul')

        # We don't need to check which provider launched the min-ready, just
        # that one was launched.
        nodes = self.waitForNodes('fake-label', 1)
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].type, ['fake-label'])
        self.assertEqual(nodes[0].username, 'zuul')
        self.assertNotEqual(nodes[0].host_keys, [])

    def test_node_boot_from_volume(self):
        """Test that an image and node are created from a volume"""
        configfile = self.setup_config('node_boot_from_volume.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider', 'fake-image')
        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].provider, 'fake-provider')
        self.assertEqual(nodes[0].type, ['fake-label'])

    def test_disabled_label(self):
        """Test that a node is not created with min-ready=0"""
        configfile = self.setup_config('node_disabled_label.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider', 'fake-image')
        self.assertEqual([], self.zk.getNodeRequests())
        self.assertEqual([], self.zk.getNodes())

    def test_node_net_name(self):
        """Test that a node is created with a net name"""
        configfile = self.setup_config('node_net_name.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider', 'fake-image')
        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].provider, 'fake-provider')
        self.assertEqual(nodes[0].type, ['fake-label'])
        self.assertEqual(nodes[0].username, 'zuul')

    def test_node_security_group(self):
        """Test that an image and node are created with sec_group specified"""
        configfile = self.setup_config('node_security_group.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider', 'fake-image')
        nodes = self.waitForNodes('fake-label')
        nodes_def_sg = self.waitForNodes('fake-label2')

        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].provider, 'fake-provider')
        self.assertEqual(len(nodes_def_sg), 1)
        self.assertEqual(nodes_def_sg[0].provider, 'fake-provider')
        client = pool.getProviderManager('fake-provider')._getClient()
        for server in client._server_list:
            if server.id == nodes[0].external_id:
                self.assertEqual(server.security_groups, ['fake-sg'])
            elif server.id == nodes_def_sg[0].external_id:
                self.assertEqual(server.security_groups, [])

    def test_node_flavor_name(self):
        """Test that a node is created with a flavor name"""
        configfile = self.setup_config('node_flavor_name.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider', 'fake-image')
        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].provider, 'fake-provider')
        self.assertEqual(nodes[0].type, ['fake-label'])

    def test_node_vhd_image(self):
        """Test that an image and node are created from a vhd image"""
        configfile = self.setup_config('node_vhd.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider', 'fake-image')
        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].provider, 'fake-provider')
        self.assertEqual(nodes[0].type, ['fake-label'])

    def test_node_vhd_and_qcow2(self):
        """Test label provided by vhd and qcow2 images builds"""
        configfile = self.setup_config('node_vhd_and_qcow2.yaml')
        self.useBuilder(configfile)
        p1_image = self.waitForImage('fake-provider1', 'fake-image')
        p2_image = self.waitForImage('fake-provider2', 'fake-image')

        # We can't guarantee which provider would build the requested
        # nodes, but that doesn't matter so much as guaranteeing that the
        # correct image type is uploaded to the correct provider.
        self.assertEqual(p1_image.format, "vhd")
        self.assertEqual(p2_image.format, "qcow2")

    def test_dib_upload_fail(self):
        """Test that an image upload failure is contained."""
        configfile = self.setup_config('node_upload_fail.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider2', 'fake-image')
        nodes = self.waitForNodes('fake-label', 2)
        self.assertEqual(len(nodes), 2)
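
        # nodeIterator walks every node znode, so this count covers all
        # providers and shows the provider with the failed upload launched
        # nothing.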
        total_nodes = sum(1 for _ in self.zk.nodeIterator())
        self.assertEqual(total_nodes, 2)
        self.assertEqual(nodes[0].provider, 'fake-provider2')
        self.assertEqual(nodes[0].type, ['fake-label'])
        self.assertEqual(nodes[0].username, 'zuul')
        self.assertEqual(nodes[1].provider, 'fake-provider2')
        self.assertEqual(nodes[1].type, ['fake-label'])
        self.assertEqual(nodes[1].username, 'zuul')

    def test_node_az(self):
        """Test that an image and node are created with az specified"""
        configfile = self.setup_config('node_az.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider', 'fake-image')
        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].provider, 'fake-provider')
        self.assertEqual(nodes[0].az, 'az1')

    def test_node_ipv6(self):
        """Test that ipv6 existence either way works fine."""
        configfile = self.setup_config('node_ipv6.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider1', 'fake-image')
        self.waitForImage('fake-provider2', 'fake-image')
        label1_nodes = self.waitForNodes('fake-label1')
        label2_nodes = self.waitForNodes('fake-label2')

        self.assertEqual(len(label1_nodes), 1)
        self.assertEqual(len(label2_nodes), 1)

        # ipv6 address available
        self.assertEqual(label1_nodes[0].provider, 'fake-provider1')
        self.assertEqual(label1_nodes[0].public_ipv4, 'fake')
        self.assertEqual(label1_nodes[0].public_ipv6, 'fake_v6')
        self.assertEqual(label1_nodes[0].interface_ip, 'fake_v6')

        # ipv6 address unavailable
        self.assertEqual(label2_nodes[0].provider, 'fake-provider2')
        self.assertEqual(label2_nodes[0].public_ipv4, 'fake')
        self.assertEqual(label2_nodes[0].public_ipv6, '')
        self.assertEqual(label2_nodes[0].interface_ip, 'fake')

    def test_node_delete_success(self):
        configfile = self.setup_config('node.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider', 'fake-image')
        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)
        self.assertEqual(zk.READY, nodes[0].state)
        self.assertEqual('fake-provider', nodes[0].provider)
        nodes[0].state = zk.DELETING
        self.zk.storeNode(nodes[0])

        # Wait for this one to be deleted
        self.waitForNodeDeletion(nodes[0])

        # Wait for a new one to take its place
        new_nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(new_nodes), 1)
        self.assertEqual(zk.READY, new_nodes[0].state)
        self.assertEqual('fake-provider', new_nodes[0].provider)
        self.assertNotEqual(nodes[0], new_nodes[0])

    def test_node_launch_retries(self):
        configfile = self.setup_config('node_launch_retry.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.wait_for_config(pool)
        manager = pool.getProviderManager('fake-provider')
        manager.createServer_fails = 2

        self.waitForImage('fake-provider', 'fake-image')

        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)

        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FAILED)

        # The config sets launch retries to 2, so there were exactly two
        # attempts to create a server, both of which failed.
        self.assertEqual(0, manager.createServer_fails)

    def test_node_delete_failure(self):
        def fail_delete(self, name):
            raise RuntimeError('Fake Error')

        self.useFixture(fixtures.MockPatchObject(
            fakeprovider.FakeProvider, 'deleteServer', fail_delete))

        configfile = self.setup_config('node.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider', 'fake-image')
        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)

        self.zk.lockNode(nodes[0], blocking=False)
        nodepool.launcher.NodeDeleter.delete(
            self.zk, pool.getProviderManager('fake-provider'), nodes[0])

        # Make sure our old node is in delete state, even though delete failed
        deleted_node = self.zk.getNode(nodes[0].id)
        self.assertIsNotNone(deleted_node)
        self.assertEqual(deleted_node.state, zk.DELETING)

        # Make sure we have a new, READY node
        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].provider, 'fake-provider')

    def test_node_delete_error(self):
        def error_delete(self, name):
            # Set ERROR status instead of deleting the node
            self._getClient()._server_list[0].status = 'ERROR'

        self.useFixture(fixtures.MockPatchObject(
            fakeprovider.FakeProvider, 'deleteServer', error_delete))

        configfile = self.setup_config('node_delete_error.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')

        # request a node
        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)

        self.log.debug("Wait for request")
        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FULFILLED)
        self.assertEqual(len(req.nodes), 1)

        # remove the node from the db
        self.log.debug("deleting node %s", req.nodes[0])
        node = self.zk.getNode(req.nodes[0])
        self.zk.deleteNode(node)

        # wait for the cleanup thread to kick in
        time.sleep(5)

        zk_nodes = self.zk.getNodes()
        self.assertEqual(len(zk_nodes), 1)
        node = self.zk.getNode(zk_nodes[0])
        self.assertEqual(node.state, zk.DELETING)

        # remove error nodes
        pool.getProviderManager(
            'fake-provider')._getClient()._server_list.clear()

    def test_leaked_node(self):
        """Test that a leaked node is deleted"""
        configfile = self.setup_config('leaked_node.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')
        self.log.debug("Waiting for initial pool...")
        nodes = self.waitForNodes('fake-label')
        self.log.debug("...done waiting for initial pool.")

        # Make sure we have a node built and ready
        self.assertEqual(len(nodes), 1)
        manager = pool.getProviderManager('fake-provider')
        servers = manager.listNodes()
        self.assertEqual(len(servers), 1)

        # Delete the node from ZooKeeper, but leave the instance
        # so it is leaked.
        self.log.debug("Delete node db record so instance is leaked...")
        self.zk.deleteNode(nodes[0])
        self.log.debug("...deleted node db so instance is leaked.")

        # Wait for nodepool to replace it
        self.log.debug("Waiting for replacement pool...")
        new_nodes = self.waitForNodes('fake-label')
        self.log.debug("...done waiting for replacement pool.")
        self.assertEqual(len(new_nodes), 1)

        # Wait for the instance to be cleaned up
        self.waitForInstanceDeletion(manager, nodes[0].external_id)

        # Make sure we end up with only one server (the replacement)
        servers = manager.listNodes()
        self.assertEqual(len(servers), 1)

    def test_max_ready_age(self):
        """Test a node with exceeded max-ready-age is deleted"""
        configfile = self.setup_config('node_max_ready_age.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')
        self.log.debug("Waiting for initial pool...")
        nodes = self.waitForNodes('fake-label')
        self.log.debug("...done waiting for initial pool.")

        # Wait for the instance to be cleaned up
        manager = pool.getProviderManager('fake-provider')
        self.waitForInstanceDeletion(manager, nodes[0].external_id)

    def test_max_hold_age(self):
        """Test a held node with exceeded max-hold-age is deleted"""
        configfile = self.setup_config('node_max_hold_age.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')
        self.log.debug("Waiting for initial pool...")
        nodes = self.waitForNodes('fake-label')
        self.log.debug("...done waiting for initial pool.")
        node = nodes[0]
        self.log.debug("Holding node %s..." % node.id)

        # hold the node
        self.zk.lockNode(node, blocking=False)
        node.state = zk.HOLD
        node.comment = 'testing'
        self.zk.storeNode(node)
        self.zk.unlockNode(node)
        znode = self.zk.getNode(node.id)
        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))

        # Wait for the instance to be cleaned up
        manager = pool.getProviderManager('fake-provider')
        self.waitForInstanceDeletion(manager, node.external_id)

    def test_hold_expiration_no_default(self):
        """Test a held node is deleted when past its operator-specified TTL,
        no max-hold-age set"""
        configfile = self.setup_config('node_max_hold_age_no_default.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')
        self.log.debug("Waiting for initial pool...")
        nodes = self.waitForNodes('fake-label')
        self.log.debug("...done waiting for initial pool.")
        node = nodes[0]
        self.log.debug("Holding node %s..." % node.id)

        # hold the node
        self.zk.lockNode(node, blocking=False)
        node.state = zk.HOLD
        node.comment = 'testing'
        node.hold_expiration = 1
        self.zk.storeNode(node)
        self.zk.unlockNode(node)
        znode = self.zk.getNode(node.id)
        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))

        # Wait for the instance to be cleaned up
        manager = pool.getProviderManager('fake-provider')
        self.waitForInstanceDeletion(manager, node.external_id)

    def test_hold_expiration_str_type(self):
        """Test a held node is deleted when past its operator-specified TTL,
        even when the type is bad"""
        configfile = self.setup_config('node_max_hold_age_no_default.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')
        self.log.debug("Waiting for initial pool...")
        nodes = self.waitForNodes('fake-label')
        self.log.debug("...done waiting for initial pool.")
        node = nodes[0]
        self.log.debug("Holding node %s..." % node.id)

        # hold the node
        self.zk.lockNode(node, blocking=False)
        node.state = zk.HOLD
        node.comment = 'testing'
        node.hold_expiration = '1'
        self.zk.storeNode(node)
        self.zk.unlockNode(node)
        znode = self.zk.getNode(node.id)
        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))

        # Wait for the instance to be cleaned up
        manager = pool.getProviderManager('fake-provider')
        self.waitForInstanceDeletion(manager, node.external_id)

    def test_hold_expiration_bad_type_coercion(self):
        """Test a held node uses default expiration value when type is bad"""
        configfile = self.setup_config('node_max_hold_age_no_default.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')
        self.log.debug("Waiting for initial pool...")
        nodes = self.waitForNodes('fake-label')
        self.log.debug("...done waiting for initial pool.")
        node = nodes[0]
        self.log.debug("Holding node %s..." % node.id)

        # hold the node
        self.zk.lockNode(node, blocking=False)
        node.state = zk.HOLD
        node.comment = 'testing'
        node.hold_expiration = 'notanumber'
        self.zk.storeNode(node)
        self.zk.unlockNode(node)
        znode = self.zk.getNode(node.id)
        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
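
        # A hold_expiration value that fails integer coercion is stored as 0,
        # which means "use the configured default".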
        self.assertEqual(znode.hold_expiration, 0)

    def test_hold_expiration_lower_than_default(self):
        """Test a held node is deleted when past its operator-specified TTL,
        with max-hold-age set in the configuration"""
        configfile = self.setup_config('node_max_hold_age_2.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')
        self.log.debug("Waiting for initial pool...")
        nodes = self.waitForNodes('fake-label', 2)
        self.log.debug("...done waiting for initial pool.")
        node_custom = nodes[0]
        # TODO make it a fraction of fixture's max-hold-age
        hold_expiration = 2
        node = nodes[1]
        self.log.debug("Holding node %s... (default)" % node.id)
        self.log.debug("Holding node %s...(%s seconds)" % (node_custom.id,
                                                           hold_expiration))

        # hold the nodes
        self.zk.lockNode(node, blocking=False)
        node.state = zk.HOLD
        node.comment = 'testing'
        self.zk.storeNode(node)
        self.zk.unlockNode(node)

        self.zk.lockNode(node_custom, blocking=False)
        node_custom.state = zk.HOLD
        node_custom.comment = 'testing hold_expiration'
        node_custom.hold_expiration = hold_expiration
        self.zk.storeNode(node_custom)
        self.zk.unlockNode(node_custom)

        znode = self.zk.getNode(node.id)
        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
        znode_custom = self.zk.getNode(node_custom.id)
        self.log.debug("Node %s in state '%s'" % (znode_custom.id,
                                                  znode_custom.state))

        # Wait for the instance to be cleaned up
        manager = pool.getProviderManager('fake-provider')
        self.waitForInstanceDeletion(manager, node_custom.external_id)

        # control node should still be held
        held_nodes = [n for n in self.zk.nodeIterator() if n.state == zk.HOLD]
        self.assertTrue(any(n.id == node.id for n in held_nodes),
                        held_nodes)

        # finally, control node gets deleted
        self.waitForInstanceDeletion(manager, node.external_id)

    def test_hold_expiration_higher_than_default(self):
        """Test a held node is deleted after max-hold-age seconds if the
        operator specifies a larger TTL"""
        configfile = self.setup_config('node_max_hold_age_2.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')
        self.log.debug("Waiting for initial pool...")
        nodes = self.waitForNodes('fake-label', 2)
        self.log.debug("...done waiting for initial pool.")
        node_custom = nodes[0]
        # Make hold expiration much larger than max hold age.
        hold_expiration = 180
        node = nodes[1]
        self.log.debug("Holding node %s... (default)" % node.id)
        self.log.debug("Holding node %s...(%s seconds)" % (node_custom.id,
                                                           hold_expiration))

        # hold the nodes
        self.zk.lockNode(node, blocking=False)
        node.state = zk.HOLD
        node.comment = 'testing'
        self.zk.storeNode(node)
        self.zk.unlockNode(node)

        self.zk.lockNode(node_custom, blocking=False)
        node_custom.state = zk.HOLD
        node_custom.comment = 'testing hold_expiration'
        node_custom.hold_expiration = hold_expiration
        self.zk.storeNode(node_custom)
        self.zk.unlockNode(node_custom)

        znode = self.zk.getNode(node.id)
        self.log.debug("Node %s in state '%s'" % (znode.id, znode.state))
        znode_custom = self.zk.getNode(node_custom.id)
        self.log.debug("Node %s in state '%s'" % (znode_custom.id,
                                                  znode_custom.state))

        # Wait for the instance to be cleaned up
        manager = pool.getProviderManager('fake-provider')
        self.waitForInstanceDeletion(manager, node.external_id)

        # The custom node should be deleted as well but it may be slightly
        # delayed after the other node. Because of that we have defined a
        # much higher hold time than the max hold age. So we can give
        # nodepool a few extra seconds to clean it up and still validate
        # that the max hold age is not violated.
        for _ in iterate_timeout(10, Exception, 'assert custom_node is gone'):
            try:
                held_nodes = [n for n in self.zk.nodeIterator(cached=False)
                              if n.state == zk.HOLD]
                self.assertEqual(0, len(held_nodes), held_nodes)
                break
            except AssertionError:
                # node still listed, retry
                pass

    def test_label_provider(self):
        """Test that only providers listed in the label satisfy the request"""
        configfile = self.setup_config('node_label_provider.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider', 'fake-image')
        self.waitForImage('fake-provider2', 'fake-image')
        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].provider, 'fake-provider2')

    def _create_pending_request(self):
        req = zk.NodeRequest()
        req.state = zk.PENDING
        req.requestor = 'test_nodepool'
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)

        # Create a node that is allocated to the request, but not yet assigned
        # within the NodeRequest object
        node = zk.Node()
        node.state = zk.READY
        node.type = 'fake-label'
        node.public_ipv4 = 'fake'
        node.provider = 'fake-provider'
        node.pool = 'main'
        node.allocated_to = req.id
        self.zk.storeNode(node)
        return (req, node)
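
    # _create_pending_request simulates a launcher that died after a node
    # was allocated but before the request was fulfilled; the next two tests
    # exercise the recovery paths for a lost request and a stale allocation.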
    def test_lost_requests(self):
        """Test a request left pending is reset and satisfied on restart"""
        (req, node) = self._create_pending_request()

        configfile = self.setup_config('node_lost_requests.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        self.waitForImage('fake-provider', 'fake-image')
        pool.start()
        req = self.waitForNodeRequest(req, (zk.FULFILLED,))

        # Since our config file has min-ready=0, we should be able to re-use
        # the previously assigned node, thus making sure that the cleanup
        # code reset the 'allocated_to' field.
        self.assertIn(node.id, req.nodes)

    def test_node_deallocation(self):
        """Test an allocated node with a missing request is deallocated"""
        node = zk.Node()
        node.state = zk.READY
        node.type = 'fake-label'
        node.public_ipv4 = 'fake'
        node.provider = 'fake-provider'
        node.allocated_to = "MISSING"
        self.zk.storeNode(node)

        configfile = self.setup_config('node_lost_requests.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()
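
        # Poll until the cleanup worker notices the missing request and
        # clears the stale allocated_to field; the test's overall timeout
        # guards against this loop spinning forever on failure.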
        while True:
            node = self.zk.getNode(node.id)
            if not node.allocated_to:
                break

    def test_multiple_pools(self):
        """Test that an image and node are created"""
        configfile = self.setup_config('multiple_pools.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider', 'fake-image')
        lab1 = self.waitForNodes('fake-label1')
        lab2 = self.waitForNodes('fake-label2')

        self.assertEqual(len(lab1), 1)
        self.assertEqual(lab1[0].provider, 'fake-provider')
        self.assertEqual(lab1[0].type, ['fake-label1'])
        self.assertEqual(lab1[0].az, 'az1')
        self.assertEqual(lab1[0].pool, 'pool1')

        self.assertEqual(len(lab2), 1)
        self.assertEqual(lab2[0].provider, 'fake-provider')
        self.assertEqual(lab2[0].type, ['fake-label2'])
        self.assertEqual(lab2[0].az, 'az2')
        self.assertEqual(lab2[0].pool, 'pool2')

    def test_unmanaged_image(self):
        """Test node launching using an unmanaged image"""
        configfile = self.setup_config('node_unmanaged_image.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        self.wait_for_config(pool)
        manager = pool.getProviderManager('fake-provider')
        manager._client.create_image(name="fake-image")
        manager._client.create_image(name="fake-image-windows")
        manager._client.create_image(name="fake-image-windows-port")

        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)
        self.assertIsNone(nodes[0].username)

        nodes = self.waitForNodes('fake-label-windows')
        self.assertEqual(len(nodes), 1)
        self.assertEqual('zuul', nodes[0].username)
        self.assertEqual('winrm', nodes[0].connection_type)
        self.assertEqual(5986, nodes[0].connection_port)
        self.assertEqual(nodes[0].host_keys, [])

        nodes = self.waitForNodes('fake-label-arbitrary-port')
        self.assertEqual(len(nodes), 1)
        self.assertEqual('zuul', nodes[0].username)
        self.assertEqual('winrm', nodes[0].connection_type)
        self.assertEqual(1234, nodes[0].connection_port)
        self.assertEqual(nodes[0].host_keys, [])

    def test_unmanaged_image_provider_name(self):
        """
        Test node launching using an unmanaged image referencing the
        image name as known by the provider.
        """
        configfile = self.setup_config('unmanaged_image_provider_name.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        self.wait_for_config(pool)
        manager = pool.getProviderManager('fake-provider')
        manager._client.create_image(name="provider-named-image")

        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)

    def test_unmanaged_image_provider_id(self):
        """
        Test node launching using an unmanaged image referencing the
        image ID as known by the provider.
        """
        configfile = self.setup_config('unmanaged_image_provider_id.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        self.log.debug("Waiting for node")
        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)

    def test_paused_gets_declined(self):
        """Test that a paused request, that later gets declined, unpauses."""
        # First config has max-servers set to 2
        configfile = self.setup_config('pause_declined_1.yaml')
        self.useBuilder(configfile)
        self.waitForImage('fake-provider', 'fake-image')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        # Create a request that uses all capacity (2 servers)
        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label')
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)
        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FULFILLED)
        self.assertEqual(len(req.nodes), 2)

        # Now that we have 2 nodes in use, create another request that
        # requests two nodes, which should cause the request to pause.
        req2 = zk.NodeRequest()
        req2.state = zk.REQUESTED
        req2.node_types.append('fake-label')
        req2.node_types.append('fake-label')
        self.zk.storeNodeRequest(req2)
        req2 = self.waitForNodeRequest(req2, (zk.PENDING,))

        # Second config decreases max-servers to 1
        self.replace_config(configfile, 'pause_declined_2.yaml')

        # Because the second request asked for 2 nodes, but that now exceeds
        # max-servers, req2 should get declined now, and transition to FAILED
        req2 = self.waitForNodeRequest(req2, (zk.FAILED,))
        self.assertNotEqual(req2.declined_by, [])

    def test_node_auto_floating_ip(self):
        """Test that the auto-floating-ip option works fine."""
        configfile = self.setup_config('node_auto_floating_ip.yaml')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        self.useBuilder(configfile)
        pool.start()

        self.waitForImage('fake-provider1', 'fake-image')
        self.waitForImage('fake-provider2', 'fake-image')
        self.waitForImage('fake-provider3', 'fake-image')
        label1_nodes = self.waitForNodes('fake-label1')
        label2_nodes = self.waitForNodes('fake-label2')
        label3_nodes = self.waitForNodes('fake-label3')

        self.assertEqual(1, len(label1_nodes))
        self.assertEqual(1, len(label2_nodes))
        self.assertEqual(1, len(label3_nodes))

        # auto-floating-ip: False
        self.assertEqual('fake-provider1', label1_nodes[0].provider)
        self.assertEqual('', label1_nodes[0].public_ipv4)
        self.assertEqual('', label1_nodes[0].public_ipv6)
        self.assertEqual('fake', label1_nodes[0].interface_ip)

        # auto-floating-ip: True
        self.assertEqual('fake-provider2', label2_nodes[0].provider)
        self.assertEqual('fake', label2_nodes[0].public_ipv4)
        self.assertEqual('', label2_nodes[0].public_ipv6)
        self.assertEqual('fake', label2_nodes[0].interface_ip)

        # auto-floating-ip: default value
        self.assertEqual('fake-provider3', label3_nodes[0].provider)
        self.assertEqual('fake', label3_nodes[0].public_ipv4)
        self.assertEqual('', label3_nodes[0].public_ipv6)
        self.assertEqual('fake', label3_nodes[0].interface_ip)

    def test_secure_file(self):
        """Test using secure.conf file"""
        configfile = self.setup_config('secure_file_config.yaml')
        securefile = self.setup_secure('secure_file_secure.yaml')
        pool = self.useNodepool(
            configfile,
            secure_conf=securefile,
            watermark_sleep=1)
        self.useBuilder(configfile, securefile=securefile)
        pool.start()
        self.wait_for_config(pool)

        fake_image = pool.config.diskimages['fake-image']
        self.assertIn('REG_PASSWORD', fake_image.env_vars)
        self.assertEqual('secret', fake_image.env_vars['REG_PASSWORD'])

        zk_servers = pool.config.zookeeper_servers
        self.assertEqual(1, len(zk_servers))
        key = list(zk_servers.keys())[0]
        self.assertEqual(self.zookeeper_host, zk_servers[key].host)
        self.assertEqual(self.zookeeper_port, zk_servers[key].port)
        self.assertEqual(self.zookeeper_chroot, zk_servers[key].chroot)

        image = self.waitForImage('fake-provider', 'fake-image')
        self.assertEqual(image.username, 'zuul')

        nodes = self.waitForNodes('fake-label')
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0].provider, 'fake-provider')
        self.assertEqual(nodes[0].type, ['fake-label'])
        self.assertEqual(nodes[0].username, 'zuul')
        self.assertNotEqual(nodes[0].host_keys, [])

    def test_provider_removal(self):
        """Test that removing a provider stops the worker thread"""
        configfile = self.setup_config('launcher_two_provider.yaml')
        self.useBuilder(configfile)
        pool = self.useNodepool(configfile, watermark_sleep=.5)
        pool.start()
        self.waitForNodes('fake-label')
        self.assertEqual(2, len(pool._pool_threads))

        self.replace_config(configfile, 'launcher_two_provider_remove.yaml')
        # Wait longer than our watermark_sleep time for the config to change
        time.sleep(1)
        self.assertEqual(1, len(pool._pool_threads))

    def test_failed_provider(self):
        """Test that a broken provider doesn't fail node requests."""
        configfile = self.setup_config('launcher_two_provider_max_1.yaml')
        self.useBuilder(configfile)
        pool = self.useNodepool(configfile, watermark_sleep=.5)
        pool.start()
        self.wait_for_config(pool)

        # Steady state at images available.
        self.waitForImage('fake-provider', 'fake-image')
        self.waitForImage('fake-provider2', 'fake-image')
        # We have now reached steady state and can manipulate the system to
        # test failing cloud behavior.

        # Make two requests so that the next requests are paused.
        # Note we use different provider-specific labels here to avoid
        # a race where a single provider fulfills both of these initial
        # requests.
        # fake-provider
        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label2')
        self.zk.storeNodeRequest(req)
        req = self.waitForNodeRequest(req, zk.FULFILLED)
        # fake-provider2
        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label3')
        self.zk.storeNodeRequest(req)
        req = self.waitForNodeRequest(req, zk.FULFILLED)

        nodes = map(pool.zk.getNode, pool.zk.getNodes())
        provider1_first = None
        provider2_first = None
        for node in nodes:
            if node.provider == 'fake-provider2':
                provider2_first = node
            elif node.provider == 'fake-provider':
                provider1_first = node

        # Mark the nodes as being used so they won't be deleted at pause.
        # Locking them is enough.
        self.zk.lockNode(provider1_first, blocking=False)
        self.zk.lockNode(provider2_first, blocking=False)

        # The next two requests will go pending, one for each provider.
        req1 = zk.NodeRequest()
        req1.state = zk.REQUESTED
        req1.node_types.append('fake-label')
        self.zk.storeNodeRequest(req1)
        req1 = self.waitForNodeRequest(req1, zk.PENDING)

        req2 = zk.NodeRequest()
        req2.state = zk.REQUESTED
        req2.node_types.append('fake-label')
        self.zk.storeNodeRequest(req2)
        req2 = self.waitForNodeRequest(req2, zk.PENDING)

        # Delete the node attached to provider2; this will cause provider2
        # to fulfill the request it had pending.
        provider2_first.state = zk.DELETING
        self.zk.storeNode(provider2_first)
        self.zk.unlockNode(provider2_first)
        self.waitForNodeDeletion(provider2_first)

        while True:
            # Wait for the provider2 node to be created. Also find the
            # request that was not fulfilled. This is the request that
            # fake-provider is pending on.
            req = self.zk.getNodeRequest(req1.id)
            if req.state == zk.FULFILLED:
                final_req = req2
                break
            req = self.zk.getNodeRequest(req2.id)
            if req.state == zk.FULFILLED:
                final_req = req1
                break

        provider2_second = None
        nodes = map(pool.zk.getNode, pool.zk.getNodes())
        for node in nodes:
            if (node and node.provider == 'fake-provider2' and
                    node.state == zk.READY):
                provider2_second = node
                break

        # Now delete the new node we had provider2 build. At this point,
        # the only provider with any requests is fake-provider.
        provider2_second.state = zk.DELETING
        self.zk.storeNode(provider2_second)

        # Set the provider1 launch method to raise an exception to simulate
        # a broken cloud. Note the pool worker instantiates request handlers
        # on demand, which is why we have a somewhat convoluted monkey patch
        # here. We must patch deep enough in the request handler that,
        # despite being paused, fake-provider will still trip over this code.
        pool_worker = pool.getPoolWorkers('fake-provider')[0]
        request_handler = pool_worker.request_handlers[0]

        def raise_KeyError(node):
            raise KeyError('fake-provider')

        request_handler.launch = raise_KeyError

        # Delete the instance in fake-provider. This should cause provider2
        # to service the request that was held pending by fake-provider.
        provider1_first.state = zk.DELETING
        self.zk.storeNode(provider1_first)
        self.zk.unlockNode(provider1_first)

        # The request is fulfilled by provider2.
        req = self.waitForNodeRequest(final_req)
        self.assertEqual(req.state, zk.FULFILLED)
        self.assertEqual(1, len(req.declined_by))
        self.assertIn('fake-provider-main', req.declined_by[0])

    def test_disabled_provider(self):
        '''
        A request should be able to fail even when a provider is disabled
        by setting max-servers to 0. Because we determine that all
        providers have declined a request by comparing its declined_by
        attribute to the list of registered launchers, each launcher must
        attempt to handle the request at least once, and thus decline it.
        '''
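        # A hedged sketch of how the fixture presumably disables the pool
        # (hypothetical; the real YAML may differ):
        #
        #   providers:
        #     - name: fake-provider
        #       pools:
        #         - name: main
        #           max-servers: 0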
        configfile = self.setup_config('disabled_provider.yaml')
        self.useBuilder(configfile)
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)
        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FAILED)

    def test_provider_wont_wedge(self):
        '''
        A provider should not wedge itself when it is at (1) maximum
        capacity (# registered nodes == max-servers), (2) all of its
        current nodes are not being used, and (3) a request comes in with
        a label that it does not yet have available. Normally, situation
        (3) combined with (1) would cause the provider to pause until
        capacity becomes available, but because of (2), it never will, and
        we would wedge the provider.
        '''
        configfile = self.setup_config('wedge_test.yaml')
        self.useBuilder(configfile)
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        # Wait for the fake-label1 min-ready request to be fulfilled, which
        # will put us at maximum capacity with max-servers of 1.
        label1_nodes = self.waitForNodes('fake-label1')
        self.assertEqual(1, len(label1_nodes))

        # Now we submit a request for fake-label2, which is not yet
        # available.
        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label2')
        self.zk.storeNodeRequest(req)

        # The provider should pause here to handle the fake-label2 request.
        # But because the fake-label1 node is not being used, and will never
        # be freed because we are paused and not handling additional
        # requests, the pool worker thread should recognize that and delete
        # the unused fake-label1 node for us. It can then fulfill the
        # fake-label2 request.
        self.waitForNodeDeletion(label1_nodes[0])
        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FULFILLED)

    def test_launcher_registers_config_change(self):
        '''
        Launchers register themselves and some config info with ZooKeeper.
        Validate that a config change will propagate to ZooKeeper.
        '''
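        # Each launcher's registration includes the set of labels it can
        # supply (supported_labels). After replace_config() below, the
        # polling loop waits for the re-registration to become visible in
        # ZooKeeper; there is no synchronous signal for this, hence the
        # sleep-and-poll.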
        configfile = self.setup_config('launcher_reg1.yaml')
        self.useBuilder(configfile)
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()
        self.waitForNodes('fake-label')

        launchers = self.zk.getRegisteredLaunchers()
        self.assertEqual(1, len(launchers))
        # The fake-label-unused label should not appear.
        self.assertEqual({'fake-label'}, launchers[0].supported_labels)

        self.replace_config(configfile, 'launcher_reg2.yaml')

        # We should get 1 additional label now.
        while launchers[0].supported_labels != {'fake-label', 'fake-label2'}:
            time.sleep(1)
            launchers = self.zk.getRegisteredLaunchers()

    @mock.patch('nodepool.driver.openstack.handler.'
                'OpenStackNodeLauncher._launchNode')
    def test_launchNode_session_expired(self, mock_launch):
        '''
        Test ZK session lost during _launchNode().
        '''
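        # kazoo raises SessionExpiredError when the ZooKeeper session is
        # lost; mocking _launchNode to raise it simulates losing the
        # session mid-launch. The single mock_launch call asserted below
        # shows the launcher does not retry in that case: the request is
        # failed and its znodes cleaned up instead.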
        mock_launch.side_effect = kze.SessionExpiredError()

        # Use a config with min-ready of 0
        configfile = self.setup_config('node_launch_retry.yaml')
        self.useBuilder(configfile)
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.cleanup_interval = 60
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')

        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)

        # A session loss during node launch should at least try to set the
        # request state to FAILED (in a non-test scenario, it may actually
        # be missing).
        req = self.waitForNodeRequest(req, states=(zk.FAILED,))
        self.assertEqual(1, mock_launch.call_count)

        # Any znodes created for the request should eventually get deleted.
        while self.zk.countPoolNodes('fake-provider', 'main'):
            time.sleep(0)

    def test_launchNode_delete_error(self):
        '''
        Test that the launcher keeps trying to spawn a node in case of a
        delete error.
        '''
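        # FakeLaunchAndDeleteFailCloud is assumed to fail the first launch
        # and the first delete attempt (times_to_fail=1 each) before
        # succeeding. A failed launch leaves a server behind that must be
        # deleted before the next attempt, so a delete error must not
        # abort the retry loop.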
        fake_client = fakeprovider.FakeLaunchAndDeleteFailCloud(
            times_to_fail=1)

        def get_fake_client(*args, **kwargs):
            return fake_client

        self.useFixture(fixtures.MockPatchObject(
            fakeprovider.FakeProvider, '_getClient',
            get_fake_client))

        configfile = self.setup_config('node_launch_retry.yaml')
        self.useBuilder(configfile)
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.cleanup_interval = 60
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')

        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)
        req = self.waitForNodeRequest(req)

        # The deletion of the node can be delayed, so wait for it.
        while not fake_client.delete_success:
            time.sleep(0.1)

        self.assertTrue(fake_client.launch_success)
        self.assertEqual(fake_client.times_to_fail_delete,
                         fake_client.times_failed_delete)
        self.assertEqual(fake_client.times_to_fail_launch,
                         fake_client.times_failed_launch)
        self.assertEqual(req.state, zk.FULFILLED)
        self.assertEqual(len(req.nodes), 1)

    @mock.patch(
        'nodepool.driver.openstack.handler.OpenStackNodeRequestHandler.poll')
    def test_handler_poll_session_expired(self, mock_poll):
        '''
        Test ZK session lost during handler poll().
        '''
        mock_poll.side_effect = kze.SessionExpiredError()

        # Use a config with min-ready of 0
        configfile = self.setup_config('node_launch_retry.yaml')
        self.useBuilder(configfile)
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.cleanup_interval = 60
        pool.start()
        self.waitForImage('fake-provider', 'fake-image')

        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)

        # A session loss during handler poll should at least remove the
        # request from the active handlers.
        req = self.waitForNodeRequest(req, states=(zk.PENDING,))
        self.assertEqual(1, mock_poll.call_count)
        self.assertEqual(0, len(
            pool._pool_threads["fake-provider-main"].request_handlers))

    def test_exception_causing_decline_of_paused_request(self):
        """
        Test that a paused request that later gets declined because of an
        exception (say, one thrown from a provider operation) unpauses and
        removes the request handler.
        """
        # First config has max-servers set to 2
        configfile = self.setup_config('pause_declined_1.yaml')
        self.useBuilder(configfile)
        self.waitForImage('fake-provider', 'fake-image')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        # Create a request that uses all capacity (2 servers)
        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label')
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)
        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FULFILLED)
        self.assertEqual(len(req.nodes), 2)

        # Now that we have 2 nodes in use, create another request that
        # requests two nodes, which should cause the request to pause.
        req2 = zk.NodeRequest()
        req2.state = zk.REQUESTED
        req2.node_types.append('fake-label')
        req2.node_types.append('fake-label')
        self.zk.storeNodeRequest(req2)
        req2 = self.waitForNodeRequest(req2, (zk.PENDING,))

        # Force an exception within the run handler.
        pool_worker = pool.getPoolWorkers('fake-provider')
        while not pool_worker[0].paused_handler:
            time.sleep(0.1)
        pool_worker[0].paused_handler.hasProviderQuota = mock.Mock(
            side_effect=Exception('mock exception'))

        # The above exception should cause us to fail the paused request.
        req2 = self.waitForNodeRequest(req2, (zk.FAILED,))
        self.assertNotEqual(req2.declined_by, [])

        # The exception handling should make sure that we unpause AND
        # remove the request handler.
        while pool_worker[0].paused_handler:
            time.sleep(0.1)
        self.assertEqual(0, len(pool_worker[0].request_handlers))

    def test_ignore_provider_quota_false(self):
        '''
        Test that a node request fails when ignore-provider-quota is set
        to false and the provider is out of quota.
        '''
        # Set max-cores quota value to 0 to force "out of quota". Note that
        # the fake provider checks the number of instances during server
        # creation to decide if it should throw an over quota exception,
        # but it doesn't check cores.
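        # The tuple returned below is assumed to mirror the fake cloud's
        # quota shape of (cores, instances, ram): cores=0 exhausts the
        # core quota while leaving instances and RAM plentiful.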
        def fake_get_quota():
            return (0, 20, 1000000)

        self.useFixture(fixtures.MockPatchObject(
            fakeprovider.FakeProvider.fake_cloud, '_get_quota',
            fake_get_quota
        ))

        configfile = self.setup_config('ignore_provider_quota_false.yaml')
        self.useBuilder(configfile)
        self.waitForImage('fake-provider', 'fake-image')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        # Create a request with ignore-provider-quota set to false; it
        # should fail because the provider declines it as exceeding quota.
        self.log.debug("Submitting request with ignore-provider-quota False")
        req = zk.NodeRequest()
        req.state = zk.REQUESTED
        req.node_types.append('fake-label')
        self.zk.storeNodeRequest(req)

        req = self.waitForNodeRequest(req)
        self.assertEqual(req.state, zk.FAILED)

    def test_ignore_provider_quota_true(self):
        '''
        Test that a node request gets fulfilled with ignore-provider-quota
        set to true, even when the provider is out of quota.
        '''
        # Set max-cores quota value to 0 to force "out of quota". Note that
        # the fake provider checks the number of instances during server
        # creation to decide if it should throw an over quota exception,
        # but it doesn't check cores.
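        # As in the previous test, the (0, 20, 1000000) tuple is assumed
        # to be (cores, instances, ram); with ignore-provider-quota: true
        # the launcher should disregard the exhausted core quota entirely.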
        def fake_get_quota():
            return (0, 20, 1000000)

        self.useFixture(fixtures.MockPatchObject(
            fakeprovider.FakeProvider.fake_cloud, '_get_quota',
            fake_get_quota
        ))

        configfile = self.setup_config('ignore_provider_quota_true.yaml')
        self.useBuilder(configfile)
        self.waitForImage('fake-provider', 'fake-image')
        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        # Create a request with ignore-provider-quota set to true; it
        # should pass regardless of the lack of cloud/provider quota.
        self.replace_config(configfile, 'ignore_provider_quota_true.yaml')
        self.log.debug(
            "Submitting an initial request with ignore-provider-quota True")
        req1 = zk.NodeRequest()
        req1.state = zk.REQUESTED
        req1.node_types.append('fake-label')
        self.zk.storeNodeRequest(req1)
        req1 = self.waitForNodeRequest(req1)
        self.assertEqual(req1.state, zk.FULFILLED)

        # Lock this node so it appears as used and is not deleted
        req1_node = self.zk.getNode(req1.nodes[0])
        self.zk.lockNode(req1_node, blocking=False)

        # Request a second node; this request should pause the handler
        # because the pool is configured with max-servers: 1.
        req2 = zk.NodeRequest()
        req2.state = zk.REQUESTED
        req2.node_types.append('fake-label')
        self.log.debug(
            "Submitting a second request with ignore-provider-quota True "
            "but with a full max-servers quota.")
        self.zk.storeNodeRequest(req2)

        pool_worker = pool.getPoolWorkers('fake-provider')
        while not pool_worker[0].paused_handler:
            time.sleep(0.1)

        # The handler is paused now and the request should be in state
        # PENDING.
        req2 = self.waitForNodeRequest(req2, zk.PENDING)
        self.assertEqual(req2.state, zk.PENDING)

        # Now free up the first node
        self.log.debug("Marking first node as used %s", req1.id)
        req1_node.state = zk.USED
        self.zk.storeNode(req1_node)
        self.zk.unlockNode(req1_node)
        self.waitForNodeDeletion(req1_node)

        # After the first node is cleaned up, the second request should be
        # fulfilled.
        req2 = self.waitForNodeRequest(req2)
        self.assertEqual(req2.state, zk.FULFILLED)

        # Lock this node so it appears as used and is not deleted
        req2_node = self.zk.getNode(req2.nodes[0])
        self.zk.lockNode(req2_node, blocking=False)

        # Now free up the second node
        self.log.debug("Marking second node as used %s", req2.id)
        req2_node.state = zk.USED
        self.zk.storeNode(req2_node)
        self.zk.unlockNode(req2_node)
        self.waitForNodeDeletion(req2_node)

        # Request a 2-node set; this request should fail because the
        # provider can only fulfill a single node at a time.
        req3 = zk.NodeRequest()
        req3.state = zk.REQUESTED
        req3.node_types.append('fake-label')
        req3.node_types.append('fake-label')
        self.log.debug(
            "Submitting a third request with ignore-provider-quota True "
            "for a 2-node set which the provider cannot fulfill.")
        self.zk.storeNodeRequest(req3)

        req3 = self.waitForNodeRequest(req3)
        self.assertEqual(req3.state, zk.FAILED)

    def test_request_order(self):
        """Test that requests are handled in sorted order"""
        configfile = self.setup_config('node_no_min_ready.yaml')
        self.useBuilder(configfile)
        image = self.waitForImage('fake-provider', 'fake-image')
        self.assertEqual(image.username, 'zuul')

        req1 = zk.NodeRequest()
        req1.state = zk.REQUESTED
        req1.node_types.append('fake-label')
        req1.relative_priority = 2
        self.zk.storeNodeRequest(req1)

        req2 = zk.NodeRequest()
        req2.state = zk.REQUESTED
        req2.node_types.append('fake-label')
        req2.relative_priority = 1
        self.zk.storeNodeRequest(req2)

        pool = self.useNodepool(configfile, watermark_sleep=1)
        pool.start()

        req2 = self.waitForNodeRequest(req2)
        self.assertEqual(req2.state, zk.FULFILLED)
        req1 = self.waitForNodeRequest(req1)
        self.assertEqual(req1.state, zk.FULFILLED)
        self.assertTrue(req2.id > req1.id)
        self.assertTrue(req2.state_time < req1.state_time)