OpenStack Compute (Nova)

manager.py 400KB

# Copyright 2010 United States Government as represented by the
# Administrator of the National Aeronautics and Space Administration.
# Copyright 2011 Justin Santa Barbara
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

"""Handles all processes relating to instances (guest vms).

The :py:class:`ComputeManager` class is a :py:class:`nova.manager.Manager` that
handles RPC calls relating to creating instances. It is responsible for
building a disk image, launching it via the underlying virtualization driver,
responding to calls to check its state, attaching persistent storage, and
terminating it.
"""

import base64
import binascii
import contextlib
import functools
import inspect
import sys
import time
import traceback

from cinderclient import exceptions as cinder_exception
from cursive import exception as cursive_exception
import eventlet.event
from eventlet import greenthread
import eventlet.semaphore
import eventlet.timeout
import futurist
from keystoneauth1 import exceptions as keystone_exception
from oslo_log import log as logging
import oslo_messaging as messaging
from oslo_serialization import jsonutils
from oslo_service import loopingcall
from oslo_service import periodic_task
from oslo_utils import excutils
from oslo_utils import strutils
from oslo_utils import timeutils
from oslo_utils import units
import six
from six.moves import range

from nova import block_device
from nova.cells import rpcapi as cells_rpcapi
from nova import compute
from nova.compute import build_results
from nova.compute import claims
from nova.compute import power_state
from nova.compute import resource_tracker
from nova.compute import rpcapi as compute_rpcapi
from nova.compute import task_states
from nova.compute import utils as compute_utils
from nova.compute.utils import wrap_instance_event
from nova.compute import vm_states
from nova import conductor
import nova.conf
from nova.console import rpcapi as console_rpcapi
import nova.context
from nova import exception
from nova import exception_wrapper
from nova import hooks
from nova.i18n import _
from nova import image
from nova import manager
from nova import network
from nova.network import base_api as base_net_api
from nova.network import model as network_model
from nova.network.security_group import openstack_driver
from nova import objects
from nova.objects import base as obj_base
from nova.objects import fields
from nova.objects import instance as obj_instance
from nova.objects import migrate_data as migrate_data_obj
from nova.pci import whitelist
from nova import rpc
from nova import safe_utils
from nova.scheduler.client import query
from nova import utils
from nova.virt import block_device as driver_block_device
from nova.virt import configdrive
from nova.virt import driver
from nova.virt import event as virtevent
from nova.virt import storage_users
from nova.virt import virtapi
from nova.volume import cinder

CONF = nova.conf.CONF

LOG = logging.getLogger(__name__)

get_notifier = functools.partial(rpc.get_notifier, service='compute')
wrap_exception = functools.partial(exception_wrapper.wrap_exception,
                                   get_notifier=get_notifier,
                                   binary='nova-compute')


@contextlib.contextmanager
def errors_out_migration_ctxt(migration):
    """Context manager to error out migration on failure."""

    try:
        yield
    except Exception:
        with excutils.save_and_reraise_exception():
            if migration:
                # We may have been passed None for our migration if we're
                # receiving from an older client. The migration will be
                # errored via the legacy path.
                migration.status = 'error'
                try:
                    with migration.obj_as_admin():
                        migration.save()
                except Exception:
                    LOG.debug(
                        'Error setting migration status for instance %s.',
                        migration.instance_uuid, exc_info=True)
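
# Illustrative sketch, not part of the original module: typical use of the
# context manager above. Any exception raised inside the block marks the
# migration record as 'error' before the exception is re-raised; passing None
# simply re-raises. The '_example_*' name and 'move_operation' callable are
# hypothetical.
def _example_errors_out_migration_ctxt(migration, move_operation):
    with errors_out_migration_ctxt(migration):
        return move_operation()
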
@utils.expects_func_args('migration')
def errors_out_migration(function):
    """Decorator to error out migration on failure."""
    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        wrapped_func = safe_utils.get_wrapped_function(function)
        keyed_args = inspect.getcallargs(wrapped_func, self, context,
                                         *args, **kwargs)
        migration = keyed_args['migration']
        with errors_out_migration_ctxt(migration):
            return function(self, context, *args, **kwargs)
    return decorated_function
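
# Illustrative sketch, not part of the original module: errors_out_migration
# is applied to manager-style methods that accept a 'migration' argument, so
# any failure flips that migration record to 'error' before the exception
# propagates. The class and method names below are hypothetical.
class _ExampleMigrationUser(object):
    @errors_out_migration
    def example_finish_resize(self, context, instance, migration):
        raise exception.NovaException('simulated failure')
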
@utils.expects_func_args('instance')
def reverts_task_state(function):
    """Decorator to revert task_state on failure."""
    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        try:
            return function(self, context, *args, **kwargs)
        except exception.UnexpectedTaskStateError as e:
            # Note(maoy): unexpected task state means the current
            # task is preempted. Do not clear task state in this
            # case.
            with excutils.save_and_reraise_exception():
                LOG.info("Task possibly preempted: %s",
                         e.format_message())
        except Exception:
            with excutils.save_and_reraise_exception():
                wrapped_func = safe_utils.get_wrapped_function(function)
                keyed_args = inspect.getcallargs(wrapped_func, self, context,
                                                 *args, **kwargs)
                # NOTE(mriedem): 'instance' must be in keyed_args because we
                # have utils.expects_func_args('instance') decorating this
                # method.
                instance = keyed_args['instance']
                original_task_state = instance.task_state
                try:
                    self._instance_update(context, instance, task_state=None)
                    LOG.info("Successfully reverted task state from %s on "
                             "failure for instance.",
                             original_task_state, instance=instance)
                except exception.InstanceNotFound:
                    # We might delete an instance that failed to build shortly
                    # after it errored out; this is an expected case and we
                    # should not trace on it.
                    pass
                except Exception as e:
                    LOG.warning("Failed to revert task state for instance. "
                                "Error: %s", e, instance=instance)
    return decorated_function
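
# Illustrative sketch, not part of the original module: reverts_task_state
# wraps manager methods that take an 'instance' argument and resets
# instance.task_state to None (via self._instance_update) when the wrapped
# method fails with anything other than UnexpectedTaskStateError. The class
# below is hypothetical and only shows the expected method shape.
class _ExampleTaskStateUser(object):
    def _instance_update(self, context, instance, **kwargs):
        # Stand-in for ComputeManager._instance_update defined further below.
        for k, v in kwargs.items():
            setattr(instance, k, v)

    @reverts_task_state
    def example_reboot(self, context, instance):
        raise exception.NovaException('simulated failure')
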
@utils.expects_func_args('instance')
def wrap_instance_fault(function):
    """Wraps a method to catch exceptions related to instances.

    This decorator wraps a method to catch any exceptions having to do with
    an instance that may get thrown. It then logs an instance fault in the db.
    """
    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        try:
            return function(self, context, *args, **kwargs)
        except exception.InstanceNotFound:
            raise
        except Exception as e:
            # NOTE(gtt): If argument 'instance' is in args rather than kwargs,
            # we will get a KeyError exception which will cover up the real
            # exception. So, we update kwargs with the values from args first;
            # then we can get 'instance' from kwargs easily.
            kwargs.update(dict(zip(function.__code__.co_varnames[2:], args)))
            with excutils.save_and_reraise_exception():
                compute_utils.add_instance_fault_from_exc(context,
                        kwargs['instance'], e, sys.exc_info())
    return decorated_function
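
# Illustrative sketch, not part of the original module: a method decorated
# with wrap_instance_fault records an instance fault in the database (via
# compute_utils.add_instance_fault_from_exc) for any exception other than
# InstanceNotFound, then re-raises it. The class below is hypothetical.
class _ExampleFaultUser(object):
    @wrap_instance_fault
    def example_action(self, context, instance):
        raise exception.NovaException('simulated failure')
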
@utils.expects_func_args('image_id', 'instance')
def delete_image_on_error(function):
    """Used for snapshot-related methods to ensure the image created in
    compute.api is deleted when an error occurs.
    """
    @functools.wraps(function)
    def decorated_function(self, context, image_id, instance,
                           *args, **kwargs):
        try:
            return function(self, context, image_id, instance,
                            *args, **kwargs)
        except Exception:
            with excutils.save_and_reraise_exception():
                LOG.debug("Cleaning up image %s", image_id,
                          exc_info=True, instance=instance)
                try:
                    self.image_api.delete(context, image_id)
                except exception.ImageNotFound:
                    # Since we're trying to clean up an image, we don't care
                    # if it's already gone.
                    pass
                except Exception:
                    LOG.exception("Error while trying to clean up image %s",
                                  image_id, instance=instance)
    return decorated_function
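
# Illustrative sketch, not part of the original module: delete_image_on_error
# targets snapshot-style methods whose signature begins with
# (self, context, image_id, instance); if the method raises, the partially
# created image is deleted through self.image_api. The class name below is
# hypothetical, and 'image_api' stands in for whatever image API object the
# caller wires in.
class _ExampleSnapshotUser(object):
    def __init__(self, image_api):
        self.image_api = image_api

    @delete_image_on_error
    def example_snapshot(self, context, image_id, instance):
        raise exception.NovaException('simulated failure')
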
# TODO(danms): Remove me after Icehouse
# TODO(alaski): Actually remove this after Newton, assuming a major RPC bump
# NOTE(mikal): if the method being decorated has more than one decorator, then
# put this one first. Otherwise the various exception handling decorators do
# not function correctly.
def object_compat(function):
    """Wraps a method that expects a new-world instance

    This provides compatibility for callers passing old-style dict
    instances.
    """
    @functools.wraps(function)
    def decorated_function(self, context, *args, **kwargs):
        def _load_instance(instance_or_dict):
            if isinstance(instance_or_dict, dict):
                # try to get metadata and system_metadata for most cases but
                # only attempt to load those if the db instance already has
                # those fields joined
                metas = [meta for meta in ('metadata', 'system_metadata')
                         if meta in instance_or_dict]
                instance = objects.Instance._from_db_object(
                    context, objects.Instance(), instance_or_dict,
                    expected_attrs=metas)
                instance._context = context
                return instance
            return instance_or_dict

        try:
            kwargs['instance'] = _load_instance(kwargs['instance'])
        except KeyError:
            args = (_load_instance(args[0]),) + args[1:]

        migration = kwargs.get('migration')
        if isinstance(migration, dict):
            migration = objects.Migration._from_db_object(
                    context.elevated(), objects.Migration(),
                    migration)
            kwargs['migration'] = migration

        return function(self, context, *args, **kwargs)
    return decorated_function
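
# Illustrative sketch, not part of the original module: object_compat lets a
# method written against objects.Instance also accept a legacy dict payload;
# the dict is hydrated into an objects.Instance before the method body runs.
# The class below is hypothetical.
class _ExampleObjectCompatUser(object):
    @object_compat
    def example_method(self, context, instance=None):
        # By the time we get here, 'instance' is an objects.Instance even if
        # the caller passed an old-style dict.
        return instance.uuid
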
class InstanceEvents(object):
    def __init__(self):
        self._events = {}

    @staticmethod
    def _lock_name(instance):
        return '%s-%s' % (instance.uuid, 'events')

    def prepare_for_instance_event(self, instance, name, tag):
        """Prepare to receive an event for an instance.

        This will register an event for the given instance that we will
        wait on later. This should be called before initiating whatever
        action will trigger the event. The resulting eventlet.event.Event
        object should be wait()'d on to ensure completion.

        :param instance: the instance for which the event will be generated
        :param name: the name of the event we're expecting
        :param tag: the tag associated with the event we're expecting
        :returns: an event object that should be wait()'d on
        """
        if self._events is None:
            # NOTE(danms): We really should have a more specific error
            # here, but this is what we use for our default error case
            raise exception.NovaException('In shutdown, no new events '
                                          'can be scheduled')

        @utils.synchronized(self._lock_name(instance))
        def _create_or_get_event():
            instance_events = self._events.setdefault(instance.uuid, {})
            return instance_events.setdefault((name, tag),
                                              eventlet.event.Event())
        LOG.debug('Preparing to wait for external event %(name)s-%(tag)s',
                  {'name': name, 'tag': tag}, instance=instance)
        return _create_or_get_event()

    def pop_instance_event(self, instance, event):
        """Remove a pending event from the wait list.

        This will remove a pending event from the wait list so that it
        can be used to signal the waiters to wake up.

        :param instance: the instance for which the event was generated
        :param event: the nova.objects.external_event.InstanceExternalEvent
                      that describes the event
        :returns: the eventlet.event.Event object on which the waiters
                  are blocked
        """
        no_events_sentinel = object()
        no_matching_event_sentinel = object()

        @utils.synchronized(self._lock_name(instance))
        def _pop_event():
            if self._events is None:
                LOG.debug('Unexpected attempt to pop events during shutdown',
                          instance=instance)
                return no_events_sentinel
            events = self._events.get(instance.uuid)
            if not events:
                return no_events_sentinel
            _event = events.pop((event.name, event.tag), None)
            if not events:
                del self._events[instance.uuid]
            if _event is None:
                return no_matching_event_sentinel
            return _event

        result = _pop_event()
        if result is no_events_sentinel:
            LOG.debug('No waiting events found dispatching %(event)s',
                      {'event': event.key},
                      instance=instance)
            return None
        elif result is no_matching_event_sentinel:
            LOG.debug('No event matching %(event)s in %(events)s',
                      {'event': event.key,
                       'events': self._events.get(instance.uuid, {}).keys()},
                      instance=instance)
            return None
        else:
            return result

    def clear_events_for_instance(self, instance):
        """Remove all pending events for an instance.

        This will remove all events currently pending for an instance
        and return them (indexed by event name).

        :param instance: the instance for which events should be purged
        :returns: a dictionary of {event_name: eventlet.event.Event}
        """
        @utils.synchronized(self._lock_name(instance))
        def _clear_events():
            if self._events is None:
                LOG.debug('Unexpected attempt to clear events during shutdown',
                          instance=instance)
                return dict()
            # NOTE(danms): We have historically returned the raw internal
            # format here, which is {event.key: [events, ...]} so just
            # trivially convert it here.
            return {'%s-%s' % k: e
                    for k, e in self._events.pop(instance.uuid, {}).items()}
        return _clear_events()

    def cancel_all_events(self):
        if self._events is None:
            LOG.debug('Unexpected attempt to cancel events during shutdown.')
            return
        our_events = self._events
        # NOTE(danms): Block new events
        self._events = None

        for instance_uuid, events in our_events.items():
            for (name, tag), eventlet_event in events.items():
                LOG.debug('Canceling in-flight event %(name)s-%(tag)s for '
                          'instance %(instance_uuid)s',
                          {'name': name,
                           'tag': tag,
                           'instance_uuid': instance_uuid})
                event = objects.InstanceExternalEvent(
                    instance_uuid=instance_uuid,
                    name=name, status='failed',
                    tag=tag, data={})
                eventlet_event.send(event)
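
# Illustrative sketch, not part of the original module: the intended
# prepare-then-pop flow for InstanceEvents. A waiter registers interest
# before triggering the external action, and the event-handling side later
# pops the matching entry and send()s it to wake the waiter. 'external_event'
# is assumed to be an objects.InstanceExternalEvent; 'instance' an
# objects.Instance.
def _example_instance_events_flow(instance, external_event):
    tracker = InstanceEvents()
    waitable = tracker.prepare_for_instance_event(
        instance, external_event.name, external_event.tag)
    # ... here the caller would start whatever action emits the event ...
    pending = tracker.pop_instance_event(instance, external_event)
    if pending is not None:
        pending.send(external_event)
    return waitable.wait()
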
class ComputeVirtAPI(virtapi.VirtAPI):
    def __init__(self, compute):
        super(ComputeVirtAPI, self).__init__()
        self._compute = compute

    def _default_error_callback(self, event_name, instance):
        raise exception.NovaException(_('Instance event failed'))

    @contextlib.contextmanager
    def wait_for_instance_event(self, instance, event_names, deadline=300,
                                error_callback=None):
        """Plan to wait for some events, run some code, then wait.

        This context manager will first create plans to wait for the
        provided event_names, yield, and then wait for all the scheduled
        events to complete.

        Note that this uses an eventlet.timeout.Timeout to bound the
        operation, so callers should be prepared to catch that
        failure and handle that situation appropriately.

        If the event is not received by the specified timeout deadline,
        eventlet.timeout.Timeout is raised.

        If the event is received but did not have a 'completed'
        status, a NovaException is raised. If an error_callback is
        provided, instead of raising an exception as detailed above
        for the failure case, the callback will be called with the
        event_name and instance, and can return True to continue
        waiting for the rest of the events, False to stop processing,
        or raise an exception which will bubble up to the waiter.

        :param instance: The instance for which an event is expected
        :param event_names: A list of event names. Each element is a
                            tuple of strings to indicate (name, tag),
                            where name is required, but tag may be None.
        :param deadline: Maximum number of seconds we should wait for all
                         of the specified events to arrive.
        :param error_callback: A function to be called if an event arrives
        """
        if error_callback is None:
            error_callback = self._default_error_callback
        events = {}
        for event_name in event_names:
            name, tag = event_name
            event_name = objects.InstanceExternalEvent.make_key(name, tag)
            try:
                events[event_name] = (
                    self._compute.instance_events.prepare_for_instance_event(
                        instance, name, tag))
            except exception.NovaException:
                error_callback(event_name, instance)
                # NOTE(danms): Don't wait for any of the events. They
                # should all be canceled and fired immediately below,
                # but don't stick around if not.
                deadline = 0
        yield
        with eventlet.timeout.Timeout(deadline):
            for event_name, event in events.items():
                actual_event = event.wait()
                if actual_event.status == 'completed':
                    continue
                decision = error_callback(event_name, instance)
                if decision is False:
                    break
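
# Illustrative sketch, not part of the original module: how a caller holding a
# ComputeVirtAPI typically uses wait_for_instance_event. The 'virtapi'
# argument, 'port_id', and the network-vif-plugged name/tag pair are
# assumptions for illustration only.
def _example_wait_for_vif_plugged(virtapi, instance, port_id):
    events = [('network-vif-plugged', port_id)]
    try:
        with virtapi.wait_for_instance_event(instance, events, deadline=300):
            # Start the operation that will eventually trigger the event,
            # e.g. plugging the instance's VIFs on the hypervisor.
            pass
    except eventlet.timeout.Timeout:
        LOG.warning('Timed out waiting for network-vif-plugged',
                    instance=instance)
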
  420. class ComputeManager(manager.Manager):
  421. """Manages the running instances from creation to destruction."""
  422. target = messaging.Target(version='5.1')
  423. def __init__(self, compute_driver=None, *args, **kwargs):
  424. """Load configuration options and connect to the hypervisor."""
  425. self.virtapi = ComputeVirtAPI(self)
  426. self.network_api = network.API()
  427. self.volume_api = cinder.API()
  428. self.image_api = image.API()
  429. self._last_host_check = 0
  430. self._last_bw_usage_poll = 0
  431. self._bw_usage_supported = True
  432. self._last_bw_usage_cell_update = 0
  433. self.compute_api = compute.API()
  434. self.compute_rpcapi = compute_rpcapi.ComputeAPI()
  435. self.conductor_api = conductor.API()
  436. self.compute_task_api = conductor.ComputeTaskAPI()
  437. self.is_neutron_security_groups = (
  438. openstack_driver.is_neutron_security_groups())
  439. self.cells_rpcapi = cells_rpcapi.CellsAPI()
  440. self.query_client = query.SchedulerQueryClient()
  441. self.instance_events = InstanceEvents()
  442. self._sync_power_pool = eventlet.GreenPool(
  443. size=CONF.sync_power_state_pool_size)
  444. self._syncs_in_progress = {}
  445. self.send_instance_updates = (
  446. CONF.filter_scheduler.track_instance_changes)
  447. if CONF.max_concurrent_builds != 0:
  448. self._build_semaphore = eventlet.semaphore.Semaphore(
  449. CONF.max_concurrent_builds)
  450. else:
  451. self._build_semaphore = compute_utils.UnlimitedSemaphore()
  452. if max(CONF.max_concurrent_live_migrations, 0) != 0:
  453. self._live_migration_executor = futurist.GreenThreadPoolExecutor(
  454. max_workers=CONF.max_concurrent_live_migrations)
  455. else:
  456. if CONF.max_concurrent_live_migrations < 0:
  457. LOG.warning('The value of the max_concurrent_live_migrations '
  458. 'config option is less than 0. '
  459. 'It is treated as 0 and will raise ValueError '
  460. 'in a future release.')
  461. self._live_migration_executor = futurist.GreenThreadPoolExecutor()
  462. # This is a dict, keyed by instance uuid, to a two-item tuple of
  463. # migration object and Future for the queued live migration.
  464. self._waiting_live_migrations = {}
  465. super(ComputeManager, self).__init__(service_name="compute",
  466. *args, **kwargs)
  467. # NOTE(russellb) Load the driver last. It may call back into the
  468. # compute manager via the virtapi, so we want it to be fully
  469. # initialized before that happens.
  470. self.driver = driver.load_compute_driver(self.virtapi, compute_driver)
  471. self.use_legacy_block_device_info = \
  472. self.driver.need_legacy_block_device_info
  473. self.rt = resource_tracker.ResourceTracker(self.host, self.driver)
  474. self.reportclient = self.rt.reportclient
  475. def reset(self):
  476. LOG.info('Reloading compute RPC API')
  477. compute_rpcapi.LAST_VERSION = None
  478. self.compute_rpcapi = compute_rpcapi.ComputeAPI()
  479. self.reportclient.clear_provider_cache()
  480. def _update_resource_tracker(self, context, instance):
  481. """Let the resource tracker know that an instance has changed state."""
  482. if instance.host == self.host:
  483. self.rt.update_usage(context, instance, instance.node)
  484. def _instance_update(self, context, instance, **kwargs):
  485. """Update an instance in the database using kwargs as value."""
  486. for k, v in kwargs.items():
  487. setattr(instance, k, v)
  488. instance.save()
  489. self._update_resource_tracker(context, instance)
  490. def _nil_out_instance_obj_host_and_node(self, instance):
  491. # NOTE(jwcroppe): We don't do instance.save() here for performance
  492. # reasons; a call to this is expected to be immediately followed by
  493. # another call that does instance.save(), thus avoiding two writes
  494. # to the database layer.
  495. instance.host = None
  496. instance.node = None
  497. # If the instance is not on a host, it's not in an aggregate and
  498. # therefore is not in an availability zone.
  499. instance.availability_zone = None
  500. def _set_instance_obj_error_state(self, context, instance,
  501. clean_task_state=False):
  502. try:
  503. instance.vm_state = vm_states.ERROR
  504. if clean_task_state:
  505. instance.task_state = None
  506. instance.save()
  507. except exception.InstanceNotFound:
  508. LOG.debug('Instance has been destroyed from under us while '
  509. 'trying to set it to ERROR', instance=instance)

    def _get_instances_on_driver(self, context, filters=None):
        """Return a list of instance records for the instances found
        on the hypervisor which satisfy the specified filters. If filters=None
        return a list of instance records for all the instances found on the
        hypervisor.
        """
        if not filters:
            filters = {}
        try:
            driver_uuids = self.driver.list_instance_uuids()
            if len(driver_uuids) == 0:
                # Short circuit, don't waste a DB call
                return objects.InstanceList()
            filters['uuid'] = driver_uuids
            local_instances = objects.InstanceList.get_by_filters(
                context, filters, use_slave=True)
            return local_instances
        except NotImplementedError:
            pass

        # The driver doesn't support uuids listing, so we'll have
        # to brute force.
        driver_instances = self.driver.list_instances()
        # NOTE(mjozefcz): In this case we need to apply host filter.
        # Without this all instance data would be fetched from db.
        filters['host'] = self.host
        instances = objects.InstanceList.get_by_filters(context, filters,
                                                        use_slave=True)
        name_map = {instance.name: instance for instance in instances}
        local_instances = []
        for driver_instance in driver_instances:
            instance = name_map.get(driver_instance)
            if not instance:
                continue
            local_instances.append(instance)
        return local_instances

    def _destroy_evacuated_instances(self, context):
        """Destroys evacuated instances.

        While nova-compute was down, the instances running on it could be
        evacuated to another host. This method looks for evacuation migration
        records where this is the source host and which were either started
        (accepted), in-progress (pre-migrating) or migrated (done). From those
        migration records, local instances reported by the hypervisor are
        compared to the instances for the migration records and those local
        guests are destroyed, along with instance allocation records in
        Placement for this node.
        """
        filters = {
            'source_compute': self.host,
            # NOTE(mriedem): Migration records that have been accepted are
            # included in case the source node comes back up while instances
            # are being evacuated to another host. We don't want the same
            # instance being reported from multiple hosts.
            # NOTE(lyarwood): pre-migrating is also included here as the
            # source compute can come back online shortly after the RT
            # claims on the destination that in-turn moves the migration to
            # pre-migrating. If the evacuate fails on the destination host,
            # the user can rebuild the instance (in ERROR state) on the source
            # host.
            'status': ['accepted', 'pre-migrating', 'done'],
            'migration_type': 'evacuation',
        }
        with utils.temporary_mutation(context, read_deleted='yes'):
            evacuations = objects.MigrationList.get_by_filters(context,
                                                               filters)
        if not evacuations:
            return
        evacuations = {mig.instance_uuid: mig for mig in evacuations}

        # TODO(mriedem): We could optimize by pre-loading the joined fields
        # we know we'll use, like info_cache and flavor.
        local_instances = self._get_instances_on_driver(context)
        evacuated = [inst for inst in local_instances
                     if inst.uuid in evacuations]

        # NOTE(gibi): We are called from init_host and at this point the
        # compute_nodes of the resource tracker has not been populated yet so
        # we cannot rely on the resource tracker here.
        compute_nodes = {}

        for instance in evacuated:
            migration = evacuations[instance.uuid]
            LOG.info('Deleting instance as it has been evacuated from '
                     'this host', instance=instance)
            try:
                network_info = self.network_api.get_instance_nw_info(
                    context, instance)
                bdi = self._get_instance_block_device_info(context,
                                                           instance)
                destroy_disks = not (self._is_instance_storage_shared(
                    context, instance))
            except exception.InstanceNotFound:
                network_info = network_model.NetworkInfo()
                bdi = {}
                LOG.info('Instance has been marked deleted already, '
                         'removing it from the hypervisor.',
                         instance=instance)
                # always destroy disks if the instance was deleted
                destroy_disks = True
            self.driver.destroy(context, instance,
                                network_info,
                                bdi, destroy_disks)

            # delete the allocation of the evacuated instance from this host
            if migration.source_node not in compute_nodes:
                try:
                    cn_uuid = objects.ComputeNode.get_by_host_and_nodename(
                        context, self.host, migration.source_node).uuid
                    compute_nodes[migration.source_node] = cn_uuid
                except exception.ComputeHostNotFound:
                    LOG.error("Failed to clean allocation of evacuated "
                              "instance as the source node %s is not found",
                              migration.source_node, instance=instance)
                    continue
            cn_uuid = compute_nodes[migration.source_node]

            # If the instance was deleted in the interim, assume its
            # allocations were properly cleaned up (either by its hosting
            # compute service or the API).
            if (not instance.deleted and
                    not self.reportclient.
                        remove_provider_tree_from_instance_allocation(
                            context, instance.uuid, cn_uuid)):
                LOG.error("Failed to clean allocation of evacuated instance "
                          "on the source node %s",
                          cn_uuid, instance=instance)

            migration.status = 'completed'
            migration.save()
        return evacuations
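
    # NOTE: Ask the virt driver for shared-storage check data for the
    # instance and, if it provides any, verify it against the other compute
    # host over RPC. If the driver does not implement the check we assume
    # the instance is not on shared storage.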
    def _is_instance_storage_shared(self, context, instance, host=None):
        shared_storage = True
        data = None
        try:
            data = self.driver.check_instance_shared_storage_local(context,
                                                                   instance)
            if data:
                shared_storage = (self.compute_rpcapi.
                                  check_instance_shared_storage(context,
                                      instance, data, host=host))
        except NotImplementedError:
            LOG.debug('Hypervisor driver does not support '
                      'instance shared storage check, '
                      'assuming it\'s not on shared storage',
                      instance=instance)
            shared_storage = False
        except Exception:
            LOG.exception('Failed to check if instance shared',
                          instance=instance)
        finally:
            if data:
                self.driver.check_instance_shared_storage_cleanup(context,
                                                                  data)
        return shared_storage

    def _complete_partial_deletion(self, context, instance):
        """Complete deletion for instances in DELETED status but not marked as
        deleted in the DB
        """
        instance.destroy()
        bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
            context, instance.uuid)
        self._complete_deletion(context,
                                instance)
        self._notify_about_instance_usage(context, instance, "delete.end")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.DELETE,
            phase=fields.NotificationPhase.END, bdms=bdms)
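
    # NOTE: Final bookkeeping once a guest is gone: update the resource
    # tracker, drop the instance's allocations in Placement, clean up any
    # console auth tokens and remove the instance from the scheduler's
    # instance info cache.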
    def _complete_deletion(self, context, instance):
        self._update_resource_tracker(context, instance)
        self.reportclient.delete_allocation_for_instance(context,
                                                         instance.uuid)
        self._clean_instance_console_tokens(context, instance)
        self._delete_scheduler_instance_info(context, instance.uuid)

    def _init_instance(self, context, instance):
        """Initialize this instance during service init."""

        # NOTE(danms): If the instance appears to not be owned by this
        # host, it may have been evacuated away, but skipped by the
        # evacuation cleanup code due to configuration. Thus, if that
        # is a possibility, don't touch the instance in any way, but
        # log the concern. This will help avoid potential issues on
        # startup due to misconfiguration.
        if instance.host != self.host:
            LOG.warning('Instance %(uuid)s appears to not be owned '
                        'by this host, but by %(host)s. Startup '
                        'processing is being skipped.',
                        {'uuid': instance.uuid,
                         'host': instance.host})
            return

        # Instances that are shut down or in an error state cannot be
        # initialized and are not attempted to be recovered. The exception
        # to this are instances that are in RESIZE_MIGRATING or DELETING,
        # which are dealt with further down.
        if (instance.vm_state == vm_states.SOFT_DELETED or
            (instance.vm_state == vm_states.ERROR and
             instance.task_state not in
             (task_states.RESIZE_MIGRATING, task_states.DELETING))):
            LOG.debug("Instance is in %s state.",
                      instance.vm_state, instance=instance)
            return

        if instance.vm_state == vm_states.DELETED:
            try:
                self._complete_partial_deletion(context, instance)
            except Exception:
                # we don't want an exception to block the init_host
                LOG.exception('Failed to complete a deletion',
                              instance=instance)
            return

        if (instance.vm_state == vm_states.BUILDING or
            instance.task_state in [task_states.SCHEDULING,
                                    task_states.BLOCK_DEVICE_MAPPING,
                                    task_states.NETWORKING,
                                    task_states.SPAWNING]):
            # NOTE(dave-mcnally) compute stopped before instance was fully
            # spawned so set to ERROR state. This is safe to do as the state
            # may be set by the api but the host is not so if we get here the
            # instance has already been scheduled to this particular host.
            LOG.debug("Instance failed to spawn correctly, "
                      "setting to ERROR state", instance=instance)
            instance.task_state = None
            instance.vm_state = vm_states.ERROR
            instance.save()
            return
        if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
            instance.task_state in [task_states.REBUILDING,
                                    task_states.REBUILD_BLOCK_DEVICE_MAPPING,
                                    task_states.REBUILD_SPAWNING]):
            # NOTE(jichenjc) compute stopped before instance was fully
            # spawned so set to ERROR state. This is consistent with the
            # BUILDING case above.
            LOG.debug("Instance failed to rebuild correctly, "
                      "setting to ERROR state", instance=instance)
            instance.task_state = None
            instance.vm_state = vm_states.ERROR
            instance.save()
            return

        if (instance.vm_state != vm_states.ERROR and
            instance.task_state in [task_states.IMAGE_SNAPSHOT_PENDING,
                                    task_states.IMAGE_PENDING_UPLOAD,
                                    task_states.IMAGE_UPLOADING,
                                    task_states.IMAGE_SNAPSHOT]):
            LOG.debug("Instance in transitional state %s at start-up "
                      "clearing task state",
                      instance.task_state, instance=instance)
            try:
                self._post_interrupted_snapshot_cleanup(context, instance)
            except Exception:
                # we don't want an exception to block the init_host
                LOG.exception('Failed to cleanup snapshot.', instance=instance)
            instance.task_state = None
            instance.save()

        if (instance.vm_state != vm_states.ERROR and
            instance.task_state in [task_states.RESIZE_PREP]):
            LOG.debug("Instance in transitional state %s at start-up "
                      "clearing task state",
                      instance.task_state, instance=instance)
            instance.task_state = None
            instance.save()

        if instance.task_state == task_states.DELETING:
            try:
                LOG.info('Service started deleting the instance during '
                         'the previous run, but did not finish. Restarting'
                         ' the deletion now.', instance=instance)
                instance.obj_load_attr('metadata')
                instance.obj_load_attr('system_metadata')
                bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                    context, instance.uuid)
                self._delete_instance(context, instance, bdms)
            except Exception:
                # we don't want an exception to block the init_host
                LOG.exception('Failed to complete a deletion',
                              instance=instance)
                self._set_instance_obj_error_state(context, instance)
            return
        current_power_state = self._get_power_state(context, instance)
        try_reboot, reboot_type = self._retry_reboot(context, instance,
                                                     current_power_state)

        if try_reboot:
            LOG.debug("Instance in transitional state (%(task_state)s) at "
                      "start-up and power state is (%(power_state)s), "
                      "triggering reboot",
                      {'task_state': instance.task_state,
                       'power_state': current_power_state},
                      instance=instance)

            # NOTE(mikal): if the instance was doing a soft reboot that got as
            # far as shutting down the instance but not as far as starting it
            # again, then we've just become a hard reboot. That means the
            # task state for the instance needs to change so that we're in one
            # of the expected task states for a hard reboot.
            if (instance.task_state in task_states.soft_reboot_states and
                reboot_type == 'HARD'):
                instance.task_state = task_states.REBOOT_PENDING_HARD
                instance.save()

            self.reboot_instance(context, instance, block_device_info=None,
                                 reboot_type=reboot_type)
            return

        elif (current_power_state == power_state.RUNNING and
              instance.task_state in [task_states.REBOOT_STARTED,
                                      task_states.REBOOT_STARTED_HARD,
                                      task_states.PAUSING,
                                      task_states.UNPAUSING]):
            LOG.warning("Instance in transitional state "
                        "(%(task_state)s) at start-up and power state "
                        "is (%(power_state)s), clearing task state",
                        {'task_state': instance.task_state,
                         'power_state': current_power_state},
                        instance=instance)
            instance.task_state = None
            instance.vm_state = vm_states.ACTIVE
            instance.save()
        elif (current_power_state == power_state.PAUSED and
              instance.task_state == task_states.UNPAUSING):
            LOG.warning("Instance in transitional state "
                        "(%(task_state)s) at start-up and power state "
                        "is (%(power_state)s), clearing task state "
                        "and unpausing the instance",
                        {'task_state': instance.task_state,
                         'power_state': current_power_state},
                        instance=instance)
            try:
                self.unpause_instance(context, instance)
            except NotImplementedError:
                # Some virt drivers don't support pause and unpause
                pass
            except Exception:
                LOG.exception('Failed to unpause instance', instance=instance)
            return
        if instance.task_state == task_states.POWERING_OFF:
            try:
                LOG.debug("Instance in transitional state %s at start-up "
                          "retrying stop request",
                          instance.task_state, instance=instance)
                self.stop_instance(context, instance, True)
            except Exception:
                # we don't want an exception to block the init_host
                LOG.exception('Failed to stop instance', instance=instance)
            return

        if instance.task_state == task_states.POWERING_ON:
            try:
                LOG.debug("Instance in transitional state %s at start-up "
                          "retrying start request",
                          instance.task_state, instance=instance)
                self.start_instance(context, instance)
            except Exception:
                # we don't want an exception to block the init_host
                LOG.exception('Failed to start instance', instance=instance)
            return
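
        # Re-plug VIFs so the host-side network devices for the guest exist
        # again (e.g. after a compute host reboot) before we attempt any
        # migration rollback or power state handling below.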
        net_info = instance.get_network_info()
        try:
            self.driver.plug_vifs(instance, net_info)
        except NotImplementedError as e:
            LOG.debug(e, instance=instance)
        except exception.VirtualInterfacePlugException:
            # NOTE(mriedem): If we get here, it could be because the vif_type
            # in the cache is "binding_failed" or "unbound".
            # The periodic task _heal_instance_info_cache checks for this
            # condition. It should fix this by binding the ports again when
            # it gets to this instance.
            LOG.exception('Virtual interface plugging failed for instance. '
                          'The port binding:host_id may need to be manually '
                          'updated.', instance=instance)
            self._set_instance_obj_error_state(context, instance)
            return

        if instance.task_state == task_states.RESIZE_MIGRATING:
            # We crashed during resize/migration, so roll back for safety
            try:
                # NOTE(mriedem): check old_vm_state for STOPPED here, if it's
                # not in system_metadata we default to True for backwards
                # compatibility
                power_on = (instance.system_metadata.get('old_vm_state') !=
                            vm_states.STOPPED)

                block_dev_info = self._get_instance_block_device_info(context,
                                                                      instance)

                self.driver.finish_revert_migration(context,
                    instance, net_info, block_dev_info, power_on)

            except Exception:
                LOG.exception('Failed to revert crashed migration',
                              instance=instance)
            finally:
                LOG.info('Instance found in migrating state during '
                         'startup. Resetting task_state',
                         instance=instance)
                instance.task_state = None
                instance.save()
        if instance.task_state == task_states.MIGRATING:
            # Live migration did not complete, but instance is on this
            # host, so reset the state.
            instance.task_state = None
            instance.save(expected_task_state=[task_states.MIGRATING])

        db_state = instance.power_state
        drv_state = self._get_power_state(context, instance)
        expect_running = (db_state == power_state.RUNNING and
                          drv_state != db_state)

        LOG.debug('Current state is %(drv_state)s, state in DB is '
                  '%(db_state)s.',
                  {'drv_state': drv_state, 'db_state': db_state},
                  instance=instance)

        if expect_running and CONF.resume_guests_state_on_host_boot:
            self._resume_guests_state(context, instance, net_info)
        elif drv_state == power_state.RUNNING:
            # VMwareAPI drivers will raise an exception
            try:
                self.driver.ensure_filtering_rules_for_instance(
                    instance, net_info)
            except NotImplementedError:
                LOG.debug('Hypervisor driver does not support '
                          'firewall rules', instance=instance)

    def _resume_guests_state(self, context, instance, net_info):
        LOG.info('Rebooting instance after nova-compute restart.',
                 instance=instance)
        block_device_info = \
            self._get_instance_block_device_info(context, instance)

        try:
            self.driver.resume_state_on_host_boot(
                context, instance, net_info, block_device_info)
        except NotImplementedError:
            LOG.warning('Hypervisor driver does not support '
                        'resume guests', instance=instance)
        except Exception:
            # NOTE(vish): The instance failed to resume, so we set the
            # instance to error and attempt to continue.
            LOG.warning('Failed to resume instance',
                        instance=instance)
            self._set_instance_obj_error_state(context, instance)
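
    # NOTE: Decide whether a reboot that was interrupted by the service
    # restart should be retried, and whether the retry should be a SOFT or
    # HARD reboot, based on the instance's task state and current power
    # state.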
    def _retry_reboot(self, context, instance, current_power_state):
        current_task_state = instance.task_state
        retry_reboot = False
        reboot_type = compute_utils.get_reboot_type(current_task_state,
                                                    current_power_state)

        pending_soft = (current_task_state == task_states.REBOOT_PENDING and
                        instance.vm_state in vm_states.ALLOW_SOFT_REBOOT)
        pending_hard = (current_task_state == task_states.REBOOT_PENDING_HARD
                        and instance.vm_state in vm_states.ALLOW_HARD_REBOOT)
        started_not_running = (current_task_state in
                               [task_states.REBOOT_STARTED,
                                task_states.REBOOT_STARTED_HARD] and
                               current_power_state != power_state.RUNNING)

        if pending_soft or pending_hard or started_not_running:
            retry_reboot = True

        return retry_reboot, reboot_type
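
    # NOTE: Callback for virt driver lifecycle events: map the event to a
    # Nova power state, sync it if the guest still reports that state, and
    # for live-migration post-copy/completion events activate the
    # destination host port bindings early to reduce network downtime.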
    def handle_lifecycle_event(self, event):
        LOG.info("VM %(state)s (Lifecycle Event)",
                 {'state': event.get_name()},
                 instance_uuid=event.get_instance_uuid())
        context = nova.context.get_admin_context(read_deleted='yes')
        vm_power_state = None
        event_transition = event.get_transition()
        if event_transition == virtevent.EVENT_LIFECYCLE_STOPPED:
            vm_power_state = power_state.SHUTDOWN
        elif event_transition == virtevent.EVENT_LIFECYCLE_STARTED:
            vm_power_state = power_state.RUNNING
        elif event_transition in (
                virtevent.EVENT_LIFECYCLE_PAUSED,
                virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED,
                virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED):
            vm_power_state = power_state.PAUSED
        elif event_transition == virtevent.EVENT_LIFECYCLE_RESUMED:
            vm_power_state = power_state.RUNNING
        elif event_transition == virtevent.EVENT_LIFECYCLE_SUSPENDED:
            vm_power_state = power_state.SUSPENDED
        else:
            LOG.warning("Unexpected lifecycle event: %d", event_transition)

        migrate_finish_statuses = {
            # This happens on the source node and indicates live migration
            # entered post-copy mode.
            virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: 'running (post-copy)',
            # Suspended for offline migration.
            virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: 'running'
        }

        expected_attrs = []
        if event_transition in migrate_finish_statuses:
            # Join on info_cache since that's needed in migrate_instance_start.
            expected_attrs.append('info_cache')
        instance = objects.Instance.get_by_uuid(context,
                                                event.get_instance_uuid(),
                                                expected_attrs=expected_attrs)

        # NOTE(lpetrut): The event may be delayed, thus not reflecting
        # the current instance power state. In that case, ignore the event.
        current_power_state = self._get_power_state(context, instance)
        if current_power_state == vm_power_state:
            LOG.debug('Synchronizing instance power state after lifecycle '
                      'event "%(event)s"; current vm_state: %(vm_state)s, '
                      'current task_state: %(task_state)s, current DB '
                      'power_state: %(db_power_state)s, VM power_state: '
                      '%(vm_power_state)s',
                      {'event': event.get_name(),
                       'vm_state': instance.vm_state,
                       'task_state': instance.task_state,
                       'db_power_state': instance.power_state,
                       'vm_power_state': vm_power_state},
                      instance_uuid=instance.uuid)
            self._sync_instance_power_state(context,
                                            instance,
                                            vm_power_state)

        # The following checks are for live migration. We want to activate
        # the port binding for the destination host before the live migration
        # is resumed on the destination host in order to reduce network
        # downtime. Otherwise the ports are bound to the destination host
        # in post_live_migration_at_destination.
        # TODO(danms): Explore options for using a different live migration
        # specific callback for this instead of piggy-backing on the
        # handle_lifecycle_event callback.
        if (instance.task_state == task_states.MIGRATING and
                event_transition in migrate_finish_statuses):
            status = migrate_finish_statuses[event_transition]
            try:
                migration = objects.Migration.get_by_instance_and_status(
                    context, instance.uuid, status)
                LOG.debug('Binding ports to destination host: %s',
                          migration.dest_compute, instance=instance)
                # For neutron, migrate_instance_start will activate the
                # destination host port bindings, if there are any created by
                # conductor before live migration started.
                self.network_api.migrate_instance_start(
                    context, instance, migration)
            except exception.MigrationNotFoundByStatus:
                LOG.warning("Unable to find migration record with status "
                            "'%s' for instance. Port binding will happen in "
                            "post live migration.", status, instance=instance)

    def handle_events(self, event):
        if isinstance(event, virtevent.LifecycleEvent):
            try:
                self.handle_lifecycle_event(event)
            except exception.InstanceNotFound:
                LOG.debug("Event %s arrived for non-existent instance. The "
                          "instance was probably deleted.", event)
        else:
            LOG.debug("Ignoring event %s", event)

    def init_virt_events(self):
        if CONF.workarounds.handle_virt_lifecycle_events:
            self.driver.register_event_listener(self.handle_events)
        else:
            # NOTE(mriedem): If the _sync_power_states periodic task is
            # disabled we should emit a warning in the logs.
            if CONF.sync_power_state_interval < 0:
                LOG.warning('Instance lifecycle events from the compute '
                            'driver have been disabled. Note that lifecycle '
                            'changes to an instance outside of the compute '
                            'service will not be synchronized '
                            'automatically since the _sync_power_states '
                            'periodic task is also disabled.')
            else:
                LOG.info('Instance lifecycle events from the compute '
                         'driver have been disabled. Note that lifecycle '
                         'changes to an instance outside of the compute '
                         'service will only be synchronized by the '
                         '_sync_power_states periodic task.')

    def init_host(self):
        """Initialization for a standalone compute service."""

        if CONF.pci.passthrough_whitelist:
            # Simply loading the PCI passthrough whitelist will do a bunch of
            # validation that would otherwise wait until the PciDevTracker is
            # constructed when updating available resources for the compute
            # node(s) in the resource tracker, effectively killing that task.
            # So load up the whitelist when starting the compute service to
            # flush any invalid configuration early so we can kill the service
            # if the configuration is wrong.
            whitelist.Whitelist(CONF.pci.passthrough_whitelist)

        nova.conf.neutron.register_dynamic_opts(CONF)

        # Override the number of concurrent disk operations allowed if the
        # user has specified a limit.
        if CONF.compute.max_concurrent_disk_ops != 0:
            compute_utils.disk_ops_semaphore = \
                eventlet.semaphore.BoundedSemaphore(
                    CONF.compute.max_concurrent_disk_ops)

        self.driver.init_host(host=self.host)
        context = nova.context.get_admin_context()
        instances = objects.InstanceList.get_by_host(
            context, self.host, expected_attrs=['info_cache', 'metadata'])

        if CONF.defer_iptables_apply:
            self.driver.filter_defer_apply_on()

        self.init_virt_events()

        try:
            # checking that instance was not already evacuated to other host
            evacuated_instances = self._destroy_evacuated_instances(context)

            # Initialise instances on the host that are not evacuating
            for instance in instances:
                if (not evacuated_instances or
                        instance.uuid not in evacuated_instances):
                    self._init_instance(context, instance)
        finally:
            if CONF.defer_iptables_apply:
                self.driver.filter_defer_apply_off()

            if instances:
                # We only send the instance info to the scheduler on startup
                # if there is anything to send, otherwise this host might
                # not be mapped yet in a cell and the scheduler may have
                # issues dealing with the information. Later changes to
                # instances on this host will update the scheduler, or the
                # _sync_scheduler_instance_info periodic task will.
                self._update_scheduler_instance_info(context, instances)

    def cleanup_host(self):
        self.driver.register_event_listener(None)
        self.instance_events.cancel_all_events()
        self.driver.cleanup_host(host=self.host)
        self._cleanup_live_migrations_in_pool()

    def _cleanup_live_migrations_in_pool(self):
        # Shutdown the pool so we don't get new requests.
        self._live_migration_executor.shutdown(wait=False)
        # For any queued migrations, cancel the migration and update
        # its status.
        for migration, future in self._waiting_live_migrations.values():
            # If we got here before the Future was submitted then we need
            # to move on since there isn't anything we can do.
            if future is None:
                continue
            if future.cancel():
                self._set_migration_status(migration, 'cancelled')
                LOG.info('Successfully cancelled queued live migration.',
                         instance_uuid=migration.instance_uuid)
            else:
                LOG.warning('Unable to cancel live migration.',
                            instance_uuid=migration.instance_uuid)
        self._waiting_live_migrations.clear()

    def pre_start_hook(self):
        """After the service is initialized, but before we fully bring
        the service up by listening on RPC queues, make sure to update
        our available resources (and indirectly our available nodes).
        """
        self.update_available_resource(nova.context.get_admin_context(),
                                       startup=True)

    def _get_power_state(self, context, instance):
        """Retrieve the power state for the given instance."""
        LOG.debug('Checking state', instance=instance)
        try:
            return self.driver.get_info(instance).state
        except exception.InstanceNotFound:
            return power_state.NOSTATE

    def get_console_topic(self, context):
        """Retrieves the console host for a project on this host.

        Currently this is just set in the flags for each compute host.
        """
        # TODO(mdragon): perhaps make this variable by console_type?
        return '%s.%s' % (console_rpcapi.RPC_TOPIC, CONF.console_host)

    @wrap_exception()
    def get_console_pool_info(self, context, console_type):
        return self.driver.get_console_pool_info(console_type)

    @wrap_exception()
    def refresh_instance_security_rules(self, context, instance):
        """Tell the virtualization driver to refresh security rules for
        an instance.

        Passes straight through to the virtualization driver.

        Synchronize the call because we may still be in the middle of
        creating the instance.
        """
        @utils.synchronized(instance.uuid)
        def _sync_refresh():
            try:
                return self.driver.refresh_instance_security_rules(instance)
            except NotImplementedError:
                LOG.debug('Hypervisor driver does not support '
                          'security groups.', instance=instance)

        return _sync_refresh()

    def _await_block_device_map_created(self, context, vol_id):
        # TODO(yamahata): creating volume simultaneously
        #                 reduces creation time?
        # TODO(yamahata): eliminate dumb polling
        start = time.time()
        retries = CONF.block_device_allocate_retries
        if retries < 0:
            LOG.warning("Treating negative config value (%(retries)s) for "
                        "'block_device_allocate_retries' as 0.",
                        {'retries': retries})
        # (1) if the configured value is negative, treat it as 0
        # (2) if the configured value is 0, one attempt should be made
        # (3) if the configured value is > 0, the total number of attempts
        #     is (retries + 1)
        attempts = 1
        if retries >= 1:
            attempts = retries + 1
        for attempt in range(1, attempts + 1):
            volume = self.volume_api.get(context, vol_id)
            volume_status = volume['status']
            if volume_status not in ['creating', 'downloading']:
                if volume_status == 'available':
                    return attempt
                LOG.warning("Volume id: %(vol_id)s finished being "
                            "created but its status is %(vol_status)s.",
                            {'vol_id': vol_id,
                             'vol_status': volume_status})
                break
            greenthread.sleep(CONF.block_device_allocate_retries_interval)
        raise exception.VolumeNotCreated(volume_id=vol_id,
                                         seconds=int(time.time() - start),
                                         attempts=attempt,
                                         volume_status=volume_status)

    def _decode_files(self, injected_files):
        """Base64 decode the list of files to inject."""
        if not injected_files:
            return []

        def _decode(f):
            path, contents = f
            # Py3 raises binascii.Error instead of TypeError as in Py27
            try:
                decoded = base64.b64decode(contents)
                return path, decoded
            except (TypeError, binascii.Error):
                raise exception.Base64Exception(path=path)

        return [_decode(f) for f in injected_files]

    def _validate_instance_group_policy(self, context, instance,
                                        scheduler_hints):
        # NOTE(russellb) Instance group policy is enforced by the scheduler.
        # However, there is a race condition with the enforcement of
        # the policy. Since more than one instance may be scheduled at the
        # same time, it's possible that more than one instance with an
        # anti-affinity policy may end up here. It's also possible that
        # multiple instances with an affinity policy could end up on different
        # hosts. This is a validation step to make sure that starting the
        # instance here doesn't violate the policy.
        group_hint = scheduler_hints.get('group')
        if not group_hint:
            return

        # The RequestSpec stores scheduler_hints as key=list pairs so we need
        # to check the type on the value and pull the single entry out. The
        # API request schema validates that the 'group' hint is a single value.
        if isinstance(group_hint, list):
            group_hint = group_hint[0]

        @utils.synchronized(group_hint)
        def _do_validation(context, instance, group_hint):
            group = objects.InstanceGroup.get_by_hint(context, group_hint)
            if group.policy and 'anti-affinity' == group.policy:
                instances_uuids = objects.InstanceList.get_uuids_by_host(
                    context, self.host)
                ins_on_host = set(instances_uuids)
                members = set(group.members)
                # Determine the set of instance group members on this host
                # which are not the instance in question. This is used to
                # determine how many other members from the same anti-affinity
                # group can be on this host.
                members_on_host = ins_on_host & members - set([instance.uuid])
                rules = group.rules
                if rules and 'max_server_per_host' in rules:
                    max_server = rules['max_server_per_host']
                else:
                    max_server = 1
                if len(members_on_host) >= max_server:
                    msg = _("Anti-affinity instance group policy "
                            "was violated.")
                    raise exception.RescheduledException(
                        instance_uuid=instance.uuid,
                        reason=msg)
            elif group.policy and 'affinity' == group.policy:
                group_hosts = group.get_hosts(exclude=[instance.uuid])
                if group_hosts and self.host not in group_hosts:
                    msg = _("Affinity instance group policy was violated.")
                    raise exception.RescheduledException(
                        instance_uuid=instance.uuid,
                        reason=msg)

        if not CONF.workarounds.disable_group_policy_check_upcall:
            _do_validation(context, instance, group_hint)

    def _log_original_error(self, exc_info, instance_uuid):
        LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid,
                  exc_info=exc_info)

    # TODO(mriedem): This method is confusing and only ever used for resize
    # reschedules; remove it and merge into _reschedule_resize_or_reraise.
    def _reschedule(self, context, request_spec, filter_properties,
                    instance, reschedule_method, method_args, task_state,
                    exc_info=None, host_list=None):
        """Attempt to re-schedule a compute operation."""
        instance_uuid = instance.uuid
        retry = filter_properties.get('retry')
        if not retry:
            # no retry information, do not reschedule.
            LOG.debug("Retry info not present, will not reschedule",
                      instance_uuid=instance_uuid)
            return

        LOG.debug("Re-scheduling %(method)s: attempt %(num)d",
                  {'method': reschedule_method.__name__,
                   'num': retry['num_attempts']}, instance_uuid=instance_uuid)

        # reset the task state:
        self._instance_update(context, instance, task_state=task_state)

        if exc_info:
            # stringify to avoid circular ref problem in json serialization:
            retry['exc'] = traceback.format_exception_only(exc_info[0],
                                                           exc_info[1])

        reschedule_method(context, *method_args, request_spec=request_spec,
                          host_list=host_list)
        return True

    @periodic_task.periodic_task
    def _check_instance_build_time(self, context):
        """Ensure that instances are not stuck in build."""
        timeout = CONF.instance_build_timeout
        if timeout == 0:
            return

        filters = {'vm_state': vm_states.BUILDING,
                   'host': self.host}

        building_insts = objects.InstanceList.get_by_filters(context,
            filters, expected_attrs=[], use_slave=True)

        for instance in building_insts:
            if timeutils.is_older_than(instance.created_at, timeout):
                self._set_instance_obj_error_state(context, instance)
                LOG.warning("Instance build timed out. Set to error "
                            "state.", instance=instance)

    def _check_instance_exists(self, context, instance):
        """Ensure an instance with the same name is not already present."""
        if self.driver.instance_exists(instance):
            raise exception.InstanceExists(name=instance.name)

    def _allocate_network_async(self, context, instance, requested_networks,
                                macs, security_groups, is_vpn,
                                resource_provider_mapping):
        """Method used to allocate networks in the background.

        Broken out for testing.
        """
        # First check to see if we're specifically not supposed to allocate
        # networks because if so, we can exit early.
        if requested_networks and requested_networks.no_allocate:
            LOG.debug("Not allocating networking since 'none' was specified.",
                      instance=instance)
            return network_model.NetworkInfo([])

        LOG.debug("Allocating IP information in the background.",
                  instance=instance)
        retries = CONF.network_allocate_retries
        attempts = retries + 1
        retry_time = 1
        bind_host_id = self.driver.network_binding_host_id(context, instance)
        for attempt in range(1, attempts + 1):
            try:
                nwinfo = self.network_api.allocate_for_instance(
                    context, instance, vpn=is_vpn,
                    requested_networks=requested_networks,
                    macs=macs,
                    security_groups=security_groups,
                    bind_host_id=bind_host_id,
                    resource_provider_mapping=resource_provider_mapping)
                LOG.debug('Instance network_info: |%s|', nwinfo,
                          instance=instance)
                instance.system_metadata['network_allocated'] = 'True'
                # NOTE(JoshNang) do not save the instance here, as it can cause
                # races. The caller shares a reference to instance and waits
                # for this async greenthread to finish before calling
                # instance.save().
                return nwinfo
            except Exception:
                exc_info = sys.exc_info()
                log_info = {'attempt': attempt,
                            'attempts': attempts}
                if attempt == attempts:
                    LOG.exception('Instance failed network setup '
                                  'after %(attempts)d attempt(s)',
                                  log_info)
                    six.reraise(*exc_info)
                LOG.warning('Instance failed network setup '
                            '(attempt %(attempt)d of %(attempts)d)',
                            log_info, instance=instance)
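                # Back off between retries, doubling the delay each time but
                # capping it at 30 seconds.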
                time.sleep(retry_time)
                retry_time *= 2
                if retry_time > 30:
                    retry_time = 30
        # Not reached.

    def _build_networks_for_instance(self, context, instance,
            requested_networks, security_groups, resource_provider_mapping):

        # If we're here from a reschedule the network may already be allocated.
        if strutils.bool_from_string(
                instance.system_metadata.get('network_allocated', 'False')):
            # NOTE(alex_xu): If network_allocated is True, the network
            # resource was already allocated during a previous scheduling
            # attempt and the network setup on that host was cleaned up.
            # After rescheduling, the network needs to be set up on the new
            # host.
            self.network_api.setup_instance_network_on_host(
                context, instance, instance.host)
            return self.network_api.get_instance_nw_info(context, instance)

        if not self.is_neutron_security_groups:
            security_groups = []

        macs = self.driver.macs_for_instance(instance)
        network_info = self._allocate_network(context, instance,
            requested_networks, macs, security_groups,
            resource_provider_mapping)

        return network_info

    def _allocate_network(self, context, instance, requested_networks, macs,
                          security_groups, resource_provider_mapping):
        """Start network allocation asynchronously. Return an instance
        of NetworkInfoAsyncWrapper that can be used to retrieve the
        allocated networks when the operation has finished.
        """
        # NOTE(comstud): Since we're allocating networks asynchronously,
        # this task state has little meaning, as we won't be in this
        # state for very long.
        instance.vm_state = vm_states.BUILDING
        instance.task_state = task_states.NETWORKING
        instance.save(expected_task_state=[None])

        is_vpn = False
        return network_model.NetworkInfoAsyncWrapper(
            self._allocate_network_async, context, instance,
            requested_networks, macs, security_groups, is_vpn,
            resource_provider_mapping)

    def _default_root_device_name(self, instance, image_meta, root_bdm):
        """Gets a default root device name from the driver.

        :param nova.objects.Instance instance:
            The instance for which to get the root device name.
        :param nova.objects.ImageMeta image_meta:
            The metadata of the image of the instance.
        :param nova.objects.BlockDeviceMapping root_bdm:
            The description of the root device.
        :returns: str -- The default root device name.
        :raises: InternalError, TooManyDiskDevices
        """
        try:
            return self.driver.default_root_device_name(instance,
                                                        image_meta,
                                                        root_bdm)
        except NotImplementedError:
            return compute_utils.get_next_device_name(instance, [])

    def _default_device_names_for_instance(self, instance,
                                           root_device_name,
                                           *block_device_lists):
        """Default the missing device names in the BDM from the driver.

        :param nova.objects.Instance instance:
            The instance for which to get default device names.
        :param str root_device_name: The root device name.
        :param list block_device_lists: List of block device mappings.
        :returns: None
        :raises: InternalError, TooManyDiskDevices
        """
        try:
            self.driver.default_device_names_for_instance(instance,
                                                          root_device_name,
                                                          *block_device_lists)
        except NotImplementedError:
            compute_utils.default_device_names_for_instance(
                instance, root_device_name, *block_device_lists)

    def _get_device_name_for_instance(self, instance, bdms, block_device_obj):
        """Get the next device name from the driver, based on the BDM.

        :param nova.objects.Instance instance:
            The instance whose volume is requesting a device name.
        :param nova.objects.BlockDeviceMappingList bdms:
            The block device mappings for the instance.
        :param nova.objects.BlockDeviceMapping block_device_obj:
            A block device mapping containing info about the requested block
            device.
        :returns: The next device name.
        :raises: InternalError, TooManyDiskDevices
        """
        # NOTE(ndipanov): Copy obj to avoid changing the original
        block_device_obj = block_device_obj.obj_clone()
        try:
            return self.driver.get_device_name_for_instance(
                instance, bdms, block_device_obj)
        except NotImplementedError:
            return compute_utils.get_device_name_for_instance(
                instance, bdms, block_device_obj.get("device_name"))

    def _default_block_device_names(self, instance, image_meta, block_devices):
        """Verify that all the devices have the device_name set. If not,
        provide a default name.

        It also ensures that there is a root_device_name and is set to the
        first block device in the boot sequence (boot_index=0).
        """
        root_bdm = block_device.get_root_bdm(block_devices)
        if not root_bdm:
            return

        # Get the root_device_name from the root BDM or the instance
        root_device_name = None
        update_root_bdm = False

        if root_bdm.device_name:
            root_device_name = root_bdm.device_name
            instance.root_device_name = root_device_name
        elif instance.root_device_name:
            root_device_name = instance.root_device_name
            root_bdm.device_name = root_device_name
            update_root_bdm = True
        else:
            root_device_name = self._default_root_device_name(instance,
                                                               image_meta,
                                                               root_bdm)
            instance.root_device_name = root_device_name
            root_bdm.device_name = root_device_name
            update_root_bdm = True

        if update_root_bdm:
            root_bdm.save()

        ephemerals = list(filter(block_device.new_format_is_ephemeral,
                                 block_devices))
        swap = list(filter(block_device.new_format_is_swap,
                           block_devices))
        block_device_mapping = list(filter(
            driver_block_device.is_block_device_mapping, block_devices))

        self._default_device_names_for_instance(instance,
                                                root_device_name,
                                                ephemerals,
                                                swap,
                                                block_device_mapping)

    def _block_device_info_to_legacy(self, block_device_info):
        """Convert BDI to the old format for drivers that need it."""
        if self.use_legacy_block_device_info:
            ephemerals = driver_block_device.legacy_block_devices(
                driver.block_device_info_get_ephemerals(block_device_info))
            mapping = driver_block_device.legacy_block_devices(
                driver.block_device_info_get_mapping(block_device_info))
            swap = block_device_info['swap']
            if swap:
                swap = swap.legacy()

            block_device_info.update({
                'ephemerals': ephemerals,
                'swap': swap,
                'block_device_mapping': mapping})
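
    # NOTE: For any BDM that still has no device_name, ask the driver (or the
    # generic compute_utils fallback) for the next free name and persist it
    # on the BDM.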
    def _add_missing_dev_names(self, bdms, instance):
        for bdm in bdms:
            if bdm.device_name is not None:
                continue

            device_name = self._get_device_name_for_instance(instance,
                                                             bdms, bdm)
            values = {'device_name': device_name}
            bdm.update(values)
            bdm.save()

    def _prep_block_device(self, context, instance, bdms):
        """Set up the block device for an instance with error logging."""
        try:
            self._add_missing_dev_names(bdms, instance)
            block_device_info = driver.get_block_device_info(instance, bdms)
            mapping = driver.block_device_info_get_mapping(block_device_info)
            driver_block_device.attach_block_devices(
                mapping, context, instance, self.volume_api, self.driver,
                wait_func=self._await_block_device_map_created)

            self._block_device_info_to_legacy(block_device_info)

            return block_device_info

        except exception.OverQuota as e:
            LOG.warning('Failed to create block device for instance due'
                        ' to exceeding volume related resource quota.'
                        ' Error: %s', e.message, instance=instance)
            raise

        except Exception as ex:
            LOG.exception('Instance failed block device setup',
                          instance=instance)
            # InvalidBDM will eventually result in a BuildAbortException when
            # booting from volume, and will be recorded as an instance fault.
            # Maintain the original exception message which most likely has
            # useful details which the standard InvalidBDM error message lacks.
            raise exception.InvalidBDM(six.text_type(ex))
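
    # NOTE: Once the guest has been spawned, refresh the power state from
    # the driver, flip the instance to ACTIVE with no task state, record the
    # launch time and update the config drive metadata.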
    def _update_instance_after_spawn(self, context, instance):
        instance.power_state = self._get_power_state(context, instance)
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = None
        instance.launched_at = timeutils.utcnow()
        configdrive.update_instance(instance)

    def _update_scheduler_instance_info(self, context, instance):
        """Sends an InstanceList with created or updated Instance objects to
        the Scheduler client.

        In the case of init_host, the value passed will already be an
        InstanceList. Other calls will send individual Instance objects that
        have been created or resized. In this case, we create an InstanceList
        object containing that Instance.
        """
        if not self.send_instance_updates:
            return
        if isinstance(instance, obj_instance.Instance):
            instance = objects.InstanceList(objects=[instance])
        context = context.elevated()
        self.query_client.update_instance_info(context, self.host,
                                               instance)

    def _delete_scheduler_instance_info(self, context, instance_uuid):
        """Sends the uuid of the deleted Instance to the Scheduler client."""
        if not self.send_instance_updates:
            return
        context = context.elevated()
        self.query_client.delete_instance_info(context, self.host,
                                               instance_uuid)

    @periodic_task.periodic_task(spacing=CONF.scheduler_instance_sync_interval)
    def _sync_scheduler_instance_info(self, context):
        if not self.send_instance_updates:
            return
        context = context.elevated()
        instances = objects.InstanceList.get_by_host(context, self.host,
                                                     expected_attrs=[],
                                                     use_slave=True)
        uuids = [instance.uuid for instance in instances]
        self.query_client.sync_instance_info(context, self.host, uuids)

    def _notify_about_instance_usage(self, context, instance, event_suffix,
                                     network_info=None, extra_usage_info=None,
                                     fault=None):
        compute_utils.notify_about_instance_usage(
            self.notifier, context, instance, event_suffix,
            network_info=network_info,
            extra_usage_info=extra_usage_info, fault=fault)

    def _deallocate_network(self, context, instance,
                            requested_networks=None):
        # If we were told not to allocate networks let's save ourselves
        # the trouble of calling the network API.
        if requested_networks and requested_networks.no_allocate:
            LOG.debug("Skipping network deallocation for instance since "
                      "networking was not requested.", instance=instance)
            return

        LOG.debug('Deallocating network for instance', instance=instance)
        with timeutils.StopWatch() as timer:
            self.network_api.deallocate_for_instance(
                context, instance, requested_networks=requested_networks)
        # nova-network does an rpc call so we're OK tracking time spent here
        LOG.info('Took %0.2f seconds to deallocate network for instance.',
                 timer.elapsed(), instance=instance)

    def _get_instance_block_device_info(self, context, instance,
                                        refresh_conn_info=False,
                                        bdms=None):
        """Transform block devices to the driver block_device format."""
        if bdms is None:
            bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                context, instance.uuid)
        block_device_info = driver.get_block_device_info(instance, bdms)

        if not refresh_conn_info:
            # if the block_device_mapping has no value in connection_info
            # (returned as None), don't include in the mapping
            block_device_info['block_device_mapping'] = [
                bdm for bdm in driver.block_device_info_get_mapping(
                    block_device_info)
                if bdm.get('connection_info')]
        else:
            driver_block_device.refresh_conn_infos(
                driver.block_device_info_get_mapping(block_device_info),
                context, instance, self.volume_api, self.driver)

        self._block_device_info_to_legacy(block_device_info)

        return block_device_info

    def _build_failed(self, node):
        if CONF.compute.consecutive_build_service_disable_threshold:
            # NOTE(danms): Update our counter, but wait for the next
            # update_available_resource() periodic to flush it to the DB
            self.rt.build_failed(node)

    def _build_succeeded(self, node):
        self.rt.build_succeeded(node)

    @wrap_exception()
    @reverts_task_state
    @wrap_instance_fault
    def build_and_run_instance(self, context, instance, image, request_spec,
                               filter_properties, admin_password=None,
                               injected_files=None, requested_networks=None,
                               security_groups=None, block_device_mapping=None,
                               node=None, limits=None, host_list=None):

        @utils.synchronized(instance.uuid)
        def _locked_do_build_and_run_instance(*args, **kwargs):
            # NOTE(danms): We grab the semaphore with the instance uuid
            # locked because we could wait in line to build this instance
            # for a while and we want to make sure that nothing else tries
            # to do anything with this instance while we wait.
            with self._build_semaphore:
                try:
                    result = self._do_build_and_run_instance(*args, **kwargs)
                except Exception:
                    # NOTE(mriedem): This should really only happen if
                    # _decode_files in _do_build_and_run_instance fails, and
                    # that's before a guest is spawned so it's OK to remove
                    # allocations for the instance for this node from Placement
                    # below as there is no guest consuming resources anyway.
                    # The _decode_files case could be handled more specifically
                    # but that's left for another day.
                    result = build_results.FAILED
                    raise
                finally:
                    if result == build_results.FAILED:
                        # Remove the allocation records from Placement for the
                        # instance if the build failed. The instance.host is
                        # likely set to None in _do_build_and_run_instance
                        # which means if the user deletes the instance, it
                        # will be deleted in the API, not the compute service.
                        # Setting the instance.host to None in
                        # _do_build_and_run_instance means that the
                        # ResourceTracker will no longer consider this instance
                        # to be claiming resources against it, so we want to
                        # reflect that same thing in Placement. No need to
                        # call this for a reschedule, as the allocations will
                        # have already been removed in
                        # self._do_build_and_run_instance().
                        self.reportclient.delete_allocation_for_instance(
                            context, instance.uuid)

                    if result in (build_results.FAILED,
                                  build_results.RESCHEDULED):
                        self._build_failed(node)
                    else:
                        self._build_succeeded(node)

        # NOTE(danms): We spawn here to return the RPC worker thread back to
        # the pool. Since what follows could take a really long time, we don't
        # want to tie up RPC workers.
        utils.spawn_n(_locked_do_build_and_run_instance,
                      context, instance, image, request_spec,
                      filter_properties, admin_password, injected_files,
                      requested_networks, security_groups,
                      block_device_mapping, node, limits, host_list)
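
    # NOTE: Reject the build early if the request asked for tagged network
    # interfaces or block devices but the virt driver does not report the
    # 'supports_device_tagging' capability.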
    def _check_device_tagging(self, requested_networks, block_device_mapping):
        tagging_requested = False
        if requested_networks:
            for net in requested_networks:
                if 'tag' in net and net.tag is not None:
                    tagging_requested = True
                    break
        if block_device_mapping and not tagging_requested:
            for bdm in block_device_mapping:
                if 'tag' in bdm and bdm.tag is not None:
                    tagging_requested = True
                    break
        if (tagging_requested and
                not self.driver.capabilities.get('supports_device_tagging',
                                                 False)):
            raise exception.BuildAbortException('Attempt to boot guest with '
                                                'tagged devices on host that '
                                                'does not support tagging.')

    def _check_trusted_certs(self, instance):
        if (instance.trusted_certs and
                not self.driver.capabilities.get('supports_trusted_certs',
                                                 False)):
            raise exception.BuildAbortException(
                'Trusted image certificates provided on host that does not '
                'support certificate validation.')
  1697. @hooks.add_hook('build_instance')
  1698. @wrap_exception()
  1699. @reverts_task_state
  1700. @wrap_instance_event(prefix='compute')
  1701. @wrap_instance_fault
  1702. def _do_build_and_run_instance(self, context, instance, image,
  1703. request_spec, filter_properties, admin_password, injected_files,
  1704. requested_networks, security_groups, block_device_mapping,
  1705. node=None, limits=None, host_list=None):
        try:
            LOG.debug('Starting instance...', instance=instance)
            instance.vm_state = vm_states.BUILDING
            instance.task_state = None
            instance.save(expected_task_state=
                          (task_states.SCHEDULING, None))
        except exception.InstanceNotFound:
            msg = 'Instance disappeared before build.'
            LOG.debug(msg, instance=instance)
            return build_results.FAILED
        except exception.UnexpectedTaskStateError as e:
            LOG.debug(e.format_message(), instance=instance)
            return build_results.FAILED

        # b64 decode the files to inject:
        decoded_files = self._decode_files(injected_files)

        if limits is None:
            limits = {}

        if node is None:
            node = self._get_nodename(instance, refresh=True)

        try:
            with timeutils.StopWatch() as timer:
                self._build_and_run_instance(context, instance, image,
                        decoded_files, admin_password, requested_networks,
                        security_groups, block_device_mapping, node, limits,
                        filter_properties, request_spec)
            LOG.info('Took %0.2f seconds to build instance.',
                     timer.elapsed(), instance=instance)
            return build_results.ACTIVE
        except exception.RescheduledException as e:
            retry = filter_properties.get('retry')
            if not retry:
                # no retry information, do not reschedule.
                LOG.debug("Retry info not present, will not reschedule",
                          instance=instance)
                self._cleanup_allocated_networks(context, instance,
                                                 requested_networks)
                self._cleanup_volumes(context, instance,
                                      block_device_mapping, raise_exc=False)
                compute_utils.add_instance_fault_from_exc(context,
                        instance, e, sys.exc_info(),
                        fault_message=e.kwargs['reason'])
                self._nil_out_instance_obj_host_and_node(instance)
                self._set_instance_obj_error_state(context, instance,
                                                   clean_task_state=True)
                return build_results.FAILED

            LOG.debug(e.format_message(), instance=instance)
            # This will be used for logging the exception
            retry['exc'] = traceback.format_exception(*sys.exc_info())
            # This will be used for setting the instance fault message
            retry['exc_reason'] = e.kwargs['reason']
            # NOTE(comstud): Deallocate networks if the driver wants
            # us to do so.
            # NOTE(mriedem): Always deallocate networking when using Neutron.
            # This is to unbind any ports that the user supplied in the server
            # create request, or delete any ports that nova created which were
            # meant to be bound to this host. This check intentionally bypasses
            # the result of deallocate_networks_on_reschedule because the
            # default value in the driver is False, but that method was really
            # only meant for Ironic and should be removed when nova-network is
            # removed (since is_neutron() will then always be True).
            # NOTE(vladikr): SR-IOV ports should be deallocated to
            # allow new sriov pci devices to be allocated on a new host.
            # Otherwise, if devices with pci addresses are already allocated
            # on the destination host, the instance will fail to spawn.
            # info_cache.network_info should be present at this stage.
            if (self.driver.deallocate_networks_on_reschedule(instance) or
                    utils.is_neutron() or
                    self.deallocate_sriov_ports_on_reschedule(instance)):
                self._cleanup_allocated_networks(context, instance,
                                                 requested_networks)
            else:
                # NOTE(alex_xu): Network already allocated and we don't
                # want to deallocate them before rescheduling. But we need
                # to cleanup those network resources setup on this host before
                # rescheduling.
                self.network_api.cleanup_instance_network_on_host(
                    context, instance, self.host)

            self._nil_out_instance_obj_host_and_node(instance)
            instance.task_state = task_states.SCHEDULING
            instance.save()
            # The instance will have already claimed resources from this host
            # before this build was attempted. Now that it has failed, we need
            # to unclaim those resources before casting to the conductor, so
            # that if there are alternate hosts available for a retry, it can
            # claim resources on that new host for the instance.
            self.reportclient.delete_allocation_for_instance(context,
                                                             instance.uuid)

            self.compute_task_api.build_instances(context, [instance],
                    image, filter_properties, admin_password,
                    injected_files, requested_networks, security_groups,
                    block_device_mapping, request_spec=request_spec,
                    host_lists=[host_list])
            return build_results.RESCHEDULED
        except (exception.InstanceNotFound,
                exception.UnexpectedDeletingTaskStateError):
            msg = 'Instance disappeared during build.'
            LOG.debug(msg, instance=instance)
            self._cleanup_allocated_networks(context, instance,
                                             requested_networks)
            return build_results.FAILED
        except Exception as e:
            if isinstance(e, exception.BuildAbortException):
                LOG.error(e.format_message(), instance=instance)
            else:
                # Should not reach here.
                LOG.exception('Unexpected build failure, not rescheduling '
                              'build.', instance=instance)
            self._cleanup_allocated_networks(context, instance,
                                             requested_networks)
            self._cleanup_volumes(context, instance,
                                  block_device_mapping, raise_exc=False)
            compute_utils.add_instance_fault_from_exc(context, instance,
                    e, sys.exc_info())
            self._nil_out_instance_obj_host_and_node(instance)
            self._set_instance_obj_error_state(context, instance,
                                               clean_task_state=True)
            return build_results.FAILED
    def deallocate_sriov_ports_on_reschedule(self, instance):
        """Determine if networks need to be deallocated before a reschedule.

        Check the cached network info for any assigned SR-IOV ports.
        SR-IOV ports should be deallocated prior to rescheduling
        in order to allow new sriov pci devices to be allocated on a new host.
        """
        info_cache = instance.info_cache

        def _has_sriov_port(vif):
            return vif['vnic_type'] in network_model.VNIC_TYPES_SRIOV

        if (info_cache and info_cache.network_info):
            for vif in info_cache.network_info:
                if _has_sriov_port(vif):
                    return True
        return False
    @staticmethod
    def _get_scheduler_hints(filter_properties, request_spec=None):
        """Helper method to get scheduler hints.

        This method prefers to get the hints out of the request spec, but that
        might not be provided. Conductor will pass request_spec down to the
        first compute chosen for a build but older computes will not pass
        the request_spec to conductor's build_instances method for a
        reschedule, so if we're on a host via a retry, request_spec may not
        be provided and we need to fall back to the filter_properties
        to get scheduler hints.
        """
        hints = {}
        if request_spec is not None and 'scheduler_hints' in request_spec:
            hints = request_spec.scheduler_hints
        if not hints:
            hints = filter_properties.get('scheduler_hints') or {}
        return hints
    @staticmethod
    def _get_request_group_mapping(request_spec):
        """Return the request group - resource provider mapping. This is
        currently used for Neutron ports that have a resource request due to
        the port having a QoS minimum bandwidth policy rule attached.

        :param request_spec: A RequestSpec object
        :returns: A dict keyed by RequestGroup requester_id, currently Neutron
            port_id, to resource provider UUID that provides resource for that
            RequestGroup.
        """
        if (request_spec
                and 'requested_resources' in request_spec
                and request_spec.requested_resources is not None):
            return {
                group.requester_id: group.provider_uuids
                for group in request_spec.requested_resources
            }
        else:
            return None

    def _update_pci_request_spec_with_allocated_interface_name(
            self, context, instance, request_group_resource_providers_mapping):
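        """Update PCI request specs with the allocated parent interface name.

        For each PCI request that maps to a request group, look up the
        resource provider allocated for it and record the interface name
        (the last piece of the provider name reported by neutron) as
        'parent_ifname' in the request spec, saving the instance if anything
        was modified.
        """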
        if not instance.pci_requests:
            return

        def needs_update(pci_request, mapping):
            return (pci_request.requester_id
                    and pci_request.requester_id in mapping)

        modified = False
        for pci_request in instance.pci_requests.requests:
            if needs_update(
                    pci_request, request_group_resource_providers_mapping):

                provider_uuids = request_group_resource_providers_mapping[
                    pci_request.requester_id]

                if len(provider_uuids) != 1:
                    reason = (
                        'Allocating resources from more than one resource '
                        'provider %(providers)s for a single pci request '
                        '%(requester)s is not supported.' %
                        {'providers': provider_uuids,
                         'requester': pci_request.requester_id})
                    raise exception.BuildAbortException(
                        instance_uuid=instance.uuid,
                        reason=reason)

                dev_rp_name = self.reportclient.get_resource_provider_name(
                    context,
                    provider_uuids[0])

                # NOTE(gibi): the device RP name reported by neutron is
                # structured like <hostname>:<agentname>:<interfacename>
                rp_name_pieces = dev_rp_name.split(':')
                if len(rp_name_pieces) != 3:
                    reason = (
                        'Resource provider %(provider)s used to allocate '
                        'resources for the pci request %(requester)s does not '
                        'have a properly formatted name. Expected name format '
                        'is <hostname>:<agentname>:<interfacename>, but got '
                        '%(provider_name)s' %
                        {'provider': provider_uuids[0],
                         'requester': pci_request.requester_id,
                         'provider_name': dev_rp_name})
                    raise exception.BuildAbortException(
                        instance_uuid=instance.uuid,
                        reason=reason)

                for spec in pci_request.spec:
                    spec['parent_ifname'] = rp_name_pieces[2]

                modified = True
        if modified:
            instance.save()
    def _build_and_run_instance(self, context, instance, image, injected_files,
            admin_password, requested_networks, security_groups,
            block_device_mapping, node, limits, filter_properties,
            request_spec=None):
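        """Claim resources then build and spawn the instance on this host.

        Emits create.start/end notifications, claims resources with the
        resource tracker, prepares networking and block devices via
        _build_resources and asks the virt driver to spawn the guest.
        Failures are translated into BuildAbortException or
        RescheduledException for the caller to handle.
        """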
        image_name = image.get('name')
        self._notify_about_instance_usage(context, instance, 'create.start',
                extra_usage_info={'image_name': image_name})
        compute_utils.notify_about_instance_create(
            context, instance, self.host,
            phase=fields.NotificationPhase.START,
            bdms=block_device_mapping)

        # NOTE(mikal): cache the keystone roles associated with the instance
        # at boot time for later reference
        instance.system_metadata.update(
            {'boot_roles': ','.join(context.roles)})

        self._check_device_tagging(requested_networks, block_device_mapping)
        self._check_trusted_certs(instance)

        request_group_resource_providers_mapping = \
            self._get_request_group_mapping(request_spec)

        if request_group_resource_providers_mapping:
            self._update_pci_request_spec_with_allocated_interface_name(
                context, instance, request_group_resource_providers_mapping)

        try:
            scheduler_hints = self._get_scheduler_hints(filter_properties,
                                                        request_spec)
            with self.rt.instance_claim(context, instance, node, limits):
                # NOTE(russellb) It's important that this validation be done
                # *after* the resource tracker instance claim, as that is where
                # the host is set on the instance.
                self._validate_instance_group_policy(context, instance,
                                                     scheduler_hints)
                image_meta = objects.ImageMeta.from_dict(image)

                request_group_resource_providers_mapping = \
                    self._get_request_group_mapping(request_spec)

                with self._build_resources(context, instance,
                        requested_networks, security_groups, image_meta,
                        block_device_mapping,
                        request_group_resource_providers_mapping) as resources:
                    instance.vm_state = vm_states.BUILDING
                    instance.task_state = task_states.SPAWNING
                    # NOTE(JoshNang) This also saves the changes to the
                    # instance from _allocate_network_async, as they aren't
                    # saved in that function to prevent races.
                    instance.save(expected_task_state=
                            task_states.BLOCK_DEVICE_MAPPING)
                    block_device_info = resources['block_device_info']
                    network_info = resources['network_info']
                    allocs = resources['allocations']
                    LOG.debug('Start spawning the instance on the hypervisor.',
                              instance=instance)
                    with timeutils.StopWatch() as timer:
                        self.driver.spawn(context, instance, image_meta,
                                          injected_files, admin_password,
                                          allocs, network_info=network_info,
                                          block_device_info=block_device_info)
                    LOG.info('Took %0.2f seconds to spawn the instance on '
                             'the hypervisor.', timer.elapsed(),
                             instance=instance)
        except (exception.InstanceNotFound,
                exception.UnexpectedDeletingTaskStateError) as e:
            with excutils.save_and_reraise_exception():
                self._notify_about_instance_usage(context, instance,
                    'create.error', fault=e)
                tb = traceback.format_exc()
                compute_utils.notify_about_instance_create(
                    context, instance, self.host,
                    phase=fields.NotificationPhase.ERROR, exception=e,
                    bdms=block_device_mapping, tb=tb)
        except exception.ComputeResourcesUnavailable as e:
            LOG.debug(e.format_message(), instance=instance)
            self._notify_about_instance_usage(context, instance,
                    'create.error', fault=e)
            tb = traceback.format_exc()
            compute_utils.notify_about_instance_create(
                    context, instance, self.host,
                    phase=fields.NotificationPhase.ERROR, exception=e,
                    bdms=block_device_mapping, tb=tb)
            raise exception.RescheduledException(
                    instance_uuid=instance.uuid, reason=e.format_message())
        except exception.BuildAbortException as e:
            with excutils.save_and_reraise_exception():
                LOG.debug(e.format_message(), instance=instance)
                self._notify_about_instance_usage(context, instance,
                    'create.error', fault=e)
                tb = traceback.format_exc()
                compute_utils.notify_about_instance_create(
                    context, instance, self.host,
                    phase=fields.NotificationPhase.ERROR, exception=e,
                    bdms=block_device_mapping, tb=tb)
        except (exception.FixedIpLimitExceeded,
                exception.NoMoreNetworks, exception.NoMoreFixedIps) as e:
            LOG.warning('No more network or fixed IP to be allocated',
                        instance=instance)
            self._notify_about_instance_usage(context, instance,
                    'create.error', fault=e)
            tb = traceback.format_exc()
            compute_utils.notify_about_instance_create(
                    context, instance, self.host,
                    phase=fields.NotificationPhase.ERROR, exception=e,
                    bdms=block_device_mapping, tb=tb)
            msg = _('Failed to allocate the network(s) with error %s, '
                    'not rescheduling.') % e.format_message()
            raise exception.BuildAbortException(instance_uuid=instance.uuid,
                    reason=msg)
        except (exception.VirtualInterfaceCreateException,
                exception.VirtualInterfaceMacAddressException,
                exception.FixedIpInvalidOnHost,
                exception.UnableToAutoAllocateNetwork,
                exception.NetworksWithQoSPolicyNotSupported) as e:
            LOG.exception('Failed to allocate network(s)',
                          instance=instance)
            self._notify_about_instance_usage(context, instance,
                    'create.error', fault=e)
            tb = traceback.format_exc()
            compute_utils.notify_about_instance_create(
                    context, instance, self.host,
                    phase=fields.NotificationPhase.ERROR, exception=e,
                    bdms=block_device_mapping, tb=tb)
            msg = _('Failed to allocate the network(s), not rescheduling.')
            raise exception.BuildAbortException(instance_uuid=instance.uuid,
                    reason=msg)
        except (exception.FlavorDiskTooSmall,
                exception.FlavorMemoryTooSmall,
                exception.ImageNotActive,
                exception.ImageUnacceptable,
                exception.InvalidDiskInfo,
                exception.InvalidDiskFormat,
                cursive_exception.SignatureVerificationError,
                exception.CertificateValidationFailed,
                exception.VolumeEncryptionNotSupported,
                exception.InvalidInput,
                # TODO(mriedem): We should be validating RequestedVRamTooHigh
                # in the API during server create and rebuild.
                exception.RequestedVRamTooHigh) as e:
            self._notify_about_instance_usage(context, instance,
                    'create.error', fault=e)
            tb = traceback.format_exc()
            compute_utils.notify_about_instance_create(
                    context, instance, self.host,
                    phase=fields.NotificationPhase.ERROR, exception=e,
                    bdms=block_device_mapping, tb=tb)
            raise exception.BuildAbortException(instance_uuid=instance.uuid,
                    reason=e.format_message())
        except Exception as e:
            self._notify_about_instance_usage(context, instance,
                    'create.error', fault=e)
            tb = traceback.format_exc()
            compute_utils.notify_about_instance_create(
                    context, instance, self.host,
                    phase=fields.NotificationPhase.ERROR, exception=e,
                    bdms=block_device_mapping, tb=tb)
            raise exception.RescheduledException(
                    instance_uuid=instance.uuid, reason=six.text_type(e))

        # NOTE(alaski): This is only useful during reschedules, remove it now.
        instance.system_metadata.pop('network_allocated', None)

        # If CONF.default_access_ip_network_name is set, grab the
        # corresponding network and set the access ip values accordingly.
        network_name = CONF.default_access_ip_network_name
        if (network_name and not instance.access_ip_v4 and
                not instance.access_ip_v6):
            # Note that when there are multiple ips to choose from, an
            # arbitrary one will be chosen.
            for vif in network_info:
                if vif['network']['label'] == network_name:
                    for ip in vif.fixed_ips():
                        if not instance.access_ip_v4 and ip['version'] == 4:
                            instance.access_ip_v4 = ip['address']
                        if not instance.access_ip_v6 and ip['version'] == 6:
                            instance.access_ip_v6 = ip['address']
                    break

        self._update_instance_after_spawn(context, instance)

        try:
            instance.save(expected_task_state=task_states.SPAWNING)
        except (exception.InstanceNotFound,
                exception.UnexpectedDeletingTaskStateError) as e:
            with excutils.save_and_reraise_exception():
                self._notify_about_instance_usage(context, instance,
                    'create.error', fault=e)
                tb = traceback.format_exc()
                compute_utils.notify_about_instance_create(
                    context, instance, self.host,
                    phase=fields.NotificationPhase.ERROR, exception=e,
                    bdms=block_device_mapping, tb=tb)

        self._update_scheduler_instance_info(context, instance)
        self._notify_about_instance_usage(context, instance, 'create.end',
                extra_usage_info={'message': _('Success')},
                network_info=network_info)
        compute_utils.notify_about_instance_create(context, instance,
                self.host, phase=fields.NotificationPhase.END,
                bdms=block_device_mapping)
    @contextlib.contextmanager
    def _build_resources(self, context, instance, requested_networks,
                         security_groups, image_meta, block_device_mapping,
                         resource_provider_mapping):
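        """Context manager that prepares the resources needed to spawn.

        Allocates networking asynchronously, prepares block devices and
        fetches placement allocations, yielding a dict with 'network_info',
        'block_device_info' and 'allocations'. On failure inside the
        context, the instance is shut down and a BuildAbortException may be
        raised instead of rescheduling.
        """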
        resources = {}
        network_info = None
        try:
            LOG.debug('Start building networks asynchronously for instance.',
                      instance=instance)
            network_info = self._build_networks_for_instance(context, instance,
                    requested_networks, security_groups,
                    resource_provider_mapping)
            resources['network_info'] = network_info
        except (exception.InstanceNotFound,
                exception.UnexpectedDeletingTaskStateError):
            raise
        except exception.UnexpectedTaskStateError as e:
            raise exception.BuildAbortException(instance_uuid=instance.uuid,
                    reason=e.format_message())
        except Exception:
            # Because this allocation is async any failures are likely to occur
            # when the driver accesses network_info during spawn().
            LOG.exception('Failed to allocate network(s)',
                          instance=instance)
            msg = _('Failed to allocate the network(s), not rescheduling.')
            raise exception.BuildAbortException(instance_uuid=instance.uuid,
                    reason=msg)

        try:
            # Perform any driver preparation work for the driver.
            self.driver.prepare_for_spawn(instance)

            # Depending on a virt driver, some network configuration is
            # necessary before preparing block devices.
            self.driver.prepare_networks_before_block_device_mapping(
                instance, network_info)

            # Verify that all the BDMs have a device_name set and assign a
            # default to the ones missing it with the help of the driver.
            self._default_block_device_names(instance, image_meta,
                                             block_device_mapping)

            LOG.debug('Start building block device mappings for instance.',
                      instance=instance)
            instance.vm_state = vm_states.BUILDING
            instance.task_state = task_states.BLOCK_DEVICE_MAPPING
            instance.save()

            block_device_info = self._prep_block_device(context, instance,
                                                        block_device_mapping)
            resources['block_device_info'] = block_device_info
        except (exception.InstanceNotFound,
                exception.UnexpectedDeletingTaskStateError):
            with excutils.save_and_reraise_exception():
                # Make sure the async call finishes
                if network_info is not None:
                    network_info.wait(do_raise=False)
                    self.driver.clean_networks_preparation(instance,
                                                           network_info)
                self.driver.failed_spawn_cleanup(instance)
        except (exception.UnexpectedTaskStateError,
                exception.OverQuota, exception.InvalidBDM) as e:
            # Make sure the async call finishes
            if network_info is not None:
                network_info.wait(do_raise=False)
                self.driver.clean_networks_preparation(instance, network_info)
            self.driver.failed_spawn_cleanup(instance)
            raise exception.BuildAbortException(instance_uuid=instance.uuid,
                    reason=e.format_message())
        except Exception:
            LOG.exception('Failure prepping block device',
                          instance=instance)
            # Make sure the async call finishes
            if network_info is not None:
                network_info.wait(do_raise=False)
                self.driver.clean_networks_preparation(instance, network_info)
            self.driver.failed_spawn_cleanup(instance)
            msg = _('Failure prepping block device.')
            raise exception.BuildAbortException(instance_uuid=instance.uuid,
                    reason=msg)

        try:
            resources['allocations'] = (
                self.reportclient.get_allocations_for_consumer(context,
                                                               instance.uuid))
        except Exception:
            LOG.exception('Failure retrieving placement allocations',
                          instance=instance)
            # Make sure the async call finishes
            if network_info is not None:
                network_info.wait(do_raise=False)
            self.driver.failed_spawn_cleanup(instance)
            msg = _('Failure retrieving placement allocations')
            raise exception.BuildAbortException(instance_uuid=instance.uuid,
                    reason=msg)

        try:
            yield resources
        except Exception as exc:
            with excutils.save_and_reraise_exception() as ctxt:
                if not isinstance(exc, (
                        exception.InstanceNotFound,
                        exception.UnexpectedDeletingTaskStateError)):
                    LOG.exception('Instance failed to spawn',
                                  instance=instance)
                # Make sure the async call finishes
                if network_info is not None:
                    network_info.wait(do_raise=False)
                # if network_info is empty we're likely here because of
                # network allocation failure. Since nothing can be reused on
                # rescheduling it's better to deallocate network to eliminate
                # the chance of orphaned ports in neutron
                deallocate_networks = False if network_info else True
                try:
                    self._shutdown_instance(context, instance,
                            block_device_mapping, requested_networks,
                            try_deallocate_networks=deallocate_networks)
                except Exception as exc2:
                    ctxt.reraise = False
                    LOG.warning('Could not clean up failed build,'
                                ' not rescheduling. Error: %s',
                                six.text_type(exc2))
                    raise exception.BuildAbortException(
                            instance_uuid=instance.uuid,
                            reason=six.text_type(exc))
    def _cleanup_allocated_networks(self, context, instance,
                                    requested_networks):
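        """Deallocate the instance's networks and reset the metadata flag.

        Failures to deallocate are logged and swallowed; on success the
        'network_allocated' system metadata key is set to 'False'.
        """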
        try:
            self._deallocate_network(context, instance, requested_networks)
        except Exception:
            LOG.exception('Failed to deallocate networks', instance=instance)
            return

        instance.system_metadata['network_allocated'] = 'False'
        try:
            instance.save()
        except exception.InstanceNotFound:
            # NOTE(alaski): It's possible that we're cleaning up the networks
            # because the instance was deleted. If that's the case then this
            # exception will be raised by instance.save()
            pass

    def _try_deallocate_network(self, context, instance,
                                requested_networks=None):
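        """Tear down the instance's networking, retrying on connect failures.

        If deallocation still fails after the retries, the instance is put
        into the ERROR state and the exception is re-raised.
        """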
        # During auto-scale cleanup, we could be deleting a large number
        # of servers at the same time and overloading parts of the system,
        # so we retry a few times in case of connection failures to the
        # networking service.
        @loopingcall.RetryDecorator(
            max_retry_count=3, inc_sleep_time=2, max_sleep_time=12,
            exceptions=(keystone_exception.connection.ConnectFailure,))
        def _deallocate_network_with_retries():
            try:
                self._deallocate_network(
                    context, instance, requested_networks)
            except keystone_exception.connection.ConnectFailure as e:
                # Provide a warning that something is amiss.
                with excutils.save_and_reraise_exception():
                    LOG.warning('Failed to deallocate network for instance; '
                                'retrying. Error: %s', six.text_type(e),
                                instance=instance)

        try:
            # tear down allocated network structure
            _deallocate_network_with_retries()
        except Exception as ex:
            with excutils.save_and_reraise_exception():
                LOG.error('Failed to deallocate network for instance. '
                          'Error: %s', ex, instance=instance)
                self._set_instance_obj_error_state(context, instance)
    def _get_power_off_values(self, context, instance, clean_shutdown):
        """Get the timing configuration for powering down this instance."""
        if clean_shutdown:
            timeout = compute_utils.get_value_from_system_metadata(instance,
                          key='image_os_shutdown_timeout', type=int,
                          default=CONF.shutdown_timeout)
            retry_interval = CONF.compute.shutdown_retry_interval
        else:
            timeout = 0
            retry_interval = 0

        return timeout, retry_interval

    def _power_off_instance(self, context, instance, clean_shutdown=True):
        """Power off an instance on this host."""
        timeout, retry_interval = self._get_power_off_values(context,
                instance, clean_shutdown)
        self.driver.power_off(instance, timeout, retry_interval)
    def _shutdown_instance(self, context, instance,
                           bdms, requested_networks=None, notify=True,
                           try_deallocate_networks=True):
        """Shutdown an instance on this host.

        :param context: security context
        :param instance: a nova.objects.Instance object
        :param bdms: the block devices for the instance to be torn
                     down
        :param requested_networks: the networks on which the instance
                                   has ports
        :param notify: true if a final usage notification should be
                       emitted
        :param try_deallocate_networks: false if we should avoid
                                        trying to teardown networking
        """
        context = context.elevated()
        LOG.info('Terminating instance', instance=instance)

        if notify:
            self._notify_about_instance_usage(context, instance,
                                              "shutdown.start")
            compute_utils.notify_about_instance_action(context, instance,
                    self.host, action=fields.NotificationAction.SHUTDOWN,
                    phase=fields.NotificationPhase.START, bdms=bdms)

        network_info = instance.get_network_info()

        # NOTE(vish) get bdms before destroying the instance
        vol_bdms = [bdm for bdm in bdms if bdm.is_volume]
        block_device_info = self._get_instance_block_device_info(
            context, instance, bdms=bdms)

        # NOTE(melwitt): attempt driver destroy before releasing ip, may
        # want to keep ip allocated for certain failures
        try:
            LOG.debug('Start destroying the instance on the hypervisor.',
                      instance=instance)
            with timeutils.StopWatch() as timer:
                self.driver.destroy(context, instance, network_info,
                                    block_device_info)
            LOG.info('Took %0.2f seconds to destroy the instance on the '
                     'hypervisor.', timer.elapsed(), instance=instance)
        except exception.InstancePowerOffFailure:
            # if the instance can't power off, don't release the ip
            with excutils.save_and_reraise_exception():
                pass
        except Exception:
            with excutils.save_and_reraise_exception():
                # deallocate ip and fail without proceeding to
                # volume api calls, preserving current behavior
                if try_deallocate_networks:
                    self._try_deallocate_network(context, instance,
                                                 requested_networks)

        if try_deallocate_networks:
            self._try_deallocate_network(context, instance, requested_networks)

        timer.restart()
        for bdm in vol_bdms:
            try:
                if bdm.attachment_id:
                    self.volume_api.attachment_delete(context,
                                                      bdm.attachment_id)
                else:
                    # NOTE(vish): actual driver detach done in driver.destroy,
                    # so just tell cinder that we are done with it.
                    connector = self.driver.get_volume_connector(instance)
                    self.volume_api.terminate_connection(context,
                                                         bdm.volume_id,
                                                         connector)
                    self.volume_api.detach(context, bdm.volume_id,
                                           instance.uuid)
            except exception.VolumeAttachmentNotFound as exc:
                LOG.debug('Ignoring VolumeAttachmentNotFound: %s', exc,
                          instance=instance)
            except exception.DiskNotFound as exc:
                LOG.debug('Ignoring DiskNotFound: %s', exc,
                          instance=instance)
            except exception.VolumeNotFound as exc:
                LOG.debug('Ignoring VolumeNotFound: %s', exc,
                          instance=instance)
            except (cinder_exception.EndpointNotFound,
                    keystone_exception.EndpointNotFound) as exc:
                LOG.warning('Ignoring EndpointNotFound for '
                            'volume %(volume_id)s: %(exc)s',
                            {'exc': exc, 'volume_id': bdm.volume_id},
                            instance=instance)
            except cinder_exception.ClientException as exc:
                LOG.warning('Ignoring unknown cinder exception for '
                            'volume %(volume_id)s: %(exc)s',
                            {'exc': exc, 'volume_id': bdm.volume_id},
                            instance=instance)
            except Exception as exc:
                LOG.warning('Ignoring unknown exception for '
                            'volume %(volume_id)s: %(exc)s',
                            {'exc': exc, 'volume_id': bdm.volume_id},
                            instance=instance)
        if vol_bdms:
            LOG.info('Took %(time).2f seconds to detach %(num)s volumes '
                     'for instance.',
                     {'time': timer.elapsed(), 'num': len(vol_bdms)},
                     instance=instance)

        if notify:
            self._notify_about_instance_usage(context, instance,
                                              "shutdown.end")
            compute_utils.notify_about_instance_action(context, instance,
                    self.host, action=fields.NotificationAction.SHUTDOWN,
                    phase=fields.NotificationPhase.END, bdms=bdms)
    def _cleanup_volumes(self, context, instance, bdms, raise_exc=True,
                         detach=True):
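        """Detach and, where requested, delete the instance's volumes.

        Volumes marked delete_on_termination are deleted. Failures are
        logged and, if raise_exc is True, the last captured exception is
        re-raised after all BDMs have been processed.
        """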
        exc_info = None
        for bdm in bdms:
            if detach and bdm.volume_id:
                try:
                    LOG.debug("Detaching volume: %s", bdm.volume_id,
                              instance_uuid=instance.uuid)
                    destroy = bdm.delete_on_termination
                    self._detach_volume(context, bdm, instance,
                                        destroy_bdm=destroy)
                except Exception as exc:
                    exc_info = sys.exc_info()
                    LOG.warning('Failed to detach volume: %(volume_id)s '
                                'due to %(exc)s',
                                {'volume_id': bdm.volume_id, 'exc': exc})

            if bdm.volume_id and bdm.delete_on_termination:
                try:
                    LOG.debug("Deleting volume: %s", bdm.volume_id,
                              instance_uuid=instance.uuid)
                    self.volume_api.delete(context, bdm.volume_id)
                except Exception as exc:
                    exc_info = sys.exc_info()
                    LOG.warning('Failed to delete volume: %(volume_id)s '
                                'due to %(exc)s',
                                {'volume_id': bdm.volume_id, 'exc': exc})
        if exc_info is not None and raise_exc:
            six.reraise(exc_info[0], exc_info[1], exc_info[2])
    @hooks.add_hook("delete_instance")
    def _delete_instance(self, context, instance, bdms):
        """Delete an instance on this host.

        :param context: nova request context
        :param instance: nova.objects.instance.Instance object
        :param bdms: nova.objects.block_device.BlockDeviceMappingList object
        """
        events = self.instance_events.clear_events_for_instance(instance)
        if events:
            LOG.debug('Events pending at deletion: %(events)s',
                      {'events': ','.join(events.keys())},
                      instance=instance)
        self._notify_about_instance_usage(context, instance,
                                          "delete.start")
        compute_utils.notify_about_instance_action(context, instance,
                self.host, action=fields.NotificationAction.DELETE,
                phase=fields.NotificationPhase.START, bdms=bdms)

        self._shutdown_instance(context, instance, bdms)

        # NOTE(vish): We have already deleted the instance, so we have
        # to ignore problems cleaning up the volumes. It
        # would be nice to let the user know somehow that
        # the volume deletion failed, but it is not
        # acceptable to have an instance that can not be
        # deleted. Perhaps this could be reworked in the
        # future to set an instance fault the first time
        # and to only ignore the failure if the instance
        # is already in ERROR.
        # NOTE(ameeda): The volumes were already detached during the above
        # _shutdown_instance() call, which is why detach is
        # not requested from _cleanup_volumes() in this case.
        self._cleanup_volumes(context, instance, bdms,
                              raise_exc=False, detach=False)

        # if a delete task succeeded, always update vm state and task
        # state without expecting task state to be DELETING
        instance.vm_state = vm_states.DELETED
        instance.task_state = None
        instance.power_state = power_state.NOSTATE
        instance.terminated_at = timeutils.utcnow()
        instance.save()

        self._complete_deletion(context, instance)
        # only destroy the instance in the db if the _complete_deletion
        # doesn't raise and therefore allocation is successfully
        # deleted in placement
        instance.destroy()

        self._notify_about_instance_usage(context, instance, "delete.end")
        compute_utils.notify_about_instance_action(context, instance,
                self.host, action=fields.NotificationAction.DELETE,
                phase=fields.NotificationPhase.END, bdms=bdms)
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def terminate_instance(self, context, instance, bdms):
        """Terminate an instance on this host."""
        @utils.synchronized(instance.uuid)
        def do_terminate_instance(instance, bdms):
            # NOTE(mriedem): If we are deleting the instance while it was
            # booting from volume, we could be racing with a database update of
            # the BDM volume_id. Since the compute API passes the BDMs over RPC
            # to compute here, the BDMs may be stale at this point. So check
            # for any volume BDMs that don't have volume_id set and if we
            # detect that, we need to refresh the BDM list before proceeding.
            # TODO(mriedem): Move this into _delete_instance and make the bdms
            # parameter optional.
            for bdm in list(bdms):
                if bdm.is_volume and not bdm.volume_id:
                    LOG.debug('There are potentially stale BDMs during '
                              'delete, refreshing the BlockDeviceMappingList.',
                              instance=instance)
                    bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                        context, instance.uuid)
                    break
            try:
                self._delete_instance(context, instance, bdms)
            except exception.InstanceNotFound:
                LOG.info("Instance disappeared during terminate",
                         instance=instance)
            except Exception:
                # As we're trying to delete always go to Error if something
                # goes wrong that _delete_instance can't handle.
                with excutils.save_and_reraise_exception():
                    LOG.exception('Setting instance vm_state to ERROR',
                                  instance=instance)
                    self._set_instance_obj_error_state(context, instance)

        do_terminate_instance(instance, bdms)
    # NOTE(johannes): This is probably better named power_off_instance
    # so it matches the driver method, but because of other issues, we
    # can't use that name in grizzly.
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def stop_instance(self, context, instance, clean_shutdown):
        """Stop an instance on this host."""

        @utils.synchronized(instance.uuid)
        def do_stop_instance():
            current_power_state = self._get_power_state(context, instance)
            LOG.debug('Stopping instance; current vm_state: %(vm_state)s, '
                      'current task_state: %(task_state)s, current DB '
                      'power_state: %(db_power_state)s, current VM '
                      'power_state: %(current_power_state)s',
                      {'vm_state': instance.vm_state,
                       'task_state': instance.task_state,
                       'db_power_state': instance.power_state,
                       'current_power_state': current_power_state},
                      instance_uuid=instance.uuid)

            # NOTE(mriedem): If the instance is already powered off, we are
            # possibly tearing down and racing with other operations, so we can
            # expect the task_state to be None if something else updates the
            # instance and we're not locking it.
            expected_task_state = [task_states.POWERING_OFF]
            # The list of power states is from _sync_instance_power_state.
            if current_power_state in (power_state.NOSTATE,
                                       power_state.SHUTDOWN,
                                       power_state.CRASHED):
                LOG.info('Instance is already powered off in the '
                         'hypervisor when stop is called.',
                         instance=instance)
                expected_task_state.append(None)

            self._notify_about_instance_usage(context, instance,
                                              "power_off.start")
            compute_utils.notify_about_instance_action(context, instance,
                    self.host, action=fields.NotificationAction.POWER_OFF,
                    phase=fields.NotificationPhase.START)

            self._power_off_instance(context, instance, clean_shutdown)
            instance.power_state = self._get_power_state(context, instance)
            instance.vm_state = vm_states.STOPPED
            instance.task_state = None
            instance.save(expected_task_state=expected_task_state)

            self._notify_about_instance_usage(context, instance,
                                              "power_off.end")
            compute_utils.notify_about_instance_action(context, instance,
                    self.host, action=fields.NotificationAction.POWER_OFF,
                    phase=fields.NotificationPhase.END)

        do_stop_instance()
    def _power_on(self, context, instance):
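        """Power on an instance via the virt driver using refreshed network
        and block device information.
        """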
        network_info = self.network_api.get_instance_nw_info(context, instance)
        block_device_info = self._get_instance_block_device_info(context,
                                                                 instance)
        self.driver.power_on(context, instance,
                             network_info,
                             block_device_info)
    def _delete_snapshot_of_shelved_instance(self, context, instance,
                                             snapshot_id):
        """Delete the snapshot of a shelved instance."""
        try:
            self.image_api.delete(context, snapshot_id)
        except (exception.ImageNotFound,
                exception.ImageNotAuthorized) as exc:
            LOG.warning("Failed to delete snapshot "
                        "from shelved instance (%s).",
                        exc.format_message(), instance=instance)
        except Exception:
            LOG.exception("Something went wrong when trying to "
                          "delete the snapshot from the shelved instance.",
                          instance=instance)
    # NOTE(johannes): This is probably better named power_on_instance
    # so it matches the driver method, but because of other issues, we
    # can't use that name in grizzly.
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def start_instance(self, context, instance):
        """Start an instance on this host."""
        self._notify_about_instance_usage(context, instance, "power_on.start")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.POWER_ON,
            phase=fields.NotificationPhase.START)
        self._power_on(context, instance)
        instance.power_state = self._get_power_state(context, instance)
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = None

        # Delete an image (VM snapshot) for a shelved instance
        snapshot_id = instance.system_metadata.get('shelved_image_id')
        if snapshot_id:
            self._delete_snapshot_of_shelved_instance(context, instance,
                                                      snapshot_id)

        # Delete system_metadata for a shelved instance
        compute_utils.remove_shelved_keys_from_system_metadata(instance)

        instance.save(expected_task_state=task_states.POWERING_ON)
        self._notify_about_instance_usage(context, instance, "power_on.end")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.POWER_ON,
            phase=fields.NotificationPhase.END)
    @messaging.expected_exceptions(NotImplementedError,
                                   exception.TriggerCrashDumpNotSupported,
                                   exception.InstanceNotRunning)
    @wrap_exception()
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def trigger_crash_dump(self, context, instance):
        """Trigger crash dump in an instance."""
        self._notify_about_instance_usage(context, instance,
                                          "trigger_crash_dump.start")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
            phase=fields.NotificationPhase.START)

        # This method does not change task_state and power_state because the
        # effect of a trigger depends on user's configuration.
        self.driver.trigger_crash_dump(instance)

        self._notify_about_instance_usage(context, instance,
                                          "trigger_crash_dump.end")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
            phase=fields.NotificationPhase.END)
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def soft_delete_instance(self, context, instance):
        """Soft delete an instance on this host."""
        with compute_utils.notify_about_instance_delete(
                self.notifier, context, instance, 'soft_delete',
                source=fields.NotificationSource.COMPUTE):
            try:
                self.driver.soft_delete(instance)
            except NotImplementedError:
                # Fall back to just powering off the instance if the
                # hypervisor doesn't implement the soft_delete method
                self.driver.power_off(instance)
            instance.power_state = self._get_power_state(context, instance)
            instance.vm_state = vm_states.SOFT_DELETED
            instance.task_state = None
            instance.save(expected_task_state=[task_states.SOFT_DELETING])
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def restore_instance(self, context, instance):
        """Restore a soft-deleted instance on this host."""
        self._notify_about_instance_usage(context, instance, "restore.start")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.RESTORE,
            phase=fields.NotificationPhase.START)
        try:
            self.driver.restore(instance)
        except NotImplementedError:
            # Fall back to just powering on the instance if the hypervisor
            # doesn't implement the restore method
            self._power_on(context, instance)
        instance.power_state = self._get_power_state(context, instance)
        instance.vm_state = vm_states.ACTIVE
        instance.task_state = None
        instance.save(expected_task_state=task_states.RESTORING)
        self._notify_about_instance_usage(context, instance, "restore.end")
        compute_utils.notify_about_instance_action(context, instance,
            self.host, action=fields.NotificationAction.RESTORE,
            phase=fields.NotificationPhase.END)
    @staticmethod
    def _set_migration_status(migration, status):
        """Set the status, and guard against a None being passed in.

        This is useful as some of the compute RPC calls will not pass
        a migration object in older versions. The check can be removed when
        we move past the 4.x major version of the RPC API.
        """
        if migration:
            migration.status = status
            migration.save()
    def _rebuild_default_impl(self, context, instance, image_meta,
                              injected_files, admin_password, allocations,
                              bdms, detach_block_devices, attach_block_devices,
                              network_info=None,
                              evacuate=False, block_device_info=None,
                              preserve_ephemeral=False):
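        """Default rebuild flow used when the virt driver does not provide
        its own rebuild implementation.

        Detaches block devices (and, for a non-evacuate rebuild, powers off
        and destroys the existing guest), re-attaches block devices and
        spawns the instance again from image_meta. Preserving ephemeral
        partitions is not supported here.
        """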
        if preserve_ephemeral:
            # The default code path does not support preserving ephemeral
            # partitions.
            raise exception.PreserveEphemeralNotSupported()

        if evacuate:
            detach_block_devices(context, bdms)
        else:
            self._power_off_instance(context, instance, clean_shutdown=True)
            detach_block_devices(context, bdms)
            self.driver.destroy(context, instance,
                                network_info=network_info,
                                block_device_info=block_device_info)

        instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING
        instance.save(expected_task_state=[task_states.REBUILDING])

        new_block_device_info = attach_block_devices(context, instance, bdms)

        instance.task_state = task_states.REBUILD_SPAWNING
        instance.save(
            expected_task_state=[task_states.REBUILD_BLOCK_DEVICE_MAPPING])

        with instance.mutated_migration_context():
            self.driver.spawn(context, instance, image_meta, injected_files,
                              admin_password, allocations,
                              network_info=network_info,
                              block_device_info=new_block_device_info)
    def _notify_instance_rebuild_error(self, context, instance, error, bdms):
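        """Emit legacy and versioned rebuild.error notifications for a failed
        rebuild, including the traceback of the current exception.
        """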
        tb = traceback.format_exc()
        self._notify_about_instance_usage(context, instance,
                                          'rebuild.error', fault=error)
        compute_utils.notify_about_instance_rebuild(
            context, instance, self.host,
            phase=fields.NotificationPhase.ERROR, exception=error, bdms=bdms,
            tb=tb)
    @messaging.expected_exceptions(exception.PreserveEphemeralNotSupported)
    @wrap_exception()
    @reverts_task_state
    @wrap_instance_event(prefix='compute')
    @wrap_instance_fault
    def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
                         injected_files, new_pass, orig_sys_metadata,
                         bdms, recreate, on_shared_storage,
                         preserve_ephemeral, migration,
                         scheduled_node, limits, request_spec):
        """Destroy and re-make this instance.

        A 'rebuild' effectively purges all existing data from the system and
        remakes the VM with given 'metadata' and 'personalities'.

        :param context: `nova.RequestContext` object
        :param instance: Instance object
        :param orig_image_ref: Original image_ref before rebuild
        :param image_ref: New image_ref for rebuild
        :param injected_files: Files to inject
        :param new_pass: password to set on rebuilt instance
        :param orig_sys_metadata: instance system metadata from pre-rebuild
        :param bdms: block-device-mappings to use for rebuild
        :param recreate: True if the instance is being recreated (e.g. the
            hypervisor it was on failed) - cleanup of old state will be
            skipped.
        :param on_shared_storage: True if instance files on shared storage.
                                  If not provided then information from the
                                  driver will be used to decide if the instance
                                  files are available or not on the target host
        :param preserve_ephemeral: True if the default ephemeral storage
                                   partition must be preserved on rebuild
        :param migration: a Migration object if one was created for this
                          rebuild operation (if it's a part of evacuate)
        :param scheduled_node: A node of the host chosen by the scheduler. If a
                               host was specified by the user, this will be
                               None
        :param limits: Overcommit limits set by the scheduler. If a host was
                       specified by the user, this will be None
        :param request_spec: a RequestSpec object used to schedule the instance
        """
        # recreate=True means the instance is being evacuated from a failed
        # host to a new destination host (this host). The 'recreate' variable
        # name is confusing, so rename it to evacuate here at the top, which
        # is simpler than renaming a parameter in an RPC versioned method.
        evacuate = recreate
        context = context.elevated()

        if evacuate:
            LOG.info("Evacuating instance", instance=instance)
        else:
            LOG.info("Rebuilding instance", instance=instance)

        if evacuate:
            # This is an evacuation to a new host, so we need to perform a
            # resource claim.
            rebuild_claim = self.rt.rebuild_claim
        else:
            # This is a rebuild to the same host, so we don't need to make
            # a claim since the instance is already on this host.
            rebuild_claim = claims.NopClaim

        if image_ref:
            image_meta = objects.ImageMeta.from_image_ref(
                context, self.image_api, image_ref)
        elif evacuate:
            # For evacuate the API does not send down the image_ref since the
            # image does not change so just get it from what was stashed in
            # the instance system_metadata when the instance was created (or
            # last rebuilt). This also works for volume-backed instances.
            image_meta = instance.image_meta
        else:
            image_meta = objects.ImageMeta()

        # NOTE(mriedem): On an evacuate, we need to update
        # the instance's host and node properties to reflect its
        # destination node for the evacuate.
        if not scheduled_node:
            if evacuate:
                try:
                    compute_node = self._get_compute_info(context, self.host)
                    scheduled_node = compute_node.hypervisor_hostname
                except exception.ComputeHostNotFound:
                    LOG.exception('Failed to get compute_info for %s',
                                  self.host)
            else:
                scheduled_node = instance.node

        with self._error_out_instance_on_exception(context, instance):
            try:
                claim_ctxt = rebuild_claim(
                    context, instance, scheduled_node,
                    limits=limits, image_meta=image_meta,
                    migration=migration)
                self._do_rebuild_instance_with_claim(
                    claim_ctxt, context, instance, orig_image_ref,
                    image_meta, injected_files, new_pass, orig_sys_metadata,
                    bdms, evacuate, on_shared_storage, preserve_ephemeral,
                    migration, request_spec)
            except (exception.ComputeResourcesUnavailable,
                    exception.RescheduledException) as e:
                if isinstance(e, exception.ComputeResourcesUnavailable):
                    LOG.debug("Could not rebuild instance on this host, not "
                              "enough resources available.", instance=instance)
                else:
                    # RescheduledException is raised by the late server group
                    # policy check during evacuation if a parallel scheduling
                    # operation violated the policy.
                    # We catch the RescheduledException here but we don't have
                    # the plumbing to do an actual reschedule so we abort the
                    # operation.
                    LOG.debug("Could not rebuild instance on this host, "
                              "late server group check failed.",
                              instance=instance)
                # NOTE(ndipanov): We just abort the build for now and leave a
                # migration record for potential cleanup later
                self._set_migration_status(migration, 'failed')
                # Since the claim failed, we need to remove the allocation
                # created against the destination node. Note that we can only
                # get here when evacuating to a destination node. Rebuilding
                # on the same host (not evacuate) uses the NopClaim which will
                # not raise ComputeResourcesUnavailable.
                self.rt.delete_allocation_for_evacuated_instance(
                    context, instance, scheduled_node, node_type='destination')
                self._notify_instance_rebuild_error(context, instance, e, bdms)
                raise exception.BuildAbortException(
                    instance_uuid=instance.uuid, reason=e.format_message())
            except (exception.InstanceNotFound,
                    exception.UnexpectedDeletingTaskStateError) as e:
                LOG.debug('Instance was deleted while rebuilding',
                          instance=instance)
                self._set_migration_status(migration, 'failed')
                self._notify_instance_rebuild_error(context, instance, e, bdms)
            except Exception as e:
                self._set_migration_status(migration, 'failed')
                if evacuate or scheduled_node is not None:
                    self.rt.delete_allocation_for_evacuated_instance(
                        context, instance, scheduled_node,
                        node_type='destination')
                self._notify_instance_rebuild_error(context, instance, e, bdms)
                raise
            else:
                instance.apply_migration_context()
                # NOTE (ndipanov): This save will now update the host and node
                # attributes making sure that next RT pass is consistent since
                # it will be based on the instance and not the migration DB
                # entry.
                instance.host = self.host
                instance.node = scheduled_node
                instance.save()
                instance.drop_migration_context()

                # NOTE (ndipanov): Mark the migration as done only after we
                # mark the instance as belonging to this host.
                self._set_migration_status(migration, 'done')
    def _do_rebuild_instance_with_claim(self, claim_context, *args, **kwargs):
        """Helper to avoid deep nesting in the top-level method."""

        with claim_context:
            self._do_rebuild_instance(*args, **kwargs)

    @staticmethod
    def _get_image_name(image_meta):
        if image_meta.obj_attr_is_set("name"):
            return image_meta.name
        else:
            return ''
    def _do_rebuild_instance(self, context, instance, orig_image_ref,
                             image_meta, injected_files, new_pass,
                             orig_sys_metadata, bdms, evacuate,
                             on_shared_storage, preserve_ephemeral,
                             migration, request_spec):
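        """Perform the rebuild or evacuate work after any resource claim.

        Handles the evacuate-specific checks (late server group policy,
        driver capability, shared storage detection), emits rebuild
        notifications and reconfigures networking before rebuilding the
        guest.
        """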
  2875. orig_vm_state = instance.vm_state
  2876. if evacuate:
  2877. if request_spec:
  2878. # NOTE(gibi): Do a late check of server group policy as
  2879. # parallel scheduling could violate such policy. This will
  2880. # cause the evacuate to fail as rebuild does not implement
  2881. # reschedule.
  2882. hints = self._get_scheduler_hints({}, request_spec)
  2883. self._validate_instance_group_policy(context, instance, hints)
  2884. if not self.driver.capabilities.get("supports_evacuate", False):
  2885. raise exception.InstanceEvacuateNotSupported
  2886. self._check_instance_exists(context, instance)
  2887. if on_shared_storage is None:
  2888. LOG.debug('on_shared_storage is not provided, using driver '
  2889. 'information to decide if the instance needs to '
  2890. 'be evacuated')
  2891. on_shared_storage = self.driver.instance_on_disk(instance)
  2892. elif (on_shared_storage !=
  2893. self.driver.instance_on_disk(instance)):
2894. # Covers the case where the admin expects the instance files to
2895. # be on shared storage but they are not accessible, and vice versa
  2896. raise exception.InvalidSharedStorage(
  2897. _("Invalid state of instance files on shared"
  2898. " storage"))
  2899. if on_shared_storage:
  2900. LOG.info('disk on shared storage, evacuating using'
  2901. ' existing disk')
  2902. elif instance.image_ref:
  2903. orig_image_ref = instance.image_ref
  2904. LOG.info("disk not on shared storage, evacuating from "
  2905. "image: '%s'", str(orig_image_ref))
  2906. else:
  2907. LOG.info('disk on volume, evacuating using existing '
  2908. 'volume')
  2909. # We check trusted certs capabilities for both evacuate (rebuild on
  2910. # another host) and rebuild (rebuild on the same host) because for
  2911. # evacuate we need to make sure an instance with trusted certs can
  2912. # have the image verified with those certs during rebuild, and for
  2913. # rebuild we could be rebuilding a server that started out with no
  2914. # trusted certs on this host, and then was rebuilt with trusted certs
  2915. # for a new image, in which case we need to validate that new image
  2916. # with the trusted certs during the rebuild.
  2917. self._check_trusted_certs(instance)
  2918. # This instance.exists message should contain the original
  2919. # image_ref, not the new one. Since the DB has been updated
  2920. # to point to the new one... we have to override it.
  2921. orig_image_ref_url = self.image_api.generate_image_url(orig_image_ref,
  2922. context)
  2923. extra_usage_info = {'image_ref_url': orig_image_ref_url}
  2924. compute_utils.notify_usage_exists(
  2925. self.notifier, context, instance, self.host,
  2926. current_period=True, system_metadata=orig_sys_metadata,
  2927. extra_usage_info=extra_usage_info)
  2928. # This message should contain the new image_ref
  2929. extra_usage_info = {'image_name': self._get_image_name(image_meta)}
  2930. self._notify_about_instance_usage(context, instance,
  2931. "rebuild.start", extra_usage_info=extra_usage_info)
  2932. # NOTE: image_name is not included in the versioned notification
  2933. # because we already provide the image_uuid in the notification
  2934. # payload and the image details can be looked up via the uuid.
  2935. compute_utils.notify_about_instance_rebuild(
  2936. context, instance, self.host,
  2937. phase=fields.NotificationPhase.START,
  2938. bdms=bdms)
  2939. instance.power_state = self._get_power_state(context, instance)
  2940. instance.task_state = task_states.REBUILDING
  2941. instance.save(expected_task_state=[task_states.REBUILDING])
  2942. if evacuate:
  2943. self.network_api.setup_networks_on_host(
  2944. context, instance, self.host)
  2945. # For nova-network this is needed to move floating IPs
  2946. # For neutron this updates the host in the port binding
  2947. # TODO(cfriesen): this network_api call and the one above
  2948. # are so similar, we should really try to unify them.
  2949. self.network_api.setup_instance_network_on_host(
  2950. context, instance, self.host, migration)
  2951. # TODO(mriedem): Consider decorating setup_instance_network_on_host
  2952. # with @base_api.refresh_cache and then we wouldn't need this
  2953. # explicit call to get_instance_nw_info.
  2954. network_info = self.network_api.get_instance_nw_info(context,
  2955. instance)
  2956. else:
  2957. network_info = instance.get_network_info()
  2958. allocations = self.reportclient.get_allocations_for_consumer(
  2959. context, instance.uuid)
  2960. if bdms is None:
  2961. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  2962. context, instance.uuid)
  2963. block_device_info = \
  2964. self._get_instance_block_device_info(
  2965. context, instance, bdms=bdms)
  2966. def detach_block_devices(context, bdms):
  2967. for bdm in bdms:
  2968. if bdm.is_volume:
  2969. # NOTE (ildikov): Having the attachment_id set in the BDM
  2970. # means that it's the new Cinder attach/detach flow
  2971. # (available from v3.44). In that case we explicitly
  2972. # attach and detach the volumes through attachment level
  2973. # operations. In this scenario _detach_volume will delete
  2974. # the existing attachment which would make the volume
  2975. # status change to 'available' if we don't pre-create
  2976. # another empty attachment before deleting the old one.
  2977. attachment_id = None
  2978. if bdm.attachment_id:
  2979. attachment_id = self.volume_api.attachment_create(
  2980. context, bdm['volume_id'], instance.uuid)['id']
  2981. self._detach_volume(context, bdm, instance,
  2982. destroy_bdm=False)
  2983. if attachment_id:
  2984. bdm.attachment_id = attachment_id
  2985. bdm.save()
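# A minimal, standalone sketch of the reasoning in the NOTE(ildikov)
# comment above: with attachment-level operations, a volume whose last
# attachment is deleted falls back to 'available', so the replacement
# attachment is created *before* the old one is removed. The class below
# is an illustrative stand-in, not the Cinder API, and it collapses the
# real 'reserved'/'in-use' distinction into a single attached state.
def _sketch_detach_keeps_volume_reserved():
    class FakeVolume(object):
        def __init__(self):
            self.attachments = set()
            self._counter = 0

        @property
        def status(self):
            return 'attached' if self.attachments else 'available'

        def attachment_create(self):
            self._counter += 1
            self.attachments.add(self._counter)
            return self._counter

        def attachment_delete(self, attachment_id):
            self.attachments.discard(attachment_id)

    vol = FakeVolume()
    old = vol.attachment_create()       # the attachment being replaced
    vol.attachment_create()             # pre-create the empty attachment
    vol.attachment_delete(old)          # now drop the old attachment
    assert vol.status != 'available'    # the volume never went 'available'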
  2986. files = self._decode_files(injected_files)
  2987. kwargs = dict(
  2988. context=context,
  2989. instance=instance,
  2990. image_meta=image_meta,
  2991. injected_files=files,
  2992. admin_password=new_pass,
  2993. allocations=allocations,
  2994. bdms=bdms,
  2995. detach_block_devices=detach_block_devices,
  2996. attach_block_devices=self._prep_block_device,
  2997. block_device_info=block_device_info,
  2998. network_info=network_info,
  2999. preserve_ephemeral=preserve_ephemeral,
  3000. evacuate=evacuate)
  3001. try:
  3002. with instance.mutated_migration_context():
  3003. self.driver.rebuild(**kwargs)
  3004. except NotImplementedError:
  3005. # NOTE(rpodolyaka): driver doesn't provide specialized version
  3006. # of rebuild, fall back to the default implementation
  3007. self._rebuild_default_impl(**kwargs)
  3008. self._update_instance_after_spawn(context, instance)
  3009. instance.save(expected_task_state=[task_states.REBUILD_SPAWNING])
  3010. if orig_vm_state == vm_states.STOPPED:
  3011. LOG.info("bringing vm to original state: '%s'",
  3012. orig_vm_state, instance=instance)
  3013. instance.vm_state = vm_states.ACTIVE
  3014. instance.task_state = task_states.POWERING_OFF
  3015. instance.progress = 0
  3016. instance.save()
  3017. self.stop_instance(context, instance, False)
  3018. # TODO(melwitt): We should clean up instance console tokens here in the
  3019. # case of evacuate. The instance is on a new host and will need to
  3020. # establish a new console connection.
  3021. self._update_scheduler_instance_info(context, instance)
  3022. self._notify_about_instance_usage(
  3023. context, instance, "rebuild.end",
  3024. network_info=network_info,
  3025. extra_usage_info=extra_usage_info)
  3026. compute_utils.notify_about_instance_rebuild(
  3027. context, instance, self.host,
  3028. phase=fields.NotificationPhase.END,
  3029. bdms=bdms)
  3030. def _handle_bad_volumes_detached(self, context, instance, bad_devices,
  3031. block_device_info):
  3032. """Handle cases where the virt-layer had to detach non-working volumes
  3033. in order to complete an operation.
  3034. """
  3035. for bdm in block_device_info['block_device_mapping']:
  3036. if bdm.get('mount_device') in bad_devices:
  3037. try:
  3038. volume_id = bdm['connection_info']['data']['volume_id']
  3039. except KeyError:
  3040. continue
  3041. # NOTE(sirp): ideally we'd just call
  3042. # `compute_api.detach_volume` here but since that hits the
  3043. # DB directly, that's off limits from within the
  3044. # compute-manager.
  3045. #
  3046. # API-detach
  3047. LOG.info("Detaching from volume api: %s", volume_id)
  3048. self.volume_api.begin_detaching(context, volume_id)
  3049. # Manager-detach
  3050. self.detach_volume(context, volume_id, instance)
  3051. @wrap_exception()
  3052. @reverts_task_state
  3053. @wrap_instance_event(prefix='compute')
  3054. @wrap_instance_fault
  3055. def reboot_instance(self, context, instance, block_device_info,
  3056. reboot_type):
  3057. """Reboot an instance on this host."""
  3058. # acknowledge the request made it to the manager
  3059. if reboot_type == "SOFT":
  3060. instance.task_state = task_states.REBOOT_PENDING
  3061. expected_states = task_states.soft_reboot_states
  3062. else:
  3063. instance.task_state = task_states.REBOOT_PENDING_HARD
  3064. expected_states = task_states.hard_reboot_states
  3065. context = context.elevated()
  3066. LOG.info("Rebooting instance", instance=instance)
  3067. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3068. context, instance.uuid)
  3069. block_device_info = self._get_instance_block_device_info(
  3070. context, instance, bdms=bdms)
  3071. network_info = self.network_api.get_instance_nw_info(context, instance)
  3072. self._notify_about_instance_usage(context, instance, "reboot.start")
  3073. compute_utils.notify_about_instance_action(
  3074. context, instance, self.host,
  3075. action=fields.NotificationAction.REBOOT,
  3076. phase=fields.NotificationPhase.START,
  3077. bdms=bdms
  3078. )
  3079. instance.power_state = self._get_power_state(context, instance)
  3080. instance.save(expected_task_state=expected_states)
  3081. if instance.power_state != power_state.RUNNING:
  3082. state = instance.power_state
  3083. running = power_state.RUNNING
  3084. LOG.warning('trying to reboot a non-running instance:'
  3085. ' (state: %(state)s expected: %(running)s)',
  3086. {'state': state, 'running': running},
  3087. instance=instance)
  3088. def bad_volumes_callback(bad_devices):
  3089. self._handle_bad_volumes_detached(
  3090. context, instance, bad_devices, block_device_info)
  3091. try:
  3092. # Don't change it out of rescue mode
  3093. if instance.vm_state == vm_states.RESCUED:
  3094. new_vm_state = vm_states.RESCUED
  3095. else:
  3096. new_vm_state = vm_states.ACTIVE
  3097. new_power_state = None
  3098. if reboot_type == "SOFT":
  3099. instance.task_state = task_states.REBOOT_STARTED
  3100. expected_state = task_states.REBOOT_PENDING
  3101. else:
  3102. instance.task_state = task_states.REBOOT_STARTED_HARD
  3103. expected_state = task_states.REBOOT_PENDING_HARD
  3104. instance.save(expected_task_state=expected_state)
  3105. self.driver.reboot(context, instance,
  3106. network_info,
  3107. reboot_type,
  3108. block_device_info=block_device_info,
  3109. bad_volumes_callback=bad_volumes_callback)
  3110. except Exception as error:
  3111. with excutils.save_and_reraise_exception() as ctxt:
  3112. exc_info = sys.exc_info()
  3113. # if the reboot failed but the VM is running don't
  3114. # put it into an error state
  3115. new_power_state = self._get_power_state(context, instance)
  3116. if new_power_state == power_state.RUNNING:
  3117. LOG.warning('Reboot failed but instance is running',
  3118. instance=instance)
  3119. compute_utils.add_instance_fault_from_exc(context,
  3120. instance, error, exc_info)
  3121. self._notify_about_instance_usage(context, instance,
  3122. 'reboot.error', fault=error)
  3123. tb = traceback.format_exc()
  3124. compute_utils.notify_about_instance_action(
  3125. context, instance, self.host,
  3126. action=fields.NotificationAction.REBOOT,
  3127. phase=fields.NotificationPhase.ERROR,
  3128. exception=error, bdms=bdms, tb=tb
  3129. )
  3130. ctxt.reraise = False
  3131. else:
  3132. LOG.error('Cannot reboot instance: %s', error,
  3133. instance=instance)
  3134. self._set_instance_obj_error_state(context, instance)
  3135. if not new_power_state:
  3136. new_power_state = self._get_power_state(context, instance)
  3137. try:
  3138. instance.power_state = new_power_state
  3139. instance.vm_state = new_vm_state
  3140. instance.task_state = None
  3141. instance.save()
  3142. except exception.InstanceNotFound:
  3143. LOG.warning("Instance disappeared during reboot",
  3144. instance=instance)
  3145. self._notify_about_instance_usage(context, instance, "reboot.end")
  3146. compute_utils.notify_about_instance_action(
  3147. context, instance, self.host,
  3148. action=fields.NotificationAction.REBOOT,
  3149. phase=fields.NotificationPhase.END,
  3150. bdms=bdms
  3151. )
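# A small standalone sketch of the task-state bookkeeping in
# reboot_instance above: a soft reboot moves REBOOT_PENDING ->
# REBOOT_STARTED while a hard reboot uses the *_HARD variants. Plain
# strings stand in for the task_states constants; this is illustrative
# only, not a Nova API.
def _reboot_task_state_flow(reboot_type):
    # Returns (pending_state, started_state); the manager saves the
    # pending state first and then the started state, with the second
    # save expecting to transition from the pending state.
    if reboot_type == 'SOFT':
        return 'REBOOT_PENDING', 'REBOOT_STARTED'
    return 'REBOOT_PENDING_HARD', 'REBOOT_STARTED_HARD'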
  3152. @delete_image_on_error
  3153. def _do_snapshot_instance(self, context, image_id, instance):
  3154. self._snapshot_instance(context, image_id, instance,
  3155. task_states.IMAGE_BACKUP)
  3156. @wrap_exception()
  3157. @reverts_task_state
  3158. @wrap_instance_event(prefix='compute')
  3159. @wrap_instance_fault
  3160. def backup_instance(self, context, image_id, instance, backup_type,
  3161. rotation):
  3162. """Backup an instance on this host.
  3163. :param backup_type: daily | weekly
  3164. :param rotation: int representing how many backups to keep around
  3165. """
  3166. self._do_snapshot_instance(context, image_id, instance)
  3167. self._rotate_backups(context, instance, backup_type, rotation)
  3168. @wrap_exception()
  3169. @reverts_task_state
  3170. @wrap_instance_event(prefix='compute')
  3171. @wrap_instance_fault
  3172. @delete_image_on_error
  3173. def snapshot_instance(self, context, image_id, instance):
  3174. """Snapshot an instance on this host.
  3175. :param context: security context
  3176. :param image_id: glance.db.sqlalchemy.models.Image.Id
  3177. :param instance: a nova.objects.instance.Instance object
  3178. """
3179. # NOTE(dave-mcnally) the task state will already be set by the api,
3180. # but if the compute manager crashed or was restarted before the
3181. # request got here the task state may have been cleared, so we set
3182. # it again and things continue normally
  3183. try:
  3184. instance.task_state = task_states.IMAGE_SNAPSHOT
  3185. instance.save(
  3186. expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING)
  3187. except exception.InstanceNotFound:
3188. # the instance may no longer exist; no point in continuing
  3189. LOG.debug("Instance not found, could not set state %s "
  3190. "for instance.",
  3191. task_states.IMAGE_SNAPSHOT, instance=instance)
  3192. return
  3193. except exception.UnexpectedDeletingTaskStateError:
  3194. LOG.debug("Instance being deleted, snapshot cannot continue",
  3195. instance=instance)
  3196. return
  3197. self._snapshot_instance(context, image_id, instance,
  3198. task_states.IMAGE_SNAPSHOT)
  3199. def _snapshot_instance(self, context, image_id, instance,
  3200. expected_task_state):
  3201. context = context.elevated()
  3202. instance.power_state = self._get_power_state(context, instance)
  3203. try:
  3204. instance.save()
  3205. LOG.info('instance snapshotting', instance=instance)
  3206. if instance.power_state != power_state.RUNNING:
  3207. state = instance.power_state
  3208. running = power_state.RUNNING
  3209. LOG.warning('trying to snapshot a non-running instance: '
  3210. '(state: %(state)s expected: %(running)s)',
  3211. {'state': state, 'running': running},
  3212. instance=instance)
  3213. self._notify_about_instance_usage(
  3214. context, instance, "snapshot.start")
  3215. compute_utils.notify_about_instance_snapshot(context, instance,
  3216. self.host, phase=fields.NotificationPhase.START,
  3217. snapshot_image_id=image_id)
  3218. def update_task_state(task_state,
  3219. expected_state=expected_task_state):
  3220. instance.task_state = task_state
  3221. instance.save(expected_task_state=expected_state)
  3222. with timeutils.StopWatch() as timer:
  3223. self.driver.snapshot(context, instance, image_id,
  3224. update_task_state)
  3225. LOG.info('Took %0.2f seconds to snapshot the instance on '
  3226. 'the hypervisor.', timer.elapsed(), instance=instance)
  3227. instance.task_state = None
  3228. instance.save(expected_task_state=task_states.IMAGE_UPLOADING)
  3229. self._notify_about_instance_usage(context, instance,
  3230. "snapshot.end")
  3231. compute_utils.notify_about_instance_snapshot(context, instance,
  3232. self.host, phase=fields.NotificationPhase.END,
  3233. snapshot_image_id=image_id)
  3234. except (exception.InstanceNotFound,
  3235. exception.UnexpectedDeletingTaskStateError):
  3236. # the instance got deleted during the snapshot
  3237. # Quickly bail out of here
  3238. msg = 'Instance disappeared during snapshot'
  3239. LOG.debug(msg, instance=instance)
  3240. try:
  3241. image = self.image_api.get(context, image_id)
  3242. if image['status'] != 'active':
  3243. self.image_api.delete(context, image_id)
  3244. except exception.ImageNotFound:
  3245. LOG.debug('Image not found during clean up %s', image_id)
  3246. except Exception:
  3247. LOG.warning("Error while trying to clean up image %s",
  3248. image_id, instance=instance)
  3249. except exception.ImageNotFound:
  3250. instance.task_state = None
  3251. instance.save()
  3252. LOG.warning("Image not found during snapshot", instance=instance)
  3253. def _post_interrupted_snapshot_cleanup(self, context, instance):
  3254. self.driver.post_interrupted_snapshot_cleanup(context, instance)
  3255. @messaging.expected_exceptions(NotImplementedError)
  3256. @wrap_exception()
  3257. def volume_snapshot_create(self, context, instance, volume_id,
  3258. create_info):
  3259. self.driver.volume_snapshot_create(context, instance, volume_id,
  3260. create_info)
  3261. @messaging.expected_exceptions(NotImplementedError)
  3262. @wrap_exception()
  3263. def volume_snapshot_delete(self, context, instance, volume_id,
  3264. snapshot_id, delete_info):
  3265. self.driver.volume_snapshot_delete(context, instance, volume_id,
  3266. snapshot_id, delete_info)
  3267. @wrap_instance_fault
  3268. def _rotate_backups(self, context, instance, backup_type, rotation):
  3269. """Delete excess backups associated to an instance.
  3270. Instances are allowed a fixed number of backups (the rotation number);
  3271. this method deletes the oldest backups that exceed the rotation
  3272. threshold.
  3273. :param context: security context
  3274. :param instance: Instance dict
  3275. :param backup_type: a user-defined type, like "daily" or "weekly" etc.
  3276. :param rotation: int representing how many backups to keep around;
  3277. None if rotation shouldn't be used (as in the case of snapshots)
  3278. """
  3279. filters = {'property-image_type': 'backup',
  3280. 'property-backup_type': backup_type,
  3281. 'property-instance_uuid': instance.uuid}
  3282. images = self.image_api.get_all(context, filters=filters,
  3283. sort_key='created_at', sort_dir='desc')
  3284. num_images = len(images)
  3285. LOG.debug("Found %(num_images)d images (rotation: %(rotation)d)",
  3286. {'num_images': num_images, 'rotation': rotation},
  3287. instance=instance)
  3288. if num_images > rotation:
  3289. # NOTE(sirp): this deletes all backups that exceed the rotation
  3290. # limit
  3291. excess = len(images) - rotation
  3292. LOG.debug("Rotating out %d backups", excess,
  3293. instance=instance)
  3294. for i in range(excess):
  3295. image = images.pop()
  3296. image_id = image['id']
  3297. LOG.debug("Deleting image %s", image_id,
  3298. instance=instance)
  3299. try:
  3300. self.image_api.delete(context, image_id)
  3301. except exception.ImageNotFound:
  3302. LOG.info("Failed to find image %(image_id)s to "
  3303. "delete", {'image_id': image_id},
  3304. instance=instance)
  3305. except (exception.ImageDeleteConflict, Exception) as exc:
  3306. LOG.info("Failed to delete image %(image_id)s during "
  3307. "deleting excess backups. "
  3308. "Continuing for next image.. %(exc)s",
  3309. {'image_id': image_id, 'exc': exc},
  3310. instance=instance)
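# A standalone sketch of the rotation arithmetic used by _rotate_backups
# above: images are sorted newest-first, so the backups to delete are the
# last (len(images) - rotation) entries. Plain dicts stand in for Glance
# image records; this is illustrative only.
def _excess_backups(images, rotation):
    """Return the ids of the backups that fall outside the rotation window.

    :param images: list of {'id': ...} dicts sorted by created_at desc
    :param rotation: number of backups to keep
    """
    excess = len(images) - rotation
    if excess <= 0:
        return []
    # images is newest-first, so the oldest entries sit at the tail.
    return [image['id'] for image in images[-excess:]]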
  3311. @wrap_exception()
  3312. @reverts_task_state
  3313. @wrap_instance_event(prefix='compute')
  3314. @wrap_instance_fault
  3315. def set_admin_password(self, context, instance, new_pass):
  3316. """Set the root/admin password for an instance on this host.
  3317. This is generally only called by API password resets after an
  3318. image has been built.
3319. :param context: Nova auth context.
3320. :param instance: Nova instance object.
3321. :param new_pass: The admin password for the instance.
  3322. """
  3323. context = context.elevated()
  3324. if new_pass is None:
  3325. # Generate a random password
  3326. new_pass = utils.generate_password()
  3327. current_power_state = self._get_power_state(context, instance)
  3328. expected_state = power_state.RUNNING
  3329. if current_power_state != expected_state:
  3330. instance.task_state = None
  3331. instance.save(expected_task_state=task_states.UPDATING_PASSWORD)
  3332. _msg = _('instance %s is not running') % instance.uuid
  3333. raise exception.InstancePasswordSetFailed(
  3334. instance=instance.uuid, reason=_msg)
  3335. try:
  3336. self.driver.set_admin_password(instance, new_pass)
  3337. LOG.info("Admin password set", instance=instance)
  3338. instance.task_state = None
  3339. instance.save(
  3340. expected_task_state=task_states.UPDATING_PASSWORD)
  3341. except exception.InstanceAgentNotEnabled:
  3342. with excutils.save_and_reraise_exception():
  3343. LOG.debug('Guest agent is not enabled for the instance.',
  3344. instance=instance)
  3345. instance.task_state = None
  3346. instance.save(
  3347. expected_task_state=task_states.UPDATING_PASSWORD)
  3348. except exception.SetAdminPasswdNotSupported:
  3349. with excutils.save_and_reraise_exception():
  3350. LOG.info('set_admin_password is not supported '
  3351. 'by this driver or guest instance.',
  3352. instance=instance)
  3353. instance.task_state = None
  3354. instance.save(
  3355. expected_task_state=task_states.UPDATING_PASSWORD)
  3356. except NotImplementedError:
  3357. LOG.warning('set_admin_password is not implemented '
  3358. 'by this driver or guest instance.',
  3359. instance=instance)
  3360. instance.task_state = None
  3361. instance.save(
  3362. expected_task_state=task_states.UPDATING_PASSWORD)
  3363. raise NotImplementedError(_('set_admin_password is not '
  3364. 'implemented by this driver or guest '
  3365. 'instance.'))
  3366. except exception.UnexpectedTaskStateError:
  3367. # interrupted by another (most likely delete) task
  3368. # do not retry
  3369. raise
  3370. except Exception:
  3371. # Catch all here because this could be anything.
  3372. LOG.exception('set_admin_password failed', instance=instance)
  3373. # We create a new exception here so that we won't
  3374. # potentially reveal password information to the
  3375. # API caller. The real exception is logged above
  3376. _msg = _('error setting admin password')
  3377. raise exception.InstancePasswordSetFailed(
  3378. instance=instance.uuid, reason=_msg)
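# A minimal sketch of the sanitizing pattern used in the final except
# block of set_admin_password above: the real failure (which could
# contain password material) is only logged server side, while the API
# caller gets a generic error. The names below are illustrative
# stand-ins, not Nova APIs.
import logging


class _PasswordSetFailed(Exception):
    pass


def _set_password_sanitized(set_password_fn, instance_uuid, new_pass):
    try:
        set_password_fn(new_pass)
    except Exception:
        # Full details stay in the compute log only.
        logging.getLogger(__name__).exception(
            'set_admin_password failed for instance %s', instance_uuid)
        # The caller never sees anything password related.
        raise _PasswordSetFailed('error setting admin password')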
  3379. @wrap_exception()
  3380. @reverts_task_state
  3381. @wrap_instance_fault
  3382. def inject_file(self, context, path, file_contents, instance):
  3383. """Write a file to the specified path in an instance on this host."""
  3384. # NOTE(russellb) Remove this method, as well as the underlying virt
  3385. # driver methods, when the compute rpc interface is bumped to 4.x
  3386. # as it is no longer used.
  3387. context = context.elevated()
  3388. current_power_state = self._get_power_state(context, instance)
  3389. expected_state = power_state.RUNNING
  3390. if current_power_state != expected_state:
3391. LOG.warning('trying to inject a file into a non-running '
3392. 'instance (state: %(current_state)s expected: '
3393. '%(expected_state)s)',
  3394. {'current_state': current_power_state,
  3395. 'expected_state': expected_state},
  3396. instance=instance)
  3397. LOG.info('injecting file to %s', path, instance=instance)
  3398. self.driver.inject_file(instance, path, file_contents)
  3399. def _get_rescue_image(self, context, instance, rescue_image_ref=None):
  3400. """Determine what image should be used to boot the rescue VM."""
  3401. # 1. If rescue_image_ref is passed in, use that for rescue.
  3402. # 2. Else, use the base image associated with instance's current image.
  3403. # The idea here is to provide the customer with a rescue
  3404. # environment which they are familiar with.
  3405. # So, if they built their instance off of a Debian image,
  3406. # their rescue VM will also be Debian.
  3407. # 3. As a last resort, use instance's current image.
  3408. if not rescue_image_ref:
  3409. system_meta = utils.instance_sys_meta(instance)
  3410. rescue_image_ref = system_meta.get('image_base_image_ref')
  3411. if not rescue_image_ref:
  3412. LOG.warning('Unable to find a different image to use for '
  3413. 'rescue VM, using instance\'s current image',
  3414. instance=instance)
  3415. rescue_image_ref = instance.image_ref
  3416. return objects.ImageMeta.from_image_ref(
  3417. context, self.image_api, rescue_image_ref)
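# A standalone sketch of the rescue-image precedence described in the
# comments above: an explicitly passed rescue image ref wins, then the
# base image stashed in system metadata, then the instance's own image.
# The argument names are illustrative plain values, not Nova objects.
def _pick_rescue_image_ref(rescue_image_ref, system_meta,
                           instance_image_ref):
    if rescue_image_ref:
        return rescue_image_ref
    base_image_ref = system_meta.get('image_base_image_ref')
    if base_image_ref:
        return base_image_ref
    return instance_image_ref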
  3418. @wrap_exception()
  3419. @reverts_task_state
  3420. @wrap_instance_event(prefix='compute')
  3421. @wrap_instance_fault
  3422. def rescue_instance(self, context, instance, rescue_password,
  3423. rescue_image_ref, clean_shutdown):
  3424. context = context.elevated()
  3425. LOG.info('Rescuing', instance=instance)
  3426. admin_password = (rescue_password if rescue_password else
  3427. utils.generate_password())
  3428. network_info = self.network_api.get_instance_nw_info(context, instance)
  3429. rescue_image_meta = self._get_rescue_image(context, instance,
  3430. rescue_image_ref)
  3431. extra_usage_info = {'rescue_image_name':
  3432. self._get_image_name(rescue_image_meta)}
  3433. self._notify_about_instance_usage(context, instance,
  3434. "rescue.start", extra_usage_info=extra_usage_info,
  3435. network_info=network_info)
  3436. compute_utils.notify_about_instance_rescue_action(
  3437. context, instance, self.host, rescue_image_ref,
  3438. phase=fields.NotificationPhase.START)
  3439. try:
  3440. self._power_off_instance(context, instance, clean_shutdown)
  3441. self.driver.rescue(context, instance,
  3442. network_info,
  3443. rescue_image_meta, admin_password)
  3444. except Exception as e:
  3445. LOG.exception("Error trying to Rescue Instance",
  3446. instance=instance)
  3447. self._set_instance_obj_error_state(context, instance)
  3448. raise exception.InstanceNotRescuable(
  3449. instance_id=instance.uuid,
  3450. reason=_("Driver Error: %s") % e)
  3451. compute_utils.notify_usage_exists(self.notifier, context, instance,
  3452. self.host, current_period=True)
  3453. instance.vm_state = vm_states.RESCUED
  3454. instance.task_state = None
  3455. instance.power_state = self._get_power_state(context, instance)
  3456. instance.launched_at = timeutils.utcnow()
  3457. instance.save(expected_task_state=task_states.RESCUING)
  3458. self._notify_about_instance_usage(context, instance,
  3459. "rescue.end", extra_usage_info=extra_usage_info,
  3460. network_info=network_info)
  3461. compute_utils.notify_about_instance_rescue_action(
  3462. context, instance, self.host, rescue_image_ref,
  3463. phase=fields.NotificationPhase.END)
  3464. @wrap_exception()
  3465. @reverts_task_state
  3466. @wrap_instance_event(prefix='compute')
  3467. @wrap_instance_fault
  3468. def unrescue_instance(self, context, instance):
  3469. context = context.elevated()
  3470. LOG.info('Unrescuing', instance=instance)
  3471. network_info = self.network_api.get_instance_nw_info(context, instance)
  3472. self._notify_about_instance_usage(context, instance,
  3473. "unrescue.start", network_info=network_info)
  3474. compute_utils.notify_about_instance_action(context, instance,
  3475. self.host, action=fields.NotificationAction.UNRESCUE,
  3476. phase=fields.NotificationPhase.START)
  3477. with self._error_out_instance_on_exception(context, instance):
  3478. self.driver.unrescue(instance,
  3479. network_info)
  3480. instance.vm_state = vm_states.ACTIVE
  3481. instance.task_state = None
  3482. instance.power_state = self._get_power_state(context, instance)
  3483. instance.save(expected_task_state=task_states.UNRESCUING)
  3484. self._notify_about_instance_usage(context,
  3485. instance,
  3486. "unrescue.end",
  3487. network_info=network_info)
  3488. compute_utils.notify_about_instance_action(context, instance,
  3489. self.host, action=fields.NotificationAction.UNRESCUE,
  3490. phase=fields.NotificationPhase.END)
  3491. @wrap_exception()
  3492. @wrap_instance_fault
  3493. def change_instance_metadata(self, context, diff, instance):
  3494. """Update the metadata published to the instance."""
  3495. LOG.debug("Changing instance metadata according to %r",
  3496. diff, instance=instance)
  3497. self.driver.change_instance_metadata(context, instance, diff)
  3498. @wrap_exception()
  3499. @wrap_instance_event(prefix='compute')
  3500. @wrap_instance_fault
  3501. def confirm_resize(self, context, instance, migration):
  3502. """Confirms a migration/resize and deletes the 'old' instance.
  3503. This is called from the API and runs on the source host.
  3504. Nothing needs to happen on the destination host at this point since
  3505. the instance is already running there. This routine just cleans up the
  3506. source host.
  3507. """
  3508. @utils.synchronized(instance.uuid)
  3509. def do_confirm_resize(context, instance, migration_id):
  3510. # NOTE(wangpan): Get the migration status from db, if it has been
  3511. # confirmed, we do nothing and return here
  3512. LOG.debug("Going to confirm migration %s", migration_id,
  3513. instance=instance)
  3514. try:
  3515. # TODO(russellb) Why are we sending the migration object just
  3516. # to turn around and look it up from the db again?
  3517. migration = objects.Migration.get_by_id(
  3518. context.elevated(), migration_id)
  3519. except exception.MigrationNotFound:
  3520. LOG.error("Migration %s is not found during confirmation",
  3521. migration_id, instance=instance)
  3522. return
  3523. if migration.status == 'confirmed':
  3524. LOG.info("Migration %s is already confirmed",
  3525. migration_id, instance=instance)
  3526. return
  3527. elif migration.status not in ('finished', 'confirming'):
  3528. LOG.warning("Unexpected confirmation status '%(status)s' "
  3529. "of migration %(id)s, exit confirmation process",
  3530. {"status": migration.status, "id": migration_id},
  3531. instance=instance)
  3532. return
  3533. # NOTE(wangpan): Get the instance from db, if it has been
  3534. # deleted, we do nothing and return here
  3535. expected_attrs = ['metadata', 'system_metadata', 'flavor']
  3536. try:
  3537. instance = objects.Instance.get_by_uuid(
  3538. context, instance.uuid,
  3539. expected_attrs=expected_attrs)
  3540. except exception.InstanceNotFound:
  3541. LOG.info("Instance is not found during confirmation",
  3542. instance=instance)
  3543. return
  3544. self._confirm_resize(context, instance, migration=migration)
  3545. do_confirm_resize(context, instance, migration.id)
  3546. def _confirm_resize(self, context, instance, migration=None):
  3547. """Destroys the source instance."""
  3548. self._notify_about_instance_usage(context, instance,
  3549. "resize.confirm.start")
  3550. compute_utils.notify_about_instance_action(context, instance,
  3551. self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
  3552. phase=fields.NotificationPhase.START)
  3553. with self._error_out_instance_on_exception(context, instance):
  3554. # NOTE(danms): delete stashed migration information
  3555. old_instance_type = instance.old_flavor
  3556. instance.old_flavor = None
  3557. instance.new_flavor = None
  3558. instance.system_metadata.pop('old_vm_state', None)
  3559. instance.save()
  3560. # NOTE(tr3buchet): tear down networks on source host
  3561. self.network_api.setup_networks_on_host(context, instance,
  3562. migration.source_compute, teardown=True)
  3563. network_info = self.network_api.get_instance_nw_info(context,
  3564. instance)
  3565. # TODO(mriedem): Get BDMs here and pass them to the driver.
  3566. self.driver.confirm_migration(context, migration, instance,
  3567. network_info)
  3568. migration.status = 'confirmed'
  3569. with migration.obj_as_admin():
  3570. migration.save()
  3571. self.rt.drop_move_claim(context, instance, migration.source_node,
  3572. old_instance_type, prefix='old_')
  3573. self._delete_allocation_after_move(context, instance, migration)
  3574. instance.drop_migration_context()
  3575. # NOTE(mriedem): The old_vm_state could be STOPPED but the user
  3576. # might have manually powered up the instance to confirm the
  3577. # resize/migrate, so we need to check the current power state
  3578. # on the instance and set the vm_state appropriately. We default
  3579. # to ACTIVE because if the power state is not SHUTDOWN, we
  3580. # assume _sync_instance_power_state will clean it up.
  3581. p_state = instance.power_state
  3582. vm_state = None
  3583. if p_state == power_state.SHUTDOWN:
  3584. vm_state = vm_states.STOPPED
  3585. LOG.debug("Resized/migrated instance is powered off. "
  3586. "Setting vm_state to '%s'.", vm_state,
  3587. instance=instance)
  3588. else:
  3589. vm_state = vm_states.ACTIVE
  3590. instance.vm_state = vm_state
  3591. instance.task_state = None
  3592. instance.save(expected_task_state=[None, task_states.DELETING,
  3593. task_states.SOFT_DELETING])
  3594. self._notify_about_instance_usage(
  3595. context, instance, "resize.confirm.end",
  3596. network_info=network_info)
  3597. compute_utils.notify_about_instance_action(context, instance,
  3598. self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
  3599. phase=fields.NotificationPhase.END)
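# A minimal sketch of the vm_state decision at the end of _confirm_resize
# above: a guest found powered off is recorded as STOPPED, anything else
# defaults to ACTIVE and is left for _sync_instance_power_state to
# reconcile. Plain strings stand in for the power_state and vm_states
# constants; this is illustrative only.
def _vm_state_after_confirm(power_state_after_confirm):
    if power_state_after_confirm == 'SHUTDOWN':
        return 'STOPPED'
    return 'ACTIVE'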
  3600. def _delete_allocation_after_move(self, context, instance, migration):
  3601. """Deletes resource allocations held by the migration record against
  3602. the source compute node resource provider after a confirmed cold /
  3603. successful live migration.
  3604. """
  3605. try:
  3606. # NOTE(danms): We're finishing on the source node, so try
  3607. # to delete the allocation based on the migration uuid
  3608. self.reportclient.delete_allocation_for_instance(
  3609. context, migration.uuid, consumer_type='migration')
  3610. except exception.AllocationDeleteFailed:
  3611. LOG.error('Deleting allocation in placement for migration '
  3612. '%(migration_uuid)s failed. The instance '
  3613. '%(instance_uuid)s will be put to ERROR state '
  3614. 'but the allocation held by the migration is '
  3615. 'leaked.',
  3616. {'instance_uuid': instance.uuid,
  3617. 'migration_uuid': migration.uuid})
  3618. raise
  3619. @wrap_exception()
  3620. @reverts_task_state
  3621. @wrap_instance_event(prefix='compute')
  3622. @errors_out_migration
  3623. @wrap_instance_fault
  3624. def revert_resize(self, context, instance, migration):
  3625. """Destroys the new instance on the destination machine.
  3626. Reverts the model changes, and powers on the old instance on the
  3627. source machine.
  3628. """
  3629. # NOTE(comstud): A revert_resize is essentially a resize back to
  3630. # the old size, so we need to send a usage event here.
  3631. compute_utils.notify_usage_exists(self.notifier, context, instance,
  3632. self.host, current_period=True)
  3633. with self._error_out_instance_on_exception(context, instance):
  3634. # NOTE(tr3buchet): tear down networks on destination host
  3635. self.network_api.setup_networks_on_host(context, instance,
  3636. teardown=True)
  3637. migration_p = obj_base.obj_to_primitive(migration)
  3638. self.network_api.migrate_instance_start(context,
  3639. instance,
  3640. migration_p)
  3641. network_info = self.network_api.get_instance_nw_info(context,
  3642. instance)
  3643. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3644. context, instance.uuid)
  3645. block_device_info = self._get_instance_block_device_info(
  3646. context, instance, bdms=bdms)
  3647. destroy_disks = not self._is_instance_storage_shared(
  3648. context, instance, host=migration.source_compute)
  3649. self.driver.destroy(context, instance, network_info,
  3650. block_device_info, destroy_disks)
  3651. self._terminate_volume_connections(context, instance, bdms)
  3652. migration.status = 'reverted'
  3653. with migration.obj_as_admin():
  3654. migration.save()
  3655. # NOTE(ndipanov): We need to do this here because dropping the
  3656. # claim means we lose the migration_context data. We really should
  3657. # fix this by moving the drop_move_claim call to the
  3658. # finish_revert_resize method as this is racy (revert is dropped,
  3659. # but instance resources will be tracked with the new flavor until
  3660. # it gets rolled back in finish_revert_resize, which is
  3661. # potentially wrong for a period of time).
  3662. instance.revert_migration_context()
  3663. instance.save()
  3664. self.rt.drop_move_claim(context, instance, instance.node)
  3665. # RPC cast back to the source host to finish the revert there.
  3666. self.compute_rpcapi.finish_revert_resize(context, instance,
  3667. migration, migration.source_compute)
  3668. @wrap_exception()
  3669. @reverts_task_state
  3670. @wrap_instance_event(prefix='compute')
  3671. @errors_out_migration
  3672. @wrap_instance_fault
  3673. def finish_revert_resize(self, context, instance, migration):
  3674. """Finishes the second half of reverting a resize on the source host.
  3675. Bring the original source instance state back (active/shutoff) and
  3676. revert the resized attributes in the database.
  3677. """
  3678. with self._error_out_instance_on_exception(context, instance):
  3679. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3680. context, instance.uuid)
  3681. self._notify_about_instance_usage(
  3682. context, instance, "resize.revert.start")
  3683. compute_utils.notify_about_instance_action(context, instance,
  3684. self.host, action=fields.NotificationAction.RESIZE_REVERT,
  3685. phase=fields.NotificationPhase.START, bdms=bdms)
  3686. # NOTE(mriedem): delete stashed old_vm_state information; we
  3687. # default to ACTIVE for backwards compatibility if old_vm_state
  3688. # is not set
  3689. old_vm_state = instance.system_metadata.pop('old_vm_state',
  3690. vm_states.ACTIVE)
  3691. self._set_instance_info(instance, instance.old_flavor)
  3692. instance.old_flavor = None
  3693. instance.new_flavor = None
  3694. instance.host = migration.source_compute
  3695. instance.node = migration.source_node
  3696. instance.save()
  3697. try:
  3698. self._revert_allocation(context, instance, migration)
  3699. except exception.AllocationMoveFailed:
  3700. LOG.error('Reverting allocation in placement for migration '
  3701. '%(migration_uuid)s failed. The instance '
  3702. '%(instance_uuid)s will be put into ERROR state but '
  3703. 'the allocation held by the migration is leaked.',
  3704. {'instance_uuid': instance.uuid,
  3705. 'migration_uuid': migration.uuid})
  3706. raise
  3707. self.network_api.setup_networks_on_host(context, instance,
  3708. migration.source_compute)
  3709. migration_p = obj_base.obj_to_primitive(migration)
3710. # NOTE(hanrong): we need to change migration_p['dest_compute'] to
3711. # the source host temporarily. "network_api.migrate_instance_finish"
3712. # sets up the network for the instance on the destination host. For
3713. # a revert resize the instance goes back to the source host, so the
3714. # network setup for the instance must happen on the source host.
3715. # Therefore set migration_p['dest_compute'] to the source host here.
  3716. migration_p['dest_compute'] = migration.source_compute
  3717. self.network_api.migrate_instance_finish(context,
  3718. instance,
  3719. migration_p)
  3720. network_info = self.network_api.get_instance_nw_info(context,
  3721. instance)
  3722. # revert_resize deleted any volume attachments for the instance
  3723. # and created new ones to be used on this host, but we
  3724. # have to update those attachments with the host connector so the
  3725. # BDM.connection_info will get set in the call to
  3726. # _get_instance_block_device_info below with refresh_conn_info=True
  3727. # and then the volumes can be re-connected via the driver on this
  3728. # host.
  3729. self._update_volume_attachments(context, instance, bdms)
  3730. block_device_info = self._get_instance_block_device_info(
  3731. context, instance, refresh_conn_info=True, bdms=bdms)
  3732. power_on = old_vm_state != vm_states.STOPPED
  3733. self.driver.finish_revert_migration(context, instance,
  3734. network_info,
  3735. block_device_info, power_on)
  3736. instance.drop_migration_context()
  3737. instance.launched_at = timeutils.utcnow()
  3738. instance.save(expected_task_state=task_states.RESIZE_REVERTING)
  3739. # Complete any volume attachments so the volumes are in-use.
  3740. self._complete_volume_attachments(context, bdms)
  3741. # if the original vm state was STOPPED, set it back to STOPPED
  3742. LOG.info("Updating instance to original state: '%s'",
  3743. old_vm_state, instance=instance)
  3744. if power_on:
  3745. instance.vm_state = vm_states.ACTIVE
  3746. instance.task_state = None
  3747. instance.save()
  3748. else:
  3749. instance.task_state = task_states.POWERING_OFF
  3750. instance.save()
  3751. self.stop_instance(context, instance=instance,
  3752. clean_shutdown=True)
  3753. self._notify_about_instance_usage(
  3754. context, instance, "resize.revert.end")
  3755. compute_utils.notify_about_instance_action(context, instance,
  3756. self.host, action=fields.NotificationAction.RESIZE_REVERT,
  3757. phase=fields.NotificationPhase.END, bdms=bdms)
  3758. def _revert_allocation(self, context, instance, migration):
  3759. """Revert an allocation that is held by migration to our instance."""
  3760. # Fetch the original allocation that the instance had on the source
  3761. # node, which are now held by the migration
  3762. orig_alloc = self.reportclient.get_allocations_for_consumer(
  3763. context, migration.uuid)
  3764. if not orig_alloc:
  3765. LOG.error('Did not find resource allocations for migration '
  3766. '%s on source node %s. Unable to revert source node '
  3767. 'allocations back to the instance.',
  3768. migration.uuid, migration.source_node, instance=instance)
  3769. return False
  3770. if len(orig_alloc) > 1:
  3771. # NOTE(danms): This may change later if we have other allocations
  3772. # against other providers that need to be held by the migration
  3773. # as well. Perhaps something like shared storage resources that
  3774. # will actually be duplicated during a resize type operation.
  3775. LOG.error('Migration %(mig)s has allocations against '
  3776. 'more than one provider %(rps)s. This should not be '
  3777. 'possible, but reverting it anyway.',
  3778. {'mig': migration.uuid,
  3779. 'rps': ','.join(orig_alloc.keys())},
  3780. instance=instance)
  3781. # We only have a claim against one provider, it is the source node
  3782. cn_uuid = list(orig_alloc.keys())[0]
3783. # FIXME(danms): This method is flawed in that it assumes allocations
3784. # against only one provider. So, this may overwrite allocations against
  3785. # a shared provider, if we had one.
  3786. LOG.info('Swapping old allocation on %(node)s held by migration '
  3787. '%(mig)s for instance',
  3788. {'node': cn_uuid, 'mig': migration.uuid},
  3789. instance=instance)
  3790. # TODO(cdent): Should we be doing anything with return values here?
  3791. self.reportclient.move_allocations(context, migration.uuid,
  3792. instance.uuid)
  3793. return True
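# A standalone sketch of what "swapping" the allocation back means: during
# the resize the source-node usage is held in placement under the
# migration uuid, and reverting moves those same resources back under the
# instance uuid. Plain dicts stand in for placement allocations; this is
# not the real report client API.
def _move_allocations(allocations, from_consumer, to_consumer):
    """Reassign all resources held by from_consumer to to_consumer.

    :param allocations: {consumer_uuid: {rp_uuid: {resource_class: amount}}}
    """
    held = allocations.pop(from_consumer, {})
    target = allocations.setdefault(to_consumer, {})
    for rp_uuid, resources in held.items():
        # Like the FIXME above notes, resources for a provider already
        # present on the target consumer are overwritten, not merged.
        target.setdefault(rp_uuid, {}).update(resources)
    return allocations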
  3794. def _prep_resize(self, context, image, instance, instance_type,
  3795. filter_properties, node, migration, clean_shutdown=True):
  3796. if not filter_properties:
  3797. filter_properties = {}
  3798. if not instance.host:
  3799. self._set_instance_obj_error_state(context, instance)
  3800. msg = _('Instance has no source host')
  3801. raise exception.MigrationError(reason=msg)
  3802. same_host = instance.host == self.host
  3803. # if the flavor IDs match, it's migrate; otherwise resize
  3804. if same_host and instance_type.id == instance['instance_type_id']:
3805. # check whether the driver supports migrating to the same host
  3806. if not self.driver.capabilities.get(
  3807. 'supports_migrate_to_same_host', False):
  3808. raise exception.UnableToMigrateToSelf(
  3809. instance_id=instance.uuid, host=self.host)
  3810. # NOTE(danms): Stash the new instance_type to avoid having to
  3811. # look it up in the database later
  3812. instance.new_flavor = instance_type
  3813. # NOTE(mriedem): Stash the old vm_state so we can set the
  3814. # resized/reverted instance back to the same state later.
  3815. vm_state = instance.vm_state
  3816. LOG.debug('Stashing vm_state: %s', vm_state, instance=instance)
  3817. instance.system_metadata['old_vm_state'] = vm_state
  3818. instance.save()
  3819. limits = filter_properties.get('limits', {})
  3820. with self.rt.resize_claim(context, instance, instance_type, node,
  3821. migration, image_meta=image,
  3822. limits=limits) as claim:
  3823. LOG.info('Migrating', instance=instance)
  3824. # RPC cast to the source host to start the actual resize/migration.
  3825. self.compute_rpcapi.resize_instance(
  3826. context, instance, claim.migration, image,
  3827. instance_type, clean_shutdown)
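# A small sketch of the checks at the top of _prep_resize above: matching
# flavor IDs mean this is a migration rather than a resize, and a
# migration that targets the same host is only allowed when the driver
# advertises 'supports_migrate_to_same_host'. Plain values stand in for
# Nova objects, and ValueError stands in for UnableToMigrateToSelf.
def _check_prep_resize(same_host, old_flavor_id, new_flavor_id,
                       supports_migrate_to_same_host):
    is_migration = old_flavor_id == new_flavor_id
    if same_host and is_migration and not supports_migrate_to_same_host:
        raise ValueError('unable to migrate instance to the same host')
    return 'migrate' if is_migration else 'resize'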
  3828. def _send_prep_resize_notifications(
  3829. self, context, instance, phase, flavor):
  3830. """Send "resize.prep.*" notifications.
  3831. :param context: nova auth request context
  3832. :param instance: The instance being resized
  3833. :param phase: The phase of the action (NotificationPhase enum)
  3834. :param flavor: The (new) flavor for the resize (same as existing
  3835. instance.flavor for a cold migration)
  3836. """
  3837. # Only send notify_usage_exists if it's the "start" phase.
  3838. if phase == fields.NotificationPhase.START:
  3839. compute_utils.notify_usage_exists(
  3840. self.notifier, context, instance, self.host,
  3841. current_period=True)
  3842. # Send extra usage info about the flavor if it's the "end" phase for
  3843. # the legacy unversioned notification.
  3844. extra_usage_info = None
  3845. if phase == fields.NotificationPhase.END:
  3846. extra_usage_info = dict(
  3847. new_instance_type=flavor.name,
  3848. new_instance_type_id=flavor.id)
  3849. self._notify_about_instance_usage(
  3850. context, instance, "resize.prep.%s" % phase,
  3851. extra_usage_info=extra_usage_info)
  3852. # Send the versioned notification.
  3853. compute_utils.notify_about_resize_prep_instance(
  3854. context, instance, self.host, phase, flavor)
  3855. @wrap_exception()
  3856. @reverts_task_state
  3857. @wrap_instance_event(prefix='compute')
  3858. @wrap_instance_fault
  3859. def prep_resize(self, context, image, instance, instance_type,
  3860. request_spec, filter_properties, node,
  3861. clean_shutdown, migration, host_list):
  3862. """Initiates the process of moving a running instance to another host.
  3863. Possibly changes the VCPU, RAM and disk size in the process.
  3864. This is initiated from conductor and runs on the destination host.
  3865. The main purpose of this method is performing some checks on the
  3866. destination host and making a claim for resources. If the claim fails
  3867. then a reschedule to another host may be attempted which involves
  3868. calling back to conductor to start the process over again.
  3869. """
  3870. if node is None:
  3871. node = self._get_nodename(instance, refresh=True)
  3872. with self._error_out_instance_on_exception(context, instance), \
  3873. errors_out_migration_ctxt(migration):
  3874. self._send_prep_resize_notifications(
  3875. context, instance, fields.NotificationPhase.START,
  3876. instance_type)
  3877. try:
  3878. self._prep_resize(context, image, instance,
  3879. instance_type, filter_properties,
  3880. node, migration, clean_shutdown)
  3881. except Exception:
  3882. # Since we hit a failure, we're either rescheduling or dead
  3883. # and either way we need to cleanup any allocations created
  3884. # by the scheduler for the destination node.
  3885. self._revert_allocation(context, instance, migration)
  3886. # try to re-schedule the resize elsewhere:
  3887. exc_info = sys.exc_info()
  3888. self._reschedule_resize_or_reraise(context, instance,
  3889. exc_info, instance_type, request_spec,
  3890. filter_properties, host_list)
  3891. finally:
  3892. self._send_prep_resize_notifications(
  3893. context, instance, fields.NotificationPhase.END,
  3894. instance_type)
  3895. def _reschedule_resize_or_reraise(self, context, instance, exc_info,
  3896. instance_type, request_spec, filter_properties, host_list):
  3897. """Try to re-schedule the resize or re-raise the original error to
  3898. error out the instance.
  3899. """
  3900. if not filter_properties:
  3901. filter_properties = {}
  3902. rescheduled = False
  3903. instance_uuid = instance.uuid
  3904. try:
  3905. reschedule_method = self.compute_task_api.resize_instance
  3906. scheduler_hint = dict(filter_properties=filter_properties)
  3907. method_args = (instance, None, scheduler_hint, instance_type)
  3908. task_state = task_states.RESIZE_PREP
  3909. rescheduled = self._reschedule(context, request_spec,
  3910. filter_properties, instance, reschedule_method,
  3911. method_args, task_state, exc_info, host_list=host_list)
  3912. except Exception as error:
  3913. rescheduled = False
  3914. LOG.exception("Error trying to reschedule",
  3915. instance_uuid=instance_uuid)
  3916. compute_utils.add_instance_fault_from_exc(context,
  3917. instance, error,
  3918. exc_info=sys.exc_info())
  3919. self._notify_about_instance_usage(context, instance,
  3920. 'resize.error', fault=error)
  3921. compute_utils.notify_about_instance_action(
  3922. context, instance, self.host,
  3923. action=fields.NotificationAction.RESIZE,
  3924. phase=fields.NotificationPhase.ERROR,
  3925. exception=error,
  3926. tb=','.join(traceback.format_exception(*exc_info)))
  3927. if rescheduled:
  3928. self._log_original_error(exc_info, instance_uuid)
  3929. compute_utils.add_instance_fault_from_exc(context,
  3930. instance, exc_info[1], exc_info=exc_info)
  3931. self._notify_about_instance_usage(context, instance,
  3932. 'resize.error', fault=exc_info[1])
  3933. compute_utils.notify_about_instance_action(
  3934. context, instance, self.host,
  3935. action=fields.NotificationAction.RESIZE,
  3936. phase=fields.NotificationPhase.ERROR,
  3937. exception=exc_info[1],
  3938. tb=','.join(traceback.format_exception(*exc_info)))
  3939. else:
  3940. # not re-scheduling
  3941. six.reraise(*exc_info)
  3942. @wrap_exception()
  3943. @reverts_task_state
  3944. @wrap_instance_event(prefix='compute')
  3945. @wrap_instance_fault
  3946. def resize_instance(self, context, instance, image,
  3947. migration, instance_type, clean_shutdown):
  3948. """Starts the migration of a running instance to another host.
  3949. This is initiated from the destination host's ``prep_resize`` routine
  3950. and runs on the source host.
  3951. """
  3952. try:
  3953. self._resize_instance(context, instance, image, migration,
  3954. instance_type, clean_shutdown)
  3955. except Exception:
  3956. with excutils.save_and_reraise_exception():
  3957. self._revert_allocation(context, instance, migration)
  3958. def _resize_instance(self, context, instance, image,
  3959. migration, instance_type, clean_shutdown):
  3960. with self._error_out_instance_on_exception(context, instance), \
  3961. errors_out_migration_ctxt(migration):
  3962. network_info = self.network_api.get_instance_nw_info(context,
  3963. instance)
  3964. migration.status = 'migrating'
  3965. with migration.obj_as_admin():
  3966. migration.save()
  3967. instance.task_state = task_states.RESIZE_MIGRATING
  3968. instance.save(expected_task_state=task_states.RESIZE_PREP)
  3969. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  3970. context, instance.uuid)
  3971. self._send_resize_instance_notifications(
  3972. context, instance, bdms, network_info,
  3973. fields.NotificationPhase.START)
  3974. block_device_info = self._get_instance_block_device_info(
  3975. context, instance, bdms=bdms)
  3976. timeout, retry_interval = self._get_power_off_values(context,
  3977. instance, clean_shutdown)
  3978. disk_info = self.driver.migrate_disk_and_power_off(
  3979. context, instance, migration.dest_host,
  3980. instance_type, network_info,
  3981. block_device_info,
  3982. timeout, retry_interval)
  3983. self._terminate_volume_connections(context, instance, bdms)
  3984. migration_p = obj_base.obj_to_primitive(migration)
  3985. self.network_api.migrate_instance_start(context,
  3986. instance,
  3987. migration_p)
  3988. migration.status = 'post-migrating'
  3989. with migration.obj_as_admin():
  3990. migration.save()
  3991. instance.host = migration.dest_compute
  3992. instance.node = migration.dest_node
  3993. instance.task_state = task_states.RESIZE_MIGRATED
  3994. instance.save(expected_task_state=task_states.RESIZE_MIGRATING)
  3995. # RPC cast to the destination host to finish the resize/migration.
  3996. self.compute_rpcapi.finish_resize(context, instance,
  3997. migration, image, disk_info, migration.dest_compute)
  3998. self._send_resize_instance_notifications(
  3999. context, instance, bdms, network_info,
  4000. fields.NotificationPhase.END)
  4001. self.instance_events.clear_events_for_instance(instance)
  4002. def _send_resize_instance_notifications(
  4003. self, context, instance, bdms, network_info, phase):
  4004. """Send "resize.(start|end)" notifications.
  4005. :param context: nova auth request context
  4006. :param instance: The instance being resized
  4007. :param bdms: BlockDeviceMappingList for the BDMs associated with the
  4008. instance
  4009. :param network_info: NetworkInfo for the instance info cache of ports
  4010. :param phase: The phase of the action (NotificationPhase enum, either
  4011. ``start`` or ``end``)
  4012. """
  4013. action = fields.NotificationAction.RESIZE
  4014. # Send the legacy unversioned notification.
  4015. self._notify_about_instance_usage(
  4016. context, instance, "%s.%s" % (action, phase),
  4017. network_info=network_info)
  4018. # Send the versioned notification.
  4019. compute_utils.notify_about_instance_action(
  4020. context, instance, self.host, action=action, phase=phase,
  4021. bdms=bdms)
  4022. def _terminate_volume_connections(self, context, instance, bdms):
  4023. connector = None
  4024. for bdm in bdms:
  4025. if bdm.is_volume:
  4026. if bdm.attachment_id:
4027. # NOTE(jdg): The new attach APIs were meant to be a separate
4028. # code fork/path, but since we are not following that path
4029. # here we have to do some extra work to make them *behave*
4030. # just like the old code. Cinder doesn't allow an attachment
4031. # to be disconnected and reconnected in the new attach code;
4032. # you just delete the attachment and get a new one. So we
4033. # create a replacement attachment without a connector (a
4034. # reserve) first and then delete the old attachment. In
4035. # other words, beware.
  4036. attachment_id = self.volume_api.attachment_create(
  4037. context, bdm.volume_id, instance.uuid)['id']
  4038. self.volume_api.attachment_delete(context,
  4039. bdm.attachment_id)
  4040. bdm.attachment_id = attachment_id
  4041. bdm.save()
  4042. else:
  4043. if connector is None:
  4044. connector = self.driver.get_volume_connector(instance)
  4045. self.volume_api.terminate_connection(context,
  4046. bdm.volume_id,
  4047. connector)
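# A standalone sketch of the branch in _terminate_volume_connections
# above: volumes using the newer attachment-based flow get a fresh,
# connector-less attachment (a reserve) before the old attachment is
# deleted, while legacy volumes simply have their connection terminated
# with the host connector. The callables and dict keys here are
# illustrative stand-ins, not Nova or Cinder APIs.
def _disconnect_volume_for_migration(bdm, create_attachment,
                                     delete_attachment,
                                     terminate_connection):
    if bdm.get('attachment_id'):
        # New-style flow: reserve first, then drop the old attachment.
        new_attachment_id = create_attachment(bdm['volume_id'])
        delete_attachment(bdm['attachment_id'])
        return new_attachment_id
    # Old-style flow: tear down the connection using the host connector.
    terminate_connection(bdm['volume_id'])
    return None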
  4048. @staticmethod
  4049. def _set_instance_info(instance, instance_type):
  4050. instance.instance_type_id = instance_type.id
  4051. instance.memory_mb = instance_type.memory_mb
  4052. instance.vcpus = instance_type.vcpus
  4053. instance.root_gb = instance_type.root_gb
  4054. instance.ephemeral_gb = instance_type.ephemeral_gb
  4055. instance.flavor = instance_type
  4056. def _update_volume_attachments(self, context, instance, bdms):
  4057. """Updates volume attachments using the virt driver host connector.
  4058. :param context: nova.context.RequestContext - user request context
  4059. :param instance: nova.objects.Instance
  4060. :param bdms: nova.objects.BlockDeviceMappingList - the list of block
  4061. device mappings for the given instance
  4062. """
  4063. if bdms:
  4064. connector = None
  4065. for bdm in bdms:
  4066. if bdm.is_volume and bdm.attachment_id:
  4067. if connector is None:
  4068. connector = self.driver.get_volume_connector(instance)
  4069. self.volume_api.attachment_update(
  4070. context, bdm.attachment_id, connector, bdm.device_name)
  4071. def _complete_volume_attachments(self, context, bdms):
  4072. """Completes volume attachments for the instance
  4073. :param context: nova.context.RequestContext - user request context
  4074. :param bdms: nova.objects.BlockDeviceMappingList - the list of block
  4075. device mappings for the given instance
  4076. """
  4077. if bdms:
  4078. for bdm in bdms:
  4079. if bdm.is_volume and bdm.attachment_id:
  4080. self.volume_api.attachment_complete(
  4081. context, bdm.attachment_id)
  4082. def _finish_resize(self, context, instance, migration, disk_info,
  4083. image_meta, bdms):
  4084. resize_instance = False
  4085. old_instance_type_id = migration['old_instance_type_id']
  4086. new_instance_type_id = migration['new_instance_type_id']
  4087. old_instance_type = instance.get_flavor()
  4088. # NOTE(mriedem): Get the old_vm_state so we know if we should
  4089. # power on the instance. If old_vm_state is not set we need to default
  4090. # to ACTIVE for backwards compatibility
  4091. old_vm_state = instance.system_metadata.get('old_vm_state',
  4092. vm_states.ACTIVE)
  4093. instance.old_flavor = old_instance_type
  4094. if old_instance_type_id != new_instance_type_id:
  4095. instance_type = instance.get_flavor('new')
  4096. self._set_instance_info(instance, instance_type)
  4097. for key in ('root_gb', 'swap', 'ephemeral_gb'):
  4098. if old_instance_type[key] != instance_type[key]:
  4099. resize_instance = True
  4100. break
  4101. instance.apply_migration_context()
  4102. # NOTE(tr3buchet): setup networks on destination host
  4103. self.network_api.setup_networks_on_host(context, instance,
  4104. migration['dest_compute'])
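        # migrate_instance_finish takes a primitive (dict) version of the
        # migration rather than the object, hence the conversion here.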
  4105. migration_p = obj_base.obj_to_primitive(migration)
  4106. self.network_api.migrate_instance_finish(context,
  4107. instance,
  4108. migration_p)
  4109. network_info = self.network_api.get_instance_nw_info(context, instance)
  4110. instance.task_state = task_states.RESIZE_FINISH
  4111. instance.save(expected_task_state=task_states.RESIZE_MIGRATED)
  4112. self._send_finish_resize_notifications(
  4113. context, instance, bdms, network_info,
  4114. fields.NotificationPhase.START)
  4115. # We need to update any volume attachments using the destination
  4116. # host connector so that we can update the BDM.connection_info
  4117. # before calling driver.finish_migration otherwise the driver
  4118. # won't know how to connect the volumes to this host.
  4119. # Note that _get_instance_block_device_info with
  4120. # refresh_conn_info=True will update the BDM.connection_info value
  4121. # in the database so we must do this before calling that method.
  4122. self._update_volume_attachments(context, instance, bdms)
  4123. block_device_info = self._get_instance_block_device_info(
  4124. context, instance, refresh_conn_info=True, bdms=bdms)
  4125. # NOTE(mriedem): If the original vm_state was STOPPED, we don't
  4126. # automatically power on the instance after it's migrated
  4127. power_on = old_vm_state != vm_states.STOPPED
  4128. try:
  4129. self.driver.finish_migration(context, migration, instance,
  4130. disk_info,
  4131. network_info,
  4132. image_meta, resize_instance,
  4133. block_device_info, power_on)
  4134. except Exception:
  4135. with excutils.save_and_reraise_exception():
  4136. if old_instance_type_id != new_instance_type_id:
  4137. self._set_instance_info(instance,
  4138. old_instance_type)
  4139. # Now complete any volume attachments that were previously updated.
  4140. self._complete_volume_attachments(context, bdms)
  4141. migration.status = 'finished'
  4142. with migration.obj_as_admin():
  4143. migration.save()
  4144. instance.vm_state = vm_states.RESIZED
  4145. instance.task_state = None
  4146. instance.launched_at = timeutils.utcnow()
  4147. instance.save(expected_task_state=task_states.RESIZE_FINISH)
  4148. return network_info
  4149. @wrap_exception()
  4150. @reverts_task_state
  4151. @wrap_instance_event(prefix='compute')
  4152. @wrap_instance_fault
  4153. def finish_resize(self, context, disk_info, image, instance,
  4154. migration):
  4155. """Completes the migration process.
  4156. Sets up the newly transferred disk and turns on the instance at its
  4157. new host machine.
  4158. """
  4159. try:
  4160. self._finish_resize_helper(context, disk_info, image, instance,
  4161. migration)
  4162. except Exception:
  4163. with excutils.save_and_reraise_exception():
  4164. self._revert_allocation(context, instance, migration)
  4165. def _finish_resize_helper(self, context, disk_info, image, instance,
  4166. migration):
  4167. """Completes the migration process.
  4168. The caller must revert the instance's allocations if the migration
  4169. process failed.
  4170. """
  4171. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4172. context, instance.uuid)
  4173. with self._error_out_instance_on_exception(context, instance), \
  4174. errors_out_migration_ctxt(migration):
  4175. image_meta = objects.ImageMeta.from_dict(image)
  4176. network_info = self._finish_resize(context, instance, migration,
  4177. disk_info, image_meta, bdms)
  4178. # TODO(melwitt): We should clean up instance console tokens here. The
  4179. # instance is on a new host and will need to establish a new console
  4180. # connection.
  4181. self._update_scheduler_instance_info(context, instance)
  4182. self._send_finish_resize_notifications(
  4183. context, instance, bdms, network_info,
  4184. fields.NotificationPhase.END)
  4185. def _send_finish_resize_notifications(
  4186. self, context, instance, bdms, network_info, phase):
  4187. """Send notifications for the finish_resize flow.
  4188. :param context: nova auth request context
  4189. :param instance: The instance being resized
  4190. :param bdms: BlockDeviceMappingList for the BDMs associated with the
  4191. instance
  4192. :param network_info: NetworkInfo for the instance info cache of ports
  4193. :param phase: The phase of the action (NotificationPhase enum, either
  4194. ``start`` or ``end``)
  4195. """
  4196. # Send the legacy unversioned notification.
  4197. self._notify_about_instance_usage(
  4198. context, instance, "finish_resize.%s" % phase,
  4199. network_info=network_info)
  4200. # Send the versioned notification.
  4201. compute_utils.notify_about_instance_action(
  4202. context, instance, self.host,
  4203. action=fields.NotificationAction.RESIZE_FINISH, phase=phase,
  4204. bdms=bdms)
  4205. @wrap_exception()
  4206. @wrap_instance_fault
  4207. def add_fixed_ip_to_instance(self, context, network_id, instance):
  4208. """Calls network_api to add new fixed_ip to instance
  4209. then injects the new network info and resets instance networking.
  4210. """
  4211. self._notify_about_instance_usage(
  4212. context, instance, "create_ip.start")
  4213. network_info = self.network_api.add_fixed_ip_to_instance(context,
  4214. instance,
  4215. network_id)
  4216. self._inject_network_info(context, instance, network_info)
  4217. self.reset_network(context, instance)
  4218. # NOTE(russellb) We just want to bump updated_at. See bug 1143466.
  4219. instance.updated_at = timeutils.utcnow()
  4220. instance.save()
  4221. self._notify_about_instance_usage(
  4222. context, instance, "create_ip.end", network_info=network_info)
  4223. @wrap_exception()
  4224. @wrap_instance_fault
  4225. def remove_fixed_ip_from_instance(self, context, address, instance):
  4226. """Calls network_api to remove existing fixed_ip from instance
  4227. by injecting the altered network info and resetting
  4228. instance networking.
  4229. """
  4230. self._notify_about_instance_usage(
  4231. context, instance, "delete_ip.start")
  4232. network_info = self.network_api.remove_fixed_ip_from_instance(context,
  4233. instance,
  4234. address)
  4235. self._inject_network_info(context, instance, network_info)
  4236. self.reset_network(context, instance)
  4237. # NOTE(russellb) We just want to bump updated_at. See bug 1143466.
  4238. instance.updated_at = timeutils.utcnow()
  4239. instance.save()
  4240. self._notify_about_instance_usage(
  4241. context, instance, "delete_ip.end", network_info=network_info)
  4242. @wrap_exception()
  4243. @reverts_task_state
  4244. @wrap_instance_event(prefix='compute')
  4245. @wrap_instance_fault
  4246. def pause_instance(self, context, instance):
  4247. """Pause an instance on this host."""
  4248. context = context.elevated()
  4249. LOG.info('Pausing', instance=instance)
  4250. self._notify_about_instance_usage(context, instance, 'pause.start')
  4251. compute_utils.notify_about_instance_action(context, instance,
  4252. self.host, action=fields.NotificationAction.PAUSE,
  4253. phase=fields.NotificationPhase.START)
  4254. self.driver.pause(instance)
  4255. instance.power_state = self._get_power_state(context, instance)
  4256. instance.vm_state = vm_states.PAUSED
  4257. instance.task_state = None
  4258. instance.save(expected_task_state=task_states.PAUSING)
  4259. self._notify_about_instance_usage(context, instance, 'pause.end')
  4260. compute_utils.notify_about_instance_action(context, instance,
  4261. self.host, action=fields.NotificationAction.PAUSE,
  4262. phase=fields.NotificationPhase.END)
  4263. @wrap_exception()
  4264. @reverts_task_state
  4265. @wrap_instance_event(prefix='compute')
  4266. @wrap_instance_fault
  4267. def unpause_instance(self, context, instance):
  4268. """Unpause a paused instance on this host."""
  4269. context = context.elevated()
  4270. LOG.info('Unpausing', instance=instance)
  4271. self._notify_about_instance_usage(context, instance, 'unpause.start')
  4272. compute_utils.notify_about_instance_action(context, instance,
  4273. self.host, action=fields.NotificationAction.UNPAUSE,
  4274. phase=fields.NotificationPhase.START)
  4275. self.driver.unpause(instance)
  4276. instance.power_state = self._get_power_state(context, instance)
  4277. instance.vm_state = vm_states.ACTIVE
  4278. instance.task_state = None
  4279. instance.save(expected_task_state=task_states.UNPAUSING)
  4280. self._notify_about_instance_usage(context, instance, 'unpause.end')
  4281. compute_utils.notify_about_instance_action(context, instance,
  4282. self.host, action=fields.NotificationAction.UNPAUSE,
  4283. phase=fields.NotificationPhase.END)
  4284. @wrap_exception()
  4285. def host_power_action(self, context, action):
  4286. """Reboots, shuts down or powers up the host."""
  4287. return self.driver.host_power_action(action)
  4288. @wrap_exception()
  4289. def host_maintenance_mode(self, context, host, mode):
  4290. """Start/Stop host maintenance window. On start, it triggers
  4291. guest VMs evacuation.
  4292. """
  4293. return self.driver.host_maintenance_mode(host, mode)
  4294. @wrap_exception()
  4295. def set_host_enabled(self, context, enabled):
  4296. """Sets the specified host's ability to accept new instances."""
  4297. return self.driver.set_host_enabled(enabled)
  4298. @wrap_exception()
  4299. def get_host_uptime(self, context):
  4300. """Returns the result of calling "uptime" on the target host."""
  4301. return self.driver.get_host_uptime()
  4302. @wrap_exception()
  4303. @wrap_instance_fault
  4304. def get_diagnostics(self, context, instance):
  4305. """Retrieve diagnostics for an instance on this host."""
  4306. current_power_state = self._get_power_state(context, instance)
  4307. if current_power_state == power_state.RUNNING:
  4308. LOG.info("Retrieving diagnostics", instance=instance)
  4309. return self.driver.get_diagnostics(instance)
  4310. else:
  4311. raise exception.InstanceInvalidState(
  4312. attr='power state',
  4313. instance_uuid=instance.uuid,
  4314. state=power_state.STATE_MAP[instance.power_state],
  4315. method='get_diagnostics')
  4316. @wrap_exception()
  4317. @wrap_instance_fault
  4318. def get_instance_diagnostics(self, context, instance):
  4319. """Retrieve diagnostics for an instance on this host."""
  4320. current_power_state = self._get_power_state(context, instance)
  4321. if current_power_state == power_state.RUNNING:
  4322. LOG.info("Retrieving diagnostics", instance=instance)
  4323. return self.driver.get_instance_diagnostics(instance)
  4324. else:
  4325. raise exception.InstanceInvalidState(
  4326. attr='power state',
  4327. instance_uuid=instance.uuid,
  4328. state=power_state.STATE_MAP[instance.power_state],
                method='get_instance_diagnostics')
  4330. @wrap_exception()
  4331. @reverts_task_state
  4332. @wrap_instance_event(prefix='compute')
  4333. @wrap_instance_fault
  4334. def suspend_instance(self, context, instance):
  4335. """Suspend the given instance."""
  4336. context = context.elevated()
  4337. # Store the old state
  4338. instance.system_metadata['old_vm_state'] = instance.vm_state
  4339. self._notify_about_instance_usage(context, instance, 'suspend.start')
  4340. compute_utils.notify_about_instance_action(context, instance,
  4341. self.host, action=fields.NotificationAction.SUSPEND,
  4342. phase=fields.NotificationPhase.START)
  4343. with self._error_out_instance_on_exception(context, instance,
  4344. instance_state=instance.vm_state):
  4345. self.driver.suspend(context, instance)
  4346. instance.power_state = self._get_power_state(context, instance)
  4347. instance.vm_state = vm_states.SUSPENDED
  4348. instance.task_state = None
  4349. instance.save(expected_task_state=task_states.SUSPENDING)
  4350. self._notify_about_instance_usage(context, instance, 'suspend.end')
  4351. compute_utils.notify_about_instance_action(context, instance,
  4352. self.host, action=fields.NotificationAction.SUSPEND,
  4353. phase=fields.NotificationPhase.END)
  4354. @wrap_exception()
  4355. @reverts_task_state
  4356. @wrap_instance_event(prefix='compute')
  4357. @wrap_instance_fault
  4358. def resume_instance(self, context, instance):
  4359. """Resume the given suspended instance."""
  4360. context = context.elevated()
  4361. LOG.info('Resuming', instance=instance)
  4362. self._notify_about_instance_usage(context, instance, 'resume.start')
  4363. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4364. context, instance.uuid)
  4365. block_device_info = self._get_instance_block_device_info(
  4366. context, instance, bdms=bdms)
  4367. compute_utils.notify_about_instance_action(context, instance,
  4368. self.host, action=fields.NotificationAction.RESUME,
  4369. phase=fields.NotificationPhase.START, bdms=bdms)
  4370. network_info = self.network_api.get_instance_nw_info(context, instance)
  4371. with self._error_out_instance_on_exception(context, instance,
  4372. instance_state=instance.vm_state):
  4373. self.driver.resume(context, instance, network_info,
  4374. block_device_info)
  4375. instance.power_state = self._get_power_state(context, instance)
  4376. # We default to the ACTIVE state for backwards compatibility
  4377. instance.vm_state = instance.system_metadata.pop('old_vm_state',
  4378. vm_states.ACTIVE)
  4379. instance.task_state = None
  4380. instance.save(expected_task_state=task_states.RESUMING)
  4381. self._notify_about_instance_usage(context, instance, 'resume.end')
  4382. compute_utils.notify_about_instance_action(context, instance,
  4383. self.host, action=fields.NotificationAction.RESUME,
  4384. phase=fields.NotificationPhase.END, bdms=bdms)
  4385. @wrap_exception()
  4386. @reverts_task_state
  4387. @wrap_instance_event(prefix='compute')
  4388. @wrap_instance_fault
  4389. def shelve_instance(self, context, instance, image_id,
  4390. clean_shutdown):
  4391. """Shelve an instance.
  4392. This should be used when you want to take a snapshot of the instance.
  4393. It also adds system_metadata that can be used by a periodic task to
  4394. offload the shelved instance after a period of time.
  4395. :param context: request context
  4396. :param instance: an Instance object
  4397. :param image_id: an image id to snapshot to.
  4398. :param clean_shutdown: give the GuestOS a chance to stop
  4399. """
  4400. @utils.synchronized(instance.uuid)
  4401. def do_shelve_instance():
  4402. self._shelve_instance(context, instance, image_id, clean_shutdown)
  4403. do_shelve_instance()
  4404. def _shelve_instance(self, context, instance, image_id,
  4405. clean_shutdown):
  4406. LOG.info('Shelving', instance=instance)
  4407. offload = CONF.shelved_offload_time == 0
  4408. if offload:
  4409. # Get the BDMs early so we can pass them into versioned
  4410. # notifications since _shelve_offload_instance needs the
  4411. # BDMs anyway.
  4412. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4413. context, instance.uuid)
  4414. else:
  4415. bdms = None
  4416. compute_utils.notify_usage_exists(self.notifier, context, instance,
  4417. self.host, current_period=True)
  4418. self._notify_about_instance_usage(context, instance, 'shelve.start')
  4419. compute_utils.notify_about_instance_action(context, instance,
  4420. self.host, action=fields.NotificationAction.SHELVE,
  4421. phase=fields.NotificationPhase.START, bdms=bdms)
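        # The snapshot code reports generic image task states; map them to
        # their shelving-specific equivalents before saving the instance.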
  4422. def update_task_state(task_state, expected_state=task_states.SHELVING):
  4423. shelving_state_map = {
  4424. task_states.IMAGE_PENDING_UPLOAD:
  4425. task_states.SHELVING_IMAGE_PENDING_UPLOAD,
  4426. task_states.IMAGE_UPLOADING:
  4427. task_states.SHELVING_IMAGE_UPLOADING,
  4428. task_states.SHELVING: task_states.SHELVING}
  4429. task_state = shelving_state_map[task_state]
  4430. expected_state = shelving_state_map[expected_state]
  4431. instance.task_state = task_state
  4432. instance.save(expected_task_state=expected_state)
  4433. # Do not attempt a clean shutdown of a paused guest since some
  4434. # hypervisors will fail the clean shutdown if the guest is not
  4435. # running.
  4436. if instance.power_state == power_state.PAUSED:
  4437. clean_shutdown = False
  4438. self._power_off_instance(context, instance, clean_shutdown)
  4439. self.driver.snapshot(context, instance, image_id, update_task_state)
  4440. instance.system_metadata['shelved_at'] = timeutils.utcnow().isoformat()
  4441. instance.system_metadata['shelved_image_id'] = image_id
  4442. instance.system_metadata['shelved_host'] = self.host
  4443. instance.vm_state = vm_states.SHELVED
  4444. instance.task_state = None
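        # When shelved_offload_time is 0 the instance is offloaded right
        # after the snapshot, so leave a task_state in place that the
        # offload path expects.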
  4445. if CONF.shelved_offload_time == 0:
  4446. instance.task_state = task_states.SHELVING_OFFLOADING
  4447. instance.power_state = self._get_power_state(context, instance)
  4448. instance.save(expected_task_state=[
  4449. task_states.SHELVING,
  4450. task_states.SHELVING_IMAGE_UPLOADING])
  4451. self._notify_about_instance_usage(context, instance, 'shelve.end')
  4452. compute_utils.notify_about_instance_action(context, instance,
  4453. self.host, action=fields.NotificationAction.SHELVE,
  4454. phase=fields.NotificationPhase.END, bdms=bdms)
  4455. if offload:
  4456. self._shelve_offload_instance(context, instance,
  4457. clean_shutdown=False, bdms=bdms)
  4458. @wrap_exception()
  4459. @reverts_task_state
  4460. @wrap_instance_event(prefix='compute')
  4461. @wrap_instance_fault
  4462. def shelve_offload_instance(self, context, instance, clean_shutdown):
  4463. """Remove a shelved instance from the hypervisor.
        This frees up those resources for use by other instances, but may
        lead to slower unshelve times for this instance. This method is
        used for volume-backed instances since restoring them doesn't
        involve the potentially large download of an image.
  4468. :param context: request context
  4469. :param instance: nova.objects.instance.Instance
  4470. :param clean_shutdown: give the GuestOS a chance to stop
  4471. """
  4472. @utils.synchronized(instance.uuid)
  4473. def do_shelve_offload_instance():
  4474. self._shelve_offload_instance(context, instance, clean_shutdown)
  4475. do_shelve_offload_instance()
  4476. def _shelve_offload_instance(self, context, instance, clean_shutdown,
  4477. bdms=None):
  4478. LOG.info('Shelve offloading', instance=instance)
  4479. if bdms is None:
  4480. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4481. context, instance.uuid)
  4482. self._notify_about_instance_usage(context, instance,
  4483. 'shelve_offload.start')
  4484. compute_utils.notify_about_instance_action(context, instance,
  4485. self.host, action=fields.NotificationAction.SHELVE_OFFLOAD,
  4486. phase=fields.NotificationPhase.START, bdms=bdms)
  4487. self._power_off_instance(context, instance, clean_shutdown)
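        # Record the power state after the power off; it is persisted on
        # the instance below once the guest has been destroyed.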
  4488. current_power_state = self._get_power_state(context, instance)
  4489. self.network_api.cleanup_instance_network_on_host(context, instance,
  4490. instance.host)
  4491. network_info = self.network_api.get_instance_nw_info(context, instance)
  4492. block_device_info = self._get_instance_block_device_info(context,
  4493. instance,
  4494. bdms=bdms)
  4495. self.driver.destroy(context, instance, network_info,
  4496. block_device_info)
        # The instance is being removed from this host, so terminate all of
        # its volume connections between the volume backend and this host.
  4499. self._terminate_volume_connections(context, instance, bdms)
  4500. # Free up the resource allocations in the placement service.
  4501. # This should happen *before* the vm_state is changed to
  4502. # SHELVED_OFFLOADED in case client-side code is polling the API to
  4503. # schedule more instances (or unshelve) once this server is offloaded.
  4504. self.rt.delete_allocation_for_shelve_offloaded_instance(context,
  4505. instance)
  4506. instance.power_state = current_power_state
  4507. # NOTE(mriedem): The vm_state has to be set before updating the
  4508. # resource tracker, see vm_states.ALLOW_RESOURCE_REMOVAL. The host/node
  4509. # values cannot be nulled out until after updating the resource tracker
  4510. # though.
  4511. instance.vm_state = vm_states.SHELVED_OFFLOADED
  4512. instance.task_state = None
  4513. instance.save(expected_task_state=[task_states.SHELVING,
  4514. task_states.SHELVING_OFFLOADING])
  4515. # NOTE(ndipanov): Free resources from the resource tracker
  4516. self._update_resource_tracker(context, instance)
  4517. # NOTE(sfinucan): RPC calls should no longer be attempted against this
  4518. # instance, so ensure any calls result in errors
  4519. self._nil_out_instance_obj_host_and_node(instance)
  4520. instance.save(expected_task_state=None)
  4521. # TODO(melwitt): We should clean up instance console tokens here. The
  4522. # instance has no host at this point and will need to establish a new
  4523. # console connection in the future after it is unshelved.
  4524. self._delete_scheduler_instance_info(context, instance.uuid)
  4525. self._notify_about_instance_usage(context, instance,
  4526. 'shelve_offload.end')
  4527. compute_utils.notify_about_instance_action(context, instance,
  4528. self.host, action=fields.NotificationAction.SHELVE_OFFLOAD,
  4529. phase=fields.NotificationPhase.END, bdms=bdms)
  4530. @wrap_exception()
  4531. @reverts_task_state
  4532. @wrap_instance_event(prefix='compute')
  4533. @wrap_instance_fault
  4534. def unshelve_instance(self, context, instance, image,
  4535. filter_properties, node):
  4536. """Unshelve the instance.
  4537. :param context: request context
  4538. :param instance: a nova.objects.instance.Instance object
  4539. :param image: an image to build from. If None we assume a
  4540. volume backed instance.
  4541. :param filter_properties: dict containing limits, retry info etc.
  4542. :param node: target compute node
  4543. """
  4544. if filter_properties is None:
  4545. filter_properties = {}
  4546. @utils.synchronized(instance.uuid)
  4547. def do_unshelve_instance():
  4548. self._unshelve_instance(context, instance, image,
  4549. filter_properties, node)
  4550. do_unshelve_instance()
  4551. def _unshelve_instance_key_scrub(self, instance):
  4552. """Remove data from the instance that may cause side effects."""
  4553. cleaned_keys = dict(
  4554. key_data=instance.key_data,
  4555. auto_disk_config=instance.auto_disk_config)
  4556. instance.key_data = None
  4557. instance.auto_disk_config = False
  4558. return cleaned_keys
  4559. def _unshelve_instance_key_restore(self, instance, keys):
  4560. """Restore previously scrubbed keys before saving the instance."""
  4561. instance.update(keys)
  4562. def _unshelve_instance(self, context, instance, image, filter_properties,
  4563. node):
  4564. LOG.info('Unshelving', instance=instance)
  4565. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  4566. context, instance.uuid)
  4567. self._notify_about_instance_usage(context, instance, 'unshelve.start')
  4568. compute_utils.notify_about_instance_action(context, instance,
  4569. self.host, action=fields.NotificationAction.UNSHELVE,
  4570. phase=fields.NotificationPhase.START, bdms=bdms)
  4571. instance.task_state = task_states.SPAWNING
  4572. instance.save()
  4573. block_device_info = self._prep_block_device(context, instance, bdms)
  4574. scrubbed_keys = self._unshelve_instance_key_scrub(instance)
  4575. if node is None:
  4576. node = self._get_nodename(instance)
  4577. limits = filter_properties.get('limits', {})
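        # Fetch the placement allocations made for this instance so they
        # can be handed to the virt driver when spawning.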
  4578. allocations = self.reportclient.get_allocations_for_consumer(
  4579. context, instance.uuid)
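        # Temporarily point image_ref at the shelve snapshot for the spawn;
        # the original image_ref is restored (and the snapshot deleted)
        # after a successful spawn below.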
  4580. shelved_image_ref = instance.image_ref
  4581. if image:
  4582. instance.image_ref = image['id']
  4583. image_meta = objects.ImageMeta.from_dict(image)
  4584. else:
  4585. image_meta = objects.ImageMeta.from_dict(
  4586. utils.get_image_from_system_metadata(
  4587. instance.system_metadata))
  4588. self.network_api.setup_instance_network_on_host(context, instance,
  4589. self.host)
  4590. network_info = self.network_api.get_instance_nw_info(context, instance)
  4591. try:
  4592. with self.rt.instance_claim(context, instance, node, limits):
  4593. self.driver.spawn(context, instance, image_meta,
  4594. injected_files=[],
  4595. admin_password=None,
  4596. allocations=allocations,
  4597. network_info=network_info,
  4598. block_device_info=block_device_info)
  4599. except Exception:
  4600. with excutils.save_and_reraise_exception(logger=LOG):
  4601. LOG.exception('Instance failed to spawn',
  4602. instance=instance)
  4603. # Cleanup allocations created by the scheduler on this host
  4604. # since we failed to spawn the instance. We do this both if
  4605. # the instance claim failed with ComputeResourcesUnavailable
  4606. # or if we did claim but the spawn failed, because aborting the
  4607. # instance claim will not remove the allocations.
  4608. self.reportclient.delete_allocation_for_instance(context,
  4609. instance.uuid)
                # FIXME: Port bindings set up on this host should probably
                # be rolled back here as well.
  4611. self._terminate_volume_connections(context, instance, bdms)
  4612. # The reverts_task_state decorator on unshelve_instance will
  4613. # eventually save these updates.
  4614. self._nil_out_instance_obj_host_and_node(instance)
  4615. if image:
  4616. instance.image_ref = shelved_image_ref
  4617. self._delete_snapshot_of_shelved_instance(context, instance,
  4618. image['id'])
  4619. self._unshelve_instance_key_restore(instance, scrubbed_keys)
  4620. self._update_instance_after_spawn(context, instance)
  4621. # Delete system_metadata for a shelved instance
  4622. compute_utils.remove_shelved_keys_from_system_metadata(instance)
  4623. instance.save(expected_task_state=task_states.SPAWNING)
  4624. self._update_scheduler_instance_info(context, instance)
  4625. self._notify_about_instance_usage(context, instance, 'unshelve.end')
  4626. compute_utils.notify_about_instance_action(context, instance,
  4627. self.host, action=fields.NotificationAction.UNSHELVE,
  4628. phase=fields.NotificationPhase.END, bdms=bdms)
  4629. @messaging.expected_exceptions(NotImplementedError)
  4630. @wrap_instance_fault
  4631. def reset_network(self, context, instance):
  4632. """Reset networking on the given instance."""
  4633. LOG.debug('Reset network', instance=instance)
  4634. self.driver.reset_network(instance)
  4635. def _inject_network_info(self, context, instance, network_info):
  4636. """Inject network info for the given instance."""
  4637. LOG.debug('Inject network info', instance=instance)
  4638. LOG.debug('network_info to inject: |%s|', network_info,
  4639. instance=instance)
  4640. self.driver.inject_network_info(instance,
  4641. network_info)
  4642. @wrap_instance_fault
  4643. def inject_network_info(self, context, instance):
  4644. """Inject network info, but don't return the info."""
  4645. network_info = self.network_api.get_instance_nw_info(context, instance)
  4646. self._inject_network_info(context, instance, network_info)
  4647. @messaging.expected_exceptions(NotImplementedError,
  4648. exception.ConsoleNotAvailable,
  4649. exception.InstanceNotFound)
  4650. @wrap_exception()
  4651. @wrap_instance_fault
  4652. def get_console_output(self, context, instance, tail_length):
  4653. """Send the console output for the given instance."""
  4654. context = context.elevated()
  4655. LOG.info("Get console output", instance=instance)
  4656. output = self.driver.get_console_output(context, instance)
  4657. if type(output) is six.text_type:
  4658. output = six.b(output)
  4659. if tail_length is not None:
  4660. output = self._tail_log(output, tail_length)
  4661. return output.decode('ascii', 'replace')
  4662. def _tail_log(self, log, length):
  4663. try:
  4664. length = int(length)
  4665. except ValueError:
  4666. length = 0
  4667. if length == 0:
  4668. return b''
        else:
            return b'\n'.join(log.split(b'\n')[-length:])
  4671. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  4672. exception.InstanceNotReady,
  4673. exception.InstanceNotFound,
  4674. exception.ConsoleTypeUnavailable,
  4675. NotImplementedError)
  4676. @wrap_exception()
  4677. @wrap_instance_fault
  4678. def get_vnc_console(self, context, console_type, instance):
  4679. """Return connection information for a vnc console."""
  4680. context = context.elevated()
  4681. LOG.debug("Getting vnc console", instance=instance)
  4682. if not CONF.vnc.enabled:
  4683. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  4684. if console_type == 'novnc':
  4685. # For essex, novncproxy_base_url must include the full path
  4686. # including the html file (like http://myhost/vnc_auto.html)
  4687. access_url_base = CONF.vnc.novncproxy_base_url
  4688. elif console_type == 'xvpvnc':
  4689. access_url_base = CONF.vnc.xvpvncproxy_base_url
  4690. else:
  4691. raise exception.ConsoleTypeInvalid(console_type=console_type)
  4692. try:
  4693. # Retrieve connect info from driver, and then decorate with our
  4694. # access info token
  4695. console = self.driver.get_vnc_console(context, instance)
  4696. console_auth = objects.ConsoleAuthToken(
  4697. context=context,
  4698. console_type=console_type,
  4699. host=console.host,
  4700. port=console.port,
  4701. internal_access_path=console.internal_access_path,
  4702. instance_uuid=instance.uuid,
  4703. access_url_base=access_url_base,
  4704. )
  4705. console_auth.authorize(CONF.consoleauth.token_ttl)
  4706. connect_info = console.get_connection_info(
  4707. console_auth.token, console_auth.access_url)
  4708. except exception.InstanceNotFound:
  4709. if instance.vm_state != vm_states.BUILDING:
  4710. raise
  4711. raise exception.InstanceNotReady(instance_id=instance.uuid)
  4712. return connect_info
  4713. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  4714. exception.InstanceNotReady,
  4715. exception.InstanceNotFound,
  4716. exception.ConsoleTypeUnavailable,
  4717. NotImplementedError)
  4718. @wrap_exception()
  4719. @wrap_instance_fault
  4720. def get_spice_console(self, context, console_type, instance):
  4721. """Return connection information for a spice console."""
  4722. context = context.elevated()
  4723. LOG.debug("Getting spice console", instance=instance)
  4724. if not CONF.spice.enabled:
  4725. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  4726. if console_type != 'spice-html5':
  4727. raise exception.ConsoleTypeInvalid(console_type=console_type)
  4728. try:
  4729. # Retrieve connect info from driver, and then decorate with our
  4730. # access info token
  4731. console = self.driver.get_spice_console(context, instance)
  4732. console_auth = objects.ConsoleAuthToken(
  4733. context=context,
  4734. console_type=console_type,
  4735. host=console.host,
  4736. port=console.port,
  4737. internal_access_path=console.internal_access_path,
  4738. instance_uuid=instance.uuid,
  4739. access_url_base=CONF.spice.html5proxy_base_url,
  4740. )
  4741. console_auth.authorize(CONF.consoleauth.token_ttl)
  4742. connect_info = console.get_connection_info(
  4743. console_auth.token, console_auth.access_url)
  4744. except exception.InstanceNotFound:
  4745. if instance.vm_state != vm_states.BUILDING:
  4746. raise
  4747. raise exception.InstanceNotReady(instance_id=instance.uuid)
  4748. return connect_info
  4749. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  4750. exception.InstanceNotReady,
  4751. exception.InstanceNotFound,
  4752. exception.ConsoleTypeUnavailable,
  4753. NotImplementedError)
  4754. @wrap_exception()
  4755. @wrap_instance_fault
  4756. def get_rdp_console(self, context, console_type, instance):
  4757. """Return connection information for a RDP console."""
  4758. context = context.elevated()
  4759. LOG.debug("Getting RDP console", instance=instance)
  4760. if not CONF.rdp.enabled:
  4761. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  4762. if console_type != 'rdp-html5':
  4763. raise exception.ConsoleTypeInvalid(console_type=console_type)
  4764. try:
  4765. # Retrieve connect info from driver, and then decorate with our
  4766. # access info token
  4767. console = self.driver.get_rdp_console(context, instance)
  4768. console_auth = objects.ConsoleAuthToken(
  4769. context=context,
  4770. console_type=console_type,
  4771. host=console.host,
  4772. port=console.port,
  4773. internal_access_path=console.internal_access_path,
  4774. instance_uuid=instance.uuid,
  4775. access_url_base=CONF.rdp.html5_proxy_base_url,
  4776. )
  4777. console_auth.authorize(CONF.consoleauth.token_ttl)
  4778. connect_info = console.get_connection_info(
  4779. console_auth.token, console_auth.access_url)
  4780. except exception.InstanceNotFound:
  4781. if instance.vm_state != vm_states.BUILDING:
  4782. raise
  4783. raise exception.InstanceNotReady(instance_id=instance.uuid)
  4784. return connect_info
  4785. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  4786. exception.InstanceNotReady,
  4787. exception.InstanceNotFound,
  4788. exception.ConsoleTypeUnavailable,
  4789. NotImplementedError)
  4790. @wrap_exception()
  4791. @wrap_instance_fault
  4792. def get_mks_console(self, context, console_type, instance):
  4793. """Return connection information for a MKS console."""
  4794. context = context.elevated()
  4795. LOG.debug("Getting MKS console", instance=instance)
  4796. if not CONF.mks.enabled:
  4797. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  4798. if console_type != 'webmks':
  4799. raise exception.ConsoleTypeInvalid(console_type=console_type)
  4800. try:
  4801. # Retrieve connect info from driver, and then decorate with our
  4802. # access info token
  4803. console = self.driver.get_mks_console(context, instance)
  4804. console_auth = objects.ConsoleAuthToken(
  4805. context=context,
  4806. console_type=console_type,
  4807. host=console.host,
  4808. port=console.port,
  4809. internal_access_path=console.internal_access_path,
  4810. instance_uuid=instance.uuid,
  4811. access_url_base=CONF.mks.mksproxy_base_url,
  4812. )
  4813. console_auth.authorize(CONF.consoleauth.token_ttl)
  4814. connect_info = console.get_connection_info(
  4815. console_auth.token, console_auth.access_url)
  4816. except exception.InstanceNotFound:
  4817. if instance.vm_state != vm_states.BUILDING:
  4818. raise
  4819. raise exception.InstanceNotReady(instance_id=instance.uuid)
  4820. return connect_info
  4821. @messaging.expected_exceptions(
  4822. exception.ConsoleTypeInvalid,
  4823. exception.InstanceNotReady,
  4824. exception.InstanceNotFound,
  4825. exception.ConsoleTypeUnavailable,
  4826. exception.SocketPortRangeExhaustedException,
  4827. exception.ImageSerialPortNumberInvalid,
  4828. exception.ImageSerialPortNumberExceedFlavorValue,
  4829. NotImplementedError)
  4830. @wrap_exception()
  4831. @wrap_instance_fault
  4832. def get_serial_console(self, context, console_type, instance):
  4833. """Returns connection information for a serial console."""
  4834. LOG.debug("Getting serial console", instance=instance)
  4835. if not CONF.serial_console.enabled:
  4836. raise exception.ConsoleTypeUnavailable(console_type=console_type)
  4837. context = context.elevated()
  4838. try:
  4839. # Retrieve connect info from driver, and then decorate with our
  4840. # access info token
  4841. console = self.driver.get_serial_console(context, instance)
  4842. console_auth = objects.ConsoleAuthToken(
  4843. context=context,
  4844. console_type=console_type,
  4845. host=console.host,
  4846. port=console.port,
  4847. internal_access_path=console.internal_access_path,
  4848. instance_uuid=instance.uuid,
  4849. access_url_base=CONF.serial_console.base_url,
  4850. )
  4851. console_auth.authorize(CONF.consoleauth.token_ttl)
  4852. connect_info = console.get_connection_info(
  4853. console_auth.token, console_auth.access_url)
  4854. except exception.InstanceNotFound:
  4855. if instance.vm_state != vm_states.BUILDING:
  4856. raise
  4857. raise exception.InstanceNotReady(instance_id=instance.uuid)
  4858. return connect_info
  4859. @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
  4860. exception.InstanceNotReady,
  4861. exception.InstanceNotFound)
  4862. @wrap_exception()
  4863. @wrap_instance_fault
  4864. def validate_console_port(self, ctxt, instance, port, console_type):
  4865. if console_type == "spice-html5":
  4866. console_info = self.driver.get_spice_console(ctxt, instance)
  4867. elif console_type == "rdp-html5":
  4868. console_info = self.driver.get_rdp_console(ctxt, instance)
  4869. elif console_type == "serial":
  4870. console_info = self.driver.get_serial_console(ctxt, instance)
  4871. elif console_type == "webmks":
  4872. console_info = self.driver.get_mks_console(ctxt, instance)
  4873. else:
  4874. console_info = self.driver.get_vnc_console(ctxt, instance)
  4875. # Some drivers may return an int on console_info.port but the port
  4876. # variable in this method is a string, so cast to be sure we are
  4877. # comparing the correct types.
  4878. return str(console_info.port) == port
  4879. @wrap_exception()
  4880. @reverts_task_state
  4881. @wrap_instance_fault
  4882. def reserve_block_device_name(self, context, instance, device,
  4883. volume_id, disk_bus, device_type, tag,
  4884. multiattach):
  4885. if (tag and not
  4886. self.driver.capabilities.get('supports_tagged_attach_volume',
  4887. False)):
  4888. raise exception.VolumeTaggedAttachNotSupported()
  4889. if (multiattach and not
  4890. self.driver.capabilities.get('supports_multiattach', False)):
  4891. raise exception.MultiattachNotSupportedByVirtDriver(
  4892. volume_id=volume_id)
  4893. @utils.synchronized(instance.uuid)
  4894. def do_reserve():
  4895. bdms = (
  4896. objects.BlockDeviceMappingList.get_by_instance_uuid(
  4897. context, instance.uuid))
  4898. # NOTE(ndipanov): We need to explicitly set all the fields on the
  4899. # object so that obj_load_attr does not fail
  4900. new_bdm = objects.BlockDeviceMapping(
  4901. context=context,
  4902. source_type='volume', destination_type='volume',
  4903. instance_uuid=instance.uuid, boot_index=None,
  4904. volume_id=volume_id,
  4905. device_name=device, guest_format=None,
  4906. disk_bus=disk_bus, device_type=device_type, tag=tag)
  4907. new_bdm.device_name = self._get_device_name_for_instance(
  4908. instance, bdms, new_bdm)
  4909. # NOTE(vish): create bdm here to avoid race condition
  4910. new_bdm.create()
  4911. return new_bdm
  4912. return do_reserve()
  4913. @wrap_exception()
  4914. @wrap_instance_event(prefix='compute')
  4915. @wrap_instance_fault
  4916. def attach_volume(self, context, instance, bdm):
  4917. """Attach a volume to an instance."""
  4918. driver_bdm = driver_block_device.convert_volume(bdm)
  4919. @utils.synchronized(instance.uuid)
  4920. def do_attach_volume(context, instance, driver_bdm):
  4921. try:
  4922. return self._attach_volume(context, instance, driver_bdm)
  4923. except Exception:
  4924. with excutils.save_and_reraise_exception():
  4925. bdm.destroy()
  4926. do_attach_volume(context, instance, driver_bdm)
  4927. def _attach_volume(self, context, instance, bdm):
  4928. context = context.elevated()
  4929. LOG.info('Attaching volume %(volume_id)s to %(mountpoint)s',
  4930. {'volume_id': bdm.volume_id,
  4931. 'mountpoint': bdm['mount_device']},
  4932. instance=instance)
  4933. compute_utils.notify_about_volume_attach_detach(
  4934. context, instance, self.host,
  4935. action=fields.NotificationAction.VOLUME_ATTACH,
  4936. phase=fields.NotificationPhase.START,
  4937. volume_id=bdm.volume_id)
  4938. try:
  4939. bdm.attach(context, instance, self.volume_api, self.driver,
  4940. do_driver_attach=True)
  4941. except Exception as e:
  4942. with excutils.save_and_reraise_exception():
  4943. LOG.exception("Failed to attach %(volume_id)s "
  4944. "at %(mountpoint)s",
  4945. {'volume_id': bdm.volume_id,
  4946. 'mountpoint': bdm['mount_device']},
  4947. instance=instance)
  4948. if bdm['attachment_id']:
  4949. # Try to delete the attachment to make the volume
  4950. # available again. Note that DriverVolumeBlockDevice
  4951. # may have already deleted the attachment so ignore
  4952. # VolumeAttachmentNotFound.
  4953. try:
  4954. self.volume_api.attachment_delete(
  4955. context, bdm['attachment_id'])
  4956. except exception.VolumeAttachmentNotFound as exc:
  4957. LOG.debug('Ignoring VolumeAttachmentNotFound: %s',
  4958. exc, instance=instance)
  4959. else:
  4960. self.volume_api.unreserve_volume(context, bdm.volume_id)
  4961. tb = traceback.format_exc()
  4962. compute_utils.notify_about_volume_attach_detach(
  4963. context, instance, self.host,
  4964. action=fields.NotificationAction.VOLUME_ATTACH,
  4965. phase=fields.NotificationPhase.ERROR,
  4966. exception=e,
  4967. volume_id=bdm.volume_id, tb=tb)
  4968. info = {'volume_id': bdm.volume_id}
  4969. self._notify_about_instance_usage(
  4970. context, instance, "volume.attach", extra_usage_info=info)
  4971. compute_utils.notify_about_volume_attach_detach(
  4972. context, instance, self.host,
  4973. action=fields.NotificationAction.VOLUME_ATTACH,
  4974. phase=fields.NotificationPhase.END,
  4975. volume_id=bdm.volume_id)
  4976. def _notify_volume_usage_detach(self, context, instance, bdm):
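        # Volume usage stats are only collected when volume usage polling
        # is enabled; otherwise there is nothing to record on detach.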
  4977. if CONF.volume_usage_poll_interval <= 0:
  4978. return
  4979. mp = bdm.device_name
        # Strip the leading '/dev/' if present; device names of bootable
        # volumes may not include it.
  4981. if '/dev/' in mp:
  4982. mp = mp[5:]
  4983. try:
  4984. vol_stats = self.driver.block_stats(instance, mp)
  4985. if vol_stats is None:
  4986. return
  4987. except NotImplementedError:
  4988. return
  4989. LOG.debug("Updating volume usage cache with totals", instance=instance)
  4990. rd_req, rd_bytes, wr_req, wr_bytes, flush_ops = vol_stats
  4991. vol_usage = objects.VolumeUsage(context)
  4992. vol_usage.volume_id = bdm.volume_id
  4993. vol_usage.instance_uuid = instance.uuid
  4994. vol_usage.project_id = instance.project_id
  4995. vol_usage.user_id = instance.user_id
  4996. vol_usage.availability_zone = instance.availability_zone
  4997. vol_usage.curr_reads = rd_req
  4998. vol_usage.curr_read_bytes = rd_bytes
  4999. vol_usage.curr_writes = wr_req
  5000. vol_usage.curr_write_bytes = wr_bytes
  5001. vol_usage.save(update_totals=True)
  5002. self.notifier.info(context, 'volume.usage', vol_usage.to_dict())
  5003. compute_utils.notify_about_volume_usage(context, vol_usage, self.host)
  5004. def _detach_volume(self, context, bdm, instance, destroy_bdm=True,
  5005. attachment_id=None):
  5006. """Detach a volume from an instance.
  5007. :param context: security context
  5008. :param bdm: nova.objects.BlockDeviceMapping volume bdm to detach
  5009. :param instance: the Instance object to detach the volume from
  5010. :param destroy_bdm: if True, the corresponding BDM entry will be marked
  5011. as deleted. Disabling this is useful for operations
  5012. like rebuild, when we don't want to destroy BDM
  5013. :param attachment_id: The volume attachment_id for the given instance
  5014. and volume.
  5015. """
  5016. volume_id = bdm.volume_id
  5017. compute_utils.notify_about_volume_attach_detach(
  5018. context, instance, self.host,
  5019. action=fields.NotificationAction.VOLUME_DETACH,
  5020. phase=fields.NotificationPhase.START,
  5021. volume_id=volume_id)
  5022. self._notify_volume_usage_detach(context, instance, bdm)
  5023. LOG.info('Detaching volume %(volume_id)s',
  5024. {'volume_id': volume_id}, instance=instance)
  5025. driver_bdm = driver_block_device.convert_volume(bdm)
  5026. driver_bdm.detach(context, instance, self.volume_api, self.driver,
  5027. attachment_id=attachment_id, destroy_bdm=destroy_bdm)
  5028. info = dict(volume_id=volume_id)
  5029. self._notify_about_instance_usage(
  5030. context, instance, "volume.detach", extra_usage_info=info)
  5031. compute_utils.notify_about_volume_attach_detach(
  5032. context, instance, self.host,
  5033. action=fields.NotificationAction.VOLUME_DETACH,
  5034. phase=fields.NotificationPhase.END,
  5035. volume_id=volume_id)
  5036. if 'tag' in bdm and bdm.tag:
  5037. self._delete_disk_metadata(instance, bdm)
  5038. if destroy_bdm:
  5039. bdm.destroy()
  5040. def _delete_disk_metadata(self, instance, bdm):
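        # Match the disk by its serial, which is expected to carry the
        # volume ID the disk was attached with.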
  5041. for device in instance.device_metadata.devices:
  5042. if isinstance(device, objects.DiskMetadata):
  5043. if 'serial' in device:
  5044. if device.serial == bdm.volume_id:
  5045. instance.device_metadata.devices.remove(device)
  5046. instance.save()
  5047. break
  5048. else:
  5049. # NOTE(artom) We log the entire device object because all
  5050. # fields are nullable and may not be set
  5051. LOG.warning('Unable to determine whether to clean up '
  5052. 'device metadata for disk %s', device,
  5053. instance=instance)
  5054. @wrap_exception()
  5055. @wrap_instance_event(prefix='compute')
  5056. @wrap_instance_fault
  5057. def detach_volume(self, context, volume_id, instance, attachment_id):
  5058. """Detach a volume from an instance.
  5059. :param context: security context
  5060. :param volume_id: the volume id
  5061. :param instance: the Instance object to detach the volume from
  5062. :param attachment_id: The volume attachment_id for the given instance
  5063. and volume.
  5064. """
  5065. @utils.synchronized(instance.uuid)
  5066. def do_detach_volume(context, volume_id, instance, attachment_id):
  5067. bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
  5068. context, volume_id, instance.uuid)
  5069. self._detach_volume(context, bdm, instance,
  5070. attachment_id=attachment_id)
  5071. do_detach_volume(context, volume_id, instance, attachment_id)
  5072. def _init_volume_connection(self, context, new_volume,
  5073. old_volume_id, connector, bdm,
  5074. new_attachment_id, mountpoint):
  5075. new_volume_id = new_volume['id']
  5076. if new_attachment_id is None:
  5077. # We're dealing with an old-style attachment so initialize the
  5078. # connection so we can get the connection_info.
  5079. new_cinfo = self.volume_api.initialize_connection(context,
  5080. new_volume_id,
  5081. connector)
  5082. else:
  5083. # Check for multiattach on the new volume and if True, check to
  5084. # see if the virt driver supports multiattach.
  5085. # TODO(mriedem): This is copied from DriverVolumeBlockDevice
  5086. # and should be consolidated into some common code at some point.
  5087. vol_multiattach = new_volume.get('multiattach', False)
  5088. virt_multiattach = self.driver.capabilities.get(
  5089. 'supports_multiattach', False)
  5090. if vol_multiattach and not virt_multiattach:
  5091. raise exception.MultiattachNotSupportedByVirtDriver(
  5092. volume_id=new_volume_id)
  5093. # This is a new style attachment and the API created the new
  5094. # volume attachment and passed the id to the compute over RPC.
  5095. # At this point we need to update the new volume attachment with
  5096. # the host connector, which will give us back the new attachment
  5097. # connection_info.
  5098. new_cinfo = self.volume_api.attachment_update(
  5099. context, new_attachment_id, connector,
  5100. mountpoint)['connection_info']
  5101. if vol_multiattach:
  5102. # This will be used by the volume driver to determine the
  5103. # proper disk configuration.
  5104. new_cinfo['multiattach'] = True
  5105. old_cinfo = jsonutils.loads(bdm['connection_info'])
  5106. if old_cinfo and 'serial' not in old_cinfo:
  5107. old_cinfo['serial'] = old_volume_id
  5108. # NOTE(lyarwood): serial is not always present in the returned
  5109. # connection_info so set it if it is missing as we do in
  5110. # DriverVolumeBlockDevice.attach().
  5111. if 'serial' not in new_cinfo:
  5112. new_cinfo['serial'] = new_volume_id
  5113. return (old_cinfo, new_cinfo)
  5114. def _swap_volume(self, context, instance, bdm, connector,
  5115. old_volume_id, new_volume, resize_to,
  5116. new_attachment_id, is_cinder_migration):
  5117. new_volume_id = new_volume['id']
  5118. mountpoint = bdm['device_name']
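        # Track whether the swap failed and whether a new connection was
        # ever initialized; the finally block below uses both to decide how
        # to clean up.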
  5119. failed = False
  5120. new_cinfo = None
  5121. try:
  5122. old_cinfo, new_cinfo = self._init_volume_connection(
  5123. context, new_volume, old_volume_id, connector,
  5124. bdm, new_attachment_id, mountpoint)
  5125. # NOTE(lyarwood): The Libvirt driver, the only virt driver
  5126. # currently implementing swap_volume, will modify the contents of
  5127. # new_cinfo when connect_volume is called. This is then saved to
  5128. # the BDM in swap_volume for future use outside of this flow.
  5129. msg = ("swap_volume: Calling driver volume swap with "
  5130. "connection infos: new: %(new_cinfo)s; "
  5131. "old: %(old_cinfo)s" %
  5132. {'new_cinfo': new_cinfo, 'old_cinfo': old_cinfo})
  5133. # Both new and old info might contain password
  5134. LOG.debug(strutils.mask_password(msg), instance=instance)
  5135. self.driver.swap_volume(context, old_cinfo, new_cinfo, instance,
  5136. mountpoint, resize_to)
  5137. if new_attachment_id:
  5138. self.volume_api.attachment_complete(context, new_attachment_id)
  5139. msg = ("swap_volume: Driver volume swap returned, new "
  5140. "connection_info is now : %(new_cinfo)s" %
  5141. {'new_cinfo': new_cinfo})
  5142. LOG.debug(strutils.mask_password(msg))
  5143. except Exception as ex:
  5144. failed = True
  5145. with excutils.save_and_reraise_exception():
  5146. tb = traceback.format_exc()
  5147. compute_utils.notify_about_volume_swap(
  5148. context, instance, self.host,
  5149. fields.NotificationPhase.ERROR,
  5150. old_volume_id, new_volume_id, ex, tb)
  5151. if new_cinfo:
  5152. msg = ("Failed to swap volume %(old_volume_id)s "
  5153. "for %(new_volume_id)s")
  5154. LOG.exception(msg, {'old_volume_id': old_volume_id,
  5155. 'new_volume_id': new_volume_id},
  5156. instance=instance)
  5157. else:
  5158. msg = ("Failed to connect to volume %(volume_id)s "
  5159. "with volume at %(mountpoint)s")
  5160. LOG.exception(msg, {'volume_id': new_volume_id,
  5161. 'mountpoint': bdm['device_name']},
  5162. instance=instance)
  5163. # The API marked the volume as 'detaching' for the old volume
  5164. # so we need to roll that back so the volume goes back to
  5165. # 'in-use' state.
  5166. self.volume_api.roll_detaching(context, old_volume_id)
  5167. if new_attachment_id is None:
  5168. # The API reserved the new volume so it would be in
  5169. # 'attaching' status, so we need to unreserve it so it
  5170. # goes back to 'available' status.
  5171. self.volume_api.unreserve_volume(context, new_volume_id)
  5172. else:
  5173. # This is a new style attachment for the new volume, which
  5174. # was created in the API. We just need to delete it here
  5175. # to put the new volume back into 'available' status.
  5176. self.volume_api.attachment_delete(
  5177. context, new_attachment_id)
  5178. finally:
  5179. # TODO(mriedem): This finally block is terribly confusing and is
  5180. # trying to do too much. We should consider removing the finally
  5181. # block and move whatever needs to happen on success and failure
  5182. # into the blocks above for clarity, even if it means a bit of
  5183. # redundant code.
  5184. conn_volume = new_volume_id if failed else old_volume_id
  5185. if new_cinfo:
  5186. LOG.debug("swap_volume: removing Cinder connection "
  5187. "for volume %(volume)s", {'volume': conn_volume},
  5188. instance=instance)
  5189. if bdm.attachment_id is None:
                    # This is the pre-3.44 (old-style) attachment flow, so
                    # just terminate the connection.
  5192. self.volume_api.terminate_connection(context,
  5193. conn_volume,
  5194. connector)
  5195. else:
  5196. # This is a new style volume attachment. If we failed, then
  5197. # the new attachment was already deleted above in the
  5198. # exception block and we have nothing more to do here. If
  5199. # swap_volume was successful in the driver, then we need to
  5200. # "detach" the original attachment by deleting it.
  5201. if not failed:
  5202. self.volume_api.attachment_delete(
  5203. context, bdm.attachment_id)
            # Decide how to finish up based on whether this was a Cinder
            # initiated migration or not. For a nova initiated swap between
            # two existing volumes using new-style attachments (the
            # "volume-update" flow) the os-migrate-volume-completion
            # callback to Cinder is not needed, so skip it; the new
            # attachment record already identifies the volume to save.
  5211. if bdm.attachment_id and not is_cinder_migration:
  5212. # we don't callback to cinder
  5213. comp_ret = {'save_volume_id': new_volume_id}
  5214. else:
  5215. # NOTE(lyarwood): The following call to
  5216. # os-migrate-volume-completion returns a dict containing
  5217. # save_volume_id, this volume id has two possible values :
  5218. # 1. old_volume_id if we are migrating (retyping) volumes
  5219. # 2. new_volume_id if we are swapping between two existing
  5220. # volumes
  5221. # This volume id is later used to update the volume_id and
  5222. # connection_info['serial'] of the BDM.
  5223. comp_ret = self.volume_api.migrate_volume_completion(
  5224. context,
  5225. old_volume_id,
  5226. new_volume_id,
  5227. error=failed)
  5228. LOG.debug("swap_volume: Cinder migrate_volume_completion "
  5229. "returned: %(comp_ret)s", {'comp_ret': comp_ret},
  5230. instance=instance)
  5231. return (comp_ret, new_cinfo)
  5232. @wrap_exception()
  5233. @wrap_instance_event(prefix='compute')
  5234. @wrap_instance_fault
  5235. def swap_volume(self, context, old_volume_id, new_volume_id, instance,
  5236. new_attachment_id):
  5237. """Swap volume for an instance."""
  5238. context = context.elevated()
  5239. compute_utils.notify_about_volume_swap(
  5240. context, instance, self.host,
  5241. fields.NotificationPhase.START,
  5242. old_volume_id, new_volume_id)
  5243. bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
  5244. context, old_volume_id, instance.uuid)
  5245. connector = self.driver.get_volume_connector(instance)
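        # resize_to stays 0 unless the new volume is larger than the old
        # one, in which case the driver is expected to grow the disk to the
        # new size as part of the swap.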
  5246. resize_to = 0
  5247. old_volume = self.volume_api.get(context, old_volume_id)
        # This is a tightly-coupled state check of what's going on inside
        # cinder, but it is needed while both old (v1/v2) and new style
        # (v3.44) attachments are supported. Once old style attachments are
        # dropped, the cinder-initiated swap volume API flows could be
        # cleaned up.
        is_cinder_migration = old_volume['status'] in ('retyping',
                                                       'migrating')
  5256. old_vol_size = old_volume['size']
  5257. new_volume = self.volume_api.get(context, new_volume_id)
  5258. new_vol_size = new_volume['size']
  5259. if new_vol_size > old_vol_size:
  5260. resize_to = new_vol_size
  5261. LOG.info('Swapping volume %(old_volume)s for %(new_volume)s',
  5262. {'old_volume': old_volume_id, 'new_volume': new_volume_id},
  5263. instance=instance)
  5264. comp_ret, new_cinfo = self._swap_volume(context,
  5265. instance,
  5266. bdm,
  5267. connector,
  5268. old_volume_id,
  5269. new_volume,
  5270. resize_to,
  5271. new_attachment_id,
  5272. is_cinder_migration)
  5273. # NOTE(lyarwood): Update the BDM with the modified new_cinfo and
  5274. # correct volume_id returned by Cinder.
  5275. save_volume_id = comp_ret['save_volume_id']
  5276. new_cinfo['serial'] = save_volume_id
  5277. values = {
  5278. 'connection_info': jsonutils.dumps(new_cinfo),
  5279. 'source_type': 'volume',
  5280. 'destination_type': 'volume',
  5281. 'snapshot_id': None,
  5282. 'volume_id': save_volume_id,
  5283. 'no_device': None}
  5284. if resize_to:
  5285. values['volume_size'] = resize_to
  5286. if new_attachment_id is not None:
  5287. # This was a volume swap for a new-style attachment so we
  5288. # need to update the BDM attachment_id for the new attachment.
  5289. values['attachment_id'] = new_attachment_id
  5290. LOG.debug("swap_volume: Updating volume %(volume_id)s BDM record with "
  5291. "%(updates)s", {'volume_id': bdm.volume_id,
  5292. 'updates': values},
  5293. instance=instance)
  5294. bdm.update(values)
  5295. bdm.save()
  5296. compute_utils.notify_about_volume_swap(
  5297. context, instance, self.host,
  5298. fields.NotificationPhase.END,
  5299. old_volume_id, new_volume_id)
  5300. @wrap_exception()
  5301. def remove_volume_connection(self, context, volume_id, instance):
  5302. """Remove the volume connection on this host
  5303. Detach the volume from this instance on this host, and if this is
  5304. the cinder v2 flow, call cinder to terminate the connection.
  5305. """
  5306. try:
  5307. bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
  5308. context, volume_id, instance.uuid)
  5309. driver_bdm = driver_block_device.convert_volume(bdm)
  5310. driver_bdm.driver_detach(context, instance,
  5311. self.volume_api, self.driver)
  5312. if bdm.attachment_id is None:
  5313. # cinder v2 api flow
  5314. connector = self.driver.get_volume_connector(instance)
  5315. self.volume_api.terminate_connection(context, volume_id,
  5316. connector)
  5317. except exception.NotFound:
  5318. pass
  5319. def _deallocate_port_for_instance(self, context, instance, port_id,
  5320. raise_on_failure=False):
  5321. try:
  5322. result = self.network_api.deallocate_port_for_instance(
  5323. context, instance, port_id)
  5324. __, port_allocation = result
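# port_allocation, when set, describes the placement resources that
# were consumed by the detached port; they are removed from the
# instance allocation below. The first tuple element is unused here.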
  5325. except Exception as ex:
  5326. with excutils.save_and_reraise_exception(
  5327. reraise=raise_on_failure):
  5328. LOG.warning('Failed to deallocate port %(port_id)s '
  5329. 'for instance. Error: %(error)s',
  5330. {'port_id': port_id, 'error': ex},
  5331. instance=instance)
  5332. else:
  5333. if port_allocation:
  5334. # Deallocate the resources in placement that were used by the
  5335. # detached port.
  5336. try:
  5337. client = self.reportclient
  5338. client.remove_resources_from_instance_allocation(
  5339. context, instance.uuid, port_allocation)
  5340. except Exception as ex:
  5341. # We always raise here as it is not a race condition where
  5342. # somebody has already deleted the port we want to cleanup.
  5343. # Here we see that the port exists, the allocation exists,
  5344. # but we cannot clean it up so we will actually leak
  5345. # allocations.
  5346. with excutils.save_and_reraise_exception():
  5347. LOG.warning('Failed to remove resource allocation '
  5348. 'of port %(port_id)s for instance. Error: '
  5349. '%(error)s',
  5350. {'port_id': port_id, 'error': ex},
  5351. instance=instance)
  5352. @wrap_exception()
  5353. @wrap_instance_event(prefix='compute')
  5354. @wrap_instance_fault
  5355. def attach_interface(self, context, instance, network_id, port_id,
  5356. requested_ip, tag):
  5357. """Use hotplug to add an network adapter to an instance."""
  5358. if not self.driver.capabilities.get('supports_attach_interface',
  5359. False):
  5360. raise exception.AttachInterfaceNotSupported(
  5361. instance_uuid=instance.uuid)
  5362. if (tag and not
  5363. self.driver.capabilities.get('supports_tagged_attach_interface',
  5364. False)):
  5365. raise exception.NetworkInterfaceTaggedAttachNotSupported()
  5366. compute_utils.notify_about_instance_action(
  5367. context, instance, self.host,
  5368. action=fields.NotificationAction.INTERFACE_ATTACH,
  5369. phase=fields.NotificationPhase.START)
  5370. bind_host_id = self.driver.network_binding_host_id(context, instance)
  5371. network_info = self.network_api.allocate_port_for_instance(
  5372. context, instance, port_id, network_id, requested_ip,
  5373. bind_host_id=bind_host_id, tag=tag)
  5374. if len(network_info) != 1:
  5375. LOG.error('allocate_port_for_instance returned %(ports)s '
  5376. 'ports', {'ports': len(network_info)})
  5377. # TODO(elod.illes): an instance.interface_attach.error notification
  5378. # should be sent here
  5379. raise exception.InterfaceAttachFailed(
  5380. instance_uuid=instance.uuid)
  5381. image_meta = objects.ImageMeta.from_instance(instance)
  5382. try:
  5383. self.driver.attach_interface(context, instance, image_meta,
  5384. network_info[0])
  5385. except exception.NovaException as ex:
  5386. port_id = network_info[0].get('id')
  5387. LOG.warning("attach interface failed , try to deallocate "
  5388. "port %(port_id)s, reason: %(msg)s",
  5389. {'port_id': port_id, 'msg': ex},
  5390. instance=instance)
  5391. self._deallocate_port_for_instance(context, instance, port_id)
  5392. tb = traceback.format_exc()
  5393. compute_utils.notify_about_instance_action(
  5394. context, instance, self.host,
  5395. action=fields.NotificationAction.INTERFACE_ATTACH,
  5396. phase=fields.NotificationPhase.ERROR,
  5397. exception=ex, tb=tb)
  5398. raise exception.InterfaceAttachFailed(
  5399. instance_uuid=instance.uuid)
  5400. compute_utils.notify_about_instance_action(
  5401. context, instance, self.host,
  5402. action=fields.NotificationAction.INTERFACE_ATTACH,
  5403. phase=fields.NotificationPhase.END)
  5404. return network_info[0]
  5405. @wrap_exception()
  5406. @wrap_instance_event(prefix='compute')
  5407. @wrap_instance_fault
  5408. def detach_interface(self, context, instance, port_id):
  5409. """Detach a network adapter from an instance."""
  5410. network_info = instance.info_cache.network_info
  5411. condemned = None
  5412. for vif in network_info:
  5413. if vif['id'] == port_id:
  5414. condemned = vif
  5415. break
  5416. if condemned is None:
  5417. raise exception.PortNotFound(_("Port %s is not "
  5418. "attached") % port_id)
  5419. compute_utils.notify_about_instance_action(
  5420. context, instance, self.host,
  5421. action=fields.NotificationAction.INTERFACE_DETACH,
  5422. phase=fields.NotificationPhase.START)
  5423. try:
  5424. self.driver.detach_interface(context, instance, condemned)
  5425. except exception.NovaException as ex:
  5426. # If the instance was deleted before the interface was detached,
  5427. # just log it at debug.
  5428. log_level = (logging.DEBUG
  5429. if isinstance(ex, exception.InstanceNotFound)
  5430. else logging.WARNING)
  5431. LOG.log(log_level,
  5432. "Detach interface failed, port_id=%(port_id)s, reason: "
  5433. "%(msg)s", {'port_id': port_id, 'msg': ex},
  5434. instance=instance)
  5435. raise exception.InterfaceDetachFailed(instance_uuid=instance.uuid)
  5436. else:
  5437. self._deallocate_port_for_instance(
  5438. context, instance, port_id, raise_on_failure=True)
  5439. compute_utils.notify_about_instance_action(
  5440. context, instance, self.host,
  5441. action=fields.NotificationAction.INTERFACE_DETACH,
  5442. phase=fields.NotificationPhase.END)
  5443. def _get_compute_info(self, context, host):
  5444. return objects.ComputeNode.get_first_node_by_host_for_old_compat(
  5445. context, host)
  5446. @wrap_exception()
  5447. def check_instance_shared_storage(self, ctxt, instance, data):
  5448. """Check if the instance files are shared
  5449. :param ctxt: security context
  5450. :param instance: dict of instance data
  5451. :param data: result of driver.check_instance_shared_storage_local
5452. Returns True if the instance disks are located on shared storage and
5453. False otherwise.
  5454. """
  5455. return self.driver.check_instance_shared_storage_remote(ctxt, data)
  5456. @wrap_exception()
  5457. @wrap_instance_event(prefix='compute')
  5458. @wrap_instance_fault
  5459. def check_can_live_migrate_destination(self, ctxt, instance,
  5460. block_migration, disk_over_commit):
  5461. """Check if it is possible to execute live migration.
  5462. This runs checks on the destination host, and then calls
  5463. back to the source host to check the results.
5464. :param ctxt: security context
  5465. :param instance: dict of instance data
  5466. :param block_migration: if true, prepare for block migration
  5467. if None, calculate it in driver
  5468. :param disk_over_commit: if true, allow disk over commit
  5469. if None, ignore disk usage checking
  5470. :returns: a dict containing migration info
  5471. """
  5472. src_compute_info = obj_base.obj_to_primitive(
  5473. self._get_compute_info(ctxt, instance.host))
  5474. dst_compute_info = obj_base.obj_to_primitive(
  5475. self._get_compute_info(ctxt, CONF.host))
  5476. dest_check_data = self.driver.check_can_live_migrate_destination(ctxt,
  5477. instance, src_compute_info, dst_compute_info,
  5478. block_migration, disk_over_commit)
  5479. LOG.debug('destination check data is %s', dest_check_data)
  5480. try:
  5481. migrate_data = self.compute_rpcapi.\
  5482. check_can_live_migrate_source(ctxt, instance,
  5483. dest_check_data)
  5484. finally:
  5485. self.driver.cleanup_live_migration_destination_check(ctxt,
  5486. dest_check_data)
  5487. return migrate_data
  5488. @wrap_exception()
  5489. @wrap_instance_event(prefix='compute')
  5490. @wrap_instance_fault
  5491. def check_can_live_migrate_source(self, ctxt, instance, dest_check_data):
  5492. """Check if it is possible to execute live migration.
  5493. This checks if the live migration can succeed, based on the
  5494. results from check_can_live_migrate_destination.
  5495. :param ctxt: security context
  5496. :param instance: dict of instance data
  5497. :param dest_check_data: result of check_can_live_migrate_destination
  5498. :returns: a dict containing migration info
  5499. """
  5500. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  5501. ctxt, instance.uuid)
  5502. is_volume_backed = compute_utils.is_volume_backed_instance(
  5503. ctxt, instance, bdms)
  5504. dest_check_data.is_volume_backed = is_volume_backed
  5505. block_device_info = self._get_instance_block_device_info(
  5506. ctxt, instance, refresh_conn_info=False, bdms=bdms)
  5507. result = self.driver.check_can_live_migrate_source(ctxt, instance,
  5508. dest_check_data,
  5509. block_device_info)
  5510. LOG.debug('source check data is %s', result)
  5511. return result
  5512. @wrap_exception()
  5513. @wrap_instance_event(prefix='compute')
  5514. @wrap_instance_fault
  5515. def pre_live_migration(self, context, instance, block_migration, disk,
  5516. migrate_data):
  5517. """Preparations for live migration at dest host.
  5518. :param context: security context
  5519. :param instance: dict of instance data
  5520. :param block_migration: if true, prepare for block migration
  5521. :param disk: disk info of instance
  5522. :param migrate_data: A dict or LiveMigrateData object holding data
  5523. required for live migration without shared
  5524. storage.
  5525. :returns: migrate_data containing additional migration info
  5526. """
  5527. LOG.debug('pre_live_migration data is %s', migrate_data)
  5528. migrate_data.old_vol_attachment_ids = {}
  5529. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  5530. context, instance.uuid)
  5531. network_info = self.network_api.get_instance_nw_info(context, instance)
  5532. self._notify_about_instance_usage(
  5533. context, instance, "live_migration.pre.start",
  5534. network_info=network_info)
  5535. compute_utils.notify_about_instance_action(
  5536. context, instance, self.host,
  5537. action=fields.NotificationAction.LIVE_MIGRATION_PRE,
  5538. phase=fields.NotificationPhase.START, bdms=bdms)
  5539. connector = self.driver.get_volume_connector(instance)
  5540. try:
  5541. for bdm in bdms:
  5542. if bdm.is_volume and bdm.attachment_id is not None:
  5543. # This bdm uses the new cinder v3.44 API.
  5544. # We will create a new attachment for this
  5545. # volume on this migration destination host. The old
  5546. # attachment will be deleted on the source host
  5547. # when the migration succeeds. The old attachment_id
  5548. # is stored in dict with the key being the bdm.volume_id
  5549. # so it can be restored on rollback.
  5550. #
  5551. # Also note that attachment_update is not needed as we
  5552. # are providing the connector in the create call.
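# Illustrative shape of the rollback mapping populated below
# (placeholder values):
#   migrate_data.old_vol_attachment_ids = {
#       '<bdm.volume_id>': '<previous attachment_id>'}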
  5553. attach_ref = self.volume_api.attachment_create(
  5554. context, bdm.volume_id, bdm.instance_uuid,
  5555. connector=connector, mountpoint=bdm.device_name)
  5556. # save current attachment so we can detach it on success,
  5557. # or restore it on a rollback.
  5558. # NOTE(mdbooth): This data is no longer used by the source
  5559. # host since change I0390c9ff. We can't remove it until we
  5560. # are sure the source host has been upgraded.
  5561. migrate_data.old_vol_attachment_ids[bdm.volume_id] = \
  5562. bdm.attachment_id
  5563. # update the bdm with the new attachment_id.
  5564. bdm.attachment_id = attach_ref['id']
  5565. bdm.save()
  5566. block_device_info = self._get_instance_block_device_info(
  5567. context, instance, refresh_conn_info=True,
  5568. bdms=bdms)
  5569. # The driver pre_live_migration will plug vifs on the host. We call
  5570. # plug_vifs before calling ensure_filtering_rules_for_instance, to
  5571. # ensure bridge is set up.
  5572. migrate_data = self.driver.pre_live_migration(context,
  5573. instance,
  5574. block_device_info,
  5575. network_info,
  5576. disk,
  5577. migrate_data)
  5578. LOG.debug('driver pre_live_migration data is %s', migrate_data)
  5579. # driver.pre_live_migration is what plugs vifs on the destination
  5580. # host so now we can set the wait_for_vif_plugged flag in the
  5581. # migrate_data object which the source compute will use to
  5582. # determine if it should wait for a 'network-vif-plugged' event
  5583. # from neutron before starting the actual guest transfer in the
  5584. # hypervisor
  5585. migrate_data.wait_for_vif_plugged = (
  5586. CONF.compute.live_migration_wait_for_vif_plug)
  5587. # NOTE(tr3buchet): setup networks on destination host
  5588. self.network_api.setup_networks_on_host(context, instance,
  5589. self.host)
5590. # Create filters for hypervisors and firewalls.
5591. # An example is nova-instance-instance-xxx,
5592. # which is written to libvirt.xml (check "virsh nwfilter-list").
5593. # This nwfilter is necessary on the destination host.
5594. # In addition, this method creates the filtering rules
5595. # on the destination host.
  5596. self.driver.ensure_filtering_rules_for_instance(instance,
  5597. network_info)
  5598. except Exception:
  5599. # If we raise, migrate_data with the updated attachment ids
  5600. # will not be returned to the source host for rollback.
  5601. # So we need to rollback new attachments here.
  5602. with excutils.save_and_reraise_exception():
  5603. old_attachments = migrate_data.old_vol_attachment_ids
  5604. for bdm in bdms:
  5605. if (bdm.is_volume and bdm.attachment_id is not None and
  5606. bdm.volume_id in old_attachments):
  5607. self.volume_api.attachment_delete(context,
  5608. bdm.attachment_id)
  5609. bdm.attachment_id = old_attachments[bdm.volume_id]
  5610. bdm.save()
  5611. # Volume connections are complete, tell cinder that all the
  5612. # attachments have completed.
  5613. for bdm in bdms:
  5614. if bdm.is_volume and bdm.attachment_id is not None:
  5615. self.volume_api.attachment_complete(context,
  5616. bdm.attachment_id)
  5617. self._notify_about_instance_usage(
  5618. context, instance, "live_migration.pre.end",
  5619. network_info=network_info)
  5620. compute_utils.notify_about_instance_action(
  5621. context, instance, self.host,
  5622. action=fields.NotificationAction.LIVE_MIGRATION_PRE,
  5623. phase=fields.NotificationPhase.END, bdms=bdms)
  5624. LOG.debug('pre_live_migration result data is %s', migrate_data)
  5625. return migrate_data
  5626. @staticmethod
  5627. def _neutron_failed_live_migration_callback(event_name, instance):
  5628. msg = ('Neutron reported failure during live migration '
  5629. 'with %(event)s for instance %(uuid)s')
  5630. msg_args = {'event': event_name, 'uuid': instance.uuid}
  5631. if CONF.vif_plugging_is_fatal:
  5632. raise exception.VirtualInterfacePlugException(msg % msg_args)
  5633. LOG.error(msg, msg_args)
  5634. @staticmethod
  5635. def _get_neutron_events_for_live_migration(instance):
  5636. # We don't generate events if CONF.vif_plugging_timeout=0
  5637. # meaning that the operator disabled using them.
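# The returned value is a list of (event_name, tag) tuples, e.g.
# (illustrative): [('network-vif-plugged', '<port uuid>'), ...]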
  5638. if CONF.vif_plugging_timeout and utils.is_neutron():
  5639. return [('network-vif-plugged', vif['id'])
  5640. for vif in instance.get_network_info()]
  5641. else:
  5642. return []
  5643. def _cleanup_pre_live_migration(self, context, dest, instance,
  5644. migration, migrate_data):
  5645. """Helper method for when pre_live_migration fails
  5646. Sets the migration status to "error" and rolls back the live migration
  5647. setup on the destination host.
  5648. :param context: The user request context.
  5649. :type context: nova.context.RequestContext
  5650. :param dest: The live migration destination hostname.
  5651. :type dest: str
  5652. :param instance: The instance being live migrated.
  5653. :type instance: nova.objects.Instance
  5654. :param migration: The migration record tracking this live migration.
  5655. :type migration: nova.objects.Migration
  5656. :param migrate_data: Data about the live migration, populated from
  5657. the destination host.
  5658. :type migrate_data: Subclass of nova.objects.LiveMigrateData
  5659. """
  5660. self._set_migration_status(migration, 'error')
  5661. # Make sure we set this for _rollback_live_migration()
  5662. # so it can find it, as expected if it was called later
  5663. migrate_data.migration = migration
  5664. self._rollback_live_migration(context, instance, dest,
  5665. migrate_data)
  5666. def _do_live_migration(self, context, dest, instance, block_migration,
  5667. migration, migrate_data):
  5668. # NOTE(danms): We should enhance the RT to account for migrations
  5669. # and use the status field to denote when the accounting has been
  5670. # done on source/destination. For now, this is just here for status
  5671. # reporting
  5672. self._set_migration_status(migration, 'preparing')
  5673. source_bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  5674. context, instance.uuid)
  5675. class _BreakWaitForInstanceEvent(Exception):
  5676. """Used as a signal to stop waiting for the network-vif-plugged
  5677. event when we discover that
  5678. [compute]/live_migration_wait_for_vif_plug is not set on the
  5679. destination.
  5680. """
  5681. pass
  5682. events = self._get_neutron_events_for_live_migration(instance)
  5683. try:
  5684. if ('block_migration' in migrate_data and
  5685. migrate_data.block_migration):
  5686. block_device_info = self._get_instance_block_device_info(
  5687. context, instance, bdms=source_bdms)
  5688. disk = self.driver.get_instance_disk_info(
  5689. instance, block_device_info=block_device_info)
  5690. else:
  5691. disk = None
  5692. deadline = CONF.vif_plugging_timeout
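# deadline is in seconds; a vif_plugging_timeout of 0 means the
# operator disabled waiting for vif plugging events entirely.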
  5693. error_cb = self._neutron_failed_live_migration_callback
  5694. # In order to avoid a race with the vif plugging that the virt
  5695. # driver does on the destination host, we register our events
  5696. # to wait for before calling pre_live_migration. Then if the
  5697. # dest host reports back that we shouldn't wait, we can break
  5698. # out of the context manager using _BreakWaitForInstanceEvent.
  5699. with self.virtapi.wait_for_instance_event(
  5700. instance, events, deadline=deadline,
  5701. error_callback=error_cb):
  5702. with timeutils.StopWatch() as timer:
  5703. migrate_data = self.compute_rpcapi.pre_live_migration(
  5704. context, instance,
  5705. block_migration, disk, dest, migrate_data)
  5706. LOG.info('Took %0.2f seconds for pre_live_migration on '
  5707. 'destination host %s.',
  5708. timer.elapsed(), dest, instance=instance)
  5709. wait_for_vif_plugged = (
  5710. 'wait_for_vif_plugged' in migrate_data and
  5711. migrate_data.wait_for_vif_plugged)
  5712. if events and not wait_for_vif_plugged:
  5713. raise _BreakWaitForInstanceEvent
  5714. except _BreakWaitForInstanceEvent:
  5715. if events:
  5716. LOG.debug('Not waiting for events after pre_live_migration: '
  5717. '%s. ', events, instance=instance)
  5718. # This is a bit weird, but we need to clear sys.exc_info() so that
  5719. # oslo.log formatting does not inadvertently use it later if an
  5720. # error message is logged without an explicit exc_info. This is
  5721. # only a problem with python 2.
  5722. if six.PY2:
  5723. sys.exc_clear()
  5724. except exception.VirtualInterfacePlugException:
  5725. with excutils.save_and_reraise_exception():
  5726. LOG.exception('Failed waiting for network virtual interfaces '
  5727. 'to be plugged on the destination host %s.',
  5728. dest, instance=instance)
  5729. self._cleanup_pre_live_migration(
  5730. context, dest, instance, migration, migrate_data)
  5731. except eventlet.timeout.Timeout:
  5732. # We only get here if wait_for_vif_plugged is True which means
  5733. # live_migration_wait_for_vif_plug=True on the destination host.
  5734. msg = (
  5735. 'Timed out waiting for events: %(events)s. If these timeouts '
  5736. 'are a persistent issue it could mean the networking backend '
  5737. 'on host %(dest)s does not support sending these events '
5738. 'unless there are port binding host changes, which do not '
  5739. 'happen at this point in the live migration process. You may '
  5740. 'need to disable the live_migration_wait_for_vif_plug option '
  5741. 'on host %(dest)s.')
  5742. subs = {'events': events, 'dest': dest}
  5743. LOG.warning(msg, subs, instance=instance)
  5744. if CONF.vif_plugging_is_fatal:
  5745. self._cleanup_pre_live_migration(
  5746. context, dest, instance, migration, migrate_data)
  5747. raise exception.MigrationError(reason=msg % subs)
  5748. except Exception:
  5749. with excutils.save_and_reraise_exception():
  5750. LOG.exception('Pre live migration failed at %s',
  5751. dest, instance=instance)
  5752. self._cleanup_pre_live_migration(
  5753. context, dest, instance, migration, migrate_data)
5754. # NOTE(Kevin_Zheng): Pop the migration from the waiting queue.
5755. # If it exists in the queue, then we are good to move on; if
5756. # not, some other process must have aborted it and we should
5757. # roll back.
  5758. try:
  5759. self._waiting_live_migrations.pop(instance.uuid)
  5760. except KeyError:
  5761. LOG.debug('Migration %s aborted by another process, rollback.',
  5762. migration.uuid, instance=instance)
  5763. migrate_data.migration = migration
  5764. self._rollback_live_migration(context, instance, dest,
  5765. migrate_data, 'cancelled')
  5766. self._notify_live_migrate_abort_end(context, instance)
  5767. return
  5768. self._set_migration_status(migration, 'running')
  5769. if migrate_data:
  5770. migrate_data.migration = migration
  5771. # NOTE(mdbooth): pre_live_migration will update connection_info and
  5772. # attachment_id on all volume BDMS to reflect the new destination
  5773. # host attachment. We fetch BDMs before that to retain connection_info
  5774. # and attachment_id relating to the source host for post migration
  5775. # cleanup.
  5776. post_live_migration = functools.partial(self._post_live_migration,
  5777. source_bdms=source_bdms)
  5778. LOG.debug('live_migration data is %s', migrate_data)
  5779. try:
  5780. self.driver.live_migration(context, instance, dest,
  5781. post_live_migration,
  5782. self._rollback_live_migration,
  5783. block_migration, migrate_data)
  5784. except Exception:
  5785. LOG.exception('Live migration failed.', instance=instance)
  5786. with excutils.save_and_reraise_exception():
  5787. # Put instance and migration into error state,
5788. # as it's almost certainly too late to roll back
  5789. self._set_migration_status(migration, 'error')
  5790. # first refresh instance as it may have got updated by
  5791. # post_live_migration_at_destination
  5792. instance.refresh()
  5793. self._set_instance_obj_error_state(context, instance,
  5794. clean_task_state=True)
  5795. @wrap_exception()
  5796. @wrap_instance_event(prefix='compute')
  5797. @wrap_instance_fault
  5798. def live_migration(self, context, dest, instance, block_migration,
  5799. migration, migrate_data):
  5800. """Executing live migration.
  5801. :param context: security context
  5802. :param dest: destination host
  5803. :param instance: a nova.objects.instance.Instance object
  5804. :param block_migration: if true, prepare for block migration
  5805. :param migration: an nova.objects.Migration object
  5806. :param migrate_data: implementation specific params
  5807. """
  5808. self._set_migration_status(migration, 'queued')
  5809. # NOTE(Kevin_Zheng): Submit the live_migration job to the pool and
  5810. # put the returned Future object into dict mapped with migration.uuid
  5811. # in order to be able to track and abort it in the future.
  5812. self._waiting_live_migrations[instance.uuid] = (None, None)
  5813. try:
  5814. future = self._live_migration_executor.submit(
  5815. self._do_live_migration, context, dest, instance,
  5816. block_migration, migration, migrate_data)
  5817. self._waiting_live_migrations[instance.uuid] = (migration, future)
  5818. except RuntimeError:
  5819. # GreenThreadPoolExecutor.submit will raise RuntimeError if the
  5820. # pool is shutdown, which happens in
  5821. # _cleanup_live_migrations_in_pool.
  5822. LOG.info('Migration %s failed to submit as the compute service '
  5823. 'is shutting down.', migration.uuid, instance=instance)
  5824. self._set_migration_status(migration, 'error')
  5825. raise exception.LiveMigrationNotSubmitted(
  5826. migration_uuid=migration.uuid, instance_uuid=instance.uuid)
  5827. @wrap_exception()
  5828. @wrap_instance_event(prefix='compute')
  5829. @wrap_instance_fault
  5830. def live_migration_force_complete(self, context, instance):
  5831. """Force live migration to complete.
  5832. :param context: Security context
  5833. :param instance: The instance that is being migrated
  5834. """
  5835. self._notify_about_instance_usage(
  5836. context, instance, 'live.migration.force.complete.start')
  5837. compute_utils.notify_about_instance_action(
  5838. context, instance, self.host,
  5839. action=fields.NotificationAction.LIVE_MIGRATION_FORCE_COMPLETE,
  5840. phase=fields.NotificationPhase.START)
  5841. self.driver.live_migration_force_complete(instance)
  5842. self._notify_about_instance_usage(
  5843. context, instance, 'live.migration.force.complete.end')
  5844. compute_utils.notify_about_instance_action(
  5845. context, instance, self.host,
  5846. action=fields.NotificationAction.LIVE_MIGRATION_FORCE_COMPLETE,
  5847. phase=fields.NotificationPhase.END)
  5848. def _notify_live_migrate_abort_end(self, context, instance):
  5849. self._notify_about_instance_usage(
  5850. context, instance, 'live.migration.abort.end')
  5851. compute_utils.notify_about_instance_action(
  5852. context, instance, self.host,
  5853. action=fields.NotificationAction.LIVE_MIGRATION_ABORT,
  5854. phase=fields.NotificationPhase.END)
  5855. @wrap_exception()
  5856. @wrap_instance_event(prefix='compute')
  5857. @wrap_instance_fault
  5858. def live_migration_abort(self, context, instance, migration_id):
  5859. """Abort an in-progress live migration.
  5860. :param context: Security context
  5861. :param instance: The instance that is being migrated
  5862. :param migration_id: ID of in-progress live migration
  5863. """
  5864. self._notify_about_instance_usage(
  5865. context, instance, 'live.migration.abort.start')
  5866. compute_utils.notify_about_instance_action(
  5867. context, instance, self.host,
  5868. action=fields.NotificationAction.LIVE_MIGRATION_ABORT,
  5869. phase=fields.NotificationPhase.START)
  5870. # NOTE(Kevin_Zheng): Pop the migration out from the queue, this might
  5871. # lead to 3 scenarios:
  5872. # 1. The selected migration is still in queue, and the future.cancel()
  5873. # succeed, then the abort action is succeed, mark the migration
  5874. # status to 'cancelled'.
  5875. # 2. The selected migration is still in queue, but the future.cancel()
  5876. # failed, then the _do_live_migration() has started executing, and
  5877. # the migration status is 'preparing', then we just pop it from the
  5878. # queue, and the migration process will handle it later. And the
  5879. # migration status couldn't be 'running' in this scenario because
  5880. # if _do_live_migration has started executing and we've already
  5881. # popped it from the queue and set the migration status to
  5882. # 'running' at this point, popping it here will raise KeyError at
  5883. # which point we check if it's running and if so, we abort the old
  5884. # way.
  5885. # 3. The selected migration is not in the queue, then the migration
  5886. # status is 'running', let the driver handle it.
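# In short (illustrative summary of the scenarios above): 1) cancel
# succeeds and we mark the migration 'cancelled' here, 2) the
# KeyError in _do_live_migration triggers the rollback there,
# 3) the KeyError below leads to driver.live_migration_abort().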
  5887. try:
  5888. migration, future = (
  5889. self._waiting_live_migrations.pop(instance.uuid))
  5890. if future and future.cancel():
  5891. # If we got here, we've successfully aborted the queued
  5892. # migration and _do_live_migration won't run so we need
  5893. # to set the migration status to cancelled and send the
  5894. # notification. If Future.cancel() fails, it means
  5895. # _do_live_migration is running and the migration status
  5896. # is preparing, and _do_live_migration() itself will attempt
  5897. # to pop the queued migration, hit a KeyError, and rollback,
  5898. # set the migration to cancelled and send the
  5899. # live.migration.abort.end notification.
  5900. self._set_migration_status(migration, 'cancelled')
  5901. except KeyError:
  5902. migration = objects.Migration.get_by_id(context, migration_id)
  5903. if migration.status != 'running':
  5904. raise exception.InvalidMigrationState(
  5905. migration_id=migration_id, instance_uuid=instance.uuid,
  5906. state=migration.status, method='abort live migration')
  5907. self.driver.live_migration_abort(instance)
  5908. self._notify_live_migrate_abort_end(context, instance)
  5909. def _live_migration_cleanup_flags(self, migrate_data):
  5910. """Determine whether disks or instance path need to be cleaned up after
  5911. live migration (at source on success, at destination on rollback)
  5912. Block migration needs empty image at destination host before migration
  5913. starts, so if any failure occurs, any empty images has to be deleted.
  5914. Also Volume backed live migration w/o shared storage needs to delete
  5915. newly created instance-xxx dir on the destination as a part of its
  5916. rollback process
  5917. :param migrate_data: implementation specific data
  5918. :returns: (bool, bool) -- do_cleanup, destroy_disks
  5919. """
  5920. # NOTE(pkoniszewski): block migration specific params are set inside
  5921. # migrate_data objects for drivers that expose block live migration
  5922. # information (i.e. Libvirt, Xenapi and HyperV). For other drivers
  5923. # cleanup is not needed.
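# Illustrative outcomes of the rules below: a libvirt block migration
# with no shared instance path and no shared block storage yields
# (do_cleanup=True, destroy_disks=True), while a libvirt migration on
# fully shared storage yields (False, False).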
  5924. do_cleanup = False
  5925. destroy_disks = False
  5926. if isinstance(migrate_data, migrate_data_obj.LibvirtLiveMigrateData):
5927. # No instance booting at source host, but the instance dir
5928. # must be deleted in order to prepare for the next block
5929. # migration or the next live migration w/o shared
5930. # storage.
  5931. do_cleanup = not migrate_data.is_shared_instance_path
  5932. destroy_disks = not migrate_data.is_shared_block_storage
  5933. elif isinstance(migrate_data, migrate_data_obj.XenapiLiveMigrateData):
  5934. do_cleanup = migrate_data.block_migration
  5935. destroy_disks = migrate_data.block_migration
  5936. elif isinstance(migrate_data, migrate_data_obj.HyperVLiveMigrateData):
  5937. # NOTE(claudiub): We need to cleanup any zombie Planned VM.
  5938. do_cleanup = True
  5939. destroy_disks = not migrate_data.is_shared_instance_path
  5940. return (do_cleanup, destroy_disks)
  5941. @wrap_exception()
  5942. @wrap_instance_fault
  5943. def _post_live_migration(self, ctxt, instance, dest,
  5944. block_migration=False, migrate_data=None,
  5945. source_bdms=None):
  5946. """Post operations for live migration.
5947. This method is called from live_migration
5948. and mainly updates database records.
  5949. :param ctxt: security context
  5950. :param instance: instance dict
  5951. :param dest: destination host
  5952. :param block_migration: if true, prepare for block migration
5953. :param migrate_data: if not None, it is a dict which has data
5954. required for live migration without shared storage
5955. :param source_bdms: BDMs prior to modification by the destination
5956. compute host. Set by _do_live_migration and not
5957. part of the callback interface, so this is never
5958. None
  5959. """
5960. LOG.info('_post_live_migration() is started.',
  5961. instance=instance)
  5962. # Cleanup source host post live-migration
  5963. block_device_info = self._get_instance_block_device_info(
  5964. ctxt, instance, bdms=source_bdms)
  5965. self.driver.post_live_migration(ctxt, instance, block_device_info,
  5966. migrate_data)
  5967. # Detaching volumes.
  5968. connector = self.driver.get_volume_connector(instance)
  5969. for bdm in source_bdms:
  5970. if bdm.is_volume:
  5971. # Detaching volumes is a call to an external API that can fail.
  5972. # If it does, we need to handle it gracefully so that the call
  5973. # to post_live_migration_at_destination - where we set instance
  5974. # host and task state - still happens. We need to rethink the
  5975. # current approach of setting instance host and task state
  5976. # AFTER a whole bunch of things that could fail in unhandled
  5977. # ways, but that is left as a TODO(artom).
  5978. try:
  5979. if bdm.attachment_id is None:
  5980. # Prior to cinder v3.44:
  5981. # We don't want to actually mark the volume detached,
  5982. # or delete the bdm, just remove the connection from
  5983. # this host.
  5984. #
  5985. # remove the volume connection without detaching from
  5986. # hypervisor because the instance is not running
  5987. # anymore on the current host
  5988. self.volume_api.terminate_connection(ctxt,
  5989. bdm.volume_id,
  5990. connector)
  5991. else:
  5992. # cinder v3.44 api flow - delete the old attachment
  5993. # for the source host
  5994. self.volume_api.attachment_delete(ctxt,
  5995. bdm.attachment_id)
  5996. except Exception as e:
  5997. if bdm.attachment_id is None:
  5998. LOG.error('Connection for volume %s not terminated on '
  5999. 'source host %s during post_live_migration: '
  6000. '%s', bdm.volume_id, self.host,
  6001. six.text_type(e), instance=instance)
  6002. else:
  6003. LOG.error('Volume attachment %s not deleted on source '
  6004. 'host %s during post_live_migration: %s',
  6005. bdm.attachment_id, self.host,
  6006. six.text_type(e), instance=instance)
  6007. # Releasing vlan.
  6008. # (not necessary in current implementation?)
  6009. network_info = self.network_api.get_instance_nw_info(ctxt, instance)
  6010. self._notify_about_instance_usage(ctxt, instance,
  6011. "live_migration._post.start",
  6012. network_info=network_info)
  6013. compute_utils.notify_about_instance_action(
  6014. ctxt, instance, self.host,
  6015. action=fields.NotificationAction.LIVE_MIGRATION_POST,
  6016. phase=fields.NotificationPhase.START)
  6017. # Releasing security group ingress rule.
  6018. LOG.debug('Calling driver.unfilter_instance from _post_live_migration',
  6019. instance=instance)
  6020. self.driver.unfilter_instance(instance,
  6021. network_info)
  6022. migration = {'source_compute': self.host,
  6023. 'dest_compute': dest, }
  6024. # For neutron, migrate_instance_start will activate the destination
  6025. # host port bindings, if there are any created by conductor before live
  6026. # migration started.
  6027. self.network_api.migrate_instance_start(ctxt,
  6028. instance,
  6029. migration)
  6030. destroy_vifs = False
  6031. try:
  6032. # It's possible that the vif type changed on the destination
  6033. # host and is already bound and active, so we need to use the
  6034. # stashed source vifs in migrate_data.vifs (if present) to unplug
  6035. # on the source host.
  6036. unplug_nw_info = network_info
  6037. if migrate_data and 'vifs' in migrate_data:
  6038. nw_info = []
  6039. for migrate_vif in migrate_data.vifs:
  6040. nw_info.append(migrate_vif.source_vif)
  6041. unplug_nw_info = network_model.NetworkInfo.hydrate(nw_info)
  6042. LOG.debug('Calling driver.post_live_migration_at_source '
  6043. 'with original source VIFs from migrate_data: %s',
  6044. unplug_nw_info, instance=instance)
  6045. self.driver.post_live_migration_at_source(ctxt, instance,
  6046. unplug_nw_info)
  6047. except NotImplementedError as ex:
  6048. LOG.debug(ex, instance=instance)
  6049. # For all hypervisors other than libvirt, there is a possibility
  6050. # they are unplugging networks from source node in the cleanup
  6051. # method
  6052. destroy_vifs = True
  6053. # NOTE(danms): Save source node before calling post method on
  6054. # destination, which will update it
  6055. source_node = instance.node
  6056. # Define domain at destination host, without doing it,
  6057. # pause/suspend/terminate do not work.
  6058. post_at_dest_success = True
  6059. try:
  6060. self.compute_rpcapi.post_live_migration_at_destination(ctxt,
  6061. instance, block_migration, dest)
  6062. except Exception as error:
  6063. post_at_dest_success = False
  6064. # We don't want to break _post_live_migration() if
  6065. # post_live_migration_at_destination() fails as it should never
  6066. # affect cleaning up source node.
  6067. LOG.exception("Post live migration at destination %s failed",
  6068. dest, instance=instance, error=error)
  6069. do_cleanup, destroy_disks = self._live_migration_cleanup_flags(
  6070. migrate_data)
  6071. if do_cleanup:
  6072. LOG.debug('Calling driver.cleanup from _post_live_migration',
  6073. instance=instance)
  6074. self.driver.cleanup(ctxt, instance, unplug_nw_info,
  6075. destroy_disks=destroy_disks,
  6076. migrate_data=migrate_data,
  6077. destroy_vifs=destroy_vifs)
  6078. self.instance_events.clear_events_for_instance(instance)
  6079. # NOTE(timello): make sure we update available resources on source
  6080. # host even before next periodic task.
  6081. self.update_available_resource(ctxt)
  6082. self._update_scheduler_instance_info(ctxt, instance)
  6083. self._notify_about_instance_usage(ctxt, instance,
  6084. "live_migration._post.end",
  6085. network_info=network_info)
  6086. compute_utils.notify_about_instance_action(
  6087. ctxt, instance, self.host,
  6088. action=fields.NotificationAction.LIVE_MIGRATION_POST,
  6089. phase=fields.NotificationPhase.END)
  6090. if post_at_dest_success:
  6091. LOG.info('Migrating instance to %s finished successfully.',
  6092. dest, instance=instance)
  6093. self._clean_instance_console_tokens(ctxt, instance)
  6094. if migrate_data and migrate_data.obj_attr_is_set('migration'):
  6095. migrate_data.migration.status = 'completed'
  6096. migrate_data.migration.save()
  6097. self._delete_allocation_after_move(ctxt,
  6098. instance,
  6099. migrate_data.migration)
  6100. else:
  6101. # We didn't have data on a migration, which means we can't
  6102. # look up to see if we had new-style migration-based
  6103. # allocations. This should really only happen in cases of
  6104. # a buggy virt driver. Log a warning so we know it happened.
  6105. LOG.warning('Live migration ended with no migrate_data '
  6106. 'record. Unable to clean up migration-based '
  6107. 'allocations for node %s which is almost certainly '
  6108. 'not an expected situation.', source_node,
  6109. instance=instance)
  6110. def _consoles_enabled(self):
  6111. """Returns whether a console is enable."""
  6112. return (CONF.vnc.enabled or CONF.spice.enabled or
  6113. CONF.rdp.enabled or CONF.serial_console.enabled or
  6114. CONF.mks.enabled)
  6115. def _clean_instance_console_tokens(self, ctxt, instance):
  6116. """Clean console tokens stored for an instance."""
  6117. # If the database backend isn't in use, don't bother trying to clean
  6118. # tokens. The database backend is not supported for cells v1.
  6119. if not CONF.cells.enable and self._consoles_enabled():
  6120. objects.ConsoleAuthToken.\
  6121. clean_console_auths_for_instance(ctxt, instance.uuid)
  6122. @wrap_exception()
  6123. @wrap_instance_event(prefix='compute')
  6124. @wrap_instance_fault
  6125. def post_live_migration_at_destination(self, context, instance,
  6126. block_migration):
  6127. """Post operations for live migration .
  6128. :param context: security context
  6129. :param instance: Instance dict
  6130. :param block_migration: if true, prepare for block migration
  6131. """
  6132. LOG.info('Post operation of migration started',
  6133. instance=instance)
  6134. # NOTE(tr3buchet): setup networks on destination host
  6135. # this is called a second time because
  6136. # multi_host does not create the bridge in
  6137. # plug_vifs
  6138. # NOTE(mriedem): This is a no-op for neutron.
  6139. self.network_api.setup_networks_on_host(context, instance,
  6140. self.host)
  6141. migration = {'source_compute': instance.host,
  6142. 'dest_compute': self.host, }
  6143. self.network_api.migrate_instance_finish(context,
  6144. instance,
  6145. migration)
  6146. network_info = self.network_api.get_instance_nw_info(context, instance)
  6147. self._notify_about_instance_usage(
  6148. context, instance, "live_migration.post.dest.start",
  6149. network_info=network_info)
  6150. compute_utils.notify_about_instance_action(context, instance,
  6151. self.host,
  6152. action=fields.NotificationAction.LIVE_MIGRATION_POST_DEST,
  6153. phase=fields.NotificationPhase.START)
  6154. block_device_info = self._get_instance_block_device_info(context,
  6155. instance)
  6156. try:
  6157. self.driver.post_live_migration_at_destination(
  6158. context, instance, network_info, block_migration,
  6159. block_device_info)
  6160. except Exception:
  6161. with excutils.save_and_reraise_exception():
  6162. instance.vm_state = vm_states.ERROR
  6163. LOG.error('Unexpected error during post live migration at '
  6164. 'destination host.', instance=instance)
  6165. finally:
  6166. # Restore instance state and update host
  6167. current_power_state = self._get_power_state(context, instance)
  6168. node_name = None
  6169. prev_host = instance.host
  6170. try:
  6171. compute_node = self._get_compute_info(context, self.host)
  6172. node_name = compute_node.hypervisor_hostname
  6173. except exception.ComputeHostNotFound:
  6174. LOG.exception('Failed to get compute_info for %s', self.host)
  6175. finally:
  6176. instance.host = self.host
  6177. instance.power_state = current_power_state
  6178. instance.task_state = None
  6179. instance.node = node_name
  6180. instance.progress = 0
  6181. instance.save(expected_task_state=task_states.MIGRATING)
  6182. # NOTE(tr3buchet): tear down networks on source host (nova-net)
  6183. # NOTE(mriedem): For neutron, this will delete any inactive source
  6184. # host port bindings.
  6185. try:
  6186. self.network_api.setup_networks_on_host(context, instance,
  6187. prev_host, teardown=True)
  6188. except exception.PortBindingDeletionFailed as e:
  6189. # Removing the inactive port bindings from the source host is not
  6190. # critical so just log an error but don't fail.
  6191. LOG.error('Network cleanup failed for source host %s during post '
  6192. 'live migration. You may need to manually clean up '
  6193. 'resources in the network service. Error: %s',
  6194. prev_host, six.text_type(e))
  6195. # NOTE(vish): this is necessary to update dhcp for nova-network
  6196. # NOTE(mriedem): This is a no-op for neutron.
  6197. self.network_api.setup_networks_on_host(context, instance, self.host)
  6198. self._notify_about_instance_usage(
  6199. context, instance, "live_migration.post.dest.end",
  6200. network_info=network_info)
  6201. compute_utils.notify_about_instance_action(context, instance,
  6202. self.host,
  6203. action=fields.NotificationAction.LIVE_MIGRATION_POST_DEST,
  6204. phase=fields.NotificationPhase.END)
  6205. @wrap_exception()
  6206. @wrap_instance_fault
  6207. def _rollback_live_migration(self, context, instance,
  6208. dest, migrate_data=None,
  6209. migration_status='error'):
  6210. """Recovers Instance/volume state from migrating -> running.
  6211. :param context: security context
  6212. :param instance: nova.objects.instance.Instance object
6213. :param dest:
6214. This method is called from the live migration source host;
6215. this param specifies the destination host.
6216. :param migrate_data:
6217. if not None, contains implementation specific data.
  6218. :param migration_status:
  6219. Contains the status we want to set for the migration object
  6220. """
  6221. if (isinstance(migrate_data, migrate_data_obj.LiveMigrateData) and
  6222. migrate_data.obj_attr_is_set('migration')):
  6223. migration = migrate_data.migration
  6224. else:
  6225. migration = None
  6226. if migration:
  6227. # Remove allocations created in Placement for the dest node.
  6228. # If migration is None, the virt driver didn't pass it which is
  6229. # a bug.
  6230. self._revert_allocation(context, instance, migration)
  6231. else:
  6232. LOG.error('Unable to revert allocations during live migration '
  6233. 'rollback; compute driver did not provide migrate_data',
  6234. instance=instance)
  6235. instance.task_state = None
  6236. instance.progress = 0
  6237. instance.save(expected_task_state=[task_states.MIGRATING])
  6238. # NOTE(tr3buchet): setup networks on source host (really it's re-setup
  6239. # for nova-network)
  6240. # NOTE(mriedem): This is a no-op for neutron.
  6241. self.network_api.setup_networks_on_host(context, instance, self.host)
  6242. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  6243. context, instance.uuid)
  6244. for bdm in bdms:
  6245. if bdm.is_volume:
  6246. # remove the connection on the destination host
  6247. self.compute_rpcapi.remove_volume_connection(
  6248. context, instance, bdm.volume_id, dest)
  6249. if bdm.attachment_id:
  6250. # 3.44 cinder api flow. Set the bdm's
  6251. # attachment_id to the old attachment of the source
  6252. # host. If old_attachments is not there, then
  6253. # there was an error before the new attachment was made.
  6254. old_attachments = migrate_data.old_vol_attachment_ids \
  6255. if 'old_vol_attachment_ids' in migrate_data else None
  6256. if old_attachments and bdm.volume_id in old_attachments:
  6257. self.volume_api.attachment_delete(context,
  6258. bdm.attachment_id)
  6259. bdm.attachment_id = old_attachments[bdm.volume_id]
  6260. bdm.save()
  6261. self._notify_about_instance_usage(context, instance,
  6262. "live_migration._rollback.start")
  6263. compute_utils.notify_about_instance_action(context, instance,
  6264. self.host,
  6265. action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK,
  6266. phase=fields.NotificationPhase.START,
  6267. bdms=bdms)
  6268. do_cleanup, destroy_disks = self._live_migration_cleanup_flags(
  6269. migrate_data)
  6270. if do_cleanup:
  6271. self.compute_rpcapi.rollback_live_migration_at_destination(
  6272. context, instance, dest, destroy_disks=destroy_disks,
  6273. migrate_data=migrate_data)
  6274. elif utils.is_neutron():
  6275. # The port binding profiles need to be cleaned up.
  6276. with errors_out_migration_ctxt(migration):
  6277. try:
  6278. # This call will delete any inactive destination host
  6279. # port bindings.
  6280. self.network_api.setup_networks_on_host(
  6281. context, instance, host=dest, teardown=True)
  6282. except exception.PortBindingDeletionFailed as e:
  6283. # Removing the inactive port bindings from the destination
  6284. # host is not critical so just log an error but don't fail.
  6285. LOG.error(
  6286. 'Network cleanup failed for destination host %s '
  6287. 'during live migration rollback. You may need to '
  6288. 'manually clean up resources in the network service. '
  6289. 'Error: %s', dest, six.text_type(e))
  6290. except Exception:
  6291. with excutils.save_and_reraise_exception():
  6292. LOG.exception(
  6293. 'An error occurred while cleaning up networking '
  6294. 'during live migration rollback.',
  6295. instance=instance)
  6296. self._notify_about_instance_usage(context, instance,
  6297. "live_migration._rollback.end")
  6298. compute_utils.notify_about_instance_action(context, instance,
  6299. self.host,
  6300. action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK,
  6301. phase=fields.NotificationPhase.END,
  6302. bdms=bdms)
  6303. self._set_migration_status(migration, migration_status)
  6304. @wrap_exception()
  6305. @wrap_instance_event(prefix='compute')
  6306. @wrap_instance_fault
  6307. def rollback_live_migration_at_destination(self, context, instance,
  6308. destroy_disks,
  6309. migrate_data):
  6310. """Cleaning up image directory that is created pre_live_migration.
  6311. :param context: security context
  6312. :param instance: a nova.objects.instance.Instance object sent over rpc
  6313. :param destroy_disks: whether to destroy volumes or not
  6314. :param migrate_data: contains migration info
  6315. """
  6316. network_info = self.network_api.get_instance_nw_info(context, instance)
  6317. self._notify_about_instance_usage(
  6318. context, instance, "live_migration.rollback.dest.start",
  6319. network_info=network_info)
  6320. compute_utils.notify_about_instance_action(
  6321. context, instance, self.host,
  6322. action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK_DEST,
  6323. phase=fields.NotificationPhase.START)
  6324. try:
  6325. # NOTE(tr3buchet): tear down networks on dest host (nova-net)
  6326. # NOTE(mriedem): For neutron, this call will delete any
  6327. # destination host port bindings.
  6328. # TODO(mriedem): We should eventually remove this call from
  6329. # this method (rollback_live_migration_at_destination) since this
  6330. # method is only called conditionally based on whether or not the
  6331. # instance is running on shared storage. _rollback_live_migration
  6332. # already calls this method for neutron if we are running on
  6333. # shared storage.
  6334. self.network_api.setup_networks_on_host(context, instance,
  6335. self.host, teardown=True)
  6336. except exception.PortBindingDeletionFailed as e:
  6337. # Removing the inactive port bindings from the destination
  6338. # host is not critical so just log an error but don't fail.
  6339. LOG.error(
  6340. 'Network cleanup failed for destination host %s '
  6341. 'during live migration rollback. You may need to '
  6342. 'manually clean up resources in the network service. '
  6343. 'Error: %s', self.host, six.text_type(e))
  6344. except Exception:
  6345. with excutils.save_and_reraise_exception():
  6346. # NOTE(tdurakov): even if teardown networks fails driver
  6347. # should try to rollback live migration on destination.
  6348. LOG.exception('An error occurred while deallocating network.',
  6349. instance=instance)
  6350. finally:
  6351. # always run this even if setup_networks_on_host fails
  6352. # NOTE(vish): The mapping is passed in so the driver can disconnect
  6353. # from remote volumes if necessary
  6354. block_device_info = self._get_instance_block_device_info(context,
  6355. instance)
  6356. self.driver.rollback_live_migration_at_destination(
  6357. context, instance, network_info, block_device_info,
  6358. destroy_disks=destroy_disks, migrate_data=migrate_data)
  6359. self._notify_about_instance_usage(
  6360. context, instance, "live_migration.rollback.dest.end",
  6361. network_info=network_info)
  6362. compute_utils.notify_about_instance_action(
  6363. context, instance, self.host,
  6364. action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK_DEST,
  6365. phase=fields.NotificationPhase.END)
  6366. def _require_nw_info_update(self, context, instance):
  6367. """Detect whether there is a mismatch in binding:host_id, or
  6368. binding_failed or unbound binding:vif_type for any of the instances
  6369. ports.
  6370. """
  6371. if not utils.is_neutron():
  6372. return False
  6373. search_opts = {'device_id': instance.uuid,
  6374. 'fields': ['binding:host_id', 'binding:vif_type']}
  6375. ports = self.network_api.list_ports(context, **search_opts)
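# Illustrative shape of each entry in ports['ports'] given the
# requested fields (values are placeholders):
#   {'binding:host_id': '<host>', 'binding:vif_type': 'ovs'}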
  6376. for p in ports['ports']:
  6377. if p.get('binding:host_id') != self.host:
  6378. return True
  6379. vif_type = p.get('binding:vif_type')
  6380. if (vif_type == network_model.VIF_TYPE_UNBOUND or
  6381. vif_type == network_model.VIF_TYPE_BINDING_FAILED):
  6382. return True
  6383. return False
  6384. @periodic_task.periodic_task(
  6385. spacing=CONF.heal_instance_info_cache_interval)
  6386. def _heal_instance_info_cache(self, context):
  6387. """Called periodically. On every call, try to update the
  6388. info_cache's network information for another instance by
  6389. calling to the network manager.
  6390. This is implemented by keeping a cache of uuids of instances
  6391. that live on this host. On each call, we pop one off of a
  6392. list, pull the DB record, and try the call to the network API.
6393. If anything errors, don't fail, as it's possible the instance
  6394. has been deleted, etc.
  6395. """
  6396. heal_interval = CONF.heal_instance_info_cache_interval
  6397. if not heal_interval:
  6398. return
  6399. instance_uuids = getattr(self, '_instance_uuids_to_heal', [])
  6400. instance = None
  6401. LOG.debug('Starting heal instance info cache')
  6402. if not instance_uuids:
  6403. # The list of instances to heal is empty so rebuild it
  6404. LOG.debug('Rebuilding the list of instances to heal')
  6405. db_instances = objects.InstanceList.get_by_host(
  6406. context, self.host, expected_attrs=[], use_slave=True)
  6407. for inst in db_instances:
  6408. # We don't want to refresh the cache for instances
  6409. # which are building or deleting so don't put them
  6410. # in the list. If they are building they will get
  6411. # added to the list next time we build it.
  6412. if (inst.vm_state == vm_states.BUILDING):
  6413. LOG.debug('Skipping network cache update for instance '
  6414. 'because it is Building.', instance=inst)
  6415. continue
  6416. if (inst.task_state == task_states.DELETING):
  6417. LOG.debug('Skipping network cache update for instance '
  6418. 'because it is being deleted.', instance=inst)
  6419. continue
  6420. if not instance:
  6421. # Save the first one we find so we don't
  6422. # have to get it again
  6423. instance = inst
  6424. else:
  6425. instance_uuids.append(inst['uuid'])
  6426. self._instance_uuids_to_heal = instance_uuids
  6427. else:
  6428. # Find the next valid instance on the list
  6429. while instance_uuids:
  6430. try:
  6431. inst = objects.Instance.get_by_uuid(
  6432. context, instance_uuids.pop(0),
  6433. expected_attrs=['system_metadata', 'info_cache',
  6434. 'flavor'],
  6435. use_slave=True)
  6436. except exception.InstanceNotFound:
  6437. # Instance is gone. Try to grab another.
  6438. continue
  6439. # Check the instance hasn't been migrated
  6440. if inst.host != self.host:
  6441. LOG.debug('Skipping network cache update for instance '
  6442. 'because it has been migrated to another '
  6443. 'host.', instance=inst)
6444. # Check the instance isn't being deleted
  6445. elif inst.task_state == task_states.DELETING:
  6446. LOG.debug('Skipping network cache update for instance '
  6447. 'because it is being deleted.', instance=inst)
  6448. else:
  6449. instance = inst
  6450. break
  6451. if instance:
  6452. # We have an instance now to refresh
  6453. try:
  6454. # Fix potential mismatch in port binding if evacuation failed
  6455. # after reassigning the port binding to the dest host but
  6456. # before the instance host is changed.
  6457. # Do this only when instance has no pending task.
  6458. if instance.task_state is None and \
  6459. self._require_nw_info_update(context, instance):
  6460. LOG.info("Updating ports in neutron", instance=instance)
  6461. self.network_api.setup_instance_network_on_host(
  6462. context, instance, self.host)
  6463. # Call to network API to get instance info.. this will
  6464. # force an update to the instance's info_cache
  6465. self.network_api.get_instance_nw_info(
  6466. context, instance, force_refresh=True)
  6467. LOG.debug('Updated the network info_cache for instance',
  6468. instance=instance)
  6469. except exception.InstanceNotFound:
  6470. # Instance is gone.
  6471. LOG.debug('Instance no longer exists. Unable to refresh',
  6472. instance=instance)
  6473. return
  6474. except exception.InstanceInfoCacheNotFound:
  6475. # InstanceInfoCache is gone.
  6476. LOG.debug('InstanceInfoCache no longer exists. '
  6477. 'Unable to refresh', instance=instance)
  6478. except Exception:
  6479. LOG.error('An error occurred while refreshing the network '
  6480. 'cache.', instance=instance, exc_info=True)
  6481. else:
  6482. LOG.debug("Didn't find any instances for network info cache "
  6483. "update.")
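    # Illustrative note (not upstream code): with the default 60 second
    # heal_instance_info_cache_interval and N instances on the host, each
    # instance has its info_cache refreshed roughly once every N minutes,
    # because each periodic run pops exactly one uuid off
    # self._instance_uuids_to_heal and the list is only rebuilt once it is
    # empty.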
  6484. @periodic_task.periodic_task
  6485. def _poll_rebooting_instances(self, context):
  6486. if CONF.reboot_timeout > 0:
  6487. filters = {'task_state':
  6488. [task_states.REBOOTING,
  6489. task_states.REBOOT_STARTED,
  6490. task_states.REBOOT_PENDING],
  6491. 'host': self.host}
  6492. rebooting = objects.InstanceList.get_by_filters(
  6493. context, filters, expected_attrs=[], use_slave=True)
  6494. to_poll = []
  6495. for instance in rebooting:
  6496. if timeutils.is_older_than(instance.updated_at,
  6497. CONF.reboot_timeout):
  6498. to_poll.append(instance)
  6499. self.driver.poll_rebooting_instances(CONF.reboot_timeout, to_poll)
  6500. @periodic_task.periodic_task
  6501. def _poll_rescued_instances(self, context):
  6502. if CONF.rescue_timeout > 0:
  6503. filters = {'vm_state': vm_states.RESCUED,
  6504. 'host': self.host}
  6505. rescued_instances = objects.InstanceList.get_by_filters(
  6506. context, filters, expected_attrs=["system_metadata"],
  6507. use_slave=True)
  6508. to_unrescue = []
  6509. for instance in rescued_instances:
  6510. if timeutils.is_older_than(instance.launched_at,
  6511. CONF.rescue_timeout):
  6512. to_unrescue.append(instance)
  6513. for instance in to_unrescue:
  6514. self.compute_api.unrescue(context, instance)
  6515. @periodic_task.periodic_task
  6516. def _poll_unconfirmed_resizes(self, context):
  6517. if CONF.resize_confirm_window == 0:
  6518. return
  6519. migrations = objects.MigrationList.get_unconfirmed_by_dest_compute(
  6520. context, CONF.resize_confirm_window, self.host,
  6521. use_slave=True)
  6522. migrations_info = dict(migration_count=len(migrations),
  6523. confirm_window=CONF.resize_confirm_window)
  6524. if migrations_info["migration_count"] > 0:
  6525. LOG.info("Found %(migration_count)d unconfirmed migrations "
  6526. "older than %(confirm_window)d seconds",
  6527. migrations_info)
  6528. def _set_migration_to_error(migration, reason, **kwargs):
  6529. LOG.warning("Setting migration %(migration_id)s to error: "
  6530. "%(reason)s",
  6531. {'migration_id': migration['id'], 'reason': reason},
  6532. **kwargs)
  6533. migration.status = 'error'
  6534. with migration.obj_as_admin():
  6535. migration.save()
  6536. for migration in migrations:
  6537. instance_uuid = migration.instance_uuid
  6538. LOG.info("Automatically confirming migration "
  6539. "%(migration_id)s for instance %(instance_uuid)s",
  6540. {'migration_id': migration.id,
  6541. 'instance_uuid': instance_uuid})
  6542. expected_attrs = ['metadata', 'system_metadata']
  6543. try:
  6544. instance = objects.Instance.get_by_uuid(context,
  6545. instance_uuid, expected_attrs=expected_attrs,
  6546. use_slave=True)
  6547. except exception.InstanceNotFound:
  6548. reason = (_("Instance %s not found") %
  6549. instance_uuid)
  6550. _set_migration_to_error(migration, reason)
  6551. continue
  6552. if instance.vm_state == vm_states.ERROR:
  6553. reason = _("In ERROR state")
  6554. _set_migration_to_error(migration, reason,
  6555. instance=instance)
  6556. continue
            # race condition: an instance in DELETING state should not have
            # its migration set to error, otherwise an instance that is being
            # deleted while in the RESIZED state would never be able to
            # confirm the resize.
  6561. if instance.task_state in [task_states.DELETING,
  6562. task_states.SOFT_DELETING]:
  6563. msg = ("Instance being deleted or soft deleted during resize "
  6564. "confirmation. Skipping.")
  6565. LOG.debug(msg, instance=instance)
  6566. continue
  6567. # race condition: This condition is hit when this method is
  6568. # called between the save of the migration record with a status of
  6569. # finished and the save of the instance object with a state of
  6570. # RESIZED. The migration record should not be set to error.
  6571. if instance.task_state == task_states.RESIZE_FINISH:
  6572. msg = ("Instance still resizing during resize "
  6573. "confirmation. Skipping.")
  6574. LOG.debug(msg, instance=instance)
  6575. continue
  6576. vm_state = instance.vm_state
  6577. task_state = instance.task_state
  6578. if vm_state != vm_states.RESIZED or task_state is not None:
  6579. reason = (_("In states %(vm_state)s/%(task_state)s, not "
  6580. "RESIZED/None") %
  6581. {'vm_state': vm_state,
  6582. 'task_state': task_state})
  6583. _set_migration_to_error(migration, reason,
  6584. instance=instance)
  6585. continue
  6586. try:
  6587. self.compute_api.confirm_resize(context, instance,
  6588. migration=migration)
  6589. except Exception as e:
  6590. LOG.info("Error auto-confirming resize: %s. "
  6591. "Will retry later.", e, instance=instance)
  6592. @periodic_task.periodic_task(spacing=CONF.shelved_poll_interval)
  6593. def _poll_shelved_instances(self, context):
  6594. if CONF.shelved_offload_time <= 0:
  6595. return
  6596. filters = {'vm_state': vm_states.SHELVED,
  6597. 'task_state': None,
  6598. 'host': self.host}
  6599. shelved_instances = objects.InstanceList.get_by_filters(
  6600. context, filters=filters, expected_attrs=['system_metadata'],
  6601. use_slave=True)
  6602. to_gc = []
  6603. for instance in shelved_instances:
  6604. sys_meta = instance.system_metadata
  6605. shelved_at = timeutils.parse_strtime(sys_meta['shelved_at'])
  6606. if timeutils.is_older_than(shelved_at, CONF.shelved_offload_time):
  6607. to_gc.append(instance)
  6608. for instance in to_gc:
  6609. try:
  6610. instance.task_state = task_states.SHELVING_OFFLOADING
  6611. instance.save(expected_task_state=(None,))
  6612. self.shelve_offload_instance(context, instance,
  6613. clean_shutdown=False)
  6614. except Exception:
  6615. LOG.exception('Periodic task failed to offload instance.',
  6616. instance=instance)
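    # Illustrative example (not upstream code): with shelved_offload_time
    # set to 3600, an instance shelved at 10:00 UTC is picked up by the
    # first _poll_shelved_instances run after 11:00 UTC, moved to the
    # SHELVING_OFFLOADING task state and offloaded without a clean shutdown
    # (the guest is already powered off while shelved).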
  6617. @periodic_task.periodic_task
  6618. def _instance_usage_audit(self, context):
  6619. if not CONF.instance_usage_audit:
  6620. return
  6621. begin, end = utils.last_completed_audit_period()
  6622. if objects.TaskLog.get(context, 'instance_usage_audit', begin, end,
  6623. self.host):
  6624. return
  6625. instances = objects.InstanceList.get_active_by_window_joined(
  6626. context, begin, end, host=self.host,
  6627. expected_attrs=['system_metadata', 'info_cache', 'metadata',
  6628. 'flavor'],
  6629. use_slave=True)
  6630. num_instances = len(instances)
  6631. errors = 0
  6632. successes = 0
  6633. LOG.info("Running instance usage audit for host %(host)s "
  6634. "from %(begin_time)s to %(end_time)s. "
  6635. "%(number_instances)s instances.",
  6636. {'host': self.host,
  6637. 'begin_time': begin,
  6638. 'end_time': end,
  6639. 'number_instances': num_instances})
  6640. start_time = time.time()
  6641. task_log = objects.TaskLog(context)
  6642. task_log.task_name = 'instance_usage_audit'
  6643. task_log.period_beginning = begin
  6644. task_log.period_ending = end
  6645. task_log.host = self.host
  6646. task_log.task_items = num_instances
  6647. task_log.message = 'Instance usage audit started...'
  6648. task_log.begin_task()
  6649. for instance in instances:
  6650. try:
  6651. compute_utils.notify_usage_exists(
  6652. self.notifier, context, instance, self.host,
  6653. ignore_missing_network_data=False)
  6654. successes += 1
  6655. except Exception:
  6656. LOG.exception('Failed to generate usage '
  6657. 'audit for instance '
  6658. 'on host %s', self.host,
  6659. instance=instance)
  6660. errors += 1
  6661. task_log.errors = errors
  6662. task_log.message = (
  6663. 'Instance usage audit ran for host %s, %s instances in %s seconds.'
  6664. % (self.host, num_instances, time.time() - start_time))
  6665. task_log.end_task()
  6666. @periodic_task.periodic_task(spacing=CONF.bandwidth_poll_interval)
  6667. def _poll_bandwidth_usage(self, context):
  6668. if not self._bw_usage_supported:
  6669. return
  6670. prev_time, start_time = utils.last_completed_audit_period()
  6671. curr_time = time.time()
  6672. if (curr_time - self._last_bw_usage_poll >
  6673. CONF.bandwidth_poll_interval):
  6674. self._last_bw_usage_poll = curr_time
  6675. LOG.info("Updating bandwidth usage cache")
  6676. cells_update_interval = CONF.cells.bandwidth_update_interval
  6677. if (cells_update_interval > 0 and
  6678. curr_time - self._last_bw_usage_cell_update >
  6679. cells_update_interval):
  6680. self._last_bw_usage_cell_update = curr_time
  6681. update_cells = True
  6682. else:
  6683. update_cells = False
  6684. instances = objects.InstanceList.get_by_host(context,
  6685. self.host,
  6686. use_slave=True)
  6687. try:
  6688. bw_counters = self.driver.get_all_bw_counters(instances)
  6689. except NotImplementedError:
            # NOTE(mdragon): Not all hypervisors have bandwidth polling
            # implemented yet. If they don't, it doesn't break anything,
            # they just don't get the info in the usage events.
            # NOTE(PhilDay): Record that it's not supported so we can
            # skip fast on future calls rather than waste effort getting
            # the list of instances.
  6696. LOG.info("Bandwidth usage not supported by %(driver)s.",
  6697. {'driver': CONF.compute_driver})
  6698. self._bw_usage_supported = False
  6699. return
  6700. refreshed = timeutils.utcnow()
  6701. for bw_ctr in bw_counters:
  6702. # Allow switching of greenthreads between queries.
  6703. greenthread.sleep(0)
  6704. bw_in = 0
  6705. bw_out = 0
  6706. last_ctr_in = None
  6707. last_ctr_out = None
  6708. usage = objects.BandwidthUsage.get_by_instance_uuid_and_mac(
  6709. context, bw_ctr['uuid'], bw_ctr['mac_address'],
  6710. start_period=start_time, use_slave=True)
  6711. if usage:
  6712. bw_in = usage.bw_in
  6713. bw_out = usage.bw_out
  6714. last_ctr_in = usage.last_ctr_in
  6715. last_ctr_out = usage.last_ctr_out
  6716. else:
  6717. usage = (objects.BandwidthUsage.
  6718. get_by_instance_uuid_and_mac(
  6719. context, bw_ctr['uuid'], bw_ctr['mac_address'],
  6720. start_period=prev_time, use_slave=True))
  6721. if usage:
  6722. last_ctr_in = usage.last_ctr_in
  6723. last_ctr_out = usage.last_ctr_out
  6724. if last_ctr_in is not None:
  6725. if bw_ctr['bw_in'] < last_ctr_in:
  6726. # counter rollover
  6727. bw_in += bw_ctr['bw_in']
  6728. else:
  6729. bw_in += (bw_ctr['bw_in'] - last_ctr_in)
  6730. if last_ctr_out is not None:
  6731. if bw_ctr['bw_out'] < last_ctr_out:
  6732. # counter rollover
  6733. bw_out += bw_ctr['bw_out']
  6734. else:
  6735. bw_out += (bw_ctr['bw_out'] - last_ctr_out)
  6736. objects.BandwidthUsage(context=context).create(
  6737. bw_ctr['uuid'],
  6738. bw_ctr['mac_address'],
  6739. bw_in,
  6740. bw_out,
  6741. bw_ctr['bw_in'],
  6742. bw_ctr['bw_out'],
  6743. start_period=start_time,
  6744. last_refreshed=refreshed,
  6745. update_cells=update_cells)
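    # Illustrative sketch (not part of the upstream manager and not called
    # anywhere; the helper name and parameters are placeholders): the
    # rollover handling above reduces to this pure function. 'prev_total' is
    # the usage accumulated so far, 'last_ctr' / 'curr_ctr' are the previous
    # and current raw counters reported by the hypervisor; if the counter
    # went backwards we assume it wrapped and count the new raw value from
    # zero.
    @staticmethod
    def _example_bw_usage_delta(prev_total, last_ctr, curr_ctr):
        if last_ctr is None:
            # No previous sample to diff against: keep the accumulated total.
            return prev_total
        if curr_ctr < last_ctr:
            # Counter rollover: start again from the new raw value.
            return prev_total + curr_ctr
        return prev_total + (curr_ctr - last_ctr)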
  6746. def _get_host_volume_bdms(self, context, use_slave=False):
  6747. """Return all block device mappings on a compute host."""
  6748. compute_host_bdms = []
  6749. instances = objects.InstanceList.get_by_host(context, self.host,
  6750. use_slave=use_slave)
  6751. for instance in instances:
  6752. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  6753. context, instance.uuid, use_slave=use_slave)
  6754. instance_bdms = [bdm for bdm in bdms if bdm.is_volume]
  6755. compute_host_bdms.append(dict(instance=instance,
  6756. instance_bdms=instance_bdms))
  6757. return compute_host_bdms
  6758. def _update_volume_usage_cache(self, context, vol_usages):
  6759. """Updates the volume usage cache table with a list of stats."""
  6760. for usage in vol_usages:
  6761. # Allow switching of greenthreads between queries.
  6762. greenthread.sleep(0)
  6763. vol_usage = objects.VolumeUsage(context)
  6764. vol_usage.volume_id = usage['volume']
  6765. vol_usage.instance_uuid = usage['instance'].uuid
  6766. vol_usage.project_id = usage['instance'].project_id
  6767. vol_usage.user_id = usage['instance'].user_id
  6768. vol_usage.availability_zone = usage['instance'].availability_zone
  6769. vol_usage.curr_reads = usage['rd_req']
  6770. vol_usage.curr_read_bytes = usage['rd_bytes']
  6771. vol_usage.curr_writes = usage['wr_req']
  6772. vol_usage.curr_write_bytes = usage['wr_bytes']
  6773. vol_usage.save()
  6774. self.notifier.info(context, 'volume.usage', vol_usage.to_dict())
  6775. compute_utils.notify_about_volume_usage(context, vol_usage,
  6776. self.host)
  6777. @periodic_task.periodic_task(spacing=CONF.volume_usage_poll_interval)
  6778. def _poll_volume_usage(self, context):
  6779. if CONF.volume_usage_poll_interval == 0:
  6780. return
  6781. compute_host_bdms = self._get_host_volume_bdms(context,
  6782. use_slave=True)
  6783. if not compute_host_bdms:
  6784. return
  6785. LOG.debug("Updating volume usage cache")
  6786. try:
  6787. vol_usages = self.driver.get_all_volume_usage(context,
  6788. compute_host_bdms)
  6789. except NotImplementedError:
  6790. return
  6791. self._update_volume_usage_cache(context, vol_usages)
  6792. @periodic_task.periodic_task(spacing=CONF.sync_power_state_interval,
  6793. run_immediately=True)
  6794. def _sync_power_states(self, context):
  6795. """Align power states between the database and the hypervisor.
        To sync power state data we make a DB call to get the instances
        known to the database and a driver call to count the virtual
        machines known by the hypervisor, then proceed in a lazy loop, one
        database record at a time, checking if the hypervisor has the same
        power state as is in the database.
  6801. """
  6802. db_instances = objects.InstanceList.get_by_host(context, self.host,
  6803. expected_attrs=[],
  6804. use_slave=True)
  6805. try:
  6806. num_vm_instances = self.driver.get_num_instances()
  6807. except exception.VirtDriverNotReady as e:
  6808. # If the virt driver is not ready, like ironic-api not being up
  6809. # yet in the case of ironic, just log it and exit.
  6810. LOG.info('Skipping _sync_power_states periodic task due to: %s', e)
  6811. return
  6812. num_db_instances = len(db_instances)
  6813. if num_vm_instances != num_db_instances:
  6814. LOG.warning("While synchronizing instance power states, found "
  6815. "%(num_db_instances)s instances in the database "
  6816. "and %(num_vm_instances)s instances on the "
  6817. "hypervisor.",
  6818. {'num_db_instances': num_db_instances,
  6819. 'num_vm_instances': num_vm_instances})
  6820. def _sync(db_instance):
            # NOTE(melwitt): This must be synchronized as we query state from
            # two separate sources, the driver and the database.
            # They are set (in stop_instance) and read here, during the sync.
  6824. @utils.synchronized(db_instance.uuid)
  6825. def query_driver_power_state_and_sync():
  6826. self._query_driver_power_state_and_sync(context, db_instance)
  6827. try:
  6828. query_driver_power_state_and_sync()
  6829. except Exception:
  6830. LOG.exception("Periodic sync_power_state task had an "
  6831. "error while processing an instance.",
  6832. instance=db_instance)
  6833. self._syncs_in_progress.pop(db_instance.uuid)
  6834. for db_instance in db_instances:
  6835. # process syncs asynchronously - don't want instance locking to
  6836. # block entire periodic task thread
  6837. uuid = db_instance.uuid
  6838. if uuid in self._syncs_in_progress:
  6839. LOG.debug('Sync already in progress for %s', uuid)
  6840. else:
  6841. LOG.debug('Triggering sync for uuid %s', uuid)
  6842. self._syncs_in_progress[uuid] = True
  6843. self._sync_power_pool.spawn_n(_sync, db_instance)
  6844. def _query_driver_power_state_and_sync(self, context, db_instance):
  6845. if db_instance.task_state is not None:
  6846. LOG.info("During sync_power_state the instance has a "
  6847. "pending task (%(task)s). Skip.",
  6848. {'task': db_instance.task_state}, instance=db_instance)
  6849. return
  6850. # No pending tasks. Now try to figure out the real vm_power_state.
  6851. try:
  6852. vm_instance = self.driver.get_info(db_instance)
  6853. vm_power_state = vm_instance.state
  6854. except exception.InstanceNotFound:
  6855. vm_power_state = power_state.NOSTATE
  6856. # Note(maoy): the above get_info call might take a long time,
  6857. # for example, because of a broken libvirt driver.
  6858. try:
  6859. self._sync_instance_power_state(context,
  6860. db_instance,
  6861. vm_power_state,
  6862. use_slave=True)
  6863. except exception.InstanceNotFound:
  6864. # NOTE(hanlind): If the instance gets deleted during sync,
  6865. # silently ignore.
  6866. pass
  6867. def _stop_unexpected_shutdown_instance(self, context, vm_state,
  6868. db_instance, orig_db_power_state):
  6869. # this is an exceptional case; make sure our data is up
  6870. # to date before slamming through a power off
  6871. vm_instance = self.driver.get_info(db_instance,
  6872. use_cache=False)
  6873. vm_power_state = vm_instance.state
  6874. # if it still looks off, go ahead and call stop()
  6875. if vm_power_state in (power_state.SHUTDOWN,
  6876. power_state.CRASHED):
  6877. LOG.warning("Instance shutdown by itself. Calling the "
  6878. "stop API. Current vm_state: %(vm_state)s, "
  6879. "current task_state: %(task_state)s, "
  6880. "original DB power_state: %(db_power_state)s, "
  6881. "current VM power_state: %(vm_power_state)s",
  6882. {'vm_state': vm_state,
  6883. 'task_state': db_instance.task_state,
  6884. 'db_power_state': orig_db_power_state,
  6885. 'vm_power_state': vm_power_state},
  6886. instance=db_instance)
  6887. try:
  6888. # Note(maoy): here we call the API instead of
  6889. # brutally updating the vm_state in the database
  6890. # to allow all the hooks and checks to be performed.
  6891. if db_instance.shutdown_terminate:
  6892. self.compute_api.delete(context, db_instance)
  6893. else:
  6894. self.compute_api.stop(context, db_instance)
  6895. except Exception:
  6896. # Note(maoy): there is no need to propagate the error
  6897. # because the same power_state will be retrieved next
  6898. # time and retried.
  6899. # For example, there might be another task scheduled.
  6900. LOG.exception("error during stop() in sync_power_state.",
  6901. instance=db_instance)
  6902. def _sync_instance_power_state(self, context, db_instance, vm_power_state,
  6903. use_slave=False):
  6904. """Align instance power state between the database and hypervisor.
  6905. If the instance is not found on the hypervisor, but is in the database,
  6906. then a stop() API will be called on the instance.
  6907. """
        # We re-query the DB to get the latest instance info to minimize
        # (not eliminate) race conditions.
  6910. db_instance.refresh(use_slave=use_slave)
  6911. db_power_state = db_instance.power_state
  6912. vm_state = db_instance.vm_state
  6913. if self.host != db_instance.host:
            # On the sending end of nova-compute, _sync_power_state
            # may have yielded to the greenthread performing a live
            # migration; this in turn has changed the resident host
            # for the VM. However, the instance is still active; it
            # is just in the process of migrating to another host.
            # This implies that the compute source must relinquish
            # control to the compute destination.
  6921. LOG.info("During the sync_power process the "
  6922. "instance has moved from "
  6923. "host %(src)s to host %(dst)s",
  6924. {'src': db_instance.host,
  6925. 'dst': self.host},
  6926. instance=db_instance)
  6927. return
  6928. elif db_instance.task_state is not None:
            # On the receiving end of nova-compute, it could happen
            # that the DB instance already reports the new resident
            # host but the actual VM has not shown up on the
            # hypervisor yet. In this case, let the loop continue
            # and run the state sync in a later round.
  6934. LOG.info("During sync_power_state the instance has a "
  6935. "pending task (%(task)s). Skip.",
  6936. {'task': db_instance.task_state},
  6937. instance=db_instance)
  6938. return
  6939. orig_db_power_state = db_power_state
  6940. if vm_power_state != db_power_state:
  6941. LOG.info('During _sync_instance_power_state the DB '
  6942. 'power_state (%(db_power_state)s) does not match '
  6943. 'the vm_power_state from the hypervisor '
  6944. '(%(vm_power_state)s). Updating power_state in the '
  6945. 'DB to match the hypervisor.',
  6946. {'db_power_state': db_power_state,
  6947. 'vm_power_state': vm_power_state},
  6948. instance=db_instance)
  6949. # power_state is always updated from hypervisor to db
  6950. db_instance.power_state = vm_power_state
  6951. db_instance.save()
  6952. db_power_state = vm_power_state
  6953. # Note(maoy): Now resolve the discrepancy between vm_state and
  6954. # vm_power_state. We go through all possible vm_states.
  6955. if vm_state in (vm_states.BUILDING,
  6956. vm_states.RESCUED,
  6957. vm_states.RESIZED,
  6958. vm_states.SUSPENDED,
  6959. vm_states.ERROR):
  6960. # TODO(maoy): we ignore these vm_state for now.
  6961. pass
  6962. elif vm_state == vm_states.ACTIVE:
  6963. # The only rational power state should be RUNNING
  6964. if vm_power_state in (power_state.SHUTDOWN,
  6965. power_state.CRASHED):
  6966. self._stop_unexpected_shutdown_instance(
  6967. context, vm_state, db_instance, orig_db_power_state)
  6968. elif vm_power_state == power_state.SUSPENDED:
  6969. LOG.warning("Instance is suspended unexpectedly. Calling "
  6970. "the stop API.", instance=db_instance)
  6971. try:
  6972. self.compute_api.stop(context, db_instance)
  6973. except Exception:
  6974. LOG.exception("error during stop() in sync_power_state.",
  6975. instance=db_instance)
  6976. elif vm_power_state == power_state.PAUSED:
                # Note(maoy): a VM may get into the paused state not only
                # because of a user request via the API, but also
                # due to (temporary) external instrumentation.
  6980. # Before the virt layer can reliably report the reason,
  6981. # we simply ignore the state discrepancy. In many cases,
  6982. # the VM state will go back to running after the external
  6983. # instrumentation is done. See bug 1097806 for details.
  6984. LOG.warning("Instance is paused unexpectedly. Ignore.",
  6985. instance=db_instance)
  6986. elif vm_power_state == power_state.NOSTATE:
  6987. # Occasionally, depending on the status of the hypervisor,
  6988. # which could be restarting for example, an instance may
  6989. # not be found. Therefore just log the condition.
  6990. LOG.warning("Instance is unexpectedly not found. Ignore.",
  6991. instance=db_instance)
  6992. elif vm_state == vm_states.STOPPED:
  6993. if vm_power_state not in (power_state.NOSTATE,
  6994. power_state.SHUTDOWN,
  6995. power_state.CRASHED):
  6996. LOG.warning("Instance is not stopped. Calling "
  6997. "the stop API. Current vm_state: %(vm_state)s,"
  6998. " current task_state: %(task_state)s, "
  6999. "original DB power_state: %(db_power_state)s, "
  7000. "current VM power_state: %(vm_power_state)s",
  7001. {'vm_state': vm_state,
  7002. 'task_state': db_instance.task_state,
  7003. 'db_power_state': orig_db_power_state,
  7004. 'vm_power_state': vm_power_state},
  7005. instance=db_instance)
  7006. try:
  7007. # NOTE(russellb) Force the stop, because normally the
  7008. # compute API would not allow an attempt to stop a stopped
  7009. # instance.
  7010. self.compute_api.force_stop(context, db_instance)
  7011. except Exception:
  7012. LOG.exception("error during stop() in sync_power_state.",
  7013. instance=db_instance)
  7014. elif vm_state == vm_states.PAUSED:
  7015. if vm_power_state in (power_state.SHUTDOWN,
  7016. power_state.CRASHED):
  7017. LOG.warning("Paused instance shutdown by itself. Calling "
  7018. "the stop API.", instance=db_instance)
  7019. try:
  7020. self.compute_api.force_stop(context, db_instance)
  7021. except Exception:
  7022. LOG.exception("error during stop() in sync_power_state.",
  7023. instance=db_instance)
  7024. elif vm_state in (vm_states.SOFT_DELETED,
  7025. vm_states.DELETED):
  7026. if vm_power_state not in (power_state.NOSTATE,
  7027. power_state.SHUTDOWN):
  7028. # Note(maoy): this should be taken care of periodically in
  7029. # _cleanup_running_deleted_instances().
  7030. LOG.warning("Instance is not (soft-)deleted.",
  7031. instance=db_instance)
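    # Summary of the reconciliation above (descriptive only, derived from
    # this method): BUILDING/RESCUED/RESIZED/SUSPENDED/ERROR vm_states are
    # ignored; an ACTIVE instance found SHUTDOWN/CRASHED goes through
    # _stop_unexpected_shutdown_instance(), one found SUSPENDED is stopped,
    # and one found PAUSED or NOSTATE is only logged; STOPPED instances that
    # still look running and PAUSED instances that shut down by themselves
    # are force-stopped; (SOFT_)DELETED instances are only logged, since
    # _cleanup_running_deleted_instances() takes care of them.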
  7032. @periodic_task.periodic_task
  7033. def _reclaim_queued_deletes(self, context):
  7034. """Reclaim instances that are queued for deletion."""
  7035. interval = CONF.reclaim_instance_interval
  7036. if interval <= 0:
  7037. LOG.debug("CONF.reclaim_instance_interval <= 0, skipping...")
  7038. return
  7039. filters = {'vm_state': vm_states.SOFT_DELETED,
  7040. 'task_state': None,
  7041. 'host': self.host}
  7042. instances = objects.InstanceList.get_by_filters(
  7043. context, filters,
  7044. expected_attrs=objects.instance.INSTANCE_DEFAULT_FIELDS,
  7045. use_slave=True)
  7046. for instance in instances:
  7047. if self._deleted_old_enough(instance, interval):
  7048. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  7049. context, instance.uuid)
  7050. LOG.info('Reclaiming deleted instance', instance=instance)
  7051. try:
  7052. self._delete_instance(context, instance, bdms)
  7053. except Exception as e:
  7054. LOG.warning("Periodic reclaim failed to delete "
  7055. "instance: %s",
  7056. e, instance=instance)
  7057. def _get_nodename(self, instance, refresh=False):
  7058. """Helper method to get the name of the first available node
  7059. on this host. This method should not be used with any operations
  7060. on ironic instances since it does not handle multiple nodes.
  7061. """
  7062. node = self.driver.get_available_nodes(refresh=refresh)[0]
  7063. LOG.debug("No node specified, defaulting to %s", node,
  7064. instance=instance)
  7065. return node
  7066. def _update_available_resource_for_node(self, context, nodename,
  7067. startup=False):
  7068. try:
  7069. self.rt.update_available_resource(context, nodename,
  7070. startup=startup)
  7071. except exception.ComputeHostNotFound:
  7072. LOG.warning("Compute node '%s' not found in "
  7073. "update_available_resource.", nodename)
  7074. except exception.ReshapeFailed:
  7075. # We're only supposed to get here on startup, if a reshape was
  7076. # needed, was attempted, and failed. We want to kill the service.
  7077. with excutils.save_and_reraise_exception():
  7078. LOG.critical("Resource provider data migration failed "
  7079. "fatally during startup for node %s.", nodename)
  7080. except exception.ReshapeNeeded:
  7081. # This exception should only find its way here if the virt driver's
  7082. # update_provider_tree raised it incorrectly: either
  7083. # a) After the resource tracker already caught it once and
  7084. # reinvoked update_provider_tree with allocations. At this point
  7085. # the driver is just supposed to *do* the reshape, so if it raises
  7086. # ReshapeNeeded, it's a bug, and we want to kill the compute
  7087. # service.
  7088. # b) On periodic rather than startup (we only allow reshapes to
  7089. # happen on startup). In this case we'll just make the logs red and
  7090. # go again at the next periodic interval, where the same thing may
  7091. # or may not happen again. Depending on the previous and intended
  7092. # shape of the providers/inventories, this may not actually cause
  7093. # any immediately visible symptoms (in terms of scheduling, etc.)
  7094. # If this becomes a problem, we may wish to make it pop immediately
  7095. # (e.g. disable the service).
  7096. with excutils.save_and_reraise_exception():
  7097. LOG.exception("ReshapeNeeded exception is unexpected here!")
  7098. except Exception:
  7099. LOG.exception("Error updating resources for node %(node)s.",
  7100. {'node': nodename})
  7101. @periodic_task.periodic_task(spacing=CONF.update_resources_interval)
  7102. def update_available_resource(self, context, startup=False):
  7103. """See driver.get_available_resource()
        Periodic process that keeps the compute host's understanding of
        resource availability and usage in sync with the underlying
        hypervisor.
  7106. :param context: security context
  7107. :param startup: True if this is being called when the nova-compute
  7108. service is starting, False otherwise.
  7109. """
  7110. compute_nodes_in_db = self._get_compute_nodes_in_db(context,
  7111. use_slave=True,
  7112. startup=startup)
  7113. try:
  7114. nodenames = set(self.driver.get_available_nodes())
  7115. except exception.VirtDriverNotReady:
  7116. LOG.warning("Virt driver is not ready.")
  7117. return
  7118. # Delete orphan compute node not reported by driver but still in db
  7119. for cn in compute_nodes_in_db:
  7120. if cn.hypervisor_hostname not in nodenames:
  7121. LOG.info("Deleting orphan compute node %(id)s "
  7122. "hypervisor host is %(hh)s, "
  7123. "nodes are %(nodes)s",
  7124. {'id': cn.id, 'hh': cn.hypervisor_hostname,
  7125. 'nodes': nodenames})
  7126. cn.destroy()
  7127. self.rt.remove_node(cn.hypervisor_hostname)
  7128. # Delete the corresponding resource provider in placement,
  7129. # along with any associated allocations and inventory.
  7130. self.reportclient.delete_resource_provider(context, cn,
  7131. cascade=True)
  7132. for nodename in nodenames:
  7133. self._update_available_resource_for_node(context, nodename,
  7134. startup=startup)
  7135. def _get_compute_nodes_in_db(self, context, use_slave=False,
  7136. startup=False):
  7137. try:
  7138. return objects.ComputeNodeList.get_all_by_host(context, self.host,
  7139. use_slave=use_slave)
  7140. except exception.NotFound:
  7141. if startup:
  7142. LOG.warning(
  7143. "No compute node record found for host %s. If this is "
  7144. "the first time this service is starting on this "
  7145. "host, then you can ignore this warning.", self.host)
  7146. else:
  7147. LOG.error("No compute node record for host %s", self.host)
  7148. return []
  7149. @periodic_task.periodic_task(
  7150. spacing=CONF.running_deleted_instance_poll_interval)
  7151. def _cleanup_running_deleted_instances(self, context):
  7152. """Cleanup any instances which are erroneously still running after
  7153. having been deleted.
  7154. Valid actions to take are:
  7155. 1. noop - do nothing
  7156. 2. log - log which instances are erroneously running
  7157. 3. reap - shutdown and cleanup any erroneously running instances
  7158. 4. shutdown - power off *and disable* any erroneously running
  7159. instances
  7160. The use-case for this cleanup task is: for various reasons, it may be
  7161. possible for the database to show an instance as deleted but for that
  7162. instance to still be running on a host machine (see bug
  7163. https://bugs.launchpad.net/nova/+bug/911366).
  7164. This cleanup task is a cross-hypervisor utility for finding these
  7165. zombied instances and either logging the discrepancy (likely what you
  7166. should do in production), or automatically reaping the instances (more
  7167. appropriate for dev environments).
  7168. """
  7169. action = CONF.running_deleted_instance_action
  7170. if action == "noop":
  7171. return
  7172. # NOTE(sirp): admin contexts don't ordinarily return deleted records
  7173. with utils.temporary_mutation(context, read_deleted="yes"):
  7174. for instance in self._running_deleted_instances(context):
  7175. if action == "log":
  7176. LOG.warning("Detected instance with name label "
  7177. "'%s' which is marked as "
  7178. "DELETED but still present on host.",
  7179. instance.name, instance=instance)
  7180. elif action == 'shutdown':
  7181. LOG.info("Powering off instance with name label "
  7182. "'%s' which is marked as "
  7183. "DELETED but still present on host.",
  7184. instance.name, instance=instance)
  7185. try:
  7186. try:
  7187. # disable starting the instance
  7188. self.driver.set_bootable(instance, False)
  7189. except NotImplementedError:
  7190. LOG.debug("set_bootable is not implemented "
  7191. "for the current driver")
  7192. # and power it off
  7193. self.driver.power_off(instance)
  7194. except Exception:
  7195. LOG.warning("Failed to power off instance",
  7196. instance=instance, exc_info=True)
  7197. elif action == 'reap':
  7198. LOG.info("Destroying instance with name label "
  7199. "'%s' which is marked as "
  7200. "DELETED but still present on host.",
  7201. instance.name, instance=instance)
  7202. bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
  7203. context, instance.uuid, use_slave=True)
  7204. self.instance_events.clear_events_for_instance(instance)
  7205. try:
  7206. self._shutdown_instance(context, instance, bdms,
  7207. notify=False)
  7208. self._cleanup_volumes(context, instance, bdms,
  7209. detach=False)
  7210. except Exception as e:
  7211. LOG.warning("Periodic cleanup failed to delete "
  7212. "instance: %s",
  7213. e, instance=instance)
  7214. else:
  7215. raise Exception(_("Unrecognized value '%s'"
  7216. " for CONF.running_deleted_"
  7217. "instance_action") % action)
    def _running_deleted_instances(self, context):
        """Returns a list of instances that nova thinks are deleted,
        but the hypervisor thinks are still running.
  7221. """
  7222. timeout = CONF.running_deleted_instance_timeout
  7223. filters = {'deleted': True,
  7224. 'soft_deleted': False}
  7225. instances = self._get_instances_on_driver(context, filters)
  7226. return [i for i in instances if self._deleted_old_enough(i, timeout)]
  7227. def _deleted_old_enough(self, instance, timeout):
  7228. deleted_at = instance.deleted_at
  7229. if deleted_at:
  7230. deleted_at = deleted_at.replace(tzinfo=None)
  7231. return (not deleted_at or timeutils.is_older_than(deleted_at, timeout))
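    # Illustrative example (not upstream code): with
    # reclaim_instance_interval=600, a soft-deleted instance whose deleted_at
    # is more than ten minutes in the past is reclaimed on the next
    # _reclaim_queued_deletes pass; an instance with no deleted_at at all is
    # treated as old enough because the expression above short-circuits to
    # True.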
  7232. @contextlib.contextmanager
  7233. def _error_out_instance_on_exception(self, context, instance,
  7234. instance_state=vm_states.ACTIVE):
  7235. instance_uuid = instance.uuid
  7236. try:
  7237. yield
  7238. except NotImplementedError as error:
  7239. with excutils.save_and_reraise_exception():
  7240. LOG.info("Setting instance back to %(state)s after: "
  7241. "%(error)s",
  7242. {'state': instance_state, 'error': error},
  7243. instance_uuid=instance_uuid)
  7244. self._instance_update(context, instance,
  7245. vm_state=instance_state,
  7246. task_state=None)
  7247. except exception.InstanceFaultRollback as error:
  7248. LOG.info("Setting instance back to ACTIVE after: %s",
  7249. error, instance_uuid=instance_uuid)
  7250. self._instance_update(context, instance,
  7251. vm_state=vm_states.ACTIVE,
  7252. task_state=None)
  7253. raise error.inner_exception
  7254. except Exception:
  7255. LOG.exception('Setting instance vm_state to ERROR',
  7256. instance_uuid=instance_uuid)
  7257. with excutils.save_and_reraise_exception():
  7258. self._set_instance_obj_error_state(context, instance)
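    # Usage sketch (illustrative; 'some_risky_operation' is a placeholder,
    # not a real driver call): callers wrap an operation so that an
    # unexpected failure flips the instance into ERROR, while
    # NotImplementedError restores the supplied vm_state and
    # InstanceFaultRollback restores ACTIVE, e.g.:
    #
    #     with self._error_out_instance_on_exception(context, instance):
    #         self.driver.some_risky_operation(context, instance)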
  7259. @wrap_exception()
  7260. def add_aggregate_host(self, context, aggregate, host, slave_info):
  7261. """Notify hypervisor of change (for hypervisor pools)."""
  7262. try:
  7263. self.driver.add_to_aggregate(context, aggregate, host,
  7264. slave_info=slave_info)
  7265. except NotImplementedError:
  7266. LOG.debug('Hypervisor driver does not support '
  7267. 'add_aggregate_host')
  7268. except exception.AggregateError:
  7269. with excutils.save_and_reraise_exception():
  7270. self.driver.undo_aggregate_operation(
  7271. context,
  7272. aggregate.delete_host,
  7273. aggregate, host)
  7274. @wrap_exception()
  7275. def remove_aggregate_host(self, context, host, slave_info, aggregate):
  7276. """Removes a host from a physical hypervisor pool."""
  7277. try:
  7278. self.driver.remove_from_aggregate(context, aggregate, host,
  7279. slave_info=slave_info)
  7280. except NotImplementedError:
  7281. LOG.debug('Hypervisor driver does not support '
  7282. 'remove_aggregate_host')
  7283. except (exception.AggregateError,
  7284. exception.InvalidAggregateAction) as e:
  7285. with excutils.save_and_reraise_exception():
  7286. self.driver.undo_aggregate_operation(
  7287. context,
  7288. aggregate.add_host,
  7289. aggregate, host,
  7290. isinstance(e, exception.AggregateError))
  7291. def _process_instance_event(self, instance, event):
  7292. _event = self.instance_events.pop_instance_event(instance, event)
  7293. if _event:
  7294. LOG.debug('Processing event %(event)s',
  7295. {'event': event.key}, instance=instance)
  7296. _event.send(event)
  7297. else:
  7298. # If it's a network-vif-unplugged event and the instance is being
  7299. # deleted then we don't need to make this a warning as it's
  7300. # expected. There are other things which could trigger this like
  7301. # detaching an interface, but we don't have a task state for that.
  7302. if (event.name == 'network-vif-unplugged' and
  7303. instance.task_state == task_states.DELETING):
  7304. LOG.debug('Received event %s for instance which is being '
  7305. 'deleted.', event.key, instance=instance)
  7306. else:
  7307. LOG.warning('Received unexpected event %(event)s for '
  7308. 'instance with vm_state %(vm_state)s and '
  7309. 'task_state %(task_state)s.',
  7310. {'event': event.key,
  7311. 'vm_state': instance.vm_state,
  7312. 'task_state': instance.task_state},
  7313. instance=instance)
  7314. def _process_instance_vif_deleted_event(self, context, instance,
  7315. deleted_vif_id):
  7316. # If an attached port is deleted by neutron, it needs to
  7317. # be detached from the instance.
  7318. # And info cache needs to be updated.
  7319. network_info = instance.info_cache.network_info
  7320. for index, vif in enumerate(network_info):
  7321. if vif['id'] == deleted_vif_id:
  7322. LOG.info('Neutron deleted interface %(intf)s; '
  7323. 'detaching it from the instance and '
  7324. 'deleting it from the info cache',
  7325. {'intf': vif['id']},
  7326. instance=instance)
  7327. del network_info[index]
  7328. base_net_api.update_instance_cache_with_nw_info(
  7329. self.network_api, context,
  7330. instance,
  7331. nw_info=network_info)
  7332. try:
  7333. self.driver.detach_interface(context, instance, vif)
  7334. except NotImplementedError:
  7335. # Not all virt drivers support attach/detach of interfaces
  7336. # yet (like Ironic), so just ignore this.
  7337. pass
  7338. except exception.NovaException as ex:
  7339. # If the instance was deleted before the interface was
  7340. # detached, just log it at debug.
  7341. log_level = (logging.DEBUG
  7342. if isinstance(ex, exception.InstanceNotFound)
  7343. else logging.WARNING)
  7344. LOG.log(log_level,
  7345. "Detach interface failed, "
  7346. "port_id=%(port_id)s, reason: %(msg)s",
  7347. {'port_id': deleted_vif_id, 'msg': ex},
  7348. instance=instance)
  7349. break
  7350. @wrap_instance_event(prefix='compute')
  7351. @wrap_instance_fault
  7352. def extend_volume(self, context, instance, extended_volume_id):
  7353. # If an attached volume is extended by cinder, it needs to
  7354. # be extended by virt driver so host can detect its new size.
  7355. # And bdm needs to be updated.
  7356. LOG.debug('Handling volume-extended event for volume %(vol)s',
  7357. {'vol': extended_volume_id}, instance=instance)
  7358. try:
  7359. bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
  7360. context, extended_volume_id, instance.uuid)
  7361. except exception.NotFound:
  7362. LOG.warning('Extend volume failed, '
  7363. 'volume %(vol)s is not attached to instance.',
  7364. {'vol': extended_volume_id},
  7365. instance=instance)
  7366. return
  7367. LOG.info('Cinder extended volume %(vol)s; '
  7368. 'extending it to detect new size',
  7369. {'vol': extended_volume_id},
  7370. instance=instance)
  7371. volume = self.volume_api.get(context, bdm.volume_id)
  7372. if bdm.connection_info is None:
  7373. LOG.warning('Extend volume failed, '
  7374. 'attached volume %(vol)s has no connection_info',
  7375. {'vol': extended_volume_id},
  7376. instance=instance)
  7377. return
  7378. connection_info = jsonutils.loads(bdm.connection_info)
  7379. bdm.volume_size = volume['size']
  7380. bdm.save()
  7381. if not self.driver.capabilities.get('supports_extend_volume', False):
  7382. raise exception.ExtendVolumeNotSupported()
  7383. try:
  7384. self.driver.extend_volume(connection_info,
  7385. instance,
  7386. bdm.volume_size * units.Gi)
  7387. except Exception as ex:
  7388. LOG.warning('Extend volume failed, '
  7389. 'volume_id=%(volume_id)s, reason: %(msg)s',
  7390. {'volume_id': extended_volume_id, 'msg': ex},
  7391. instance=instance)
  7392. raise
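        # Illustrative note (not upstream code): bdm.volume_size is stored in
        # GiB while the virt driver expects bytes, hence the
        # "bdm.volume_size * units.Gi" above; a 2 GiB volume, for example, is
        # passed to the driver as 2147483648.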
  7393. @wrap_exception()
  7394. def external_instance_event(self, context, instances, events):
  7395. # NOTE(danms): Some event types are handled by the manager, such
  7396. # as when we're asked to update the instance's info_cache. If it's
  7397. # not one of those, look for some thread(s) waiting for the event and
  7398. # unblock them if so.
  7399. for event in events:
  7400. instance = [inst for inst in instances
  7401. if inst.uuid == event.instance_uuid][0]
  7402. LOG.debug('Received event %(event)s',
  7403. {'event': event.key},
  7404. instance=instance)
  7405. if event.name == 'network-changed':
  7406. try:
  7407. LOG.debug('Refreshing instance network info cache due to '
  7408. 'event %s.', event.key, instance=instance)
  7409. self.network_api.get_instance_nw_info(
  7410. context, instance, refresh_vif_id=event.tag)
  7411. except exception.NotFound as e:
  7412. LOG.info('Failed to process external instance event '
  7413. '%(event)s due to: %(error)s',
  7414. {'event': event.key, 'error': six.text_type(e)},
  7415. instance=instance)
  7416. elif event.name == 'network-vif-deleted':
  7417. try:
  7418. # TODO(gibi): If the vif had resource allocation then
  7419. # it needs to be deallocated in placement.
  7420. self._process_instance_vif_deleted_event(context,
  7421. instance,
  7422. event.tag)
  7423. except exception.NotFound as e:
  7424. LOG.info('Failed to process external instance event '
  7425. '%(event)s due to: %(error)s',
  7426. {'event': event.key, 'error': six.text_type(e)},
  7427. instance=instance)
  7428. elif event.name == 'volume-extended':
  7429. self.extend_volume(context, instance, event.tag)
  7430. else:
  7431. self._process_instance_event(instance, event)
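        # Descriptive note (derived from the dispatch above): only
        # 'network-changed', 'network-vif-deleted' and 'volume-extended'
        # events are handled directly here; any other event name, e.g.
        # 'network-vif-plugged', is routed to _process_instance_event() to
        # wake up whatever thread is waiting on it via
        # self.instance_events.pop_instance_event().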
  7432. @periodic_task.periodic_task(spacing=CONF.image_cache_manager_interval,
  7433. external_process_ok=True)
  7434. def _run_image_cache_manager_pass(self, context):
  7435. """Run a single pass of the image cache manager."""
  7436. if not self.driver.capabilities.get("has_imagecache", False):
  7437. return
  7438. # Determine what other nodes use this storage
  7439. storage_users.register_storage_use(CONF.instances_path, CONF.host)
  7440. nodes = storage_users.get_storage_users(CONF.instances_path)
  7441. # Filter all_instances to only include those nodes which share this
  7442. # storage path.
  7443. # TODO(mikal): this should be further refactored so that the cache
  7444. # cleanup code doesn't know what those instances are, just a remote
  7445. # count, and then this logic should be pushed up the stack.
  7446. filters = {'deleted': False,
  7447. 'soft_deleted': True,
  7448. 'host': nodes}
  7449. filtered_instances = objects.InstanceList.get_by_filters(context,
  7450. filters, expected_attrs=[], use_slave=True)
  7451. self.driver.manage_image_cache(context, filtered_instances)
  7452. @periodic_task.periodic_task(spacing=CONF.instance_delete_interval)
  7453. def _run_pending_deletes(self, context):
  7454. """Retry any pending instance file deletes."""
  7455. LOG.debug('Cleaning up deleted instances')
  7456. filters = {'deleted': True,
  7457. 'soft_deleted': False,
  7458. 'host': CONF.host,
  7459. 'cleaned': False}
  7460. attrs = ['system_metadata']
  7461. with utils.temporary_mutation(context, read_deleted='yes'):
  7462. instances = objects.InstanceList.get_by_filters(
  7463. context, filters, expected_attrs=attrs, use_slave=True)
  7464. LOG.debug('There are %d instances to clean', len(instances))
  7465. for instance in instances:
  7466. attempts = int(instance.system_metadata.get('clean_attempts', '0'))
  7467. LOG.debug('Instance has had %(attempts)s of %(max)s '
  7468. 'cleanup attempts',
  7469. {'attempts': attempts,
  7470. 'max': CONF.maximum_instance_delete_attempts},
  7471. instance=instance)
  7472. if attempts < CONF.maximum_instance_delete_attempts:
  7473. success = self.driver.delete_instance_files(instance)
  7474. instance.system_metadata['clean_attempts'] = str(attempts + 1)
  7475. if success:
  7476. instance.cleaned = True
  7477. with utils.temporary_mutation(context, read_deleted='yes'):
  7478. instance.save()
  7479. @periodic_task.periodic_task(spacing=CONF.instance_delete_interval)
  7480. def _cleanup_incomplete_migrations(self, context):
  7481. """Delete instance files on failed resize/revert-resize operation
  7482. During resize/revert-resize operation, if that instance gets deleted
  7483. in-between then instance files might remain either on source or
  7484. destination compute node because of race condition.
  7485. """
        LOG.debug('Cleaning up deleted instances with incomplete migration')
  7487. migration_filters = {'host': CONF.host,
  7488. 'status': 'error'}
  7489. migrations = objects.MigrationList.get_by_filters(context,
  7490. migration_filters)
  7491. if not migrations:
  7492. return
  7493. inst_uuid_from_migrations = set([migration.instance_uuid for migration
  7494. in migrations])
  7495. inst_filters = {'deleted': True, 'soft_deleted': False,
  7496. 'uuid': inst_uuid_from_migrations}
  7497. attrs = ['info_cache', 'security_groups', 'system_metadata']
  7498. with utils.temporary_mutation(context, read_deleted='yes'):
  7499. instances = objects.InstanceList.get_by_filters(
  7500. context, inst_filters, expected_attrs=attrs, use_slave=True)
  7501. for instance in instances:
  7502. if instance.host != CONF.host:
  7503. for migration in migrations:
  7504. if instance.uuid == migration.instance_uuid:
                        # Delete instance files if they were not cleaned up
                        # properly on either the source or destination
                        # compute node when the instance was deleted during
                        # resizing.
  7508. self.driver.delete_instance_files(instance)
  7509. try:
  7510. migration.status = 'failed'
  7511. with migration.obj_as_admin():
  7512. migration.save()
  7513. except exception.MigrationNotFound:
  7514. LOG.warning("Migration %s is not found.",
  7515. migration.id,
  7516. instance=instance)
  7517. break
  7518. @messaging.expected_exceptions(exception.InstanceQuiesceNotSupported,
  7519. exception.QemuGuestAgentNotEnabled,
  7520. exception.NovaException,
  7521. NotImplementedError)
  7522. @wrap_exception()
  7523. def quiesce_instance(self, context, instance):
  7524. """Quiesce an instance on this host."""
  7525. context = context.elevated()
  7526. image_meta = objects.ImageMeta.from_instance(instance)
  7527. self.driver.quiesce(context, instance, image_meta)
  7528. def _wait_for_snapshots_completion(self, context, mapping):
  7529. for mapping_dict in mapping:
  7530. if mapping_dict.get('source_type') == 'snapshot':
  7531. def _wait_snapshot():
  7532. snapshot = self.volume_api.get_snapshot(
  7533. context, mapping_dict['snapshot_id'])
  7534. if snapshot.get('status') != 'creating':
  7535. raise loopingcall.LoopingCallDone()
  7536. timer = loopingcall.FixedIntervalLoopingCall(_wait_snapshot)
  7537. timer.start(interval=0.5).wait()
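        # Illustrative note (not upstream code): the FixedIntervalLoopingCall
        # above re-polls each snapshot every 0.5 seconds and stops, via
        # LoopingCallDone, as soon as the snapshot status is no longer
        # 'creating' (typically when it becomes 'available' or 'error').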
  7538. @messaging.expected_exceptions(exception.InstanceQuiesceNotSupported,
  7539. exception.QemuGuestAgentNotEnabled,
  7540. exception.NovaException,
  7541. NotImplementedError)
  7542. @wrap_exception()
  7543. def unquiesce_instance(self, context, instance, mapping=None):
  7544. """Unquiesce an instance on this host.
        If snapshots' image mapping is provided, it waits until snapshots are
        completed before unquiescing.
  7547. """
  7548. context = context.elevated()
  7549. if mapping:
  7550. try:
  7551. self._wait_for_snapshots_completion(context, mapping)
  7552. except Exception as error:
  7553. LOG.exception("Exception while waiting completion of "
  7554. "volume snapshots: %s",
  7555. error, instance=instance)
  7556. image_meta = objects.ImageMeta.from_instance(instance)
  7557. self.driver.unquiesce(context, instance, image_meta)
  7558. @periodic_task.periodic_task(spacing=CONF.instance_delete_interval)
  7559. def _cleanup_expired_console_auth_tokens(self, context):
  7560. """Remove expired console auth tokens for this host.
  7561. Console authorization tokens and their connection data are stored
  7562. in the database when a user asks for a console connection to an
  7563. instance. After a time they expire. We periodically remove any expired
  7564. tokens from the database.
  7565. """
  7566. # If the database backend isn't in use, don't bother looking for
  7567. # expired tokens. The database backend is not supported for cells v1.
  7568. if not CONF.cells.enable:
  7569. objects.ConsoleAuthToken.\
  7570. clean_expired_console_auths_for_host(context, self.host)