From 02094040ecb89346b612e9a2bd2d5145ea54261f Mon Sep 17 00:00:00 2001 From: Scott Hussey Date: Mon, 12 Jun 2017 15:15:17 -0500 Subject: [PATCH] DRYD-2 MVP - phase 1 - node enlistment Add node driver task for IdentifyNode Implement MaaS API interface for Machines and Interfaces --- drydock_provisioner/config.py | 4 +- drydock_provisioner/drivers/node/__init__.py | 1 + .../drivers/node/maasdriver/driver.py | 132 ++++++++++++- .../drivers/node/maasdriver/models/base.py | 24 ++- .../node/maasdriver/models/interface.py | 34 ++++ .../drivers/node/maasdriver/models/machine.py | 185 ++++++++++++++++++ .../drivers/oob/pyghmi_driver/__init__.py | 22 +-- drydock_provisioner/drivers/readme.md | 1 + drydock_provisioner/drydock.py | 2 +- drydock_provisioner/ingester/__init__.py | 6 +- drydock_provisioner/ingester/plugins/yaml.py | 2 + drydock_provisioner/objects/fields.py | 1 + drydock_provisioner/orchestrator/__init__.py | 127 +++++++++--- setup.py | 1 + 14 files changed, 491 insertions(+), 51 deletions(-) create mode 100644 drydock_provisioner/drivers/node/maasdriver/models/interface.py create mode 100644 drydock_provisioner/drivers/node/maasdriver/models/machine.py diff --git a/drydock_provisioner/config.py b/drydock_provisioner/config.py index a8c501e1..f64e91e9 100644 --- a/drydock_provisioner/config.py +++ b/drydock_provisioner/config.py @@ -27,8 +27,8 @@ class DrydockConfig(object): node_driver = { 'maasdriver': { - 'api_key': 'KTMHgA42cNSMnfmJ82:cdg4yQUhp542aHsCTV:7Dc2KB9hQpWq3LfQAAAKAj6wdg22yWxZ', - 'api_url': 'http://localhost:5240/MAAS/api/2.0/', + 'api_key': 'UTBfxGL69XWjaffQek:NuKZSYGuBs6ZpYC6B9:byvXBgY8CsW5VQKxGdQjvJXtjXwr5G4U', + 'api_url': 'http://10.23.19.16:30773/MAAS/api/2.0/', }, } diff --git a/drydock_provisioner/drivers/node/__init__.py b/drydock_provisioner/drivers/node/__init__.py index 48802905..a28c7cec 100644 --- a/drydock_provisioner/drivers/node/__init__.py +++ b/drydock_provisioner/drivers/node/__init__.py @@ -28,6 +28,7 @@ class NodeDriver(ProviderDriver): hd_fields.OrchestratorAction.CreateStorageTemplate, hd_fields.OrchestratorAction.CreateBootMedia, hd_fields.OrchestratorAction.PrepareHardwareConfig, + hd_fields.OrchestratorAction.IdentifyNode, hd_fields.OrchestratorAction.ConfigureHardware, hd_fields.OrchestratorAction.InterrogateNode, hd_fields.OrchestratorAction.ApplyNodeNetworking, diff --git a/drydock_provisioner/drivers/node/maasdriver/driver.py b/drydock_provisioner/drivers/node/maasdriver/driver.py index c6d0cc9d..f7079dcc 100644 --- a/drydock_provisioner/drivers/node/maasdriver/driver.py +++ b/drydock_provisioner/drivers/node/maasdriver/driver.py @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import time +import logging + import drydock_provisioner.error as errors import drydock_provisioner.config as config import drydock_provisioner.drivers as drivers @@ -22,6 +25,7 @@ from .api_client import MaasRequestFactory import drydock_provisioner.drivers.node.maasdriver.models.fabric as maas_fabric import drydock_provisioner.drivers.node.maasdriver.models.vlan as maas_vlan import drydock_provisioner.drivers.node.maasdriver.models.subnet as maas_subnet +import drydock_provisioner.drivers.node.maasdriver.models.machine as maas_machine class MaasNodeDriver(NodeDriver): @@ -34,6 +38,8 @@ class MaasNodeDriver(NodeDriver): self.config = config.DrydockConfig.node_driver[self.driver_key] + self.logger = logging.getLogger('drydock.nodedriver.maasdriver') + def execute_task(self, task_id): task = self.state_manager.get_task(task_id) @@ -104,6 +110,8 @@ class MaasNodeDriver(NodeDriver): site_design = self.orchestrator.get_effective_site(design_id) if task.action == hd_fields.OrchestratorAction.CreateNetworkTemplate: + self.orchestrator.task_field_update(task.get_id(), status=hd_fields.TaskStatus.Running) + subtask = self.orchestrator.create_task(task_model.DriverTask, parent_task_id=task.get_id(), design_id=design_id, action=task.action, site_name=task.site_name, @@ -111,8 +119,13 @@ class MaasNodeDriver(NodeDriver): runner = MaasTaskRunner(state_manager=self.state_manager, orchestrator=self.orchestrator, task_id=subtask.get_id(),config=self.config) + + self.logger.info("Starting thread for task %s to create network templates" % (subtask.get_id())) + runner.start() + # TODO Figure out coherent system for putting all the timeouts in + # the config runner.join(timeout=120) if runner.is_alive(): @@ -120,17 +133,85 @@ class MaasNodeDriver(NodeDriver): 'retry': False, 'detail': 'MaaS Network creation timed-out' } + self.logger.warn("Thread for task %s timed out after 120s" % (subtask.get_id())) self.orchestrator.task_field_update(task.get_id(), status=hd_fields.TaskStatus.Complete, result=hd_fields.ActionResult.Failure, result_detail=result) else: subtask = self.state_manager.get_task(subtask.get_id()) + self.logger.info("Thread for task %s completed - result %s" % (subtask.get_id(), subtask.get_result())) self.orchestrator.task_field_update(task.get_id(), status=hd_fields.TaskStatus.Complete, result=subtask.get_result()) return + elif task.action == hd_fields.OrchestratorAction.IdentifyNode: + self.orchestrator.task_field_update(task.get_id(), + status=hd_fields.TaskStatus.Running) + + subtasks = [] + + result_detail = { + 'detail': [] + } + + for n in task.node_list: + subtask = self.orchestrator.create_task(task_model.DriverTask, + parent_task_id=task.get_id(), design_id=design_id, + action=hd_fields.OrchestratorAction.IdentifyNode, + site_name=task.site_name, + task_scope={'site': task.site_name, 'node_names': [n]}) + runner = MaasTaskRunner(state_manager=self.state_manager, + orchestrator=self.orchestrator, + task_id=subtask.get_id(),config=self.config) + + self.logger.info("Starting thread for task %s to identify node %s" % (subtask.get_id(), n)) + + runner.start() + subtasks.append(subtask.get_id()) + + running_subtasks = len(subtasks) + attempts = 0 + worked = failed = False + + #TODO Add timeout to config + while running_subtasks > 0 and attempts < 3: + for t in subtasks: + subtask = self.state_manager.get_task(t) + + if subtask.status == hd_fields.TaskStatus.Complete: + self.logger.info("Task %s to identify node %s complete - status %s" % + (subtask.get_id(), n, subtask.get_result())) + + result_detail['detail'].extend(subtask.result_detail['detail']) + running_subtasks = running_subtasks - 1 + + if subtask.result in [hd_fields.ActionResult.Success, + hd_fields.ActionResult.PartialSuccess]: + worked = True + elif subtask.result in [hd_fields.ActionResult.Failure, + hd_fields.ActionResult.PartialSuccess]: + failed = True + + time.sleep(1 * 60) + attempts = attempts + 1 + + if running_subtasks > 0: + self.logger.warn("Time out for task %s before all subtask threads complete" % (task.get_id())) + result = hd_fields.ActionResult.DependentFailure + result_detail['detail'].append('Some subtasks did not complete before the timeout threshold') + if worked and failed: + result = hd_fields.ActionResult.PartialSuccess + elif worked: + result = hd_fields.ActionResult.Success + else: + result = hd_fields.ActionResult.Failure + + self.orchestrator.task_field_update(task.get_id(), + status=hd_fields.TaskStatus.Complete, + result=result, + result_detail=result_detail) class MaasTaskRunner(drivers.DriverTaskRunner): @@ -138,6 +219,7 @@ class MaasTaskRunner(drivers.DriverTaskRunner): super(MaasTaskRunner, self).__init__(**kwargs) self.driver_config = config + self.logger = logging.getLogger('drydock.nodedriver.maasdriver') def execute_task(self): task_action = self.task.action @@ -314,4 +396,52 @@ class MaasTaskRunner(drivers.DriverTaskRunner): self.orchestrator.task_field_update(self.task.get_id(), status=hd_fields.TaskStatus.Complete, result=action_result, - result_detail=result_detail) \ No newline at end of file + result_detail=result_detail) + elif task_action == hd_fields.OrchestratorAction.IdentifyNode: + try: + machine_list = maas_machine.Machines(self.maas_client) + machine_list.refresh() + except: + self.orchestrator.task_field_update(self.task.get_id(), + status=hd_fields.TaskStatus.Complete, + result=hd_fields.ActionResult.Failure, + result_detail={'detail': 'Error accessing MaaS Machines API', 'retry': True}) + return + + nodes = self.task.node_list + + result_detail = {'detail': []} + + worked = failed = False + + for n in nodes: + try: + node = site_design.get_baremetal_node(n) + machine = machine_list.identify_baremetal_node(node) + if machine is not None: + worked = True + result_detail['detail'].append("Node %s identified in MaaS" % n) + else: + failed = True + result_detail['detail'].append("Node %s not found in MaaS" % n) + except Exception as ex: + failed = True + result_detail['detail'].append("Error identifying node %s: %s" % (n, str(ex))) + + result = None + if worked and failed: + result = hd_fields.ActionResult.PartialSuccess + elif worked: + result = hd_fields.ActionResult.Success + elif failed: + result = hd_fields.ActionResult.Failure + + self.orchestrator.task_field_update(self.task.get_id(), + status=hd_fields.TaskStatus.Complete, + result=result, + result_detail=result_detail) + + + + + diff --git a/drydock_provisioner/drivers/node/maasdriver/models/base.py b/drydock_provisioner/drivers/node/maasdriver/models/base.py index fb033eec..7d6cbd49 100644 --- a/drydock_provisioner/drivers/node/maasdriver/models/base.py +++ b/drydock_provisioner/drivers/node/maasdriver/models/base.py @@ -13,6 +13,7 @@ # limitations under the License. import json import re +import logging import drydock_provisioner.error as errors """ @@ -28,6 +29,7 @@ class ResourceBase(object): def __init__(self, api_client, **kwargs): self.api_client = api_client + self.logger = logging.getLogger('drydock.drivers.maasdriver') for f in self.fields: if f in kwargs.keys(): @@ -143,13 +145,16 @@ class ResourceBase(object): return i -""" -A collection of MaaS resources. -Rather than a simple list, we will key the collection on resource -ID for more efficient access. -""" class ResourceCollectionBase(object): + """ + A collection of MaaS resources. + + Rather than a simple list, we will key the collection on resource + ID for more efficient access. + + :param api_client: An instance of api_client.MaasRequestFactory + """ collection_url = '' collection_resource = ResourceBase @@ -157,12 +162,13 @@ class ResourceCollectionBase(object): def __init__(self, api_client): self.api_client = api_client self.resources = {} + self.logger = logging.getLogger('drydock.drivers.maasdriver') - """ - Parse URL for placeholders and replace them with current - instance values - """ def interpolate_url(self): + """ + Parse URL for placeholders and replace them with current + instance values + """ pattern = '\{([a-z_]+)\}' regex = re.compile(pattern) start = 0 diff --git a/drydock_provisioner/drivers/node/maasdriver/models/interface.py b/drydock_provisioner/drivers/node/maasdriver/models/interface.py new file mode 100644 index 00000000..5c257279 --- /dev/null +++ b/drydock_provisioner/drivers/node/maasdriver/models/interface.py @@ -0,0 +1,34 @@ +# Copyright 2017 AT&T Intellectual Property. All other rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import drydock_provisioner.drivers.node.maasdriver.models.base as model_base + +class Interface(model_base.ResourceBase): + + resource_url = 'nodes/{system_id}/interfaces/{resource_id}/' + fields = ['resource_id', 'system_id', 'name', 'type', 'mac_address', 'vlan', + 'links', 'effective_mtu'] + json_fields = ['name', 'type', 'mac_address', 'vlan', 'links', 'effective_mtu'] + + def __init__(self, api_client, **kwargs): + super(Interface, self).__init__(api_client, **kwargs) + +class Interfaces(model_base.ResourceCollectionBase): + + collection_url = 'nodes/{system_id}/interfaces/' + collection_resource = Interface + + def __init__(self, api_client, **kwargs): + super(Interfaces, self).__init__(api_client) + self.system_id = kwargs.get('system_id', None) \ No newline at end of file diff --git a/drydock_provisioner/drivers/node/maasdriver/models/machine.py b/drydock_provisioner/drivers/node/maasdriver/models/machine.py new file mode 100644 index 00000000..f4ec3609 --- /dev/null +++ b/drydock_provisioner/drivers/node/maasdriver/models/machine.py @@ -0,0 +1,185 @@ +# Copyright 2017 AT&T Intellectual Property. All other rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import drydock_provisioner.drivers.node.maasdriver.models.base as model_base +import drydock_provisioner.drivers.node.maasdriver.models.interface as maas_interface +import bson +import yaml + +class Machine(model_base.ResourceBase): + + resource_url = 'machines/{resource_id}/' + fields = ['resource_id', 'hostname', 'power_type', 'power_state', 'power_parameters', 'interfaces', + 'boot_interface', 'memory', 'cpu_count', 'tag_names'] + json_fields = ['hostname', 'power_type'] + + def __init__(self, api_client, **kwargs): + super(Machine, self).__init__(api_client, **kwargs) + + # Replace generic dicts with interface collection model + if getattr(self, 'resource_id', None) is not None: + self.interfaces = maas_interface.Interfaces(api_client, system_id=self.resource_id) + self.interfaces.refresh() + + def get_power_params(self): + url = self.interpolate_url() + + resp = self.api_client.get(url, op='power_parameters') + + if resp.status_code == 200: + self.power_parameters = resp.json() + + def commission(self, debug=False): + url = self.interpolate_url() + + # If we want to debug this node commissioning, enable SSH + # after commissioning and leave the node powered up + + options = {'enable_ssh': '1' if debug else '0'} + + resp = self.api_client.post(url, op='commission', files=options) + + # Need to sort out how to handle exceptions + if not resp.ok: + raise Exception() + + def get_details(self): + url = self.interpolate_url() + + resp = self.api_client.get(url, op='details') + + if resp.status_code == 200: + detail_config = bson.loads(resp.text) + return detail_config + + + def to_dict(self): + """ + Serialize this resource instance into a dict matching the + MAAS representation of the resource + """ + data_dict = {} + + for f in self.json_fields: + if getattr(self, f, None) is not None: + if f == 'resource_id': + data_dict['system_id'] = getattr(self, f) + else: + data_dict[f] = getattr(self, f) + + return data_dict + + @classmethod + def from_dict(cls, api_client, obj_dict): + """ + Create a instance of this resource class based on a dict + of MaaS type attributes + + Customized for Machine due to use of system_id instead of id + as resource key + + :param api_client: Instance of api_client.MaasRequestFactory for accessing MaaS API + :param obj_dict: Python dict as parsed from MaaS API JSON representing this resource type + """ + + refined_dict = {k: obj_dict.get(k, None) for k in cls.fields} + + if 'system_id' in obj_dict.keys(): + refined_dict['resource_id'] = obj_dict.get('system_id') + + i = cls(api_client, **refined_dict) + return i + +class Machines(model_base.ResourceCollectionBase): + + collection_url = 'machines/' + collection_resource = Machine + + def __init__(self, api_client, **kwargs): + super(Machines, self).__init__(api_client) + + # Add the OOB power parameters to each machine instance + def collect_power_params(self): + for k, v in self.resources.items(): + v.get_power_params() + + + def identify_baremetal_node(self, node_model, update_name=True): + """ + Search all the defined MaaS Machines and attempt to match + one against the provided Drydock BaremetalNode model. Update + the MaaS instance with the correct hostname + + :param node_model: Instance of objects.node.BaremetalNode to search MaaS for matching resource + :param update_name: Whether Drydock should update the MaaS resource name to match the Drydock design + """ + node_oob_network = node_model.oob_network + node_oob_ip = node_model.get_network_address(node_oob_network) + + if node_oob_ip is None: + self.logger.warn("Node model missing OOB IP address") + raise ValueError('Node model missing OOB IP address') + + try: + self.collect_power_params() + + maas_node = self.singleton({'power_params.power_address': node_oob_ip}) + + self.logger.debug("Found MaaS resource %s matching Node %s" % (maas_node.resource_id, node_model.get_id())) + + if maas_node.hostname != node_model.name and update_name: + maas_node.hostname = node_model.name + maas_node.update() + self.logger.debug("Updated MaaS resource %s hostname to %s" % (maas_node.resource_id, node_model.name)) + return maas_node + + except ValueError as ve: + self.logger.warn("Error locating matching MaaS resource for OOB IP %s" % (node_oob_ip)) + return None + + def query(self, query): + """ + Custom query method to deal with complex fields + """ + result = list(self.resources.values()) + for (k, v) in query.items(): + if k.startswith('power_params.'): + field = k[13:] + result = [i for i in result + if str(getattr(i,'power_parameters', {}).get(field, None)) == str(v)] + else: + result = [i for i in result + if str(getattr(i, k, None)) == str(v)] + + return result + + + def add(self, res): + """ + Create a new resource in this collection in MaaS + + Customize as Machine resources use 'system_id' instead of 'id' + """ + data_dict = res.to_dict() + url = self.interpolate_url() + + resp = self.api_client.post(url, files=data_dict) + + if resp.status_code == 200: + resp_json = resp.json() + res.set_resource_id(resp_json.get('system_id')) + return res + + raise errors.DriverError("Failed updating MAAS url %s - return code %s" + % (url, resp.status_code)) \ No newline at end of file diff --git a/drydock_provisioner/drivers/oob/pyghmi_driver/__init__.py b/drydock_provisioner/drivers/oob/pyghmi_driver/__init__.py index a4378369..73c2bd5c 100644 --- a/drydock_provisioner/drivers/oob/pyghmi_driver/__init__.py +++ b/drydock_provisioner/drivers/oob/pyghmi_driver/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import time +import logging from pyghmi.ipmi.command import Command @@ -34,15 +35,19 @@ class PyghmiDriver(oob.OobDriver): self.driver_key = "pyghmi_driver" self.driver_desc = "Pyghmi OOB Driver" + self.logger = logging.getLogger('drydock.oobdriver.pyghmi') self.config = config.DrydockConfig.node_driver.get(self.driver_key, {}) def execute_task(self, task_id): task = self.state_manager.get_task(task_id) if task is None: + self.logger.error("Invalid task %s" % (task_id)) raise errors.DriverError("Invalid task %s" % (task_id)) if task.action not in self.supported_actions: + self.logger.error("Driver %s doesn't support task action %s" + % (self.driver_desc, task.action)) raise errors.DriverError("Driver %s doesn't support task action %s" % (self.driver_desc, task.action)) @@ -66,7 +71,7 @@ class PyghmiDriver(oob.OobDriver): result=hd_fields.ActionResult.Success) return - site_design = self.orchestrator.get_effective_site(design_id, task.site_name) + site_design = self.orchestrator.get_effective_site(design_id) target_nodes = [] @@ -118,13 +123,6 @@ class PyghmiDriver(oob.OobDriver): if x.get_result() in [hd_fields.ActionResult.PartialSuccess, hd_fields.ActionResult.Failure]] - print("Task %s successful subtasks: %s" % - (task.get_id(), len(success_subtasks))) - print("Task %s unsuccessful subtasks: %s" % - (task.get_id(), len(nosuccess_subtasks))) - print("Task %s total subtasks: %s" % - (task.get_id(), len(task.get_subtasks()))) - task_result = None if len(success_subtasks) > 0 and len(nosuccess_subtasks) > 0: task_result = hd_fields.ActionResult.PartialSuccess @@ -145,9 +143,11 @@ class PyghmiTaskRunner(drivers.DriverTaskRunner): def __init__(self, node=None, **kwargs): super(PyghmiTaskRunner, self).__init__(**kwargs) + self.logger = logging.getLogger('drydock.oobdriver.pyghmi') # We cheat here by providing the Node model instead # of making the runner source it from statemgmt if node is None: + self.logger.error("Did not specify target node") raise errors.DriverError("Did not specify target node") self.node = node @@ -172,7 +172,7 @@ class PyghmiTaskRunner(drivers.DriverTaskRunner): "task node scope") - ipmi_network = self.node.applied.get('oob_network') + ipmi_network = self.node.oob_network ipmi_address = self.node.get_network_address(ipmi_network) if ipmi_address is None: @@ -184,8 +184,8 @@ class PyghmiTaskRunner(drivers.DriverTaskRunner): self.orchestrator.task_field_update(self.task.get_id(), status=hd_fields.TaskStatus.Running) - ipmi_account = self.node.applied.get('oob_account', '') - ipmi_credential = self.node.applied.get('oob_credential', '') + ipmi_account = self.node.oob_account + ipmi_credential = self.node.oob_credential ipmi_session = Command(bmc=ipmi_address, userid=ipmi_account, password=ipmi_credential) diff --git a/drydock_provisioner/drivers/readme.md b/drydock_provisioner/drivers/readme.md index 0aab4c1c..63214dd7 100644 --- a/drydock_provisioner/drivers/readme.md +++ b/drydock_provisioner/drivers/readme.md @@ -32,6 +32,7 @@ and storage. * CreateStorageTemplate - Configure site-wide storage information in bootstrapper * CreateBootMedia - Ensure all needed boot media is available to the bootstrapper including external repositories * PrepareHardwareConfig - Prepare the bootstrapper to handle all hardware configuration actions (firmware updates, RAID configuration, driver installation) +* IdentifyNode - Correlate a node definition in the Drydock internal model with a node detected by the downstream node bootstrapper. * ConfigureHardware - Update and validate all hardware configurations on a node prior to deploying the OS on it * InterrogateNode - Interrogate the bootstrapper about node information. Depending on the current state of the node, this interrogation will produce different information. * ApplyNodeNetworking - Configure networking for a node diff --git a/drydock_provisioner/drydock.py b/drydock_provisioner/drydock.py index 93e91982..18e53a13 100644 --- a/drydock_provisioner/drydock.py +++ b/drydock_provisioner/drydock.py @@ -28,7 +28,7 @@ def start_drydock(): logger.setLevel(config.DrydockConfig.global_config.get('log_level')) ch = logging.StreamHandler() - formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(filename)s:%(funcName)s - %(message)s') ch.setFormatter(formatter) logger.addHandler(ch) diff --git a/drydock_provisioner/ingester/__init__.py b/drydock_provisioner/ingester/__init__.py index 0938f710..daaff310 100644 --- a/drydock_provisioner/ingester/__init__.py +++ b/drydock_provisioner/ingester/__init__.py @@ -83,7 +83,11 @@ class Ingester(object): self.logger.debug("Ingester:ingest_data ingesting design parts for design %s" % design_id) if plugin_name in self.registered_plugins: - design_items = self.registered_plugins[plugin_name].ingest_data(**kwargs) + try: + design_items = self.registered_plugins[plugin_name].ingest_data(**kwargs) + except ValueError as vex: + self.logger.warn("Ingester:ingest_data - Error process data - %s" % (str(vex))) + return None self.logger.debug("Ingester:ingest_data parsed %s design parts" % str(len(design_items))) for m in design_items: if context is not None: diff --git a/drydock_provisioner/ingester/plugins/yaml.py b/drydock_provisioner/ingester/plugins/yaml.py index 37e1ad06..d3a9c4dd 100644 --- a/drydock_provisioner/ingester/plugins/yaml.py +++ b/drydock_provisioner/ingester/plugins/yaml.py @@ -70,6 +70,8 @@ class YamlIngester(IngesterPlugin): def parse_docs(self, yaml_string): models = [] + self.logger.debug("yamlingester:parse_docs - Parsing YAML string \n%s" % (yaml_string)) + try: parsed_data = yaml.load_all(yaml_string) except yaml.YAMLError as err: diff --git a/drydock_provisioner/objects/fields.py b/drydock_provisioner/objects/fields.py index c6ac8ac3..2abb6098 100644 --- a/drydock_provisioner/objects/fields.py +++ b/drydock_provisioner/objects/fields.py @@ -44,6 +44,7 @@ class OrchestratorAction(BaseDrydockEnum): CreateStorageTemplate = 'create_storage_template' CreateBootMedia = 'create_boot_media' PrepareHardwareConfig = 'prepare_hardware_config' + IdentifyNode = 'identify_node' ConfigureHardware = 'configure_hardware' InterrogateNode = 'interrogate_node' ApplyNodeNetworking = 'apply_node_networking' diff --git a/drydock_provisioner/orchestrator/__init__.py b/drydock_provisioner/orchestrator/__init__.py index 3736faa2..69984336 100644 --- a/drydock_provisioner/orchestrator/__init__.py +++ b/drydock_provisioner/orchestrator/__init__.py @@ -16,6 +16,7 @@ import uuid import time import threading import importlib +import logging from copy import deepcopy @@ -33,6 +34,8 @@ class Orchestrator(object): self.state_manager = state_manager + self.logger = logging.getLogger('drydock.orchestrator') + if enabled_drivers is not None: oob_driver_name = enabled_drivers.get('oob', None) if oob_driver_name is not None: @@ -155,10 +158,14 @@ class Orchestrator(object): task_scope=task_scope, action=hd_fields.OrchestratorAction.CreateNetworkTemplate) + self.logger.info("Starting node driver task %s to create network templates" % (driver_task.get_id())) + driver.execute_task(driver_task.get_id()) driver_task = self.state_manager.get_task(driver_task.get_id()) + self.logger.info("Node driver task %s complete" % (driver_task.get_id())) + self.task_field_update(task_id, status=hd_fields.TaskStatus.Complete, result=driver_task.get_result()) @@ -167,12 +174,13 @@ class Orchestrator(object): self.task_field_update(task_id, status=hd_fields.TaskStatus.Running) - driver = self.enabled_drivers['oob'] + oob_driver = self.enabled_drivers['oob'] - if driver is None: + if oob_driver is None: self.task_field_update(task_id, status=hd_fields.TaskStatus.Errored, - result=hd_fields.ActionResult.Failure) + result=hd_fields.ActionResult.Failure, + result_detail={'detail': 'Error: No oob driver configured', 'retry': False}) return site_design = self.get_effective_site(design_id) @@ -186,30 +194,42 @@ class Orchestrator(object): task_scope = {'site' : task_site, 'node_names' : target_names} - driver_task = self.create_task(tasks.DriverTask, + oob_driver_task = self.create_task(tasks.DriverTask, parent_task_id=task.get_id(), design_id=design_id, - action=hd_fields.OrchestratorAction.InterrogateNode, + action=hd_fields.OrchestratorAction.InterrogateOob, task_scope=task_scope) - driver.execute_task(driver_task.get_id()) + oob_driver.execute_task(oob_driver_task.get_id()) - driver_task = self.state_manager.get_task(driver_task.get_id()) + oob_driver_task = self.state_manager.get_task(oob_driver_task.get_id()) self.task_field_update(task_id, status=hd_fields.TaskStatus.Complete, - result=driver_task.get_result()) + result=oob_driver_task.get_result()) return elif task.action == hd_fields.OrchestratorAction.PrepareNode: + failed = worked = False + self.task_field_update(task_id, status=hd_fields.TaskStatus.Running) - driver = self.enabled_drivers['oob'] + oob_driver = self.enabled_drivers['oob'] - if driver is None: + if oob_driver is None: self.task_field_update(task_id, status=hd_fields.TaskStatus.Errored, - result=hd_fields.ActionResult.Failure) + result=hd_fields.ActionResult.Failure, + result_detail={'detail': 'Error: No oob driver configured', 'retry': False}) + return + + node_driver = self.enabled_drivers['node'] + + if node_driver is None: + self.task_field_update(task_id, + status=hd_fields.TaskStatus.Errored, + result=hd_fields.ActionResult.Failure, + result_detail={'detail': 'Error: No node driver configured', 'retry': False}) return site_design = self.get_effective_site(design_id) @@ -229,33 +249,88 @@ class Orchestrator(object): action=hd_fields.OrchestratorAction.SetNodeBoot, task_scope=task_scope) - driver.execute_task(setboot_task.get_id()) + self.logger.info("Starting OOB driver task %s to set PXE boot" % (setboot_task.get_id())) + + oob_driver.execute_task(setboot_task.get_id()) + + self.logger.info("OOB driver task %s complete" % (setboot_task.get_id())) setboot_task = self.state_manager.get_task(setboot_task.get_id()) + if setboot_task.get_result() == hd_fields.ActionResult.Success: + worked = True + elif setboot_task.get_result() == hd_fields.ActionResult.PartialSuccess: + worked = failed = True + elif setboot_task.get_result() == hd_fields.ActionResult.Failure: + failed = True + cycle_task = self.create_task(tasks.DriverTask, parent_task_id=task.get_id(), design_id=design_id, action=hd_fields.OrchestratorAction.PowerCycleNode, task_scope=task_scope) - driver.execute_task(cycle_task.get_id()) + + self.logger.info("Starting OOB driver task %s to power cycle nodes" % (cycle_task.get_id())) + + oob_driver.execute_task(cycle_task.get_id()) + + self.logger.info("OOB driver task %s complete" % (cycle_task.get_id())) cycle_task = self.state_manager.get_task(cycle_task.get_id()) - if (setboot_task.get_result() == hd_fields.ActionResult.Success and - cycle_task.get_result() == hd_fields.ActionResult.Success): - self.task_field_update(task_id, - status=hd_fields.TaskStatus.Complete, - result=hd_fields.ActionResult.Success) - elif (setboot_task.get_result() == hd_fields.ActionResult.Success or - cycle_task.get_result() == hd_fields.ActionResult.Success): - self.task_field_update(task_id, - status=hd_fields.TaskStatus.Complete, - result=hd_fields.ActionResult.PartialSuccess) + if cycle_task.get_result() == hd_fields.ActionResult.Success: + worked = True + elif cycle_task.get_result() == hd_fields.ActionResult.PartialSuccess: + worked = failed = True + elif cycle_task.get_result() == hd_fields.ActionResult.Failure: + failed = True + + + # IdentifyNode success will take some time after PowerCycleNode finishes + # Retry the operation a few times if it fails before considering it a final failure + # Each attempt is a new task which might make the final task tree a bit confusing + + node_identify_attempts = 0 + + while True: + + node_identify_task = self.create_task(tasks.DriverTask, + parent_task_id=task.get_id(), + design_id=design_id, + action=hd_fields.OrchestratorAction.IdentifyNode, + task_scope=task_scope) + + self.logger.info("Starting node driver task %s to identify node - attempt %s" % + (node_identify_task.get_id(), node_identify_attempts+1)) + + node_driver.execute_task(node_identify_task.get_id()) + node_identify_attempts = node_identify_attempts + 1 + + node_identify_task = self.state_manager.get_task(node_identify_task.get_id()) + + if node_identify_task.get_result() == hd_fields.ActionResult.Success: + worked = True + break + elif node_identify_task.get_result() in [hd_fields.ActionResult.PartialSuccess, + hd_fields.ActionResult.Failure]: + # TODO This threshold should be a configurable default and tunable by task API + if node_identify_attempts > 2: + failed = True + break + + time.sleep(5 * 60) + + final_result = None + if worked and failed: + final_result = hd_fields.ActionResult.PartialSuccess + elif worked: + final_result = hd_fields.ActionResult.Success else: - self.task_field_update(task_id, - status=hd_fields.TaskStatus.Complete, - result=hd_fields.ActionResult.Failure) + final_result = hd_fields.ActionResult.Failure + + self.task_field_update(task_id, + status=hd_fields.TaskStatus.Complete, + result=final_result) return else: diff --git a/setup.py b/setup.py index dc97cc33..b090c19c 100644 --- a/setup.py +++ b/setup.py @@ -62,6 +62,7 @@ setup(name='drydock_provisioner', 'requests', 'oauthlib', 'uwsgi>1.4', + 'bson===0.4.7' ] )