deb-sahara/sahara/plugins/cdh/v5_4_0/validation.py
Ken Chen b144c7ee11 Enable YARN ResourceManager HA in CDH plugin
We add the code for the YARN RM HA implementation. If the
YARN_STANDBYRM role is enabled in a cluster, we perform additional
validation and call the CM API enable_rm_ha to enable RM HA in the
cluster.

Partial-implements: blueprint cdh-ha-support
Change-Id: I5562a310ef5b6fffa3439b28db2ceb212cc6286f
2015-08-25 09:59:55 +08:00
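
In short, the new rule: a standby RM is only valid alongside an active
RM, a ZooKeeper server, and anti-affinity on both RM roles. A minimal
stand-alone sketch of that rule (check_rm_ha is a hypothetical
illustration, not part of this change; the real checks live in
validate_cluster_creating below):

def check_rm_ha(rm_count, stdb_rm_count, zk_count, anti_affinity):
    # At most one standby RM is allowed.
    if stdb_rm_count > 1:
        return 'YARN_STANDBYRM count must be 0 or 1'
    if stdb_rm_count == 1:
        # A standby RM needs an active RM and ZooKeeper ...
        if rm_count < 1:
            return 'YARN_RESOURCEMANAGER is required for RM HA'
        if zk_count < 1:
            return 'ZOOKEEPER is required for RM HA'
        # ... and both RM roles spread across hosts via anti-affinity.
        for role in ('YARN_RESOURCEMANAGER', 'YARN_STANDBYRM'):
            if role not in anti_affinity:
                return '%s should be enabled in anti_affinity' % role
    return None  # topology is valid

# e.g. check_rm_ha(1, 1, 3, ['YARN_RESOURCEMANAGER', 'YARN_STANDBYRM'])
# returns None (valid), while check_rm_ha(0, 1, 3, []) reports the
# missing active RM.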

# Copyright (c) 2015 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from sahara.i18n import _
from sahara.plugins.cdh.v5_4_0 import plugin_utils as pu
from sahara.plugins import exceptions as ex
from sahara.plugins import utils as u
from sahara.utils import general as gu

PU = pu.PluginUtilsV540()


def validate_cluster_creating(cluster):
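    """Validate the cluster topology at creation time.

    Checks role counts and inter-service dependencies for the CDH 5.4.0
    plugin, raising a plugin exception on the first violated constraint.
    """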
    mng_count = _get_inst_count(cluster, 'CLOUDERA_MANAGER')
    if mng_count != 1:
        raise ex.InvalidComponentCountException('CLOUDERA_MANAGER',
                                                1, mng_count)

    zk_count = _get_inst_count(cluster, 'ZOOKEEPER_SERVER')

    nn_count = _get_inst_count(cluster, 'HDFS_NAMENODE')
    if nn_count != 1:
        raise ex.InvalidComponentCountException('HDFS_NAMENODE', 1, nn_count)

    snn_count = _get_inst_count(cluster, 'HDFS_SECONDARYNAMENODE')
    if snn_count != 1:
        raise ex.InvalidComponentCountException('HDFS_SECONDARYNAMENODE', 1,
                                                snn_count)

    dn_count = _get_inst_count(cluster, 'HDFS_DATANODE')
    replicas = PU.get_config_value('HDFS', 'dfs_replication', cluster)
    if dn_count < replicas:
        raise ex.InvalidComponentCountException(
            'HDFS_DATANODE', replicas, dn_count,
            _('Number of datanodes must not be less than dfs_replication.'))
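
    # HDFS HA: JournalNodes are optional, but when present there must be
    # an odd number of them, no fewer than three, ZooKeeper must be
    # deployed, and both NameNode roles must be listed in anti_affinity.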
    jn_count = _get_inst_count(cluster, 'HDFS_JOURNALNODE')
    if jn_count > 0:
        if jn_count < 3:
            raise ex.InvalidComponentCountException('HDFS_JOURNALNODE',
                                                    _('not less than 3'),
                                                    jn_count)
        if not jn_count % 2:
            raise ex.InvalidComponentCountException('HDFS_JOURNALNODE',
                                                    _('be odd'), jn_count)
        if zk_count < 1:
            raise ex.RequiredServiceMissingException('ZOOKEEPER',
                                                     required_by='HDFS HA')
        if 'HDFS_SECONDARYNAMENODE' not in _get_anti_affinity(cluster):
            raise ex.NameNodeHAConfigurationError(_('HDFS_SECONDARYNAMENODE '
                                                    'should be enabled '
                                                    'in anti_affinity.'))
        if 'HDFS_NAMENODE' not in _get_anti_affinity(cluster):
            raise ex.NameNodeHAConfigurationError(_('HDFS_NAMENODE '
                                                    'should be enabled '
                                                    'in anti_affinity.'))
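
    # YARN RM HA: at most one active and one standby RM are allowed; a
    # standby RM additionally requires an active RM, ZooKeeper, and both
    # RM roles listed in anti_affinity.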
    rm_count = _get_inst_count(cluster, 'YARN_RESOURCEMANAGER')
    if rm_count > 1:
        raise ex.InvalidComponentCountException('YARN_RESOURCEMANAGER',
                                                _('0 or 1'), rm_count)

    stdb_rm_count = _get_inst_count(cluster, 'YARN_STANDBYRM')
    if stdb_rm_count > 1:
        raise ex.InvalidComponentCountException('YARN_STANDBYRM',
                                                _('0 or 1'), stdb_rm_count)
    if stdb_rm_count > 0:
        if rm_count < 1:
            raise ex.RequiredServiceMissingException('YARN_RESOURCEMANAGER',
                                                     required_by='RM HA')
        if zk_count < 1:
            raise ex.RequiredServiceMissingException('ZOOKEEPER',
                                                     required_by='RM HA')
        if 'YARN_RESOURCEMANAGER' not in _get_anti_affinity(cluster):
            raise ex.ResourceManagerHAConfigurationError(
                _('YARN_RESOURCEMANAGER should be enabled in anti_affinity.'))
        if 'YARN_STANDBYRM' not in _get_anti_affinity(cluster):
            raise ex.ResourceManagerHAConfigurationError(
                _('YARN_STANDBYRM should be enabled in anti_affinity.'))

    hs_count = _get_inst_count(cluster, 'YARN_JOBHISTORY')
    if hs_count > 1:
        raise ex.InvalidComponentCountException('YARN_JOBHISTORY',
                                                _('0 or 1'), hs_count)

    if rm_count > 0 and hs_count < 1:
        raise ex.RequiredServiceMissingException(
            'YARN_JOBHISTORY', required_by='YARN_RESOURCEMANAGER')

    nm_count = _get_inst_count(cluster, 'YARN_NODEMANAGER')
    if rm_count == 0:
        if nm_count > 0:
            raise ex.RequiredServiceMissingException(
                'YARN_RESOURCEMANAGER', required_by='YARN_NODEMANAGER')

    oo_count = _get_inst_count(cluster, 'OOZIE_SERVER')
    if oo_count > 1:
        raise ex.InvalidComponentCountException('OOZIE_SERVER', _('0 or 1'),
                                                oo_count)

    if oo_count == 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='OOZIE_SERVER')
        if nm_count < 1:
            raise ex.RequiredServiceMissingException(
                'YARN_NODEMANAGER', required_by='OOZIE_SERVER')
        if hs_count != 1:
            raise ex.RequiredServiceMissingException(
                'YARN_JOBHISTORY', required_by='OOZIE_SERVER')

    hms_count = _get_inst_count(cluster, 'HIVE_METASTORE')
    hvs_count = _get_inst_count(cluster, 'HIVE_SERVER2')
    whc_count = _get_inst_count(cluster, 'HIVE_WEBHCAT')
    if hms_count and rm_count < 1:
        raise ex.RequiredServiceMissingException(
            'YARN_RESOURCEMANAGER', required_by='HIVE_METASTORE')
    if hms_count and not hvs_count:
        raise ex.RequiredServiceMissingException(
            'HIVE_SERVER2', required_by='HIVE_METASTORE')
    if hvs_count and not hms_count:
        raise ex.RequiredServiceMissingException(
            'HIVE_METASTORE', required_by='HIVE_SERVER2')
    if whc_count and not hms_count:
        raise ex.RequiredServiceMissingException(
            'HIVE_METASTORE', required_by='HIVE_WEBHCAT')

    hue_count = _get_inst_count(cluster, 'HUE_SERVER')
    if hue_count > 1:
        raise ex.InvalidComponentCountException('HUE_SERVER', _('0 or 1'),
                                                hue_count)

    shs_count = _get_inst_count(cluster, 'SPARK_YARN_HISTORY_SERVER')
    if shs_count > 1:
        raise ex.InvalidComponentCountException('SPARK_YARN_HISTORY_SERVER',
                                                _('0 or 1'), shs_count)
    if shs_count and not rm_count:
        raise ex.RequiredServiceMissingException(
            'YARN_RESOURCEMANAGER', required_by='SPARK_YARN_HISTORY_SERVER')

    if oo_count < 1 and hue_count:
        raise ex.RequiredServiceMissingException(
            'OOZIE_SERVER', required_by='HUE_SERVER')
    if hms_count < 1 and hue_count:
        raise ex.RequiredServiceMissingException(
            'HIVE_METASTORE', required_by='HUE_SERVER')

    hbm_count = _get_inst_count(cluster, 'HBASE_MASTER')
    hbr_count = _get_inst_count(cluster, 'HBASE_REGIONSERVER')
    if hbm_count >= 1:
        if zk_count < 1:
            raise ex.RequiredServiceMissingException('ZOOKEEPER',
                                                     required_by='HBASE')
        if hbr_count < 1:
            raise ex.InvalidComponentCountException(
                'HBASE_REGIONSERVER', _('at least 1'), hbr_count)
    elif hbr_count >= 1:
        raise ex.InvalidComponentCountException('HBASE_MASTER',
                                                _('at least 1'), hbm_count)

    a_count = _get_inst_count(cluster, 'FLUME_AGENT')
    if a_count >= 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='FLUME_AGENT')

    snt_count = _get_inst_count(cluster, 'SENTRY_SERVER')
    if snt_count > 1:
        raise ex.InvalidComponentCountException('SENTRY_SERVER', _('0 or 1'),
                                                snt_count)
    if snt_count == 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='SENTRY_SERVER')
        if zk_count < 1:
            raise ex.RequiredServiceMissingException(
                'ZOOKEEPER', required_by='SENTRY_SERVER')

    slr_count = _get_inst_count(cluster, 'SOLR_SERVER')
    if slr_count >= 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='SOLR_SERVER')
        if zk_count < 1:
            raise ex.RequiredServiceMissingException(
                'ZOOKEEPER', required_by='SOLR_SERVER')

    s2s_count = _get_inst_count(cluster, 'SQOOP_SERVER')
    if s2s_count > 1:
        raise ex.InvalidComponentCountException('SQOOP_SERVER', _('0 or 1'),
                                                s2s_count)
    if s2s_count == 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='SQOOP_SERVER')
        if nm_count < 1:
            raise ex.RequiredServiceMissingException(
                'YARN_NODEMANAGER', required_by='SQOOP_SERVER')
        if hs_count != 1:
            raise ex.RequiredServiceMissingException(
                'YARN_JOBHISTORY', required_by='SQOOP_SERVER')

    lhbi_count = _get_inst_count(cluster, 'HBASE_INDEXER')
    if lhbi_count >= 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='HBASE_INDEXER')
        if zk_count < 1:
            raise ex.RequiredServiceMissingException(
                'ZOOKEEPER', required_by='HBASE_INDEXER')
        if slr_count < 1:
            raise ex.RequiredServiceMissingException(
                'SOLR_SERVER', required_by='HBASE_INDEXER')
        if hbm_count < 1:
            raise ex.RequiredServiceMissingException(
                'HBASE_MASTER', required_by='HBASE_INDEXER')
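
    # Impala: a catalog server requires IMPALAD on every HDFS_DATANODE
    # (and vice versa), plus exactly one statestore and a Hive metastore.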
    ics_count = _get_inst_count(cluster, 'IMPALA_CATALOGSERVER')
    iss_count = _get_inst_count(cluster, 'IMPALA_STATESTORE')
    id_count = _get_inst_count(cluster, 'IMPALAD')
    if ics_count > 1:
        raise ex.InvalidComponentCountException('IMPALA_CATALOGSERVER',
                                                _('0 or 1'), ics_count)
    if iss_count > 1:
        raise ex.InvalidComponentCountException('IMPALA_STATESTORE',
                                                _('0 or 1'), iss_count)
    if ics_count == 1:
        datanodes = set(u.get_instances(cluster, "HDFS_DATANODE"))
        impalads = set(u.get_instances(cluster, "IMPALAD"))
        if len(datanodes ^ impalads) > 0:
            raise ex.InvalidClusterTopology(
                _("IMPALAD must be installed on every HDFS_DATANODE"))
        if iss_count != 1:
            raise ex.RequiredServiceMissingException(
                'IMPALA_STATESTORE', required_by='IMPALA')
        if id_count < 1:
            raise ex.RequiredServiceMissingException(
                'IMPALAD', required_by='IMPALA')
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='IMPALA')
        if hms_count < 1:
            raise ex.RequiredServiceMissingException(
                'HIVE_METASTORE', required_by='IMPALA')

    kms_count = _get_inst_count(cluster, 'KMS')
    if kms_count > 1:
        raise ex.InvalidComponentCountException('KMS',
                                                _('0 or 1'), kms_count)


def validate_additional_ng_scaling(cluster, additional):
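    """Validate node groups added by a scaling request.

    Only scalable processes may be added, and NodeManagers require a
    ResourceManager to be present in the cluster.
    """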
    rm = PU.get_resourcemanager(cluster)
    scalable_processes = _get_scalable_processes()

    for ng_id in additional:
        ng = gu.get_by_id(cluster.node_groups, ng_id)
        if not set(ng.node_processes).issubset(scalable_processes):
            msg = _("CDH plugin cannot scale nodegroup with processes: "
                    "%(processes)s")
            raise ex.NodeGroupCannotBeScaled(
                ng.name, msg % {'processes': ' '.join(ng.node_processes)})

        if not rm and 'YARN_NODEMANAGER' in ng.node_processes:
            msg = _("CDH plugin cannot scale a node group with processes "
                    "that have no master process running in the cluster")
            raise ex.NodeGroupCannotBeScaled(ng.name, msg)


def validate_existing_ng_scaling(cluster, existing):
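    """Validate new instance counts for existing node groups.

    Only node groups with scalable processes may be resized, and
    removing datanodes must not leave fewer than dfs_replication.
    """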
    scalable_processes = _get_scalable_processes()
    dn_to_delete = 0
    for ng in cluster.node_groups:
        if ng.id in existing:
            if (ng.count > existing[ng.id] and
                    'HDFS_DATANODE' in ng.node_processes):
                dn_to_delete += ng.count - existing[ng.id]
            if not set(ng.node_processes).issubset(scalable_processes):
                msg = _("CDH plugin cannot scale nodegroup with processes: "
                        "%(processes)s")
                raise ex.NodeGroupCannotBeScaled(
                    ng.name, msg % {'processes': ' '.join(ng.node_processes)})

    dn_count = _get_inst_count(cluster, 'HDFS_DATANODE') - dn_to_delete
    replicas = PU.get_config_value('HDFS', 'dfs_replication', cluster)
    if dn_count < replicas:
        raise ex.ClusterCannotBeScaled(
            cluster,
            _('Number of datanodes must not be less than dfs_replication.'))


def _get_scalable_processes():
    return ['HDFS_DATANODE', 'YARN_NODEMANAGER']


def _get_inst_count(cluster, process):
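    """Return the total instance count of a process across node groups."""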
    return sum([ng.count for ng in u.get_node_groups(cluster, process)])


def _get_anti_affinity(cluster):
    return cluster.anti_affinity