b144c7ee11
Add the code for the YARN RM HA implementation. If the YARN_STANDBYRM role is enabled in a cluster, we perform validation and call the CM API enable_rm_ha to enable RM HA in the cluster. Partial-implements: blueprint cdh-ha-support Change-Id: I5562a310ef5b6fffa3439b28db2ceb212cc6286f
323 lines
14 KiB
Python
323 lines
14 KiB
Python
# Copyright (c) 2015 Intel Corporation
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
# implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from sahara.i18n import _
|
|
from sahara.plugins.cdh.v5_4_0 import plugin_utils as pu
|
|
from sahara.plugins import exceptions as ex
|
|
from sahara.plugins import utils as u
|
|
from sahara.utils import general as gu
|
|
|
|
# Shared CDH 5.4.0 plugin-utils helper used by the validators below
# (config value lookups and role queries).
PU = pu.PluginUtilsV540()
|
|
|
|
|
|
def validate_cluster_creating(cluster):
    """Validate the cluster topology before creation.

    Checks the instance count of every CDH role in *cluster* and the
    dependencies between services (HDFS, YARN, Oozie, Hive, Hue, Spark,
    HBase, Flume, Sentry, Solr, Sqoop, HBase indexer, Impala, KMS),
    including the HDFS NameNode HA and YARN ResourceManager HA layouts.

    :param cluster: the cluster object being validated.
    :raises ex.InvalidComponentCountException: a role has an unsupported
        number of instances.
    :raises ex.RequiredServiceMissingException: a role is present
        without a service it depends on.
    :raises ex.NameNodeHAConfigurationError: HDFS HA is requested with
        an invalid anti-affinity configuration.
    :raises ex.ResourceManagerHAConfigurationError: RM HA is requested
        with an invalid anti-affinity configuration.
    :raises ex.InvalidClusterTopology: Impala roles are not co-located
        with datanodes.
    """
    # Exactly one Cloudera Manager instance is required.
    mng_count = _get_inst_count(cluster, 'CLOUDERA_MANAGER')
    if mng_count != 1:
        raise ex.InvalidComponentCountException('CLOUDERA_MANAGER',
                                                1, mng_count)

    # HDFS: one namenode, one secondary namenode, and at least
    # dfs_replication datanodes.
    zk_count = _get_inst_count(cluster, 'ZOOKEEPER_SERVER')
    nn_count = _get_inst_count(cluster, 'HDFS_NAMENODE')
    if nn_count != 1:
        raise ex.InvalidComponentCountException('HDFS_NAMENODE', 1, nn_count)

    snn_count = _get_inst_count(cluster, 'HDFS_SECONDARYNAMENODE')
    if snn_count != 1:
        raise ex.InvalidComponentCountException('HDFS_SECONDARYNAMENODE', 1,
                                                snn_count)

    dn_count = _get_inst_count(cluster, 'HDFS_DATANODE')
    replicas = PU.get_config_value('HDFS', 'dfs_replication', cluster)
    if dn_count < replicas:
        raise ex.InvalidComponentCountException(
            'HDFS_DATANODE', replicas, dn_count,
            _('Number of datanodes must be not less than dfs_replication.'))

    # HDFS HA: any journalnode implies HA, which needs an odd number of
    # journalnodes (at least 3), ZooKeeper, and anti-affinity entries
    # for both namenode roles.
    jn_count = _get_inst_count(cluster, 'HDFS_JOURNALNODE')
    if jn_count > 0:
        if jn_count < 3:
            raise ex.InvalidComponentCountException('HDFS_JOURNALNODE',
                                                    _('not less than 3'),
                                                    jn_count)
        if not jn_count % 2:
            raise ex.InvalidComponentCountException('HDFS_JOURNALNODE',
                                                    _('be odd'), jn_count)
        if zk_count < 1:
            raise ex.RequiredServiceMissingException('ZOOKEEPER',
                                                     required_by='HDFS HA')
        if 'HDFS_SECONDARYNAMENODE' not in _get_anti_affinity(cluster):
            raise ex.NameNodeHAConfigurationError(_('HDFS_SECONDARYNAMENODE '
                                                    'should be enabled '
                                                    'in anti_affinity.'))
        if 'HDFS_NAMENODE' not in _get_anti_affinity(cluster):
            raise ex.NameNodeHAConfigurationError(_('HDFS_NAMENODE '
                                                    'should be enabled '
                                                    'in anti_affinity.'))

    # YARN: at most one resourcemanager and at most one standby RM.
    rm_count = _get_inst_count(cluster, 'YARN_RESOURCEMANAGER')
    if rm_count > 1:
        raise ex.InvalidComponentCountException('YARN_RESOURCEMANAGER',
                                                _('0 or 1'), rm_count)

    stdb_rm_count = _get_inst_count(cluster, 'YARN_STANDBYRM')
    if stdb_rm_count > 1:
        raise ex.InvalidComponentCountException('YARN_STANDBYRM',
                                                _('0 or 1'), stdb_rm_count)
    # RM HA: a standby RM needs an active RM, ZooKeeper, and
    # anti-affinity entries for both RM roles.
    if stdb_rm_count > 0:
        if rm_count < 1:
            raise ex.RequiredServiceMissingException('YARN_RESOURCEMANAGER',
                                                     required_by='RM HA')
        if zk_count < 1:
            raise ex.RequiredServiceMissingException('ZOOKEEPER',
                                                     required_by='RM HA')
        if 'YARN_RESOURCEMANAGER' not in _get_anti_affinity(cluster):
            raise ex.ResourceManagerHAConfigurationError(
                _('YARN_RESOURCEMANAGER should be enabled in anti_affinity.'))
        if 'YARN_STANDBYRM' not in _get_anti_affinity(cluster):
            raise ex.ResourceManagerHAConfigurationError(
                _('YARN_STANDBYRM should be enabled in anti_affinity.'))

    # YARN job history: at most one, and required whenever a
    # resourcemanager is deployed.
    hs_count = _get_inst_count(cluster, 'YARN_JOBHISTORY')
    if hs_count > 1:
        raise ex.InvalidComponentCountException('YARN_JOBHISTORY',
                                                _('0 or 1'), hs_count)

    if rm_count > 0 and hs_count < 1:
        raise ex.RequiredServiceMissingException(
            'YARN_JOBHISTORY', required_by='YARN_RESOURCEMANAGER')

    # Node managers cannot run without a resourcemanager.
    nm_count = _get_inst_count(cluster, 'YARN_NODEMANAGER')
    if rm_count == 0:
        if nm_count > 0:
            raise ex.RequiredServiceMissingException(
                'YARN_RESOURCEMANAGER', required_by='YARN_NODEMANAGER')

    # Oozie: at most one server; needs datanodes, node managers and a
    # job history server.
    oo_count = _get_inst_count(cluster, 'OOZIE_SERVER')
    if oo_count > 1:
        raise ex.InvalidComponentCountException('OOZIE_SERVER', _('0 or 1'),
                                                oo_count)

    if oo_count == 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='OOZIE_SERVER')

        if nm_count < 1:
            raise ex.RequiredServiceMissingException(
                'YARN_NODEMANAGER', required_by='OOZIE_SERVER')

        if hs_count != 1:
            raise ex.RequiredServiceMissingException(
                'YARN_JOBHISTORY', required_by='OOZIE_SERVER')

    # Hive: metastore needs YARN; metastore and server2 require each
    # other; webhcat requires the metastore.
    hms_count = _get_inst_count(cluster, 'HIVE_METASTORE')
    hvs_count = _get_inst_count(cluster, 'HIVE_SERVER2')
    whc_count = _get_inst_count(cluster, 'HIVE_WEBHCAT')

    if hms_count and rm_count < 1:
        raise ex.RequiredServiceMissingException(
            'YARN_RESOURCEMANAGER', required_by='HIVE_METASTORE')

    if hms_count and not hvs_count:
        raise ex.RequiredServiceMissingException(
            'HIVE_SERVER2', required_by='HIVE_METASTORE')

    if hvs_count and not hms_count:
        raise ex.RequiredServiceMissingException(
            'HIVE_METASTORE', required_by='HIVE_SERVER2')

    if whc_count and not hms_count:
        raise ex.RequiredServiceMissingException(
            'HIVE_METASTORE', required_by='HIVE_WEBHCAT')

    # Hue: at most one server; its Oozie/Hive dependencies are checked
    # further below.
    hue_count = _get_inst_count(cluster, 'HUE_SERVER')
    if hue_count > 1:
        raise ex.InvalidComponentCountException('HUE_SERVER', _('0 or 1'),
                                                hue_count)

    # Spark history server: at most one, and it needs YARN.
    shs_count = _get_inst_count(cluster, 'SPARK_YARN_HISTORY_SERVER')
    if shs_count > 1:
        raise ex.InvalidComponentCountException('SPARK_YARN_HISTORY_SERVER',
                                                _('0 or 1'), shs_count)
    if shs_count and not rm_count:
        raise ex.RequiredServiceMissingException(
            'YARN_RESOURCEMANAGER', required_by='SPARK_YARN_HISTORY_SERVER')

    if oo_count < 1 and hue_count:
        raise ex.RequiredServiceMissingException(
            'OOZIE_SERVER', required_by='HUE_SERVER')

    if hms_count < 1 and hue_count:
        raise ex.RequiredServiceMissingException(
            'HIVE_METASTORE', required_by='HUE_SERVER')

    # HBase: a master needs ZooKeeper and region servers; region
    # servers without a master are invalid.
    hbm_count = _get_inst_count(cluster, 'HBASE_MASTER')
    hbr_count = _get_inst_count(cluster, 'HBASE_REGIONSERVER')

    if hbm_count >= 1:
        if zk_count < 1:
            raise ex.RequiredServiceMissingException('ZOOKEEPER',
                                                     required_by='HBASE')
        if hbr_count < 1:
            raise ex.InvalidComponentCountException(
                'HBASE_REGIONSERVER', _('at least 1'), hbr_count)
    elif hbr_count >= 1:
        raise ex.InvalidComponentCountException('HBASE_MASTER',
                                                _('at least 1'), hbm_count)

    # Flume agents need datanodes.
    a_count = _get_inst_count(cluster, 'FLUME_AGENT')
    if a_count >= 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='FLUME_AGENT')

    # Sentry: at most one server; needs HDFS and ZooKeeper.
    snt_count = _get_inst_count(cluster, 'SENTRY_SERVER')
    if snt_count > 1:
        raise ex.InvalidComponentCountException('SENTRY_SERVER', _('0 or 1'),
                                                snt_count)
    if snt_count == 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='SENTRY_SERVER')
        if zk_count < 1:
            raise ex.RequiredServiceMissingException(
                'ZOOKEEPER', required_by='SENTRY_SERVER')

    # Solr needs HDFS and ZooKeeper.
    slr_count = _get_inst_count(cluster, 'SOLR_SERVER')
    if slr_count >= 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='SOLR_SERVER')
        if zk_count < 1:
            raise ex.RequiredServiceMissingException(
                'ZOOKEEPER', required_by='SOLR_SERVER')

    # Sqoop: at most one server; needs HDFS, node managers and the job
    # history server.
    s2s_count = _get_inst_count(cluster, 'SQOOP_SERVER')
    if s2s_count > 1:
        raise ex.InvalidComponentCountException('SQOOP_SERVER', _('0 or 1'),
                                                s2s_count)
    if s2s_count == 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='SQOOP_SERVER')
        if nm_count < 1:
            raise ex.RequiredServiceMissingException(
                'YARN_NODEMANAGER', required_by='SQOOP_SERVER')
        if hs_count != 1:
            raise ex.RequiredServiceMissingException(
                'YARN_JOBHISTORY', required_by='SQOOP_SERVER')

    # HBase indexer needs HDFS, ZooKeeper, Solr and HBase.
    lhbi_count = _get_inst_count(cluster, 'HBASE_INDEXER')
    if lhbi_count >= 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='HBASE_INDEXER')
        if zk_count < 1:
            raise ex.RequiredServiceMissingException(
                'ZOOKEEPER', required_by='HBASE_INDEXER')
        if slr_count < 1:
            raise ex.RequiredServiceMissingException(
                'SOLR_SERVER', required_by='HBASE_INDEXER')
        if hbm_count < 1:
            raise ex.RequiredServiceMissingException(
                'HBASE_MASTER', required_by='HBASE_INDEXER')

    # Impala: at most one catalog server / statestore; when deployed,
    # IMPALAD must sit on exactly the datanode instances.
    ics_count = _get_inst_count(cluster, 'IMPALA_CATALOGSERVER')
    iss_count = _get_inst_count(cluster, 'IMPALA_STATESTORE')
    id_count = _get_inst_count(cluster, 'IMPALAD')
    if ics_count > 1:
        raise ex.InvalidComponentCountException('IMPALA_CATALOGSERVER',
                                                _('0 or 1'), ics_count)
    if iss_count > 1:
        raise ex.InvalidComponentCountException('IMPALA_STATESTORE',
                                                _('0 or 1'), iss_count)
    if ics_count == 1:
        # Symmetric difference is empty only when the IMPALAD and
        # HDFS_DATANODE instance sets coincide exactly.
        datanodes = set(u.get_instances(cluster, "HDFS_DATANODE"))
        impalads = set(u.get_instances(cluster, "IMPALAD"))
        if len(datanodes ^ impalads) > 0:
            raise ex.InvalidClusterTopology(
                _("IMPALAD must be installed on every HDFS_DATANODE"))

        if iss_count != 1:
            raise ex.RequiredServiceMissingException(
                'IMPALA_STATESTORE', required_by='IMPALA')
        if id_count < 1:
            raise ex.RequiredServiceMissingException(
                'IMPALAD', required_by='IMPALA')
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'HDFS_DATANODE', required_by='IMPALA')
        if hms_count < 1:
            raise ex.RequiredServiceMissingException(
                'HIVE_METASTORE', required_by='IMPALA')

    # KMS: at most one instance.
    kms_count = _get_inst_count(cluster, 'KMS')
    if kms_count > 1:
        raise ex.InvalidComponentCountException('KMS',
                                                _('0 or 1'), kms_count)
|
|
|
|
|
|
def validate_additional_ng_scaling(cluster, additional):
    """Check that every node group being added can be scaled.

    Raises NodeGroupCannotBeScaled when a group carries a process the
    CDH plugin cannot scale, or when node managers would be added while
    the cluster has no resourcemanager.

    :param cluster: the cluster being scaled.
    :param additional: ids of the node groups to be added.
    """
    resourcemanager = PU.get_resourcemanager(cluster)
    allowed = set(_get_scalable_processes())

    for group_id in additional:
        group = gu.get_by_id(cluster.node_groups, group_id)

        # Any process outside the scalable set blocks the whole group.
        if set(group.node_processes) - allowed:
            msg = _("CDH plugin cannot scale nodegroup with processes: "
                    "%(processes)s")
            raise ex.NodeGroupCannotBeScaled(
                group.name,
                msg % {'processes': ' '.join(group.node_processes)})

        # Node managers make no sense without a running resourcemanager.
        if 'YARN_NODEMANAGER' in group.node_processes and not resourcemanager:
            msg = _("CDH plugin cannot scale node group with processes "
                    "which have no master-processes run in cluster")
            raise ex.NodeGroupCannotBeScaled(group.name, msg)
|
|
|
|
|
|
def validate_existing_ng_scaling(cluster, existing):
    """Check that existing node groups can be resized as requested.

    Raises NodeGroupCannotBeScaled for a group with non-scalable
    processes, and ClusterCannotBeScaled when shrinking would leave
    fewer datanodes than the configured dfs_replication.

    :param cluster: the cluster being scaled.
    :param existing: mapping of node group id to its new instance count.
    """
    allowed = set(_get_scalable_processes())
    datanodes_to_remove = 0

    for group in cluster.node_groups:
        if group.id not in existing:
            continue

        target = existing[group.id]
        # Track how many datanodes the resize would remove.
        if group.count > target and 'HDFS_DATANODE' in group.node_processes:
            datanodes_to_remove += group.count - target

        if not allowed.issuperset(group.node_processes):
            msg = _("CDH plugin cannot scale nodegroup with processes: "
                    "%(processes)s")
            raise ex.NodeGroupCannotBeScaled(
                group.name,
                msg % {'processes': ' '.join(group.node_processes)})

    remaining = _get_inst_count(cluster, 'HDFS_DATANODE') - datanodes_to_remove
    replicas = PU.get_config_value('HDFS', 'dfs_replication', cluster)
    if remaining < replicas:
        raise ex.ClusterCannotBeScaled(
            cluster,
            _('Number of datanodes must be not less than dfs_replication.'))
|
|
|
|
|
|
def _get_scalable_processes():
|
|
return ['HDFS_DATANODE', 'YARN_NODEMANAGER']
|
|
|
|
|
|
def _get_inst_count(cluster, process):
    """Return the total number of instances of *process* in *cluster*.

    Sums the ``count`` of every node group that contains the process.
    Uses a generator expression instead of building a throwaway list.
    """
    return sum(ng.count for ng in u.get_node_groups(cluster, process))
|
|
|
|
|
|
def _get_anti_affinity(cluster):
|
|
return cluster.anti_affinity
|