From 7cf61bd313b168f4c45ba70f84deef520b3f6842 Mon Sep 17 00:00:00 2001 From: Ethan Gafford Date: Mon, 26 Jan 2015 13:25:04 -0500 Subject: [PATCH] Spark Temporary Job Data Retention and Cleanup Introduces a periodic task for the cleanup of data from Spark jobs, in order to ease maintenance of long-lived clusters. Change-Id: Ia7dc2dde54ab62199a630c3d1b64c76f08698181 Implements: blueprint spark-cleanup --- sahara/plugins/spark/config_helper.py | 45 ++++++++++++++++- sahara/plugins/spark/plugin.py | 17 ++++++- .../spark/resources/spark-cleanup.cron | 2 + .../spark/resources/tmp-cleanup.sh.template | 48 +++++++++++++++++++ .../unit/plugins/spark/test_config_helper.py | 33 +++++++++++++ .../tests/unit/plugins/spark/test_plugin.py | 37 ++++++++++++++ 6 files changed, 180 insertions(+), 2 deletions(-) create mode 100644 sahara/plugins/spark/resources/spark-cleanup.cron create mode 100644 sahara/plugins/spark/resources/tmp-cleanup.sh.template diff --git a/sahara/plugins/spark/config_helper.py b/sahara/plugins/spark/config_helper.py index c9924e1c..b68aa161 100644 --- a/sahara/plugins/spark/config_helper.py +++ b/sahara/plugins/spark/config_helper.py @@ -22,6 +22,7 @@ from sahara.openstack.common import log as logging from sahara.plugins import provisioning as p from sahara.plugins import utils from sahara.topology import topology_helper as topology +from sahara.utils import files as f from sahara.utils import types as types from sahara.utils import xmlutils as x @@ -98,7 +99,28 @@ SPARK_CONFS = { ' (default: /opt/spark)', 'default': '/opt/spark', 'priority': 2, - } + }, + { + 'name': 'Minimum cleanup seconds', + 'description': 'Job data will never be purged before this' + ' amount of time elapses (default: 86400 = 1 day)', + 'default': '86400', + 'priority': 2, + }, + { + 'name': 'Maximum cleanup seconds', + 'description': 'Job data will always be purged after this' + ' amount of time elapses (default: 1209600 = 14 days)', + 'default': '1209600', + 'priority': 2, + }, + { + 'name': 'Minimum cleanup megabytes', + 'description': 'No job data will be purged unless the total' + ' job data exceeds this size (default: 4096 = 4GB)', + 'default': '4096', + 'priority': 2, + }, ] } } @@ -375,6 +397,27 @@ def generate_hadoop_setup_script(storage_paths, env_configs): return "\n".join(script_lines) +def generate_job_cleanup_config(cluster): + args = { + 'minimum_cleanup_megabytes': get_config_value( + "Spark", "Minimum cleanup megabytes", cluster), + 'minimum_cleanup_seconds': get_config_value( + "Spark", "Minimum cleanup seconds", cluster), + 'maximum_cleanup_seconds': get_config_value( + "Spark", "Maximum cleanup seconds", cluster) + } + job_conf = {'valid': (args['maximum_cleanup_seconds'] > 0 and + (args['minimum_cleanup_megabytes'] > 0 + and args['minimum_cleanup_seconds'] > 0))} + if job_conf['valid']: + job_conf['cron'] = f.get_file_text( + 'plugins/spark/resources/spark-cleanup.cron'), + job_cleanup_script = f.get_file_text( + 'plugins/spark/resources/tmp-cleanup.sh.template') + job_conf['script'] = job_cleanup_script.format(**args) + return job_conf + + def extract_name_values(configs): return dict((cfg['name'], cfg['value']) for cfg in configs) diff --git a/sahara/plugins/spark/plugin.py b/sahara/plugins/spark/plugin.py index 22228cc6..23b23008 100644 --- a/sahara/plugins/spark/plugin.py +++ b/sahara/plugins/spark/plugin.py @@ -150,6 +150,7 @@ class SparkProvider(p.ProvisioningPluginBase): else: config_slaves = "\n" + extra['job_cleanup'] = c_helper.generate_job_cleanup_config(cluster) for ng in cluster.node_groups: extra[ng.id] = { 'xml': c_helper.generate_xml_configs( @@ -273,6 +274,7 @@ class SparkProvider(p.ProvisioningPluginBase): self._write_topology_data(r, cluster, extra) self._push_master_configs(r, cluster, extra, instance) + self._push_cleanup_job(r, cluster, extra, instance) def _push_configs_to_existing_node(self, cluster, extra, instance): node_processes = instance.node_group.node_processes @@ -291,6 +293,7 @@ class SparkProvider(p.ProvisioningPluginBase): } r = remote.get_remote(instance) r.write_files_to(files) + self._push_cleanup_job(r, cluster, extra, instance) if need_update_hadoop: with remote.get_remote(instance) as r: self._write_topology_data(r, cluster, extra) @@ -303,10 +306,22 @@ class SparkProvider(p.ProvisioningPluginBase): def _push_master_configs(self, r, cluster, extra, instance): node_processes = instance.node_group.node_processes - if 'namenode' in node_processes: self._push_namenode_configs(cluster, r) + def _push_cleanup_job(self, r, cluster, extra, instance): + node_processes = instance.node_group.node_processes + if 'master' in node_processes: + if extra['job_cleanup']['valid']: + r.write_file_to('/etc/hadoop/tmp-cleanup.sh', + extra['job_cleanup']['script']) + r.execute_command("chmod 755 /etc/hadoop/tmp-cleanup.sh") + cmd = 'sudo sh -c \'echo "%s" > /etc/cron.d/spark-cleanup\'' + r.execute_command(cmd % extra['job_cleanup']['cron']) + else: + r.execute_command("sudo rm -f /etc/hadoop/tmp-cleanup.sh") + r.execute_command("sudo rm -f /etc/crond.d/spark-cleanup") + def _push_namenode_configs(self, cluster, r): r.write_file_to('/etc/hadoop/dn.incl', utils.generate_fqdn_host_names( diff --git a/sahara/plugins/spark/resources/spark-cleanup.cron b/sahara/plugins/spark/resources/spark-cleanup.cron new file mode 100644 index 00000000..e182a0e2 --- /dev/null +++ b/sahara/plugins/spark/resources/spark-cleanup.cron @@ -0,0 +1,2 @@ +# Cleans up old Spark job directories once per hour. +0 * * * * root /etc/hadoop/tmp-cleanup.sh \ No newline at end of file diff --git a/sahara/plugins/spark/resources/tmp-cleanup.sh.template b/sahara/plugins/spark/resources/tmp-cleanup.sh.template new file mode 100644 index 00000000..e715719b --- /dev/null +++ b/sahara/plugins/spark/resources/tmp-cleanup.sh.template @@ -0,0 +1,48 @@ +#!/bin/sh + +MINIMUM_CLEANUP_MEGABYTES={minimum_cleanup_megabytes} +MINIMUM_CLEANUP_SECONDS={minimum_cleanup_seconds} +MAXIMUM_CLEANUP_SECONDS={maximum_cleanup_seconds} + +CURRENT_TIMESTAMP=`date +%s` +POSSIBLE_CLEANUP_THRESHOLD=$(($CURRENT_TIMESTAMP - $MINIMUM_CLEANUP_SECONDS)) +DEFINITE_CLEANUP_THRESHOLD=$(($CURRENT_TIMESTAMP - $MAXIMUM_CLEANUP_SECONDS)) + +unset MAY_DELETE +unset WILL_DELETE + +if [ ! -d /tmp/spark-edp ] +then + exit 0 +fi + +cd /tmp/spark-edp +for JOB in $(find . -maxdepth 1 -mindepth 1 -type d -printf '%f\n') +do + for EXECUTION in $(find $JOB -maxdepth 1 -mindepth 1 -type d -printf '%f\n') + do + TIMESTAMP=`stat $JOB/$EXECUTION --printf '%Y'` + if [[ $TIMESTAMP -lt $DEFINITE_CLEANUP_THRESHOLD ]] + then + WILL_DELETE="$WILL_DELETE $JOB/$EXECUTION" + else + if [[ $TIMESTAMP -lt $POSSIBLE_CLEANUP_THRESHOLD ]] + then + MAY_DELETE="$MAY_DELETE $JOB/$EXECUTION" + fi + fi + done +done + +for EXECUTION in $WILL_DELETE +do + rm -Rf $EXECUTION +done + +for EXECUTION in $(ls $MAY_DELETE -trd) +do + if [[ `du -s -BM | grep -o '[0-9]\+'` -le $MINIMUM_CLEANUP_MEGABYTES ]]; then + break + fi + rm -Rf $EXECUTION +done diff --git a/sahara/tests/unit/plugins/spark/test_config_helper.py b/sahara/tests/unit/plugins/spark/test_config_helper.py index 25339815..53909dd9 100644 --- a/sahara/tests/unit/plugins/spark/test_config_helper.py +++ b/sahara/tests/unit/plugins/spark/test_config_helper.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import mock + from sahara.plugins.spark import config_helper as c_helper from sahara.tests.unit import base as test_base @@ -23,3 +25,34 @@ class ConfigHelperUtilsTest(test_base.SaharaTestCase): paths = c_helper.make_hadoop_path(storage_paths, '/spam') expected = ['/mnt/one/spam', '/mnt/two/spam'] self.assertEqual(expected, paths) + + @mock.patch('sahara.plugins.spark.config_helper.get_config_value') + def test_cleanup_configs(self, get_config_value): + getter = lambda plugin, key, cluster: plugin_configs[key] + get_config_value.side_effect = getter + plugin_configs = {"Minimum cleanup megabytes": 4096, + "Minimum cleanup seconds": 86400, + "Maximum cleanup seconds": 1209600} + configs = c_helper.generate_job_cleanup_config(None) + self.assertTrue(configs['valid']) + expected = ["MINIMUM_CLEANUP_MEGABYTES=4096", + "MINIMUM_CLEANUP_SECONDS=86400", + "MAXIMUM_CLEANUP_SECONDS=1209600"] + for config_value in expected: + self.assertIn(config_value, configs['script']) + self.assertIn("0 * * * * root /etc/hadoop/tmp-cleanup.sh", + configs['cron'][0]) + + plugin_configs['Maximum cleanup seconds'] = 0 + configs = c_helper.generate_job_cleanup_config(None) + self.assertFalse(configs['valid']) + self.assertNotIn(configs, 'script') + self.assertNotIn(configs, 'cron') + + plugin_configs = {"Minimum cleanup megabytes": 0, + "Minimum cleanup seconds": 0, + "Maximum cleanup seconds": 1209600} + configs = c_helper.generate_job_cleanup_config(None) + self.assertFalse(configs['valid']) + self.assertNotIn(configs, 'script') + self.assertNotIn(configs, 'cron') diff --git a/sahara/tests/unit/plugins/spark/test_plugin.py b/sahara/tests/unit/plugins/spark/test_plugin.py index f0880d11..4e129bfa 100644 --- a/sahara/tests/unit/plugins/spark/test_plugin.py +++ b/sahara/tests/unit/plugins/spark/test_plugin.py @@ -65,3 +65,40 @@ class SparkPluginTest(base.SaharaWithDbTestCase): self.assertIsInstance( plugin.get_edp_engine(cluster, edp.JOB_TYPE_SPARK), engine.SparkJobEngine) + + def test_cleanup_configs(self): + remote = mock.Mock() + instance = mock.Mock() + + extra_conf = {'job_cleanup': { + 'valid': True, + 'script': 'script_text', + 'cron': 'cron_text'}} + instance.node_group.node_processes = ["master"] + instance.node_group.id = id + cluster_dict = { + 'name': 'cluster', + 'plugin_name': 'spark', + 'hadoop_version': '1.0.0', + 'default_image_id': 'image'} + + cluster = conductor.cluster_create(context.ctx(), cluster_dict) + plugin = pb.PLUGINS.get_plugin(cluster.plugin_name) + plugin._push_cleanup_job(remote, cluster, extra_conf, instance) + remote.write_file_to.assert_called_with( + '/etc/hadoop/tmp-cleanup.sh', + 'script_text') + remote.execute_command.assert_called_with( + 'sudo sh -c \'echo "cron_text" > /etc/cron.d/spark-cleanup\'') + + remote.reset_mock() + instance.node_group.node_processes = ["worker"] + plugin._push_cleanup_job(remote, cluster, extra_conf, instance) + self.assertFalse(remote.called) + + remote.reset_mock() + instance.node_group.node_processes = ["master"] + extra_conf['job_cleanup']['valid'] = False + plugin._push_cleanup_job(remote, cluster, extra_conf, instance) + remote.execute_command.assert_called_with( + 'sudo rm -f /etc/crond.d/spark-cleanup')