Add split dataset method

In machine learning, users generally split a dataset into two parts:
one for building a prediction model,
the other for evaluating that model.
Add a split-dataset command to make this possible.

Implements: blueprint split-dataset

Change-Id: Idb323a1240d790d5628f9eb31cba8d3a29ad64e8
Hiroyuki Eguchi 2017-02-08 16:37:37 +09:00
parent a197225848
commit 8009742c0f
5 changed files with 70 additions and 4 deletions
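For context, the split in question is the usual train/test partition; a minimal sketch of the idea in plain Python follows (illustrative only, not part of this change):

# Illustrative sketch of a 70/30 train/test split; not part of this commit.
import random

records = list(range(100))        # stand-in for a dataset
random.shuffle(records)

cut = int(len(records) * 0.7)     # 70% for training
train, test = records[:cut], records[cut:]
print(len(train), len(test))      # -> 70 30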


@@ -128,6 +128,12 @@ class DatasetController(wsgi.Controller, wsgi.AdminActionsMixin):
        swift_tenant = dataset.get('swift_tenant')
        swift_username = dataset.get('swift_username')
        swift_password = dataset.get('swift_password')
        percent_train = dataset.get('percent_train', '0.7')
        percent_test = dataset.get('percent_test', '0.3')

        if (method == 'split'
                and not float(percent_train) + float(percent_test) == 1.0):
            raise exc.HTTPUnprocessableEntity()

        new_dataset = self.engine_api.create_dataset(context,
                                                     display_name,
@@ -141,7 +147,9 @@ class DatasetController(wsgi.Controller, wsgi.AdminActionsMixin):
                                                     experiment.cluster_id,
                                                     swift_tenant,
                                                     swift_username,
                                                     swift_password,
                                                     percent_train,
                                                     percent_test)

        return self._view_builder.detail(req, new_dataset)
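A note on the percentage check above: float(percent_train) + float(percent_test) == 1.0 is an exact floating-point comparison, and with the defaults 0.7 and 0.3 the sum evaluates to 0.9999999999999999 in IEEE 754 doubles, so the comparison is False and the request would be rejected. A tolerance-based check avoids this; the helper below is a sketch, not part of the commit:

# Sketch only: a tolerance-based alternative to the exact equality above.
def percentages_valid(percent_train, percent_test, tol=1e-9):
    """Return True if the two ratios sum to 1.0 within a small tolerance."""
    return abs(float(percent_train) + float(percent_test) - 1.0) <= tol

assert percentages_valid('0.7', '0.3')      # passes despite binary rounding
assert not percentages_valid('0.7', '0.4')  # genuinely invalid split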


@@ -521,6 +521,24 @@ class MeteosSparkController(object):
        exec('self.data = self.data' + cmd)
        self.save_data()

    def split_dataset(self):

        self.load_data()

        percent_train = self.job_args['dataset']['percent_train']
        percent_test = self.job_args['dataset']['percent_test']

        # Split the data into training and test sets
        (trainData, testData) = self.data.randomSplit([float(percent_train),
                                                       float(percent_test)])

        self.data = trainData
        self.save_data()

        # save testData
        testData.collect()
        datapath = 'data-' + self.job_args['dataset']['test_dataset']['id']
        testData.saveAsTextFile(datapath)

    def create_model(self):

        self.load_data()
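For reference, randomSplit divides an RDD according to the supplied weights. A minimal standalone sketch, assuming a local PySpark environment (the setup here is illustrative, not part of the Meteos job script):

# Minimal sketch of RDD.randomSplit; assumes a local PySpark install.
from pyspark import SparkContext

sc = SparkContext('local[1]', 'split-sketch')
rdd = sc.parallelize(range(1000))

# Weights are normalized by Spark, so [0.7, 0.3] gives roughly a 70/30 split.
train_rdd, test_rdd = rdd.randomSplit([0.7, 0.3], seed=42)
print(train_rdd.count(), test_rdd.count())   # roughly 700 / 300

sc.stop()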


@@ -20,6 +20,7 @@
Handles all requests relating to learnings.
"""

import copy
from oslo_log import log
from oslo_utils import excutils
import six
@@ -230,7 +231,8 @@ class API(base.Base):
    def create_dataset(self, context, name, description, method,
                       source_dataset_url, params, template_id,
                       job_template_id, experiment_id, cluster_id,
                       swift_tenant, swift_username, swift_password,
                       percent_train, percent_test):
        """Create a Dataset"""
        policy.check_policy(context, 'dataset', 'create')
@@ -247,6 +249,24 @@
            'cluster_id': cluster_id
        }

        test_dataset = {}

        if method == 'split':
            train_data_name = name + '_train_' + percent_train
            test_data_name = name + '_test_' + percent_test

            tmp_dataset = copy.deepcopy(dataset)

            dataset['display_name'] = train_data_name
            tmp_dataset['display_name'] = test_data_name

            try:
                test_dataset = self.db.dataset_create(context, tmp_dataset)
            except Exception:
                with excutils.save_and_reraise_exception():
                    self.db.dataset_delete(context, test_dataset['id'])

        try:
            result = self.db.dataset_create(context, dataset)
            result['template_id'] = template_id
@@ -254,11 +274,20 @@
            result['swift_tenant'] = swift_tenant
            result['swift_username'] = swift_username
            result['swift_password'] = swift_password
            result['test_dataset'] = test_dataset
            result['percent_train'] = percent_train
            result['percent_test'] = percent_test

            self.engine_rpcapi.create_dataset(context, result)

            updates = {'status': constants.STATUS_CREATING}

            LOG.info(_LI("Accepted parsing of dataset %s."), result['id'])
            self.db.dataset_update(context, result['id'], updates)

            if result['test_dataset']:
                self.db.dataset_update(context,
                                       result['test_dataset']['id'],
                                       updates)

        except Exception:
            with excutils.save_and_reraise_exception():
                self.db.dataset_delete(context, result['id'])
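The naming above derives two child records from the requested dataset name; for example (the name and percentages are made-up values):

# Illustrative values only: how the display names above are derived.
name = 'sales2016'
percent_train, percent_test = '0.7', '0.3'

train_data_name = name + '_train_' + percent_train   # 'sales2016_train_0.7'
test_data_name = name + '_test_' + percent_test      # 'sales2016_test_0.3'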


@@ -162,7 +162,10 @@ class GenericLearningDriver(driver.LearningDriver):
        job_args['source_dataset_url'] = request_specs\
            .get('source_dataset_url')
        job_args['dataset_format'] = request_specs.get('dataset_format')
        dataset_args = {'params': request_specs.get('params'),
                        'test_dataset': request_specs.get('test_dataset'),
                        'percent_train': request_specs.get('percent_train'),
                        'percent_test': request_specs.get('percent_test')}
        job_args['dataset'] = dataset_args

        # Set parameters of Swift
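With the additions above, the dataset portion of job_args handed to the Spark script would look roughly like the following; the id and percentages are hypothetical example values (the job script only requires that test_dataset carry an 'id' key):

# Hypothetical shape of job_args['dataset'] after this change;
# the id value is a made-up example.
dataset_args = {
    'params': None,
    'test_dataset': {'id': 'example-test-dataset-id'},
    'percent_train': '0.7',
    'percent_test': '0.3',
}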


@@ -227,6 +227,14 @@ class LearningManager(manager.Manager):
        self._update_status(context, 'DataSet', request_spec['id'],
                            job_id, stdout, stderr)

        if request_spec['test_dataset']:
            self._update_status(context,
                                'DataSet',
                                request_spec['test_dataset']['id'],
                                job_id,
                                None,
                                stderr)

    def delete_dataset(self, context, cluster_id=None, job_id=None, id=None):
        """Deletes a Dataset."""
        context = context.elevated()