Add split dataset method
In machine learning, users generally split a dataset into two parts: one for building a prediction model and the other for evaluating it. Add a split-dataset command to make this possible. implements blueprint split-dataset Change-Id: Idb323a1240d790d5628f9eb31cba8d3a29ad64e8
This commit is contained in:
parent
a197225848
commit
8009742c0f
|
@ -128,6 +128,12 @@ class DatasetController(wsgi.Controller, wsgi.AdminActionsMixin):
|
|||
swift_tenant = dataset.get('swift_tenant')
|
||||
swift_username = dataset.get('swift_username')
|
||||
swift_password = dataset.get('swift_password')
|
||||
percent_train = dataset.get('percent_train', '0.7')
|
||||
percent_test = dataset.get('percent_test', '0.3')
|
||||
|
||||
if (method == 'split'
|
||||
and not float(percent_train) + float(percent_test) == 1.0):
|
||||
raise exc.HTTPUnprocessableEntity()
|
||||
|
||||
new_dataset = self.engine_api.create_dataset(context,
|
||||
display_name,
|
||||
|
@ -141,7 +147,9 @@ class DatasetController(wsgi.Controller, wsgi.AdminActionsMixin):
|
|||
experiment.cluster_id,
|
||||
swift_tenant,
|
||||
swift_username,
|
||||
swift_password)
|
||||
swift_password,
|
||||
percent_train,
|
||||
percent_test)
|
||||
|
||||
return self._view_builder.detail(req, new_dataset)
|
||||
|
||||
|
|
|
@ -521,6 +521,24 @@ class MeteosSparkController(object):
|
|||
exec('self.data = self.data' + cmd)
|
||||
self.save_data()
|
||||
|
||||
def split_dataset(self):
    """Split the loaded dataset into a training set and a test set.

    Reads the 'percent_train' and 'percent_test' ratios from the job
    arguments, randomly splits ``self.data`` accordingly, keeps the
    training portion as ``self.data`` (persisted via ``save_data``),
    and writes the test portion out as a text file named after the
    test dataset's id.
    """
    self.load_data()

    percent_train = self.job_args['dataset']['percent_train']
    percent_test = self.job_args['dataset']['percent_test']

    # Split the data into training and test sets.
    # NOTE(review): the two ratios are assumed to sum to 1.0 -- the
    # API layer validates this before the job is submitted; confirm
    # no other entry point reaches here unvalidated.
    train_data, test_data = self.data.randomSplit(
        [float(percent_train), float(percent_test)])

    self.data = train_data
    self.save_data()

    # Persist the test split. saveAsTextFile() is a Spark action and
    # triggers the computation itself; the original testData.collect()
    # call pulled the entire split onto the driver only to discard it,
    # so it has been removed as wasted work.
    datapath = 'data-' + self.job_args['dataset']['test_dataset']['id']
    test_data.saveAsTextFile(datapath)
|
||||
|
||||
def create_model(self):
|
||||
|
||||
self.load_data()
|
||||
|
|
|
@ -20,6 +20,7 @@
|
|||
Handles all requests relating to learnings.
|
||||
"""
|
||||
|
||||
import copy
|
||||
from oslo_log import log
|
||||
from oslo_utils import excutils
|
||||
import six
|
||||
|
@ -230,7 +231,8 @@ class API(base.Base):
|
|||
def create_dataset(self, context, name, description, method,
|
||||
source_dataset_url, params, template_id,
|
||||
job_template_id, experiment_id, cluster_id,
|
||||
swift_tenant, swift_username, swift_password):
|
||||
swift_tenant, swift_username, swift_password,
|
||||
percent_train, percent_test):
|
||||
"""Create a Dataset"""
|
||||
policy.check_policy(context, 'dataset', 'create')
|
||||
|
||||
|
@ -247,6 +249,24 @@ class API(base.Base):
|
|||
'cluster_id': cluster_id
|
||||
}
|
||||
|
||||
test_dataset = {}
|
||||
|
||||
if method == 'split':
|
||||
train_data_name = name + '_train_' + percent_train
|
||||
test_data_name = name + '_test_' + percent_test
|
||||
|
||||
tmp_dataset = copy.deepcopy(dataset)
|
||||
|
||||
dataset['display_name'] = train_data_name
|
||||
|
||||
tmp_dataset['display_name'] = test_data_name
|
||||
|
||||
try:
|
||||
test_dataset = self.db.dataset_create(context, tmp_dataset)
|
||||
except Exception:
|
||||
with excutils.save_and_reraise_exception():
|
||||
self.db.dataset_delete(context, test_dataset['id'])
|
||||
|
||||
try:
|
||||
result = self.db.dataset_create(context, dataset)
|
||||
result['template_id'] = template_id
|
||||
|
@ -254,11 +274,20 @@ class API(base.Base):
|
|||
result['swift_tenant'] = swift_tenant
|
||||
result['swift_username'] = swift_username
|
||||
result['swift_password'] = swift_password
|
||||
result['test_dataset'] = test_dataset
|
||||
result['percent_train'] = percent_train
|
||||
result['percent_test'] = percent_test
|
||||
|
||||
self.engine_rpcapi.create_dataset(context, result)
|
||||
updates = {'status': constants.STATUS_CREATING}
|
||||
|
||||
LOG.info(_LI("Accepted parsing of dataset %s."), result['id'])
|
||||
self.db.dataset_update(context, result['id'], updates)
|
||||
|
||||
if result['test_dataset']:
|
||||
self.db.dataset_update(context,
|
||||
result['test_dataset']['id'],
|
||||
updates)
|
||||
|
||||
except Exception:
|
||||
with excutils.save_and_reraise_exception():
|
||||
self.db.dataset_delete(context, result['id'])
|
||||
|
|
|
@ -162,7 +162,10 @@ class GenericLearningDriver(driver.LearningDriver):
|
|||
job_args['source_dataset_url'] = request_specs\
|
||||
.get('source_dataset_url')
|
||||
job_args['dataset_format'] = request_specs.get('dataset_format')
|
||||
dataset_args = {'params': request_specs.get('params')}
|
||||
dataset_args = {'params': request_specs.get('params'),
|
||||
'test_dataset': request_specs.get('test_dataset'),
|
||||
'percent_train': request_specs.get('percent_train'),
|
||||
'percent_test': request_specs.get('percent_test')}
|
||||
job_args['dataset'] = dataset_args
|
||||
|
||||
# Set parameters of Swift
|
||||
|
|
|
@ -227,6 +227,14 @@ class LearningManager(manager.Manager):
|
|||
self._update_status(context, 'DataSet', request_spec['id'],
|
||||
job_id, stdout, stderr)
|
||||
|
||||
if request_spec['test_dataset']:
|
||||
self._update_status(context,
|
||||
'DataSet',
|
||||
request_spec['test_dataset']['id'],
|
||||
job_id,
|
||||
None,
|
||||
stderr)
|
||||
|
||||
def delete_dataset(self, context, cluster_id=None, job_id=None, id=None):
|
||||
"""Deletes a Dataset."""
|
||||
context = context.elevated()
|
||||
|
|
Loading…
Reference in New Issue