Support FP-Growth Model

spark.mllib provides a parallel implementation
of FP-growth, which is a popular algorithm
for mining frequent itemsets.

So, Meteos should support this model.

This model allows users to find
frequent word sets in text data.

implements blueprint support-fp-growth

Change-Id: If44578acf586ed7a68afc1bd1a7cceaa1c036bd7
This commit is contained in:
Hiroyuki Eguchi 2016-12-30 14:22:58 +09:00
parent 4f3503c42c
commit 2581b6d46f
1 changed files with 31 additions and 1 deletions

View File

@ -45,6 +45,8 @@ from pyspark.mllib.classification import LogisticRegressionModel
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.mllib.feature import Word2Vec
from pyspark.mllib.feature import Word2VecModel
from pyspark.mllib.fpm import FPGrowth
from pyspark.mllib.fpm import FPGrowthModel
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD
@ -283,6 +285,31 @@ class Word2VecModelController(ModelController):
print("{}: {}".format(word, cosine_distance))
class FPGrowthModelController(ModelController):
    """Controller that mines frequent itemsets with MLlib's FP-growth."""

    def __init__(self):
        super(FPGrowthModelController, self).__init__()

    def create_model(self, data, params):
        """Train an FP-growth model and print its first frequent itemsets.

        Each element of ``data`` is stripped and split on a single space to
        form one transaction, which is then passed to ``FPGrowth.train``.

        :param data: object exposing ``map``; assumed to be an RDD of text
            lines, one transaction per line -- confirm against the caller.
        :param params: dict of optional settings:
            ``minSupport`` (default 0.2) and ``numPartitions`` (default 10)
            are forwarded to ``FPGrowth.train``; ``limits`` (default 10) is
            the maximum number of itemsets to print.
        :returns: None. The trained model is only used to print results and
            is not handed back to the caller.
        """
        min_support = params.get('minSupport', 0.2)
        num_partitions = params.get('numPartitions', 10)
        limits = params.get('limits', 10)

        transactions = data.map(lambda line: line.strip().split(' '))

        model = FPGrowth.train(transactions,
                               minSupport=min_support,
                               numPartitions=num_partitions)

        itemsets = model.freqItemsets()
        if isinstance(limits, int) and limits >= 0:
            # Only the first `limits` itemsets are printed, so avoid pulling
            # the whole (potentially very large) result set to the driver.
            result = itemsets.take(limits)
        else:
            # Unusual `limits` values keep the previous fetch-everything
            # behaviour; the loop below still applies the original cut-off.
            result = itemsets.collect()

        for index, fi in enumerate(result):
            if index == limits:
                break
            print(str(fi.items) + ':' + str(fi.freq))
class MeteosSparkController(object):
def init_context(self):
@ -316,6 +343,8 @@ class MeteosSparkController(object):
self.controller = DecisionTreeModelController()
elif model_type == 'Word2Vec':
self.controller = Word2VecModelController()
elif model_type == 'FPGrowth':
self.controller = FPGrowthModelController()
def save_data(self, collect=True):
@ -370,7 +399,8 @@ class MeteosSparkController(object):
else:
self.model = self.controller.create_model(self.data, list_params)
self.model.save(self.context, self.modelpath)
if self.model:
self.model.save(self.context, self.modelpath)
def download_dataset(self):