在 PySpark 环境下如何使用 MLeap 对模型进行序列化
import sys
sys.path.append("/home/devops/software/requirment/mleap/python")
import mleap.pyspark
from mleap.pyspark.spark_support import SimpleSparkSerializer
# Import standard PySpark Transformers and packages
from pyspark.ml.feature import VectorAssembler, StandardScaler, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql import Row
from pyspark.sql import SparkSession
# Demo script: fit a small feature pipeline and serialize it as an MLeap bundle.
# Importing mleap.pyspark / SimpleSparkSerializer (see the imports above) is
# what attaches serializeToBundle() to fitted Spark transformers.

# Create (or reuse) the Spark session and expose its SparkContext.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Two-row example dataset: a string column and an integer column.
rows = [("Alice", 1), ("Bob", 2)]
df = spark.createDataFrame(rows, ["name", "age"])
df.show()

# Stage 1: map each distinct `name` string to a numeric index.
string_indexer = StringIndexer(inputCol='name', outputCol='name_string_index')
# Stage 2: pack the indexed column into a single `features` vector column.
feature_assembler = VectorAssembler(
    inputCols=[string_indexer.getOutputCol()],
    outputCol="features",
)

feature_pipeline = [string_indexer, feature_assembler]
featurePipeline = Pipeline(stages=feature_pipeline)
fittedPipeline = featurePipeline.fit(df)

# Serialize the fitted pipeline to an MLeap bundle. MLeap takes a transformed
# DataFrame alongside the model so it can record the output schema.
# NOTE(review): the target path is an example location -- confirm where the
# bundle should be written, and that no file already exists at that path.
fittedPipeline.serializeToBundle(
    "jar:file:/tmp/pyspark.example.zip",
    fittedPipeline.transform(df),
)
任务提交脚本
spark-submit --packages ml.combust.mleap:mleap-spark_2.11:0.13.0 mleap_test.py
网友评论