Taking logistic regression as an example
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("EstimatorTransformerParamExample") \
    .getOrCreate()
training = spark.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])
# threshold: decision threshold used at prediction time; for binary classification the default is 0.5.
#   thresholds (plural) is the multiclass counterpart.
# family: distribution of the label, default "auto"; the options are "binomial" and "multinomial".
lr = LogisticRegression(featuresCol='features', labelCol='label', predictionCol='prediction',
                        probabilityCol='probability', maxIter=10, regParam=0.01,
                        elasticNetParam=0, tol=1e-6, standardization=True, fitIntercept=True)
# Explain the model parameters: this method is handy because the LogisticRegression call above carries no parameter documentation
print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
model1 = lr.fit(training)
print(model1.extractParamMap())  # Extract the param map
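# Quick sanity check (also an addition, not in the original example): the fitted
# binary model exposes its learned coefficients and intercept directly.
print("Coefficients: %s, intercept: %s" % (model1.coefficients, model1.intercept))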
# Build a custom param map
paramMap = {lr.maxIter: 20}
paramMap[lr.maxIter] = 30 # Specify 1 Param, overwriting the original maxIter.
paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55}) # Specify multiple Params.
# Merge param maps
paramMap2 = {lr.probabilityCol: "myProbability"} # Change output column name
paramMapCombined = paramMap.copy()
paramMapCombined.update(paramMap2)
model2 = lr.fit(training, paramMapCombined)  # Fit using the custom params
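# Sketch added here (not in the original): extracting model2's param map should
# show the overrides (maxIter=30, regParam=0.1, threshold=0.55,
# probabilityCol="myProbability") merged over the defaults.
print(model2.extractParamMap())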
# Prepare test data
test = spark.createDataFrame([
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1])),
    (1.0, Vectors.dense([0.0, 2.2, -1.5]))], ["label", "features"])
prediction = model2.transform(test)  # Predict on the test set
result = prediction.select("features", "label", "myProbability", "prediction") \
    .collect()
for row in result:
    print("features=%s, label=%s -> prob=%s, prediction=%s"
          % (row.features, row.label, row.myProbability, row.prediction))
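# A possible follow-up (not part of the original walkthrough): score the test
# predictions with BinaryClassificationEvaluator, which by default reads the
# "rawPrediction" column that LogisticRegression also emits.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
print("Test AUC = %g" % evaluator.evaluate(prediction))
spark.stop()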