from pyspark.sql import SparkSession
from pyspark.ml.feature import Binarizer
# Example script: binarize a continuous feature column with pyspark.ml's
# Binarizer and print the resulting DataFrame.
spark = (
    SparkSession.builder
    .appName("BinarizerExample")
    .getOrCreate()
)
try:
    # Small in-memory input: one (id, feature) pair per row.
    continuousDataFrame = spark.createDataFrame(
        [
            (0, 0.5),
            (1, 0.8),
            (2, 0.2),
        ],
        ["id", "feature"],
    )

    # threshold: values greater than the threshold are set to 1,
    # values less than or equal to the threshold are set to 0.
    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
    binarizedDataFrame = binarizer.transform(continuousDataFrame)

    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
    binarizedDataFrame.show()
finally:
    # Release the session even if one of the steps above raises.
    spark.stop()
# Reader comments (page heading left over from the source article; not part of the example)