from pyspark.sql import SparkSession
from pyspark.ml.feature import FeatureHasher
# Build (or reuse) a local SparkSession for this example.
spark = SparkSession \
    .builder \
    .appName("FeatureHasherExample") \
    .getOrCreate()

# Sample dataset with one numeric, one boolean, and two string columns.
dataset = spark.createDataFrame([
    (2.2, True, "1", "foo"),
    (3.3, False, "2", "bar"),
    (4.4, False, "3", "baz"),
    (5.5, False, "4", "foo"),
], ["real", "bool", "stringNum", "string"])

# FeatureHasher projects all input columns into a single sparse feature
# vector of `numFeatures` hash buckets (2**18 = 262144, the default).
# Numeric columns contribute their value at the hashed index; boolean and
# string columns are hashed as categorical (1.0 at the hashed index).
# `categoricalCols` is omitted — its default (None) already lets the
# hasher infer categorical columns from the column types.
hasher = FeatureHasher(
    numFeatures=262144,
    inputCols=["real", "bool", "stringNum", "string"],
    outputCol="features",
)

featurized = hasher.transform(dataset)

# Each row is encoded as a SparseVector of length 262144, e.g. the first row
# (262144,[174475,247670,257907,262126],[2.2,1.0,1.0,1.0]) holds the listed
# values at four hashed indices and 0.0 everywhere else.
featurized.show(truncate=False)

# Release the session's cluster resources when the example finishes.
spark.stop()