常用于TF-IDF算法
from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer

# Build (or reuse) a local SparkSession for this example.
spark = SparkSession\
    .builder\
    .appName("CountVectorizerExample")\
    .getOrCreate()

# Input data: each row is a bag of words with an ID.
df = spark.createDataFrame([
    (0, "a b c".split(" ")),
    (1, "a b b c a".split(" "))
], ["id", "words"])

# Fit a CountVectorizerModel from the corpus.
# minDF: minimum document frequency — a term must appear in at least minDF
#        documents to be included in the vocabulary.
# minTF: minimum term frequency — the minimum number of times a term must
#        appear within a single document to be counted there.
# vocabSize: maximum vocabulary size — only the vocabSize most frequent
#        terms are kept.
cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)
model = cv.fit(df)

# The "features" column holds a sparse vector:
#   (vocabSize, [idx1, idx2, ...], [count1, count2, ...])
# Indices start at 0 and are ordered by descending corpus frequency
# (here 0 -> "a", 1 -> "b", 2 -> "c"); each count is the number of
# occurrences of that term in the current document.
result = model.transform(df)
result.show(truncate=False)

# Fix: release the session's JVM and cluster resources when done —
# the original script never stopped the SparkSession.
spark.stop()
网友评论