from pyspark.ml.feature import Word2Vec
# $example off$
from pyspark.sql import SparkSession
# Word2Vec walkthrough: build a tiny corpus, train a Word2Vec model on it,
# turn every document into a vector, then inspect the learned word vectors
# and the nearest neighbours of one word.

# Build the SparkSession for this example, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Word2VecExample")
    .getOrCreate()
)

# Input data: each row is one document represented as a list of words.
# Every row is a 1-tuple (note the trailing comma) because the DataFrame has
# a single column; `schema` may be given simply as a list of column names.
documentDF = spark.createDataFrame(
    [
        ("Hi I heard about Spark".split(" "),),
        ("I wish Java could use case classes".split(" "),),
        ("Logistic regression models are neat".split(" "),),
    ],
    schema=["text"],
)

# Learn a mapping from words to vectors.
#   vectorSize -- length of the vector every word is mapped to.
#   minCount   -- a word must occur at least this many times to be included
#                 in the model's vocabulary (0 keeps every word).
#   inputCol / outputCol -- column read from / column written by transform().
# The remaining arguments are written out explicitly; per the original
# author's note they are the library defaults.
word2Vec = Word2Vec(
    vectorSize=3,
    minCount=0,
    inputCol="text",
    outputCol="result",
    maxIter=1,
    numPartitions=1,
    stepSize=0.025,
    windowSize=5,
    maxSentenceLength=1000,
)
model = word2Vec.fit(documentDF)

# Append the "result" column holding one vector per document.
result = model.transform(documentDF)
for row in result.collect():
    # Rows carry exactly two fields here: the input words and the vector.
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

# Show the learned vector of (up to) 50 vocabulary words. Comparing these
# with the output above shows that a document vector is NOT the sum of its
# word vectors; Spark's transform() uses the average of the word vectors.
# The rows are printed explicitly so they are visible when this runs as a
# script rather than in an interactive shell.
for word_row in model.getVectors().head(50):
    print(word_row)

# The 2 words whose vectors are closest to that of "I", returned as
# (word, similarity) pairs; printed for the same reason as above.
print(model.findSynonymsArray("I", 2))
# Reader comments (web-page footer captured along with the example; not part of the script)