from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
text1 = "学习keras的Tokenizer"
text2 = "就是这么的简单"
texts = [text1, text2]
# num_words: the maximum number of words to keep in the vocabulary
# char_level: if True, every character is treated as a token
# oov_token: an out-of-vocabulary token used to replace words missing from the vocabulary
tokenizer = Tokenizer(num_words=5000, char_level=True, oov_token='UNK')
tokenizer.fit_on_texts(texts)
# how many times each word appeared across the training texts
print(tokenizer.word_counts)
# in how many documents each word appeared
print(tokenizer.word_docs)
# the total number of documents (texts) the tokenizer was fitted on
print(tokenizer.document_count)
# the word-to-index dictionary mapping
print(tokenizer.word_index)
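# A small sketch (not in the original): invert word_index to recover tokens
# from integer indices, which is handy for inspecting model inputs later.
index_word = {i: w for w, i in tokenizer.word_index.items()}
print(index_word)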
# mode: one of 'binary', 'count', 'tfidf', 'freq'; defaults to 'binary'
# returns a numpy array of shape (len(texts), num_words)
print(tokenizer.texts_to_matrix(texts))
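# Illustrative only: the other modes produce the same matrix shape, with
# 'count' giving raw term frequencies and 'tfidf' applying tf-idf weighting.
print(tokenizer.texts_to_matrix(texts, mode='count'))
print(tokenizer.texts_to_matrix(texts, mode='tfidf'))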
# a list of integer sequences, one per text
print(tokenizer.texts_to_sequences(texts))
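# A hedged example (hypothetical input string): because oov_token='UNK' was
# set above, characters never seen during fit_on_texts map to the index of
# 'UNK' instead of being silently dropped; seen characters keep their indices.
print(tokenizer.texts_to_sequences(["没见过的字"]))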
texts = tokenizer.texts_to_sequences(texts)
# pad or truncate every sequence to length 30, adding zeros / cutting at the end
texts = sequence.pad_sequences(texts, maxlen=30, padding='post', truncating='post')
print(texts)
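# Sketch of the reverse direction: sequences_to_texts maps the padded integer
# sequences back to tokens (padding zeros have no dictionary entry and are
# skipped; with char_level=True the recovered characters are space-joined).
print(tokenizer.sequences_to_texts(texts))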