First, download the corpus from https://www.sogou.com/labs/resource/ca.php; I used the full 711 MB version. Unpacking it produces a .dat file, which is not UTF-8 encoded and has to be converted first:
cat news_tensite_xml.dat | iconv -f gbk -t utf-8 -c | grep "<content>" > news.txt
The file is large and awkward to process in one piece, so split the big corpus into smaller files under the split_files directory:
#! /usr/bin/env python3
# -*- coding: utf-8 -*-

def split_file(base_dir, input_file, max_lines):
    f = open(base_dir + input_file, "r", encoding="utf-8")
    line_index = 0  # number of lines in the current txt
    txt_num = 0     # number of txt files written so far
    output = open(base_dir + "split_files/" + str(txt_num) + ".txt", "w+", encoding="utf-8")
    for line in f:
        if "</content>" in line and line_index > max_lines:
            # An article just ended and this txt has reached its maximum size:
            # write the last line, close it, open a new txt and reset the counter.
            output.write(line)
            output.close()
            txt_num += 1
            output = open(base_dir + "split_files/" + str(txt_num) + ".txt", "w+", encoding="utf-8")
            line_index = 0
            print(txt_num)
        else:
            # Keep writing into the current txt (lines already end with a newline).
            output.write(line)
            line_index += 1
    output.close()
    f.close()

split_file("data/sougou_corpus/", "news.txt", 100000)
Next, extract the text inside the <content> tags, segment it with jieba, and write the results into the cut directory:
import os
import re

import jieba

rootdir = 'data/sougou_corpus/split_files'
files = os.listdir(rootdir)
pattern1 = r'<content>(.*?)</content>'
for x in range(0, len(files)):
    path = os.path.join(rootdir, files[x])  # full path of each file in the directory
    if os.path.isfile(path) and files[x].endswith(".txt"):
        print(path)
        output = open("data/sougou_corpus/cut/" + files[x], "w+", encoding="utf-8")
        content = open(path, 'r', encoding="utf-8").read()
        texts = re.findall(pattern1, content, re.S)
        for text in texts:
            if text:
                line = ' '.join(jieba.lcut(text))  # segment with jieba, join tokens with spaces
                output.write(line + "\n")
        output.close()
Train word vectors on one of the files and save the model:
# coding=utf-8
from gensim.models import word2vec

sentences = word2vec.Text8Corpus("data/sougou_corpus/cut/12.txt")  # load the corpus
model = word2vec.Word2Vec(sentences, size=200)  # default window=5; size= is named vector_size= in gensim >= 4.0
model.save('data/sougou_corpus/models/word2vec.model')
Check the results:
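The original screenshot of the query output is not reproduced here. As a rough sketch of how the trained vectors can be inspected (the query words below are just illustrative choices, not from the original post):

from gensim.models import word2vec

model = word2vec.Word2Vec.load('data/sougou_corpus/models/word2vec.model')

# Words most similar to the query word, ranked by cosine similarity.
for word, similarity in model.wv.most_similar("中国", topn=10):
    print(word, similarity)

# Cosine similarity between a specific pair of words.
print(model.wv.similarity("男人", "女人"))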
Not bad, right?
You can continue training the model on the other text files:
from gensim.models import word2vec

def word2vec_train(input_file):
    model = word2vec.Word2Vec.load('data/sougou_corpus/models/word2vec.model')
    sentences = word2vec.Text8Corpus(input_file)  # load the corpus
    # Add any new words from this corpus to the vocabulary before continuing training.
    model.build_vocab(sentences, update=True)
    model.train(sentences, epochs=model.iter, total_examples=model.corpus_count)  # model.iter is model.epochs in gensim >= 4.0
    model.save('data/sougou_corpus/models/word2vec.model')
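For example, to continue training on every segmented file under the cut directory, something along these lines could work (skipping 12.txt, which was already used for the initial model, simply mirrors the earlier step):

import os

cut_dir = "data/sougou_corpus/cut"
for name in sorted(os.listdir(cut_dir)):
    if name.endswith(".txt") and name != "12.txt":  # 12.txt was already trained on above
        print("training on", name)
        word2vec_train(os.path.join(cut_dir, name))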