#!/usr/bin/python
# -*- coding: UTF-8 -*-
import codecs
import os, pprint
import os
import random, readJSON
import os, re
import jieba
import re
# Input locations: the text to process and two pools of candidate lines.
path = '/Users/lilong/Desktop/1.txt'
path_cp1 = '/Users/lilong/Desktop/cp1.txt'
path_cp2 = '/Users/lilong/Desktop/cp2.txt'

# Read each file completely into a list of lines (line endings are kept).
# The context managers close the handles as soon as the lines are loaded;
# the names f / op_cp1 / op_cp2 stay bound afterwards, but to closed files.
with open(path, encoding='utf-8') as f:
    pool_op_cp = list(f)
with open(path_cp1, encoding='utf-8') as op_cp1:
    pool_op_cp1 = list(op_cp1)
with open(path_cp2, encoding='utf-8') as op_cp2:
    pool_op_cp2 = list(op_cp2)
# content = ''
def stopwordslist(path='/Users/lilong/Desktop/stop_words'):
    """Return the stop words stored in a UTF-8 text file, one per line.

    Args:
        path: Location of the stop-word file. Defaults to the path this
            script has always read, so calls without arguments are unchanged.

    Returns:
        A list with one entry per line of the file, in file order, with
        leading and trailing whitespace removed. Blank lines produce
        empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed; iterating the
    # file directly avoids building an intermediate readlines() list.
    with open(path, encoding='UTF-8') as stop_file:
        return [line.strip() for line in stop_file]
# Load the stop words once: the file does not change between input lines,
# and a set gives constant-time membership tests for every token.
stopwords = set(stopwordslist())

# Maximal runs of characters in the range U+4E00..U+9FA5; anything else in
# a line (punctuation, Latin letters, digits, whitespace) is discarded.
chinese_runs = re.compile(u"[\u4e00-\u9fa5]+")

txt = '/Users/lilong/Desktop/z.txt'

for line in pool_op_cp:
    # Keep only the Chinese characters of the line, joined into one string.
    chinese_text = ''.join(chinese_runs.findall(line))

    # Segment into words and drop the stop words. The segmented text holds
    # nothing but Chinese characters, so no whitespace or tab filtering is
    # needed when the surviving words are glued back together.
    tokens = jieba.lcut(chinese_text)
    outstr = ''.join(token for token in tokens if token not in stopwords)

    # Pick one random line from each candidate pool. random.choice raises
    # IndexError on an empty pool, like indexing [0] would, but unlike
    # shuffling it leaves the pool order untouched.
    con_cp1 = random.choice(pool_op_cp1)
    con_cp2 = random.choice(pool_op_cp2)

    # Pool lines still carry their line endings; strip them so the result
    # is a single line.
    combined = (outstr + con_cp1 + con_cp2).replace("\n", "")
    print(combined)

    # NOTE(review): mode 'w' truncates, so every input line overwrites the
    # output file and only the last line's result survives. Switch to
    # append mode if every result should be kept -- confirm the intent.
    with open(txt, 'w', encoding='utf-8') as out_file:
        out_file.write(combined)
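# Trailing text from the web page this script was copied from ("reader comments"); not part of the program.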