#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
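"""Batch-generate pseudo-original article pages for a DedeCMS site.

The script reads four local corpora (exact keywords, title suffixes, title
closing sentences, image URLs), assembles one title per keyword line, pulls
random body paragraphs from MySQL, and writes each article out as a .txt file.
"""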
import os
import re
import random
import codecs

import jieba
import pymysql
import mysql.connector

from dtl_nlp import max_length_words  # local helper module (see the sketch below)
path_url = '/Users/lilong/Desktop/likeshuo/url.txt'  # image URLs
op_url = open(path_url, encoding='utf-8')
path_cp1 = '/Users/lilong/Desktop/likeshuo/corpus_one.txt'  # title suffixes (questions / appeals)
op_cp1 = open(path_cp1, encoding='utf-8')
path_cp2 = '/Users/lilong/Desktop/likeshuo/corpus_two.txt'  # title closing sentences (value statements)
op_cp2 = open(path_cp2, encoding='utf-8')
path = '/Users/lilong/Desktop/likeshuo/Keywords_provided.txt'  # the supplied exact-match keywords
f = open(path, encoding='utf-8')
nbsp = " "
pool_op_cp = list(f)
pool_op_cp1 = list(op_cp1)
pool_op_cp2 = list(op_cp2)
pool_op_url = list(op_url)
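# Each pool holds the raw corpus lines; one element is drawn at random per keyword.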
def stopwordslist():
    # Stop-word list: one word per line in the file.
    with open('/Users/lilong/Desktop/likeshuo/stop_words', encoding='utf-8') as sw:
        return [line.strip() for line in sw]
all_list_all = []      # assembled titles, one per keyword line
all_list_con_all = []  # image URL drawn for each title
for line in pool_op_cp:
    # Keep only the CJK characters of the keyword line, then segment with jieba.
    cleaned_data = re.findall(u"[\u4e00-\u9fa5]+", line)
    r = ''.join(cleaned_data)
    a = jieba.lcut(r)
    stopwords = stopwordslist()
    # Rebuild the keyword into outstr with stop words removed.
    outstr = ''
    for word in a:
        if word not in stopwords and word != '\t':
            outstr += word
    # Draw one random title suffix, closing sentence, and image URL.
    con_cp1 = random.choice(pool_op_cp1)
    con_cp2 = random.choice(pool_op_cp2)
    con_url = random.choice(pool_op_url)
    # Title = cleaned keyword + suffix + closing sentence, with newlines stripped.
    title = (outstr + con_cp1 + con_cp2).replace("\n", "")
    all_list_all.append(title)
    all_list_con_all.append(con_url.strip())
The_original_title = all_list_all  # list of assembled titles
The_url = all_list_con_all         # list of image URLs, one per title
Key_word = []
dict_title = {}
num_title = 0
for i in The_original_title:
    num_title += 1
    dict_title[num_title] = i
    ll = max_length_words(i)  # core keyword extracted from the title
    Key_word.append(ll)
keyword_all = Key_word  # keyword list
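# dtl_nlp is a local module whose source is not shown here. A minimal sketch of
# what max_length_words is assumed to do (return the longest jieba token of a
# title); this is an assumption, not the module's actual implementation:
#
#     def max_length_words(text):
#         return max(jieba.lcut(text), key=len, default='')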
dir_name = input("Name of the folder to generate: ")
td = input("Column number (1-4): ")
dict1 = {"1": 'http://www.zxyyedu.com/peixun/', "2": 'http://www.zxyyedu.com/jigou/',
         "3": 'http://www.zxyyedu.com/feiyong/', "4": 'http://www.zxyyedu.com/baike/'}
td_value = dict1[td]
all_link = '/Users/lilong/Desktop/in/{}'.format(dir_name)
os.makedirs(all_link, exist_ok=True)
config = {'host': '127.0.0.1',
          'user': 'root',
          'password': 'yz1028959',
          'port': 3306,
          'database': 'data_likeshuo',
          'charset': 'utf8'}
cnn = mysql.connector.connect(**config)
cursor = cnn.cursor(buffered=True)
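# Note: two MySQL drivers are in play -- mysql.connector for the corpus
# database above and pymysql for the DedeCMS database below; either driver
# could serve both roles.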
# Count the articles already in the dede database.
link = pymysql.Connect(host='localhost', port=3306, user='root', password='yz1028959', db='dede', charset='utf8')
cursorde = link.cursor()
cursorde.execute('select count(1) from dede_addonarticle')
rand_x = cursorde.fetchall()[0][0]
cursorde.close()
link.close()
content_page_num = len(The_original_title)
num_count = rand_x + content_page_num  # existing dede articles + articles about to be generated
# Pair each keyword with the article id it will receive in dede.
Num_Keyword = rand_x
Link_Keyword = {}  # {article id: keyword}
for Keyword_one in keyword_all:
    Num_Keyword += 1
    Link_Keyword[Num_Keyword] = Keyword_one
dict2 = {"1":'peixun',"2":'jigou',"3":'feiyong',"4":'baike'}
td_vt = dict2['%s' % td]
all_one_middle = []
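# Main generation loop: build one HTML-ish article per (keyword, title) pair
# and write it to its own .txt file under all_link.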
for keyword_one, j in zip(keyword_all, The_original_title):
    n = random.randint(3, 5)  # number of body paragraphs to pull
    data = (keyword_one, n)
    # Collect every article id mapped to this keyword and pick one at random
    # to serve as the "related article" link.
    keyList = [key for key, value in Link_Keyword.items() if value == keyword_one]
    Num_link = random.choice(keyList)
    num_title = Num_link - rand_x
    r_title = dict_title[num_title]  # title of the linked article
    p_lable = "<p>"
    p_lable_last = "</p>"
    # "相似文章推荐" = "related article suggestion".
    # X_Link = '相似文章推荐:<body><a href="http://www.zxyyedu.com/{}/" target="_blank">{}</a></body>'.format(Num_link, r_title)
    X_Link = '相似文章推荐:<body><a href="http://127.0.0.1/{}/{}.html" target="_blank">{}</a></body>'.format(td_vt, Num_link, r_title)
    # "回到栏目页 … 查看更多内容" = "back to the column page … see more content".
    more = '<p>回到栏目页:<body><a href="{}" target="_blank">查看更多内容</a></body></p>'.format(td_value)
    # Pull n random body paragraphs matching this keyword.
    cx_sql = ('select t1.content from children_title t '
              'LEFT JOIN children_english t1 on t.ceid=t1.id '
              'where t.ckey=%s ORDER BY RAND() LIMIT %s')
    cursor.execute(cx_sql, data)
    rows = cursor.fetchall()
    rows = [t for t in rows if t != (None,)]
    # Strip all whitespace from each returned paragraph.
    p1 = re.compile(r'\s+')
    content_list = []
    for row in rows:
        content_list.append(p1.sub('', row[0]))
    # content_list now holds the cleaned body paragraphs as separate strings.
    n_piecewise_content = ''
    all_text = ''
    for content_piece, img in zip(content_list, The_url):
        n_piecewise_content += p_lable + nbsp + content_piece + p_lable_last
        center_title = j.center(100) + '\r\n'
        # Centered header image; the zip reuses the first few URLs for every article.
        url = '<p style="text-align:center"><img src="{u}" alt="{rt}"></p>'.format(u=img, rt=j)
        middle = url + n_piecewise_content + X_Link + more  # article body
        all_text = center_title + middle
        all_one_middle.append(middle)
    one_content = all_text + '\r\n'
    # Write the finished article to its own file.
    txt = os.path.join(all_link, str(j) + '.txt')
    with codecs.open(txt, 'w', encoding='utf-8') as out:
        out.write(one_content)
    print("ok")
# print(The_original_title)  # titles
# print(len(all_one_middle))  # article bodies