import re
# Regular-expression module, used to extract post text from the page HTML
import urllib.request
# Used to download the HTML source of each page
import sys
import os
import re
from bs4 import BeautifulSoup
def geturls(path):
    """Read the URL list file and return one URL string per line.

    The first three characters of every line are dropped, as before
    (presumably a fixed-width prefix in the list file -- confirm against
    the file format).  Surrounding whitespace, including the line ending
    kept by the file iterator, is stripped so that the URLs can be passed
    directly to ``urllib.request.urlopen``.

    Args:
        path: Path of a UTF-8 encoded text file with one entry per line.

    Returns:
        list[str]: One entry per line of the file, in file order.  Lines
        are never dropped, so list indices still match line numbers.
    """
    with open(path, 'r', encoding='utf-8') as f1:
        # Slice first, then strip: the prefix width is counted from the
        # start of the raw line exactly as in the original code.
        urls = [line[3:].strip() for line in f1]
    print('get urls list ready!')
    return urls
def getHtml(urls, start=546):
    """Lazily download pages, yielding the raw body of each URL.

    Args:
        urls: Sequence of URL strings; it must support slicing.
        start: Index of the first URL to fetch.  Defaults to 546, the
            offset that used to be hard-coded here (presumably a resume
            point from an earlier run -- confirm before relying on it).

    Yields:
        bytes: The undecoded response body of each URL from ``start`` on.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or by reading
        the response propagates to the consumer and ends the generator.
    """
    for num, url in enumerate(urls[start:], start):
        print('第' + str(num) + '条')
        # Use the response as a context manager so the connection is
        # closed as soon as the body has been read.
        with urllib.request.urlopen(url) as page:
            html = page.read()
        yield html
    print('get htmls list ready!')
def getcontents(htmls, dir):
    """Extract post text from downloaded pages and append it to a file.

    For every page, the text inside each ``<div id="post_content_...">``
    element is collected.  Entries that still contain markup or image
    references (anything matched by the noise pattern below) are skipped;
    the rest are appended, one per line, to ``<dir>/contents.txt``.

    Args:
        htmls: Iterable of raw page bodies (bytes, decoded as UTF-8).
        dir: Output directory; it is created if it does not exist.

    A page that cannot be decoded or written is reported on stdout and
    skipped so that the remaining pages are still processed.
    """
    os.makedirs(dir, exist_ok=True)

    # Compile both patterns once instead of once per page.
    # NOTE(review): the dots in '.jpg' / '.png' are unescaped, so they match
    # any character; the pattern is kept unchanged to preserve which entries
    # are filtered -- confirm whether a literal dot was intended.
    noise_re = re.compile(r'<div|<.jpg|.png|div|img')
    content_re = re.compile(r'<div id="post_content_.*?>(.*?)</div>')

    # Write UTF-8 explicitly: the text is decoded as UTF-8 and may contain
    # characters the platform's default encoding cannot represent.
    with open(os.path.join(dir, 'contents.txt'), 'a', encoding='utf-8') as f1:
        # Iterate directly: calling __next__() in a loop condition would
        # discard a page on every pass and end with an uncaught
        # StopIteration.
        for html in htmls:
            try:
                content_list = content_re.findall(html.decode('utf-8'))
                for content in content_list:
                    if not noise_re.search(content):
                        f1.write(content)
                        f1.write('\n')
                print('共' + str(len(content_list)) + '条文本')
            except Exception as e:
                # Deliberate best-effort: report the bad page and move on.
                print(' error~, continue')
                print(e)
def main():
    """Run the pipeline: load the URL list, fetch pages, save post text."""
    output_folder = 'content'
    # The pages are produced lazily, so downloading happens while
    # getcontents consumes them.
    page_stream = getHtml(geturls('_防诈骗.txt'))
    getcontents(page_stream, output_folder)


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
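# NOTE: stray page heading "网友评论" ("reader comments"), apparently left over from the web page this script was copied from; not part of the program.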