



遇到的问题:
response.encoding='utf-8' 中文显示乱码


原因:原网站的页面编码是 gb2312,所以把 response.encoding 设置成 'gb2312' 之后中文就不会乱码了

这样也不会出错
完整代码:
import requests
import re
# Download one novel from jjwxc.net: read the index page, then fetch every
# chapter linked from it and append the chapter text to "<novel title>.txt".
url = 'http://www.jjwxc.net/onebook.php?novelid=379995'

# The pages are served as gb2312, but Python's 'gb2312' codec is strict and
# turns any character outside that set into U+FFFD. 'gbk' is a superset, so it
# decodes the same pages without losing the rarer characters.
# (Equivalent manual form: response.text.encode('ISO-8859-1').decode('gbk').)
page_encoding = 'gbk'

# requests waits forever by default; fail instead of hanging on a dead server.
request_timeout = 10  # seconds

response = requests.get(url, timeout=request_timeout)
response.encoding = page_encoding
html = response.text

title_match = re.search(r'<span itemprop="articleSection">(.*?)</span>', html)
if title_match is None:
    # Without a title there is no output file name; stop with a clear message
    # rather than an IndexError.
    raise SystemExit('Could not find the novel title on %s' % url)
title = title_match.group(1)

# The title comes from the remote page; replace characters that are not
# allowed in file names so open() cannot fail or escape the directory.
safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()

# Each entry is a (chapter_url, chapter_title) pair taken from the index page.
chapters = re.findall(r'<a itemprop="url" href="(.*?)">(.*?)</a>', html, re.S)

# Compiled once because it is applied to every chapter page.
content_pattern = re.compile(
    r'<div style="clear:both;"></div>(.*?)<div id="favoriteshow_3"', re.S)

# The context manager guarantees the file is flushed and closed even if a
# request fails part-way through the download.
with open('%s.txt' % safe_title, 'w', encoding='utf-8') as fb:
    for chapter_url, chapter_title in chapters:
        chapter_response = requests.get(chapter_url, timeout=request_timeout)
        chapter_response.encoding = page_encoding
        chapter_html = chapter_response.text

        content_match = content_pattern.search(chapter_html)
        if content_match is None:
            # Some chapter pages may not contain the expected markup; skip
            # them instead of aborting the whole download.
            print('skipped (no content found): %s' % chapter_title)
            continue

        chapter_content = content_match.group(1)
        chapter_content = chapter_content.replace('<br>', '\n')
        # Ideographic (full-width) space -> plain space.
        chapter_content = chapter_content.replace('\u3000', ' ')
        # NOTE(review): the original also replaced '•' with itself, which does
        # nothing; presumably the first argument was an HTML entity such as
        # '&bull;' that got decoded when the code was pasted -- confirm and
        # restore if the output still contains raw entities.

        fb.write(chapter_title)
        fb.write(chapter_content)
        fb.write('\n')
        print(chapter_title)


网友评论