题引:了解页面结构后,就可以利用类库丰富的 Python 进行抓取;本文只是搬运,并按自己的理解做个记录。
章节dom树情况.png'''
Python 对代码书写规范的要求比较严格,写出来的代码整齐且美观。
先定义一个函数,获取章节列表数据
'''
def get_chapter(url):
    '''
    Fetch a chapter index page and extract its chapter links.

    :param url: URL of the page that lists the chapters.
    :return: list of (chapter_url, chapter_name) tuples in the order they
        appear in the page; an empty list when the page cannot be fetched.
    '''
    # Each chapter entry is expected to look like
    # <dd><a href="...">title</a></dd> in the page source.
    chapter_dom = r'<dd><a href="(.*?)">(.*?)</a></dd>'
    try:
        # Raw HTML of the chapter index page.
        chapter_page_source = requests.get(url).text
    except requests.RequestException as e:
        # Report the failure and return an empty list, so the caller never
        # hits an unbound local when the request fails.
        print(e)
        return []
    return re.findall(chapter_dom, chapter_page_source)
内容页.png
def get_content(chapter_list, novel_name, novel_author):
    '''
    Download each chapter and append it to ``<novel_name>.txt``.

    Every chapter is written as a record made of the novel name, the author
    and a ``###``-prefixed chapter title followed by the chapter text. A
    chapter that fails is reported and skipped so the remaining chapters are
    still downloaded.

    :param chapter_list: iterable of (chapter_url, chapter_name) pairs.
    :param novel_name: novel title; also used as the output file's base name.
    :param novel_author: author name written into every chapter record.
    :return: None
    '''
    # Compiled once: the pattern is identical for every chapter page.
    con_pattern = re.compile(r'<div id="content">(.*?)</div>', re.S)
    count = 0
    length = len(chapter_list)
    for chapter_url, chapter_name in chapter_list:
        try:
            # Sleep 1-2 seconds at random to avoid tripping anti-scraping limits.
            time.sleep(1 + random.random())
            content_source = requests.get(chapter_url).text
            matched = con_pattern.search(content_source)
            if matched is None:
                # Say which chapter has no content block instead of letting
                # an opaque "list index out of range" bubble up.
                print('未找到正文内容: ' + '###' + chapter_name)
                continue
            content = matched.group(1)
            # Strip the markup left inside the content block.
            # NOTE(review): the ' ' replacement removes every space character;
            # possibly this was an HTML entity (e.g. &nbsp;) in the original
            # source - confirm.
            for fragment in ('<br/>', ' ', '<p>', '</p>'):
                content = content.replace(fragment, '')
            count += 1
            with open(novel_name + '.txt', 'a', encoding='utf-8') as f:
                # The '###' prefix marks chapter titles so the text file can
                # be split into chapters later.
                f.write(
                    novel_name + '\n' * 2 + '作者: ' + novel_author + '\n' * 2
                    + '###' + chapter_name + '\n' * 2 + content + '\n' * 2)
            print('正在写入: ' + '###' + chapter_name)
            print('进度: {:.2%}'.format(count / length))
        except Exception as e:
            # Per-chapter best-effort boundary: name the failing chapter,
            # then carry on with the next one.
            print('写入失败: ' + '###' + chapter_name)
            print(e)
*两个主要函数写好后,再把之前作为入口的搜书逻辑写成函数,留待后面调用。*
def download(book_name):
    '''
    Search the site for a novel and return the exact-title match.

    :param book_name: title to search for; a result is accepted only when
        its listed name equals this string exactly.
    :return: (novel_url, novel_name, novel_author) of the first exact match,
        or None when the search request fails or no result matches.
    '''
    search_real_url = 'https://www.biquge5200.com/modules/article/search.php'
    # Captures, per result row: novel URL, novel name, author name.
    search_dom = r'<td class="odd"><a href="(.*?)">(.*?)</a></td>.*?<td class="odd">(.*?)</td>'
    try:
        # Passing the title through ``params`` lets requests percent-encode
        # it, so characters such as '&' or '#' cannot corrupt the query.
        novel_source = requests.get(
            search_real_url, params={'searchkey': book_name}).text
    except requests.RequestException as e:
        # Without a response there is nothing to search through.
        print(e)
        return None

    # All search results (novel URL, name and author).
    novel_list = re.findall(search_dom, novel_source, re.S)
    if not novel_list:
        print('~当前所查找的小说不存在!')
        return None

    for novel_url, novel_name, novel_author in novel_list:
        if novel_name == book_name:
            print('即将下载的小说:%s 作者:%s' % (novel_name, novel_author))
            # The three values that get_chapter and get_content need.
            return novel_url, novel_name, novel_author
    # Results were found, but none carries exactly the requested title.
    return None
最后执行要查找的书
if __name__ == '__main__':
    # Script entry point: ask for a title, look it up, then download it.
    book_name = input('确保输入正确的小说名:')
    # download() gives back None when no result matches the title exactly;
    # unpacking that directly would raise TypeError, so check it first.
    found = download(book_name)
    if found is None:
        print('未找到小说: ' + book_name)
    else:
        novel_url, novel_name, novel_author = found
        chapter_list = get_chapter(novel_url)
        get_content(chapter_list, novel_name, novel_author)
Run的结果
以上,好处是可以完整地把数据写入到txt里,缺点就是单线程。
PS:bye了个bye
网友评论