Getting Started with Web Crawlers
1. The Basic Crawler Workflow
- Step 1: given a URL, fetch the page's HTML. In Python 3 this can be done with either urllib.request or requests (see the sketches after this list).
- The urllib library is part of Python's standard library, so it needs no extra installation; it is available as soon as Python is installed.
- The requests library is a third-party package and has to be installed separately (e.g. with pip).
- Step 2: parse the HTML and extract the content we are interested in. There are many ways to do this, for example regular expressions, XPath, or Beautiful Soup.
- Step 3: store the extracted content.
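Putting the three steps together, a minimal end-to-end sketch might look like the following. The URL, the `<title>` tag being extracted, and the output file name are placeholders for illustration, not taken from any real site:

```python
# -*- coding:UTF-8 -*-
import requests
from bs4 import BeautifulSoup

url = 'https://www.example.com/page.html'  # placeholder URL

# Step 1: fetch the page's HTML. The built-in alternative would be
# urllib.request.urlopen(url).read(), which needs no extra install.
resp = requests.get(url)
html = resp.content

# Step 2: parse the HTML and extract the content we want,
# here simply the page title.
soup = BeautifulSoup(html, features='lxml')
title = soup.find('title').text

# Step 3: store the extracted content.
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write(title + '\n')
```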
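If XPath is preferred over Beautiful Soup for step 2, the lxml library offers it directly. Roughly, under the same placeholder URL as above:

```python
import requests
from lxml import etree

html = requests.get('https://www.example.com/page.html').content  # placeholder
tree = etree.HTML(html)                  # build an element tree from the HTML
title = tree.xpath('//title/text()')[0]  # XPath: text of the <title> element
print(title)
```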
2. A Hands-on Crawler

The script below downloads the web novel 《一念永恒》: it collects the chapter links from the novel's chapter-list page and appends each chapter's text to a local file.
```python
# -*- coding:UTF-8 -*-
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class downloader(object):

    def __init__(self):
        # Chapter-list page of the novel; chapter links found on it are
        # resolved against this URL with urljoin() below.
        self.target = 'https://www.biquga.com/5_5693/'
        self.names = []  # chapter titles
        self.urls = []   # chapter URLs
        self.nums = 0    # number of chapters

    def get_download_url(self):
        """Fetch the chapter-list page and collect chapter titles and URLs."""
        req = requests.get(url=self.target)
        html = req.content
        div_bf = BeautifulSoup(html, features='lxml')
        div = div_bf.find_all('div', {'id': 'list'})
        # The <div id="list"> holds all chapter links. The first and last few
        # <a> tags are "latest chapters" / navigation links, so the slice
        # a[9:-2] keeps only the real chapter list (site-specific offsets).
        a = div[0].find_all('a')
        self.nums = len(a[9:-2])
        for each in a[9:-2]:
            self.names.append(each.string)
            # urljoin handles both relative and absolute hrefs correctly.
            self.urls.append(urljoin(self.target, each.get('href')))

    def get_contents(self, target):
        """Fetch one chapter page and return its body text."""
        req = requests.get(url=target)
        html = req.content
        bf = BeautifulSoup(html, features='lxml')
        texts = bf.find_all('div', {'id': 'content'})
        return texts[0].text

    def writer(self, name, path, text):
        """Append one chapter (title + body) to the output file."""
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n')


if __name__ == '__main__':
    dl = downloader()
    dl.get_download_url()
    print('Start downloading 《一念永恒》:')
    for i in range(dl.nums):
        dl.writer(dl.names[i], '一念永恒.txt', dl.get_contents(dl.urls[i]))
        # Overwrite the same console line with the running percentage.
        sys.stdout.write('  Downloaded: %.3f%%\r' % ((i + 1) / dl.nums * 100))
        sys.stdout.flush()
    print('《一念永恒》 download complete')
```
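The script above calls requests.get with no headers or timeout. Many sites reject clients that do not send a browser-like User-Agent, and a stalled connection would hang the whole download, so in practice it may be worth hardening the fetch. A small sketch; the header string, timeout, and delay values are assumptions, not from the original code:

```python
import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # assumed browser-like UA string

def fetch(url):
    resp = requests.get(url, headers=HEADERS, timeout=10)  # fail instead of hanging
    resp.raise_for_status()  # raise on HTTP errors such as 404 or 503
    time.sleep(0.5)          # be polite: throttle requests to the site
    return resp.content
```

Replacing the two bare requests.get calls in get_download_url and get_contents with fetch() would apply these safeguards throughout.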