Sequential processing is barely acceptable for a few hundred pages, but crawling a large site with 1,000,000 pages at one page per second would take more than 11 days (1,000,000 s ÷ 86,400 s per day ≈ 11.6 days).
4.1
Goal:
fetch the list of the world's most popular websites from http://s3.amazonaws.com/alexa-static/top-1m.csv.zip
and crawl the top 500 of them with the link crawler.
from io import BytesIO, TextIOWrapper
import requests
from zipfile import ZipFile

url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
urls = []
resp = requests.get(url, stream=True)
with ZipFile(BytesIO(resp.content)) as zf:
    csv_name = zf.namelist()[0]
    with zf.open(csv_name) as csv_file:
        for item in csv_file:
            # each line looks like "rank,domain"; strip the trailing newline before splitting
            website = item.decode('utf-8').strip().split(',')[1]
            urls.append(website)
Abstracted into an AlexaCallback class:
import csv
from zipfile import ZipFile
from io import TextIOWrapper, BytesIO
import requests

class AlexaCallback:
    def __init__(self, max_urls=500):
        self.max_urls = max_urls
        self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
        self.urls = []

    def __call__(self):
        resp = requests.get(self.seed_url, stream=True)
        with ZipFile(BytesIO(resp.content)) as zf:
            csv_filename = zf.namelist()[0]
            with zf.open(csv_filename) as csv_file:
                for _, website in csv.reader(TextIOWrapper(csv_file)):
                    self.urls.append('http://' + website)
                    if len(self.urls) == self.max_urls:
                        break
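A quick way to exercise the class (illustrative only; the actual URLs depend on the downloaded list):

alexa = AlexaCallback(max_urls=500)
alexa()                  # downloads the zipped CSV and fills alexa.urls
print(len(alexa.urls))   # should print 500
print(alexa.urls[:3])    # first few seed URLs for the link crawler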
Change of crawl target:
because of the Great Firewall these sites are too slow to reach,
so I changed the target to the top 500 shopping websites listed on http://top.chinaz.com/.
Code that keeps only internal links:
if not re.match('^(/|' + domain + ')', href):  # keep only pages under the current domain
    continue
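To make the filter concrete, here is a small illustrative check; the domain and hrefs below are made up:

import re

domain = 'http://example.com'            # hypothetical domain, for illustration only
pattern = '^(/|' + domain + ')'

for href in ['/about.html',                    # relative internal link  -> kept
             'http://example.com/list.html',   # absolute internal link  -> kept
             'http://other-site.com/x.html']:  # external link           -> skipped
    print(href, bool(re.match(pattern, href)))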
4.2 Sequential Crawler
Compared with the earlier link crawler, two additions were made:
1. The crawled domains have changed, so each site's robots.txt file must be handled (fetched and cached) per site.
2. URL paths need validity checks and normalisation.
The time taken by the sequential crawler is reported at the end of this subsection.
The following code fetches the top 500 shopping websites from http://top.chinaz.com/ and writes them to a local file:
import requests
from bs4 import BeautifulSoup
import re
import json
import sys
import csv
sys.path.append('..')
from download import download

'''
Scrape the top few hundred most popular shopping sites from Chinaz.com into a list.
'''

class GetWebList(object):
    def __init__(self, web_num=100, max_page=20):  # web_num: number of sites wanted; max_page: number of result pages to crawl
        self.web_num = web_num
        self.web_li = []
        self.max_page = max_page

    def __call__(self):
        start_url = 'http://search.top.chinaz.com/top.aspx?p='
        page = 1
        # stop once enough sites have been collected or the page limit is reached
        while page < self.max_page and len(self.web_li) < self.web_num:
            url = start_url + str(page)
            html = download(url)
            soup = BeautifulSoup(html, 'html.parser')
            div_li = soup.find_all('div', attrs={'class': 'ContTit ulli clearfix'})
            for div in div_li:
                href = div.find('div', attrs={'class': 'w320 PCop'}).a.get('href')
                href = 'http://' + re.findall(r'site_(.*?)\.html', href)[0]
                self.web_li.append(href)
            page += 1

path = r'D:\study\python\python_crawler\data\weblist.csv'

if __name__ == '__main__':
    gwl = GetWebList()
    gwl()
    with open(path, 'w', newline='') as fp:
        cw = csv.writer(fp)
        for web in gwl.web_li:
            cw.writerow([web])  # writerow takes an iterable and writes each element into one column of the row
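The core code below pops URLs from a list named crawl_li. A minimal sketch of loading the saved weblist.csv back into that list (the path and names follow the script above; everything else is assumed) could be:

import csv

# read the previously saved site list back into the initial crawl queue
path = r'D:\study\python\python_crawler\data\weblist.csv'
crawl_li = []
with open(path, newline='') as fp:
    for row in csv.reader(fp):
        if row:                    # skip any blank lines
            crawl_li.append(row[0])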
Core crawler code:
while crawl_li:
    # inspecting the URLs shows that entries in the to-crawl list are missing the http:// scheme
    url = crawl_li.pop()
    print('url:', url)
    domain = 'http://' + urlparse(url).netloc
    no_robots = False
    rp = robots.get(domain)  # bug fix: look up the domain variable, not the literal string 'domain'
    if not rp:  # not in the robots cache yet
        robot_url = '{}/robots.txt'.format(domain)
        rp = get_robot_parser(robot_url)
        if not rp:
            no_robots = True
        else:
            robots[domain] = rp
    if no_robots or rp.can_fetch(user_agent, url):
        depth = seen.get(url, 0)
        if depth == max_depth:  # depth check to avoid crawler traps
            print('this may be a trap:', url)
            continue
        html = d(url)
        if not html:
            continue
        href_li = re.findall(r"""<a\shref=['"](.*?)['"]""", html)  # all links in this page
        for href in href_li:
            if not re.match('^(/|' + domain + ')', href):  # keep only internal links
                continue
            # normalise the href into an absolute URL
            if 'http' not in href:
                if href.startswith('//'):
                    href = '{}{}'.format('http:', href)
                elif href.startswith('://'):
                    href = '{}{}'.format('http', href)
                else:  # href is a relative link
                    href = '{}{}'.format(domain, href)
            if href in seen:  # skip links that have already been visited
                print('the url has been visited before!')
                continue
            crawl_li.append(href)
            seen[href] = depth + 1  # bug fix: record the child link's depth, not the parent URL's
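The loop relies on a get_robot_parser() helper that is not shown in this section. A minimal sketch of how it might be implemented with the standard urllib.robotparser module (this is my assumption, not the original helper) is:

from urllib import robotparser

def get_robot_parser(robot_url):
    """Fetch and parse robots.txt; return None if it cannot be read."""
    try:
        rp = robotparser.RobotFileParser()
        rp.set_url(robot_url)
        rp.read()                 # downloads and parses robots.txt
        return rp
    except Exception:
        # connection errors etc.: treat the site as having no robots.txt
        return None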
Crawl time (in seconds):
# this is only a learning exercise, so no per-site customisation was done
# as a result, some sites failed to download
386.28738594055176
4.3 Multithreaded Crawler
A multithreaded crawler lets other threads keep downloading while one thread is blocked on network I/O, so overall CPU utilisation goes up.
No explicit locks are needed here (in CPython, list append/pop are atomic thanks to the GIL), so deadlock cannot occur.
This is also a chance to review Python's threading basics.
The usual pattern, driven from the main thread, is:
1. define a worker function containing the work each thread performs;
2. keep a list of threads and track in real time which of them are still alive.
So, to turn the single-threaded link crawler into a multithreaded one, it is enough to let several threads operate on the shared crawl queue; a small sketch of the pattern follows.
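A minimal sketch of that pattern, using a made-up shared task list in place of the crawl queue (everything here is illustrative, not part of the crawler):

import threading

tasks = list(range(20))        # shared work list (list pop/append are atomic under the GIL)
max_thread = 4

def worker():
    # each thread keeps taking tasks until the list is empty
    while tasks:
        try:
            item = tasks.pop()
        except IndexError:      # another thread emptied the list first
            break
        print(threading.current_thread().name, 'processed', item)

threads = [threading.Thread(target=worker) for _ in range(max_thread)]
for t in threads:
    t.start()
for t in threads:
    t.join()                   # wait until all workers have finished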
The full multithreaded crawler code:
def process_queen():  # worker function executed by each thread
    while crawl_li:
        # inspecting the URLs shows that entries in the to-crawl list are missing the http:// scheme
        url = crawl_li.pop()
        print('url:', url)
        domain = 'http://' + urlparse(url).netloc
        no_robots = False
        rp = robots.get(domain)  # bug fix: look up the domain variable, not the literal string 'domain'
        if not rp:  # not in the robots cache yet
            robot_url = '{}/robots.txt'.format(domain)
            rp = get_robot_parser(robot_url)
            if not rp:
                no_robots = True
            else:
                robots[domain] = rp
        if no_robots or rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:  # depth check to avoid crawler traps
                print('this may be a trap:', url)
                continue
            html = d(url)
            if not html:
                continue
            href_li = re.findall(r"""<a\shref=['"](.*?)['"]""", html)  # all links in this page
            for href in href_li:
                if 'com' not in href and 'cn' not in href:  # bug fix: skip only links containing neither 'com' nor 'cn'
                    continue
                # normalise the href into an absolute URL
                if 'http' not in href:
                    if href.startswith('//'):
                        href = '{}{}'.format('http:', href)
                    elif href.startswith('://'):
                        href = '{}{}'.format('http', href)
                    else:  # href is a relative link
                        href = '{}{}'.format(domain, href)
                if href in seen:  # skip links that have already been visited
                    print('the url has been visited before!')
                    continue
                crawl_li.append(href)
                seen[href] = depth + 1  # bug fix: record the child link's depth, not the parent URL's
threads = []
while threads or crawl_li:
    for thread in threads[:]:  # iterate over a copy so finished threads can be removed safely
        if not thread.is_alive():
            threads.remove(thread)
    while len(threads) < max_thread and crawl_li:
        thread = threading.Thread(target=process_queen)
        thread.daemon = True  # daemon threads will not keep the interpreter alive on exit
        thread.start()
        threads.append(thread)
    for thread in threads:
        thread.join()
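As an aside, the same fan-out can also be written with concurrent.futures.ThreadPoolExecutor from the standard library; this is a different, more compact technique than the manual thread list above, sketched here only for comparison (process_queen and max_thread are the names used above):

from concurrent.futures import ThreadPoolExecutor

# start max_thread workers, each running the same worker function over the shared crawl queue
with ThreadPoolExecutor(max_workers=max_thread) as executor:
    for _ in range(max_thread):
        executor.submit(process_queen)
# the with-block exits only after every submitted worker has returned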
Multiprocess crawler:
skipped for now.