Practicing the use of Pool
Source code
main.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# multiprocess
from multiprocessing import Pool
import time
from urlhandler import insert_urls_by_nav, get_nav_urls
from mongoconn import mongoset

# not actually used in this script; urlhandler keeps its own handle
table = mongoset('58sale', 'itemurls')

if __name__ == '__main__':
    starttime = time.time()
    print('start: ')
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    pool = Pool()
    url = 'http://sh.58.com/sale.shtml'
    navurls = get_nav_urls(url)
    # each navigation url is handled by a worker process in the pool
    pool.map(insert_urls_by_nav, navurls)
    endtime = time.time()
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    elapsed = endtime - starttime
    print('elapsed: {:.0f} seconds'.format(elapsed))
urlhandler.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# functions to get item urls
from bs4 import BeautifulSoup
import requests
import time
from mongoconn import mongoset, mongoinsert


def get_soup(url):
    # fetch a page and return its parsed BeautifulSoup tree
    source = requests.get(url)
    soup = BeautifulSoup(source.text, 'lxml')
    return soup

def combineurls(url, page):
    pageurls = []
    for i in range(1, page + 1):
        pageurl = '{}{}/'.format(url, i)
        pageurls.append(pageurl)
    return pageurls

def get_nav_urls(url):
    # collect the absolute urls of the category navigation links
    soup = get_soup(url)
    navlist = soup.select('ul.ym-mainmnu span.dlb > a')
    absurls = []
    for submnu in navlist:
        try:
            # url[0:-11] strips '/sale.shtml', leaving the site root
            absurl = url[0:-11] + submnu.get('href')
        except TypeError:
            # skip entries without a usable href
            continue
        if absurl not in absurls:
            absurls.append(absurl)
    return absurls

def get_page_urls(url):
    # get urls with page ids (hard-coded to 70 pages per category)
    urls = combineurls(url + 'pn', 70)
    return urls

def get_page_urls_bk(url):
    # alternative: walk the pager to find the actual last page number
    curpage = 1
    maxpage = 0
    while curpage > maxpage:
        maxpage = curpage
        pageurl = url + 'pn' + str(maxpage)
        soup = get_soup(pageurl)
        pager = soup.select('div.pager > a')
        pagenum = pager[len(pager) - 3].select('span')[0].get_text()  # -3 is a temporary workaround, needs rethinking
        curpage = int(pagenum)
    urls = combineurls(url + 'pn', maxpage)
    return urls

def listtodict(urls):
    # wrap each url in a dict so it can be stored as a mongo document
    datamany = []
    for itemurl in urls:
        data = {
            'itemurl': itemurl
        }
        datamany.append(data)
    return datamany

def get_item_urls(url):
    # extract the item links from one listing page
    soup = get_soup(url)
    print(url)
    itemlist = soup.select('tr.zzinfo > td.img > a')
    itemurls = []
    if len(itemlist):
        for item in itemlist:
            try:
                itemurl = item.get('href')
            except AttributeError:
                # skip items without an href
                continue
            itemurls.append(itemurl)
    #time.sleep(1)
    return itemurls

def getemtext(element):
    return element.get_text().strip().replace('\t', '').replace('\n', '').replace(' ', '')

# module-level collection handle shared by the functions below
table = mongoset('58sale', 'itemurls')


def get_urls_by_nav(navurl):
    # variant that writes through the mongoinsert helper
    navurls = get_page_urls(navurl)
    for pageurl in navurls:
        itemurls = get_item_urls(pageurl)
        mongoinsert(table, listtodict(itemurls))

def insert_urls_by_nav(navurl):
    # crawl every page of one navigation category and store the item urls
    navurls = get_page_urls(navurl)
    for pageurl in navurls:
        itemurls = get_item_urls(pageurl)
        #mongoinsert(table, listtodict(itemurls))
        if itemurls:
            table.insert_many(listtodict(itemurls))

if __name__ == '__main__':
    url = 'http://sh.58.com/sale.shtml'
    get_nav_urls(url)
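The mongoconn module imported above is not listed in the post. Below is a minimal sketch of what mongoset and mongoinsert might look like, assuming pymongo and a MongoDB instance on localhost; the helper names and arguments are taken from how they are called above, everything else is an assumption.
mongoconn.py (hypothetical sketch)
# hypothetical sketch -- the real mongoconn module is not shown in the post
from pymongo import MongoClient


def mongoset(dbname, tablename):
    # return a handle to the given collection on a local MongoDB instance
    client = MongoClient('localhost', 27017)
    return client[dbname][tablename]


def mongoinsert(table, datamany):
    # bulk-insert a list of documents, skipping empty lists
    if datamany:
        table.insert_many(datamany)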
Output
2016-07-02 15:55:10
...
http://sh.58.com/shoujihao/pn4/
http://sh.58.com/shoujihao/pn5/
http://sh.58.com/danche/pn2/
http://sh.58.com/zixingche/pn2/
http://sh.58.com/shoujihao/pn6/
http://sh.58.com/shouji/pn2/
http://sh.58.com/shoujihao/pn7/
http://sh.58.com/shoujihao/pn8/
http://sh.58.com/danche/pn3/
http://sh.58.com/zixingche/pn3/
http://sh.58.com/shoujihao/pn9/
http://sh.58.com/shouji/pn3/
http://sh.58.com/shoujihao/pn10/
http://sh.58.com/shoujihao/pn11/
...
2016-07-02 15:57:38
Summary
- Judging by the output, pages within each category are crawled in order, while the categories themselves are processed in parallel. The shoujihao category also finishes faster than the others because its pages contain none of the target listings. (A minimal sketch illustrating this interleaving follows the summary.)
- Counting in the mongo shell shows that 41650 urls were collected in total:
> db.itemurls.count()
41650
- The start and end times were recorded; crawling the 41650 urls took 2 minutes 28 seconds in total.
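As a minimal, self-contained sketch of that interleaving (hypothetical worker and category names, not part of the crawler), the snippet below maps a slow task over a short list with Pool.map; each worker prints its own pages in order while output from different categories interleaves:
# hypothetical sketch: Pool.map runs tasks in parallel while each task
# processes its own items sequentially
from multiprocessing import Pool
import os
import time


def crawl_category(name):
    # stands in for insert_urls_by_nav: pages within one category are sequential
    for page in range(1, 4):
        print('pid {} {} pn{}'.format(os.getpid(), name, page))
        time.sleep(0.1)


if __name__ == '__main__':
    with Pool(4) as pool:
        pool.map(crawl_category, ['shouji', 'shoujihao', 'danche', 'zixingche'])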