
Python Crawler Practice Notes 2-3: A Multiprocess Crawler

Author: Sugeei | Published 2016-07-08 07:02

    An exercise in using multiprocessing.Pool.

    Source code

    main.py

    #!/usr/bin/python
    # -*- coding: UTF-8 -*-
    
    #  multiprocess
    
    from multiprocessing import Pool
    import time
    
    from urlhandler import insert_urls_by_nav, get_nav_urls
    from mongoconn import mongoset
    
    # note: urlhandler.py opens its own collection handle,
    # so this one is not used directly in main.py
    table = mongoset('58sale', 'itemurls')
    
    if __name__ == '__main__':
        starttime = time.time()
        print('start:')
        print(time.strftime('%Y-%m-%d %H:%M:%S'))
    
        pool = Pool()
    
        url = 'http://sh.58.com/sale.shtml'
        # collect the category (navigation) urls, then crawl the
        # categories in parallel, one task per category
        navurls = get_nav_urls(url)
    
        pool.map(insert_urls_by_nav, navurls)
    
        endtime = time.time()
        print(time.strftime('%Y-%m-%d %H:%M:%S'))
        elapsed = endtime - starttime
        print('elapsed: {:.0f} seconds'.format(elapsed))  # report the total running time
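
    Both files import from a mongoconn module that the post does not include. A minimal sketch of what it might contain, assuming pymongo and a MongoDB instance on the default local port (only the two imported names, mongoset and mongoinsert, come from the post; the bodies are my guesses):

    # mongoconn.py -- hypothetical reconstruction, not shown in the original post
    import pymongo

    def mongoset(dbname, collname):
        # open (or create) a collection on a local MongoDB instance
        client = pymongo.MongoClient('localhost', 27017)
        return client[dbname][collname]

    def mongoinsert(table, datamany):
        # bulk-insert a list of documents; insert_many raises on an empty list
        if datamany:
            table.insert_many(datamany)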
    

    urlhandler.py

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    #
    #  functions to get item urls
    
    from bs4 import BeautifulSoup
    import requests
    import time
    
    from mongoconn import mongoset, mongoinsert
    
    
    def get_soup(url):
        source = requests.get(url)
        soup = BeautifulSoup(source.text, 'lxml')
        return soup
    
    def combineurls(url, page):
        pageurls = []
        for i in range(1, page+1):
            pageurl = '{}{}/'.format(url, i)
            pageurls.append(pageurl)
        return pageurls
    
    def get_nav_urls(url):
        soup = get_soup(url)
        navlist = soup.select('ul.ym-mainmnu span.dlb > a')
        absurls = []
        for submnu in navlist:
            href = submnu.get('href')
            if not href:
                continue  # skip menu entries without a link
            # url ends with '/sale.shtml' (11 chars); stripping it leaves
            # the site root, to which the relative href is appended
            absurl = url[0:-11] + href
            if absurl not in absurls:
                absurls.append(absurl)
        return absurls
    
    def get_page_urls(url):
        # build the paginated urls 'pn1/' .. 'pn70/' (70 pages assumed per category)
        urls = combineurls(url + 'pn', 70)
        return urls
    
    def get_page_urls_bk(url):
        # backup version that reads the real last page number from the pager
        curpage = 1
        maxpage = 0
        while curpage > maxpage:
            maxpage = curpage
            pageurl = url + 'pn' + str(maxpage)
            soup = get_soup(pageurl)
            pager = soup.select('div.pager > a')
            pagenum = pager[len(pager)-3].select('span')[0].get_text()  # -3 is a temporary workaround; needs rethinking
            curpage = int(pagenum)
        urls = combineurls(url+'pn', maxpage)
        return urls
    
    def listtodict(urls):
        datamany = []
        for itemurl in urls:
            data = {
                'itemurl': itemurl
            }
            datamany.append(data)
        return datamany
    
    def get_item_urls(url):
        soup = get_soup(url)
        print(url)
        itemlist = soup.select('tr.zzinfo > td.img > a')
        itemurls = []
        for item in itemlist:
            itemurl = item.get('href')
            if itemurl:  # .get() returns None when the href attribute is missing
                itemurls.append(itemurl)
        #time.sleep(1)
        return itemurls
    
    def getemtext(element):
        # collapse all whitespace in an element's text
        return element.get_text().strip().replace('\t', '').replace('\n', '').replace(' ', '')
    
    def get_urls_by_nav(navurl):
        # earlier variant, superseded by insert_urls_by_nav below
        navurls = get_page_urls(navurl)
        for pageurl in navurls:
            itemurls = get_item_urls(pageurl)
            mongoinsert(table, listtodict(itemurls))
    
    # collection handle shared by the insert functions in this module
    table = mongoset('58sale', 'itemurls')
    
    def insert_urls_by_nav(navurl):
        navurls = get_page_urls(navurl)
        for pageurl in navurls:
            itemurls = get_item_urls(pageurl)
            #mongoinsert(table, listtodict(itemurls))
            if itemurls:  # insert_many raises on an empty list
                table.insert_many(listtodict(itemurls))
    
    if __name__ == '__main__':
        url = 'http://sh.58.com/sale.shtml'
        get_nav_urls(url)
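
    The -3 indexing in get_page_urls_bk is flagged above as a temporary workaround. A more robust sketch (my own, assuming the same div.pager markup as the original selector, and reusing the get_soup helper above) takes the largest purely numeric page label instead of relying on position:

    def get_max_page(url):
        # hypothetical replacement for the pager[-3] hack: scan every pager
        # link and keep the largest label that is purely numeric
        soup = get_soup(url + 'pn1/')
        nums = []
        for a in soup.select('div.pager > a'):
            span = a.select('span')
            if span:
                text = span[0].get_text().strip()
                if text.isdigit():
                    nums.append(int(text))
        return max(nums) if nums else 1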
    
    
    Output
    2016-07-02 15:55:10
    ...
    http://sh.58.com/shoujihao/pn4/
    http://sh.58.com/shoujihao/pn5/
    http://sh.58.com/danche/pn2/
    http://sh.58.com/zixingche/pn2/
    http://sh.58.com/shoujihao/pn6/
    http://sh.58.com/shouji/pn2/
    http://sh.58.com/shoujihao/pn7/
    http://sh.58.com/shoujihao/pn8/
    http://sh.58.com/danche/pn3/
    http://sh.58.com/zixingche/pn3/
    http://sh.58.com/shoujihao/pn9/
    http://sh.58.com/shouji/pn3/
    http://sh.58.com/shoujihao/pn10/
    http://sh.58.com/shoujihao/pn11/
    ...
    2016-07-02 15:57:38
    
    Summary
    • The output shows that within each category the pages are crawled in order, while the categories themselves run in parallel. The shoujihao category finishes faster than the others because its pages contain no target listings. (A minimal demo of this interleaving follows the list.)
    • Counting in the mongo shell shows 41650 urls were collected in total:
    > db.itemurls.count()
    41650
    
    • Using the recorded start and end times, crawling the 41650 urls took 2 minutes 28 seconds in total.
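
    A minimal, self-contained demo (hypothetical category names, not the crawler itself) reproduces the interleaving pattern observed above: each worker processes its item's pages strictly in order, while the items themselves run in parallel:

    from multiprocessing import Pool
    import os
    import time

    def crawl_category(name):
        # pages within one category are fetched in order by a single worker
        for page in range(1, 4):
            print('%d %s pn%d' % (os.getpid(), name, page))
            time.sleep(0.1)  # stand-in for the network request

    if __name__ == '__main__':
        # categories are distributed across worker processes,
        # so their output lines interleave
        pool = Pool()
        pool.map(crawl_category, ['shouji', 'shoujihao', 'danche'])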
