
Python实战计划 Study Notes (11): Week 2 Assignment

Author: 如恒河沙 | Published 2016-09-05 10:01

    Task

    Crawl all the product listings on the Ganji site (bj.ganji.com).

    Approach

    1. Grab the URLs of all the channels, 20 in total
    2. For each channel, read at least 70 pages of product URLs, skip pages that fall outside the pager's range, and store the URLs in the database
    3. Read the product URLs from the database one by one, open each page, scrape the product details, and store them in the database

    My code

    I wrote three program files:

    1. get_channels.py reads the URLs of the 20 channels
    2. page_parser.py handles the product URLs and product details
    3. counts_ganji.py monitors the number of product URLs in the database
    • get_channels.py
    from bs4 import BeautifulSoup
    import requests
    
    start_url = 'http://bj.ganji.com/wu/'
    url_host = 'http://bj.ganji.com'
    
    def get_channel_urls(url):
        web_data = requests.get(url)
        web_data.encoding = 'utf-8'  # the page came back garbled, so declare the encoding explicitly
        soup = BeautifulSoup(web_data.text, 'lxml')
        links = soup.select('dl.fenlei > dt > a')
        for link in links:
            page_url = url_host + link.get('href')
            channel_name = link.get_text()  # channel name; only the printed URLs are reused below
            print(page_url)
        print('Total of', len(links), 'channels.')
    
    get_channel_urls(start_url)
    
    # This script produces the channel list below (the printed URLs were pasted in by hand)
    
    channel_list = '''
        http://bj.ganji.com/jiaju/
        http://bj.ganji.com/rirongbaihuo/
        http://bj.ganji.com/shouji/
        http://bj.ganji.com/shoujihaoma/
        http://bj.ganji.com/bangong/
        http://bj.ganji.com/nongyongpin/
        http://bj.ganji.com/jiadian/
        http://bj.ganji.com/ershoubijibendiannao/
        http://bj.ganji.com/ruanjiantushu/
        http://bj.ganji.com/yingyouyunfu/
        http://bj.ganji.com/diannao/
        http://bj.ganji.com/xianzhilipin/
        http://bj.ganji.com/fushixiaobaxuemao/
        http://bj.ganji.com/meironghuazhuang/
        http://bj.ganji.com/shuma/
        http://bj.ganji.com/laonianyongpin/
        http://bj.ganji.com/xuniwupin/
        http://bj.ganji.com/qitawupin/
        http://bj.ganji.com/ershoufree/
        http://bj.ganji.com/wupinjiaohuan/
    '''
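
    A possible variant (a sketch only, not part of the assignment): have the function return the URLs as a list instead of printing them, so page_parser.py could import them rather than hard-coding channel_list. The helper name get_channel_urls_as_list is hypothetical; it reuses the url_host and selector defined above.

    # Hypothetical variant: return the channel URLs instead of printing them
    def get_channel_urls_as_list(url):
        web_data = requests.get(url)
        web_data.encoding = 'utf-8'
        soup = BeautifulSoup(web_data.text, 'lxml')
        return [url_host + link.get('href') for link in soup.select('dl.fenlei > dt > a')]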
    
    • page_parser.py
    from multiprocessing import Pool
    from bs4 import BeautifulSoup
    import requests
    import time
    import pymongo
    
    client = pymongo.MongoClient('localhost',27017)
    test_ganji = client['test_ganji']
    url_list = test_ganji['url_list_ganji']
    item_info = test_ganji['item_info']
    
    channel_list = '''
        http://bj.ganji.com/jiaju/
        http://bj.ganji.com/rirongbaihuo/
        http://bj.ganji.com/shouji/
        http://bj.ganji.com/shoujihaoma/
        http://bj.ganji.com/bangong/
        http://bj.ganji.com/nongyongpin/
        http://bj.ganji.com/jiadian/
        http://bj.ganji.com/ershoubijibendiannao/
        http://bj.ganji.com/ruanjiantushu/
        http://bj.ganji.com/yingyouyunfu/
        http://bj.ganji.com/diannao/
        http://bj.ganji.com/xianzhilipin/
        http://bj.ganji.com/fushixiaobaxuemao/
        http://bj.ganji.com/meironghuazhuang/
        http://bj.ganji.com/shuma/
        http://bj.ganji.com/laonianyongpin/
        http://bj.ganji.com/xuniwupin/
        http://bj.ganji.com/qitawupin/
        http://bj.ganji.com/ershoufree/
        http://bj.ganji.com/wupinjiaohuan/
    '''
    
    # Spider 1: crawl all on-sale listings under one channel
    def get_links_from(channel, pages, who_sells='3'):
        # Listing pages follow the pattern <channel>a<who_sells>o<page>/
        list_view = '{}a{}o{}/'.format(channel, str(who_sells), str(pages))
        print('reading ', list_view)
        web_data = requests.get(list_view)
        web_data.encoding = 'utf-8'  # the page came back garbled, so declare the encoding explicitly
        time.sleep(4)
        soup = BeautifulSoup(web_data.text, 'lxml')
        page_validity = soup.select('div.pageBox')  # pages beyond the pager's range have no pageBox div
        if page_validity:
            print('Page ', pages, ' is valid')
            if soup.find('div', 'ft-db'):
                for link in soup.select('a.ft-tit'):
                    item_link = link.get('href')
                    if item_link[7:19] != 'bj.ganji.com':  # characters 7-18 of the URL hold the domain
                        print('external link omitted')
                    elif url_list.find_one({'url': item_link}):
                        print('url exists')
                    else:
                        url_list.insert_one({'url': item_link})
                        print('saving', item_link)
            else:
                pass  # no listing block on this page
        else:
            print('Page ', pages, ' is invalid')
    
    
    
    # Spider 2: crawl the detail page of each item
    def get_item_info(url):
        url = url.split('?')[0]  # drop the query string so each item is stored only once
        time.sleep(2)
        if item_info.find_one({'url': url}):
            print('item exists')
        else:
            web_data = requests.get(url)
            soup = BeautifulSoup(web_data.text, 'lxml')
            title = soup.title.text
            post_time = soup.select('ul.title-info-l.clearfix > li:nth-of-type(1) > i')[0].get_text().strip().split(' ')[0]
            item_type = soup.select('div.leftBox > div:nth-of-type(3) > div > ul > li:nth-of-type(1) > span > a')[0].get_text()
            price = soup.select('i.f22')[0].get_text()
            location = list(soup.select('div.leftBox > div:nth-of-type(3) > div > ul > li:nth-of-type(3)>a')[0].stripped_strings)[1:]
            contact_name = soup.select('div.leftBox > div:nth-of-type(3) > div > ul > li:nth-of-type(5)>li')
            if soup.select('div > ul > li:nth-of-type(6) > span.phoneNum-style'):
                contact_number = soup.select('div > ul > li:nth-of-type(6) > span.phoneNum-style')[0].get_text().strip()
            else:
                contact_number = ''
            if soup.select('div > ul > li:nth-of-type(7) > span'):
                contact_qq = soup.select('div > ul > li:nth-of-type(7) > span')[0].get_text().strip()
            else:
                contact_qq = ''
            print('saving ', url)
            item_info.insert_one({'url': url,
                                  'title': title,
                                  'post_time': post_time,
                                  'type': item_type,
                                  'price': price,
                                  'location': location,
                                  'contact_name': contact_name[0].text.split('\xa0')[2][0:8].strip(),
                                  'contact_number': contact_number,
                                  'contact_qq': contact_qq})
    
    
    
    # Crawl 70 listing pages per channel
    def get_all_links_from(channel):
        for num in range(1,71,1):
            get_links_from(channel,num)
    
    
    # Stage 1: crawl the listing pages of every channel and save the item URLs
    if __name__ == '__main__':
        pool = Pool(processes=1)
        pool.map(get_all_links_from, channel_list.split())
    
    
    
    
    # Stage 2: fetch the details of every saved item URL
    # (this second __main__ block runs right after the one above)
    if __name__ == '__main__':
        items = []
        for link in url_list.find():
            items.append(link['url'])
        pool = Pool()
        pool.map(get_item_info, items)
    
    • counts_ganji.py
    import time
    import pymongo
    
    client = pymongo.MongoClient('localhost',27017)
    test_ganji = client['test_ganji']
    url_list = test_ganji['url_list_ganji']
    
    # Print the number of collected item URLs every 5 seconds
    while True:
        print(url_list.find().count())
        time.sleep(5)
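
    Note that Cursor.count() works on the PyMongo version used here, but it is deprecated in PyMongo 3.7+ and removed in 4.x. On a newer PyMongo, a minimal equivalent sketch of the same monitor loop would be:

    # Same monitor loop, written against newer PyMongo (count_documents)
    while True:
        print(url_list.count_documents({}))
        time.sleep(5)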
    

    Results

    1. 54,943 product records were crawled in total
    2. Running 4 crawler processes in parallel gave a clear performance boost (see the sketch below)
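
    A sketch of the 4-process setup mentioned in result 2 (assumption: the get_all_links_from function and channel_list string from page_parser.py are in scope; the listing as posted uses processes=1):

    from multiprocessing import Pool

    if __name__ == '__main__':
        pool = Pool(processes=4)   # four worker processes instead of one
        pool.map(get_all_links_from, channel_list.split())
        pool.close()
        pool.join()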

    Ways to improve crawling efficiency

    • Use proxies so that the sleep() delays can be shortened (see the sketch after this list)
    • Use multithreading on a single-core CPU and multiprocessing on a multi-core CPU
    • Use the faster lxml library for page parsing
    • Send requests asynchronously (non-blocking I/O)
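
    For the proxy idea in the first bullet, a minimal sketch (the proxy address and the helper name get_with_proxy are placeholders for illustration, not a working setup):

    import requests

    proxies = {'http': 'http://127.0.0.1:8888'}  # placeholder proxy address

    def get_with_proxy(url):
        # Routing requests through a proxy lets the crawler shorten time.sleep()
        # without a single IP getting blocked.
        return requests.get(url, proxies=proxies, timeout=10)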

    Another way to skip already-crawled items after the program is interrupted

    db_urls = [item['url'] for item in url_list.find()]      # every URL collected by spider 1
    index_urls = [item['url'] for item in item_info.find()]  # every URL already parsed by spider 2
    x = set(db_urls)
    y = set(index_urls)
    rest_of_urls = x - y                                     # URLs still waiting to be crawled
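
    With rest_of_urls in hand, the interrupted crawl can be resumed by mapping only the remaining URLs (a sketch that assumes get_item_info and Pool from page_parser.py are in scope):

    if __name__ == '__main__':
        pool = Pool()
        pool.map(get_item_info, list(rest_of_urls))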
    
