
Python Practice Assignment, Week 2: Scraping Product Listings from Ganji.com

By 浮生只言片语 | Published 2017-05-31 08:26

Tasks:

1. Collect the category links from the index page http://sh.ganji.com/wu/.

2. From each category page, collect the product links, with an end-of-page check so that pages past the last one do not feed bad data into the crawl (a minimal sketch of that check follows this list).

3. From each product page, extract the product information: title (goods_title), price (price), and trade location (swap_site).
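
The end-of-page check in item 2 relies on Ganji rendering a placeholder <div class="noinfo"> inside #infolist once the page number runs past the last listing page; the selector comes from the Part 2 code below, and the helper name is_last_page is only for illustration. A minimal sketch:

    from bs4 import BeautifulSoup
    import requests

    def is_last_page(url):
        # Pages past the end of a category show <div class="noinfo"> in #infolist
        # instead of a listing table, so a non-empty select() result means "stop".
        soup = BeautifulSoup(requests.get(url).text, 'lxml')
        return bool(soup.select('#infolist > div.noinfo'))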

Results:

Product links (screenshot: Snip20170531_5.png)

Product details (screenshot: Snip20170531_6.png)

Multiprocessing (screenshot: Snip20170531_7.png)

Code:

Part 1 (get_channel.py):

    from bs4 import BeautifulSoup
    import requests
    
    '''
    start_url = 'http://sh.ganji.com/wu/'
    url = 'http://sh.ganji.com'
    wb_data = requests.get(start_url)
    soup = BeautifulSoup(wb_data.text,'lxml')
    
    channel_links = soup.select('#wrapper > div.content > div > div > dl > dt > a')
    
    for channel_link in channel_links:
        link = url+channel_link.get('href')
        print(link)
    '''
    channels = '''
        http://sh.ganji.com/jiaju/
        http://sh.ganji.com/rirongbaihuo/
        http://sh.ganji.com/shouji/
        http://sh.ganji.com/bangong/
        http://sh.ganji.com/nongyongpin/
        http://sh.ganji.com/jiadian/
        http://sh.ganji.com/ershoubijibendiannao/
        http://sh.ganji.com/ruanjiantushu/
        http://sh.ganji.com/yingyouyunfu/
        http://sh.ganji.com/diannao/
        http://sh.ganji.com/xianzhilipin/
        http://sh.ganji.com/fushixiaobaxuemao/
        http://sh.ganji.com/meironghuazhuang/
        http://sh.ganji.com/shuma/
        http://sh.ganji.com/laonianyongpin/
        http://sh.ganji.com/xuniwupin/
        http://sh.ganji.com/qitawupin/
        http://sh.ganji.com/ershoufree/
        http://sh.ganji.com/wupinjiaohuan/
        '''
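
The channels block above is just a whitespace-separated string; Part 3 turns it into a list of category URLs with split(). A quick sanity check:

    # channels.split() yields the list of category URLs that Part 3
    # maps over with the process pool.
    category_urls = channels.split()
    print(len(category_urls))   # 19 categories
    print(category_urls[0])     # http://sh.ganji.com/jiaju/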
    

Part 2 (get_urls.py):

    from bs4 import BeautifulSoup
    import requests
    import pymongo
    import time
    
    
    client = pymongo.MongoClient('localhost', 27017)
    ganji = client['ganji']
    sheet_urls = ganji['sheet_urls']   # collected product links
    sheet_info = ganji['sheet_info']   # scraped product details

    # Sample inputs left over from testing the two functions below (unused here).
    urls = ['http://sh.ganji.com/jiaju/o{}'.format(i) for i in range(57, 63)]
    url = 'http://zhuanzhuan.ganji.com/detail/788991434206199812z.shtml'
    
    
    
    def get_url(url):
        # Collect all product links on one listing page into sheet_urls.
        wb_data = requests.get(url)
        time.sleep(2)                        # pause between requests to be polite
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # A page number past the last page renders a <div class="noinfo"> placeholder.
        if soup.select('#infolist > div.noinfo'):
            print('End')
            return
        goods_urls = soup.select('#infolist > div > table > tbody > tr > td.t > a')
        for goods_url in goods_urls:
            href = goods_url.get('href')
            # Skip promotional links that stay on sh.ganji.com; real items
            # point to zhuanzhuan.ganji.com detail pages.
            if href.split('/')[2] != 'sh.ganji.com':
                link = href.split('?')[0]    # drop the tracking query string
                sheet_urls.insert_one({'url': link})
                print(link)
    
    def get_info(url):
        # Scrape title, price, and trade location from one product detail page.
        wb_data = requests.get(url)
        time.sleep(2)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # A removed listing shows a placeholder page title; skip those pages.
        if soup.title.text.split() == ['【图】_的闲置物品-转转,赶集二手']:
            pass
        else:
            goods_title = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.box_left_top > h1')[0].text
            #times = soup.select('')     # post time, type, and condition were left unscraped
            #types = soup.select('')
            price = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.price_li > span > i')[0].text
            area = soup.select('body > div.content > div > div.box_left > div.info_lubotu.clearfix > div.info_massege.left > div.palce_li > span > i')[0].text
            #degrees = soup.select('')
            print(goods_title, price, area)
            sheet_info.insert_one({'goods_title': goods_title, 'price': price, 'area': area})
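
Before wiring these functions into the process pool, both can be exercised on a single page; the listing URL below is illustrative and follows the channel + 'o' + page-number pattern used in Part 3:

    if __name__ == '__main__':
        # One listing page: page 3 of the furniture category (illustrative URL).
        get_url('http://sh.ganji.com/jiaju/o3')
        # Then a few of the detail pages just collected.
        for item in sheet_urls.find().limit(3):
            get_info(item['url'])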
    

Part 3 (main program):

    from multiprocessing import Pool
    from get_urls import get_url,get_info
    import pymongo
    from get_channel import channels
    
    
    client = pymongo.MongoClient('localhost',27017)
    ganji = client['ganji']
    sheet_urls = ganji['sheet_urls']
    sheet_info = ganji['sheet_info']
    
    def get_all_links(channel):
        # Walk the first 10 listing pages of one category; get_url prints 'End'
        # and stops inserting once a page past the last one is reached.
        for n in range(1, 11):
            url = channel + 'o{}'.format(n)
            get_url(url)


    # Guard the pool so worker processes do not re-run this module on import.
    if __name__ == '__main__':
        pool = Pool(4)

        # Spider 1: use the process pool to collect all product links, one category per task.
        pool.map(get_all_links, channels.split())

        # Spider 2: use the process pool to fetch the details of every collected link.
        for i in sheet_urls.find():
            pool.apply_async(get_info, args=(i['url'],))
        pool.close()
        pool.join()
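
While the two spiders run, progress can be watched from a separate terminal by polling the two collections. A minimal sketch, assuming pymongo 3.7+ for count_documents (older versions would use cursor.count() instead):

    import time
    import pymongo

    client = pymongo.MongoClient('localhost', 27017)
    ganji = client['ganji']

    while True:
        # Number of product links and detail records saved so far.
        print(ganji['sheet_urls'].count_documents({}),
              ganji['sheet_info'].count_documents({}))
        time.sleep(5)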
    
