week2 Hands-on Assignment

Author: wangyw | Published 2016-07-29 16:29

    Design approach

    1. From the host page, get the second-hand market URL (marketurl) of every Ganji city.
    2. From each marketurl, get the URL of every category block (blockurl).
    3. Parse the pager element at the bottom of each block to work out how many list pages the block has, yielding the listurls.
    Note: because Ganji's list pages behave very inconsistently (some blocks hide the pager control, some have only one page yet accept arbitrary page numbers, and so on), this step was not implemented; the crawler only fetches the first page of each block, i.e. blockurl = listurl. A possible approach is sketched right after this list.
    4. Extract every detail-page URL (detailurl) from each listurl.
    5. Parse each detail page, distinguish Zhuanzhuan items from Ganji items, and extract the item information.
    Note: everything is stored in MongoDB collections.
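
    For what it's worth, here is a minimal sketch of what step 3 could look like. It assumes the footer pager sits in a div.pageBox element and that its links point at the other list pages; both the selector and the link handling are guesses about Ganji's markup (exactly the part that proved unreliable), and the helper name get_list_urls is made up for illustration, so this stays outside the crawler itself:

    #coding=utf-8
    '''Sketch only: derive list-page urls for one block from its footer pager (design step 3).'''
    from urllib.parse import urljoin
    from bs4 import BeautifulSoup
    import requests

    def get_list_urls(blockurl):
        '''Collect the list-page urls of one block; falls back to the first page if no pager is found.'''
        resp = requests.get(blockurl)
        soup = BeautifulSoup(resp.content, 'lxml')
        listurls = {blockurl}                          # the first page is always a list page
        for a in soup.select('div.pageBox a'):         # assumed selector for the pager control
            href = a.get('href')
            if href and not href.startswith('javascript'):
                listurls.add(urljoin(blockurl, href))  # resolve relative pager links against the block url
        return sorted(listurls)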

    Code

    main.py

    #coding=utf-8
    '''Crawl item information from the second-hand market of every Ganji city. Because Ganji's
    list-page numbering is inconsistent, only the first page of each block is fetched.
    For every post the crawler collects: title, posting time, type, price, trading location, condition, etc.
    (A multi-process version was planned but not implemented; see the summary.)
    '''
    from configparser import ConfigParser
    from pymongo import MongoClient
    from ganji_crawler import get_citys_market,get_block_urls,get_detail_page_urls,getinfo_from_detailpage
    
    cf = ConfigParser()  # config file parser
    cf.read("ganji.conf")  # read the conf file
    
    db_host = cf.get('mongodb', 'db_host')
    db_port = cf.getint('mongodb', 'db_port')
    client = MongoClient(db_host, db_port)  # connect to mongodb
    ganji_market = client[cf.get('databases', 'db_market')]  # second-hand market database
    
    # collection handles
    citys_market = ganji_market[cf.get('collections', 'collection_citys')]  # city urls
    city_market_block_url = ganji_market[cf.get('collections', 'collection_block_url')]  # block urls per city
    market_detailpage_url = ganji_market[cf.get('collections', 'collection_detail_url')]  # detail-page urls
    market_goods_infos = ganji_market[cf.get('collections', 'collection_goods_info')]  # item info
    
    # unique index on 'link' so duplicate urls raise DuplicateKeyError instead of piling up
    city_market_block_url.create_index('link', unique=True)
    market_detailpage_url.create_index('link', unique=True)
    
    ############# execution ##############
    start_url = 'http://www.ganji.com/index.htm'  # Ganji homepage, entry point for discovering cities
    get_citys_market(start_url, citys_market)   # store city / market urls
    
    for city in citys_market.find():
        get_block_urls(city['link'], city_market_block_url)   # store block urls
    
    for block in city_market_block_url.find({}, {'link': 1, '_id': 0}):
        get_detail_page_urls(block['link'], market_detailpage_url)  # store detail-page urls
    
    
    for idx, detail in enumerate(market_detailpage_url.find({}, {'link': 1, '_id': 0})):  # store item info
        getinfo_from_detailpage(detail['link'], market_goods_infos)
        if idx % 1000 == 0:
            print('{} records have been inserted!'.format(idx))
    
    

    ganji_crawler.py

    #coding=utf-8
    '''Crawler for Ganji's second-hand market'''
    from bs4 import BeautifulSoup
    from pymongo import MongoClient,errors
    import requests,re,time
    
    def get_citys_market(host,collection):
        '''Grab every city's second-hand market url from the Ganji homepage and store it.'''
        resp = requests.get(host)
        soup = BeautifulSoup(resp.content,'lxml')
        links = soup.select('div.all-city > dl > dd > a ')  # every link in the city list
        for link in links:
            collection.insert_one({
                'link' : link['href']+'wu/',  # append 'wu/' to reach the city's second-hand market page
                'city' : link.string
            })
    
    def get_block_urls(city_market_url,collection):
        '''Grab every block (category) url from one city's second-hand market page.
        city_market_url: the second-hand market url of a city
        collection: the collection the parsed block urls are stored in'''
    
        resp = requests.get(city_market_url)
        soup = BeautifulSoup(resp.content,'lxml')
        try:
            div_navigate = soup.select('div.main')[0]
        except IndexError:
            return
        for a in div_navigate.select('a'):
            try:
                href = a['href']
                if href.startswith('/'):    # drop dirty data: the "all categories" section also contains links starting with '#'
                    collection.insert_one({'link': city_market_url[:-4] + href})   # strip the trailing '/wu/' before appending the block path
            except errors.DuplicateKeyError as e:
                print(e)
    
    def get_detail_page_urls(blockurl,collection):
        '''Ganji gives no reliable way to tell how many pages a block has, so for now only the
        detail-page urls on the first page of each block are extracted.'''
    
        resp = myRequestGet(blockurl)
        if not resp:
            return
    
        soup = BeautifulSoup(resp.content,'lxml')
        layoutlist = soup.select('dl.list-bigpic.clearfix')  # one dl tag per listing
        time.sleep(1)
        for layout in layoutlist:
            links = layout.select('a')  # every link inside the listing
            for link in links:
                href = link['href']
                if href.startswith('http://m.zhuanzhuan.58.com'):       # Zhuanzhuan listings
                    # the mobile Zhuanzhuan page loads its data via js, so rewrite the url into the
                    # desktop form whose information can be reached with css selectors
                    infoId = re.findall(r'infoId=(\d+)&', href)[0]
                    href = 'http://zhuanzhuan.58.com/detail/{}z.shtml'.format(infoId)
                    try:
                        collection.insert_one({'source':'zhuanzhuan','link': href})  # store the rewritten Zhuanzhuan url
                    except errors.DuplicateKeyError as e:
                        print(e)
                elif href.endswith('.htm'):
                    try:
                        collection.insert_one({'source':'ganji','link': href})  # store the Ganji url
                    except errors.DuplicateKeyError as e:
                        print(e)
    
    def getinfo_from_detailpage(detailurl,collection):
        '''Extract item information from a detail page.'''
        resp = myRequestGet(detailurl)
        if not resp:
            return
        soup = BeautifulSoup(resp.content, 'lxml')
        time.sleep(0.1)
    
        # Zhuanzhuan item
        if resp.url.startswith('http://zhuanzhuan'):
            try:
                title = soup.select(' h1.info_titile ')[0].string
                price = ''.join(soup.select(' span.price_now ')[0].stripped_strings)
                area = soup.select(' div.palce_li > span > i ')[0].string
                desc = soup.select(' div.baby_kuang.clearfix > p')[0].string
            except IndexError:
                return
            collection.insert_one({
                'source' : 'zhuanzhuan',
                'title': title,
                'price': price,
                'area': area,
                'desc': desc})
        else:   # Ganji item
            try:
                title = soup.select(' h1.title-name ')[0].string
                price = soup.select(' i.f22.fc-orange.f-type ')[0].string
                area = ''.join(soup.select(' ul.det-infor > li:nth-of-type(3) ')[0].stripped_strings)
                desc = soup.select(' .second-sum-cont')[0].get_text().strip()
            except IndexError:
                return
            collection.insert_one({
                'source': 'ganji',
                'title': title,
                'price': price,
                'area': area,
                'desc': desc
                })
    
    def myRequestGet(url):
        '''The crawler occasionally gets blocked, so requests.get is wrapped here:
        on any exception, print it, sleep 10 seconds and return None.'''
        try:
            resp = requests.get(url)
            return resp
        except requests.exceptions.RequestException as e:
            print('Requests Error -----------{}-----------wait 10 seconds'.format(str(e.__class__)))
            time.sleep(10)
            return None
        except Exception as e:
            print('Other Error -----------{}-----------wait 10 seconds'.format(str(e.__class__)))
            time.sleep(10)
            return None
    
    
    def exists(soup):
        '''Check whether a page exists. The check needs the parsed page anyway, so the soup
        object is passed in instead of the url.'''
        if soup.title.string == '您访问的网页不存在':
            return False
        else:
            return True
    
    
    if __name__ == '__main__':
    
        ############## initialisation ##############
        client = MongoClient('mongodb://localhost:27017')
        ganji_market = client['ganji_market']    # second-hand market database
        host = 'http://www.ganji.com/index.htm'  # Ganji homepage, the crawl entry point
    
        # collection for the cities and their second-hand market urls
        citys_market = ganji_market['citys_market']
    
        # collection for the block urls of each city; 'link' gets a unique index to avoid duplicates
        city_market_block_url = ganji_market['city_market_block_url']
        city_market_block_url.create_index('link', unique=True)
    
    
        # collection for the detail-page urls
        market_detailpage_url = ganji_market['market_detailpage_url']
        market_detailpage_url.create_index('link', unique=True)
    
        # collection for the item information
        market_goods_infos = ganji_market['market_goods_infos']
    
        ############## execution ##############
        # get_citys_market(host, citys_market)   # store city / market urls
    
        # for city in citys_market.find():
        #     get_block_urls(city['link'], city_market_block_url)   # store block urls
        # get_block_urls('http://xa.ganji.com/wu/', city_market_block_url)
    
        # for block in city_market_block_url.find({}, {'link': 1, '_id': 0}):
        #     get_detail_page_urls(block['link'], market_detailpage_url)  # store detail-page urls
        # get_detail_page_urls('http://xa.ganji.com/ershoubijibendiannao/', market_detailpage_url)
    
    
        for idx, detail in enumerate(market_detailpage_url.find({}, {'link': 1, '_id': 0}).skip(12000)):  # store item info
            getinfo_from_detailpage(detail['link'], market_goods_infos)
            if idx % 1000 == 0:
                print('{} records have been inserted!'.format(idx))
    
        # getinfo_from_detailpage('http://zhuanzhuan.58.com/detail/755842657703362564z.shtml', market_goods_infos)
        # getinfo_from_detailpage('http://xa.ganji.com/ershoubijibendiannao/2266604261x.htm', market_goods_infos)
    
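    Note that myRequestGet above backs off for 10 seconds when a request fails but does not retry the url; the caller simply skips it. A variant that retries a few times before giving up might look like the sketch below (the name myRequestGetWithRetry and the max_retries / wait parameters are made up for illustration, not part of the original code):

    def myRequestGetWithRetry(url, max_retries=3, wait=10):
        '''Sketch: like myRequestGet, but retry the same url a few times before returning None.'''
        for attempt in range(max_retries):
            try:
                return requests.get(url, timeout=30)  # a timeout also guards against hanging connections
            except requests.exceptions.RequestException as e:
                print('Requests Error {} on attempt {} -- wait {} seconds'.format(e.__class__, attempt + 1, wait))
                time.sleep(wait)
        return None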

    ganji.conf

    [mongodb]
    db_host = localhost
    db_port = 27017
    
    [databases]
    db_market = ganji_market
    
    [collections]
    collection_citys = citys_market
    collection_block_url = city_market_block_url
    collection_detail_url = market_detailpage_url
    collection_goods_info = market_goods_infos
    

    Summary

    1. Multiprocessing was not used, because every function takes two arguments (one of them a collection object) and so does not fit map directly; I'll leave that refactor for later. The multiprocessing and threading modules still need more study. One possible shape of a multi-process version is sketched below this list.
    2. To make the program more robust, several try blocks were added during execution; when the crawler gets blocked it pauses for 10 seconds and then continues, which has worked well so far.
    3. The scraping is built around fixed page templates, so the generality problem remains; perhaps Scrapy could address it.
    4. The configparser module was introduced to read settings from a configuration file, which will be useful again later.
    5. Because the crawler got blocked, only about 200k detail-page urls were collected, so only that many items could be parsed. The robustness improvements were added afterwards and the crawl was not re-run.
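
    As a follow-up to point 1, here is a minimal sketch of how the detail-page stage might be parallelised with multiprocessing. Each worker creates its own MongoClient in an initializer (a MongoClient should not be shared across processes), and plain url strings rather than collection objects are passed to the workers. The pool size, chunksize and helper names are arbitrary choices for illustration, not part of the original code:

    #coding=utf-8
    '''Sketch: multi-process version of the detail-page stage (summary point 1).'''
    from multiprocessing import Pool
    from pymongo import MongoClient
    from ganji_crawler import getinfo_from_detailpage

    goods_collection = None  # one collection handle per worker process

    def init_worker():
        '''Create a MongoClient inside each worker process.'''
        global goods_collection
        client = MongoClient('localhost', 27017)
        goods_collection = client['ganji_market']['market_goods_infos']

    def crawl_one(url):
        getinfo_from_detailpage(url, goods_collection)

    if __name__ == '__main__':
        client = MongoClient('localhost', 27017)
        detail_urls = [d['link'] for d in
                       client['ganji_market']['market_detailpage_url'].find({}, {'link': 1, '_id': 0})]
        with Pool(processes=4, initializer=init_worker) as pool:  # 4 workers is an arbitrary choice
            pool.map(crawl_one, detail_urls, chunksize=100)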
