Scraping Company Information with the Scrapy Python Crawler Framework

Author: 程大哥T_T | Published 2018-12-05 16:49

    Goal: scrape all company listings on huangye88.com (黄页88), starting from http://b2b.huangye88.com/region/

    First, install scrapy and pymongo.

    Installation and project creation are routine, so we will only skim over them:

    pip install scrapy 
    pip install pymongo
    scrapy startproject sellsystem
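
    The startproject command generates the standard Scrapy skeleton (the exact file set depends on the Scrapy version); the files we will edit below live in the inner sellsystem package:

    sellsystem/
        scrapy.cfg
        sellsystem/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py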
    

    Create our spider file under the spiders directory:

    import copy
    
    import scrapy
    
    
    from ..items import SellItem
    
    
    # region index -> districts -> industry lists (paginated) -> company detail pages
    class indexSpider(scrapy.Spider):
        name = 'sell_finally'
        all_province = []
        start_urls = [
            'http://b2b.huangye88.com/region/'
        ]
        page = 1
    
        def parse(self, response):  # entry point
            urls = response.xpath('//dl[@id="clist"]/dd/a/@href').extract()
            for itm in urls:
                print(itm)
                print('111111111111')
                yield scrapy.Request(itm, callback=self.parse_qu)  # url
    
        def parse_qu(self, response):  # http://b2b.huangye88.com/anyang/
            uurls = response.xpath('//*[@id="subarealist"]/div[2]/a/@href').extract()
            for url in uurls:
                print(url)
                print('22222222222222')
                yield scrapy.Request(url, callback=self.parse_instury_list)  # url
    
        def parse_instury_list(self, response):  # the industry categories for a district
            item = SellItem()
            urls = response.xpath('//div[@class="tag_tx"]/ul/li/a/@href').extract()
            privince = response.xpath('//div[@class="subNav"]/a[2]/text()').extract()[0][:-4]  # province
            city = response.xpath('//div[@class="subNav"]/a[3]/text()').extract()[0][:-4]  # city
            district = response.xpath('/html/body/div[3]/div[1]/text()').extract()[2]  # district
            item['privince'] = privince  # province
            item['city'] = city  # city
            item['district'] = district[district.find('市') + 1:-6]  # district
    
            for itm in urls:
                print('33333333333333')
                print(item)
                yield scrapy.Request(itm, callback=self.parse_instury, meta={'item': copy.deepcopy(item)},dont_filter=True)
    
        def parse_instury(self, response):  # industry listing: company links + pagination
            print('--------------------------')
            seitem = response.meta['item']
            print(seitem)
            print(response.url)
            # items = response.xpath('//*[@id="jubao"]/dl/dt/h4/a/text()')
            # all company URLs on this page
            content_urls = response.xpath('//*[@id="jubao"]/dl/dt/h4/a/@href').extract()
            if len(content_urls) > 0:
                for itm in content_urls:
                    itm = itm + 'company_contact.html'  # go to the "contact us" page
                    print(itm)
                    print('4444444444444')
                    yield scrapy.Request(itm, callback=self.parse_content, meta={'item': copy.deepcopy(seitem)},dont_filter=True)
            # next page
            hrefs = response.xpath(
                '//div[@class="page_tag Baidu_paging_indicator"]/span/following-sibling::a[1]/@href').extract()
            if len(hrefs) > 0:
                print('next page ------')
                yield scrapy.Request(hrefs[0], callback=self.parse_instury, meta={'item': copy.deepcopy(seitem)},dont_filter=True)
    
        def parse_content(self, response):  # detail page: contact information
            item = response.meta['item']
            item['page_url'] = response.url
            print('===================')
            print(item)
            # walk the fields on the contact page, one <li> per field
            li_array = response.xpath('//ul[@class="con-txt"]/li').extract()
            index = 0
            for p in li_array:
                title = response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/label/text()').extract()[0]
                print('index : ' + str(index) + ' len : ' + str(len(li_array)) + '  title : ' + title)
                if title == '联系人:':
                    tt = response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/text()').extract()
                    if len(tt) > 0:
                        item['link_people'] = tt[0]
                    else:
                        item['link_people'] = \
                            response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/a/text()').extract()[
                                0]  # contact person
                if title == '公司名称:':
                    item['company_name'] = \
                        response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/text()').extract()[0]  # company name
                if title == '地址:':
                    item['compay_place'] = \
                        response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/text()').extract()[0]  # address
                if title == '电话:':
                    item['phone'] = response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/text()').extract()[
                        0]  # phone
                if title == '手机:':
                    item['phone2'] = response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/text()').extract()[
                        0]  # mobile phone
                if title == '公司主页:':
                    item['website'] = \
                        response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/a/text()').extract()[0]  # company homepage
                index += 1
            # link to the "company profile" tab
            uu2 = response.xpath('//ul[@class="meun"]/a[2]/@href').extract()
            if len(uu2) > 0:  # guard before indexing, otherwise an empty result raises IndexError
                print('uu2: ' + uu2[0])
                yield scrapy.Request(url=uu2[0], callback=self.parse_content2, meta={'item': copy.deepcopy(item)},dont_filter=True)
    
        def parse_content2(self, response):  # detail page 2: company profile
            item = response.meta['item']
            # company-profile field list
            li_array = response.xpath('//ul[@class="con-txt"]/li').extract()
            print('3333333333333333333333333333333333333333')
            print(li_array)
            index = 0
            for p in li_array:
                title = response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/label/text()').extract()[0]
                if title == '成立时间:':
                    item['establish_time'] = \
                        response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/text()').extract()[0]
                    print('establish time: ' + item['establish_time'])
                if title == '员工人数:':
                    item['company_people_num'] = \
                        response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/text()').extract()[0]
                if title == '主营产品:':
                    item['product'] = response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/text()').extract()[
                        0]
                if title == '主营行业:':
                    item['industry'] = \
                        response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/a/text()').extract()[0]
                if title == '企业法人:':
                    item['faren'] = response.xpath('//ul[@class="con-txt"]/li[' + str(index + 1) + ']/text()').extract()[0]
                index += 1
            item['introdocution'] = response.xpath('//p[@class="txt"]/text()').extract()[0]
            yield copy.deepcopy(item)
    
    

    Note that the item we pass through a Request's meta when we yield is not copied by Scrapy: every pending request references the same object, so under concurrent crawling the fields get overwritten and the data comes out scrambled. Passing copy.deepcopy(item) gives each request its own independent copy and fixes this.
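
    A minimal standalone sketch of the problem (with made-up values): every plain reference to the original dict sees later mutations, while a deepcopy keeps its own data.

    import copy

    item = {'city': 'anyang'}
    shared = item                      # same object: later changes show up here too
    independent = copy.deepcopy(item)  # fully separate copy

    item['city'] = 'beijing'
    print(shared['city'])       # 'beijing' -- the shared reference was overwritten
    print(independent['city'])  # 'anyang'  -- the deep copy is unaffected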

    Our items.py file:

    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class SellItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        link_people = scrapy.Field()  # contact person
        phone = scrapy.Field()  # phone
        phone2 = scrapy.Field()  # mobile phone
        company_name = scrapy.Field()  # company name

        company_instury = scrapy.Field()  # main products
        compay_place = scrapy.Field()  # company address
        website = scrapy.Field()  # company homepage
        privince = scrapy.Field()  # province
        city = scrapy.Field()  # city
        district = scrapy.Field()  # district
        establish_time = scrapy.Field()  # founding date
        company_people_num = scrapy.Field()  # number of employees
        product = scrapy.Field()  # main products
        industry = scrapy.Field()  # industry
        faren = scrapy.Field()  # legal representative
        introdocution = scrapy.Field()  # company profile
        page_url = scrapy.Field()  # URL of the scraped page
    

    The scraped items are processed in pipelines.py and saved to MongoDB:

    # -*- coding: utf-8 -*-
    
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
    import pymongo
    from scrapy.utils.project import get_project_settings


    class SellsystemPipeline(object):
        def __init__(self):
            # scrapy.conf has been removed from newer Scrapy releases; read the project settings via the helper instead
            settings = get_project_settings()
            host = settings['MONGODB_HOST']
            port = settings['MONGODB_PORT']
            dbName = settings['MONGODB_DBNAME']
            client = pymongo.MongoClient(host=host, port=port)
            tdb = client[dbName]
            self.post = tdb[settings['MONGODB_DOCNAME']]

        def process_item(self, item, spider):
            bookInfo = dict(item)
            self.post.insert_one(bookInfo)  # insert() is deprecated in pymongo; insert_one() does the same job
            return item
    
    

    Set the MongoDB parameters in settings.py:

    MONGODB_HOST = '127.0.0.1'
    MONGODB_PORT = 27017
    MONGODB_DBNAME = 'sell'
    MONGODB_DOCNAME = 'company'
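
    For the pipeline to run, it also has to be enabled in settings.py; assuming the project and class names used above, that looks like this:

    ITEM_PIPELINES = {
        'sellsystem.pipelines.SellsystemPipeline': 300,
    }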
    

    Create a main.py file in the project root directory:

    from scrapy import cmdline
    cmdline.execute('scrapy crawl sell_finally'.split())
    

    Finally, run main.py to start the crawl.

    In roughly 20 minutes this collects over 100,000 records, though that depends on your network speed.
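
    Once the crawl finishes, a quick sanity check from a Python shell (a minimal sketch, assuming the MongoDB settings above and pymongo 3.7+):

    import pymongo

    client = pymongo.MongoClient('127.0.0.1', 27017)
    companies = client['sell']['company']
    print(companies.count_documents({}))        # how many companies were saved
    print(companies.find_one({}, {'_id': 0}))   # peek at one document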
