Socom.cn (搜企网) Crawler Assignment

Author: dpkBat | Published 2017-07-03 16:28

    Assignment Requirements

    (1) Write the scraped data to a CSV file.
    (2) MySQL operations from Python. This requires installing MySQL plus a Python package for talking to it; pymysql is recommended (see the sketch after this list).
    (3) Company information from the http://www.socom.cn site. Have a look at that page; crawl the whole site if you are interested, but cover at least one province.
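
    For requirement (2), here is a minimal sketch of writing one record with pymysql. The database name sqw, the table corp_info, and its columns are placeholders invented for illustration; note that the reference code below ends up storing into MongoDB instead (see the Notes section).

    import pymysql

    # Connect to a local MySQL server (credentials are placeholders)
    conn = pymysql.connect(host='localhost', user='root', password='***',
                           database='sqw', charset='utf8mb4')
    try:
        with conn.cursor() as cursor:
            # Hypothetical table: corp_info(name, address, phone)
            cursor.execute(
                'INSERT INTO corp_info (name, address, phone) VALUES (%s, %s, %s)',
                ('示例公司', '北京市朝阳区', '010-00000000'))
        conn.commit()
    finally:
        conn.close()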

    Crawl Process

    1. From the home page of http://www.socom.cn, collect the URLs of all regions nationwide.

    2. Check whether the region is further divided into counties (county-level cities), districts, etc.

    3. If the region has no further subdivisions, crawl its business categories.

    4. For each category, crawl the links to all of its companies.

    5. Follow each company link and crawl the company's detailed information.
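
    In code, the five steps chain together roughly as in this condensed view of main() (a sketch using the function names from the reference code below; error handling omitted):

    db = sql.init()                                              # open storage
    for city_url in get_all_detail_url(home_page_url):           # steps 1-2: find leaf regions
        for category_url in get_part_url(city_url) or []:        # step 3: business categories
            for corp_url in get_url_of_corp(category_url) or []: # step 4: company links
                parser_detail(get_html(corp_url), db)            # step 5: detail pages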

    Notes

    The Socom yellow-pages site holds a lot of data, so a CSV file is a poor fit: the record count exceeds 65,536, and spreadsheets in the legacy .xls format support at most 65,536 rows. A test crawl confirmed that Beijing and Tianjin alone already yield more than 50,000 companies, so a full-site crawl would certainly blow past the 65,536-row limit. The reference code therefore stores the records in MongoDB instead.

    Reference Code

    # sqwSpider.py
    import requests
    from lxml import etree
    import csv   # used by the commented-out CSV variant below
    import time
    import sql

    # Home page URL
    home_page_url = 'http://www.socom.cn'
    # Sample detail-page URL (handy for testing parser_detail)
    detail_url = 'http://www.socom.cn/company/16001195.html'

    def get_html(url):
        # Fetch a page, retrying on timeouts and connection errors.
        # Note: requests raises its own exception types; the built-in
        # TimeoutError/ConnectionError of the original draft never fired.
        try:
            resp = requests.get(url, timeout=10)
            if resp.status_code == 200:
                return resp
            return None
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
            # The retry's result must be returned, otherwise the caller
            # receives None even when the retry succeeds.
            return get_html(url)
    
    # Collect the city URLs on the home page, grouped by province
    def parse_home_page(home_page_url):
        citys_url_list = []
        resp = get_html(home_page_url)
        if resp:
            root = etree.HTML(resp.text)
            num = len(root.xpath('//body/div[@class="contentBox"][4]/div[@class="provinceBox"]'))
            for i in range(1, num + 1):
                # Relative hrefs of all cities under the i-th province
                citys_url = root.xpath('//body/div[@class="contentBox"][4]/div[@class="cityBox"][{}]/a/@href'.format(i))
                # Turn them into absolute URLs
                citys = [home_page_url + url for url in citys_url]
                citys_url_list.append(citys)
        return citys_url_list
    
    # Decide whether an address is a leaf region (province -> prefecture-level city -> county-level city)
    def city_is_end(city_url):
        resp = get_html(city_url)
        if resp:
            root = etree.HTML(resp.text)
            # SelectorGadget-generated XPath: counts the links in the third
            # contentBox's cityBox. On leaf-region pages this count is 35
            # (an empirical value from the author's test crawls).
            num_links = len(root.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "contentBox", " " )) and (((count(preceding-sibling::*) + 1) = 3) and parent::*)]//*[contains(concat( " ", @class, " " ), concat( " ", "cityBox", " " ))]/a/text()'))
            print(num_links)  # debug output
            return num_links == 35
        return False
    
    # Get the county-level subdivision URLs of a region
    def get_city_part(city_url):
        resp = get_html(city_url)
        if resp:
            html = resp.text
            root = etree.HTML(html)
            city_parts= root.xpath('//body/div[@class="contentBox"][1]/div[@class="cityBox"]/a/@href')
            city_parts_url = []
            for part in city_parts:
                city_parts_url.append(home_page_url + part)
            return city_parts_url
        else:
            return None
        
    # Get the business-category URLs of a leaf region
    def get_part_url(city_url):
        resp = get_html(city_url)
        if resp:
            html = resp.text
            root = etree.HTML(html)
            corps = root.xpath('//div[@class="contentBox"][2]/div[@class="cityBox"]/a[@class="countyBox"]/@href')
            corps_url = []
            for part in corps:
                corps_url.append(home_page_url + part)
            return corps_url
        else:
            return None
    
        
    # Get the links of all companies in one category
    def get_url_of_corp(part_url):
        resp = get_html(part_url)
        if resp:
            html = resp.text
            root = etree.HTML(html)
            parts = root.xpath('//div[@class="contentBox"][3]/div[@class="cityBox"]/a/@href')
            parts_url = []
            for part in parts:
                parts_url.append(home_page_url + part)
            return parts_url
        else:
            return None
    
    # Collect the URLs of all leaf regions (the entry points to the category pages)
    def get_all_detail_url(home_page_url):
        last_city_list = []
        # Flatten the per-province lists into a single work queue
        queue = sum(parse_home_page(home_page_url), [])
        # Process the queue item by item. (An earlier draft removed items
        # from the list while iterating over it, which skips elements.)
        while queue:
            url = queue.pop(0)
            print(url, end='\t')
            if city_is_end(url):
                print('leaf region reached')
                last_city_list.append(url)
            else:
                print('fetching county-level subdivisions')
                parts = get_city_part(url)
                if parts:  # get_city_part returns None on fetch failure
                    queue.extend(parts)
        return last_city_list
    
    # Extract the fields on a company detail page and store the record
    def parser_detail(resp, db):
        detail = {}
        if resp:
            root = etree.HTML(resp.text)
            info = root.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "cityBox", " " ))]//div[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]/text()')
            if info:
                detail['公司名称'] = root.xpath('//div[@class="contentBox"][2]/div[@class="provinceBox"]/text()')[0]
                fields = ['地址', '电话', '传真', '手机', '网址', '邮箱', '联系人',
                          '公司人数', '注册资金', '经济类型', '公司产品', '公司简介']
                # Each info line looks like '标签:值'; keep the part after the colon.
                # (If the site uses the full-width '：', split on that instead.)
                for field, line in zip(fields, info):
                    detail[field] = line.strip().split(':')[-1]
                # CSV variant (disabled; see the Notes section on the row limit):
                # with open('sqw.csv', 'a', newline='') as csv_file:
                #     writer = csv.writer(csv_file)
                #     writer.writerow(list(detail.values()))
                sql.insert_detail(db, detail)
                return detail
        return None
    
    
    
    def main():
        db = sql.init()
        # CSV header variant (disabled; see the Notes section):
        # with open('sqw.csv', 'w', newline='') as csv_file:
        #     writer = csv.writer(csv_file)
        #     writer.writerow(['公司名称', '地址', '电话', '传真', '手机', '网址', '邮箱', '联系人', '公司人数', '注册资金', '经济类型', '公司产品', '公司简介'])
        detail_urls = get_all_detail_url(home_page_url)
        # Iterate over all leaf regions
        for url in detail_urls:
            print('downloading', url)
            # Iterate over all business categories of the region
            for part in get_part_url(url) or []:
                print('category', part)
                # Iterate over all companies in the category
                for corp in get_url_of_corp(part) or []:
                    print('company link', corp)
                    parser_detail(get_html(corp), db)
                    time.sleep(1)  # throttle requests
        # Sample leaf-region check:
        # print(city_is_end('http://www.socom.cn/xinjiang/kelamayi/baijiantan/'))

    if __name__ == '__main__':
        main()
    
    # sql.py — despite the file name, storage is MongoDB (see the Notes section)
    from pymongo import MongoClient

    def init():
        client = MongoClient('localhost', 27017)
        return client.sqw

    def insert_detail(db, detail):
        # The original wrote to db.my_collection while init() referenced
        # db.corp_info; use corp_info consistently. insert() is deprecated;
        # insert_one() is the current pymongo API.
        db.corp_info.insert_one(detail)
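
    One possible refinement, not part of the original code: if a company page is crawled twice, insert_one() stores a duplicate record. A sketch of deduplicating by upserting on the company name instead:

    def insert_detail(db, detail):
        # Upsert keyed on the company name so re-crawls update in place
        db.corp_info.update_one({'公司名称': detail['公司名称']},
                                {'$set': detail}, upsert=True)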
    

    Output

    (Screenshot: the crawled records in the MongoDB database)
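
    A quick way to verify the result from Python (a sketch; assumes the sql.py above, which writes to the corp_info collection of the sqw database):

    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)
    # Count the stored company records and peek at one sample document
    print(client.sqw.corp_info.count_documents({}))
    print(client.sqw.corp_info.find_one())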
