Scraping Multiple Pages from a Static Site

Author: 王小鱼鱻 | Published 2017-07-11 01:13 · 123 reads

    Target site: a business yellow-pages directory (企业黄页)


    Here I'm scraping the companies of one province, following the site's own
    hierarchy: province → city → category → company. The method isn't tied to this site;
    you just walk down that hierarchy level by level, collecting the parameters you need at each step. It isn't hard, but there is a lot of data and the crawl is slow. I don't really understand multiprocessing or multithreading yet, so it stays very slow. If anyone has suggestions, ideally an optimal approach, please share them (a rough sketch of one approach follows the code below).
    Enough talk; here's the code, and any pointers from more experienced readers are welcome:

    import requests
    from lxml import etree
    import re
    import math
    import pymongo
    import multiprocessing
    
    
    # Connect to MongoDB
    conn = pymongo.MongoClient(host='localhost', port=27017)
    # Select (or create) the database
    company = conn['company']
    # Select (or create) the collection
    newsdata = company['infos']
    
    # Note: the class inherits from multiprocessing.Process, but run() is
    # never overridden and start() is never called, so the crawl below
    # still runs in a single process.
    class Company(multiprocessing.Process):

        def __init__(self, interval):
            multiprocessing.Process.__init__(self)
            self.interval = interval
    
        # Get all the cities in a province
        def get_city(self, province_url):
            global city_name
            html = requests.get(province_url).content
            selector = etree.HTML(html)
            city_infos = selector.xpath('//div[@class="contentBox"]')[0]
            city_names = city_infos.xpath('div[@class="cityBox"]/a/text()')
            city_halfurls = city_infos.xpath('div[@class="cityBox"]/a/@href')
            for city_name,city_halfurl in zip(city_names,city_halfurls):
                city_url = "http://www.socom.cn" + city_halfurl
                print(city_name)
                print(city_url)
                self.get_item(city_url)
    
        # Get the company categories in a city
        def get_item(self,city_url):
            global item_name
            html = requests.get(city_url).content
            selector = etree.HTML(html)
            city_infos = selector.xpath('//div[@class="contentBox"]')[1]
            item_names = city_infos.xpath('div[@class="provinceBox"]/a/text()')
            item_halfurls = city_infos.xpath('div[@class="provinceBox"]/a/@href')
            for item_name,item_halfurl in zip(item_names,item_halfurls):
                item_url = "http://www.socom.cn" + item_halfurl
                print(item_name)
                print(item_url)
                self.get_page(item_url)
    
        # Get the number of listing pages in each category
        def get_page(self, item_url):
            global company_page
            html = requests.get(item_url).content
            selector = etree.HTML(html)
            page_infos = selector.xpath('//div[@class="contentBox"]')[0]
            page_halfurl = page_infos.xpath('div[@class="description"]/a/@href')[0]
            txt = page_infos.xpath('div[@class="description"]/text()')[0]
            # Extract the total number of companies in this category
            company_count = re.findall(r'\d+', txt)[0]
            # Python's math.ceil() rounds up: 50 companies per listing page
            pages = math.ceil(int(company_count) / 50)
            for page in range(1, pages + 1):
                company_page = []
                page_url = "http://www.socom.cn" + page_halfurl + "&name=&cp=%s" % page
                self.get_company(page_url)
                # Bulk-insert this page's records; insert_many() raises
                # on an empty list, so guard against empty pages
                if company_page:
                    newsdata.insert_many(company_page)
    
        # Get each company's URL on a listing page
        def get_company(self, page_url):
            html = requests.get(page_url).content
            selector = etree.HTML(html)
            company_infos = selector.xpath('//div[@class="contentBox"]')[3]
            company_halfurls = company_infos.xpath('div[@class="cityBox"]/a/@href')
            for company_halfurl in company_halfurls:
                company_url = "http://www.socom.cn" + company_halfurl
                print(company_url)
                self.company_detail(company_url)
    
        # Get the details of one company
        def company_detail(self, company_url):
            html = requests.get(company_url).text
            selector = etree.HTML(html)
            company_one = selector.xpath('//div[@class="contentBox"]')[1]
            company_name = company_one.xpath('div[@class="provinceBox"]/text()')[0]
            print(company_name)
            company_datas = company_one.xpath('div[@class="cityBox"]/div/text()')
            location = company_datas[0]
            phone = company_datas[1].strip()
            fax = company_datas[2].strip()
            mobile_phone = company_datas[3].strip()
            email = company_datas[5].strip()
            contact = company_datas[6].strip()
            person = company_datas[7].strip()
            capital = company_datas[8].strip()
            company_type = company_datas[9].strip()
            product = company_datas[10].strip()
            introduction = company_datas[11].strip()
            company_record = {
                'province': '广东',
                'city': city_name,
                'item': item_name,
                'company': company_name,
                'location': location[4:],
                'phone': phone[4:],
                'fax': fax[4:],
                'mobile_phone': mobile_phone[3:],
                'web': company_url,
                'email': email[3:],
                'contact': contact[4:],
                'person': person[5:],
                'capital': capital[5:],
                'type': company_type[5:],
                'product': product[5:],
                'introduction': introduction[5:]
            }
            company_page.append(company_record)
    
    if __name__ == "__main__":
        # URL of the province to scrape
        province_url = "http://www.socom.cn/guangdong/"
        p = Company(4)
        p.get_city(province_url)
    
    
    
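    As written, the Company class inherits from multiprocessing.Process but never overrides run() or calls start(), so everything still runs in one process. Below is a minimal sketch of how that Process pattern could be completed: each worker gets a slice of the city URLs and crawls them independently. The get_city_urls helper, the CityWorker name, and the 4-process count are illustrative assumptions, not part of the original script.

    import multiprocessing

    import requests
    from lxml import etree


    def get_city_urls(province_url):
        # Same city-list extraction as get_city() above
        html = requests.get(province_url).content
        selector = etree.HTML(html)
        box = selector.xpath('//div[@class="contentBox"]')[0]
        return ["http://www.socom.cn" + href
                for href in box.xpath('div[@class="cityBox"]/a/@href')]


    class CityWorker(multiprocessing.Process):
        def __init__(self, city_urls):
            multiprocessing.Process.__init__(self)
            self.city_urls = city_urls

        def run(self):
            # run() is what start() executes in the child process;
            # each worker crawls only its own slice of the cities
            for city_url in self.city_urls:
                print(city_url)  # placeholder for the real per-city crawl


    if __name__ == "__main__":
        urls = get_city_urls("http://www.socom.cn/guangdong/")
        n = 4  # number of worker processes (an arbitrary choice)
        workers = [CityWorker(urls[i::n]) for i in range(n)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()

    One caveat when moving to processes: a pymongo MongoClient should not be shared across forked processes, so each worker would need to open its own connection inside run().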

    The scraped results:


    Summary:
    1. There is quite a lot of data, so the crawl is a bit slow; scrapy or multithreading are worth trying (a minimal threading sketch follows).
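
    On the multithreading route: the crawl is I/O-bound (mostly waiting on HTTP responses), so the standard library's ThreadPoolExecutor is a simple fit. A minimal sketch, assuming a list of company detail URLs like those collected by get_company() above; the fetch_detail name and the 8-thread count are illustrative:

    from concurrent.futures import ThreadPoolExecutor

    import requests
    from lxml import etree


    def fetch_detail(company_url):
        # Same per-company request as company_detail() above
        html = requests.get(company_url).text
        selector = etree.HTML(html)
        box = selector.xpath('//div[@class="contentBox"]')[1]
        return box.xpath('div[@class="provinceBox"]/text()')[0]


    company_urls = [
        # the detail URLs collected by get_company() go here
    ]

    with ThreadPoolExecutor(max_workers=8) as pool:
        # map() fans the requests out across the thread pool
        for company_name in pool.map(fetch_detail, company_urls):
            print(company_name)

    Unlike with separate processes, a single MongoClient is thread-safe, so the existing insert calls could be shared across the pool.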
