Scraping Qichacha

Author: NewForMe | Published 2019-05-24 09:42

Nothing fancy here: just a two-file scraper whose goal is to pull some company information off the Qichacha website.

The full source code follows.

Configuration file: config.py

MONGO_URL = 'localhost'   # local MongoDB connection
MONGO_DB = 'qichacha'     # database name
MONGO_TABLE = 'qichacha'  # collection name

KEYWORD = '广州合道'  # search keyword
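Before launching the spider it is worth confirming that these settings actually reach MongoDB. A minimal sanity check, assuming MongoDB is running locally on the default port (check_config.py is a hypothetical helper, not one of the two files in this post):

# check_config.py -- hypothetical helper, not part of the original two files
import pymongo
from pymongo.errors import ServerSelectionTimeoutError
from config import MONGO_URL, MONGO_DB

client = pymongo.MongoClient(MONGO_URL, serverSelectionTimeoutMS=3000)
try:
    client.server_info()  # raises if the server is unreachable
    print('MongoDB reachable, will write to database:', MONGO_DB)
except ServerSelectionTimeoutError as e:
    print('Cannot reach MongoDB:', e)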

Scraper code: spider.py

# -*- coding: utf-8 -*-
# @Time    : 2018/10/26 21:16
# @Author  : Xin
# @File    : spider_nologin.py
# @Software: PyCharm

import requests
from requests.exceptions import RequestException
from urllib.parse import urlencode
import json
import re
from pyquery import PyQuery as pq
from multiprocessing import Pool
from config import *
import pymongo

client = pymongo.MongoClient(MONGO_URL)  # connect to MongoDB
db = client[MONGO_DB]  # select the database (created lazily on first write)

# Request headers
headers = {
        "cookie":"QCCSESSID=ejvdgnsi2rddlb9pbaue9ooch4; UM_distinctid=166b0853ff3287-096d0c0c314aee-3c604504-1fa400-166b0853ff5131; zg_did=%7B%22did%22%3A%20%22166b08540b44ac-08922195bb52cf-3c604504-1fa400-166b08540b54c4%22%7D; _uab_collina=154055981732518461862276; acw_tc=0ed717a715405598380581012e9866f92d13a6a352f024efbe09a35d3d; CNZZDATA1254842228=639133721-1540559402-null%7C1540899954; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1540559815,1540734830,1540818836,1540902534; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1540904548; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201540902533286%2C%22updated%22%3A%201540904549411%2C%22info%22%3A%201540559814844%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%22f6d5e6bd81b4649daa269182ad60cf95%22%7D",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}

# Fetch a search index page
def get_page_index(page, keyword):
    data = {
        'key': keyword,
        'ajaxflag': 1,
        'p': page,
    }

    url = 'https://www.qichacha.com/search_index?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("Failed to request index page: {0}".format(url))
        return None

# Parse the index page
def parse_page_index(html):
    #print(html)
    # pattern = re.compile(
    #     '<tr>.*?href="(.*?)".*?>(.*?)</a>.*?href.*?>(.*?)</a>.*?>(.*?)</span>.*?>(.*?)</span>.*?<p.*?>(.*?)<span.*?>(.*?)</span>.*?<p.*?<em>(.*?)</em>(.*?)</p>.*?<span.*?>(.*?)</span.*?</tr>',
    #     re.S)  # capture every field on the index page
    pattern = re.compile('<tr>.*?href="(.*?)".*?>(.*?)</a>.*?</tr>', re.S)  # only capture the detail-page URL and company name
    result = re.findall(pattern, html)
    #print(result)
    for item in result:
        yield {
            'detail_url': item[0],
            'company': re.sub(r'<em>|</em>', '', item[1]),  # strip the <em> tags to get the plain company name, e.g. "<em>广州</em>侨<em>合</em>建设有限公司"
            # 'LegalRepresentative':item[2],
            # 'RegisteredCapital':item[3].strip()[5:],
            # 'CreatedTime':item[4].strip()[5:],
            # 'Email':item[6],
            # 'Phone':item[6],
            # 'Address':item[7]+item[8],
            # 'State':item[9]
        }

# Fetch a company detail page
def get_page_detail(company, detailurl):
    url = 'https://www.qichacha.com' + detailurl
    #print(url)
    print('Scraping: {0}, url: {1}'.format(company, url))
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            #print(200)
            return response.text
        return None
    except RequestException:
        print("Failed to request detail page for {0}, url: {1}".format(company, url))
        return None

# Parse the detail page
def parse_page_detail(html, detailurl):
    url = 'https://www.qichacha.com' + detailurl
    doc = pq(html)
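    # NOTE: the positional :nth-child selectors below are fragile; if Qichacha
    # changes its page layout, fields will silently shift into the wrong keys.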
    company = doc('.container.p-t > #company-top > div.row > div.content > div.row.title > h1').text()  # company name
    state = doc('.container.p-t >#company-top > div.row > div.content > div.row.title > span').text()  # operating status
    phone = doc('.container.p-t >#company-top > div.row > div.content > div:nth-child(2) > span.fc > span.cvlu > span').text()  # contact phone
    official_website = doc('.container.p-t >#company-top > div.row > div.content > div:nth-child(2) > span.cvlu > a:nth-child(1)').text()  # official website
    email = doc('.container.p-t >#company-top > div.row > div.content > div:nth-child(3) > span.fc > span.cvlu > a').text()  # email
    address = doc('.container.p-t >#company-top > div.row > div.content > div:nth-child(3) > span.cvlu > a:nth-child(1)').text()  # company address
    # introduction = doc('#textShowMore').text()  # introduction
    boss = doc('#Cominfo > table:nth-child(3) > tr:nth-child(2) > td.ma_left > div > div.clearfix > div:nth-child(2) > a.bname > h2').text()  # legal representative
    business_relations = doc('#Cominfo > table:nth-child(3) > tr:nth-child(2) > td:nth-child(2) > div.ba-table-base > a').attr('href')  # link to the corporate relationship graph
    registered_capital = doc('#Cominfo > table:nth-child(4) > tr:nth-child(1) > td:nth-child(2)').text()  # registered capital
    paid_capital = doc('#Cominfo > table:nth-child(4) > tr:nth-child(1) > td:nth-child(4)').text()  # paid-in capital
    create_date = doc('#Cominfo > table:nth-child(4) > tr:nth-child(2) > td:nth-child(4)').text()  # date of establishment
    credit_code = doc('#Cominfo > table:nth-child(4) > tr:nth-child(3) > td:nth-child(2)').text()  # unified social credit code
    registration_number = doc('#Cominfo > table:nth-child(4) > tr:nth-child(4) > td:nth-child(2)').text()  # registration number
    organization_code = doc('#Cominfo > table:nth-child(4) > tr:nth-child(4) > td:nth-child(4)').text()  # organization code
    company_type = doc('#Cominfo > table:nth-child(4) > tr:nth-child(5) > td:nth-child(2)').text()  # company type
    industry_involved = doc('#Cominfo > table:nth-child(4) > tr:nth-child(5) > td:nth-child(4)').text()  # industry
    approval_date = doc('#Cominfo > table:nth-child(4) > tr:nth-child(6) > td:nth-child(2)').text()  # approval date
    registration_authority = doc('#Cominfo > table:nth-child(4) > tr:nth-child(6) > td:nth-child(4)').text()  # registration authority
    area = doc('#Cominfo > table:nth-child(4) > tr:nth-child(7) > td:nth-child(2)').text()  # region
    english_name = doc('#Cominfo > table:nth-child(4) > tr:nth-child(7) > td:nth-child(4)').text()  # English name
    former_name = doc('#Cominfo > table:nth-child(4) > tr:nth-child(8) > td:nth-child(2)').text()  # former name
    insured_number = doc('#Cominfo > table:nth-child(4) > tr:nth-child(8) > td:nth-child(4)').text()  # number of insured employees
    staff_size = doc('#Cominfo > table:nth-child(4) > tr:nth-child(9) > td:nth-child(2)').text()  # staff size
    business_term = doc('#Cominfo > table:nth-child(4) > tr:nth-child(9) > td:nth-child(4)').text()  # business term
    business_scope = doc('#Cominfo > table:nth-child(4) > tr:nth-child(11) > td:nth-child(2)').text()  # business scope
    equity_through = doc('#guquanIframeTool > a:nth-child(1)').attr('href')  # link to the equity penetration chart
    result = {
        'url': url,
        'company': company, 'state': state,
        'phone': phone, 'official_website': official_website,
        'email': email, 'address': address,
        'boss': boss, 'business_relations': business_relations,
        'registered_capital': registered_capital, 'paid_capital': paid_capital,
        'create_date': create_date, 'credit_code': credit_code,
        'registration_number': registration_number, 'organization_code': organization_code,
        'company_type': company_type, 'industry_involved': industry_involved,
        'approval_date': approval_date, 'registration_authority': registration_authority,
        'area': area, 'english_name': english_name,
        'former_name': former_name, 'insured_number': insured_number,
        'staff_size': staff_size, 'business_term': business_term,
        'business_scope': business_scope, 'equity_through': equity_through,
    }
    return result

def write_to_file(company, result):
    # with open('{0}.txt'.format(company), 'a', encoding='utf-8') as f:
    #     f.write(json.dumps(result, ensure_ascii=False))
    with open('result.txt', 'a', encoding='utf-8') as f:
        print('{0} saved to result.txt'.format(company))
        f.write(json.dumps(result, ensure_ascii=False) + '\n')

def save_to_mongo(result):
    if db[MONGO_TABLE].insert_one(result):  # insert the record into MongoDB (insert() is deprecated in pymongo 3.x)
        print('Saved to MongoDB:', result['company'])
        return True
    return False

def main(i):
    html = get_page_index(i, KEYWORD)
    #print(html)
    if not html:  # skip this page if the index request failed
        return
    for item in parse_page_index(html):
        #print(item['company'], item['detail_url'])
        text = get_page_detail(item['company'], item['detail_url'])  # fetch the detail page
        if text:
            result = parse_page_detail(text, item['detail_url'])
            write_to_file(item['company'], result)
            save_to_mongo(result)

if __name__ == "__main__":
    #main(1)
    pool = Pool()
    pool.map(main, range(1, 11))  # scrape index pages 1-10 in parallel
    pool.close()
    pool.join()
    print('Scraping finished!')
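Because pool.map blocks until all ten index pages are processed, the final print only fires once the whole run is done, and the records are then sitting in MongoDB. A minimal sketch for inspecting them afterwards, reusing the same config (query_results.py is a hypothetical helper, not part of the original two files):

# query_results.py -- hypothetical helper for inspecting the scraped data
import pymongo
from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
collection = client[MONGO_DB][MONGO_TABLE]

print('Companies stored:', collection.count_documents({}))
# list the name and operating status of every stored company
for doc in collection.find({}, {'company': 1, 'state': 1, '_id': 0}):
    print(doc.get('company'), '-', doc.get('state'))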
