Nothing fancy here: a crawler made of just two files, whose purpose is to scrape some company information from the qichacha.com (企查查) website.
Full source code
Configuration file: config.py
MONGO_URL = 'localhost'   # MongoDB host (local connection)
MONGO_DB = 'qichacha'     # database name
MONGO_TABLE = 'qichacha'  # collection name
KEYWORD = '广州合道'       # search keyword
Scraper code: spider.py
# -*- coding: utf-8 -*-
# @Time : 2018/10/26 21:16
# @Author : Xin
# @File : spider_nologin.py
# @Software: PyCharm
import requests
from requests.exceptions import RequestException
from urllib.parse import urlencode
import json
import re
from pyquery import PyQuery as pq
from multiprocessing import Pool
from config import *
import pymongo
client = pymongo.MongoClient(MONGO_URL)  # connect to MongoDB
db = client[MONGO_DB]  # select (and lazily create) the database
# Request headers; the QCCSESSID cookie comes from a logged-in browser session
headers = {
    "cookie": "QCCSESSID=ejvdgnsi2rddlb9pbaue9ooch4; UM_distinctid=166b0853ff3287-096d0c0c314aee-3c604504-1fa400-166b0853ff5131; zg_did=%7B%22did%22%3A%20%22166b08540b44ac-08922195bb52cf-3c604504-1fa400-166b08540b54c4%22%7D; _uab_collina=154055981732518461862276; acw_tc=0ed717a715405598380581012e9866f92d13a6a352f024efbe09a35d3d; CNZZDATA1254842228=639133721-1540559402-null%7C1540899954; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1540559815,1540734830,1540818836,1540902534; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1540904548; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201540902533286%2C%22updated%22%3A%201540904549411%2C%22info%22%3A%201540559814844%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%22f6d5e6bd81b4649daa269182ad60cf95%22%7D",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}
# Fetch an index (search results) page
def get_page_index(page, keyword):
    data = {
        'key': keyword,
        'ajaxflag': 1,
        'p': page,
    }
    url = 'https://www.qichacha.com/search_index?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("Request for index page url {0} failed".format(url))
        return None
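As a sanity check of the URL being built: urlencode percent-encodes the keyword as UTF-8, so for the default KEYWORD and page 1 the query string comes out as follows.

>>> from urllib.parse import urlencode
>>> urlencode({'key': '广州合道', 'ajaxflag': 1, 'p': 1})
'key=%E5%B9%BF%E5%B7%9E%E5%90%88%E9%81%93&ajaxflag=1&p=1'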
# Parse an index page
def parse_page_index(html):
    # print(html)
    # pattern = re.compile(
    #     '<tr>.*?href="(.*?)".*?>(.*?)</a>.*?href.*?>(.*?)</a>.*?>(.*?)</span>.*?>(.*?)</span>.*?<p.*?>(.*?)<span.*?>(.*?)</span>.*?<p.*?<em>(.*?)</em>(.*?)</p>.*?<span.*?>(.*?)</span.*?</tr>',
    #     re.S)  # grabs every field on the index page
    pattern = re.compile('<tr>.*?href="(.*?)".*?>(.*?)</a>.*?</tr>', re.S)  # only the detail-page URL and the company name
    result = re.findall(pattern, html)
    # print(result)
    for item in result:
        yield {
            'detail_url': item[0],
            # the match arrives as e.g. "<em>广州</em>侨<em>合</em>建设有限公司";
            # strip the <em> highlight tags to leave the plain company name
            'company': re.sub(r'<em>|</em>', '', item[1]),
            # 'LegalRepresentative': item[2],
            # 'RegisteredCapital': item[3].strip()[5:],
            # 'CreatedTime': item[4].strip()[5:],
            # 'Email': item[6],
            # 'Phone': item[6],
            # 'Address': item[7] + item[8],
            # 'State': item[9]
        }
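A toy run of that pattern on a stripped-down result row (the HTML fragment below is invented for illustration, modeled on qichacha's markup):

import re

sample = ('<tr><td><a href="/firm_xxx.html">'
          '<em>广州</em>合<em>道</em>股份有限公司</a></td></tr>')
pattern = re.compile('<tr>.*?href="(.*?)".*?>(.*?)</a>.*?</tr>', re.S)
for detail_url, raw_name in re.findall(pattern, sample):
    print(detail_url)                           # /firm_xxx.html
    print(re.sub(r'<em>|</em>', '', raw_name))  # 广州合道股份有限公司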
# Fetch a company's detail page
def get_page_detail(company, detailurl):
    url = 'https://www.qichacha.com' + detailurl
    # print(url)
    print('Crawling {0}, url: {1}'.format(company, url))
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # print(200)
            return response.text
        return None
    except RequestException:
        print("Request for detail page of {0}, url {1} failed!".format(company, url))
        return None
# Parse a detail page
def parse_page_detail(html, detailurl):
    url = 'https://www.qichacha.com' + detailurl
    doc = pq(html)
    company = doc('.container.p-t > #company-top > div.row > div.content > div.row.title > h1').text()  # company name
    state = doc('.container.p-t > #company-top > div.row > div.content > div.row.title > span').text()  # operating status
    phone = doc('.container.p-t > #company-top > div.row > div.content > div:nth-child(2) > span.fc > span.cvlu > span').text()  # phone number
    official_website = doc('.container.p-t > #company-top > div.row > div.content > div:nth-child(2) > span.cvlu > a:nth-child(1)').text()  # official website
    email = doc('.container.p-t > #company-top > div.row > div.content > div:nth-child(3) > span.fc > span.cvlu > a').text()  # email
    address = doc('.container.p-t > #company-top > div.row > div.content > div:nth-child(3) > span.cvlu > a:nth-child(1)').text()  # company address
    # introduction = doc('#textShowMore').text()  # company profile
    boss = doc('#Cominfo > table:nth-child(3) > tr:nth-child(2) > td.ma_left > div > div.clearfix > div:nth-child(2) > a.bname > h2').text()  # legal representative
    business_relations = doc('#Cominfo > table:nth-child(3) > tr:nth-child(2) > td:nth-child(2) > div.ba-table-base > a').attr('href')  # link to the business-relationship graph
    registered_capital = doc('#Cominfo > table:nth-child(4) > tr:nth-child(1) > td:nth-child(2)').text()  # registered capital
    paid_capital = doc('#Cominfo > table:nth-child(4) > tr:nth-child(1) > td:nth-child(4)').text()  # paid-in capital
    create_date = doc('#Cominfo > table:nth-child(4) > tr:nth-child(2) > td:nth-child(4)').text()  # date founded
    credit_code = doc('#Cominfo > table:nth-child(4) > tr:nth-child(3) > td:nth-child(2)').text()  # unified social credit code
    registration_number = doc('#Cominfo > table:nth-child(4) > tr:nth-child(4) > td:nth-child(2)').text()  # registration number
    organization_code = doc('#Cominfo > table:nth-child(4) > tr:nth-child(4) > td:nth-child(4)').text()  # organization code
    company_type = doc('#Cominfo > table:nth-child(4) > tr:nth-child(5) > td:nth-child(2)').text()  # company type
    industry_involved = doc('#Cominfo > table:nth-child(4) > tr:nth-child(5) > td:nth-child(4)').text()  # industry
    approval_date = doc('#Cominfo > table:nth-child(4) > tr:nth-child(6) > td:nth-child(2)').text()  # approval date
    registration_authority = doc('#Cominfo > table:nth-child(4) > tr:nth-child(6) > td:nth-child(4)').text()  # registration authority
    area = doc('#Cominfo > table:nth-child(4) > tr:nth-child(7) > td:nth-child(2)').text()  # region
    english_name = doc('#Cominfo > table:nth-child(4) > tr:nth-child(7) > td:nth-child(4)').text()  # English name
    former_name = doc('#Cominfo > table:nth-child(4) > tr:nth-child(8) > td:nth-child(2)').text()  # former name
    insured_number = doc('#Cominfo > table:nth-child(4) > tr:nth-child(8) > td:nth-child(4)').text()  # number of insured employees
    staff_size = doc('#Cominfo > table:nth-child(4) > tr:nth-child(9) > td:nth-child(2)').text()  # staff size
    business_term = doc('#Cominfo > table:nth-child(4) > tr:nth-child(9) > td:nth-child(4)').text()  # business term
    business_scope = doc('#Cominfo > table:nth-child(4) > tr:nth-child(11) > td:nth-child(2)').text()  # business scope
    equity_through = doc('#guquanIframeTool > a:nth-child(1)').attr('href')  # link to the equity-penetration chart
    result = {
        'url': url,
        'company': company, 'state': state,
        'phone': phone, 'official_website': official_website,
        'email': email, 'address': address,
        'boss': boss, 'business_relations': business_relations,
        'registered_capital': registered_capital, 'paid_capital': paid_capital,
        'create_date': create_date, 'credit_code': credit_code,
        'registration_number': registration_number, 'organization_code': organization_code,
        'company_type': company_type, 'industry_involved': industry_involved,
        'approval_date': approval_date, 'registration_authority': registration_authority,
        'area': area, 'english_name': english_name,
        'former_name': former_name, 'insured_number': insured_number,
        'staff_size': staff_size, 'business_term': business_term,
        'business_scope': business_scope, 'equity_through': equity_through,
    }
    return result
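All of the :nth-child selectors above are tied to the exact row and column layout of the #Cominfo table, so a single reordered row on qichacha's side shifts every field below it. A more resilient sketch, assuming the table alternates label cells and value cells (which is how the page rendered at the time of writing):

def field_by_label(doc, label):
    """Look a value up by the text of the label cell next to it,
    instead of by position. Returns None if the label is absent."""
    tds = list(doc('#Cominfo td').items())
    for cell, value in zip(tds, tds[1:]):
        if cell.text().strip() == label:
            return value.text().strip()
    return None

# e.g. field_by_label(doc, '注册资本') in place of the
# 'table:nth-child(4) > tr:nth-child(1) > td:nth-child(2)' selector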
def write_to_file(company, result):
    # with open('{0}.txt'.format(company), 'a', encoding='utf-8') as f:
    #     f.write(json.dumps(result, ensure_ascii=False))
    with open('result.txt', 'a', encoding='utf8') as f:
        f.write(json.dumps(result, ensure_ascii=False) + '\n')
        print('{0} saved to file'.format(company))

def save_to_mongo(result):
    # insert_one replaces the insert() method deprecated in pymongo 3.x
    if db[MONGO_TABLE].insert_one(result):  # insert the record into MongoDB
        print('Saved to MongoDB:', result['company'])
        return True
    return False
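Because insert_one appends a new document on every run, re-crawling the same keyword duplicates records. One way to keep the collection idempotent (a sketch using pymongo's update_one with upsert, keyed on the detail-page URL):

def save_to_mongo(result):
    # Upsert keyed on the detail-page URL: update the existing document
    # if this company was crawled before, insert it otherwise.
    db[MONGO_TABLE].update_one(
        {'url': result['url']},
        {'$set': result},
        upsert=True)
    print('Saved to MongoDB:', result['company'])
    return True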
def main(i):
    html = get_page_index(i, KEYWORD)
    # print(html)
    if html is None:  # index request failed; skip this page
        return
    for item in parse_page_index(html):
        # print(item['company'], item['detail_url'])
        text = get_page_detail(item['company'], item['detail_url'])  # fetch the detail page
        if text:
            result = parse_page_detail(text, item['detail_url'])
            write_to_file(item['company'], result)
            save_to_mongo(result)

if __name__ == "__main__":
    # main(1)
    pool = Pool()
    pool.map(main, range(1, 11))  # index pages 1-10, crawled in parallel
    print('Crawl finished!')
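Running a full pool of workers flat-out against qichacha is a quick way to get the session cookie invalidated. A gentler variant of main() as a sketch; the 1-3 second interval is an arbitrary choice, not a tested threshold:

import random
import time

def main_throttled(page):
    html = get_page_index(page, KEYWORD)
    if html is None:
        return
    for item in parse_page_index(html):
        text = get_page_detail(item['company'], item['detail_url'])
        if text:
            result = parse_page_detail(text, item['detail_url'])
            write_to_file(item['company'], result)
            save_to_mongo(result)
        time.sleep(random.uniform(1, 3))  # polite delay between detail requests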