Nothing fancy here: a crawler made of just two files, whose purpose is to scrape some company information from the qichacha.com (企查查) website.
Full source code
Configuration file: config.py
MONGO_URL = 'localhost'   # MongoDB host (local connection)
MONGO_DB = 'qichacha'     # database name
MONGO_TABLE = 'qichacha'  # collection name
KEYWORD = '广州合道'       # search keyword
Scraper code: spider.py
# -*- coding: utf-8 -*-
# @Time : 2018/10/26 21:16
# @Author : Xin
# @File : spider_nologin.py
# @Software: PyCharm
import requests
from requests.exceptions import RequestException
from urllib.parse import urlencode
import json
import re
from pyquery import PyQuery as pq
from multiprocessing import Pool
from config import *
import pymongo
client = pymongo.MongoClient(MONGO_URL)  # connect to MongoDB
db = client[MONGO_DB]  # select (and lazily create) the database
# Request headers; the QCCSESSID cookie comes from a logged-in browser session
headers = {
    "cookie": "QCCSESSID=ejvdgnsi2rddlb9pbaue9ooch4; UM_distinctid=166b0853ff3287-096d0c0c314aee-3c604504-1fa400-166b0853ff5131; zg_did=%7B%22did%22%3A%20%22166b08540b44ac-08922195bb52cf-3c604504-1fa400-166b08540b54c4%22%7D; _uab_collina=154055981732518461862276; acw_tc=0ed717a715405598380581012e9866f92d13a6a352f024efbe09a35d3d; CNZZDATA1254842228=639133721-1540559402-null%7C1540899954; hasShow=1; Hm_lvt_3456bee468c83cc63fb5147f119f1075=1540559815,1540734830,1540818836,1540902534; Hm_lpvt_3456bee468c83cc63fb5147f119f1075=1540904548; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201540902533286%2C%22updated%22%3A%201540904549411%2C%22info%22%3A%201540559814844%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%22f6d5e6bd81b4649daa269182ad60cf95%22%7D",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}
# Fetch an index (search results) page
def get_page_index(page, keyword):
    data = {
        'key': keyword,
        'ajaxflag': 1,
        'p': page,
    }
    url = 'https://www.qichacha.com/search_index?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print("Request for index page url {0} failed".format(url))
        return None
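As a sanity check of the URL being built: urlencode percent-encodes the keyword as UTF-8, so for the default KEYWORD and page 1 the query string comes out as follows.

>>> from urllib.parse import urlencode
>>> urlencode({'key': '广州合道', 'ajaxflag': 1, 'p': 1})
'key=%E5%B9%BF%E5%B7%9E%E5%90%88%E9%81%93&ajaxflag=1&p=1'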
# Parse an index page
def parse_page_index(html):
    # print(html)
    # pattern = re.compile(
    #     '<tr>.*?href="(.*?)".*?>(.*?)</a>.*?href.*?>(.*?)</a>.*?>(.*?)</span>.*?>(.*?)</span>.*?<p.*?>(.*?)<span.*?>(.*?)</span>.*?<p.*?<em>(.*?)</em>(.*?)</p>.*?<span.*?>(.*?)</span.*?</tr>',
    #     re.S)  # grabs every field on the index page
    pattern = re.compile('<tr>.*?href="(.*?)".*?>(.*?)</a>.*?</tr>', re.S)  # only the detail-page URL and the company name
    result = re.findall(pattern, html)
    # print(result)
    for item in result:
        yield {
            'detail_url': item[0],
            # the match arrives as e.g. "<em>广州</em>侨<em>合</em>建设有限公司";
            # strip the <em> highlight tags to leave the plain company name
            'company': re.sub(r'<em>|</em>', '', item[1]),
            # 'LegalRepresentative': item[2],
            # 'RegisteredCapital': item[3].strip()[5:],
            # 'CreatedTime': item[4].strip()[5:],
            # 'Email': item[6],
            # 'Phone': item[6],
            # 'Address': item[7] + item[8],
            # 'State': item[9]
        }
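A toy run of that pattern on a stripped-down result row (the HTML fragment below is invented for illustration, modeled on qichacha's markup):

import re

sample = ('<tr><td><a href="/firm_xxx.html">'
          '<em>广州</em>合<em>道</em>股份有限公司</a></td></tr>')
pattern = re.compile('<tr>.*?href="(.*?)".*?>(.*?)</a>.*?</tr>', re.S)
for detail_url, raw_name in re.findall(pattern, sample):
    print(detail_url)                           # /firm_xxx.html
    print(re.sub(r'<em>|</em>', '', raw_name))  # 广州合道股份有限公司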
# Fetch a company's detail page
def get_page_detail(company, detailurl):
    url = 'https://www.qichacha.com' + detailurl
    # print(url)
    print('Crawling {0}, url: {1}'.format(company, url))
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # print(200)
            return response.text
        return None
    except RequestException:
        print("Request for detail page of {0}, url {1} failed!".format(company, url))
        return None
# Parse a detail page
def parse_page_detail(html, detailurl):
    url = 'https://www.qichacha.com' + detailurl
    doc = pq(html)
    company = doc('.container.p-t > #company-top > div.row > div.content > div.row.title > h1').text()  # company name
    state = doc('.container.p-t > #company-top > div.row > div.content > div.row.title > span').text()  # operating status
    phone = doc('.container.p-t > #company-top > div.row > div.content > div:nth-child(2) > span.fc > span.cvlu > span').text()  # phone number
    official_website = doc('.container.p-t > #company-top > div.row > div.content > div:nth-child(2) > span.cvlu > a:nth-child(1)').text()  # official website
    email = doc('.container.p-t > #company-top > div.row > div.content > div:nth-child(3) > span.fc > span.cvlu > a').text()  # email
    address = doc('.container.p-t > #company-top > div.row > div.content > div:nth-child(3) > span.cvlu > a:nth-child(1)').text()  # company address
    # introduction = doc('#textShowMore').text()  # company profile
    boss = doc('#Cominfo > table:nth-child(3) > tr:nth-child(2) > td.ma_left > div > div.clearfix > div:nth-child(2) > a.bname > h2').text()  # legal representative
    business_relations = doc('#Cominfo > table:nth-child(3) > tr:nth-child(2) > td:nth-child(2) > div.ba-table-base > a').attr('href')  # link to the business-relationship graph
    registered_capital = doc('#Cominfo > table:nth-child(4) > tr:nth-child(1) > td:nth-child(2)').text()  # registered capital
    paid_capital = doc('#Cominfo > table:nth-child(4) > tr:nth-child(1) > td:nth-child(4)').text()  # paid-in capital
    create_date = doc('#Cominfo > table:nth-child(4) > tr:nth-child(2) > td:nth-child(4)').text()  # date founded
    credit_code = doc('#Cominfo > table:nth-child(4) > tr:nth-child(3) > td:nth-child(2)').text()  # unified social credit code
    registration_number = doc('#Cominfo > table:nth-child(4) > tr:nth-child(4) > td:nth-child(2)').text()  # registration number
    organization_code = doc('#Cominfo > table:nth-child(4) > tr:nth-child(4) > td:nth-child(4)').text()  # organization code
    company_type = doc('#Cominfo > table:nth-child(4) > tr:nth-child(5) > td:nth-child(2)').text()  # company type
    industry_involved = doc('#Cominfo > table:nth-child(4) > tr:nth-child(5) > td:nth-child(4)').text()  # industry
    approval_date = doc('#Cominfo > table:nth-child(4) > tr:nth-child(6) > td:nth-child(2)').text()  # approval date
    registration_authority = doc('#Cominfo > table:nth-child(4) > tr:nth-child(6) > td:nth-child(4)').text()  # registration authority
    area = doc('#Cominfo > table:nth-child(4) > tr:nth-child(7) > td:nth-child(2)').text()  # region
    english_name = doc('#Cominfo > table:nth-child(4) > tr:nth-child(7) > td:nth-child(4)').text()  # English name
    former_name = doc('#Cominfo > table:nth-child(4) > tr:nth-child(8) > td:nth-child(2)').text()  # former name
    insured_number = doc('#Cominfo > table:nth-child(4) > tr:nth-child(8) > td:nth-child(4)').text()  # number of insured employees
    staff_size = doc('#Cominfo > table:nth-child(4) > tr:nth-child(9) > td:nth-child(2)').text()  # staff size
    business_term = doc('#Cominfo > table:nth-child(4) > tr:nth-child(9) > td:nth-child(4)').text()  # business term
    business_scope = doc('#Cominfo > table:nth-child(4) > tr:nth-child(11) > td:nth-child(2)').text()  # business scope
    equity_through = doc('#guquanIframeTool > a:nth-child(1)').attr('href')  # link to the equity-penetration chart
    result = {
        'url': url,
        'company': company, 'state': state,
        'phone': phone, 'official_website': official_website,
        'email': email, 'address': address,
        'boss': boss, 'business_relations': business_relations,
        'registered_capital': registered_capital, 'paid_capital': paid_capital,
        'create_date': create_date, 'credit_code': credit_code,
        'registration_number': registration_number, 'organization_code': organization_code,
        'company_type': company_type, 'industry_involved': industry_involved,
        'approval_date': approval_date, 'registration_authority': registration_authority,
        'area': area, 'english_name': english_name,
        'former_name': former_name, 'insured_number': insured_number,
        'staff_size': staff_size, 'business_term': business_term,
        'business_scope': business_scope, 'equity_through': equity_through,
    }
    return result
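All of the :nth-child selectors above are tied to the exact row and column layout of the #Cominfo table, so a single reordered row on qichacha's side shifts every field below it. A more resilient sketch, assuming the table alternates label cells and value cells (which is how the page rendered at the time of writing):

def field_by_label(doc, label):
    """Look a value up by the text of the label cell next to it,
    instead of by position. Returns None if the label is absent."""
    tds = list(doc('#Cominfo td').items())
    for cell, value in zip(tds, tds[1:]):
        if cell.text().strip() == label:
            return value.text().strip()
    return None

# e.g. field_by_label(doc, '注册资本') in place of the
# 'table:nth-child(4) > tr:nth-child(1) > td:nth-child(2)' selector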
def write_to_file(company, result):
    # with open('{0}.txt'.format(company), 'a', encoding='utf-8') as f:
    #     f.write(json.dumps(result, ensure_ascii=False))
    with open('result.txt', 'a', encoding='utf8') as f:
        f.write(json.dumps(result, ensure_ascii=False) + '\n')
        print('{0} saved to file'.format(company))

def save_to_mongo(result):
    # insert_one replaces the insert() method deprecated in pymongo 3.x
    if db[MONGO_TABLE].insert_one(result):  # insert the record into MongoDB
        print('Saved to MongoDB:', result['company'])
        return True
    return False
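Because insert_one appends a new document on every run, re-crawling the same keyword duplicates records. One way to keep the collection idempotent (a sketch using pymongo's update_one with upsert, keyed on the detail-page URL):

def save_to_mongo(result):
    # Upsert keyed on the detail-page URL: update the existing document
    # if this company was crawled before, insert it otherwise.
    db[MONGO_TABLE].update_one(
        {'url': result['url']},
        {'$set': result},
        upsert=True)
    print('Saved to MongoDB:', result['company'])
    return True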
def main(i):
    html = get_page_index(i, KEYWORD)
    # print(html)
    if html is None:  # index request failed; skip this page
        return
    for item in parse_page_index(html):
        # print(item['company'], item['detail_url'])
        text = get_page_detail(item['company'], item['detail_url'])  # fetch the detail page
        if text:
            result = parse_page_detail(text, item['detail_url'])
            write_to_file(item['company'], result)
            save_to_mongo(result)

if __name__ == "__main__":
    # main(1)
    pool = Pool()
    pool.map(main, range(1, 11))  # index pages 1-10, crawled in parallel
    print('Crawl finished!')
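Running a full pool of workers flat-out against qichacha is a quick way to get the session cookie invalidated. A gentler variant of main() as a sketch; the 1-3 second interval is an arbitrary choice, not a tested threshold:

import random
import time

def main_throttled(page):
    html = get_page_index(page, KEYWORD)
    if html is None:
        return
    for item in parse_page_index(html):
        text = get_page_detail(item['company'], item['detail_url'])
        if text:
            result = parse_page_detail(text, item['detail_url'])
            write_to_file(item['company'], result)
            save_to_mongo(result)
        time.sleep(random.uniform(1, 3))  # polite delay between detail requests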