This post scrapes laws-and-regulations data from 北大法宝 (pkulaw.cn). Part of the data can only be viewed in full after logging in, and the site blocks IPs; no proxy is used here, so the amount of data that can be collected is limited (a proxy sketch is included after the script).
# encoding:utf-8
'''
project: 北大法宝 (pkulaw.cn)
author: zhoubobo
date: 2019-4-8
ver: 1.0
'''
import requests
import datetime
import time
import psycopg2
import re
from fake_useragent import UserAgent
import chardet
from lxml import etree
import hashlib
import csv
ua = UserAgent()
# Timestamps (currently unused)
time1 = str(time.time())[:10]
time2 = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# Headers for the first (POST) request; trimmed down to the fields that cannot be omitted
headers = {
'User-Agent': ua.random,
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '438',
'Content-Type': 'application/x-www-form-urlencoded',
'Host': 'www.pkulaw.cn',
'Referer': 'http://www.pkulaw.cn/cluster_form.aspx?Db=chl&menu_item=law&EncodingName=&clust_param=0/XA01&keyword=&range=name&',
}
def post_spidder(url, data):
'''
    :param url: URL to POST to
    :param data: form data for the request
    :return: the decoded page HTML
'''
try:
print("请求页面中")
ses = requests.Session()
res = ses.post(url=url, data=data, headers=headers, timeout=10)
encoding = chardet.detect(res.content)
html = res.content.decode(encoding['encoding'], 'ignore')
        print("Page received and decoded")
return html
except Exception as e:
print(e)
pass
def get_spidder(url):
'''
    :param url: URL for the GET request
    :return: the requests Response object
'''
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Cookie': '',
'Host': 'www.pkulaw.cn',
'Referer': 'http://www.pkulaw.cn/cluster_form.aspx?Db=chl&menu_item=law&EncodingName=&clust_param=0/XA01&keyword=&range=name&',
'Upgrade-Insecure-Requests': '1',
'User-Agent': ua.random
}
    # The CookieId is currently copied from the browser; it only works for a limited number of requests
# Cookie = 'CookieId=oerh2flilolgygwcckakt2sj;'
Cookie = 'CookieId=qxn5jr5kmplmlhxcln1ny3wh;'
headers['Cookie'] = Cookie
print(headers)
res = requests.get(url, headers=headers)
return res
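# Hypothetical alternative to the hard-coded CookieId above (an assumption, not
# verified against the site): let the server issue a fresh CookieId by visiting
# the home page with a Session first, then reuse that Session for the detail pages.
def get_fresh_session():
    ses = requests.Session()
    ses.headers.update({'User-Agent': ua.random})
    # First request: the server is expected to set the CookieId cookie here
    ses.get('http://www.pkulaw.cn/', timeout=10)
    return ses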
def clear_dada(res):
'''
    :param res: page HTML returned by the first (POST) request
    :return: hands all detail-page links on the page to save_content
'''
try:
print("开始清洗数据1")
html = etree.HTML(res)
hrefs = html.xpath('//a[@class="main-ljwenzi"]/@href')
return save_content(hrefs)
except Exception as e:
        # A CAPTCHA can appear during requests; catch the resulting parse failure here
print(e)
pass
def save_content(hrefs):
'''
    :param hrefs: links to the detail pages
    :return: parses each detail page and stores the cleaned record in the database
'''
for href in hrefs:
url = 'http://www.pkulaw.cn/' + href
print(url)
res = get_spidder(url)
# print(res.text)
time.sleep(5)
html = etree.HTML(res.text)
try:
print("................开始清洗数据2")
            title = html.xpath('//table[@id="tbl_content_main"]/tr[1]/td/span/strong/text()')[0]  # title
            # pub_dep = html.xpath('//table[@id="tbl_content_main"]/tr[2]/td[1]/a/text()')[0]  # issuing authority
            # pub_dep = re.findall('【发布部门】.*target="_blank">(.*?)</a>', res)[0]
            # post_name = html.xpath('//table[@id="tbl_content_main"]/tr[2]/td[2]/text()')[0]  # document number
            # rel_time = html.xpath('//table[@id="tbl_content_main"]/tr[3]/td[1]/text()')[0]  # release date
            # pub_time = html.xpath('//table[@id="tbl_content_main"]/tr[3]/td[2]/text()')[0]  # effective date
            # time_line = html.xpath('//table[@id="tbl_content_main"]/tr[4]/td[1]/a/text()')[0]  # validity status
            # eff_lev = html.xpath('//table[@id="tbl_content_main"]/tr[4]/td[2]/a/text()')[0]  # level of authority
            # reg_cat = html.xpath('//table[@id="tbl_content_main"]/tr[5]/td/a/text()')[0]  # regulation category
            # issuing authority (【发布部门】)
            pub_dep = re.findall('【发布部门】.*?target=_blank>(.*?)</a>', res.text, re.S)
            # document number (【发文字号】)
            post_name = re.findall('【发文字号】.*?([\u4e00-\u9fa5a-zA-Z0-9-]+)</td>', res.text, re.S)
            post_name = ['无'] if post_name == [] else post_name  # '无' = no document number on record
            # release date (【发布日期】)
            rel_time = re.findall('【发布日期】.*?(\d{4}.\d{1,2}.\d{1,2})', res.text, re.S)
            # effective date (【实施日期】)
            pub_time = re.findall('【实施日期】.*?(\d{4}.\d{1,2}.\d{1,2})', res.text, re.S)
            # validity status (【时效性】)
            time_line = re.findall('【时效性】.*?target=_blank>(.*?)</a>', res.text, re.S)
            # level of authority (【效力级别】)
            eff_lev = re.findall('【效力级别】.*?target=_blank>(.*?)</a>', res.text, re.S)
            # regulation category (【法规类别】)
            reg_cat = re.findall('【法规类别】.*?target=_blank>(.*?)</a>', res.text, re.S)
            content = html.xpath('//div[@id="div_content"]//text()')  # body text
            content = content[5:]  # drop the header fields repeated at the top of the body
            content = ''.join(content)
            content = re.sub(r'法宝联想', '', content)  # strip the "法宝联想" cross-reference marker
            sign = hashlib.md5((content).encode('utf-8')).hexdigest()  # md5 fingerprint, used as a dedup key
print(title)
print(pub_dep)
print(post_name)
print(rel_time)
print(pub_time)
print(time_line)
print(eff_lev)
print(reg_cat)
print(content)
print(sign)
        except Exception as e:
            # A CAPTCHA page or a changed layout makes the fields above unavailable;
            # log the failing URL for a later retry and skip this record
            print(e)
            with open('E://1.txt', 'a') as f:
                f.write(url + '\n')
            continue
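        # The table below is assumed to already exist; a minimal schema matching the
        # INSERT statement (all text columns) could look like:
        #   CREATE TABLE beida_fabao_1 (
        #       title text, pub_dep text, post_name text, rel_time text, pub_time text,
        #       time_line text, eff_lev text, reg_cat text, content1 text, sign text
        #   );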
        conn = psycopg2.connect(database='falv_wenku', user="postgres", password='123456',
                                host='127.0.0.1', port='5432')
        try:
            cur = conn.cursor()
            sql = "insert into beida_fabao_1(title, pub_dep, post_name, rel_time, pub_time, time_line, eff_lev, reg_cat, " \
                  "content1, sign)values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            cur.execute(sql, (title, pub_dep[0], post_name[0], rel_time[0], pub_time[0], time_line[0], eff_lev[0],
                              reg_cat[0], content, sign))
            conn.commit()
        except Exception as e:
            print("Database error:", e)
        finally:
            conn.close()  # one connection per record is opened above, so release it here
if __name__ == '__main__':
    # The search endpoint, found by inspecting the site's AJAX requests
url = 'http://www.pkulaw.cn/doSearch.ashx'
    # Form data for the first request, rebuilt for every result page
for i in range(0, 69):
data = {
            'Db': 'chl',  # the laws & regulations database
'clusterwhere': '%25e6%2595%2588%25e5%258a%259b%25e7%25ba%25a7%25e5%2588%25ab%253dXA0101',
'clust_db': 'chl',
'Search_Mode': '',
'range': 'name',
            'aim_page': i,  # current result page
            'page_count': 24,  # total number of pages
}
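        # The doubly URL-encoded 'clusterwhere' value above decodes (e.g. with two
        # passes of urllib.parse.unquote) to '效力级别=XA0101', i.e. the effect-level
        # filter captured from the browser's AJAX request.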
        # POST the search form and get the result-list page back
res = post_spidder(url, data)
        # Extract the detail-page links from the list page and scrape each one
        hrefs = clear_dada(res)
        # Delay before turning to the next page
time.sleep(20)
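As noted at the top, pkulaw.cn bans IPs and this script runs without a proxy. A minimal sketch of how proxies could be wired into the existing requests calls is shown below; the proxy address is a placeholder, not a real endpoint.

# Hypothetical proxy support (not part of the script above): build a proxies dict
# and pass it to every requests call, e.g. inside post_spidder / get_spidder.
proxies = {
    'http': 'http://127.0.0.1:8888',   # placeholder address; replace with a real proxy
    'https': 'http://127.0.0.1:8888',
}
# res = ses.post(url=url, data=data, headers=headers, proxies=proxies, timeout=10)
# res = requests.get(url, headers=headers, proxies=proxies, timeout=10)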