# NOTE: The laws-and-regulations database website is quite unstable; sometimes it cannot be reached at all.
import requests
import chardet
import re
import time
import sys
from lxml import etree
import psycopg2
import hashlib
# Default request headers; an old IE User-Agent keeps the site from rejecting us.
headers = {
'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E)'
}
# Alternate User-Agent strings to swap in if the one above gets blocked:
# User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E)
# Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36
# Request a page and return its body decoded with a detected charset.
def get_url(url, headers, data):
    """Send a request to `url` and return the response body as text.

    NOTE(review): despite the name this issues a POST, identical to
    post_url — the target site's `.action` endpoints accept POSTed form
    data even for list/detail reads. Kept as POST so existing callers
    behave the same; confirm before switching to GET.

    url     -- target URL
    headers -- HTTP headers dict (usually the module-level `headers`)
    data    -- form payload; pass {} for a plain fetch
    """
    # timeout prevents the crawler from hanging forever on this flaky site.
    response = requests.post(url=url, headers=headers, data=data, timeout=30)
    # chardet can fail to detect (encoding is None); fall back to utf-8.
    detected = chardet.detect(response.content)
    encoding = detected['encoding'] or 'utf-8'
    return response.content.decode(encoding, 'ignore')
def post_url(url, headers, data):
    """POST `data` to `url` and return the response body as text.

    Mirrors get_url: detected-charset decoding with a utf-8 fallback and a
    request timeout so a dead server cannot stall the crawl.
    """
    response = requests.post(url=url, headers=headers, data=data, timeout=30)
    # chardet may report encoding=None on odd payloads; default to utf-8.
    detected = chardet.detect(response.content)
    encoding = detected['encoding'] or 'utf-8'
    return response.content.decode(encoding, 'ignore')
# Handle a javascript: "more" link by extracting the goMore() arguments.
def href_j(href):
    """Parse a javascript:goMore(...) link, build the list-query payload,
    fetch the first list page and hand it to spidder1.

    href -- the raw href attribute, expected to contain
            goMore('zlsxid','bmflid','zdjg','txtid', ...)

    Raises IndexError if `href` contains no goMore(...) call.
    """
    # Raw string: '\(' in a normal string is an invalid escape sequence.
    params = re.findall(r"goMore\('(.*?)','(.*?)','(.*?)','(.*?)'", href)
    zlsxid, bmflid, zdjg, txtid = params[0]
    data = {
        'SFYX': '有效',  # restrict to records still in force
        'zlsxid': zlsxid,
        'bmflid': bmflid,
        'zdjg': zdjg,
        'txtid': txtid,
        'resultSearch': 'false',
        'pagesize': '50',
    }
    # Fetch the first page of results for this category.
    url = 'http://law.npc.gov.cn/FLFG/getAllList.action?'
    response = get_url(url, headers=headers, data=data)
    return spidder1(response, data=data, i=1)
# Handle an ordinary (non-javascript) "more" link.
def href_n(href):
    """Fetch a regular category link and hand the page off to spidder1."""
    page = get_url(href, headers=headers, data={})
    return spidder1(page, data={}, i=1)
# Parse a second-level (list) page: scrape every linked detail page,
# store each record, and follow pagination.
def spidder1(res, data, i):
    """Scrape every detail page linked from one list page and recurse into
    the next page while a full page (50 rows) keeps coming back.

    res  -- HTML text of the current list page
    data -- form payload reused (and mutated: ispage/curPage) for the
            paginated POST requests
    i    -- current 1-based page number
    """
    listing = etree.HTML(res)
    # Detail links live in the first anchor of each row's second cell.
    href2s = listing.xpath('//table//tr/td[2]/a[1]/@href')
    for href2 in href2s:
        # time.sleep(3)
        # javascript:showLocation('flfgID','keyword','zlsxid') carries the
        # three query parameters of the detail page. Raw string: '\(' is an
        # invalid escape in a normal string literal.
        params2 = re.findall(r"showLocation\('(.*?)','(.*?)','(.*?)'", href2)
        param1, param2, param3 = params2[0]
        # Request the detail (third-level) page.
        url = "http://law.npc.gov.cn:80/FLFG/flfgByID.action?flfgID=" + param1 + "&keyword=" + param2 + "&zlsxid=" + param3
        response = get_url(url=url, headers=headers, data={})
        detail = etree.HTML(response)  # distinct name: don't shadow `listing`
        # Mandatory fields: a missing node raises IndexError so bad pages
        # fail loudly instead of storing garbage.
        attr = (detail.xpath('//*[@id="content"]/table//tr[1]/td[2]/text()'))[0].strip()      # document attribute
        orga = (detail.xpath('//*[@id="content"]/table//tr[2]/td[2]/text()'))[0].strip()      # issuing organ
        pub_date = (detail.xpath('//*[@id="content"]/table//tr[4]/td[2]/text()'))[0].strip()  # publication date
        put_date = (detail.xpath('//*[@id="content"]/table//tr[4]/td[4]/text()'))[0].strip()  # effective date
        content = '\n'.join(detail.xpath('//*[@id="content"]/div/div//text()'))               # full text
        title = (detail.xpath('//div[@class="bt"]//text()'))[0].strip()                       # title
        # Optional fields: absent or empty on some pages; default to '未知'
        # (unknown). Empty strings are normalized to the default too,
        # generalizing the check the original applied only to `num`.
        time_ok = _first_text(detail, '//*[@id="content"]/table//tr[5]/td[2]/text()')  # validity status
        clas = _first_text(detail, '//*[@id="content"]/table//tr[1]/td[4]/text()')     # department category
        num = _first_text(detail, '//*[@id="content"]/table//tr[3]/td[2]/text()')      # promulgation number
        print("#" * 300)
        item = {
            'attr': attr,
            'orga': orga,
            'pub_date': pub_date,
            'put_date': put_date,
            'content': content,
            'time_ok': time_ok,
            'clas': clas,
            'num': num,
            'title': title,
            'type': '司法解释及文件_检察院',
        }
        pipline(item)
    # A full page of 50 rows suggests a next page exists: POST for it.
    if len(href2s) == 50:
        i = i + 1
        url = 'http://law.npc.gov.cn/FLFG/getAllList.action'
        data['ispage'] = '1'
        data['curPage'] = i
        print(data)
        response = post_url(url, headers=headers, data=data)
        spidder1(response, data=data, i=i)

# Return the stripped first text node matched by `path`, or `default` when
# the node is missing or its text is empty.
def _first_text(html, path, default='未知'):
    try:
        value = html.xpath(path)[0].strip()
    except IndexError:  # narrow: only "no node found", not every exception
        return default
    return value if value else default
# Persist one scraped record.
def pipline(item):
    """Insert one scraped record into PostgreSQL.

    item -- dict with keys attr/orga/pub_date/put_date/content/time_ok/
            clas/num/title/type (built by spidder1)

    An MD5 of the content is stored in the `sign` column as a dedup
    signature. Insert errors are reported and rolled back; the connection
    and cursor are always closed (the original leaked one connection per
    record).
    """
    print(item)
    sign = hashlib.md5((item['content']).encode('utf-8')).hexdigest()
    print(str(sign))
    conn = psycopg2.connect(database='falv_wenku', user="postgres", password='123456',
                            host='127.0.0.1', port='5432')
    cur = conn.cursor()
    try:
        sql = 'INSERT INTO falv_fagui_xinxiku(attr, orga, pub_date, put_date, conten, time_ok, clas, num, sign, title,type1)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        cur.execute(sql, (item['attr'], item['orga'], item['pub_date'], item['put_date'], item['content'], item['time_ok'], item['clas'], item['num'], str(sign), item['title'], item['type']))
        conn.commit()
    except Exception as e:
        conn.rollback()  # leave the connection in a clean state
        print("数据库插入错误:", e)
    finally:
        cur.close()
        conn.close()
if __name__ == "__main__":
try:
url = 'http://law.npc.gov.cn/FLFG/index.jsp'
res1 = get_url(url, headers, data={})
# 拿到链接
href1s = re.findall('href="(.*?)">更多</a>', res1)
del href1s[2]
# print(len(href1s))
print(href1s)
href1s = href1s[12:13]
print(href1s)
for href1 in href1s:
if href1[0] == "j":
href_j(href1)
else:
href_n(href1)
except Exception as e:
print("错误", e)
# 网友评论 ("netizen comments" — stray text pasted at end of file, commented out so the module parses)