# 本文主要拿的是法律法规 (this script mainly scrapes laws and regulations from law-lib.com)
import requests
import re
from urllib.parse import quote
from lxml import etree
import chardet
import psycopg2
import time
import hashlib
from fake_useragent import UserAgent
ua = UserAgent()
# The standard HTTP header name is "User-Agent"; the original key "User_Agent"
# is not a recognized header, so the randomized UA was effectively never sent
# and requests fell back to its default User-Agent string.
headers = {
    'User-Agent': ua.random,
}
def get(url):
    """Fetch *url* and return the body decoded with a chardet-detected encoding.

    The site serves GBK-encoded pages that requests may mislabel, hence the
    byte-level sniffing. A 3-second pause after each request throttles the
    crawl.
    """
    print("开始请求界面")
    # timeout keeps the crawler from hanging forever on a stalled server;
    # the original call had no timeout at all.
    response = requests.get(url=url, headers=headers, timeout=30)
    time.sleep(3)  # crude politeness delay between requests
    detected = chardet.detect(response.content)
    # chardet returns {'encoding': None} for empty/undecidable bodies;
    # fall back to utf-8 so decode() never receives None.
    encoding = detected['encoding'] or 'utf-8'
    return response.content.decode(encoding, 'ignore')
def clear1(res, i):
    """Extract detail-page links from one listing page.

    Returns ``(hrefs, marker)`` where *marker* is the pager text node for
    page *i*. An empty *marker* list means page *i* does not exist, which
    the caller uses to stop paginating.
    """
    print('开始清洗第{}页'.format(i))
    tree = etree.HTML(res)
    hrefs = tree.xpath('//ul[@class="line2"]/li/a/@href')
    # Pager label such as "第3页"; absent when there is no page i.
    marker = tree.xpath('//span[contains(text(),"第{}页")]/text()'.format(i))
    print(hrefs)
    return hrefs, marker
def clear2(res, url):
    """Parse one law-detail page into the record fields stored by ``save``.

    When the document is split over two pages, follows the "下一页"
    (next page) link once and stitches the two bodies together.

    Returns a 9-tuple:
    (title, wen_hao, lose_time, pub_time, pub_ora, sour, content, url, sign)
    where *sign* is the md5 hex digest of *content* (dedup signature).

    Raises IndexError if a required metadata field (title, dates, issuer)
    is missing from the page, same as the original behavior.
    """
    html = etree.HTML(res)
    # Raw strings throughout: "\d", "\[", "\S" in plain literals are invalid
    # escape sequences and raise SyntaxWarning on modern Python.
    pub_time = re.findall(r'<li>【颁布时间】(\d{4}-\d{1,2}-\d{1,2})</li>', res)
    title = re.findall(r'<li>【标题】(.*?)</li>', res)
    wen_hao = re.findall(r'<li>【发文号】(.*?)</li>', res)
    lose_time = re.findall(r'<li>【失效时间】(.*?)</li>', res)
    pub_ora = re.findall(r'<li>【颁布单位】(.*?)</li>', res)
    sour = re.findall(r'【法规来源】(h.*?)<', res)
    # Skip the first 9 text nodes: they are page chrome, not document body.
    content = ''.join((html.xpath('//div[@class="viewcontent"]//text()'))[9:])
    content1 = re.sub(r'不分页显示 总共2页 1 \[2\] 下一页', '', content)
    content2 = ''
    next_link = html.xpath('//a[contains(text(),"下一页")]/@href')
    if next_link:
        print("进入下一页")
        url = 'http://www.law-lib.com/law/law_view.asp' + next_link[0]
        print(url)
        res2 = get(url)
        html2 = etree.HTML(res2)
        content3 = html2.xpath('//div[@class="viewcontent"]//text()')
        # Locate the first text node that looks like an article heading
        # ("第...条") so the repeated page header above it can be skipped.
        index1 = None
        for pos, data in enumerate(content3):
            if re.findall(r'第\S+条', data):
                index1 = pos
                print(index1)
                break
        # The original relied on a bare `except: pass` to swallow the
        # NameError when no heading matched; this explicit check is
        # equivalent but does not hide unrelated errors.
        if index1 is not None:
            content2 = ''.join(content3[index1:])
    content = content1 + content2
    content = re.sub(r'不分页显示 总共2页 \[1\] 2 上一页 ', '', content)
    print(url)
    print(title[0])
    print(pub_time[0])
    print(wen_hao[0])
    print(lose_time[0])
    print(pub_ora[0])
    print(url)
    if not sour:
        sour = ['']  # the source URL is optional on the site
    print(content)
    sign = hashlib.md5(content.encode('utf-8')).hexdigest()
    print(sign)
    return title[0], wen_hao[0], lose_time[0], pub_time[0], pub_ora[0], sour[0], content, url, sign
def save(data):
    """Insert one parsed law record into the fa_lv_lib table.

    *data* is the 9-tuple produced by ``clear2``. Database errors are
    reported but not re-raised so one bad record does not abort the crawl.
    """
    conn = psycopg2.connect(database='falv_wenku', user="postgres", password='123456',
                            host='127.0.0.1', port='5432')
    try:
        cur = conn.cursor()
        sql = "insert into fa_lv_lib(title, wen_hao, lose_time, pub_time, pub_ora, sour, conten, url, sign)values" \
              "(%s, %s, %s, %s, %s, %s,%s, %s, %s)"
        cur.execute(sql, (data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8]))
        conn.commit()
    except Exception as e:
        conn.rollback()  # leave the connection in a clean state after a failed insert
        print("数据库错误:", e)
    finally:
        # The original leaked a connection on every call; always close it.
        conn.close()
if __name__ == "__main__":
for key in ['最高人民法院', '最高人民检察院', '国务院', '国务院办公厅']:
i = 3
if key == '全国人民代表大会':
i = 49
while True:
i = i + 1
url = 'http://www.law-lib.com/law/lawml.asp?bbdw={}&pages={}'.format(quote(key.encode('GBk')), i)
# data = {}
res = get(url)
hrefs = clear1(res, i)
if hrefs[1] == []: # 如果没有下一页,结束循环
break
for href in hrefs[0]:
url = 'http://www.law-lib.com/law/'+href
# url = 'http://www.law-lib.com/law/law_view.asp?id=523891'
res = get(url)
data = clear2(res, url)
save(data)
# break
# 网友评论 (stray trailing text from the source page, kept as a comment so the module imports cleanly)