Web Scraping in Practice (2): Scraping the Laws and Regulations Database (法律法规信息库)

Author: 周周周__ | Published 2019-05-22 17:27

The Laws and Regulations Database site is rather unstable and is sometimes unreachable, so failed requests are to be expected.
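Because the site drops connections from time to time, it can help to build a session that retries failed requests automatically. Below is a minimal sketch using requests with urllib3's Retry; the retry count, backoff factor, and status list are arbitrary choices, not part of the original script:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # Retry up to 3 times on connection errors and transient 5xx responses,
    # sleeping roughly 1s / 2s / 4s between attempts
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retry))
    return session

A session built this way can stand in for the bare requests.get / requests.post calls in the helpers below.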

import requests
import chardet
import re
import time
from lxml import etree
import psycopg2
import hashlib

headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E)'
}

# Alternative User-Agent strings kept for reference:
#   Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E)
#   Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36

# Fetch a page with GET and decode it using the detected charset
def get_url(url, headers, data):
    response = requests.get(url=url, headers=headers, params=data, timeout=30)
    encoding = chardet.detect(response.content)
    res = response.content.decode(encoding['encoding'], 'ignore')
    return res


# Fetch a page with POST (form data) and decode it the same way
def post_url(url, headers, data):
    response = requests.post(url=url, headers=headers, data=data, timeout=30)
    encoding = chardet.detect(response.content)
    res = response.content.decode(encoding['encoding'], 'ignore')
    return res


# Extract the query parameters from a javascript:goMore(...) link
def href_j(href):
    params = re.findall(r"goMore\('(.*?)','(.*?)','(.*?)','(.*?)'", href)
    zlsxid = params[0][0]
    bmflid = params[0][1]
    zdjg = params[0][2]
    txtid = params[0][3]

    data = {
        'SFYX': '有效',  # effectiveness filter: '有效' = currently in force
        'zlsxid': zlsxid,
        'bmflid': bmflid,
        'zdjg': zdjg,
        'txtid': txtid,
        'resultSearch': 'false',
        'pagesize': '50'  # page size; the pagination check below relies on this value
    }
    # Request the list (second-level) page for this category
    url = 'http://law.npc.gov.cn/FLFG/getAllList.action?'
    response = post_url(url, headers=headers, data=data)
    return spidder1(response, data=data, i=1)
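
# For reference, the javascript: links on the index page look roughly like the
# following (hypothetical values -- the real IDs come from the page markup):
#
#   href = "javascript:goMore('02','0201','最高人民检察院','01')"
#   re.findall(r"goMore\('(.*?)','(.*?)','(.*?)','(.*?)'", href)
#   # -> [('02', '0201', '最高人民检察院', '01')]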


# Follow an ordinary (non-javascript) link to its list page
def href_n(href):
    response = get_url(href, headers=headers, data={})
    return spidder1(response, data={}, i=1)


# Parse a list page: scrape every detail page it links to, then page onward
def spidder1(res, data, i):
    html = etree.HTML(res)
    href2s = html.xpath('//table//tr/td[2]/a[1]/@href')

    for href2 in href2s:
        # time.sleep(3)  # optional politeness delay
        params2 = re.findall(r"showLocation\('(.*?)','(.*?)','(.*?)'", href2)
        param1 = params2[0][0]  # flfgID
        param2 = params2[0][1]  # keyword
        param3 = params2[0][2]  # zlsxid

        # Request the detail (third-level) page
        url = "http://law.npc.gov.cn:80/FLFG/flfgByID.action?flfgID=" + param1 + "&keyword=" + param2 + "&zlsxid=" + param3
        response = get_url(url=url, headers=headers, data={})
        html = etree.HTML(response)

        # Extract the fields that are always present
        attr = (html.xpath('//*[@id="content"]/table//tr[1]/td[2]/text()'))[0].strip()  # document attribute
        orga = (html.xpath('//*[@id="content"]/table//tr[2]/td[2]/text()'))[0].strip()  # enacting authority
        pub_date = (html.xpath('//*[@id="content"]/table//tr[4]/td[2]/text()'))[0].strip()  # publication date
        put_date = (html.xpath('//*[@id="content"]/table//tr[4]/td[4]/text()'))[0].strip()  # effective date
        content = '\n'.join(html.xpath('//*[@id="content"]/div/div//text()'))  # full text
        title = (html.xpath('//div[@class="bt"]//text()'))[0].strip()  # title

        # Fields that may be missing on some detail pages
        try:
            time_ok = (html.xpath('//*[@id="content"]/table//tr[5]/td[2]/text()'))[0].strip()  # validity status
        except IndexError:
            time_ok = '未知'  # '未知' = unknown
        try:
            clas = (html.xpath('//*[@id="content"]/table//tr[1]/td[4]/text()'))[0].strip()  # departmental classification
        except IndexError:
            clas = '未知'
        try:
            num = (html.xpath('//*[@id="content"]/table//tr[3]/td[2]/text()'))[0].strip()  # document number
            if num == '':
                num = '未知'
        except IndexError:
            num = '未知'

        print("#"*300)
        # print(attr, clas, orga, num, pub_date, put_date, time_ok,title)
        # print(content)
        item = {}
        item['attr'] = attr
        item['orga'] = orga
        item['pub_date'] = pub_date
        item['put_date'] = put_date
        item['content'] = content
        item['time_ok'] = time_ok
        item['clas'] = clas
        item['num'] = num
        item['title'] = title
        item['type'] = '司法解释及文件_检察院'  # category label for the single index link selected in __main__
        pipline(item)

    # A full page (pagesize = 50 rows) suggests there are more results: fetch the next page
    if len(href2s) == 50:
        i = i + 1
        url = 'http://law.npc.gov.cn/FLFG/getAllList.action'
        data['ispage'] = '1'
        data['curPage'] = i
        print(data)
        response = post_url(url, headers=headers, data=data)
        spidder1(response, data=data, i=i)
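
# Note: the recursion above nests one level deeper per result page, so a very
# large category could, in principle, hit Python's recursion limit. An
# equivalent iterative sketch (same data/post_url as above):
#
#   while len(href2s) == 50:
#       i += 1
#       data['ispage'], data['curPage'] = '1', i
#       res = post_url('http://law.npc.gov.cn/FLFG/getAllList.action', headers=headers, data=data)
#       href2s = etree.HTML(res).xpath('//table//tr/td[2]/a[1]/@href')
#       # ...process each href2 as in the loop above...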


# Store one record in PostgreSQL
def pipline(item):
    print(item)
    # MD5 fingerprint of the full text, used as a de-duplication key
    # (assumes a unique constraint on the sign column in the table)
    sign = hashlib.md5((item['content']).encode('utf-8')).hexdigest()
    print(sign)

    conn = psycopg2.connect(database='falv_wenku', user='postgres', password='123456',
                            host='127.0.0.1', port='5432')
    cur = conn.cursor()
    try:
        sql = ('INSERT INTO falv_fagui_xinxiku'
               '(attr, orga, pub_date, put_date, conten, time_ok, clas, num, sign, title, type1) '
               'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        cur.execute(sql, (item['attr'], item['orga'], item['pub_date'], item['put_date'],
                          item['content'], item['time_ok'], item['clas'], item['num'],
                          sign, item['title'], item['type']))
        conn.commit()
    except Exception as e:
        print("database insert error:", e)
    finally:
        cur.close()
        conn.close()

if __name__ == "__main__":
    try:
        url = 'http://law.npc.gov.cn/FLFG/index.jsp'

        res1 = get_url(url, headers, data={})
        # 拿到链接
        href1s = re.findall('href="(.*?)">更多</a>', res1)
        del href1s[2]
        # print(len(href1s))
        print(href1s)
        href1s = href1s[12:13]
        print(href1s)
        for href1 in href1s:
            if href1[0] == "j":
                href_j(href1)
            else:
                href_n(href1)
    except Exception as e:
        print("错误", e)





