Web Scraping in Practice (3): Scraping 北大法宝

Author: 周周周__ | Published 2019-05-22 17:30

This post scrapes statute and regulation data from 北大法宝 (pkulaw.cn). Note that some records require a login to view in full, and the site bans IPs that request too aggressively; no proxy is used here, so the amount of data collected is limited.
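
Since the IP ban is the main bottleneck, the natural extension would be to route traffic through a proxy pool. A minimal sketch of what that looks like with requests (the address below is a placeholder, not a real proxy):

# hypothetical proxy entry -- 127.0.0.1:8888 is a placeholder address
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
}
# every requests call in the script would then pass proxies=proxies, e.g.:
# res = requests.post(url, data=data, headers=headers, proxies=proxies, timeout=10)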

# encoding:utf-8
'''
pro:北大法宝
auth:zhoubobo
datetime:2019-4-8
ver:1.0
'''
import requests
import datetime
import time
import psycopg2
import re
from fake_useragent import UserAgent
import chardet
from lxml import etree
import hashlib
ua = UserAgent()


# timestamps, kept for later; not used yet
time1 = str(time.time())[:10]
time2 = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# Headers for the first POST request, trimmed down to what the site requires.
# Note: Content-Length is deliberately omitted -- requests computes it from
# the body, and a hard-coded value breaks once the form data changes length.
headers = {
    'User-Agent': ua.random,
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'www.pkulaw.cn',
    'Referer': 'http://www.pkulaw.cn/cluster_form.aspx?Db=chl&menu_item=law&EncodingName=&clust_param=0/XA01&keyword=&range=name&',
}


def post_spider(url, data):
    '''
    :param url: URL to request
    :param data: form data for the POST
    :return: the decoded page, or None on failure
    '''
    try:
        print("requesting page...")
        ses = requests.Session()
        res = ses.post(url=url, data=data, headers=headers, timeout=10)
        # detect the encoding instead of trusting the response header
        encoding = chardet.detect(res.content)
        html = res.content.decode(encoding['encoding'], 'ignore')
        print('page received')
        return html
    except Exception as e:
        print(e)
        return None


def get_spider(url):
    '''
    :param url: URL for the GET request
    :return: the response object
    '''
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': '',
        'Host': 'www.pkulaw.cn',
        'Referer': 'http://www.pkulaw.cn/cluster_form.aspx?Db=chl&menu_item=law&EncodingName=&clust_param=0/XA01&keyword=&range=name&',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ua.random
    }
    # The CookieId below was copied from the browser; each one only allows a
    # limited number of requests before the site cuts it off.
    # Cookie = 'CookieId=oerh2flilolgygwcckakt2sj;'
    Cookie = 'CookieId=qxn5jr5kmplmlhxcln1ny3wh;'

    headers['Cookie'] = Cookie
    print(headers)
    res = requests.get(url, headers=headers)
    return res
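

# The hand-copied CookieId above expires quickly. A sketch of an alternative
# (an assumption -- it presumes the site issues a fresh CookieId on the first
# plain GET, which is not verified here):
def fetch_cookie_id():
    ses = requests.Session()
    ses.get('http://www.pkulaw.cn/', headers={'User-Agent': ua.random}, timeout=10)
    return ses.cookies.get('CookieId', '')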


def clear_data(res):
    '''
    :param res: page returned by the first request
    :return: all detail-page links found on the page
    '''
    try:
        print("cleaning data, step 1")
        html = etree.HTML(res)
        hrefs = html.xpath('//a[@class="main-ljwenzi"]/@href')
        return save_content(hrefs)
    except Exception as e:
        # a captcha page may come back mid-crawl; catch it here
        print(e)
        return None
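

# When a captcha page comes back, the xpath above finds no links. A sketch of
# a blunt retry with backoff (the retry count and delay are arbitrary choices):
def post_with_retry(url, data, tries=3, delay=30):
    for _ in range(tries):
        html = post_spider(url, data)
        if html and 'main-ljwenzi' in html:
            return html
        time.sleep(delay)
    return None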


def save_content(hrefs):
    '''
    :param hrefs: links to the detail pages
    :return: nothing; the cleaned fields are stored in PostgreSQL
    '''
    for href in hrefs:
        url = 'http://www.pkulaw.cn/' + href
        print(url)
        res = get_spider(url)
        # print(res.text)
        time.sleep(5)
        html = etree.HTML(res.text)

        try:
            print("................cleaning data, step 2")

            title = html.xpath('//table[@id="tbl_content_main"]/tr[1]/td/span/strong/text()')[0]  # title
            # (an earlier xpath-based extraction of the fields below was
            # replaced by the regexes that follow)

            # issuing department 【发布部门】
            pub_dep = re.findall('【发布部门】.*?target=_blank>(.*?)</a>', res.text, re.S)
            # document number 【发文字号】
            post_name = re.findall('【发文字号】.*?([\u4e00-\u9fa5a-zA-Z0-9-]+)</td>', res.text, re.S)
            post_name = ['无'] if post_name == [] else post_name  # '无' = none
            # release date 【发布日期】
            rel_time = re.findall('【发布日期】.*?(\d{4}.\d{1,2}.\d{1,2})', res.text, re.S)
            # effective date 【实施日期】
            pub_time = re.findall('【实施日期】.*?(\d{4}.\d{1,2}.\d{1,2})', res.text, re.S)
            # validity status 【时效性】
            time_line = re.findall('【时效性】.*?target=_blank>(.*?)</a>', res.text, re.S)
            # level of authority 【效力级别】
            eff_lev = re.findall('【效力级别】.*?target=_blank>(.*?)</a>', res.text, re.S)
            # regulation category 【法规类别】
            reg_cat = re.findall('【法规类别】.*?target=_blank>(.*?)</a>', res.text, re.S)

            content = html.xpath('//div[@id="div_content"]//text()')  # body text
            content = content[5:]
            content = ''.join(content)
            # strip the "法宝联想" cross-reference markers injected by the site
            content = re.sub(r'法宝联想', '', content)
            sign = hashlib.md5(content.encode('utf-8')).hexdigest()  # unique fingerprint

            print(title)
            print(pub_dep)
            print(post_name)
            print(rel_time)
            print(pub_time)
            print(time_line)
            print(eff_lev)
            print(reg_cat)
            print(content)
            print(sign)
        except Exception as e:
            print(e)
            # log the failed URL and move on; otherwise the variables below
            # would be unbound
            with open('E://1.txt', 'a') as f:
                f.write(url + '\n')
            continue

        conn = psycopg2.connect(database='falv_wenku', user="postgres", password='123456',
                                host='127.0.0.1', port='5432')
        try:
            cur = conn.cursor()
            sql = "insert into beida_fabao_1(title, pub_dep, post_name, rel_time, pub_time, time_line, eff_lev, reg_cat, " \
                  "content1, sign)values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            cur.execute(sql, (title, pub_dep[0], post_name[0], rel_time[0], pub_time[0], time_line[0], eff_lev[0],
                              reg_cat[0], content, sign))
            conn.commit()
        except Exception as e:
            print("database error:", e)
        finally:
            conn.close()


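# A sketch of how the md5 sign fingerprint could be used to skip records that
# are already stored (an assumption: the sign column in beida_fabao_1 holds
# the hashes computed above; this helper is not wired into save_content):
def is_duplicate(conn, sign):
    cur = conn.cursor()
    cur.execute("select 1 from beida_fabao_1 where sign = %s", (sign,))
    return cur.fetchone() is not None
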

if __name__ == '__main__':
    # endpoint found by inspecting the site's ajax requests
    url = 'http://www.pkulaw.cn/doSearch.ashx'
    # form data for the first request, rebuilt for every page
    for i in range(0, 69):
        data = {
            'Db': 'chl',  # laws & regulations database
            'clusterwhere': '%25e6%2595%2588%25e5%258a%259b%25e7%25ba%25a7%25e5%2588%25ab%253dXA0101',
            'clust_db': 'chl',
            'Search_Mode': '',
            'range': 'name',
            'aim_page': i,  # current page number
            'page_count': 24,  # results per page
        }

        # POST for the list page and keep the returned html
        res = post_spider(url, data)
        # extract the detail links and store each record
        hrefs = clear_data(res)
        # throttle between pages
        time.sleep(20)
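
    # To re-run the detail pages that failed and were logged to E://1.txt (a
    # sketch; save_content prepends the base URL, so strip it off first):
    # with open('E://1.txt') as f:
    #     failed = [line.strip().replace('http://www.pkulaw.cn/', '') for line in f]
    # save_content(failed)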
