Scraping in Practice (4): Extracting Data from the Law Library (law-lib.com)

Author: 周周周__ | Published 2019-05-22 17:31

This post scrapes the laws and regulations section of the Law Library site (law-lib.com) and stores the records in PostgreSQL.
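The flow: for each issuing authority, walk its paginated listing, follow every detail link, pull the labelled metadata fields out with regular expressions, stitch together document bodies that span two pages, and write each record to PostgreSQL along with an MD5 signature of the content.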

    import requests
    import re
    from urllib.parse import quote
    from lxml import etree
    import chardet
    import psycopg2
    import time
    import hashlib
    from fake_useragent import UserAgent
    ua = UserAgent()
    headers = {
        'User-Agent': ua.random  # fixed: the original had 'User_Agent', which is not a valid header name
    }
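Note that `ua.random` is evaluated a single time when `headers` is built, so every request in a run carries the same User-Agent. If you wanted a fresh one per request, a minimal sketch would be (the `fresh_headers` helper is hypothetical, not from the original script):

    def fresh_headers():
        # draw a new random User-Agent for each request
        return {'User-Agent': ua.random}

    # usage: requests.get(url=url, headers=fresh_headers())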
    
    def get(url):
        print("Requesting page")
        response = requests.get(url=url, headers=headers)
        time.sleep(3)  # throttle: at most one request every 3 seconds
        encoding = chardet.detect(response.content)  # sniff the real charset (the site serves GBK)
        res = response.content.decode(encoding['encoding'], 'ignore')
        return res
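law-lib.com serves GBK-encoded pages, which is why the raw bytes are sniffed with chardet before decoding. For comparison, requests can do the same detection itself through `apparent_encoding`; a minimal equivalent sketch (reusing the `url` and `headers` defined above):

    response = requests.get(url=url, headers=headers)
    response.encoding = response.apparent_encoding  # charset detection on the raw bytes
    res = response.text  # decoded with the detected encoding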
    
    
    def clear1(res, i):
        print('Parsing list page {}'.format(i))
        html = etree.HTML(res)
        hrefs = html.xpath('//ul[@class="line2"]/li/a/@href')
        # The pager renders a "第N页" span for every page that exists; once the span
        # for page i stops matching, the listing is exhausted and the caller breaks.
        next_page = html.xpath('//span[contains(text(),"第{}页")]/text()'.format(i))
        print(hrefs)
        return hrefs, next_page
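The stop condition leans entirely on that pager span. A toy check against hypothetical pager markup (not the site's real HTML) shows the behaviour, using the `etree` import from above:

    pager = etree.HTML('<div><span>第4页</span></div>')
    print(pager.xpath('//span[contains(text(),"第4页")]/text()'))  # ['第4页'] -> keep paging
    print(pager.xpath('//span[contains(text(),"第5页")]/text()'))  # [] -> stop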
    
    
    def clear2(res, url):
        html = etree.HTML(res)
        # The detail page labels its metadata with 【...】 markers; pull each field out by regex.
        pub_time = re.findall(r'<li>【颁布时间】(\d{4}-\d{1,2}-\d{1,2})</li>', res)
        title = re.findall(r'<li>【标题】(.*?)</li>', res)
        wen_hao = re.findall(r'<li>【发文号】(.*?)</li>', res)
        lose_time = re.findall(r'<li>【失效时间】(.*?)</li>', res)
        pub_ora = re.findall(r'<li>【颁布单位】(.*?)</li>', res)
        sour = re.findall(r'【法规来源】(h.*?)<', res)
        # The first 9 text nodes of the content div are page chrome, not body text.
        content = ''.join((html.xpath('//div[@class="viewcontent"]//text()'))[9:])
        content1 = re.sub(r'不分页显示   总共2页  1 \[2\]   下一页', '', content)
        content2 = ''
        next_page = html.xpath('//a[contains(text(),"下一页")]/@href')
        if next_page != []:
            print("Following page 2 of the document")
            url = 'http://www.law-lib.com/law/law_view.asp' + next_page[0]
            print(url)
            res2 = get(url)
            html2 = etree.HTML(res2)
            content3 = html2.xpath('//div[@class="viewcontent"]//text()')
            # Page 2 repeats the header block; body text resumes at the first "第X条" clause.
            index1 = None
            for data in content3:
                if re.findall(r'第\S+条', data):
                    index1 = content3.index(data)
                    print(index1)
                    break
            if index1 is not None:
                content2 = ''.join(content3[index1:])

        content = content1 + content2
        content = re.sub(r'不分页显示   总共2页  \[1\] 2 上一页 ', '', content)

        print(title[0])
        print(pub_time[0])
        print(wen_hao[0])
        print(lose_time[0])
        print(pub_ora[0])
        print(url)
        if sour == []:
            sour = ['']  # some documents have no 【法规来源】 field
        print(content)
        sign = hashlib.md5(content.encode('utf-8')).hexdigest()  # content signature for dedup
        print(sign)
        return title[0], wen_hao[0], lose_time[0], pub_time[0], pub_ora[0], sour[0], content, url, sign
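The final MD5 digest gives every document a stable 32-character signature: identical body text always hashes to the same value, so re-scraped content can be recognized cheaply. A two-line demonstration with made-up sample text:

    a = hashlib.md5('第一条 本法所称……'.encode('utf-8')).hexdigest()
    b = hashlib.md5('第一条 本法所称……'.encode('utf-8')).hexdigest()
    print(a == b)  # True: same content, same signature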
    
    
    def save(data):
        conn = psycopg2.connect(database='falv_wenku', user="postgres", password='123456',
                                host='127.0.0.1', port='5432')
        try:
            cur = conn.cursor()
            sql = "insert into fa_lv_lib(title, wen_hao, lose_time, pub_time, pub_ora, sour, conten, url, sign)" \
                  "values(%s, %s, %s, %s, %s, %s, %s, %s, %s)"
            cur.execute(sql, data)  # data is already a 9-tuple in column order
            conn.commit()
        except Exception as e:
            print("Database error:", e)
        finally:
            conn.close()  # the original never closed the connection, leaking one per record
    
    
    if __name__ == "__main__":
        for key in ['最高人民法院', '最高人民检察院', '国务院', '国务院办公厅']:
            i = 3  # i is incremented before use, so scraping starts at page 4
            if key == '全国人民代表大会':  # only takes effect if this key is added to the list above
                i = 49
            while True:
                i = i + 1
                # the site expects its query parameters GBK-encoded
                url = 'http://www.law-lib.com/law/lawml.asp?bbdw={}&pages={}'.format(quote(key.encode('gbk')), i)
                res = get(url)
                hrefs, next_page = clear1(res, i)
                if next_page == []:  # the page doesn't exist: listing exhausted
                    break
                for href in hrefs:
                    url = 'http://www.law-lib.com/law/' + href
                    # url = 'http://www.law-lib.com/law/law_view.asp?id=523891'  # handy single-document test
                    res = get(url)
                    data = clear2(res, url)
                    save(data)
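A few caveats if you run this yourself: the script opens a fresh database connection per record (a single long-lived connection or a pool would be kinder), there is no retry on network failure, and the fixed 3-second sleep in get() is the only rate limiting.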
    
    
