美文网首页
爬虫实战(1)--广州法律法规信息抓取

爬虫实战(1)--广州法律法规信息抓取

作者: 周周周__ | 来源:发表于2019-05-22 17:18 被阅读0次

    本文网站会封ip,作者用的是固定ip

    # -*- coding: utf-8 -*-
    '''
    Time    : 2019/5/22 10:02
    Author  : zhouzhou
    Email   : 1085089422@qq.com
    File    : guang_zhou_law.py
    Software: PyCharm
    url     : http://www.gz.gov.cn/gzgov/s2792/gk_fggw_list2.shtml
    database: law/guang_zhou_law
    '''
    
    import requests
    from fake_useragent import UserAgent
    ua = UserAgent()
    import re
    from lxml import etree
    import psycopg2
    import time
    
    def get_list(url):
        """Fetch one pagination page of the Guangzhou regulations list and
        process every article link found on it.

        :param url: absolute URL of a gk_fggw_list2 pagination page.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3641.400 QQBrowser/10.4.3284.400',
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        }
        # BUG FIX: the original proxy URL was "http:user:pass@host:port"
        # (missing "//" after the scheme), which requests cannot parse,
        # so no request was actually proxied.
        proxies = {
            "http": "http://c9zrdbya0q:diwbjxksqt@122.114.166.184:23128"
        }

        # Echo the outgoing IP so proxy problems are visible in the log.
        print(requests.get(url='http://www.icanhazip.com/', headers=headers, proxies=proxies).text)
        # Consistency fix: route the list-page request through the proxy as
        # get_con() does — the article notes this site bans IPs.
        response = requests.get(url, headers=headers, proxies=proxies)
        response.encoding = 'utf8'
        html = etree.HTML(response.text)
        hrefs = html.xpath('//ul[@class="news_list"]/li/a/@href')
        for href in hrefs:
            get_href(href)
    
    
    def get_href(href):
        """Normalise a list-page href to an absolute URL and fetch it.

        Relative links on the list pages begin with '../../'; those are
        rebased onto the site root. Already-absolute links pass through
        unchanged.
        """
        relative_marker = '../../'
        url = ('http://www.gz.gov.cn/' + href.replace(relative_marker, '')
               if relative_marker in href
               else href)
        print("~" * 200)
        print(url)
        get_con(url)
    
    
    def get_con(url):
        """Download one regulation detail page, extract the title, document
        number, date and body text, and hand the record to save().

        :param url: absolute URL of a regulation detail page.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3641.400 QQBrowser/10.4.3284.400',
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        }
        # BUG FIX: the original proxy URL was missing "//" after the scheme,
        # so requests could not parse it and the proxy was never applied.
        proxies = {
            "http": "http://c9zrdbya0q:diwbjxksqt@122.114.166.184:23128"
        }

        time.sleep(2)  # throttle: the site bans aggressive IPs
        response = requests.get(url, headers=headers, proxies=proxies)
        response.encoding = 'utf8'
        html = etree.HTML(response.text)

        # Two page templates exist; try the newer heading class first.
        # BUG FIX: catch only IndexError instead of a bare except.
        try:
            title = html.xpath('//h1[@class="content_title"]/text()')[0].strip()
        except IndexError:
            title = html.xpath('//h1[@class="info_title"]/text()')[0].strip()
        print(title)

        # Document number ("文号").
        # BUG FIX: the original pattern [穗府|厅外字] is a character CLASS —
        # it matches any single one of those characters (including a literal
        # '|') — where an alternation of the full prefixes was intended.
        # BUG FIX: wen_hao could previously remain an empty list and be
        # inserted into the database; default to '' instead.
        wen_hao = ''
        found = re.findall(r'((?:穗府|厅外字).*?号)', response.text)
        if found:
            wen_hao = found[0]
        else:
            found = re.findall(r'(第\d+号)', response.text)
            if found:
                wen_hao = '广州市人民政府令' + found[0]
        print(wen_hao)

        data = re.findall(r'>{0,1}(\S{1,4}年\S{1,2}月.{1,3}日)', response.text)[0]
        print(data)

        con = html.xpath('//div[@class="mainbox_bg content clearfix"]//text()')
        if not con:
            con = html.xpath('//div[@class="info_cont"]//text()')
        con = ''.join(con)
        # Round-trip through GBK only for the console printout, dropping
        # characters the Windows console cannot render.
        print(con.encode('GBK', 'ignore').decode('GBK'))
        save(title, wen_hao, data, con)
    
    
    def save(title, wen_hao, data, con):
        """Insert one scraped regulation into the guang_zhou_law table.

        :param title:   page title
        :param wen_hao: document number ("文号"); may be an empty string
        :param data:    publication date string
        :param con:     full body text
        :return: False when the insert fails, otherwise None.
        """
        conn = psycopg2.connect(database='law', user='postgres', password='123456', host='127.0.0.1', port='5432')
        sql = 'insert into guang_zhou_law(title, data_time,wen_hao, content1)values (%s, %s, %s, %s)'
        try:
            # Cursor as context manager so it is closed on every path.
            with conn.cursor() as cur:
                cur.execute(sql, (title, data, wen_hao, con))
            conn.commit()
        except Exception as e:
            print('数据库插入错误:', e)
            conn.rollback()  # BUG FIX: abort the failed transaction
            return False
        finally:
            # BUG FIX: the original never closed the connection, leaking one
            # connection per saved record.
            conn.close()
    
    
    if __name__ == "__main__":
        # Walk every pagination page of the regulations list (pages 1..66).
        for page in range(1, 67):
            print("iiiiiiii", page)
            get_list('http://www.gz.gov.cn/gzgov/s2792/gk_fggw_list2_{}.shtml'.format(page))
    

    相关文章

      网友评论

          本文标题:爬虫实战(1)--广州法律法规信息抓取

          本文链接:https://www.haomeiwen.com/subject/hlguzqtx.html