Crawler Template 1

Author: TheoKm | Published 2018-08-03 09:19
    # -*- coding: utf-8 -*-
    import requests
    from bs4 import BeautifulSoup
    import re
    from time import sleep
    from multiprocessing import Pool
    from multiprocessing import freeze_support
    
    
    # Base URL of the parent (listing) pages, without the trailing page number
    base_url = ""
    
    
    # Common request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/67.0.3396.79 Safari/537.36',
    }
    
    
    # Build the list of parent page URLs
    def initialize_url(url):
        url_list = []
        # lent is the number of pages to crawl
        lent = 10
        for counter in range(1, lent + 1):
            url_list.append(url + str(counter) + '.html')
        return url_list
    
    
    # Fetch a page with requests and return its HTML text
    def get_html(url):
        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                # Use the detected encoding to avoid garbled Chinese text
                response.encoding = response.apparent_encoding
                return response.text
        except requests.ConnectionError as e:
            print('Error', e.args)
    
    
    # Parse the parent page and return lists of child URLs, titles and dates
    def parse_father_html(html):
        if html is not None:
            soup = BeautifulSoup(html, 'html.parser')
            if soup:

                # Parse the parent page content
                # (fill in the id / attrs / href patterns for the target site):
                content = soup.find(attrs={'id': ''})
                data = content.find_all(href=re.compile(""), attrs={'': ''})
                date = content.find_all(attrs={'': ''})

                # Initialize the three lists:
                url_list = []
                title_list = []
                date_list = []

                # Fill the three lists:
                for item in data:
                    # Most <a> tags on such pages use relative URLs like "../../*****";
                    # expand them into absolute URLs with re.sub
                    item_content = re.sub(r'\.\./', 'http://*.*.com/', str(item.attrs['href']))
                    url_list.append(item_content)
                for item in data:
                    # Keep only the Chinese characters of the title and join them into a string
                    item_content = ''.join(re.findall('[\u4e00-\u9fa5]', str(item.attrs['title'])))
                    title_list.append(item_content)
                for item in date:
                    date_list.append(item.text)
                if url_list and title_list and date_list:
                    return url_list, title_list, date_list
                else:
                    print("The parent page structure has changed; rewrite the parent-page parsing module")
            else:
                print("Failed to build the soup for the parent page")
        else:
            print("Parent page HTML does not exist")
    
    
    # Parse a child page
    def parse_child_html(child_html):
        if child_html is not None:
            child_soup = BeautifulSoup(child_html, 'html.parser')
            if child_soup is not None:

                # Parse the child page content; usually grabbing the page's <p> tags
                # is enough, though sometimes images are needed as well.
                # Handle special cases separately (fill in the attrs for the target site).
                content = child_soup.find(attrs={'': ''})

                if content is not None:
                    return content
                else:
                    print("The page structure has changed; rewrite the child-page parsing module")
            else:
                print("Failed to build the soup for the child page")
        else:
            print("Child page HTML does not exist")
    
    
    # Save the child page content to a local file:
    def out_put_html(html_content, title, time):
        # Output file name: 【time】title.html
        with open('【' + time + '】' + title + '.html', 'w+', encoding='utf-8') as fout:
            fout.write("""<!DOCTYPE html><html lang="zh" dir="ltr"><head>
                <meta charset="utf-8"><title></title></head><body>""")
            fout.write(str(html_content))
            fout.write("</body></html>")
    
    
    # Main function: crawl one parent page and all of its child pages
    def main(url_item):
        # Fetch the parent page HTML
        html = get_html(url_item)
        # Parse the parent page into child URL, title and date lists
        result = parse_father_html(html)
        if result is None:
            return
        urls, titles, dates = result
        # Number of child pages
        lent = len(dates)
        # Fetch, parse and save each child page in turn
        for counter in range(0, lent):
            # Fetch the child page HTML
            children_html = get_html(urls[counter])
            # Parse the child page content
            children_content = parse_child_html(children_html)
            # Save the child page locally
            out_put_html(str(children_content), str(titles[counter]), str(dates[counter]))
            # Sleep one second after each page
            sleep(1)
    
    
    # Entry point
    def run():
        # freeze_support is needed on Windows when the script is frozen with
        # pyinstaller; without it multiprocessing keeps spawning new processes
        freeze_support()
        # Build the list of parent page URLs
        url_list = initialize_url(base_url)
        # Process pool: crawl several parent pages in parallel to speed things up
        pool = Pool(10)
        pool.map(main, url_list)
        pool.close()
        pool.join()


    if __name__ == '__main__':
        run()
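
To adapt the template, fill in base_url and the empty selectors in parse_father_html and parse_child_html with the target site's actual markup. Below is a minimal sketch of what the filled-in placeholders might look like, assuming a hypothetical site whose listing pages are http://example.com/news/list_1.html … list_10.html, whose article links carry a class of "title", and whose dates carry a class of "date"; none of these values come from the original template.

    # Hypothetical values for illustration only; replace them with the real
    # id/class/href patterns of the site you are crawling.
    base_url = "http://example.com/news/list_"

    # In parse_father_html:
    content = soup.find(attrs={'id': 'list'})
    data = content.find_all(href=re.compile("/news/"), attrs={'class': 'title'})
    date = content.find_all(attrs={'class': 'date'})

    # In parse_child_html:
    content = child_soup.find(attrs={'id': 'article'})

With these filled in, initialize_url(base_url) produces the ten listing URLs and run() crawls them with a pool of ten processes, saving each article as 【date】title.html.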
    
    
