day10: Crawl all pages of m.sohu.com and store them in Redis

Author: 是东东 | Published 2018-08-27 17:23

Too complicated. I can't do this myself.

    import requests
    from lxml import etree
    from urllib.parse import urlparse
    from time import sleep
    
    from threading import Thread
    from queue import Queue
    import sys
    from redis import Redis
    # IDLE marks a thread as idle
    IDLE = 0
    # WORKING marks a thread as busy
    WORKING = 1

    rds = Redis("127.0.0.1", 6379, db=10)
    
    # A decorator class that retries the wrapped function when it raises
    class retry(object):
        def __init__(self, max_tries=3, wait=3, exceptions=(Exception,)):
            self.max_tries = max_tries
            self.wait = wait
            self.exceptions = exceptions

        def __call__(self, f):
            def wrapper(*args, **kwargs):
                for i in range(self.max_tries + 1):
                    try:
                        result = f(*args, **kwargs)
                    except self.exceptions as e:
                        print("waiting", e)
                        sleep(self.wait)  # on failure, sleep a while before the next attempt
                        print("retry %s" % (i + 1))
                        continue
                    else:
                        return result
                # if every attempt failed, fall through and return None
            return wrapper
    
    
    # A set of URLs that have already been requested (in-process copy)
    REQUESTED_URL = set()

    # Download a page
    @retry(3, 3)
    def fetch(url):
        print(f'Fetching: {url}')
        res = requests.get(url)
        # Record the URL as requested, both locally and in the Redis set
        # "REQUEST_URL" that the spiders use for deduplication
        REQUESTED_URL.add(url)
        rds.sadd("REQUEST_URL", url)
        # On success, return the page body
        if res.status_code == 200:
            return res.text
        return None
    
    # Parse a page and return a cleaned list of same-site URLs
    def parse(html):
        # Skip pages that failed to download or came back empty
        if html in [None, '', b'']:
            return []
        # Build a DOM so <a> tags can be found with XPath
        doc = etree.HTML(html)
        if doc is None:
            return []

        # Collect every href on the current page
        urls = doc.xpath("//a/@href")

        # print(urls)
        # Clean the links found on the page;
        # url_list holds the normalized URLs
        url_list = []
        for ori_url in urls:
            parse_url = urlparse(ori_url)  # ParseResult(scheme='', netloc='', path='/a/249902322_102150', params='', query='_f=m-index_business_news_9', fragment='')
            # print(parse_url)
            # Only keep links on the m.sohu.com domain
            domain = parse_url.netloc.strip() or "m.sohu.com"
            if domain == "m.sohu.com":
                # Default to http when the scheme is missing
                scheme = parse_url.scheme.strip() or "http"
                path = parse_url.path.strip()
                query = f'?{parse_url.query}'.strip() if parse_url.query else ''
                # Reassemble the absolute URL
                url = f'{scheme}://{domain}{path}{query}'
                # Keep the rebuilt URL
                url_list.append(url)
        return url_list
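
    # A rough illustration (not in the original post) of what parse() returns,
    # assuming a page with one relative link and one off-site link; the href
    # values are only examples, echoing the ParseResult comment above:
    #   parse('<a href="/a/249902322_102150?_f=m-index_business_news_9"></a>'
    #         '<a href="https://www.sohu.com/abc"></a>')
    #   -> ['http://m.sohu.com/a/249902322_102150?_f=m-index_business_news_9']
    # Relative links are resolved against m.sohu.com; other domains are dropped.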
    
    
    # Download a page and push every URL found on it into a queue
    def get_and_parse(url, url_queue):
        html = fetch(url)
        for url in parse(html):
            url_queue.put(url)
    
    # # An earlier, queue-based version that managed the worker threads directly
    # def process(url_list):
    #     queue = Queue()
    #     workers = []
    #     for url in url_list:
    #         t = Thread(target=get_and_parse,args=(url,queue))
    #         t.setDaemon(True)
    #         workers.append(t)
    #         t.start()
    #     for t in workers:
    #         t.join()
    #     return list(queue.queue)
    
    # A multithreaded spider: each instance pulls URLs from the Redis "TODO_LIST"
    class Spider(Thread):
        # def __init__(self, todo_list):
        def __init__(self):
            super().__init__()
            # self.todo_list = todo_list
            self.stat = IDLE

        def is_idle(self):
            return self.stat == IDLE

        def run(self):
            while True:
                # blpop blocks until a URL is available and returns (key, value) as bytes
                url = rds.blpop("TODO_LIST")[1].decode('utf-8')
                # url = self.todo_list.get()
                # Start crawling
                self.stat = WORKING
                html = fetch(url)

                # url_list = set(parse(html))
                # url_list -= REQUESTED_URL
                url_list = {u.encode('utf-8') for u in parse(html)}
                url_list -= rds.smembers("REQUEST_URL")  # drop URLs that were already requested

                # # (queue-based version) feed the new URLs back into the loop
                # for url in url_list:
                #     self.todo_list.put(url)
                if url_list:
                    rds.lpush("TODO_LIST", *url_list)

                # Mark the thread as idle again
                self.stat = IDLE
    
    
    def main(max_threads):
        # Seed the to-do list
        print("Start")
        # todo_list = Queue()  # URLs still to be crawled
        # todo_list.put("http://m.sohu.com")
        print(rds.lpush("TODO_LIST", "http://m.sohu.com/"))
        # Create and start n spider threads
        # spiders = [Spider(todo_list) for i in range(max_threads)]
        spiders = [Spider() for i in range(max_threads)]
        for spd in spiders:
            spd.daemon = True  # daemon threads, so sys.exit() below can end the process
            spd.start()

        # Keep checking whether every thread has finished its work
        while True:
            # Check the Redis key instead of the local queue
            # if todo_list.empty() and all(spd.is_idle() for spd in spiders):
            if rds.llen("TODO_LIST") == 0 and all(spd.is_idle() for spd in spiders):
                # The to-do list is empty and every thread is idle, so we are done
                print("All work is done")
                sys.exit(0)
            else:
                print("REQUESTED %d" % rds.scard("REQUEST_URL"))
                sleep(1)

    if __name__ == '__main__':
        if len(sys.argv) >= 2:
            max_threads = int(sys.argv[1])
            main(max_threads)
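
The script takes the number of worker threads as its only command-line argument, e.g. python spider.py 8 (the file name spider.py is only an assumption; the original post does not name the file). Below is a minimal sketch, not part of the original code, for inspecting the crawl state from another shell, assuming the same local Redis instance and db 10 used above:

    # check_progress.py -- a rough sketch for watching the crawl (assumed helper,
    # not from the original post); it reads the same keys the spider writes.
    from redis import Redis

    rds = Redis("127.0.0.1", 6379, db=10)

    # URLs still waiting to be crawled
    print("TODO_LIST length:", rds.llen("TODO_LIST"))
    # URLs that have already been requested
    print("REQUEST_URL size:", rds.scard("REQUEST_URL"))
    # A few sample entries from the requested set
    for member in list(rds.smembers("REQUEST_URL"))[:5]:
        print(member.decode("utf-8"))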
    
