美文网首页
python多线程爬取百度云电影网站

python多线程爬取百度云电影网站

作者: NO123456 | 来源:发表于2018-04-24 16:31 被阅读0次
    import queue
    import threading
    import requests
    import re
    from lxml import etree
    import time
    import random
    import json
    
    # URLs that have already been crawled (appended to but never read here;
    # possibly used by code outside this view)
    urlList = []
    
    # work queue of URLs still waiting to be crawled
    urlsData = queue.Queue()
    
    # per-URL failure counter: url -> number of failed scrape attempts
    urlError = {}
    # running index used to name worker threads ("thread-N")
    count = 0
    
    # browser-like request headers so the target site treats us as a normal
    # Chrome client instead of an obvious bot
    header = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    
    class Counter(threading.Thread):
        """Worker thread that scrapes one movie page.

        For the given source URL it extracts the Baidu-pan share link, the
        extraction password and the movie title, looks up a Douban cover
        image via a Baidu `site:` search, and POSTs the record to a local
        API endpoint. On failure the URL is re-queued up to 2 more times.
        """

        def __init__(self, lock, threadName, requests, url):
            # @param lock: lock serializing the scrape work across threads
            # @param threadName: name for this worker thread
            # @param requests: the `requests` module, injected by the caller
            # @param url: source page URL to scrape
            print(threadName+'run..')
            super(Counter, self).__init__(name=threadName)
            self.lock = lock
            self.requests = requests
            self.url = url
            # Movie title, filled in by _data_get once the page is parsed.
            # BUGFIX: the original read self.name without ever assigning it,
            # raising AttributeError and breaking the Douban cover lookup.
            self.name = ''

        def _data_get(self):
            # Scrape the source page and submit the extracted record.
            try:
                html = self.requests.get(self.url, headers=header)
                rs = etree.HTML(html.content)
                # Decode once instead of once per regex.
                page_text = html.content.decode('utf-8')
                # Baidu-pan share link(s) found on the page
                url = re.findall(r'href="(https://pan.baidu.com/s/.*?|http://pan.baidu.com/s/.*?)"', page_text)
                # Extraction password, written on the page as e.g. "密码: abcd"
                password = re.findall(r'密码(:|;|: )(\w{0,4})', page_text)
                name = rs.xpath('//h1/text()')
                try:
                    password = password[0][1]
                except IndexError:
                    # No password advertised on the page.
                    password = ''

                # Remember the title for the Douban cover search below.
                self.name = name[0]

                # Best-effort: fetch the Douban movie cover via a Baidu
                # "site:movie.douban.com <title>" search; an empty string is
                # submitted when anything in this chain fails.
                try:
                    search = self.requests.get("http://www.baidu.com/s?", params={
                        'wd': "site:movie.douban.com {}".format(self.name)
                    })
                    select = etree.HTML(search.content)
                    a = select.xpath('//h3[@class="t"]/a/@href')
                    douban = self.requests.get(a[0])
                    select = etree.HTML(douban.content)
                    ase = select.xpath('//img/@src')
                    img = ase[0]
                except Exception:
                    print(self.name, '豆瓣电影封面获取失败')
                    img = ''

                print(name[0])
                # Submit the assembled record to the local API.
                rr = self.requests.post('http://localhost/basic/index.php?r=bian/update', {
                    'password': password,
                    'url': url[0],
                    'img': img,
                    'source_url': self.url,
                    'name': name[0]
                })
                print(rr.content)

            except Exception as e:
                # Count the failure and retry this URL at most twice more.
                urlError[self.url] = urlError.get(self.url, 0) + 1
                if urlError[self.url] < 3:
                    urlsData.put(self.url)
                print('百度云地址解析失败', self.url, '失败次数', urlError[self.url], e)
                print('目前剩余任务', urlsData.qsize())
            finally:
                # BUGFIX: always return the semaphore permit. The original
                # released only on success, so after 100 failed pages the
                # producer in __main__ blocked forever on threadmax.acquire().
                threadmax.release()

        def run(self):
            # Serialize the whole scrape under the shared lock; the lock is
            # always released even if _data_get raises.
            with self.lock:
                self._data_get()
    
    if __name__ == '__main__':
        # Allow at most 100 scraper threads in flight at once.
        threadmax = threading.BoundedSemaphore(100)
        lock = threading.Lock()
        i = 0
        try:
            # Single-threaded pass: walk 20 index pages of the site and
            # collect every article URL into the work queue.
            for index1 in range(20):
                index = 1038 - index1
                html = requests.get('http://www.xiexingeini.com/page/{}'.format(index), headers=header)
                html = etree.HTML(html.content)
                # All task URLs on this index page.
                urls = html.xpath('//header/h2[@class="entry-title"]/a/@href')
                for task_url in urls:
                    urlsData.put(task_url)
                print('已抓取url', urlsData.qsize())
            print('全部任务:', urlsData.qsize())

            # Drain the queue, spawning one worker thread per URL.
            # NOTE(review): urlsData.get() blocks forever once the queue is
            # empty and all retries are exhausted; the script must be
            # interrupted manually — confirm whether that is intended.
            while True:
                uu = urlsData.get()
                i = i + 1
                try:
                    threadmax.acquire()
                    Counter(lock, "thread-" + str(i), requests, uu).start()
                except Exception as e:
                    print(e)
                    # Put the URL back in the queue to retry later.
                    urlsData.put(uu)
                    # BUGFIX: the original compared the exception OBJECT to a
                    # string (`e == "can't start new thread"`), which is never
                    # true; compare the message text instead so the backoff
                    # branch can actually run.
                    if str(e) == "can't start new thread":
                        print('线程开启失败')
                        time.sleep(180)
                    else:
                        print(uu, 'error')
        except Exception as e:
            print('url error')
            print(e)

    相关文章

      网友评论

          本文标题:python多线程爬取百度云电影网站

          本文链接:https://www.haomeiwen.com/subject/avmjlftx.html