美文网首页
多线程、动态代理爬虫

多线程、动态代理爬虫

作者: 二十二_0cbe | 来源:发表于2019-06-13 16:08 被阅读0次
# -*- coding: UTF-8 -*-
import json
import os
import re
import time
import shutil
import logging
import traceback
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
import configparser


# logging.basicConfig(filename='log.log')

class abuyun():
    """Proxy settings for the Abuyun proxy service, loaded from ./config.ini.

    NOTE: the config file is read at class-definition time, so importing this
    module requires config.ini with an [abuyun] section to exist.
    """
    config = configparser.ConfigParser()
    config.read("./config.ini")
    conf_abuyun = config["abuyun"]

    # Authenticated proxy endpoint: http://user:pass@host:port
    proxyMeta = "http://{0}:{1}@{2}:{3}".format(
        conf_abuyun["proxyUser"],
        conf_abuyun["proxyPass"],
        conf_abuyun["proxyHost"],
        conf_abuyun["proxyPort"],
    )

    # The same proxy endpoint handles both plain and TLS traffic.
    proxies = {"http": proxyMeta, "https": proxyMeta}


def paragraph_info(id, order=0):
    """Scrape one lesson page from dict.eudic.net, pair each English sentence
    with its Chinese translation, download the lesson's audio/video file into
    ./audios, and append the assembled record to ./lesson_info.

    Args:
        id: lesson uuid, used in the page URL and as the media file name.
        order: "order" query parameter of the play page (defaults to 0).

    Failures are logged via the logging module; OSError triggers one retry.
    """
    try:
        artic = {}
        artic['uuid'] = id

        header = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
        url = "https://dict.eudic.net/webting/play?id={}&order={}".format(id, order)
        print(url)
        # Timeout added so a hung connection cannot block a worker thread forever.
        response = requests.get(url, headers=header, proxies=abuyun().proxies, timeout=30)

        soup = BeautifulSoup(response.content, "html.parser")
        article = soup.select_one("#article")
        if not article:
            article = soup.select_one("div.article")
        sentence = article.select("span.sentence[data-starttime]")

        # The translation JSON lives in an inline <script>; concatenate all
        # script text and carve it out with a regex.
        scripts = soup.select("script[type='text/javascript']")
        script = ""
        for s in scripts:
            script += s.get_text()
        translate = re.search('{"translation":.*,"wordhints"', script)
        start, end = translate.span()
        translate = script[start + 16:end - 13]

        # Re-split into individual JSON objects (they were joined by "},").
        translate = [t + "}" for t in translate.split("},")]
        translate[-1] = translate[-1][:-1]
        artic['sentence'] = []

        # order_num carries the corrected sentence order across iterations when
        # one Chinese line maps to several English sentences. Initialized here:
        # the original left it unbound, so the ">" branches below could raise
        # UnboundLocalError when they ran before any assignment.
        order_num = 0
        for index, sen in enumerate(sentence):
            temp = {}
            en = sen.get_text()

            # The last sentence of the audio has an empty end timestamp;
            # substitute a sentinel that sorts after every real time.
            if sen["data-endtime"] == '':
                timestamps = [sen["data-starttime"], '99:99:99.99']
            else:
                timestamps = [sen["data-starttime"], sen["data-endtime"]]

            temp['order'] = index
            temp['timestamps'] = timestamps
            temp['english'] = en
            temp['chinese'] = ""

            if translate and translate != ['']:
                cn = json.loads(translate[index])
                cn_timestamps = cn["timestamps"][1:-1].split("],[")
                temp['chinese'] = cn['text']

                # Normalize the two timestamp formats (drop the leading "hh:"
                # part); per the original author only ~20 uuids differ.
                for t in range(2):
                    timestamps[t] = timestamps[t][3:]

                # Handle the case of two English sentences sharing one Chinese
                # translation by duplicating the translation entry and reusing
                # the previous order number.
                if timestamps[0] == cn_timestamps[0] and timestamps[1] == cn_timestamps[1]:
                    pass
                elif timestamps[0] == cn_timestamps[0] and timestamps[1] < cn_timestamps[1]:
                    translate.insert(index, translate[index])
                    order_num = index
                    temp['order'] = order_num
                elif timestamps[0] > cn_timestamps[0] and timestamps[1] < cn_timestamps[1]:
                    translate.insert(index, translate[index])
                    temp['order'] = order_num
                elif timestamps[0] > cn_timestamps[0] and timestamps[1] == cn_timestamps[1]:
                    if cn_timestamps[0] == '00:00.00':
                        order_num = 0
                    temp['order'] = order_num
            else:
                temp['chinese'] = ""
            artic['sentence'].append(temp)

        video = soup.select("video.video")
        if video:
            # Video lesson: download the mp4 referenced by the <video> tag.
            mp4_url = video[0]['src']
            req = requests.get(mp4_url, timeout=60)
            save_path = os.path.abspath(os.curdir) + "/audios/" + id + ".mp4"  # file path under the current working directory
            with open(save_path, 'wb') as code:
                code.write(req.content)
            print(id + ".mp4 [Download successful!]")
        else:
            # Audio lesson: the mp3 URL is an argument of initPlayPage(...)
            # in the inline script.
            mp3 = re.search('initPlayPage(.*)', script)
            start, end = mp3.span()
            mp3_url = script[start + 14:end - 4]
            req = requests.get(mp3_url, timeout=60)
            save_path = os.path.abspath(os.curdir) + "/audios/" + id + ".mp3"  # file path under the current working directory
            with open(save_path, 'wb') as code:
                code.write(req.content)
            print(id + ".mp3 [Download successful!]")

        artic['audio_path'] = save_path

        lesson_file = os.path.abspath(os.curdir) + "/lesson_info"
        with open(lesson_file, 'a', encoding='utf8') as writer:
            writer.write(str(artic) + "\n")
    except OSError:
        # Transient network/file error: retry, preserving the order argument
        # (the original retry silently dropped it).
        paragraph_info(id, order)
    except UnboundLocalError:
        # The original delegated to an undefined Unbound_info(), which raised
        # NameError and masked the real failure; log it instead.
        logging.error("UnboundLocalError for lesson uuid: " + id)
        logging.error(traceback.format_exc())
    except Exception:
        logging.error("lesson uuid: " + id)
        e = traceback.format_exc()
        logging.error(e)


def main(ids, thread_num=5):
    """Download every lesson in *ids* concurrently and report elapsed time.

    Args:
        ids: iterable of lesson uuids, each passed to paragraph_info.
        thread_num: number of worker threads (default 5).

    Side effects: deletes any existing ./log.log and ./lesson_info, and
    recreates the ./audios directory before starting.
    """
    # Measure the total run time of the crawl.
    print('*' * 50)
    start = time.time()

    # Start from a clean log file.
    log_file = os.path.abspath(os.curdir) + "/log.log"
    if os.path.exists(log_file):
        os.remove(log_file)
    logging.basicConfig(filename='log.log')

    # Recreate the media output directory.
    mp3_path = os.path.abspath(os.curdir) + "/audios"
    if os.path.exists(mp3_path):
        shutil.rmtree(mp3_path)
    os.mkdir(mp3_path)

    # Remove stale metadata; paragraph_info appends to this file.
    lesson_file = os.path.abspath(os.curdir) + "/lesson_info"
    if os.path.exists(lesson_file):
        os.remove(lesson_file)

    # Use the context manager so the pool is always shut down — the original
    # never called executor.shutdown().
    with ThreadPoolExecutor(max_workers=thread_num) as executor:
        future_tasks = [executor.submit(paragraph_info, id) for id in ids]
        # Block until every download has finished before reporting results.
        wait(future_tasks, return_when=ALL_COMPLETED)

    end = time.time()
    print("[All works are done.]")
    print('使用多线程,总共耗时:%s' % (end - start))
    print('总处理了:%s个文件。' % len(ids))
    print('*' * 50)


if __name__ == '__main__':
    # Single-lesson run. Bulk mode — reading uuids from a file and fanning
    # them out via main(ids, thread_num) — existed here but is disabled.
    paragraph_info("f90fea4e-ef01-4e89-b495-346707023bbb")

相关文章

  • 多线程、动态代理爬虫

  • Python实现爬取可用代理IP

    在实现爬虫时,动态设置代理IP可以有效防止反爬虫,但对于普通爬虫初学者需要在代理网站上测试可用代理IP。由于手动测...

  • 2021校招 复习总结

    笔记导航: JAVA: 泛型 反射和动态代理 注解 JAVA多线程 ReentrantLock,Volatile,...

  • Squid配置多代理动态自动转发

    需求是这样的 爬虫需要代理。 代理会过期需要动态更新。 代理有快有慢,有时失效,希望自动过期失效代理。 只前两点的...

  • 面试系列~动态代理实现与原理

    动态代理有JDK动态代理, CGLIB动态代理, SpringAOP动态代理 一,JDK动态代理  jdk动态代理...

  • 编程常用的设计模式

    动态代理和静态代理 静态代理 动态代理 静态代理与动态代理的区别 JDK中的动态代理和CGLIB 实现动态代理的方...

  • Spring的AOP原理分析

    一 动态代理 动态代理分为JDK动态代理和CGLIB动态代理 jdk动态代理 被代理类(目标类)和代理类必须实现同...

  • 大数据学习路线

    一、linux基础(基本掌握) 二、java多线程、并发包下的队列、JMS、JVM、反射和动态代理 学习参...

  • 设计模式之代理模式

    代理分为静态代理和动态代理。 动态代理又包括基于JDK的动态代理、基于CGlib 的动态代理、基于Aspectj实...

  • Java高级主题(五)——动态代理

    代理可以分为静态代理、动态代理,动态代理又可以分为 jvm的动态代理 和 cglib的动态代理。像spring框架...

网友评论

      本文标题:多线程、动态代理爬虫

      本文链接:https://www.haomeiwen.com/subject/lomsfctx.html