美文网首页
多线程、动态代理爬虫

多线程、动态代理爬虫

作者: 二十二_0cbe | 来源:发表于2019-06-13 16:08 被阅读0次
# -*- coding: UTF-8 -*-
import json
import os
import re
import time
import shutil
import logging
import traceback
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
import configparser


# logging.basicConfig(filename='log.log')

class abuyun():
    """Proxy settings for the Abuyun proxy service, loaded from ./config.ini.

    NOTE: the config file is read at class-definition time, so importing this
    module requires config.ini with an [abuyun] section to exist.
    """
    config = configparser.ConfigParser()
    config.read("./config.ini")
    conf_abuyun = config["abuyun"]

    # Authenticated proxy endpoint: http://user:pass@host:port
    proxyMeta = "http://{0}:{1}@{2}:{3}".format(
        conf_abuyun["proxyUser"],
        conf_abuyun["proxyPass"],
        conf_abuyun["proxyHost"],
        conf_abuyun["proxyPort"],
    )

    # The same proxy endpoint handles both plain and TLS traffic.
    proxies = {"http": proxyMeta, "https": proxyMeta}


def paragraph_info(id, order=0):
    """Scrape one lesson page from dict.eudic.net, pair each English sentence
    with its Chinese translation, download the lesson's audio/video file into
    ./audios, and append the assembled record to ./lesson_info.

    Args:
        id: lesson uuid, used in the page URL and as the media file name.
        order: "order" query parameter of the play page (defaults to 0).

    Failures are logged via the logging module; OSError triggers one retry.
    """
    try:
        artic = {}
        artic['uuid'] = id

        header = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
        url = "https://dict.eudic.net/webting/play?id={}&order={}".format(id, order)
        print(url)
        # Timeout added so a hung connection cannot block a worker thread forever.
        response = requests.get(url, headers=header, proxies=abuyun().proxies, timeout=30)

        soup = BeautifulSoup(response.content, "html.parser")
        article = soup.select_one("#article")
        if not article:
            article = soup.select_one("div.article")
        sentence = article.select("span.sentence[data-starttime]")

        # The translation JSON lives in an inline <script>; concatenate all
        # script text and carve it out with a regex.
        scripts = soup.select("script[type='text/javascript']")
        script = ""
        for s in scripts:
            script += s.get_text()
        translate = re.search('{"translation":.*,"wordhints"', script)
        start, end = translate.span()
        translate = script[start + 16:end - 13]

        # Re-split into individual JSON objects (they were joined by "},").
        translate = [t + "}" for t in translate.split("},")]
        translate[-1] = translate[-1][:-1]
        artic['sentence'] = []

        # order_num carries the corrected sentence order across iterations when
        # one Chinese line maps to several English sentences. Initialized here:
        # the original left it unbound, so the ">" branches below could raise
        # UnboundLocalError when they ran before any assignment.
        order_num = 0
        for index, sen in enumerate(sentence):
            temp = {}
            en = sen.get_text()

            # The last sentence of the audio has an empty end timestamp;
            # substitute a sentinel that sorts after every real time.
            if sen["data-endtime"] == '':
                timestamps = [sen["data-starttime"], '99:99:99.99']
            else:
                timestamps = [sen["data-starttime"], sen["data-endtime"]]

            temp['order'] = index
            temp['timestamps'] = timestamps
            temp['english'] = en
            temp['chinese'] = ""

            if translate and translate != ['']:
                cn = json.loads(translate[index])
                cn_timestamps = cn["timestamps"][1:-1].split("],[")
                temp['chinese'] = cn['text']

                # Normalize the two timestamp formats (drop the leading "hh:"
                # part); per the original author only ~20 uuids differ.
                for t in range(2):
                    timestamps[t] = timestamps[t][3:]

                # Handle the case of two English sentences sharing one Chinese
                # translation by duplicating the translation entry and reusing
                # the previous order number.
                if timestamps[0] == cn_timestamps[0] and timestamps[1] == cn_timestamps[1]:
                    pass
                elif timestamps[0] == cn_timestamps[0] and timestamps[1] < cn_timestamps[1]:
                    translate.insert(index, translate[index])
                    order_num = index
                    temp['order'] = order_num
                elif timestamps[0] > cn_timestamps[0] and timestamps[1] < cn_timestamps[1]:
                    translate.insert(index, translate[index])
                    temp['order'] = order_num
                elif timestamps[0] > cn_timestamps[0] and timestamps[1] == cn_timestamps[1]:
                    if cn_timestamps[0] == '00:00.00':
                        order_num = 0
                    temp['order'] = order_num
            else:
                temp['chinese'] = ""
            artic['sentence'].append(temp)

        video = soup.select("video.video")
        if video:
            # Video lesson: download the mp4 referenced by the <video> tag.
            mp4_url = video[0]['src']
            req = requests.get(mp4_url, timeout=60)
            save_path = os.path.abspath(os.curdir) + "/audios/" + id + ".mp4"  # file path under the current working directory
            with open(save_path, 'wb') as code:
                code.write(req.content)
            print(id + ".mp4 [Download successful!]")
        else:
            # Audio lesson: the mp3 URL is an argument of initPlayPage(...)
            # in the inline script.
            mp3 = re.search('initPlayPage(.*)', script)
            start, end = mp3.span()
            mp3_url = script[start + 14:end - 4]
            req = requests.get(mp3_url, timeout=60)
            save_path = os.path.abspath(os.curdir) + "/audios/" + id + ".mp3"  # file path under the current working directory
            with open(save_path, 'wb') as code:
                code.write(req.content)
            print(id + ".mp3 [Download successful!]")

        artic['audio_path'] = save_path

        lesson_file = os.path.abspath(os.curdir) + "/lesson_info"
        with open(lesson_file, 'a', encoding='utf8') as writer:
            writer.write(str(artic) + "\n")
    except OSError:
        # Transient network/file error: retry, preserving the order argument
        # (the original retry silently dropped it).
        paragraph_info(id, order)
    except UnboundLocalError:
        # The original delegated to an undefined Unbound_info(), which raised
        # NameError and masked the real failure; log it instead.
        logging.error("UnboundLocalError for lesson uuid: " + id)
        logging.error(traceback.format_exc())
    except Exception:
        logging.error("lesson uuid: " + id)
        e = traceback.format_exc()
        logging.error(e)


def main(ids, thread_num=5):
    """Download every lesson in *ids* concurrently and report elapsed time.

    Args:
        ids: iterable of lesson uuids, each passed to paragraph_info.
        thread_num: number of worker threads (default 5).

    Side effects: deletes any existing ./log.log and ./lesson_info, and
    recreates the ./audios directory before starting.
    """
    # Measure the total run time of the crawl.
    print('*' * 50)
    start = time.time()

    # Start from a clean log file.
    log_file = os.path.abspath(os.curdir) + "/log.log"
    if os.path.exists(log_file):
        os.remove(log_file)
    logging.basicConfig(filename='log.log')

    # Recreate the media output directory.
    mp3_path = os.path.abspath(os.curdir) + "/audios"
    if os.path.exists(mp3_path):
        shutil.rmtree(mp3_path)
    os.mkdir(mp3_path)

    # Remove stale metadata; paragraph_info appends to this file.
    lesson_file = os.path.abspath(os.curdir) + "/lesson_info"
    if os.path.exists(lesson_file):
        os.remove(lesson_file)

    # Use the context manager so the pool is always shut down — the original
    # never called executor.shutdown().
    with ThreadPoolExecutor(max_workers=thread_num) as executor:
        future_tasks = [executor.submit(paragraph_info, id) for id in ids]
        # Block until every download has finished before reporting results.
        wait(future_tasks, return_when=ALL_COMPLETED)

    end = time.time()
    print("[All works are done.]")
    print('使用多线程,总共耗时:%s' % (end - start))
    print('总处理了:%s个文件。' % len(ids))
    print('*' * 50)


if __name__ == '__main__':
    # Single-lesson run. Bulk mode — reading uuids from a file and fanning
    # them out via main(ids, thread_num) — existed here but is disabled.
    paragraph_info("f90fea4e-ef01-4e89-b495-346707023bbb")

相关文章

  • 多线程、动态代理爬虫

  • Python实现爬取可用代理IP

    在实现爬虫时,动态设置代理IP可以有效防止反爬虫,但对于普通爬虫初学者需要在代理网站上测试可用代理IP。由于手动测...

  • 2021校招 复习总结

    笔记导航: JAVA: 泛型 反射和动态代理 注解 JAVA多线程 ReentrantLock,Volatile,...

  • Squid配置多代理动态自动转发

    需求是这样的 爬虫需要代理。 代理会过期需要动态更新。 代理有快有慢,有时失效,希望自动过期失效代理。 只前两点的...

  • 面试系列~动态代理实现与原理

    动态代理有JDK动态代理, CGLIB动态代理, SpringAOP动态代理 一,JDK动态代理  jdk动态代理...

  • 编程常用的设计模式

    动态代理和静态代理 静态代理 动态代理 静态代理与动态代理的区别 JDK中的动态代理和CGLIB 实现动态代理的方...

  • Spring的AOP原理分析

    一 动态代理 动态代理分为JDK动态代理和CGLIB动态代理 jdk动态代理 被代理类(目标类)和代理类必须实现同...

  • 大数据学习路线

    一、linux基础(基本掌握) 二、java多线程、并发包下的队列、JMS、JVM、反射和动态代理 学习参...

  • 设计模式之代理模式

    代理分为静态代理和动态代理。 动态代理又包括基于JDK的动态代理、基于CGlib 的动态代理、基于Aspectj实...

  • Java高级主题(五)——动态代理

    代理可以分为静态代理、动态代理,动态代理又可以分为 jvm的动态代理 和 cglib的动态代理。像spring框架...

网友评论

      本文标题:多线程、动态代理爬虫

      本文链接:https://www.haomeiwen.com/subject/lomsfctx.html