# -*- coding: UTF-8 -*-
import json
import os
import re
import time
import shutil
import logging
import traceback
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
import configparser
# logging.basicConfig(filename='log.log')

# Abuyun HTTP tunnel proxy, read from the [abuyun] section of config.ini.
class abuyun():
    config = configparser.ConfigParser()
    config.read("./config.ini")
    conf_abuyun = config["abuyun"]
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": conf_abuyun["proxyHost"],
        "port": conf_abuyun["proxyPort"],
        "user": conf_abuyun["proxyUser"],
        "pass": conf_abuyun["proxyPass"],
    }
    proxies = {
        "http": proxyMeta,
        "https": proxyMeta,
    }
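
# A minimal config.ini sketch for the class above -- the section and key names
# come from the code; the values here are placeholders, not real credentials:
#
#   [abuyun]
#   proxyHost = http-dyn.abuyun.com
#   proxyPort = 9020
#   proxyUser = YOUR_PROXY_USER
#   proxyPass = YOUR_PROXY_PASS
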
def paragraph_info(id, order=0):
    try:
        artic = {}
        artic['uuid'] = id
        header = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
        url = "https://dict.eudic.net/webting/play?id={}&order={}".format(id, order)
        print(url)
        response = requests.get(url, headers=header, proxies=abuyun().proxies)
        soup = BeautifulSoup(response.content, "html.parser")
        article = soup.select_one("#article")
        # print(article)
        if not article:
            article = soup.select_one("div.article")
        sentence = article.select("span.sentence[data-starttime]")
        # The translation JSON and the audio URL are embedded in inline
        # scripts rather than in the HTML itself.
        scripts = soup.select("script[type='text/javascript']")
        script = ""
        for s in scripts:
            script += s.get_text()
        # print(script)
        # Extract the translation array; pages without a translation block
        # fall back to an empty list instead of crashing on a failed match.
        translate_match = re.search('{"translation":.*,"wordhints"', script)
        if translate_match:
            start, end = translate_match.span()
            # Strip the surrounding '{"translation":[' and '],"wordhints"'.
            translate = script[start + 16:end - 13]
            translate = [t + "}" for t in translate.split("},")]
            translate[-1] = translate[-1][:-1]
        else:
            translate = []
        artic['sentence'] = []
        # Fallback order index for sentences whose Chinese translation spans
        # more than one English sentence (see the timestamp checks below).
        order_num = 0
        for index, sen in enumerate(sentence):
            temp = {}
            en = sen.get_text()
            # The last sentence of the audio has no end timestamp; use a sentinel.
            if sen["data-endtime"] == '':
                timestamps = [sen["data-starttime"], '99:99:99.99']
            else:
                timestamps = [sen["data-starttime"], sen["data-endtime"]]
            temp['order'] = index
            temp['timestamps'] = timestamps
            temp['english'] = en
            temp['chinese'] = ""
            if translate and translate != ['']:
                cn = json.loads(translate[index])
                cn_timestamps = cn["timestamps"][1:-1].split("],[")
                temp['chinese'] = cn['text']
                # Normalize the two differing timestamp formats by dropping the
                # leading hour field; only about 20 uuids have this mismatch.
                for t in range(2):
                    timestamps[t] = timestamps[t][3:]
                # print(timestamps, cn_timestamps)
                # Handle the case where two English sentences map to a single
                # Chinese sentence: duplicate the translation entry so the
                # indices stay aligned, and reuse the same order number.
                if timestamps[0] == cn_timestamps[0] and timestamps[1] == cn_timestamps[1]:
                    # print("equal!")
                    pass
                elif timestamps[0] == cn_timestamps[0] and timestamps[1] < cn_timestamps[1]:
                    translate.insert(index, translate[index])
                    order_num = index
                    temp['order'] = order_num
                elif timestamps[0] > cn_timestamps[0] and timestamps[1] < cn_timestamps[1]:
                    translate.insert(index, translate[index])
                    # order_num = order_num - 1
                    temp['order'] = order_num
                elif timestamps[0] > cn_timestamps[0] and timestamps[1] == cn_timestamps[1]:
                    # order_num = order_num - 1
                    if cn_timestamps[0] == '00:00.00':
                        order_num = 0
                    temp['order'] = order_num
                # print(temp)
                # print("")
            else:
                temp['chinese'] = ""
            artic['sentence'].append(temp)
        video = soup.select("video.video")
        if video:
            # A few lessons ship an MP4 video instead of an MP3 track.
            mp4_url = video[0]['src']
            # print(mp4_url)
            req = requests.get(mp4_url)
            # Save under ./audios in the current working directory.
            save_path = os.path.abspath(os.curdir) + "/audios/" + id + ".mp4"
            with open(save_path, 'wb') as code:
                code.write(req.content)
            print(id + ".mp4 [Download successful!]")
        else:
            # The MP3 URL is the argument passed to initPlayPage(...) in the script.
            mp3 = re.search('initPlayPage(.*)', script)
            start, end = mp3.span()
            mp3_url = script[start + 14:end - 4]
            # print(mp3_url)
            req = requests.get(mp3_url)
            # Save under ./audios in the current working directory.
            save_path = os.path.abspath(os.curdir) + "/audios/" + id + ".mp3"
            with open(save_path, 'wb') as code:
                code.write(req.content)
            print(id + ".mp3 [Download successful!]")
        artic['audio_path'] = save_path
        lesson_file = os.path.abspath(os.curdir) + "/lesson_info"
        with open(lesson_file, 'a', encoding='utf8') as writer:
            writer.write(str(artic) + "\n")
    except OSError:
        # Network or file-system error: retry the whole lesson.
        paragraph_info(id)
    except UnboundLocalError:
        # Unbound_info is assumed to be defined elsewhere; it is not in this file.
        Unbound_info(id)
    except Exception:
        logging.error("lesson uuid: " + id)
        e = traceback.format_exc()
        logging.error(e)
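
# A minimal sketch, not part of the original scraper: lesson_info lines are
# written with str(artic) above, so ast.literal_eval (rather than json.loads)
# is the safe way to parse them back into dicts.
import ast

def load_lessons(path="lesson_info"):
    """Yield one article dict per line of the lesson_info file."""
    with open(path, encoding='utf8') as reader:
        for line in reader:
            line = line.strip()
            if line:
                yield ast.literal_eval(line)
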
def main(ids, thread_num=5):
    # Time the whole crawl.
    print('*' * 50)
    start = time.time()
    log_file = os.path.abspath(os.curdir) + "/log.log"
    if os.path.exists(log_file):
        os.remove(log_file)
    logging.basicConfig(filename='log.log')
    mp3_path = os.path.abspath(os.curdir) + "/audios"
    if os.path.exists(mp3_path):
        shutil.rmtree(mp3_path)
    os.mkdir(mp3_path)
    lesson_file = os.path.abspath(os.curdir) + "/lesson_info"
    if os.path.exists(lesson_file):
        os.remove(lesson_file)
    # lesson_info = open(lesson_file, "w", encoding='utf8')
    # Download concurrently; adjust max_workers (the thread count) as needed.
    executor = ThreadPoolExecutor(max_workers=thread_num)
    # submit() takes the function first, then any number of arguments for it.
    # ids = ["e27a4e2d-f7dc-442a-937e-6b10691275e5", "520ae730-20d2-11e6-bcc9-000c29ffef9b",
    #        "c01daf43-3c4d-11e7-866e-000c29ffef9b", "1faa27d9-f6f7-11e8-a44b-000c29ffef9b"]
    future_tasks = [executor.submit(paragraph_info, id) for id in ids]
    # Block until every worker has finished before reporting.
    wait(future_tasks, return_when=ALL_COMPLETED)
    end = time.time()
    print("[All works are done.]")
    print('With multithreading, total time elapsed: %s' % (end - start))
    print('Processed %s files in total.' % len(ids))
    print('*' * 50)
if __name__ == '__main__':
    # ids = ["36222876-6bb2-11e7-93a3-e954ea2f9385"]
    # uuid_file = open("lesson_uuid", "r")
    # uuid_file = open("undone_uuid", "r")
    #
    # reader = uuid_file.readlines()
    # ids = []
    # for uuid in reader:
    #     ids.append(uuid[:-1])
    # uuid_file.close()
    # # ids = ids[::501]
    # main(ids, 5)
    # paragraph_info("e27a4e2d-f7dc-442a-937e-6b10691275e5")
    paragraph_info("f90fea4e-ef01-4e89-b495-346707023bbb")