
从"纪录片天地"获取所有电影的百度网盘链接

Author: 941疯子 | Published 2018-01-18 11:26

1. Goal

I recently wanted to find some documentaries to watch and came across 纪录片天地 ([http://www.jlpcn.net](http://www.jlpcn.net/)) through an article on Zhihu. The site's library is very rich, and after watching a few films I started thinking about downloading them in bulk.

A look at the page source showed that the film links, categories, descriptions and so on follow a fairly regular structure, which makes the site a good candidate for scraping with a script.

2. Approach

2.1 Fetching page content with requests

def getSoup(self, url):
        '''Fetch the page source with requests and hand it to BeautifulSoup'''
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        }
        try:
            r = requests.get(url, headers=headers)
            r.encoding = 'utf-8'
            soup = bsp(r.text, "html.parser")
            return soup
        except Exception as identifier:
            print('getSoup ex:\n%s' % traceback.format_exc())
            return None
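
A single requests.get like this can fail intermittently on a flaky connection. As a small hardening step, the fetch could be wrapped in a retry helper; the sketch below is my own addition (not part of the original script) and uses nothing beyond requests itself:

import time

import requests


def get_with_retry(url, headers=None, attempts=3, delay=2):
    '''Try the request a few times before giving up.'''
    for i in range(attempts):
        try:
            r = requests.get(url, headers=headers, timeout=10)
            r.encoding = 'utf-8'
            return r
        except requests.RequestException:
            if i == attempts - 1:
                raise
            time.sleep(delay)

getSoup could then call get_with_retry in place of the bare requests.get.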

2.2 Extracting the summary information

Take http://www.jlpcn.net/vodtypehtml/1.html as an example: it is the first of 23 pages in the "内容分类" -> "科普" (popular science) category.

First locate the div element with class='pages' via BeautifulSoup, then pull the total page count out of it with a regular expression.
# Work out how many pages the current category has
    def findAllLinks(self, url):
        '''
        url is the first page of a documentary category, e.g.
        http://www.jlpcn.net/vodtypehtml/1.html
        '''
        links = []
        links.append(url)
        soup = self.getSoup(url)
        if not soup:
            return None
        index1 = url.rfind('.')
        base1 = url[0:index1]
        div = soup.find('div', attrs={"class": "pages"})
        if div:
            pagestr = re.findall(r'当前:1/(.+?)页', div.text)
            if len(pagestr) > 0:
                try:
                    page_cnt = int(pagestr[0])
                    for x in range(2, page_cnt + 1):
                        url_t = "{0}-{1}.html".format(base1, x)
                        links.append(url_t)
                except Exception as ex:
                    traceback.print_exc()
        return links
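
For the 科普 category above, which has 23 pages, the returned list would look like the following. This assumes the methods shown here are already wrapped in the SearchMoviesBaiduyun class from section 3:

search = SearchMoviesBaiduyun()
links = search.findAllLinks('http://www.jlpcn.net/vodtypehtml/1.html')
# links ->
# ['http://www.jlpcn.net/vodtypehtml/1.html',
#  'http://www.jlpcn.net/vodtypehtml/1-2.html',
#  ...
#  'http://www.jlpcn.net/vodtypehtml/1-23.html']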

With the total page count in hand, each page can then be parsed for the summary information of the films it lists.



The code is as follows:

# Collect the summary information of the documentaries listed on one category page
    def findMovies(self, url):
        resultList = []
        soup = self.getSoup(url)
        if not soup:
            return None
        # print(soup.encode_contents())
        li_list = soup.find_all('li', attrs={"class": "mov"})
        for li in li_list:
            imgbox = li.find('img', attrs={"class": "scrollLoading"})
            if imgbox:
                minfo = models.movie_summary()
                minfo.img_url = imgbox["data-url"]
                a_pic = li.find('a', attrs={"class": "pic"})
                if a_pic:
                    minfo.href = a_pic["href"]
                    minfo.title = a_pic["title"]
                    minfo.title = minfo.title.replace(' ', '')
                r1 = li.find('div', attrs={"class": "r1"})
                minfo.update_time = r1.string[5:]
                r3 = li.find_all('div', attrs={"class": "r3"})
                if r3 and len(r3) > 0:
                    for r in r3:
                        if not r.string:
                            continue
                        if "内容分类" in r.string:
                            minfo.content_category = r.string[5:]
                        elif "频道分类" in r.string:
                            minfo.channel_category = r.string[5:]
                        elif "语言字幕" in r.string:
                            minfo.subtitles = r.string[5:]
                        elif "最后更新" in r.string:
                            minfo.last_update_time = r.string[5:]
                r5 = li.find('div', attrs={"class": "r5"})
                if r5 and r5.string:
                    # the r5 div holds the synopsis; the update time was already read from r3 above
                    minfo.summary = r5.string[5:]
                print("http://www.jlpcn.net" + minfo.href, minfo.title)
                resultList.append(minfo)
        print(len(li_list))
        return resultList
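
A quick way to eyeball the parsed summaries for one page, reusing the search instance from the sketch above:

movies = search.findMovies('http://www.jlpcn.net/vodtypehtml/1.html') or []
for m in movies[:3]:
    # title, category and the absolute detail-page URL
    print(m.title, m.content_category, "http://www.jlpcn.net" + m.href)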

2.3 Going one level deeper for the details

Take http://www.jlpcn.net/vodhtml/3308.html as an example and open the page.

Several buttons on the page link straight to the documentary's Baidu Pan shares; inspecting the page source shows where those hyperlinks sit.
The code is as follows:
# Get a documentary's detail information
    def findMovieDetail(self, url):
        resultList = []
        soup = self.getSoup(url)
        if not soup:
            return None

        down_list_2 = soup.find('div', attrs={"id": "down_list_2"})
        if down_list_2:
            scripts = down_list_2.find_all('script')
            if len(scripts) > 0:
                for script in scripts:
                    print(script.string)

        div_list = soup.find_all('div', attrs={"class": "wanpan"})
        for div in div_list:
            a_bd = div.find('a')
            href = a_bd["href"]
            text = a_bd.string
            if not text:
                text = ','.join(a_bd.strings)
            text = text.replace(' ', '')
            # print(href, text)
            detail = models.movie_detail()
            detail.cur_url = url
            detail.title = text
            detail.href = href

            resultList.append(detail)
        # last_url = resultList[-1].href
        # r = requests.get(last_url)
        # print(r.text)
        return resultList
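
Not every anchor under div class="wanpan" is necessarily a Baidu Pan share (the final script later filters for pan.baidu.com and yun.baidu.com), so a small filter over the returned list is handy. Again a sketch reusing the search instance from above:

details = search.findMovieDetail('http://www.jlpcn.net/vodhtml/3308.html') or []
pan_links = [d for d in details if 'pan.baidu.com' in d.href or 'yun.baidu.com' in d.href]
for d in pan_links:
    print(d.title, d.href)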

That covers most of the analysis; what remains is simply storing the information.

3. Full code

# encoding:utf-8
__author__ = "liujinquan"
__date__ = "2018/1/16"

import os
import re
import threading
import traceback
import uuid

import requests
from bs4 import BeautifulSoup as bsp
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

import models


# Crawl documentary details from http://www.jlpcn.net/ and store them in a sqlite database
# Read the sqlite database back and download the entries whose links contain pan.baidu.com with the you-get tool
class SearchMoviesBaiduyun(object):
    def __init__(self):
        super(SearchMoviesBaiduyun, self).__init__()
        self.dbpath = r'sqlite:///F:\liujinquan\python\down_movie\movies.db'
        engine = create_engine(self.dbpath)
        self.Session = sessionmaker(bind=engine)

    def getSoup(self, url):
        '''Fetch the page source with requests and hand it to BeautifulSoup'''
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        }
        try:
            r = requests.get(url, headers=headers)
            r.encoding = 'utf-8'
            soup = bsp(r.text, "html.parser")
            return soup
        except Exception as identifier:
            print('getSoup ex:\n%s' % traceback.format_exc())
            return None

    # Work out how many pages the current category has
    def findAllLinks(self, url):
        '''
        url is the first page of a documentary category, e.g.
        http://www.jlpcn.net/vodtypehtml/1.html
        '''
        links = []
        links.append(url)
        soup = self.getSoup(url)
        if not soup:
            return None
        index1 = url.rfind('.')
        base1 = url[0:index1]
        div = soup.find('div', attrs={"class": "pages"})
        if div:
            pagestr = re.findall(r'当前:1/(.+?)页', div.text)
            if len(pagestr) > 0:
                try:
                    page_cnt = int(pagestr[0])
                    for x in range(2, page_cnt + 1):
                        url_t = "{0}-{1}.html".format(base1, x)
                        links.append(url_t)
                except Exception as ex:
                    traceback.print_exc()
        return links

    # Collect the summary information of the documentaries listed on one category page
    def findMovies(self, url):
        resultList = []
        soup = self.getSoup(url)
        if not soup:
            return None
        # print(soup.encode_contents())
        li_list = soup.find_all('li', attrs={"class": "mov"})
        for li in li_list:
            imgbox = li.find('img', attrs={"class": "scrollLoading"})
            if imgbox:
                minfo = models.movie_summary()
                minfo.img_url = imgbox["data-url"]
                a_pic = li.find('a', attrs={"class": "pic"})
                if a_pic:
                    minfo.href = a_pic["href"]
                    minfo.title = a_pic["title"]
                    minfo.title = minfo.title.replace(' ', '')
                r1 = li.find('div', attrs={"class": "r1"})
                minfo.update_time = r1.string[5:]
                r3 = li.find_all('div', attrs={"class": "r3"})
                if r3 and len(r3) > 0:
                    for r in r3:
                        if not r.string:
                            continue
                        if "内容分类" in r.string:
                            minfo.content_category = r.string[5:]
                        elif "频道分类" in r.string:
                            minfo.channel_category = r.string[5:]
                        elif "语言字幕" in r.string:
                            minfo.subtitles = r.string[5:]
                        elif "最后更新" in r.string:
                            minfo.last_update_time = r.string[5:]
                r5 = li.find('div', attrs={"class": "r5"})
                if r5 and r5.string:
                    # the r5 div holds the synopsis; the update time was already read from r3 above
                    minfo.summary = r5.string[5:]
                print("http://www.jlpcn.net" + minfo.href, minfo.title)
                resultList.append(minfo)
        print(len(li_list))
        return resultList

    # Get a documentary's detail information
    def findMovieDetail(self, url):
        resultList = []
        soup = self.getSoup(url)
        if not soup:
            return None

        down_list_2 = soup.find('div', attrs={"id": "down_list_2"})
        if down_list_2:
            scripts = down_list_2.find_all('script')
            if len(scripts) > 0:
                for script in scripts:
                    print(script.string)

        div_list = soup.find_all('div', attrs={"class": "wanpan"})
        for div in div_list:
            a_bd = div.find('a')
            href = a_bd["href"]
            text = a_bd.string
            if not text:
                text = ','.join(a_bd.strings)
            text = text.replace(' ', '')
            # print(href, text)
            detail = models.movie_detail()
            detail.cur_url = url
            detail.title = text
            detail.href = href

            resultList.append(detail)
        # last_url = resultList[-1].href
        # r = requests.get(last_url)
        # print(r.text)
        return resultList

    # Crawl the summaries and details of every documentary in one category and store them in the database
    def searchAllLinks(self, url1):
        base_url = "http://www.jlpcn.net/"
        results = []
        links = self.findAllLinks(url1)
        if len(links) > 0:
            for url in links:
                print("searching -> {0}".format(url))
                movies = self.findMovies(url)
                if len(movies) > 0:
                    for m in movies:
                        self.saveToSummaryTable(
                            self.convertToMovieSummary(base_url, m))
                        url_d = base_url + m.href
                        # print(url_d)
                        details = self.findMovieDetail(url_d)
                        if len(details) > 0:
                            for d in details:
                                # if "pan.baidu.com" in d.href:
                                soup1 = self.getSoup(d.href)
                                if not soup1:
                                    continue
                                title1 = soup1.title.string
                                d.video_name = m.title.replace(
                                    ' ', ''
                                ) + "_" + d.title + self.getMovieType(title1)
                                self.saveToDetailTable(
                                    self.convertToMovieDetail(d))
                                print(d.href, title1, d.video_name)
                                results.append(d)
        # for r in results:
        #     print(r.href, r.title, r.cur_url)
        # print("result len: {0}".format(len(results)))
        # list_url = [x.href for x in results]
        # moveToBaiduYun(list_url)
        # s2 = json.dumps(
        #     results,
        #     default=lambda obj: obj.__dict__,
        #     sort_keys=True,
        #     indent=None,
        #     ensure_ascii=False)
        # print(s2)
        return results

    def getMovieType(self, title):
        if ".avi" in title:
            return ".avi"
        elif ".mp4" in title:
            return ".mp4"
        elif ".rmvb" in title:
            return ".rmvb"
        elif ".mkv" in title:
            return ".mkv"
        elif ".ts" in title:
            return ".ts"
        else:
            return ".avi"

    def saveToDetailTable(self, detail):
        try:
            if isinstance(detail, models.MovieDetail):
                session = self.Session()
                detail.md_id = str(uuid.uuid1())
                session.add(detail)
                session.commit()
                session.close()
        except Exception as identifier:
            print('saveToDetailTable ex:\n%s' % traceback.format_exc())

    def saveToSummaryTable(self, summary):
        try:
            if isinstance(summary, models.MovieSummary):
                session = self.Session()
                summary.m_id = str(uuid.uuid1())
                session.add(summary)
                session.commit()
                session.close()
        except Exception as identifier:
            print('saveToSummaryTable ex:\n%s' % traceback.format_exc())

    def convertToMovieSummary(self, base_url, movie):
        md = models.MovieSummary()
        md.title = movie.title
        md.href = base_url + movie.href
        md.img_url = base_url + movie.img_url
        md.update_time = movie.update_time
        md.content_category = movie.content_category
        md.channel_category = movie.channel_category
        md.subtitles = movie.subtitles
        md.last_update_time = movie.last_update_time
        md.summary = movie.summary
        return md

    def convertToMovieDetail(self, detail):
        d = models.MovieDetail()
        d.cur_url = detail.cur_url
        d.title = detail.title
        d.href = detail.href
        d.video_name = detail.video_name
        return d


if __name__ == '__main__':
    search = SearchMoviesBaiduyun()
    types = [
        32, 20, 29, 31, 36, 30, 28, 27, 24, 19, 25, 39, 38, 22, 21, 37, 40, 23,
        33, 34, 35, 26, 46, 47, 44, 41, 42, 45
    ]
    for t in types:
        url1 = r'http://www.jlpcn.net/vodtypehtml/{0}.html'.format(t)
        search.searchAllLinks(url1)

There are also the corresponding model classes:

# coding: utf-8
from sqlalchemy import Column, Text, text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()
metadata = Base.metadata


class MovieDetail(Base):
    __tablename__ = 'movie_detail'

    md_id = Column(Text(36), primary_key=True)
    cur_url = Column(Text(256))
    title = Column(Text(128))
    href = Column(Text(512))
    video_name = Column(Text(128))
    is_downloaded = Column(Text(3), server_default=text("'0'"))
    down_time = Column(Text(32))


class MovieSummary(Base):
    __tablename__ = 'movie_summary'

    m_id = Column(Text(36), primary_key=True)
    title = Column(Text(50))
    href = Column(Text(255))
    img_url = Column(Text(255))
    update_time = Column(Text(32))
    content_category = Column(Text(128))
    channel_category = Column(Text(128))
    subtitles = Column(Text(512))
    last_update_time = Column(Text(32))
    summary = Column(Text(512))


# Plain entity classes used by the scraping code
class movie_summary(object):
    def __init__(self):
        super(movie_summary, self).__init__()
        self.title = ""
        self.href = ""
        self.img_url = ""
        self.update_time = ""
        self.content_category = ""
        self.channel_category = ""
        self.subtitles = ""
        self.last_update_time = ""
        self.summary = ""


class movie_detail(object):
    def __init__(self):
        super(movie_detail, self).__init__()
        self.cur_url = ""
        self.title = ""
        self.href = ""
        self.video_name = ""


Update: 2018/1/18, afternoon

The code so far only collects each film's Baidu Pan link and stores it in the sqlite database; pasting a link into the browser is enough to watch the film, but actually downloading it to disk through these links is harder. The detail pages do offer Thunder (迅雷) downloads as well, and magnet links can be found in the page source.
Parsing and saving those magnet or ed2k links and then downloading them with aria2 or Thunder is another workable option.

# Get a documentary's detail information (revised to also pull the magnet links)
    def findMovieDetail(self, url):
        resultList = []
        soup = self.getSoup(url)
        if not soup:
            return None
        down_list_2 = soup.find(id="down_list_2")
        if down_list_2:
            # print(down_list_2)
            scripts = down_list_2.find_all(
                'script', text=re.compile(r'ThunderEncode'))
            # print(len(scripts))
            if len(scripts) > 0:
                for script in scripts:
                    s = str(script.string)
                    # find the magnet link
                    flag1 = r'ThunderEncode("'
                    index1 = s.index(flag1) + len(flag1)
                    index2 = s.index(r'"', index1)
                    href_str = s[index1:index2]
                    # find the title
                    flag2 = r'file_name="'
                    index3 = s.index(flag2) + len(flag2)
                    index4 = s.index(r'"', index3)
                    title_str = s[index3:index4]
                    # cache it in the result list
                    detail = models.movie_detail()
                    detail.cur_url = url
                    detail.title = title_str.replace(' ', '')
                    detail.href = href_str
                    resultList.append(detail)
        return resultList
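
On the download side, aria2 accepts magnet links directly on the command line, while ed2k links are outside its scope and would still need Thunder or an eMule-style client. A sketch of my own for handing the extracted magnet hrefs to aria2c, with the output directory as a placeholder:

import subprocess


def download_with_aria2(links, out_dir='downloads'):
    '''Hand each magnet link to the aria2c command-line tool.'''
    for href in links:
        if href.startswith('magnet:'):
            # -d sets aria2c's download directory
            subprocess.run(['aria2c', '-d', out_dir, href])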

Update: 2018/2/5, afternoon

With the resources collected, the next step was to download them to my own disk.
The code above yields two kinds of resources, Baidu Pan links and magnet links. The magnet links are comparatively few, most of them have expired, and even Thunder cannot download them. For the Baidu Pan links I tried tools such as you-get and aria2c, with unsatisfying results: either the speed was throttled or the download failed outright.
In the end the plan became: first save the shares into my own Baidu Pan account, then batch-download from there. There is an article online about transferring shared Baidu Pan links to your own account programmatically, but I could not get it to work, so I fell back on selenium.

# encoding:utf-8
__author__ = "liujinquan"
__date__ = "2018/1/28"

import datetime
import json
import logging
import os
import re
import threading
import time
import traceback
import urllib.parse
import uuid

import requests
from bs4 import BeautifulSoup as bsp
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

import models
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

chromedriver = "C:/Users/Administrator/AppData/Local/Google/Chrome/Application/chromedriver.exe"
os.environ["webdriver.chrome.driver"] = chromedriver

profile_dir = r"C:\Users\Administrator\AppData\Local\Mozilla\Firefox\Profiles\cqupe01d.default"
profile = webdriver.FirefoxProfile(profile_dir)
driver = webdriver.Firefox(profile)

# from selenium import webdriver
# from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# dcap = dict(DesiredCapabilities.PHANTOMJS)  # set the userAgent
# dcap["phantomjs.page.settings.userAgent"] = (
#     "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
# )


class MoveToBaiduYun(object):
    def __init__(self):
        super(MoveToBaiduYun, self).__init__()
        self.dbpath = r'sqlite:///F:\liujinquan\python\down_movie\movies.db'
        self.engine = create_engine(self.dbpath)
        self.Session = sessionmaker(bind=self.engine)

    def getAllBaiduLinks(self):
        try:
            session = self.Session()
            links = session.query(models.MovieDetail.href).all()
            session.commit()
            session.close()
            print(len(links))
            return list(links)
        except Exception as identifier:
            print('getAllBaiduLinks ex:\n%s' % traceback.format_exc())
            return None

    def moveToBaiduYun(self, list_url):
        # url = 'https://pan.baidu.com/s/1o8ID1hC'
        # Chrome, PhantomJS, etc. would also work here; if the driver is not on PATH its location has to be given explicitly
        # options = webdriver.ChromeOptions()
        # options.add_argument(
        #     "--user-data-dir=" +
        #     r"C:/Users/Administrator/AppData/Local/Google/Chrome/User Data")
        # driver = webdriver.Chrome(
        #     executable_path=chromedriver, options=options)
        # driver.maximize_window()

        # driver = webdriver.PhantomJS(
        #     executable_path='C:\Python\Python36\Scripts\phantomjs.exe',
        #     desired_capabilities=dcap)  # load the page
        # driver.maximize_window()

        profile = webdriver.FirefoxProfile(profile_dir)
        driver = webdriver.Firefox(profile)
        driver.maximize_window()

        for url in list_url:
            driver.get(url)
            print('开始登录:' + url)
            try:
                save_to_pans = driver.find_element_by_class_name(
                    "bar").find_elements_by_css_selector(
                        "[class='g-button g-button-blue']")
                print(len(save_to_pans))
                for tag in save_to_pans:
                    print(tag.text)
                    time.sleep(1)
                    if tag.get_attribute("data-button-id") == u'b1':
                        print("find target.")
                        time.sleep(1)
                        tag.click()
                        # for x in range(1, 10):
                        #     time.sleep(1)
                        #     tag.click()
                        time.sleep(1)
                        driver.switch_to_default_content()
                        save_buttons = driver.find_element_by_id(
                            "fileTreeDialog").find_element_by_css_selector(
                                "[class='dialog-footer g-clearfix']"
                            ).find_elements_by_css_selector(
                                "[class='g-button g-button-blue-large']")
                        print(len(save_buttons))
                        for btn in save_buttons:
                            if btn.get_attribute("data-button-id") == u'b13':
                                print("find target again!")
                                time.sleep(1)
                                btn.click()
                        break
                time.sleep(3)
            except Exception as identifier:
                logging.error('down_movies ex:\n%s' % traceback.format_exc())
        return driver.get_cookies()

    def moveToBaiduYun_OldUrl(self, list_url):
        profile = webdriver.FirefoxProfile(profile_dir)
        driver = webdriver.Firefox(profile)
        driver.maximize_window()

        for url in list_url:
            driver.get(url)
            print('开始登录:' + url)
            try:
                # save_to_pans = driver.find_element_by_class_name(
                #     "bar").find_elements_by_css_selector(
                #         "[class='g-button g-button-blue']")
                save_to_pans = driver.find_element_by_id('emphsizeButton')
                if save_to_pans:
                    print("find target")
                    print(save_to_pans.text)
                    time.sleep(0.5)
                    save_to_pans.click()
                    time.sleep(0.5)

                    driver.switch_to_default_content()
                    save_buttons = driver.find_element_by_id('_disk_id_8')
                    if save_buttons:
                        print("find target again!")
                        time.sleep(0.5)
                        save_buttons.click()

                time.sleep(3)

            except Exception as identifier:
                logging.error('down_movies ex:\n%s' % traceback.format_exc())
        return driver.get_cookies()


if __name__ == '__main__':
    move = MoveToBaiduYun()
    links = move.getAllBaiduLinks()
    print(links[0], links[1])
    # links = [x[0] for x in links if 'pan.baidu.com' in x[0]]
    # print(len(links))
    # move.moveToBaiduYun(links)
    links = [x[0] for x in links if 'yun.baidu.com' in x[0]]
    print(len(links))
    move.moveToBaiduYun_OldUrl(links)
