美文网首页Python新世界python热爱者
python采集腾讯视频Vip电影,看电影我从来不充会员!

python采集腾讯视频Vip电影,看电影我从来不充会员!

作者: 48e0a32026ae | 来源:发表于2018-11-26 15:25 被阅读12次

用python实现的抓取腾讯视频所有电影的爬虫

学习Python中有不明白推荐加入交流群

                号:516107834

                群里有志同道合的小伙伴,互帮互助,

                群里有不错的学习教程!

# -*- coding: utf-8 -*-

import re

import urllib2

from bs4 import BeautifulSoup

import string, time

import pymongo

NUM = 0 #全局变量,电影数量

m_type = u'' #全局变量,电影类型

m_site = u'qq' #全局变量,电影网站

#根据指定的URL获取网页内容

def gethtml(url):
    """Fetch *url* and return the raw response body as a byte string.

    The original never closed the response object; the try/finally
    guarantees it is released even when read() raises.
    """
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        response.close()

#从电影分类列表页面获取电影分类

def gettags(html):
    """Extract the movie-category links from the category list page.

    Returns a dict mapping category title -> category URL.  As in the
    original, the global ``m_type`` is left set to the last category seen.
    """
    global m_type
    soup = BeautifulSoup(html)  # filter out the category markup
    tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})

    # NOTE(review): the original regex literal was destroyed when this
    # snippet was scraped (it survived only as r'.+?', which has no capture
    # groups, so tag[0]/tag[1] below indexed single characters).  The
    # pattern here captures the href/title pairs of the category anchors and
    # MUST be re-verified against the live page markup.
    re_tags = r'<a.+?href="(.+?)".+?title="(.+?)".*?>'
    p = re.compile(re_tags, re.DOTALL)

    # Initialised up-front: the original only assigned tags_url inside the
    # success branch, so the final `return tags_url` raised NameError when
    # nothing matched.
    tags_url = {}
    if not tags_all:
        print("Not Find")
        return tags_url

    tags = p.findall(str(tags_all[0]))
    if tags:
        for tag in tags:
            tag_url = tag[0].decode('utf-8')
            m_type = tag[1].decode('utf-8')
            tags_url[m_type] = tag_url
    else:
        print("Not Find")
    return tags_url

    #获取每个分类的页数

    def get_pages(tag_url):

    tag_html = gethtml(tag_url)

    #div class="paginator

    soup = BeautifulSoup(tag_html) #过滤出标记页面的html

    #print soup

    #

    div_page = soup.find_all('div', {'class' : 'mod_pagenav', 'id' : 'pager'})

    #print div_page #len(div_page), div_page[0]

    #25

    re_pages = r'(.+?)'

    p = re.compile(re_pages, re.DOTALL)

    pages = p.findall(str(div_page[0]))

    #print pages

    if len(pages) > 1:

    return pages[-2]

    else:

    return 1

    def getmovielist(html):

    soup = BeautifulSoup(html)

    #

      divs = soup.find_all('ul', {'class' : 'mod_list_pic_130'})

      #print divs

      for div_html in divs:

      div_html = str(div_html).replace('', '')

      #print div_html

      getmovie(div_html)

      def getmovie(html):

      global NUM

      global m_type

      global m_site

      re_movie = r'

    • '
    • p = re.compile(re_movie, re.DOTALL)

      movies = p.findall(html)

      if movies:

      conn = pymongo.Connection('localhost', 27017)

      movie_db = conn.dianying

      playlinks = movie_db.playlinks

      #print movies

      for movie in movies:

      #print movie

      NUM += 1

      print "%s : %d" % ("=" * 70, NUM)

      values = dict(

      movie_title = movie[1],

      movie_url = movie[0],

      movie_site = m_site,

      movie_type = m_type

      )

      print values

      playlinks.insert(values)

      print "_" * 70

      NUM += 1

      print "%s : %d" % ("=" * 70, NUM)

      #else:

      # print "Not Find"

      def getmovieinfo(url):

      html = gethtml(url)

      soup = BeautifulSoup(html)

      #pack pack_album album_cover

      divs = soup.find_all('div', {'class' : 'pack pack_album album_cover'})

      #print divs[0]

      #

      re_info = r''

      p_info = re.compile(re_info, re.DOTALL)

      m_info = p_info.findall(str(divs[0]))

      if m_info:

      return m_info

      else:

      print "Not find movie info"

      return m_info

      def insertdb(movieinfo):

      global conn

      movie_db = conn.dianying_at

      movies = movie_db.movies

      movies.insert(movieinfo)

      if __name__ == "__main__":

      global conn

      tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"

      #print tags_url

      tags_html = gethtml(tags_url)

      #print tags_html

      tag_urls = gettags(tags_html)

      #print tag_urls

      for url in tag_urls.items():

      print str(url[1]).encode('utf-8') #,url[0]

      maxpage = int(get_pages(str(url[1]).encode('utf-8')))

      print maxpage

      for x in range(0, maxpage):

      #http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html

      m_url = str(url[1]).replace('0_20_0_-1_0.html', '')

      movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)

      print movie_url

      movie_html = gethtml(movie_url.encode('utf-8'))

      #print movie_html

      getmovielist(movie_html)

      time.sleep(0.1)

      最后,如果有想一起学习python,web、爬虫,可以一起交流!

      相关文章

      网友评论

  本文标题:python采集腾讯视频Vip电影,看电影我从来不充会员!

        本文链接:https://www.haomeiwen.com/subject/peybqqtx.html