美文网首页
python数据分析-豆瓣电影Top250

python数据分析-豆瓣电影Top250

作者: 931180482c82 | 来源:发表于2017-12-30 11:46 被阅读0次

    数据爬取源码

    详细过程大家可以在其他帖子中看到.
    这里采用 Python 2.7 以及标准库 urllib2 和 re 进行爬取.

    # coding=utf-8
    import urllib2
    import re
    import time
    
    
    def get_Request(page):
        """Download one page of the Douban Top 250 list and return its raw body.

        page: value sent as the ``start`` query parameter; the script's main
            loop advances it in steps of 25.

        Returns the response body exactly as read from the connection.
        Raises whatever ``urllib2.urlopen`` raises on a network/HTTP failure,
        including a timeout after 30 seconds without a response.
        """
        url = 'https://movie.douban.com/top250'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36',
            'Host': 'movie.douban.com',
        }
        req = urllib2.Request(url + '?start=' + str(page) + '&filter=', headers=headers)
        # Bounded wait so a stalled connection cannot hang the crawl forever.
        response = urllib2.urlopen(req, timeout=30)
        # urllib2 responses are not context managers on Python 2, so release
        # the connection explicitly even if read() fails.
        try:
            return response.read()
        finally:
            response.close()
    
    
    def get_content(res):
        """Parse one Top 250 list page into a list of movie records.

        res: HTML text of a list page (what ``get_Request`` returns).

        Returns a list of dicts, one per parsed entry, with the string fields
        ``name``, ``other_name``, ``director``, ``year``, ``country``,
        ``type`` and ``score``.  ``other_name`` is ``'0'`` when the entry has
        only one title.  Spaces inside ``country`` and ``type`` are replaced
        by ``/``.

        Entries that do not match every pattern are skipped.  The previous
        implementation re-appended the preceding entry's record in that case
        (or raised when the very first entry failed), which silently
        duplicated rows in the output.
        """
        all_data = []
        # [\s\S] already spans newlines, so no DOTALL flag is required.
        for item in re.findall(r'<div class="item">([\s\S]+?)</div>\s+</li>', res):
            titles = re.findall(r'<span class="title">(.*?)</span>', item)
            director = re.search(r'<p class="">\s+导演:\s+(.*?)&nbsp;&nbsp;&nbsp;', item)
            other = re.search(r'<br>\s+(\d{4,})&nbsp;/&nbsp;(.*?)&nbsp;/&nbsp;(.*?)\s+</p>', item)
            score = re.search(r'<span class="rating_num" property="v:average">(\d\.\d+)</span>', item)
            if not titles or director is None or other is None or score is None:
                # Incomplete or unexpected markup: drop the entry rather than
                # emit a wrong or duplicated record.
                continue
            if len(titles) > 1:
                # Strip the leading separator of the alternate title; commas
                # are replaced because save_data writes comma-separated rows.
                other_name = titles[1].replace('&nbsp;/&nbsp;', '').replace(',', ' ')
            else:
                other_name = '0'
            year, country, genre = other.groups()
            all_data.append(dict(
                name=titles[0],
                other_name=other_name,
                director=director.group(1),
                year=year,
                country=country.replace(' ', '/'),
                type=genre.replace(' ', '/'),
                score=score.group(1),
            ))
        return all_data
    
    
    def save_data(data):
        """Append each record to ``data.txt`` as one comma-separated line and echo it.

        data: iterable of dicts holding string values under the keys
            ``name``, ``other_name``, ``director``, ``year``, ``country``,
            ``type`` and ``score``; the columns are written in that order.

        Raises KeyError if a record lacks one of those keys.
        """
        fields = ['name', 'other_name', 'director', 'year', 'country', 'type', 'score']
        # The with-block closes the file on exit; no explicit close() needed.
        with open('data.txt', 'a') as f:
            for line in data:
                row = ','.join([line[key] for key in fields])
                f.write(row + '\n')
                # A single parenthesised argument prints identically under
                # the Python 2 print statement and the Python 3 function.
                print(row + '\n')
    
    
    def run(page):
        """Fetch, parse and store the list page that starts at offset ``page``."""
        # Download -> extract records -> append them to the output file.
        save_data(get_content(get_Request(page)))
    
    
    if __name__ == "__main__":
        # Crawl offsets 0, 25, ..., 225, pausing half a second after each page.
        for offset in range(0, 250, 25):
            run(page=offset)
            time.sleep(0.5)
        print('finished data crawl')
    
    

    数据分析

    影片类型分析

    这里针对爬取下来的数据集,对类型字段中的"/"进行切割.
    分割前:


    1.jpg1.jpg

    相关文章

      网友评论

          本文标题:python数据分析-豆瓣电影Top250

          本文链接:https://www.haomeiwen.com/subject/btmggxtx.html