美文网首页
豆瓣电影Top250数据分析

豆瓣电影Top250数据分析

作者: 北海鲸落 | 来源:发表于2020-06-01 09:09 被阅读0次

    一、requirements

    beautifulsoup4==4.9.1
    bs4==0.0.1
    click==7.1.2
    cycler==0.10.0
    Flask==1.1.2
    itsdangerous==1.1.0
    jieba==0.42.1
    Jinja2==2.11.2
    kiwisolver==1.2.0
    MarkupSafe==1.1.1
    matplotlib==3.2.1
    numpy==1.18.4
    Pillow==7.1.2
    pyparsing==2.4.7
    python-dateutil==2.8.1
    six==1.15.0
    soupsieve==2.0.1
    Werkzeug==1.0.1
    wordcloud @ file:python_reptile/flask/static/extend/wordcloud-1.7.0-cp36-cp36m-win32.whl
    xlwt==1.3.0
    

    二、获取并存储数据

    爬取豆瓣TOP250数据,并存储到数据库

    步骤:

    1. 定义爬取地址

    2. 获取URL的数据列表

      通过User-Agent,得到指定一个URL的网页内容

    3. 存储到sqlite数据库(数据库名:movie.db,表名:movie250)

    # -*- coding:utf-8 -*-
     
    # date: 2020-5-10
    # author: jingluo
    import sys
    from bs4 import BeautifulSoup
    import sqlite3
    import re
    import urllib.request, urllib.error
    import xlwt
    
    # Search rules: regular expressions applied to the HTML text of one movie item.
    # Captures the href value of an <a href="..."> tag.
    findLink = re.compile(r'<a href="(.*?)">')
    findImageSrc = re.compile(r'<img.*src="(.*?)"', re.S) # re.S makes "." match newline characters as well
    # Captures the text inside each <span class="title"> element.
    findTitle = re.compile(r'<span class="title">(.*)</span>')
    # Captures the text of the rating_num span.
    findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
    # Captures the digits that precede "人评价" inside a plain <span>.
    findJudge = re.compile(r'<span>(\d*)人评价</span>')
    # Captures the text inside the <span class="inq"> element.
    findInq = re.compile(r'<span class="inq">(.*)</span>')
    # Captures the (possibly multi-line) content of the <p class=""> paragraph.
    findBd = re.compile(r'<p class="">(.*?)</p>', re.S)
    
    def main():
        """Scrape the Douban Top 250 listing and store the rows in ``movie.db``."""
        # Listing URL prefix; getData appends the start offset of each page.
        movies = getData("https://movie.douban.com/top250?start=")
        # Write every scraped row into the SQLite database file.
        saveData2DB(movies, "movie.db")
    
    # Fetch and parse the movie list
    def getData(base_url):
        """Scrape the ten listing pages and return one parsed row per movie.

        Args:
            base_url: Listing URL ending in ``start=``; the row offset of each
                page (0, 25, ..., 225) is appended to it.

        Returns:
            A list of rows. Each row is a list of eight strings: detail link,
            image URL, first title, second title ('' unless exactly two titles
            were found), rating, number of ratings, one-line quote ('' when
            absent) and the info paragraph.

        Raises:
            IndexError: If a mandatory field cannot be found in an item.
        """
        data_list = []
        for page in range(10):
            html = askURl(base_url + str(page * 25))

            # Parse the page and walk over every movie entry on it.
            soup = BeautifulSoup(html, "html.parser")
            for item in soup.find_all("div", class_="item"):
                # The field patterns are applied to the raw markup of the item.
                item = str(item)

                link = re.findall(findLink, item)[0]
                img_src = re.findall(findImageSrc, item)[0]

                titles = re.findall(findTitle, item)
                # With exactly two title spans, the second one carries a "/"
                # separator that is removed.
                other_title = titles[1].replace("/", "") if len(titles) == 2 else ''

                rating = re.findall(findRating, item)[0]
                judge = re.findall(findJudge, item)[0]

                quotes = re.findall(findInq, item)
                quote = quotes[0].replace("。", "") if quotes else ''

                info = re.findall(findBd, item)[0]
                # Raw string: "\s" must reach the regex engine untouched instead
                # of being an invalid string escape.
                info = re.sub(r'<br(\s+)?/>(\s+)?', " ", info)
                info = info.replace('/', " ")

                data_list.append([
                    link,
                    img_src,
                    titles[0],
                    other_title,
                    rating,
                    judge,
                    quote,
                    info.strip(),
                ])
        return data_list
    
    # Fetch the content of one URL
    def askURl(url):
        """Return the body of *url* decoded as UTF-8, or "" when the request fails.

        Args:
            url: Address to request.

        Returns:
            The decoded response text. When ``urllib`` raises ``URLError`` the
            error code and/or reason are printed and an empty string is returned.
        """
        # Send a browser-like User-Agent header with the request.
        head = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"}
        request = urllib.request.Request(url, headers = head)
        html = ""
        try:
            # The context manager closes the response even if reading or
            # decoding fails.
            with urllib.request.urlopen(request) as response:
                html = response.read().decode("utf-8")
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print("请求出错",e.code)
            if hasattr(e, "reason"):
                print("错误原因",e.reason)
        return html
    
    # Save the rows into the SQLite database
    def saveData2DB(data_list, dbpath):
        """Initialise *dbpath* and insert every scraped row into ``movie250``.

        Args:
            data_list: Rows of eight values in the column order info_link,
                pic_link, cname, ename, score, rated, instroduction, info.
            dbpath: Path of the SQLite database file.
        """
        init_db(dbpath)
        sql = '''
            insert into movie250
            (
            info_link,pic_link,cname,ename,score,rated,instroduction,info
            )
            values(?,?,?,?,?,?,?,?)'''
        conn = sqlite3.connect(dbpath)
        try:
            # Bound parameters leave the quoting to SQLite, so quotes inside a
            # title or description can neither break nor inject into the
            # statement, and the caller's rows are not modified. The numeric
            # columns still convert well-formed number strings on insert.
            conn.executemany(sql, data_list)
            conn.commit()
        finally:
            # Release the connection even when an insert fails.
            conn.close()
    
    # Initialise the database
    def init_db(dbpath):
        """Create the ``movie250`` table in the SQLite file *dbpath* if it is missing.

        Calling this on a database that already has the table is a no-op
        instead of an ``sqlite3.OperationalError``.

        Args:
            dbpath: Path of the SQLite database file (created when absent).
        """
        sql = '''
            create table if not exists movie250
            (
            id integer primary key autoincrement,
            info_link text,
            pic_link text,
            cname varchar,
            ename varchar,
            score numeric,
            rated numeric,
            instroduction text,
            info text
            );
        '''
        conn = sqlite3.connect(dbpath)
        try:
            conn.execute(sql)
            conn.commit()
        finally:
            # Release the connection even when the statement fails.
            conn.close()
    

    三、获取词云

    1. 读取数据库
    2. 使用jieba进行分割
    3. 使用word_length.txt存储词云长度
    4. 将原始图转成数组
    5. 使用WordCloud初始化词云
    6. 使用pyplot生成和保存图片
    def makeWordCloud():
        """Render the word-cloud image from the quotes stored in ``movie.db``.

        Reads the ``instroduction`` column of ``movie250``, segments the text
        with jieba, writes the length of the space-joined result to
        ``word_length.txt`` and saves the rendered cloud to
        ``../static/assets/img/word.jpg``.
        """
        # Collect the text the cloud is built from.
        con = sqlite3.connect('movie.db')
        try:
            rows = con.execute('select instroduction from movie250').fetchall()
        finally:
            # Release the connection even when the query fails.
            con.close()
        text = "".join(row[0] for row in rows)

        string = ' '.join(jieba.cut(text))

        # Persist the length of the segmented text; the with-block closes the file.
        with open('word_length.txt', 'w') as file:
            file.write(str(len(string)))

        img = Image.open(r'../static/assets/img/tree.jpg')
        img_arry = np.array(img) # convert the image into an array used as the mask
        wc = WordCloud(
            background_color = 'white',
            mask = img_arry,
            font_path = 'STCAIYUN.TTF' # font file location: C:\Windows\Fonts
            )
        wc.generate_from_text(string)

        # Draw the cloud on figure 1 without axes.
        plt.figure(1)
        plt.imshow(wc)
        plt.axis('off')

        # Write the word-cloud picture to disk.
        plt.savefig(r'../static/assets/img/word.jpg', dpi=800)
        plt.close()
    

    四、完成业务代码

    # -*- coding:utf-8 -*-
     
    # date: 2020-5-30
    # author: jingluo
    from flask import Flask, render_template,request, session
    import get_douban_databses
    import sqlite3
    import os
    
    # 分词
    import jieba
    # 绘图,数据可视化
    from matplotlib import pyplot as plt
    # 词云
    from wordcloud import WordCloud
    # 图片处理
    from PIL import Image
    # 矩阵运算
    import numpy as np
    
    # Custom template and static-file locations
    app = Flask(__name__,template_folder="../templates/",
        static_folder='../static/') # the application object

    # Secret key string required by Flask's session. It can be supplied through
    # the SECRET_KEY environment variable so that a deployment does not have to
    # rely on the value written in the source; the original literal remains the
    # fallback.
    app.config["SECRET_KEY"] = os.environ.get(
        "SECRET_KEY", "akjsdhkjashdkjhaksk120191101asd")
    
    @app.route("/")
    def index():
        """Render the home page and store the word length in the session.

        The value is the first line of ``word_length.txt``; when that file
        cannot be read the fixed figure 5633 is used instead.
        """
        try:
            # The with-block closes the file on its own.
            with open('word_length.txt', 'r') as file:
                word_length = file.readline()
        except (OSError, UnicodeError):
            # Only read failures trigger the fallback; other errors surface.
            word_length = 5633
        session['word_length'] = word_length
        return render_template("template/home.html",word_length = word_length)
    
    @app.route("/home")
    def home():
        """Render the home page with the word length kept in the session."""
        return render_template(
            "template/home.html",
            word_length=session.get('word_length'),
        )
    
    @app.route("/movie")
    def movie():
        """Render the movie page with every row of the ``movie250`` table."""
        con = sqlite3.connect("movie.db")
        try:
            # fetchall() returns the rows as a list of tuples.
            movies = con.execute("select * from movie250").fetchall()
        finally:
            # Release the connection even when the query fails.
            con.close()
        return render_template("template/movie.html",movies = movies)
    
    @app.route("/score")
    def score():
        """Render the score page: each distinct score and how many rows have it."""
        con = sqlite3.connect("movie.db")
        try:
            rows = con.execute(
                "select score,count(score) from movie250 group by score").fetchall()
        finally:
            # Release the connection even when the query fails.
            con.close()
        # Split the (score, count) pairs into the two lists the template expects.
        scores = [row[0] for row in rows]
        number = [row[1] for row in rows]
        return render_template("template/score.html", score = scores, number = number)
    
    # Generate the word-cloud picture
    def makeWordCloud():
        """Render the word-cloud image from the quotes stored in ``movie.db``.

        Reads the ``instroduction`` column of ``movie250``, segments the text
        with jieba, writes the length of the space-joined result to
        ``word_length.txt`` and saves the rendered cloud to
        ``../static/assets/img/word.jpg``.
        """
        # Collect the text the cloud is built from.
        con = sqlite3.connect('movie.db')
        try:
            rows = con.execute('select instroduction from movie250').fetchall()
        finally:
            # Release the connection even when the query fails.
            con.close()
        text = "".join(row[0] for row in rows)

        string = ' '.join(jieba.cut(text))

        # Persist the length of the segmented text; the with-block closes the file.
        with open('word_length.txt', 'w') as file:
            file.write(str(len(string)))

        img = Image.open(r'../static/assets/img/tree.jpg')
        img_arry = np.array(img) # convert the image into an array used as the mask
        wc = WordCloud(
            background_color = 'white',
            mask = img_arry,
            font_path = 'STCAIYUN.TTF' # font file location: C:\Windows\Fonts
            )
        wc.generate_from_text(string)

        # Draw the cloud on figure 1 without axes.
        plt.figure(1)
        plt.imshow(wc)
        plt.axis('off')

        # Write the word-cloud picture to disk.
        plt.savefig(r'../static/assets/img/word.jpg', dpi=800)
        plt.close()
    
    @app.route("/word")
    def word():
        """Serve the word-cloud page."""
        page = "template/word.html"
        return render_template(page)
    
    @app.route("/team")
    def team():
        """Serve the team page."""
        page = "template/team.html"
        return render_template(page)
    
    if __name__ == '__main__':
        app.config["DEBUG"] = True
        # Scrape the data first when the database file is absent.
        db_missing = not os.path.exists('movie.db')
        if db_missing:
            get_douban_databses.main()
        # Build the word cloud only when its image has not been generated yet.
        cloud_missing = not os.path.exists('../static/assets/img/word.jpg')
        if cloud_missing:
            makeWordCloud()
        app.run()
    

    五、使用教程

    1. git clone https://gitee.com/jingluoonline/python_reptile.git
    2. cd python_reptile/flask/apps
    3. 输入创建虚拟环境的命令virtualenv FlaskPath
    4. 进入虚拟环境FlaskPath\Scripts\activate.bat
    5. 安装相关依赖
      1. 其中wordcloud下载有时候会有问题,可以选择使用whl文件下载,网址https://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud找到相应的包下载到本地,进行本地安装
    6. 运行python index.py,首次启动有点慢,因为爬取数据和生成图片都是在初始化时进行的
    7. 浏览器输入http://127.0.0.1:5000/

    六、效果图

    1. 主页
    tQ5r8S.png
    2. 电影
    tQ5HKJ.png
    3. 评分
    tQ5OV1.png
    4. 词云
    tQIeG8.png
    5. 团队
    tQI8I0.png

    相关文章

      网友评论

          本文标题:豆瓣电影Top250数据分析

          本文链接:https://www.haomeiwen.com/subject/zwcjzhtx.html