Python crawler: fetching recent movie data from Douban and storing it in a database


Author: xgz_pmx | Published on 2018-10-13 17:45

    Personal blog: http://pmx-xiguazi.com/
    Approach: fetch data from the Douban API --> process the data --> store it in a MySQL database
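
    The snippets below don't show their imports. As a minimal sketch, the following header covers everything they use (standard libraries plus requests and pymysql; nothing project-specific is assumed):

    import datetime
    import uuid

    import pymysql
    import requests
    from requests.exceptions import RequestException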

    1. Fetch data from the Douban API

    # Fetch the list of movies currently in theaters from the Douban API
    def douBanSplider():
        url = 'http://api.douban.com/v2/movie/in_theaters'
        try:
            response = requests.get(url)
            if response.status_code == 200:
                result = response.json()
                return result.get('subjects')
            return None
        except RequestException as e:
            print('Error requesting the index page: ' + str(e))
            return None
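
    As a quick sanity check (not part of the original post), the returned list can be inspected like this; title and rating are the same fields that main() reads below:

    subjects = douBanSplider()
    if subjects:
        first = subjects[0]
        print(first.get('title'), first.get('rating', {}).get('average'))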
    

    2. Process the data

    def main():
        subjects = douBanSplider()
        conn = connectMysql()
        movie_list = []
        avatar_list = []
        directors_list = []
        movie_avatar_list = []
        genres_list = []
        for x in subjects:
            mv_id = x.get('id')
            title = x.get('title')
            original_title = x.get('original_title')
            subtype = x.get('subtype')
            collect_count = x.get('collect_count')
            create_date = datetime.datetime.now()
            directors = x.get('directors')
            year = x.get('year')
    
            genres = x.get('genres')
            genres_str = ','.join(genres)
            for genre in genres:
                genres_list.append((str(uuid.uuid4()), genre))
    
            rating = x.get('rating')
            average = rating.get('average')
            stars = rating.get('stars')
            max_grade = rating.get('max')
            min_grade = rating.get('min')
            # build the movie and director records to be stored in the database
            movie_data = (mv_id,title,original_title,year,subtype,directors[0].get('id'),
                          genres_str,average,max_grade,min_grade,stars,collect_count,create_date)
            directors_data = (directors[0].get('id'),directors[0].get('name'),
                              directors[0].get('alt').replace('\\',''),
                              directors[0].get('avatars').get('small').replace('\\',''),
                              directors[0].get('avatars').get('medium').replace('\\',''),
                              directors[0].get('avatars').get('large').replace('\\',''))
            movie_list.append(movie_data)
            directors_list.append(directors_data)
    
            casts = x.get('casts')
            for avatar in casts:
                avatar_id = avatar.get('id')
                avatar_name = avatar.get('name')
                avatar_alt = avatar.get('alt').replace('\\','')
                avatar_image_small = avatar.get('avatars').get('small').replace('\\','')
                avatar_image_medium = avatar.get('avatars').get('medium').replace('\\','')
                avatar_image_large = avatar.get('avatars').get('large').replace('\\','')
                avatar_data = (avatar_id,avatar_name,avatar_alt,avatar_image_small,avatar_image_medium,avatar_image_large)
                movie_avatar_data = (mv_id,avatar_id)
                avatar_list.append(avatar_data)
                movie_avatar_list.append(movie_avatar_data)
        insertMovie(conn,movie_list)
        insertDirectors(conn,directors_list)
        insertAvatar(conn,avatar_list)
        insertGenres(conn,genres_list)
        insertMovieAvatars(conn,movie_avatar_list)
        closeMysql(conn)
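
    For completeness, the script can be run with a standard entry point (this is the usual idiom, not shown in the original post):

    if __name__ == '__main__':
        main()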
    

    3. Store the data in MySQL

    # Connect to the MySQL database
    def connectMysql():
        try:
            conn = pymysql.connect(host='127.0.0.1', user='root', password='', db='mv', charset='utf8')
            print("Connected to the database!")
            return conn
        except Exception as e:
            print("ERROR: " + str(e))
    
    def closeMysql(conn):
        conn.close()
        print("Database connection closed.")
    
    # Insert movie rows into the database
    def insertMovie(conn, data):
        cursor = conn.cursor()
        sql = "replace into mv_movie(id,title,original_title,year,subtype,directors,genres,average,max,min,stars,collect_count,create_date) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            cursor.executemany(sql, data)
            print("Inserted " + str(cursor.rowcount) + " rows into mv_movie")
            conn.commit()
        except Exception as e:
            print("Error inserting into mv_movie: " + str(e))
            conn.rollback()
    
    # Insert cast (actor) rows into the database
    def insertAvatar(conn, data):
        cursor = conn.cursor()
        sql = "replace into mv_avatar(id,name,alt,image_small,image_medium,image_large) values(%s,%s,%s,%s,%s,%s)"
        try:
            cursor.executemany(sql, data)
            print("Inserted " + str(cursor.rowcount) + " rows into mv_avatar")
            conn.commit()
        except Exception as e:
            print("Error inserting into mv_avatar: " + str(e))
            conn.rollback()
    
    # Insert director rows into the database
    def insertDirectors(conn, data):
        cursor = conn.cursor()
        sql = "replace into mv_director(id,name,alt,image_small,image_medium,image_large) values(%s,%s,%s,%s,%s,%s)"
        try:
            cursor.executemany(sql, data)
            print("Inserted " + str(cursor.rowcount) + " rows into mv_director")
            conn.commit()
        except Exception as e:
            print("Error inserting into mv_director: " + str(e))
            conn.rollback()
    
    # Insert movie-to-actor relation rows into the database
    def insertMovieAvatars(conn, data):
        cursor = conn.cursor()
        sql = "replace into mv_movie_avatar_ids(movie_id,avatar_id) values(%s,%s)"
        try:
            cursor.executemany(sql, data)
            print("Inserted " + str(cursor.rowcount) + " rows into mv_movie_avatar_ids")
            conn.commit()
        except Exception as e:
            print("Error inserting into mv_movie_avatar_ids: " + str(e))
            conn.rollback()
    
    # Insert genre rows into the database
    def insertGenres(conn, data):
        cursor = conn.cursor()
        sql = "replace into mv_genres(id,genres) values(%s,%s)"
        try:
            cursor.executemany(sql, data)
            print("Inserted " + str(cursor.rowcount) + " rows into mv_genres")
            conn.commit()
        except Exception as e:
            print("Error inserting into mv_genres: " + str(e))
            conn.rollback()
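
    The original post doesn't show the table definitions. As a rough sketch, the column list in insertMovie implies something like the schema below; the column types and lengths are assumptions, not taken from the source:

    # Hypothetical DDL for mv_movie, inferred from the columns used in insertMovie.
    # Types and lengths are guesses; adjust them to match the real schema.
    def createMovieTable(conn):
        ddl = """
            create table if not exists mv_movie(
                id varchar(20) primary key,
                title varchar(255),
                original_title varchar(255),
                year varchar(10),
                subtype varchar(20),
                directors varchar(20),
                genres varchar(255),
                average float,
                `max` int,
                `min` int,
                stars varchar(10),
                collect_count int,
                create_date datetime
            ) default charset=utf8
        """
        cursor = conn.cursor()
        cursor.execute(ddl)
        conn.commit()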
    
