导入和插入数据代码
-- mysql --local-infile=1 -u root -p
create database movie_db default character set='utf8';
use movie_db;
create table ratings (
user_id int null,
movie_id int null,
ratings int null,
rating_time int null
);
-- -- 指定导入文件的位置
-- LOAD DATA local INFILE '/Users/carmack/datasets/movielens/ratings.dat'
-- -- 导入到哪张表
-- INTO TABLE ratings
-- -- 字段分隔符用什么
-- FIELDS TERMINATED BY '::'
-- -- 双引号是括住字段
-- ENCLOSED BY '"'
-- -- 你用什么符号来区别换行
-- LINES TERMINATED BY '\n';
create table movies (
movie_id int null,
title varchar(256) null,
genres varchar(512) null
);
-- LOAD DATA local INFILE '/Users/carmack/datasets/movielens/movies.dat'
-- INTO TABLE movies FIELDS TERMINATED BY '::' ENCLOSED BY '"' LINES TERMINATED BY '\n';
create table users (
user_id int null,
gender varchar(2) null,
age int null,
occupation int null,
zip_code varchar(128) null
);
-- LOAD DATA local INFILE '/Users/carmack/datasets/movielens/users.dat'
-- INTO TABLE users FIELDS TERMINATED BY '::' ENCLOSED BY '"' LINES TERMINATED BY '\n';
-- * 1: "Under 18" < 18
-- * 18: "18-24" 18 <= < 25
-- * 25: "25-34" 25 <= < 35
-- * 35: "35-44" 35 <= < 45
-- * 45: "45-49" 45 <= <50
-- * 50: "50-55" 50 <= < 56
-- * 56: "56+" 56 <=
create table age_dict (
age_id int null,
age_min int null,
age_max int null
);
insert into age_dict (age_id, age_min, age_max) values(1, 0, 18);
insert into age_dict (age_id, age_min, age_max) values(18, 18, 25);
insert into age_dict (age_id, age_min, age_max) values(25, 25, 35);
insert into age_dict (age_id, age_min, age_max) values(35, 35, 45);
insert into age_dict (age_id, age_min, age_max) values(45, 45, 50);
insert into age_dict (age_id, age_min, age_max) values(50, 50, 56);
insert into age_dict (age_id, age_min, age_max) values(56, 56, 300);
-- - Occupation is chosen from the following choices:
-- * 0: "other" or not specified
-- * 1: "academic/educator"
-- * 2: "artist"
-- * 3: "clerical/admin"
-- * 4: "college/grad student"
-- * 5: "customer service"
-- * 6: "doctor/health care"
-- * 7: "executive/managerial"
-- * 8: "farmer"
-- * 9: "homemaker"
-- * 10: "K-12 student"
-- * 11: "lawyer"
-- * 12: "programmer"
-- * 13: "retired"
-- * 14: "sales/marketing"
-- * 15: "scientist"
-- * 16: "self-employed"
-- * 17: "technician/engineer"
-- * 18: "tradesman/craftsman"
-- * 19: "unemployed"
-- * 20: "writer"
create table occup_dict (
occup_id int null,
occup_title varchar(256) null
);
insert into occup_dict (occup_id, occup_title) values(0, 'other or not specified');
insert into occup_dict (occup_id, occup_title) values(1, 'academic/educator');
insert into occup_dict (occup_id, occup_title) values(2, 'artist');
insert into occup_dict (occup_id, occup_title) values(3, 'clerical/admin');
insert into occup_dict (occup_id, occup_title) values(4, 'college/grad student');
insert into occup_dict (occup_id, occup_title) values(5, 'customer service');
insert into occup_dict (occup_id, occup_title) values(6, 'doctor/health care');
insert into occup_dict (occup_id, occup_title) values(7, 'executive/managerial');
insert into occup_dict (occup_id, occup_title) values(8, 'farmer');
insert into occup_dict (occup_id, occup_title) values(9, 'homemaker');
insert into occup_dict (occup_id, occup_title) values(10, 'K-12 student');
insert into occup_dict (occup_id, occup_title) values(11, 'lawyer');
insert into occup_dict (occup_id, occup_title) values(12, 'programmer');
insert into occup_dict (occup_id, occup_title) values(13, 'retired');
insert into occup_dict (occup_id, occup_title) values(14, 'sales/marketing');
insert into occup_dict (occup_id, occup_title) values(15, 'scientist');
insert into occup_dict (occup_id, occup_title) values(16, 'self-employed');
insert into occup_dict (occup_id, occup_title) values(17, 'technician/engineer');
insert into occup_dict (occup_id, occup_title) values(18, 'tradesman/craftsman');
insert into occup_dict (occup_id, occup_title) values(19, 'unemployed');
insert into occup_dict (occup_id, occup_title) values(20, 'writer');
#在ipython中插入数据代码(先定义再执行后面倒入代码)
--db= sqla.create_engine('mysql+pymysql://root:Vff123456@127.0.0.1/movie_db?charset=utf8')
--
--mnames = ['movie_id', 'title', 'genres']
--movies = pd.read_table(r'datasets/movielens/movies.dat', sep='::', header=None, names=mnames,engine='python')#前面加r转义不行就加r
--
--mnames = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
--users = pd.read_table('datasets/movielens/users.dat', sep='::', header=None, names=mnames,engine='python')
--
--mnames = ['user_id', 'movie_id', 'ratings', 'rating_time']
--ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::', header=None, names=mnames,engine='python')
--
--movies.to_sql('movies', db, index=False, if_exists='append')
--users.to_sql('users', db, index=False, if_exists='append')
--ratings.to_sql('ratings', db, index=False, if_exists='append')
查询代码
找出用户的性别、年龄、电影名、打分
select gender, age_dict.age_min, age_dict.age_max, title, ratings
from users, movies, ratings, age_dict
where users.user_id = ratings.user_id and
ratings.movie_id = movies.movie_id and
age_dict.age_id = users.age;
select a.gender, d.age_min, d.age_max, c.title, b.ratings
from users a inner join ratings b
on a.user_id = b.user_id
inner join movies c
on b.movie_id = c.movie_id
inner join age_dict d
on a.age = d.age_id;
select case when a.gender = 'F' then '女' else '男' end as 性别,
avg(b.ratings) as 平均打分
from users a inner join ratings b
on a.user_id = b.user_id
group by a.gender;
select a.title, avg(b.ratings)
from movies a inner join ratings b
on a.movie_id = b.movie_id
group by a.title;
select a.gender, c.title, avg(b.ratings)
from users a inner join ratings b
on a.user_id = b.user_id
inner join movies c
on c.movie_id = b.movie_id
group by a.gender, c.title
order by c.title
limit 3;
查询不同年龄段的电影平均评分,不分性别
set @rownum=0;
create table temp_result as
select @rownum:=@rownum+1 as rownum, CONCAT(c.age_min,'~',c.age_max) as 年龄段, d.title, avg(b.ratings) as 平均评分
from users a inner join ratings b
on a.user_id = b.user_id
inner join age_dict c
on a.age = c.age_id
inner join movies d
on d.movie_id = b.movie_id
group by c.age_min, c.age_max, d.title;
不同职业的电影平均得分,要分性别
select c.occup_title, a.gender, avg(b.ratings)
from users a inner join ratings b
on a.user_id = b.user_id
inner join occup_dict c
on a.occupation = c.occup_id
group by c.occup_title, a.gender;
各个年龄段最喜欢的电影
select d.age_min, d.age_max, c.title, b.ratings
from users a inner join ratings b
on a.user_id = b.user_id
inner join movies c
on c.movie_id = b.movie_id
inner join age_dict d
on d.age_id = a.age
group by d.age_min, d.age_max
order by b.ratings desc
limit 0,1;
select a.gender, c.title, avg(b.ratings)
from users a inner join ratings b
on a.user_id = b.user_id
inner join movies c
on c.movie_id = b.movie_id
group by a.gender, c.title
order by c.title
select * from temp_result
where (年龄段, 平均评分, rownum) in
(
select a.年龄段, a.平均评分, max(a.rownum) from
(
select rownum, 年龄段, 平均评分, title
from temp_result
where (年龄段, 平均评分) in (
select 年龄段, max(平均评分)
from temp_result
group by 年龄段
)
) a
group by a.年龄段, a.平均评分
)
create view temp_view2 as
select gender, age_dict.age_min, age_dict.age_max, title, ratings
from users, movies, ratings, age_dict
where users.user_id = ratings.user_id and
ratings.movie_id = movies.movie_id and
age_dict.age_id = users.age;
网友评论