参加《21天晋级大数据实战营》——DAY11 让编排变的简单-数据湖工厂实验小结
1、创建桶
2、获取原始数据
https://grouplens.org/datasets/movielens/
将ml-latest-small.zip解压
3、将原始数据movies.csv、ratings.csv上传到桶
4、创建数据表
新建DLI SQL
create table movie(movieId int,title varchar(256),genres varchar(256)) using csv options (path 's3a://obs-movies-richblue88/movies.csv');
create table rating(userId int,movieId int,rating float,rating_date long) using csv options (path 's3a://obs-movies-richblue88/ratings.csv');
create table top_rating_movie(title varchar(256),avg_rating float,rating_user_number int);
create table top_active_movie(title varchar(256),avg_rating float,rating_user_number int);
5、分析数据:评分最高Top20电影
insert overwrite table top_rating_movie
select
a.title,
b.ratings / b.rating_user_number as avg_rating,
b.rating_user_number
from
movie a,
(
select
movieId,
sum(rating) ratings,
count(1) as rating_user_number
from
rating
group by
movieId
) b
where
rating_user_number > 100
and a.movieId = b.movieId
order by
avg_rating desc
limit
20
6、分析数据(最活跃Top20电影)
insert overwrite table top_active_movie
select * from
(
select
a.title,
b.ratingSum / b.rating_user_number as avg_rating,
b.rating_user_number
from
movie a,
(
select
movieId,
sum(rating) ratingSum,
count(1) as rating_user_number
from
rating
group by
movieId
) b
where
a.movieId = b.movieId
) t
where
t.avg_rating > 3.5
order by
rating_user_number desc
limit
20
7、创建作业
测试运行
调度配置
- 点赞
- 收藏
- 关注作者
评论(0)