Q2 a) Hive solutions:
#Creating a directory in hadoop:
hdfs dfs -ls /user/
hdfs dfs -mkdir /user/seshadri
hdfs dfs -mkdir /user/seshadri/data
#Storing the files on hadoop:
hdfs dfs -put u.data /user/Seshadri/data/
hdfs dfs -put u.item /user/Seshadri/data/
hdfs dfs -put u.user /user/Seshadri/data/
hdfs dfs -ls /user/Seshadri/data
#Starting Hive session:
hive
#Creating Hive tables:
create table ratings(user_id INT, item_id INT , rating INT , timestamp INT)
row format delimited
fields terminated by “|”;
load data inpath ‘/user/Seshadri/data/u.data’ overwrite into table ratings;
create table movies(movie_id INT , movie_title STRING, release_date STRING ,video release date
STRING, IMDb_URL STRING , unknown INT, Action INT, Adventure INT, Animation INT, Children INT ,
Comedy INT, Crime INT, Documentry INT, Drama INT, Fantasy INT, Film_Noir INT, Horror INT, Musical
INT, Mystery INT, Romance INT, Sci-Fi INT, Thriller INT , War INT, Western INT)
row format delimited
fields terminated by “| \t”
load data inpath ‘/user/Seshadri/data/u.item’ overwrite into table movies;
create table user(user_id INT, age INT, gender STRING, occupation STRING, zip code STRING)
row format delimited
fields terminated by “|”
load data inpath ‘/user/Seshadri/data/u.user ‘ overwrite into table user;
#The files has been loaded as tables in HIVE datawarehouse
hdfs dfs -ls /user/hive/warehouse
Q2 b)
i) Top 30 most rated movies
Create table most_rated_movies as
Select a.user_id,b.movie_id,b.movie_title,a.rating from ratings a left join movies b on
(a.item_id=b.movie_id) ;
Create table top30_rated_movies as
Select movie_title,count(movie_title) as rating_count from most_rated_movies group by movie_title
Order by rating_count desc limit 30;
select * from top30_rated_movies;
ii) Top 30 Highest rated movies
create table highest_rated_movies as
select movie_title,avg(rating) as avg_rating from most_rated_movies group by movie_title order by
avg_rating desc limit 30;
select * from highest_rated_movies;
iii) Top 30 movies having atleast 50,20 and 100 raters with their average rating
atleast 50 users:
create table t1 as
select movie_title, count(distinct(user_id)) as count_user, avg(rating) as avg_rating from
most_rated_movies group by movie_title having count_user >=50 order by avg_rating desc limit 30;
select * from t1;
#atleast 20 users:
create table t2 as
select movie_title, count(distinct(user_id)) as count_user, avg(rating) as avg_rating from
most_rated_movies group by movie_title having count_user >=20 order by avg_rating desc limit 30;
select * from t2;
#atleast 100 users:
create table t3 as
select movie_title, count(distinct(user_id)) as count_user, avg(rating) as avg_rating from
most_rated_movies group by movie_title having count_user >=100 order by avg_rating desc limit 30;
select * from t3;
Q2 c)
# Joining the ratings table and the user table
create table t4 as
select a.user_id,b.gender,b.occupation,a.item_id,a.rating from ratings a left join user b on
(a.user_id=b.user_id );
#Joining table4 with movies table
create table t5 as
select a.*,b.movie_title,( case
when b.unknown = 1 then "Unknown"
else " "
end) as Unknown,
( case
when b.Action = 1 then "Action"
else " "
end) as Action,
( case
when b.Adventure = 1 then "Adventure"
else " "
end) as Adventure,
( case
when b.Animation = 1 then "Animation"
else " "
end) as Animation,
( case
when b.Children = 1 then "Children"
else " "
end) as Children,
( case
when b.Comedy = 1 then "Comedy"
else " "
end) as Comedy,
( case
when b.Crime = 1 then "Crime"
else " "
end) as Crime,
( case
when b.Documentry = 1 then "Documentry"
else " "
end) as Documentry,
( case
when b.Drama = 1 then "Drama"
else " "
end) as Drama,
( case
when b.Fantasy = 1 then "Fantasy"
else " "
end) as Fantasy,
( case
when b.Film_Noir = 1 then "Film_Noir"
else " "
end) as Film_Noir,
( case
when b.Horror = 1 then "Horror"
else " "
end) as Horror,
( case
when b.Musical = 1 then "Musical"
else " "
end) as Musical,
( case
when b.Mystery = 1 then "Mystery"
else " "
end) as Mystery,
( case
when b.Romance = 1 then "Romance"
else " "
end) as Romance,
( case
when b.Sci_Fi = 1 then "Sci-Fi"
else " "
end) as Sci_Fi,
( case
when b.Thriller = 1 then "Thriller"
else " "
end) as Thriller,
( case
when b.War = 1 then "War"
else " "
end) as War,
( case
when b.Western = 1 then "Western"
else " "
end) as Western
from t4 a left join movies b on (a.item_id=b.movie_id);
# Average rating per gender:
create table t6 as
select gender , avg(rating) as avg_rating from t5 group by gender;
Q2 c) a)
#Creating a column column for genre by concatenating the all the genres.
create table t7 as
select *, Concat(TRIM(unknown),TRIM(Action),TRIM(Adventure) , TRIM(Animation) , TRIM(Children) ,
TRIM(Comedy) , TRIM(Crime) , TRIM(Documentry) , TRIM(Drama) , TRIM(Fantasy) , TRIM(Film_Noir) ,
TRIM(Horror) , TRIM(Musical) , TRIM(Mystery) , TRIM(Romance) , TRIM(Sci_Fi) , TRIM(Thriller) ,
TRIM(War) , TRIM(Western)) as Genre from t5;
#The average rating per gender and genre
#For Males
create table t8 as
select gender,Genre,avg(rating) as avg_rating from t7 where gender ="M" group by gender,Genre order
by avg_rating desc limit 30;
#For females
create table t9 as
select gender,Genre,avg(rating) as avg_rating from t7 where gender ="F" group by gender,Genre order
by avg_rating desc limit 30;
Q2 d) The average rating given by users from each occupations
create table t10 as
select occupation,avg(rating) as avg_rating from t4 group by occupation order by avg_rating desc ;
Q2 e) The count of ratings and average rating of each genre.
create table t11 as
select Genre,count(rating) as count_rating,avg(rating) as avg_rating from t7 group by Genre order by
avg_rating desc ;