Analisis ini merupakan proyek pribadi untuk mengetahui tren-tren yang ada pada dataset movies. Dataset ini diperoleh dari Kaggle (https://www.kaggle.com/datasets/akshaypawar7/millions-of-movies).
library(dplyr)
library(janitor)
library(tidyverse)
library(ggplot2)
library(lubridate)
library(tidyr)
library(data.table)
library(scales)
library(grid)
library(gridExtra)
library(gghighlight)
library(stringr)
Library yang dibutuhkan untuk membantu dalam analisis
# Load the raw CSV. Empty strings are read as NA, and janitor::clean_names()
# normalises the column names before the structure is inspected.
movies <- clean_names(read.csv("movies.csv", na.strings = ""))
glimpse(movies)
## Rows: 739,048
## Columns: 20
## $ id <int> 663712, 732459, 436270, 675054, 420634, 956101, 5…
## $ title <chr> "Terrifier 2", "Blade of the 47 Ronin", "Black Ad…
## $ genres <chr> "Horror-Thriller", "Action-Fantasy", "Action-Fant…
## $ original_language <chr> "en", "en", "en", "es", "en", "la", "en", "zh", "…
## $ overview <chr> "After being resurrected by a sinister entity Art…
## $ popularity <dbl> 4608.567, 3821.739, 3772.253, 3401.452, 2352.470,…
## $ production_companies <chr> "Bloody Disgusting-Dark Age Cinema-Fuzz on the Le…
## $ release_date <chr> "2022-10-06", "2022-10-25", "2022-10-19", "2022-1…
## $ budget <dbl> 2.5e+05, 0.0e+00, 2.0e+08, 0.0e+00, 1.0e+05, 0.0e…
## $ revenue <dbl> 10155347, 0, 319000000, 0, 0, 0, 0, 626203271, 28…
## $ runtime <dbl> 138, 106, 125, 93, 84, 0, 98, 149, 88, 107, 105, …
## $ status <chr> "Released", "Released", "Released", "Released", "…
## $ tagline <chr> "Who's laughing now?", NA, "The world needed a he…
## $ vote_average <dbl> 7.095, 6.691, 6.854, 7.415, 6.555, 4.600, 5.600, …
## $ vote_count <dbl> 496, 47, 939, 53, 908, 10, 132, 23, 432, 1011, 10…
## $ credits <chr> "Lauren LaVera-David Howard Thornton-Elliott Full…
## $ keywords <chr> "clown-halloween-resurrection-sequel-gore-slasher…
## $ poster_path <chr> "/wRKHUqYGrp3PO91mZVQ18xlwYzW.jpg", "/kjFDIlUCJkc…
## $ backdrop_path <chr> "/y5Z0WesTjvn59jP6yo459eUsbli.jpg", "/pGx6O6IwqAD…
## $ recommendations <chr> "436270-732459-928123-575322-675054-420634-642721…
# Parse the "YYYY-MM-DD" strings in release_date into Date values (lubridate).
movies <- movies %>%
  mutate(release_date = ymd(release_date))
Mengubah tipe data kolom release_date menjadi Date menggunakan library lubridate.
# List the distinct values found in the status column.
unique(movies[["status"]])
## [1] "Released" "Post Production" "In Production" "Planned"
## [5] "Canceled" "Rumored"
melihat apa saja yang ada di kolom status menggunakan unique
# Drop columns that are not used in the analysis, derive the release year,
# and keep only movies released between 2000 and 2022.
movies <- movies %>%
  select(-c(keywords, credits, poster_path, backdrop_path)) %>%
  mutate(year = year(release_date)) %>%
  filter(year >= 2000, year <= 2022, status == "Released")

# popularity is already read as a decimal number (e.g. 4608.567). The earlier
# gsub(".", "", ...) deleted the decimal point, which rescaled every value by a
# different power of ten (4608.567 -> 4608567, 2352.47 -> 235247) and broke any
# ranking by popularity. Only coerce the type; never strip the ".".
movies$popularity <- as.numeric(movies$popularity)

# Exclude one record the original author flagged as not relevant.
# id is an integer column, so compare against a number, not a string.
movies <- subset(movies, id != 168626)
# Row count before de-duplication.
nrow(movies)
## [1] 429160
# Number of rows whose id already appeared earlier in the table.
sum(duplicated(movies[["id"]]))
## [1] 54182
# Keep the first row for every id, retaining all other columns.
movies <- distinct(movies, id, .keep_all = TRUE)
nrow(movies)
## [1] 374978
Setelah data duplikat dihapus, terdapat 374978 baris.
# Split the hyphen-separated `genres` string and keep only the first two
# entries as genre1 / genre2. tstrsplit() returns one column per genre
# position; the previous code assigned just two names to however many columns
# came back, leaving the remaining columns named NA. `keep` and `names` select
# and label exactly the two columns that are used.
# NOTE(review): assumes at least one movie lists two or more genres, otherwise
# keep = 1:2 has no second column to return.
movies_genre <- as.data.frame(
  tstrsplit(
    movies$genres, "-",
    fixed = TRUE,
    keep = 1:2,
    names = c("genre1", "genre2")
  ),
  stringsAsFactors = FALSE
)

# Append the two genre columns to the main table (row order is unchanged).
movies <- cbind(movies, movies_genre)
str(movies$genre1)
## chr [1:374978] "Horror" "Action" "Action" "Horror" "Horror" "Thriller" ...
str(movies$genre2)
## chr [1:374978] "Thriller" "Fantasy" "Fantasy" "Action" "Thriller" NA NA ...
# Horizontal bar chart of the number of movies per first genre (genre1).
# Rows without a genre1 are ignored; bars are ordered by their count.
genre1 <- movies %>%
  filter(!is.na(genre1)) %>%
  count(genre1, name = "total") %>%
  arrange(desc(genre1)) %>%
  ggplot(aes(x = reorder(genre1, total), y = total, fill = total)) +
  geom_col() +
  coord_flip() +
  scale_fill_gradient2(mid = "purple", high = "black") +
  labs(
    x = "Genres",
    y = " Total",
    title = "Movie by Genre 1"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
# Horizontal bar chart of the number of movies per second genre (genre2),
# built from the two-column genre table; rows without a genre2 are ignored.
genre2 <- movies_genre %>%
  filter(!is.na(genre2)) %>%
  count(genre2, name = "total") %>%
  ggplot(aes(x = reorder(genre2, total), y = total, fill = total)) +
  geom_col() +
  scale_fill_gradient2(mid = "purple", high = "black") +
  coord_flip() +
  labs(
    x = "Genres",
    y = " Total",
    title = "Movie by Genre 2"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Show the genre1 and genre2 charts side by side.
grid.arrange(genre1, genre2, ncol = 2)
# Yearly number of movies per genre1 as a line chart. gghighlight keeps in
# colour only the genres matching total >= 1000 and labels them by genre1.
movies %>%
  filter(!is.na(genre1)) %>%
  group_by(year, genre1) %>%
  summarise(total = n()) %>%
  ggplot(aes(x = year, y = total)) +
  geom_line(aes(color = genre1), linewidth = 2) +
  labs(
    x = "Year",
    y = "Total Movies",
    title = "Movies By Genre 1 From 2000 to 2022 more than 1000 in a year"
  ) +
  theme_minimal() +
  gghighlight(total >= 1000, label_key = genre1)
# Heat map of the mean vote_average per genre1 and release year.
# The data are aggregated first: feeding one row per movie to geom_tile()
# stacks thousands of tiles in every cell, so the visible colour would be that
# of whichever movie was drawn last, not the average the title promises.
# Rows with vote_average == 0 are excluded (presumably unrated titles; confirm).
vote_genre <- movies %>%
  filter(vote_average != 0) %>%
  drop_na(genre1) %>%
  group_by(genre1, year) %>%
  summarise(vote_average = mean(vote_average), .groups = "drop") %>%
  ggplot(aes(genre1, year, fill = vote_average)) +
  geom_tile() +
  coord_flip() +
  scale_fill_gradient(low = "white", high = "purple") +
  labs(
    x = "Genre 1",
    y = "Year",
    title = "Vote Average by Genre 1"
  ) +
  theme_minimal()
# Heat map of the mean vote_average per genre2 and release year.
# The data are aggregated first: feeding one row per movie to geom_tile()
# stacks thousands of tiles in every cell, so the visible colour would be that
# of whichever movie was drawn last, not the average the title promises.
# Rows with vote_average == 0 are excluded (presumably unrated titles; confirm).
vote_genre2 <- movies %>%
  filter(vote_average != 0) %>%
  drop_na(genre2) %>%
  group_by(genre2, year) %>%
  summarise(vote_average = mean(vote_average), .groups = "drop") %>%
  ggplot(aes(genre2, year, fill = vote_average)) +
  geom_tile() +
  coord_flip() +
  scale_fill_gradient(low = "white", high = "purple") +
  labs(
    x = "Genre 2",
    y = "Year",
    title = "Vote Average by Genre 2"
  ) +
  theme_minimal()

# Side-by-side heat maps. "none" is the documented value for suppressing a
# legend; "hidden" is not a valid legend.position.
grid.arrange(
  vote_genre + theme(legend.position = "none"),
  vote_genre2 + theme(legend.position = "top"),
  ncol = 2
)
Karena genre1 dengan jumlah film terbanyak adalah “Documentary”, data berikutnya diambil hanya untuk genre1 “Documentary” menggunakan filter.
library(tidytext)
library(textclean)
# Keep genre1 and overview for documentaries only, then remove rows with a
# missing value via drop_NA() (presumably textclean's helper; note that the
# spelling differs from tidyr::drop_na()).
documentary <- movies %>%
  filter(genre1 == "Documentary") %>%
  select(genre1, overview) %>%
  drop_NA()

# Text cleaning of the overview strings.
overviews <- documentary$overview %>%
  str_to_lower() %>%            # lower-case every word
  replace_contraction() %>%     # expand contractions (e.g. i'm -> i am)
  replace_word_elongation() %>% # shorten elongated words (e.g. filmmm -> film)
  strip()                       # remove all non-word characters
# Tokenise the cleaned overviews and count word frequencies.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
documentary <- enframe(overviews, value = "word", name = NULL) %>% # vector -> data frame
  unnest_tokens(word, word) %>%  # one token (word) per row
  count(word, sort = TRUE) %>%   # frequency per word, most frequent first
  anti_join(stop_words)          # drop stop words (a, is, of, the, ...)
## Joining with `by = join_by(word)`
#creating wordcloud2 viz
library(wordcloud2)
# Take the 200 most frequent words and render them as a word cloud.
fil_doc <- documentary %>%
  arrange(desc(n)) %>%
  slice_head(n = 200)

wordcloud2(
  fil_doc,
  size = .6,
  color = "random-light",
  backgroundColor = "black",
  fontWeight = "bold",
  shape = "pentagon"
)
# Ten documentaries with the highest popularity as a horizontal bar chart.
# Titles are wrapped at 20 characters so long names fit on the axis.
movies %>%
  filter(genre1 == "Documentary") %>%
  select(title, popularity, genre1) %>%
  arrange(desc(popularity)) %>%
  slice_head(n = 10) %>%
  mutate(title = str_wrap(title, width = 20)) %>%
  ggplot(aes(x = reorder(title, popularity), y = popularity, fill = title)) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  scale_y_continuous(labels = label_comma()) +
  labs(
    x = "Title",
    y = "Popularity",
    title = "TOP 10 Genre Documentary by Popularity"
  ) +
  theme(legend.position = "none")
# Ten documentaries with the highest revenue as a horizontal bar chart.
# Titles are wrapped at 20 characters so long names fit on the axis.
movies %>%
  select(title, revenue, genre1) %>%
  filter(genre1 == "Documentary") %>%
  arrange(desc(revenue)) %>%
  slice(1:10) %>%
  mutate(title = str_wrap(title, width = 20)) %>%
  ggplot(aes(reorder(title, revenue), revenue, fill = revenue)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  # The argument is `labels`; `label =` only worked through partial argument
  # matching, so it is spelled out here.
  scale_y_continuous(labels = label_comma()) +
  scale_fill_gradient(low = "purple", high = "black") +
  labs(
    x = "Title",
    y = "Revenue $",
    title = "TOP 10 Genre Documentary by Revenue"
  )
# Ten documentaries with the largest budget as a horizontal bar chart.
# Titles are wrapped at 20 characters so long names fit on the axis.
movies %>%
  filter(genre1 == "Documentary") %>%
  select(title, budget, genre1) %>%
  arrange(desc(budget)) %>%
  slice_head(n = 10) %>%
  mutate(title = str_wrap(title, width = 20)) %>%
  ggplot(aes(x = reorder(title, budget), y = budget, fill = budget)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_comma()) +
  scale_fill_gradient(low = "purple", high = "black") +
  labs(
    x = "Title",
    y = "Budget $",
    title = "TOP 10 Genre Documentary by Budget"
  )
# Ten most common original languages by number of movies.
movies %>%
  count(original_language, name = "total") %>%
  arrange(desc(total)) %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = reorder(original_language, total), y = total, fill = total)) +
  geom_col() +
  coord_flip() +
  scale_fill_gradient2(low = "black", high = "purple") +
  labs(
    x = "Original Language",
    y = "Total",
    title = "TOP 10 movies by original language"
  ) +
  theme_minimal()
# Yearly mean revenue and mean budget drawn as two lines on one chart.
movies %>%
  group_by(year) %>%
  summarise(across(c(revenue, budget), mean)) %>%
  # Reshape to long form: one row per (year, variable) pair.
  pivot_longer(
    cols = c(revenue, budget),
    names_to = "Variable",
    values_to = "Value"
  ) %>%
  ggplot(aes(x = year, y = Value, color = Variable)) +
  # A single geom_line(); the colour is driven by the "Variable" column.
  geom_line() +
  labs(
    x = "Year",
    y = "Total",
    title = "Revenue VS Budget"
  ) +
  # Fix the line colours and the legend labels.
  scale_color_manual(
    values = c("revenue" = "red", "budget" = "green"),
    labels = c("Budget", "Revenue")
  ) +
  theme_minimal()
# Numeric columns for the correlation matrix. Only movies with positive
# revenue and budget and a known runtime are kept.
cor_data <- movies %>%
  filter(revenue > 0, budget > 0, !is.na(runtime)) %>%
  select(revenue, budget, popularity, vote_average, runtime)
# Panel function for pairs(): writes the absolute correlation of x and y in
# the centre of the panel, with a text size that grows with the correlation.
#
# x, y     numeric vectors supplied by pairs() for the current panel.
# digits   number of significant digits shown.
# prefix   string placed in front of the number.
# cex.cor  scaling factor for the text; when missing it is derived from the
#          width of the label.
# ...      accepted for compatibility with pairs(); not used.
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  # Switch to a unit coordinate system so the label can be centred at
  # (0.5, 0.5). par() returns the previous setting, which is restored on exit.
  # The earlier on.exit(par(usr)) passed an unnamed numeric vector, which
  # par() ignores with a warning, so the coordinates were never restored.
  old_par <- par(usr = c(0, 1, 0, 1))
  on.exit(par(old_par), add = TRUE)

  # Ignore pairs with a missing value so one NA does not turn the label
  # (and its size) into NA.
  Cor <- abs(cor(x, y, use = "pairwise.complete.obs")) # Remove abs function if desired
  txt <- paste0(prefix, format(c(Cor, 0.123456789), digits = digits)[1])
  if (missing(cex.cor)) {
    cex.cor <- 0.4 / strwidth(txt)
  }
  text(0.5, 0.5, txt,
       cex = 1 + cex.cor * Cor) # Resize the text by level of correlation
}
# Scatterplot matrix of cor_data: smoothed trend lines below the diagonal,
# absolute correlation coefficients above it.
pairs(
  cor_data,
  lower.panel = panel.smooth, # Smoothed regression lines
  upper.panel = panel.cor     # Correlation panel
)