Movies Analyze

Analisis ini merupakan proyek pribadi untuk mengetahui tren-tren yang ada pada dataset film. Dataset ini diperoleh dari Kaggle (https://www.kaggle.com/datasets/akshaypawar7/millions-of-movies).

Prepare

library(dplyr)
library(janitor)
library(tidyverse)
library(ggplot2) 
library(lubridate)
library(tidyr)
library(data.table)
library(scales)
library(grid)
library(gridExtra)
library(gghighlight)
library(stringr)

Library yang dibutuhkan untuk membantu dalam analisis

Import File

# Read the raw export; empty strings become NA, and janitor::clean_names()
# normalises the column names.
movies <- clean_names(read.csv("movies.csv", na.strings = ""))
glimpse(movies)
## Rows: 739,048
## Columns: 20
## $ id                   <int> 663712, 732459, 436270, 675054, 420634, 956101, 5…
## $ title                <chr> "Terrifier 2", "Blade of the 47 Ronin", "Black Ad…
## $ genres               <chr> "Horror-Thriller", "Action-Fantasy", "Action-Fant…
## $ original_language    <chr> "en", "en", "en", "es", "en", "la", "en", "zh", "…
## $ overview             <chr> "After being resurrected by a sinister entity Art…
## $ popularity           <dbl> 4608.567, 3821.739, 3772.253, 3401.452, 2352.470,…
## $ production_companies <chr> "Bloody Disgusting-Dark Age Cinema-Fuzz on the Le…
## $ release_date         <chr> "2022-10-06", "2022-10-25", "2022-10-19", "2022-1…
## $ budget               <dbl> 2.5e+05, 0.0e+00, 2.0e+08, 0.0e+00, 1.0e+05, 0.0e…
## $ revenue              <dbl> 10155347, 0, 319000000, 0, 0, 0, 0, 626203271, 28…
## $ runtime              <dbl> 138, 106, 125, 93, 84, 0, 98, 149, 88, 107, 105, …
## $ status               <chr> "Released", "Released", "Released", "Released", "…
## $ tagline              <chr> "Who's laughing now?", NA, "The world needed a he…
## $ vote_average         <dbl> 7.095, 6.691, 6.854, 7.415, 6.555, 4.600, 5.600, …
## $ vote_count           <dbl> 496, 47, 939, 53, 908, 10, 132, 23, 432, 1011, 10…
## $ credits              <chr> "Lauren LaVera-David Howard Thornton-Elliott Full…
## $ keywords             <chr> "clown-halloween-resurrection-sequel-gore-slasher…
## $ poster_path          <chr> "/wRKHUqYGrp3PO91mZVQ18xlwYzW.jpg", "/kjFDIlUCJkc…
## $ backdrop_path        <chr> "/y5Z0WesTjvn59jP6yo459eUsbli.jpg", "/pGx6O6IwqAD…
## $ recommendations      <chr> "436270-732459-928123-575322-675054-420634-642721…

Proses

# Convert the release date strings to Date values (year-month-day order).
movies <- mutate(movies, release_date = ymd(release_date))

mengubah tipe data menjadi date menggunakan library lubridate

# List the distinct values found in the status column.
unique(movies[["status"]])
## [1] "Released"        "Post Production" "In Production"   "Planned"        
## [5] "Canceled"        "Rumored"

melihat apa saja yang ada di kolom status menggunakan unique

Filter Data by “Year >= 2000 <=2022” and “Status == Released”

# Drop the columns that are not analysed, derive the release year, and keep
# only released titles from 2000 through 2022.
movies <- movies %>%
  select(-c(keywords, credits, poster_path, backdrop_path)) %>%
  mutate(year = year(release_date)) %>%
  filter(year >= 2000, year <= 2022, status == "Released")

# `popularity` is already read as a double, so it is left untouched. The
# earlier conversion removed the decimal point from its text form, which
# rescaled each value by a different power of ten (2352.47 became 235247 while
# 4608.567 became 4608567) and scrambled the popularity ranking.

# Remove one record flagged as not relevant to the analysis. The id column is
# an integer, so it is compared against a number rather than a string.
movies <- subset(movies, id != 168626)
nrow(movies)
## [1] 429160

Checking Duplicate Data

# Number of rows whose id has already appeared earlier in the table.
sum(duplicated(movies[["id"]]))
## [1] 54182

Removing Duplicate Data by “id”

# Keep the first row for every id, retaining all other columns.
movies <- distinct(movies, id, .keep_all = TRUE)
nrow(movies)
## [1] 374978

Setelah data duplikat dihapus, terdapat 374978 baris.

Split Genre by “-”

# Split the hyphen-separated `genres` string and keep only the first two
# entries. `keep` and `names` guarantee exactly two named columns even when a
# title lists more genres; previously the surplus columns were left with NA
# names after assigning only two column names.
movies_genre <- as.data.frame(
  tstrsplit(
    movies$genres, "-",
    fixed = TRUE,
    keep = 1:2,
    names = c("genre1", "genre2")
  ),
  stringsAsFactors = FALSE
)

# Assign by column name instead of cbind() so re-running this chunk overwrites
# the genre columns rather than appending duplicates.
movies[c("genre1", "genre2")] <- movies_genre


# Inspect the two derived genre columns.
str(movies[["genre1"]])
##  chr [1:374978] "Horror" "Action" "Action" "Horror" "Horror" "Thriller" ...
str(movies[["genre2"]])
##  chr [1:374978] "Thriller" "Fantasy" "Fantasy" "Action" "Thriller" NA NA ...

Analyze

Movies by Genre 1 & Genre 2

# Horizontal bar chart of the number of titles per value of one genre column.
#   data       data frame containing the genre column
#   genre_col  unquoted genre column; rows where it is NA are dropped
#   plot_title title shown above the chart
plot_genre_totals <- function(data, genre_col, plot_title) {
  data %>%
    select({{ genre_col }}) %>%
    drop_na() %>%
    count({{ genre_col }}, name = "total") %>%
    ggplot(aes(x = reorder({{ genre_col }}, total), y = total, fill = total)) +
    geom_col() +
    coord_flip() +
    scale_fill_gradient2(mid = "purple", high = "black") +
    labs(
      x = "Genres",
      y = " Total",
      title = plot_title
    ) +
    theme_minimal() +
    theme(legend.position = "none")
}

genre1 <- plot_genre_totals(movies, genre1, "Movie by Genre 1")
genre2 <- plot_genre_totals(movies_genre, genre2, "Movie by Genre 2")

# Show both charts side by side.
grid.arrange(genre1, genre2, ncol = 2)

Released Movies by Genre, Year

# Titles released per year for each primary genre, drawn as one line per
# genre; gghighlight() is applied with the `total >= 1000` predicate.
movies %>%
  filter(!is.na(genre1)) %>%
  group_by(year, genre1) %>%
  summarise(total = n(), .groups = "drop_last") %>%
  ggplot(aes(x = year, y = total)) +
  geom_line(aes(colour = genre1), linewidth = 2) +
  labs(
    x = "Year",
    y = "Total Movies",
    title = "Movies By Genre 1 From 2000 to 2022 more than 1000 in a year"
  ) +
  theme_minimal() +
  gghighlight(total >= 1000, label_key = genre1)

vote_average by genre

# Mean rating per genre value and release year.
# Titles with a vote_average of 0 and rows without the genre are excluded
# before averaging. Without this aggregation geom_tile() received one row per
# title, so tiles were drawn on top of each other and every cell showed a
# single arbitrary title's rating instead of an average.
mean_vote_by_genre_year <- function(data, genre_col) {
  data %>%
    filter(vote_average != 0) %>%
    drop_na({{ genre_col }}) %>%
    group_by({{ genre_col }}, year) %>%
    summarise(vote_average = mean(vote_average), .groups = "drop")
}

# Year-by-genre heatmap of the mean rating.
#   votes       output of mean_vote_by_genre_year()
#   genre_col   unquoted genre column in `votes`
#   genre_label axis label, also used in the plot title
#   fill_limits numeric length-2 range for the colour scale
plot_vote_heatmap <- function(votes, genre_col, genre_label, fill_limits) {
  ggplot(votes, aes(x = year, y = {{ genre_col }}, fill = vote_average)) +
    geom_tile() +
    scale_fill_gradient(low = "white", high = "purple", limits = fill_limits) +
    labs(
      x = "Year",
      y = genre_label,
      title = paste("Vote Average by", genre_label)
    ) +
    theme_minimal()
}

votes_genre1 <- mean_vote_by_genre_year(movies, genre1)
votes_genre2 <- mean_vote_by_genre_year(movies, genre2)

# Only one legend is displayed for both panels, so both must use the same
# colour limits for that legend to be valid for each of them.
vote_fill_limits <- range(votes_genre1$vote_average, votes_genre2$vote_average)

vote_genre <- plot_vote_heatmap(
  votes_genre1, genre1, "Genre 1", vote_fill_limits
)
vote_genre2 <- plot_vote_heatmap(
  votes_genre2, genre2, "Genre 2", vote_fill_limits
)

# "none" is the documented way to suppress a legend; 'hidden' is not a
# documented legend.position value.
grid.arrange(
  vote_genre + theme(legend.position = "none"),
  vote_genre2 + theme(legend.position = "top"),
  ncol = 2
)

Overview Movies by genre1 “Documentary” with wordcloud

karena movies berdasarkan genre1 yang terbanyak adalah “Documentary” jadi pengambilan data berdasarkan genre1 “Documentary” menggunakan filter

library(tidytext)
library(textclean)


# Overviews of the titles whose primary genre is "Documentary".
# drop_NA() is called with its defaults -- presumably textclean's helper for
# discarding rows with missing text; confirm which package provides it.
documentary <- movies %>%
  filter(genre1 == "Documentary") %>%
  select(genre1, overview) %>%
  drop_NA()

overviews <- documentary$overview

# Text cleaning, innermost call first:
#   1. lower-case everything
#   2. expand contractions (i'm -> i am)
#   3. shorten elongated words (filmmm -> film)
#   4. strip all non-word characters
overviews <- strip(
  replace_word_elongation(
    replace_contraction(
      str_to_lower(overviews)
    )
  )
)

# Tokenise into one word per row, count occurrences (most frequent first),
# then remove stop words (a, is, of, the, ...).
documentary <- enframe(overviews, name = NULL, value = "word") %>%
  unnest_tokens(word, word) %>%
  count(word, sort = TRUE) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
#creating wordcloud2 viz

library(wordcloud2)
# Keep the 200 most frequent words.
fil_doc <- head(arrange(documentary, desc(n)), 200)

# Render the word cloud on a black background.
wordcloud2(
  fil_doc,
  size = .6,
  color = "random-light",
  backgroundColor = "black",
  fontWeight = "bold",
  shape = "pentagon"
)

Top 10 Documentary movies by popularity

# Ten Documentary titles with the highest popularity, as horizontal bars.
movies %>%
  filter(genre1 == "Documentary") %>%
  select(title, popularity, genre1) %>%
  arrange(desc(popularity)) %>%
  head(10) %>%
  # Wrap long titles so the axis labels stay readable.
  mutate(title = str_wrap(title, width = 20)) %>%
  ggplot(aes(x = reorder(title, popularity), y = popularity, fill = title)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_comma()) +
  labs(
    x = "Title",
    y = "Popularity",
    title = "TOP 10 Genre Documentary by Popularity"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Ten Documentary titles with the highest revenue, as horizontal bars shaded
# by revenue.
movies %>%
  select(title, revenue, genre1) %>%
  filter(genre1 == "Documentary") %>%
  arrange(desc(revenue)) %>%
  slice(1:10) %>%
  # Wrap long titles so the axis labels stay readable.
  mutate(title = str_wrap(title, width = 20)) %>%
  ggplot(aes(reorder(title, revenue), revenue, fill = revenue)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  # Argument spelled out in full; `label =` only worked through partial
  # argument matching.
  scale_y_continuous(labels = label_comma()) +
  scale_fill_gradient(low = "purple", high = "black") +
  labs(
    x = "Title",
    y = "Revenue $",
    title = "TOP 10 Genre Documentary by Revenue"
  )

# Ten Documentary titles with the largest budget, as horizontal bars shaded
# by budget.
movies %>%
  filter(genre1 == "Documentary") %>%
  select(title, budget, genre1) %>%
  arrange(desc(budget)) %>%
  head(10) %>%
  # Wrap long titles so the axis labels stay readable.
  mutate(title = str_wrap(title, width = 20)) %>%
  ggplot(aes(x = reorder(title, budget), y = budget, fill = budget)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_comma()) +
  scale_fill_gradient(low = "purple", high = "black") +
  labs(
    x = "Title",
    y = "Budget $",
    title = "TOP 10 Genre Documentary by Budget"
  )

# Ten most common original languages by number of titles.
movies %>%
  count(original_language, name = "total") %>%
  arrange(desc(total)) %>%
  head(10) %>%
  ggplot(aes(x = reorder(original_language, total), y = total, fill = total)) +
  geom_col() +
  coord_flip() +
  # NOTE(review): scale_fill_gradient2() diverges around a midpoint (0 by
  # default), so with positive counts the `low` colour is never reached --
  # confirm whether scale_fill_gradient() was intended.
  scale_fill_gradient2(low = "black", high = "purple") +
  labs(
    x = "Original Language",
    y = "Total",
    title = "TOP 10 movies by original language"
  ) +
  theme_minimal()

# Yearly mean revenue and mean budget, drawn as two lines.
# NOTE(review): rows with revenue or budget equal to 0 are included in the
# means; if 0 stands for "unknown" in this dataset they pull the averages
# down -- confirm and filter if so.
movies %>%
  group_by(year) %>%
  summarise(
    # na.rm keeps one missing value from turning a whole year into NA.
    revenue = mean(revenue, na.rm = TRUE),
    budget = mean(budget, na.rm = TRUE)
  ) %>%
  # Reshape to long form so a single geom_line() can colour by variable.
  pivot_longer(
    cols = c(revenue, budget),
    names_to = "Variable",
    values_to = "Value"
  ) %>%
  ggplot(aes(x = year, y = Value, color = Variable)) +
  geom_line() +
  scale_y_continuous(labels = label_comma()) +
  # Breaks are listed explicitly so each label is tied to its series instead
  # of relying on the default alphabetical ordering of the breaks.
  scale_color_manual(
    breaks = c("budget", "revenue"),
    values = c("budget" = "green", "revenue" = "red"),
    labels = c("Budget", "Revenue")
  ) +
  # The plotted values are yearly means, not totals.
  labs(
    x = "Year",
    y = "Average $",
    title = "Revenue VS Budget"
  ) +
  theme_minimal()

Correlation between Revenue, Budget, Popularity, Vote_average, Runtime

# Numeric columns used for the correlation matrix, restricted to titles with
# positive revenue and budget and a non-missing runtime.
cor_data <- movies %>%
  filter(revenue > 0, budget > 0, !is.na(runtime)) %>%
  select(revenue, budget, popularity, vote_average, runtime)

# Panel function for pairs(): prints the absolute correlation of x and y in
# the centre of the panel, with text size growing with the correlation.
#   x, y     numeric vectors supplied by pairs()
#   digits   significant digits used when formatting the coefficient
#   prefix   text prepended to the printed coefficient
#   cex.cor  optional scaling factor; when missing it is derived from the
#            width of the printed text
#   ...      accepted for compatibility with pairs(); not used
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
  # Switch to a unit coordinate system so the text can be centred at
  # (0.5, 0.5). par() returns the previous settings as a list, which is what
  # par() needs to restore them; passing the bare numeric `usr` vector back
  # only raised a warning and left the panel coordinates changed.
  old_par <- par(usr = c(0, 1, 0, 1))
  on.exit(par(old_par), add = TRUE)

  # Absolute value: only the strength of the relationship is shown. Pairs
  # with a missing value are ignored instead of turning the result into NA.
  Cor <- abs(cor(x, y, use = "pairwise.complete.obs"))
  txt <- paste0(prefix, format(c(Cor, 0.123456789), digits = digits)[1])
  if (missing(cex.cor)) {
    cex.cor <- 0.4 / strwidth(txt)
  }
  # Resize the text by level of correlation.
  text(0.5, 0.5, txt, cex = 1 + cex.cor * Cor)
}

# Scatterplot matrix of cor_data: smoothed scatterplots below the diagonal,
# correlation coefficients above it.
pairs(
  x = cor_data,
  lower.panel = panel.smooth,
  upper.panel = panel.cor
)