About Dataset
https://www.kaggle.com/datasets/vivek468/superstore-dataset-final
Prepare
library(dplyr)
library(janitor)
library(tidyverse)
Read Data
superstore <- read_csv("Sample - Superstore.csv") %>% clean_names()
superstore <- superstore %>% select(-c(row_id))
Changing Data type of date using lubridate
library(lubridate)
superstore$order_date <- mdy(superstore$order_date)
superstore$ship_date <- mdy(superstore$ship_date)
Cleaning
# Checking Duplicate Value
sum(duplicated(superstore))
## [1] 1
# Find Index Duplicate Value
which(duplicated(superstore))
## [1] 3407
# Find rows 3407 and remove it
superstore[3407,]
## # A tibble: 1 × 20
## order_id order_date ship_date ship_mode customer_id customer_name segment
## <chr> <date> <date> <chr> <chr> <chr> <chr>
## 1 US-2014-150… 2014-04-23 2014-04-27 Standard… LB-16795 Laurel Beltr… Home O…
## # ℹ 13 more variables: country <chr>, city <chr>, state <chr>,
## # postal_code <chr>, region <chr>, product_id <chr>, category <chr>,
## # sub_category <chr>, product_name <chr>, sales <dbl>, quantity <dbl>,
## # discount <dbl>, profit <dbl>
superstore <- superstore[-3407,]
check na values
sapply(superstore, function(x)sum(is.na(x)))
## order_id order_date ship_date ship_mode customer_id
## 0 0 0 0 0
## customer_name segment country city state
## 0 0 0 0 0
## postal_code region product_id category sub_category
## 0 0 0 0 0
## product_name sales quantity discount profit
## 0 0 0 0 0
Quick Summary
summary(superstore)
## order_id order_date ship_date
## Length:9993 Min. :2014-01-03 Min. :2014-01-07
## Class :character 1st Qu.:2015-05-23 1st Qu.:2015-05-27
## Mode :character Median :2016-06-26 Median :2016-06-29
## Mean :2016-04-30 Mean :2016-05-04
## 3rd Qu.:2017-05-14 3rd Qu.:2017-05-18
## Max. :2017-12-30 Max. :2018-01-05
## ship_mode customer_id customer_name segment
## Length:9993 Length:9993 Length:9993 Length:9993
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## country city state postal_code
## Length:9993 Length:9993 Length:9993 Length:9993
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## region product_id category sub_category
## Length:9993 Length:9993 Length:9993 Length:9993
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## product_name sales quantity discount
## Length:9993 Min. : 0.444 Min. : 1.00 Min. :0.0000
## Class :character 1st Qu.: 17.280 1st Qu.: 2.00 1st Qu.:0.0000
## Mode :character Median : 54.480 Median : 3.00 Median :0.2000
## Mean : 229.853 Mean : 3.79 Mean :0.1562
## 3rd Qu.: 209.940 3rd Qu.: 5.00 3rd Qu.:0.2000
## Max. :22638.480 Max. :14.00 Max. :0.8000
## profit
## Min. :-6599.978
## 1st Qu.: 1.731
## Median : 8.671
## Mean : 28.661
## 3rd Qu.: 29.364
## Max. : 8399.976
superstore %>% mutate(year = year(order_date)) %>%
group_by(year) %>%
summarise(sales_year = sum(sales)) %>%
mutate(change = (sales_year/lag(sales_year))-1,
change_percent = scales::percent(change)) %>%
select(-c(change)) %>%
ggplot(aes(year, sales_year))+
geom_line(color = "blue", linewidth = 1)+
geom_point(color = "blue")+
scale_y_continuous(labels = scales::dollar)+
geom_label(aes(label = change_percent))+
labs(x = "Year",
y = "Sales",
title = "Sales From 2014 to 2017")
superstore %>% select(order_date, sales) %>%
mutate(month = month(order_date, label = TRUE),
year = year(order_date)) %>%
group_by(year, month) %>%
summarise(total = sum(sales)) %>%
ggplot(aes(month, total, group = year))+
geom_line(color = "red", linewidth = 1)+
geom_point()+
facet_wrap(~year)+
scale_y_continuous(labels = scales::dollar)+
labs(title= "Sales From 2014-2017 by Month")
Does the discount given and the season have an impact on sales?
# adding variable season, discounted
s_diag <- superstore %>% select(order_date, sales, profit, quantity, discount) %>%
mutate(season = case_when(
month(order_date) %in% 3:5~"Spring",
month(order_date) %in% 6:8~"Summer",
month(order_date) %in% 9:11~"Fall",
TRUE ~"Winter"),
holiday = ifelse(weekdays(order_date)%in%c("Saturday", "Sunday"), "Weekend", "Weekday"),
discounted = ifelse(discount>0, "Yes", "No"))
head(s_diag)
## # A tibble: 6 × 8
## order_date sales profit quantity discount season holiday discounted
## <date> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 2016-11-08 262. 41.9 2 0 Fall Weekday No
## 2 2016-11-08 732. 220. 3 0 Fall Weekday No
## 3 2016-06-12 14.6 6.87 2 0 Summer Weekend No
## 4 2015-10-11 958. -383. 5 0.45 Fall Weekend Yes
## 5 2015-10-11 22.4 2.52 2 0.2 Fall Weekend Yes
## 6 2014-06-09 48.9 14.2 7 0 Summer Weekday No
s_diag %>%
mutate(month = month(order_date, label = TRUE),
year = year(order_date)) %>%
group_by(year, month, season) %>%
summarise (total = sum(sales),
discounted = sum(discounted == "Yes")) %>%
ggplot(aes(month, total, fill = season))+
geom_bar(stat='identity')+
facet_wrap(~year)+
geom_text(aes(label = discounted), position = position_stack(vjust = 0.5))+
scale_y_continuous(labels = scales::dollar)+
theme(legend.position = "bottom")+
labs(title = "Total Sales in each year and total discount given")
discounts do not affect sales value, but after entering the end of the year during the fall season customers tend to make purchases at the beginning, late fall, and end of the year (early winter). the total discount given is also more than other seasons. in early spring, sales are always increasing in every year from the previous month
s_diag %>%
mutate(month = month(order_date, label = TRUE)) %>%
group_by(month, holiday, season) %>%
summarise (avg_sales = mean(sales)) %>%
ggplot(aes(month, avg_sales, fill = holiday)) +
geom_bar(stat='identity', position = position_dodge())+
scale_y_continuous(labels = scales::dollar)+
labs(title = "Avg_sales and type of days")+
theme(legend.position = "bottom")
From the graph, we know that holidays didn’t affect sales. Companies don’t need to make a decision about whether to provide discounts on weekdays or weekends. However, for the 4 months (mar, sep, nov, dec), we may consider providing a discount or promotion, either on weekdays or weekends.
superstore %>%
group_by(segment) %>%
summarise(total_sales = sum(sales),
total_profit = sum(profit),
total_orders = n()) %>%
mutate_at(vars(total_sales, total_profit), scales::dollar)
## # A tibble: 3 × 4
## segment total_sales total_profit total_orders
## <chr> <chr> <chr> <int>
## 1 Consumer $1,161,401 $134,119 5191
## 2 Corporate $706,146 $91,979 3020
## 3 Home Office $429,372 $60,311 1782
superstore %>%
mutate(year = year(order_date)) %>%
group_by(year, segment) %>%
summarise(total_sales = sum(sales), group = 1) %>%
ggplot(aes(year, total_sales, fill = segment))+
geom_bar(stat = 'identity')+
scale_y_continuous(labels = scales::dollar)+
theme_minimal()+
facet_wrap(~segment)+
theme(legend.position = "none")
From the table, we know that Segment of consumer provide the most sales and profit
Region and State
#Checking Unique Region
unique(superstore$region)
## [1] "South" "West" "Central" "East"
#Checking Unique State
unique(superstore$state)
## [1] "Kentucky" "California" "Florida"
## [4] "North Carolina" "Washington" "Texas"
## [7] "Wisconsin" "Utah" "Nebraska"
## [10] "Pennsylvania" "Illinois" "Minnesota"
## [13] "Michigan" "Delaware" "Indiana"
## [16] "New York" "Arizona" "Virginia"
## [19] "Tennessee" "Alabama" "South Carolina"
## [22] "Oregon" "Colorado" "Iowa"
## [25] "Ohio" "Missouri" "Oklahoma"
## [28] "New Mexico" "Louisiana" "Connecticut"
## [31] "New Jersey" "Massachusetts" "Georgia"
## [34] "Nevada" "Rhode Island" "Mississippi"
## [37] "Arkansas" "Montana" "New Hampshire"
## [40] "Maryland" "District of Columbia" "Kansas"
## [43] "Vermont" "Maine" "South Dakota"
## [46] "Idaho" "North Dakota" "Wyoming"
## [49] "West Virginia"
Which region the most sales?
superstore %>% group_by(region) %>%
summarise(total_sales = sum(sales)) %>%
arrange(desc(total_sales)) %>%
mutate_at(vars(total_sales), scales::dollar)
## # A tibble: 4 × 2
## region total_sales
## <chr> <chr>
## 1 West $725,458
## 2 East $678,500
## 3 Central $501,240
## 4 South $391,722
TOP 3 Total Sales In State By Region
superstore %>%
group_by(region,state) %>%
summarise(total_sales = sum(sales)) %>%
arrange(desc(total_sales)) %>%
slice(1:3) %>%
ggplot(aes(reorder(state, -total_sales), total_sales, fill = region))+
geom_bar(stat = 'identity')+
labs(title = 'Top 3 Sales In State By Region',
x = 'state')+
scale_y_continuous(labels = scales::dollar)+
facet_wrap(~region, scales = "free_x")+
theme(legend.position = "none")
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
TOP 10 Sales by City
superstore %>%
group_by(city) %>%
summarise(total_sales = sum(sales)) %>%
arrange(desc(total_sales)) %>%
slice(1:10) %>%
ggplot(aes(reorder(city, total_sales), total_sales, fill = 'red'))+
geom_bar(stat = 'identity')+
coord_flip()+
scale_y_continuous(labels = scales::dollar)+
theme(legend.position = "none")+
labs(x = "City")
How many products are purchased in each segment?
superstore %>% mutate(year = year(order_date)) %>%
group_by(year, segment) %>%
summarise(total_qty = sum(quantity)) %>%
ggplot(aes(year, total_qty, fill = segment))+
geom_bar(stat = 'identity', position = position_dodge())+
labs(x = "")
p_quantity <-
superstore %>%
group_by (product_name) %>%
summarise (total_quantity = sum(quantity)) %>%
arrange(desc(total_quantity)) %>%
slice(1:10) %>%
mutate(product_name = str_replace(product_name, "(^[^ ]* [^ ]* [^ ]*).*", "\\1"),
product_name = str_wrap(product_name, width = 15)) %>%
# "(^[^ ]* [^ ]* [^ ]*).*" menunjukkan pola tiga kata pertama (di mana ^ menunjukkan awal string,
# [^ ]* menunjukkan setiap karakter kecuali spasi, dan [ ] menunjukkan spasi)
# diikuti oleh karakter apa pun (.*). "\\1" menunjukkan pengembalian dari tiga kata pertama.
# str_wrap digunakan untuk membungkus kata dengan lebar yang telah ditentukan
ggplot(aes(reorder(product_name,total_quantity), total_quantity, fill=total_quantity))+
geom_bar(stat = 'identity')+
coord_flip()+
scale_fill_gradient2(mid="purple", high = "black")+
labs(x = "Product Name",
y = "Total_Quantity")+
theme_minimal()+theme(legend.position = "none")
p_sales <-
superstore %>%
group_by(product_name) %>%
summarise (total_sales = sum(sales)) %>%
arrange(desc(total_sales)) %>%
slice(1:10) %>%
mutate(product_name = str_replace(product_name, "(^[^ ]* [^ ]* [^ ]*).*", "\\1"),
product_name = str_wrap(product_name, width = 15)) %>%
ggplot(aes(reorder(product_name, total_sales), total_sales, fill = total_sales))+
geom_bar(stat = 'identity')+
coord_flip()+
scale_fill_gradient2(mid = "purple", high = "black")+
labs(x = "Product Name",
y = "Total_Sales")+
theme_minimal()+theme(legend.position = "none")+
scale_y_continuous(labels = scales::dollar)
library(grid)
library(gridExtra)
grid.arrange(p_quantity, p_sales, ncol=2)
charts showing top 10 product based on quantity and sales
Top 10 Product by Loss and Profitable
superstore %>% select(product_name, sales, profit, discount) %>%
mutate(product_name = str_replace(product_name, "(^[^ ]* [^ ]* [^ ]*).*", "\\1"))%>%
arrange(profit) %>%
slice(1:10) %>%
mutate(sales = scales::dollar(sales),
profit = scales::dollar(profit),
discount = paste0(round(discount*100), "%"))
## # A tibble: 10 × 4
## product_name sales profit discount
## <chr> <chr> <chr> <chr>
## 1 Cubify CubeX 3D $4,499.98 -$6,599.98 70%
## 2 Cubify CubeX 3D $7,999.98 -$3,839.99 50%
## 3 GBC DocuBind P400 $2,177.58 -$3,701.89 80%
## 4 Lexmark MX611dhe Monochrome $2,549.98 -$3,399.98 70%
## 5 Ibico EPK-21 Electric $1,889.99 -$2,929.48 80%
## 6 Cubify CubeX 3D $1,799.99 -$2,639.99 70%
## 7 Fellowes PB500 Electric $1,525.19 -$2,287.78 80%
## 8 Chromcraft Bull-Nose Wood $4,297.64 -$1,862.31 40%
## 9 GBC DocuBind P400 $1,088.79 -$1,850.95 80%
## 10 Cisco TelePresence System $22,638.48 -$1,811.08 50%
superstore %>% select(product_name, sales, profit, discount) %>%
mutate(product_name = str_replace(product_name, "(^[^ ]* [^ ]* [^ ]*).*", "\\1"))%>%
arrange(desc(profit)) %>%
slice(1:10) %>%
mutate(sales = scales::dollar(sales),
profit = scales::dollar(profit),
discount = paste0(round(discount*100), "%"))
## # A tibble: 10 × 4
## product_name sales profit discount
## <chr> <chr> <chr> <chr>
## 1 Canon imageCLASS 2200 $17,499.95 $8,399.98 0%
## 2 Canon imageCLASS 2200 $13,999.96 $6,719.98 0%
## 3 Canon imageCLASS 2200 $10,499.97 $5,039.99 0%
## 4 GBC Ibimaster 500 $9,892.74 $4,946.37 0%
## 5 Ibico EPK-21 Electric $9,449.95 $4,630.48 0%
## 6 Canon imageCLASS 2200 $11,199.97 $3,919.99 20%
## 7 Fellowes PB500 Electric $6,354.95 $3,177.48 0%
## 8 HP Designjet T520 $8,749.95 $2,799.98 0%
## 9 Hewlett Packard LaserJet $5,399.91 $2,591.96 0%
## 10 GBC DocuBind P400 $5,443.96 $2,504.22 0%