superstoore

About Dataset

https://www.kaggle.com/datasets/vivek468/superstore-dataset-final

Metadata
Row ID => Unique ID for each row.
Order ID => Unique Order ID for each Customer.
Order Date => Order Date of the product.
Ship Date => Shipping Date of the Product.
Ship Mode=> Shipping Mode specified by the Customer.
Customer ID => Unique ID to identify each Customer.
Customer Name => Name of the Customer.
Segment => The segment where the Customer belongs.
Country => Country of residence of the Customer.
City => City of residence of of the Customer.
State => State of residence of the Customer.
Postal Code => Postal Code of every Customer.
Region => Region where the Customer belong.
Product ID => Unique ID of the Product.
Category => Category of the product ordered.
Sub-Category => Sub-Category of the product ordered.
Product Name => Name of the Product
Sales => Sales of the Product.
Quantity => Quantity of the Product.
Discount => Discount provided.
Profit => Profit/Loss incurred.

Prepare

library(dplyr)
library(janitor)
library(tidyverse)

Read Data

superstore <- read_csv("Sample - Superstore.csv") %>% clean_names()
superstore <- superstore %>% select(-c(row_id))

Changing Data type of date using lubridate

library(lubridate)
superstore$order_date <- mdy(superstore$order_date)
superstore$ship_date <-  mdy(superstore$ship_date)

Cleaning

# Checking Duplicate Value
sum(duplicated(superstore))

## [1] 1

# Find Index Duplicate Value
which(duplicated(superstore))

## [1] 3407

# Find rows 3407 and remove it
superstore[3407,]

## # A tibble: 1 × 20
##   order_id     order_date ship_date  ship_mode customer_id customer_name segment
##   <chr>        <date>     <date>     <chr>     <chr>       <chr>         <chr>  
## 1 US-2014-150… 2014-04-23 2014-04-27 Standard… LB-16795    Laurel Beltr… Home O…
## # ℹ 13 more variables: country <chr>, city <chr>, state <chr>,
## #   postal_code <chr>, region <chr>, product_id <chr>, category <chr>,
## #   sub_category <chr>, product_name <chr>, sales <dbl>, quantity <dbl>,
## #   discount <dbl>, profit <dbl>

superstore <- superstore[-3407,]

check na values

sapply(superstore, function(x)sum(is.na(x)))

##      order_id    order_date     ship_date     ship_mode   customer_id 
##             0             0             0             0             0 
## customer_name       segment       country          city         state 
##             0             0             0             0             0 
##   postal_code        region    product_id      category  sub_category 
##             0             0             0             0             0 
##  product_name         sales      quantity      discount        profit 
##             0             0             0             0             0

Quick Summary

summary(superstore)

##    order_id           order_date           ship_date         
##  Length:9993        Min.   :2014-01-03   Min.   :2014-01-07  
##  Class :character   1st Qu.:2015-05-23   1st Qu.:2015-05-27  
##  Mode  :character   Median :2016-06-26   Median :2016-06-29  
##                     Mean   :2016-04-30   Mean   :2016-05-04  
##                     3rd Qu.:2017-05-14   3rd Qu.:2017-05-18  
##                     Max.   :2017-12-30   Max.   :2018-01-05  
##   ship_mode         customer_id        customer_name        segment         
##  Length:9993        Length:9993        Length:9993        Length:9993       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    country              city              state           postal_code       
##  Length:9993        Length:9993        Length:9993        Length:9993       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     region           product_id          category         sub_category      
##  Length:9993        Length:9993        Length:9993        Length:9993       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  product_name           sales              quantity        discount     
##  Length:9993        Min.   :    0.444   Min.   : 1.00   Min.   :0.0000  
##  Class :character   1st Qu.:   17.280   1st Qu.: 2.00   1st Qu.:0.0000  
##  Mode  :character   Median :   54.480   Median : 3.00   Median :0.2000  
##                     Mean   :  229.853   Mean   : 3.79   Mean   :0.1562  
##                     3rd Qu.:  209.940   3rd Qu.: 5.00   3rd Qu.:0.2000  
##                     Max.   :22638.480   Max.   :14.00   Max.   :0.8000  
##      profit         
##  Min.   :-6599.978  
##  1st Qu.:    1.731  
##  Median :    8.671  
##  Mean   :   28.661  
##  3rd Qu.:   29.364  
##  Max.   : 8399.976

Task ~ Sales Analyze

superstore %>%  mutate(year = year(order_date)) %>%
  group_by(year) %>% 
  summarise(sales_year = sum(sales)) %>% 
  mutate(change = (sales_year/lag(sales_year))-1, 
         change_percent = scales::percent(change)) %>% 
  select(-c(change)) %>% 
  ggplot(aes(year, sales_year))+ 
  geom_line(color = "blue", linewidth = 1)+
  geom_point(color = "blue")+
  scale_y_continuous(labels = scales::dollar)+
  geom_label(aes(label = change_percent))+
  labs(x = "Year",
       y = "Sales", 
       title = "Sales From 2014 to 2017")

superstore %>% select(order_date, sales) %>% 
  mutate(month = month(order_date, label = TRUE),
         year = year(order_date)) %>%
  group_by(year, month) %>% 
  summarise(total = sum(sales)) %>% 
  ggplot(aes(month, total, group = year))+
  geom_line(color = "red", linewidth = 1)+
  geom_point()+
  facet_wrap(~year)+
  scale_y_continuous(labels = scales::dollar)+
  labs(title= "Sales From 2014-2017 by Month")

Does the discount given and the season have an impact on sales?

# adding variable season, discounted

   s_diag <-  superstore %>% select(order_date, sales, profit, quantity, discount) %>% 
  mutate(season = case_when(
    month(order_date) %in% 3:5~"Spring",
    month(order_date) %in% 6:8~"Summer",
    month(order_date) %in% 9:11~"Fall",
    TRUE ~"Winter"),
    holiday = ifelse(weekdays(order_date)%in%c("Saturday", "Sunday"), "Weekend", "Weekday"),
    discounted = ifelse(discount>0, "Yes", "No"))
head(s_diag)

## # A tibble: 6 × 8
##   order_date sales  profit quantity discount season holiday discounted
##   <date>     <dbl>   <dbl>    <dbl>    <dbl> <chr>  <chr>   <chr>     
## 1 2016-11-08 262.    41.9         2     0    Fall   Weekday No        
## 2 2016-11-08 732.   220.          3     0    Fall   Weekday No        
## 3 2016-06-12  14.6    6.87        2     0    Summer Weekend No        
## 4 2015-10-11 958.  -383.          5     0.45 Fall   Weekend Yes       
## 5 2015-10-11  22.4    2.52        2     0.2  Fall   Weekend Yes       
## 6 2014-06-09  48.9   14.2         7     0    Summer Weekday No

 s_diag %>%
   mutate(month = month(order_date, label = TRUE),
          year = year(order_date)) %>%
   group_by(year, month, season) %>%
   summarise (total = sum(sales),
              discounted = sum(discounted == "Yes")) %>%
   ggplot(aes(month, total, fill = season))+
   geom_bar(stat='identity')+
   facet_wrap(~year)+
   geom_text(aes(label = discounted), position = position_stack(vjust = 0.5))+
   scale_y_continuous(labels = scales::dollar)+
   theme(legend.position = "bottom")+
   labs(title = "Total Sales in each year and total discount given")

discounts do not affect sales value, but after entering the end of the year during the fall season customers tend to make purchases at the beginning, late fall, and end of the year (early winter). the total discount given is also more than other seasons. in early spring, sales are always increasing in every year from the previous month

 s_diag %>%
   mutate(month = month(order_date, label = TRUE)) %>%
   group_by(month, holiday, season) %>%
   summarise (avg_sales = mean(sales)) %>%
   ggplot(aes(month, avg_sales, fill = holiday)) +
   geom_bar(stat='identity', position = position_dodge())+
   scale_y_continuous(labels = scales::dollar)+
   labs(title = "Avg_sales and type of days")+
   theme(legend.position = "bottom")

From the graph, we know that holidays didn’t affect sales. Companies don’t need to make a decision about whether to provide discounts on weekdays or weekends. However, for the 4 months (mar, sep, nov, dec), we may consider providing a discount or promotion, either on weekdays or weekends.

Task ~ Analyze Customer Segments

superstore %>% 
  group_by(segment) %>% 
  summarise(total_sales = sum(sales),
            total_profit = sum(profit),
            total_orders = n()) %>% 
  mutate_at(vars(total_sales, total_profit), scales::dollar)

## # A tibble: 3 × 4
##   segment     total_sales total_profit total_orders
##   <chr>       <chr>       <chr>               <int>
## 1 Consumer    $1,161,401  $134,119             5191
## 2 Corporate   $706,146    $91,979              3020
## 3 Home Office $429,372    $60,311              1782

superstore %>% 
  mutate(year = year(order_date)) %>% 
  group_by(year, segment) %>% 
  summarise(total_sales = sum(sales), group = 1) %>% 
  ggplot(aes(year, total_sales, fill = segment))+ 
  geom_bar(stat = 'identity')+
  scale_y_continuous(labels = scales::dollar)+
  theme_minimal()+
  facet_wrap(~segment)+
  theme(legend.position = "none")

From the table, we know that Segment of consumer provide the most sales and profit

Region and State

#Checking Unique Region 

unique(superstore$region)

## [1] "South"   "West"    "Central" "East"

#Checking Unique State

unique(superstore$state)

##  [1] "Kentucky"             "California"           "Florida"             
##  [4] "North Carolina"       "Washington"           "Texas"               
##  [7] "Wisconsin"            "Utah"                 "Nebraska"            
## [10] "Pennsylvania"         "Illinois"             "Minnesota"           
## [13] "Michigan"             "Delaware"             "Indiana"             
## [16] "New York"             "Arizona"              "Virginia"            
## [19] "Tennessee"            "Alabama"              "South Carolina"      
## [22] "Oregon"               "Colorado"             "Iowa"                
## [25] "Ohio"                 "Missouri"             "Oklahoma"            
## [28] "New Mexico"           "Louisiana"            "Connecticut"         
## [31] "New Jersey"           "Massachusetts"        "Georgia"             
## [34] "Nevada"               "Rhode Island"         "Mississippi"         
## [37] "Arkansas"             "Montana"              "New Hampshire"       
## [40] "Maryland"             "District of Columbia" "Kansas"              
## [43] "Vermont"              "Maine"                "South Dakota"        
## [46] "Idaho"                "North Dakota"         "Wyoming"             
## [49] "West Virginia"

Which region the most sales?

superstore %>% group_by(region) %>% 
  summarise(total_sales = sum(sales)) %>% 
  arrange(desc(total_sales)) %>% 
  mutate_at(vars(total_sales), scales::dollar)

## # A tibble: 4 × 2
##   region  total_sales
##   <chr>   <chr>      
## 1 West    $725,458   
## 2 East    $678,500   
## 3 Central $501,240   
## 4 South   $391,722

TOP 3 Total Sales In State By Region

superstore %>% 
group_by(region,state) %>% 
  summarise(total_sales = sum(sales)) %>% 
  arrange(desc(total_sales)) %>% 
  slice(1:3) %>%
  ggplot(aes(reorder(state, -total_sales), total_sales, fill = region))+
  geom_bar(stat = 'identity')+
  labs(title = 'Top 3 Sales In State By Region',
       x = 'state')+
  scale_y_continuous(labels = scales::dollar)+
  facet_wrap(~region, scales = "free_x")+
  theme(legend.position = "none")

## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.

TOP 10 Sales by City

superstore %>% 
  group_by(city) %>% 
  summarise(total_sales = sum(sales)) %>% 
  arrange(desc(total_sales)) %>%
  slice(1:10) %>% 
  ggplot(aes(reorder(city, total_sales), total_sales, fill = 'red'))+
  geom_bar(stat = 'identity')+
  coord_flip()+
  scale_y_continuous(labels = scales::dollar)+
  theme(legend.position = "none")+
  labs(x = "City")

How many products are purchased in each segment?

superstore %>% mutate(year = year(order_date)) %>% 
  group_by(year, segment) %>% 
  summarise(total_qty = sum(quantity)) %>% 
  ggplot(aes(year, total_qty, fill = segment))+
  geom_bar(stat = 'identity', position = position_dodge())+
  labs(x = "")

Task ~ Produk Analyze

p_quantity <- 
  superstore %>% 
  group_by (product_name) %>% 
  summarise (total_quantity = sum(quantity)) %>% 
  arrange(desc(total_quantity)) %>% 
  slice(1:10) %>% 
  mutate(product_name = str_replace(product_name, "(^[^ ]* [^ ]* [^ ]*).*", "\\1"),
         product_name = str_wrap(product_name, width = 15)) %>% 
  # "(^[^ ]* [^ ]* [^ ]*).*" menunjukkan pola tiga kata pertama (di mana ^ menunjukkan awal string, 
  # [^ ]* menunjukkan setiap karakter kecuali spasi, dan [ ] menunjukkan spasi)
  # diikuti oleh karakter apa pun (.*). "\\1" menunjukkan pengembalian dari tiga kata pertama.
  # str_wrap digunakan untuk membungkus kata dengan lebar yang telah ditentukan
  ggplot(aes(reorder(product_name,total_quantity), total_quantity, fill=total_quantity))+
  geom_bar(stat = 'identity')+
  coord_flip()+
  scale_fill_gradient2(mid="purple", high = "black")+
  labs(x = "Product Name",
       y = "Total_Quantity")+
  theme_minimal()+theme(legend.position = "none")


p_sales <- 
  superstore  %>% 
  group_by(product_name) %>% 
  summarise (total_sales = sum(sales)) %>% 
  arrange(desc(total_sales)) %>% 
  slice(1:10) %>% 
  mutate(product_name = str_replace(product_name, "(^[^ ]* [^ ]* [^ ]*).*", "\\1"),
         product_name = str_wrap(product_name, width = 15)) %>%
  ggplot(aes(reorder(product_name, total_sales), total_sales, fill = total_sales))+
  geom_bar(stat = 'identity')+
  coord_flip()+
  scale_fill_gradient2(mid = "purple", high = "black")+
  labs(x = "Product Name",
       y = "Total_Sales")+
  theme_minimal()+theme(legend.position = "none")+
  scale_y_continuous(labels = scales::dollar)


library(grid)
library(gridExtra)
grid.arrange(p_quantity, p_sales, ncol=2)

charts showing top 10 product based on quantity and sales

Top 10 Product by Loss and Profitable

superstore %>% select(product_name, sales, profit, discount) %>% 
  mutate(product_name = str_replace(product_name, "(^[^ ]* [^ ]* [^ ]*).*", "\\1"))%>% 
  arrange(profit) %>% 
  slice(1:10) %>% 
  mutate(sales = scales::dollar(sales),
         profit = scales::dollar(profit),
         discount = paste0(round(discount*100), "%"))

## # A tibble: 10 × 4
##    product_name                sales      profit     discount
##    <chr>                       <chr>      <chr>      <chr>   
##  1 Cubify CubeX 3D             $4,499.98  -$6,599.98 70%     
##  2 Cubify CubeX 3D             $7,999.98  -$3,839.99 50%     
##  3 GBC DocuBind P400           $2,177.58  -$3,701.89 80%     
##  4 Lexmark MX611dhe Monochrome $2,549.98  -$3,399.98 70%     
##  5 Ibico EPK-21 Electric       $1,889.99  -$2,929.48 80%     
##  6 Cubify CubeX 3D             $1,799.99  -$2,639.99 70%     
##  7 Fellowes PB500 Electric     $1,525.19  -$2,287.78 80%     
##  8 Chromcraft Bull-Nose Wood   $4,297.64  -$1,862.31 40%     
##  9 GBC DocuBind P400           $1,088.79  -$1,850.95 80%     
## 10 Cisco TelePresence System   $22,638.48 -$1,811.08 50%

superstore %>% select(product_name, sales, profit, discount) %>% 
  mutate(product_name = str_replace(product_name, "(^[^ ]* [^ ]* [^ ]*).*", "\\1"))%>% 
  arrange(desc(profit)) %>% 
  slice(1:10) %>% 
  mutate(sales = scales::dollar(sales),
         profit = scales::dollar(profit),
         discount = paste0(round(discount*100), "%"))

## # A tibble: 10 × 4
##    product_name             sales      profit    discount
##    <chr>                    <chr>      <chr>     <chr>   
##  1 Canon imageCLASS 2200    $17,499.95 $8,399.98 0%      
##  2 Canon imageCLASS 2200    $13,999.96 $6,719.98 0%      
##  3 Canon imageCLASS 2200    $10,499.97 $5,039.99 0%      
##  4 GBC Ibimaster 500        $9,892.74  $4,946.37 0%      
##  5 Ibico EPK-21 Electric    $9,449.95  $4,630.48 0%      
##  6 Canon imageCLASS 2200    $11,199.97 $3,919.99 20%     
##  7 Fellowes PB500 Electric  $6,354.95  $3,177.48 0%      
##  8 HP Designjet T520        $8,749.95  $2,799.98 0%      
##  9 Hewlett Packard LaserJet $5,399.91  $2,591.96 0%      
## 10 GBC DocuBind P400        $5,443.96  $2,504.22 0%

superstoore

Rama

2023-04-02

Task ~ Sales Analyze

Task ~ Analyze Customer Segments

Task ~ Produk Analyze