Dataset

the data set -> https://www.kaggle.com/datasets/arashnic/fitbit

The goals

The main goal is to practise cleaning, processing, analysing and visualising Fitbit data, and to examine the activity of 33 users over one month. Dummy weight and height values are generated to derive a BMI category, and linear regression is used to predict calorie and distance values.

Load library

library(dplyr)
library(janitor)
library(lubridate)

import dataset

# Read each daily export and normalise its column names with clean_names().
steps <- clean_names(read.csv("dailySteps_merged.csv"))
calories <- clean_names(read.csv("dailyCalories_merged.csv"))
sleep <- clean_names(read.csv("sleepDay_merged.csv"))
intensities <- clean_names(read.csv("dailyIntensities_merged.csv"))
names(intensities)
##  [1] "id"                         "activity_day"              
##  [3] "sedentary_minutes"          "lightly_active_minutes"    
##  [5] "fairly_active_minutes"      "very_active_minutes"       
##  [7] "sedentary_active_distance"  "light_active_distance"     
##  [9] "moderately_active_distance" "very_active_distance"
# Total tracked time per day in hours (one decimal): the four intensity-minute
# columns are summed and divided by 60.
intensities <- intensities %>%
  mutate(
    total_active = round(
      (sedentary_minutes + fairly_active_minutes +
        lightly_active_minutes + very_active_minutes) / 60,
      digits = 1
    )
  )

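Create a new column total_active by summing sedentary_minutes, fairly_active_minutes, lightly_active_minutes and very_active_minutes, then converting the total from minutes to hours (rounded to one decimal). Because sedentary minutes are included, the value is the total tracked time per day rather than the time spent exercising.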

# Daily activity export, with column names normalised by clean_names().
activity <- clean_names(read.csv("dailyActivity_merged.csv"))
names(activity)
##  [1] "id"                         "activity_date"             
##  [3] "total_steps"                "total_distance"            
##  [5] "tracker_distance"           "logged_activities_distance"
##  [7] "very_active_distance"       "moderately_active_distance"
##  [9] "light_active_distance"      "sedentary_active_distance" 
## [11] "very_active_minutes"        "fairly_active_minutes"     
## [13] "lightly_active_minutes"     "sedentary_minutes"         
## [15] "calories"
# Keep only the join keys (id, activity_date) and the distance measure.
activity <- select(activity, id, activity_date, total_distance)

Showing All Column Names

# List the columns of every table to identify the keys used for joining.
names(activity)
## [1] "id"             "activity_date"  "total_distance"
names(steps)
## [1] "id"           "activity_day" "step_total"
names(intensities)
##  [1] "id"                         "activity_day"              
##  [3] "sedentary_minutes"          "lightly_active_minutes"    
##  [5] "fairly_active_minutes"      "very_active_minutes"       
##  [7] "sedentary_active_distance"  "light_active_distance"     
##  [9] "moderately_active_distance" "very_active_distance"      
## [11] "total_active"
names(sleep)
## [1] "id"                   "sleep_day"            "total_sleep_records" 
## [4] "total_minutes_asleep" "total_time_in_bed"
names(calories)
## [1] "id"           "activity_day" "calories"

view all column names for each table to perform table joins

Joining Table

# Attach steps, calories and total tracked hours to the activity table.
# These tables share the (id, day) key; at this point the day is still the raw
# character string read from the CSV files.
fitbit <- activity %>%
  left_join(steps, by = c("id" = "id", "activity_date" = "activity_day")) %>%
  left_join(calories, by = c("id" = "id", "activity_date" = "activity_day"))

intensities <- intensities %>% select(id, activity_day, total_active)

fitbit <- left_join(
  fitbit, intensities,
  by = c("id" = "id", "activity_date" = "activity_day")
)

# The sleep table stores its day with a time of day, so the keys must be
# parsed before joining. Both sides are reduced to a plain Date so that
# activity_date stays a Date after the join instead of becoming a date-time.
fitbit$activity_date <- mdy(fitbit$activity_date)

# Keep exactly one sleep record per user and day: a repeated (id, sleep_day)
# row would otherwise duplicate the matching activity day in the left join.
sleep <- sleep %>%
  select(-total_sleep_records) %>%
  mutate(sleep_day = as_date(mdy_hms(sleep_day))) %>%
  distinct(id, sleep_day, .keep_all = TRUE)

fitbit <- left_join(
  fitbit, sleep,
  by = c("id" = "id", "activity_date" = "sleep_day")
)

Create Dummy Values for Weight and Height (randomly generated)

# Build one row per user holding a weight band (min, max) and a random height.
id_values <- unique(fitbit$id)

# Lower and upper weight bounds, one pair per id, in the order of id_values.
min_values <- c(50,80,66,64,46,102,84,74,86,44,58,68,70,100,65,75,54,86,66,76,56,65,84,94,86,96,74,104,56,66,76,74,84)
max_values <- c(52,82,68,66,48,104,86,78,88,48,60,70,72,102,67,77,56,88,67,78,58,67,86,96,88,98,76,106,58,68,78,76,86)

# Fail early if the hand-written bounds do not line up with the ids:
# data.frame() would silently recycle a shorter vector whose length divides
# the number of ids, and runif() returns NaN when min > max.
stopifnot(
  length(min_values) == length(id_values),
  length(max_values) == length(id_values),
  all(min_values <= max_values)
)

# Fix the seed so the simulated heights (and any later draws from the same
# random stream) are reproducible from one run to the next.
set.seed(33)

# Random height between 155 and 189, one value per id.
height <- runif(length(id_values), min = 155, max = 189)

# Lookup table keyed by id; joined onto the master table afterwards.
df <- data.frame(
  id = id_values,
  min = min_values,
  max = max_values,
  height = height
)

Joining df with master table(fitbit)

# Copy each user's weight bounds and height onto every one of their rows.
fitbit <- left_join(fitbit, df, by = "id")

Generate Predefined Values

# Draw one weight per row, uniformly between that row's min and max bounds,
# then drop the two helper columns.
fitbit$weight <- runif(nrow(fitbit), min = fitbit$min, max = fitbit$max)
fitbit[c("min", "max")] <- NULL

Create the weight column with runif(n()): n() is the number of rows, so one random weight is drawn for every row, each between that row's min and max. This is needed because not every id has the same number of rows in the table.

Adding BMI and BMI Category

# Body-mass index: weight divided by squared height; height / 100 rescales the
# stored height before squaring.
fitbit$bmi <- fitbit$weight / (fitbit$height / 100)^2

# BMI category. right = FALSE makes each interval closed on the left,
# [lower, upper), so a BMI of exactly 18.5, 25 or 30 falls into the higher
# category instead of the lower one (cut() defaults to right-closed intervals).
fitbit$bmi_category <- cut(
  fitbit$bmi,
  breaks = c(-Inf, 18.5, 25, 30, Inf),
  labels = c("Underweight", "Normal weight", "Overweight", "Obesity"),
  right = FALSE
)

A BMI below 18.5 is underweight, from 18.5 to below 25 is normal weight, from 25 to below 30 is overweight, and 30 or above is obesity.

changing digits

# Round for readability: whole-number height, weight to two decimals,
# BMI to one decimal.
fitbit$height <- round(fitbit$height)
fitbit$weight <- round(fitbit$weight, 2)
fitbit$bmi <- round(fitbit$bmi, 1)

Round the columns so they are easier to read: height to whole numbers, weight to two decimal places, and BMI to one decimal place.

Changing data type for Step Total and Calories

# Store the step and calorie counts as doubles.
fitbit <- fitbit %>%
  mutate(across(c(step_total, calories), as.numeric))

Segmentation user

Activity of Users by App Usage in Hours

  # Label each user by their mean daily total_active hours:
  # up to 10 -> "low", up to 20 -> "moderate", anything else -> "high use".
  type_of_usage_device <- fitbit %>%
    group_by(id) %>%
    summarise(activity = mean(total_active)) %>%
    transmute(
      id,
      usage = case_when(
        activity <= 10 ~ "low",
        activity <= 20 ~ "moderate",
        TRUE ~ "high use"
      )
    ) %>%
    print(n = 5)
## # A tibble: 33 × 2
##           id usage   
##        <dbl> <chr>   
## 1 1503960366 moderate
## 2 1624580081 high use
## 3 1644430081 high use
## 4 1844505072 high use
## 5 1927972279 high use
## # ℹ 28 more rows

categorizes the average use of the application in hours, the value <=10 is low, <=20 is moderate, the rest is High Use.

Activity of Users by Average Steps

  # Label each user by their mean daily step count; the first matching rule
  # wins, and everything from 10000 steps upwards is "very active".
  user_type <- fitbit %>%
    group_by(id) %>%
    summarise(mean_steps = mean(step_total)) %>%
    transmute(
      id,
      user_type = case_when(
        mean_steps <= 5000 ~ "sendentary",
        mean_steps < 7500 ~ "lightly active",
        mean_steps < 10000 ~ "fairly active",
        TRUE ~ "very active"
      )
    ) %>%
    print(n = 5)
## # A tibble: 33 × 2
##           id user_type     
##        <dbl> <chr>         
## 1 1503960366 very active   
## 2 1624580081 lightly active
## 3 1644430081 lightly active
## 4 1844505072 sendentary    
## 5 1927972279 sendentary    
## # ℹ 28 more rows
## Attach both per-user labels (usage, user_type) to the master table (fitbit).
  fitbit <- fitbit %>%
    left_join(type_of_usage_device, by = "id") %>%
    left_join(user_type, by = "id")

categorizes the average step per user, the value <=5000 is sendentary, <7500 is lightly active, <10000 is fairly active, else very active.

Checking NA Values

  # Number of missing values in each column of the master table.
  vapply(fitbit, function(column) sum(is.na(column)), integer(1))
##                   id        activity_date       total_distance 
##                    0                    0                    0 
##           step_total             calories         total_active 
##                    0                    0                    0 
## total_minutes_asleep    total_time_in_bed               height 
##                  530                  530                    0 
##               weight                  bmi         bmi_category 
##                    0                    0                    0 
##                usage            user_type 
##                    0                    0

total_minutes_asleep and total_time_in_bed each have 530 NA values because sleep was not recorded for every user and day. The remaining data can still be used.

Analyze Phase

# Quick per-column summary of the joined table: quantiles and mean for numeric
# and date columns, level counts for the BMI factor, and NA counts.
summary(fitbit)
##        id            activity_date                   total_distance  
##  Min.   :1.504e+09   Min.   :2016-04-12 00:00:00.0   Min.   : 0.000  
##  1st Qu.:2.320e+09   1st Qu.:2016-04-19 00:00:00.0   1st Qu.: 2.620  
##  Median :4.445e+09   Median :2016-04-26 00:00:00.0   Median : 5.260  
##  Mean   :4.858e+09   Mean   :2016-04-26 07:21:18.9   Mean   : 5.503  
##  3rd Qu.:6.962e+09   3rd Qu.:2016-05-04 00:00:00.0   3rd Qu.: 7.720  
##  Max.   :8.878e+09   Max.   :2016-05-12 00:00:00.0   Max.   :28.030  
##                                                                      
##    step_total       calories     total_active  total_minutes_asleep
##  Min.   :    0   Min.   :   0   Min.   : 0.0   Min.   : 58.0       
##  1st Qu.: 3795   1st Qu.:1830   1st Qu.:16.5   1st Qu.:361.0       
##  Median : 7439   Median :2140   Median :24.0   Median :433.0       
##  Mean   : 7652   Mean   :2308   Mean   :20.3   Mean   :419.5       
##  3rd Qu.:10734   3rd Qu.:2796   3rd Qu.:24.0   3rd Qu.:490.0       
##  Max.   :36019   Max.   :4900   Max.   :24.0   Max.   :796.0       
##                                                NA's   :530         
##  total_time_in_bed     height          weight            bmi       
##  Min.   : 61.0     Min.   :157.0   Min.   : 44.02   Min.   :12.80  
##  1st Qu.:403.0     1st Qu.:162.0   1st Qu.: 65.72   1st Qu.:22.15  
##  Median :463.0     Median :171.0   Median : 75.00   Median :25.70  
##  Mean   :458.6     Mean   :170.6   Mean   : 73.95   Mean   :25.94  
##  3rd Qu.:526.0     3rd Qu.:179.0   3rd Qu.: 85.29   3rd Qu.:30.90  
##  Max.   :961.0     Max.   :188.0   Max.   :105.97   Max.   :40.30  
##  NA's   :530                                                       
##         bmi_category    usage            user_type        
##  Underweight  :142   Length:943         Length:943        
##  Normal weight:262   Class :character   Class :character  
##  Overweight   :297   Mode  :character   Mode  :character  
##  Obesity      :242                                        
##                                                           
##                                                           
## 

Distribution of Bmi Category

library(ggplot2)
# Number of distinct users seen in each BMI category, drawn as labelled bars.
fitbit %>%
  distinct(id, bmi_category) %>%
  count(bmi_category, name = "total") %>%
  ggplot(aes(x = bmi_category, y = total, fill = bmi_category)) +
  geom_col() +
  geom_text(aes(label = total)) +
  labs(title = "Total by BMI Category") +
  theme_minimal()

The bars add up to 35 rather than 33 users, which means two ids fall into more than one BMI category during the month (weight is drawn randomly for every row, so a user close to a threshold can cross it). From the chart we can see that the normal weight, overweight and obesity categories account for most of the app's users.

usage type per BMI Category

# Distinct users per BMI category and usage level, as side-by-side bars.
fitbit %>%
  distinct(id, bmi_category, usage) %>%
  count(bmi_category, usage, name = "total") %>%
  ggplot(aes(x = usage, y = total, fill = bmi_category)) +
  geom_col(position = position_dodge()) +
  labs(
    title = "Total by BMI Category",
    caption = "high use = average user using the apps more than 20 hours, 
       moderate = average user using the apps from 11 to 20 hours"
  ) +
  theme_minimal()

User Type

# Distinct users per BMI category and step-based user type, as side-by-side
# bars.
fitbit %>%
  distinct(id, bmi_category, user_type) %>%
  count(bmi_category, user_type, name = "total") %>%
  ggplot(aes(x = user_type, y = total, fill = bmi_category)) +
  geom_col(position = position_dodge()) +
  labs(
    title = "Total by BMI Category",
    caption = "categorizes the average step per user, 
       the value <=5000 is sendentary, <7500 is lightly active, 
       <10000 is fairly active, else very active"
  ) +
  theme_minimal()

this is the number of type of user based on bmi category

Average Distance by Day of Week

# Mean daily distance per weekday, one line per BMI category. activity_date is
# replaced by its weekday label before grouping.
fitbit %>%
  mutate(activity_date = wday(activity_date, label = TRUE)) %>%
  group_by(activity_date, bmi_category) %>%
  summarise(mean_distance = mean(total_distance), .groups = "drop") %>%
  ggplot(
    aes(
      x = activity_date,
      y = mean_distance,
      group = bmi_category,
      color = bmi_category
    )
  ) +
  geom_line(linewidth = 1.4) +
  labs(
    title = "Activity Average Distance by BMI Category",
    y = "Total Distance (KM)"
  ) +
  theme_minimal()

Almost all BMI categories are above the recommended 4,000-5,000 steps per day. The obesity category is the most active, covering an average of more than 6.5 km per day.

Time Sleep vs Total Step

library(gridExtra)
grid.arrange(
  # Left panel: mean hours asleep per weekday and BMI category, computed from
  # the rows that have sleep data (na.rm = TRUE). The two horizontal lines
  # mark 6 and 8 hours.
  fitbit %>%
    mutate(activity_date = wday(activity_date, label = TRUE)) %>%
    group_by(activity_date, bmi_category) %>%
    summarise(
      mean_asleep = round(mean(total_minutes_asleep, na.rm = TRUE) / 60, 2),
      .groups = "drop"
    ) %>%
    ggplot(aes(activity_date, mean_asleep, fill = bmi_category)) +
    geom_col(position = position_dodge()) +
    geom_hline(yintercept = c(6, 8), color = "black") +
    labs(
      title = "Mean Sleep By BMI Category",
      y = "Hour",
      x = "Day of Week"
    ) +
    theme(legend.position = "top") +
    guides(fill = guide_legend(nrow = 2, title = NULL)),

  # Right panel: daily steps against minutes asleep. Rows without sleep data
  # are dropped up front, and the correlation shown in the caption is computed
  # from the same complete rows rather than typed in by hand, so it cannot go
  # stale when the data changes.
  fitbit %>%
    filter(!is.na(total_minutes_asleep)) %>%
    ggplot(aes(step_total, total_minutes_asleep)) +
    geom_jitter() +
    geom_smooth(color = "red") +
    labs(
      title = "Total Step vs Total Sleep",
      y = "Total Sleep (in minutes)",
      x = "Total Step",
      caption = sprintf(
        "correlation : %.7f",
        cor(
          fitbit$step_total,
          fitbit$total_minutes_asleep,
          use = "complete.obs"
        )
      )
    ),
  ncol = 2
)

Sleep data is missing for 530 rows, so the left plot shows the average hours of sleep per weekday computed from the available records only. On average, users get the recommended 6 to 8 hours of sleep. However, the obesity category sleeps less than average on Sunday, Tuesday and Saturday, while the underweight category sleeps less than average on Monday, Wednesday and Thursday. The number of steps taken in a day shows almost no correlation with the amount of sleep on that day.

Creating a correlation heatmap between multiple columns

#load library
library(reshape2)
# Keep only the numeric measures that go into the correlation matrix
# (identifiers, labels and the sleep columns are left out).
fitbit_melt <- fitbit %>%
  select(
    total_distance, step_total, calories, total_active,
    height, weight, bmi
  )

# Pairwise correlations, rounded to two decimals.
fitbit_cor <- round(cor(fitbit_melt), digits = 2)

# Reshape the matrix to long format: one row per (Var1, Var2) pair.
melt <- melt(fitbit_cor)
head(melt)
##             Var1           Var2 value
## 1 total_distance total_distance  1.00
## 2     step_total total_distance  0.99
## 3       calories total_distance  0.65
## 4   total_active total_distance  0.00
## 5         height total_distance -0.23
## 6         weight total_distance  0.24

Creating Heatmap Correlation

# Correlation heatmap with the value printed in each tile. Correlations are
# signed, so a diverging scale centred on 0 and fixed to [-1, 1] is used:
# negative values shade towards blue, positive towards red, and 0 stays white.
ggplot(melt, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(colour = "white") +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0, limits = c(-1, 1)
  ) +
  ggtitle("Heatmap Correlation Fitbit") +
  geom_text(aes(label = value))

Predict Distance using Linear Regression Model

# Linear model of daily distance on step count, weight and height.
model <- lm(
  formula = total_distance ~ step_total + weight + height,
  data = fitbit
)
summary(model)
## 
## Call:
## lm(formula = total_distance ~ step_total + weight + height, data = fitbit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7944 -0.3640  0.0154  0.3666  5.9202 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.806e-01  4.780e-01   2.051  0.04052 *  
## step_total   7.566e-04  4.415e-06 171.357  < 2e-16 ***
## weight       1.060e-03  1.620e-03   0.655  0.51281    
## height      -7.888e-03  2.411e-03  -3.271  0.00111 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6648 on 939 degrees of freedom
## Multiple R-squared:  0.9714, Adjusted R-squared:  0.9713 
## F-statistic: 1.064e+04 on 3 and 939 DF,  p-value: < 2.2e-16

weight is not significant (p = 0.51) and height is significant at the 1% level (p = 0.001), while step_total is highly significant for determining total_distance. Every 1-unit increase in step_total increases the value of the dependent variable (distance) by about 0.000757.

Predict

# Predict the distance for a single hypothetical day.
df <- data.frame(step_total = 12000, weight = 80, height = 171)
predictions <- predict(model, newdata = df)
predictions
##        1 
## 8.795483

if we enter the value of 12000 for total steps, 80 for weight, and 171 for height. then the result is a distance of 8.8km.

Prediction Calories Burned

# Linear model of daily calories on step count, weight and height.
# Overwrites the previous `model` object.
model <- lm(
  formula = calories ~ step_total + weight + height,
  data = fitbit
)
summary(model)
## 
## Call:
## lm(formula = calories ~ step_total + weight + height, data = fitbit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2062.08  -378.94    31.87   424.18  1738.99 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.225e+03  4.148e+02   7.776 1.96e-14 ***
## step_total   8.382e-02  3.831e-03  21.881  < 2e-16 ***
## weight      -4.121e+00  1.405e+00  -2.932 0.003447 ** 
## height      -7.353e+00  2.092e+00  -3.515 0.000461 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 576.8 on 939 degrees of freedom
## Multiple R-squared:  0.3617, Adjusted R-squared:  0.3597 
## F-statistic: 177.4 on 3 and 939 DF,  p-value: < 2.2e-16

All variables have a p-value below 0.05, which indicates that they are significant in predicting the dependent variable (calories); step_total is the most significant. Every 1-unit increase in step_total increases the value of the dependent variable (calories) by about 0.0838.

Predict

# Predict the calories burned for a single hypothetical day.
df <- data.frame(step_total = 9000, weight = 80, height = 171)
predictions <- predict(model, newdata = df)
predictions
##        1 
## 2392.568

If we enter 9000 for total steps, 80 for weight and 171 for height, the result is about 2392.57 calories burned.