# Data set: https://www.kaggle.com/datasets/arashnic/fitbit
# The main goal is to practice cleaning, processing, analyzing and visualizing
# Fitbit data, and to examine the activity of 33 users over one month. Dummy
# weight and height values are created to derive a BMI category, and linear
# regression is used to predict calorie and distance values.
library(dplyr)
library(janitor)
library(lubridate)
# Load the daily Fitbit exports and pass each one through clean_names().
steps <- clean_names(read.csv("dailySteps_merged.csv"))
calories <- clean_names(read.csv("dailyCalories_merged.csv"))
sleep <- clean_names(read.csv("sleepDay_merged.csv"))
intensities <- clean_names(read.csv("dailyIntensities_merged.csv"))
# Inspect the intensity columns before deriving a new one from them.
names(intensities)
## [1] "id" "activity_day"
## [3] "sedentary_minutes" "lightly_active_minutes"
## [5] "fairly_active_minutes" "very_active_minutes"
## [7] "sedentary_active_distance" "light_active_distance"
## [9] "moderately_active_distance" "very_active_distance"
# total_active: all tracked minutes of the day (sedentary, fairly, lightly and
# very active) added together, converted to hours and rounded to one decimal.
intensities$total_active <- with(
  intensities,
  round(
    (sedentary_minutes + fairly_active_minutes +
       lightly_active_minutes + very_active_minutes) / 60,
    1
  )
)
# Create a new column total_active by summing the sedentary_minutes,
# fairly_active_minutes, lightly_active_minutes and very_active_minutes
# columns, converting the result to hours and rounding it to one decimal.
# Load the daily activity summary and list its columns.
activity <- clean_names(read.csv("dailyActivity_merged.csv"))
names(activity)
## [1] "id" "activity_date"
## [3] "total_steps" "total_distance"
## [5] "tracker_distance" "logged_activities_distance"
## [7] "very_active_distance" "moderately_active_distance"
## [9] "light_active_distance" "sedentary_active_distance"
## [11] "very_active_minutes" "fairly_active_minutes"
## [13] "lightly_active_minutes" "sedentary_minutes"
## [15] "calories"
# Keep only the user id, the date and the total distance.
activity <- select(activity, id, activity_date, total_distance)
names(activity)
## [1] "id" "activity_date" "total_distance"
# List the columns of the remaining tables to pick the join keys.
names(steps)
## [1] "id" "activity_day" "step_total"
names(intensities)
## [1] "id" "activity_day"
## [3] "sedentary_minutes" "lightly_active_minutes"
## [5] "fairly_active_minutes" "very_active_minutes"
## [7] "sedentary_active_distance" "light_active_distance"
## [9] "moderately_active_distance" "very_active_distance"
## [11] "total_active"
names(sleep)
## [1] "id" "sleep_day" "total_sleep_records"
## [4] "total_minutes_asleep" "total_time_in_bed"
names(calories)
## [1] "id" "activity_day" "calories"
# View the column names of every table to decide which keys to use for the
# table joins.
# Build the master table `fitbit`: one row per user (id) and day.
# Steps, calories and intensities are joined on the raw date strings.
fitbit <- activity %>%
  left_join(steps, by = c("id" = "id", "activity_date" = "activity_day")) %>%
  left_join(calories, by = c("id" = "id", "activity_date" = "activity_day"))
intensities <- intensities %>% select(id, activity_day, total_active)
fitbit <- left_join(
  fitbit, intensities,
  by = c("id" = "id", "activity_date" = "activity_day")
)

# The sleep table stores a date-time while the activity table stores a date,
# so both keys are converted to Date before joining instead of relying on an
# implicit Date/POSIXct cast inside the join.
# Only one sleep record is kept per user and day: repeated (id, sleep_day)
# rows would otherwise duplicate activity rows in the left join and inflate
# every later count and average.
sleep <- sleep %>%
  select(-total_sleep_records) %>%
  mutate(sleep_day = as_date(mdy_hms(sleep_day))) %>%
  distinct(id, sleep_day, .keep_all = TRUE)
fitbit$activity_date <- mdy(fitbit$activity_date)
fitbit <- left_join(
  fitbit, sleep,
  by = c("id" = "id", "activity_date" = "sleep_day")
)
# Create dummy height and weight values for every user.
# Fix the random seed so the dummy values, and every figure, model and
# prediction derived from them, are reproducible between runs.
set.seed(2016)
# extract id from dataset
id_values <- unique(fitbit$id)
# Lower and upper weight bound per user; there must be exactly one pair of
# bounds per id.
min_values <- c(50, 80, 66, 64, 46, 102, 84, 74, 86, 44, 58, 68, 70, 100, 65,
                75, 54, 86, 66, 76, 56, 65, 84, 94, 86, 96, 74, 104, 56, 66,
                76, 74, 84)
max_values <- c(52, 82, 68, 66, 48, 104, 86, 78, 88, 48, 60, 70, 72, 102, 67,
                77, 56, 88, 67, 78, 58, 67, 86, 96, 88, 98, 76, 106, 58, 68,
                78, 76, 86)
# Fail early with a clear message if the bounds do not line up with the ids.
stopifnot(
  length(min_values) == length(id_values),
  length(max_values) == length(id_values),
  all(min_values <= max_values)
)
# Random value for height between 155 - 189, one per user
height <- runif(length(id_values), min = 155, max = 189)
# Create data frame from min, max, height
df <- data.frame(
  id = id_values, min = min_values, max = max_values, height = height
)
# Draw one weight per row (n() is the row count) between that user's bounds,
# then drop the helper bound columns.
fitbit <- fitbit %>%
  left_join(df, by = "id") %>%
  mutate(weight = runif(n(), min = min, max = max)) %>%
  select(-c(min, max))
# Create a weight column with runif(n()): n() is the number of rows, so one
# random weight is drawn for every row, between the min and max bounds of
# that row's user. This is needed because the ids do not all have the same
# number of rows in the table.
# Adding BMI value: weight divided by the square of height / 100
# (presumably centimetres converted to metres - confirm the units).
fitbit$bmi <- fitbit$weight / (fitbit$height / 100)^2
# Adding BMI Category.
# right = FALSE makes the intervals left-closed: [-Inf, 18.5), [18.5, 25),
# [25, 30), [30, Inf). A BMI of exactly 18.5, 25 or 30 therefore falls in the
# higher category, matching the "< 18.5 / < 25 / < 30" rule; the default
# (right = TRUE) would put those boundary values in the lower category.
fitbit$bmi_category <- cut(
  fitbit$bmi,
  breaks = c(-Inf, 18.5, 25, 30, Inf),
  labels = c("Underweight", "Normal weight", "Overweight", "Obesity"),
  right = FALSE
)
# A BMI below 18.5 is underweight, below 25 is normal weight, below 30 is
# overweight, and anything higher is obesity.
# Round the body measures for display: whole numbers for height, two decimals
# for weight and one decimal for BMI.
fitbit <- fitbit %>%
  mutate(
    height = round(height, 0),
    weight = round(weight, 2),
    bmi = round(bmi, 1)
  )
# Round the columns so they are easier to read: height to whole numbers,
# weight to two decimals and BMI to one decimal.
# Store the step and calorie counts as numeric (double) columns.
count_cols <- c("step_total", "calories")
fitbit[count_cols] <- lapply(fitbit[count_cols], as.numeric)
# Classify every user by the mean of total_active (hours per day):
# <= 10 is "low", <= 20 is "moderate", anything else is "high use".
type_of_usage_device <- fitbit %>%
  group_by(id) %>%
  summarise(activity = mean(total_active)) %>%
  transmute(
    id,
    usage = case_when(
      activity <= 10 ~ "low",
      activity <= 20 ~ "moderate",
      TRUE ~ "high use"
    )
  )
# Show the first five users.
print(type_of_usage_device, n = 5)
## # A tibble: 33 × 2
## id usage
## <dbl> <chr>
## 1 1503960366 moderate
## 2 1624580081 high use
## 3 1644430081 high use
## 4 1844505072 high use
## 5 1927972279 high use
## # ℹ 28 more rows
# Categorize each user's average daily use of the application in hours:
# <= 10 is low, <= 20 is moderate, and anything higher is high use.
# Classify every user by the mean of step_total (steps per day):
# <= 5000 is "sedentary", < 7500 is "lightly active", < 10000 is
# "fairly active", anything else is "very active".
# The first label was previously misspelled "sendentary"; it appears on plot
# axes, so the spelling is corrected here.
user_type <- fitbit %>%
  group_by(id) %>%
  summarise(mean_steps = mean(step_total)) %>%
  transmute(
    id,
    user_type = case_when(
      mean_steps <= 5000 ~ "sedentary",
      mean_steps < 7500 ~ "lightly active",
      mean_steps < 10000 ~ "fairly active",
      TRUE ~ "very active"
    )
  )
# Show the first five users.
print(user_type, n = 5)
## # A tibble: 33 × 2
## id user_type
## <dbl> <chr>
## 1 1503960366 very active
## 2 1624580081 lightly active
## 3 1644430081 lightly active
## 4 1844505072 sendentary
## 5 1927972279 sendentary
## # ℹ 28 more rows
## Attach the per-user usage level and user type to the master table (fitbit)
fitbit <- fitbit %>%
  left_join(type_of_usage_device, by = "id") %>%
  left_join(user_type, by = "id")
# Categorize users by their average steps per day: <= 5000 is sedentary,
# < 7500 is lightly active, < 10000 is fairly active, otherwise very active.
# Count the missing values in every column (named integer vector).
vapply(fitbit, function(column) sum(is.na(column)), integer(1))
## id activity_date total_distance
## 0 0 0
## step_total calories total_active
## 0 0 0
## total_minutes_asleep total_time_in_bed height
## 530 530 0
## weight bmi bmi_category
## 0 0 0
## usage user_type
## 0 0
# total_minutes_asleep and total_time_in_bed each have 530 NA values because
# the sleep data is incomplete; the rest of the data can still be used.
# Quick summary of every column in the master table
fitbit %>% summary()
## id activity_date total_distance
## Min. :1.504e+09 Min. :2016-04-12 00:00:00.0 Min. : 0.000
## 1st Qu.:2.320e+09 1st Qu.:2016-04-19 00:00:00.0 1st Qu.: 2.620
## Median :4.445e+09 Median :2016-04-26 00:00:00.0 Median : 5.260
## Mean :4.858e+09 Mean :2016-04-26 07:21:18.9 Mean : 5.503
## 3rd Qu.:6.962e+09 3rd Qu.:2016-05-04 00:00:00.0 3rd Qu.: 7.720
## Max. :8.878e+09 Max. :2016-05-12 00:00:00.0 Max. :28.030
##
## step_total calories total_active total_minutes_asleep
## Min. : 0 Min. : 0 Min. : 0.0 Min. : 58.0
## 1st Qu.: 3795 1st Qu.:1830 1st Qu.:16.5 1st Qu.:361.0
## Median : 7439 Median :2140 Median :24.0 Median :433.0
## Mean : 7652 Mean :2308 Mean :20.3 Mean :419.5
## 3rd Qu.:10734 3rd Qu.:2796 3rd Qu.:24.0 3rd Qu.:490.0
## Max. :36019 Max. :4900 Max. :24.0 Max. :796.0
## NA's :530
## total_time_in_bed height weight bmi
## Min. : 61.0 Min. :157.0 Min. : 44.02 Min. :12.80
## 1st Qu.:403.0 1st Qu.:162.0 1st Qu.: 65.72 1st Qu.:22.15
## Median :463.0 Median :171.0 Median : 75.00 Median :25.70
## Mean :458.6 Mean :170.6 Mean : 73.95 Mean :25.94
## 3rd Qu.:526.0 3rd Qu.:179.0 3rd Qu.: 85.29 3rd Qu.:30.90
## Max. :961.0 Max. :188.0 Max. :105.97 Max. :40.30
## NA's :530
## bmi_category usage user_type
## Underweight :142 Length:943 Length:943
## Normal weight:262 Class :character Class :character
## Overweight :297 Mode :character Mode :character
## Obesity :242
##
##
##
library(ggplot2)
# Bar chart of the number of distinct users in each BMI category, with the
# count printed on every bar.
fitbit %>%
  group_by(bmi_category) %>%
  summarise(total = n_distinct(id), .groups = "drop") %>%
  ggplot(aes(x = bmi_category, y = total, fill = bmi_category)) +
  geom_col() +
  geom_text(aes(label = total)) +
  theme_minimal() +
  labs(title = "Total by BMI Category")
# The category totals add up to 35 instead of 33 users, which means two ids
# fall into more than one BMI category during the month (the dummy weight is
# drawn at random for every row, so a user's BMI can cross a category
# boundary). The chart shows that normal weight, overweight and obesity users
# are high users of the app.
# Dodged bar chart: distinct users per usage level, split by BMI category.
fitbit %>%
  group_by(bmi_category, usage) %>%
  summarise(total = n_distinct(id), .groups = "drop") %>%
  ggplot(aes(x = usage, y = total, fill = bmi_category)) +
  geom_col(position = position_dodge()) +
  theme_minimal() +
  labs(
    title = "Total by BMI Category",
    caption = paste0(
      "high use = average user using the apps more than 20 hours,\n",
      "moderate = average user using the apps from 11 to 20 hours"
    )
  )
# Dodged bar chart: distinct users per user type, split by BMI category.
# The caption documents the step thresholds; its spelling of "sedentary" is
# corrected (it previously read "sendentary").
fitbit %>%
  group_by(bmi_category, user_type) %>%
  summarise(total = n_distinct(id), .groups = "drop") %>%
  ggplot(aes(x = user_type, y = total, fill = bmi_category)) +
  geom_col(position = position_dodge()) +
  theme_minimal() +
  labs(
    title = "Total by BMI Category",
    caption = paste0(
      "categorizes the average step per user,\n",
      "the value <=5000 is sedentary, <7500 is lightly active,\n",
      "<10000 is fairly active, else very active"
    )
  )
# This shows the number of users of each user type, split by BMI category.
# Line chart of the mean total_distance per day of the week, one line per BMI
# category. activity_date is replaced by its weekday label before grouping.
fitbit %>%
  mutate(activity_date = wday(activity_date, label = TRUE)) %>%
  group_by(activity_date, bmi_category) %>%
  summarise(mean_distance = mean(total_distance), .groups = "drop") %>%
  ggplot(
    aes(
      x = activity_date,
      y = mean_distance,
      group = bmi_category,
      color = bmi_category
    )
  ) +
  geom_line(linewidth = 1.4) +
  labs(
    title = "Activity Average Distance by BMI Category",
    y = "Total Distance (KM)"
  ) +
  theme_minimal()
# Almost all BMI categories walk more than the recommended 4000-5000 steps per
# day. The obesity category is the most active, covering an average of more
# than 6.5 km per day.
library(gridExtra)
# Two panels side by side:
#   1. mean hours asleep per weekday and BMI category, with reference lines at
#      6 and 8 hours (missing sleep records are ignored via na.rm = TRUE);
#   2. daily steps against minutes asleep with a smoothed trend line.
grid.arrange(
  fitbit %>%
    mutate(activity_date = wday(activity_date, label = TRUE)) %>%
    group_by(activity_date, bmi_category) %>%
    summarise(
      mean_asleep = round(mean(total_minutes_asleep, na.rm = TRUE) / 60, 2),
      .groups = "drop"
    ) %>%
    ggplot(aes(activity_date, mean_asleep, fill = bmi_category)) +
    geom_bar(stat = "identity", position = position_dodge()) +
    geom_hline(yintercept = 8, color = "black") +
    geom_hline(yintercept = 6, color = "black") +
    labs(
      title = "Mean Sleep By BMI Category",
      y = "Hour",
      x = "Day of Week"
    ) +
    theme(legend.position = "top") +
    guides(fill = guide_legend(nrow = 2, title = NULL)),
  fitbit %>%
    ggplot(aes(step_total, total_minutes_asleep)) +
    geom_jitter() +
    geom_smooth(color = "red") +
    labs(
      title = "Total Step vs Total Sleep",
      y = "Total Sleep (in minutes)",
      x = "Total Step",
      # The correlation is computed from the data instead of being typed in by
      # hand, so the caption cannot go stale when the data changes. Rows
      # without a sleep record are skipped (use = "complete.obs").
      # NOTE(review): assumes the previously hard-coded value was a Pearson
      # correlation over the complete rows - confirm.
      caption = paste0(
        "correlation :",
        format(
          cor(
            fitbit$step_total, fitbit$total_minutes_asleep,
            use = "complete.obs"
          ),
          digits = 7
        ),
        " "
      )
    ),
  ncol = 2
)
# Because sleep data is missing for 530 rows, the plot above shows the average
# hours of sleep per day computed from the available records only. On average,
# users already get the recommended 6 to 8 hours of sleep. However, the
# obesity category is below average on Sunday, Tuesday and Saturday, while the
# underweight category is below average on Monday, Wednesday and Thursday.
# There is little to no correlation (about -0.19) between the number of steps
# taken in a day and the amount of sleep on that day.
# load library
library(reshape2)
# Drop the identifier, date, categorical and sleep columns so that only the
# remaining numeric columns go into the correlation matrix.
fitbit_melt <- fitbit %>%
  select(
    -c(
      id, activity_date, bmi_category, usage, user_type,
      total_minutes_asleep, total_time_in_bed
    )
  )
# Pairwise correlations, rounded to two decimals.
fitbit_cor <- fitbit_melt %>%
  cor() %>%
  round(2)
# Reshape the matrix to long format (Var1, Var2, value) for plotting.
melt <- melt(fitbit_cor)
head(melt)
## Var1 Var2 value
## 1 total_distance total_distance 1.00
## 2 step_total total_distance 0.99
## 3 calories total_distance 0.65
## 4 total_active total_distance 0.00
## 5 height total_distance -0.23
## 6 weight total_distance 0.24
# Heatmap of the correlation matrix with the coefficient printed in each tile.
ggplot(melt, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(colour = "white") +
  geom_text(aes(label = value)) +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Heatmap Correlation Fitbit")
# Linear regression of total_distance on step_total, weight and height.
model <- lm(
  formula = total_distance ~ step_total + weight + height,
  data = fitbit
)
# Coefficients, significance levels and goodness of fit.
summary(model)
##
## Call:
## lm(formula = total_distance ~ step_total + weight + height, data = fitbit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7944 -0.3640 0.0154 0.3666 5.9202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.806e-01 4.780e-01 2.051 0.04052 *
## step_total 7.566e-04 4.415e-06 171.357 < 2e-16 ***
## weight 1.060e-03 1.620e-03 0.655 0.51281
## height -7.888e-03 2.411e-03 -3.271 0.00111 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6648 on 939 degrees of freedom
## Multiple R-squared: 0.9714, Adjusted R-squared: 0.9713
## F-statistic: 1.064e+04 on 3 and 939 DF, p-value: < 2.2e-16
# step_total is highly significant in determining total_distance, while weight
# is not significant and height contributes very little by comparison. Every
# additional unit of step_total increases the predicted value of the dependent
# variable (distance) by about 0.000757.
# Predict the distance for one example user: 12000 steps, weight 80,
# height 171.
df <- data.frame(step_total = 12000, weight = 80, height = 171)
predictions <- predict(model, newdata = df)
predictions
## 1
## 8.795483
# If we enter 12000 for total steps, 80 for weight and 171 for height, the
# predicted distance is about 8.8 km.
# Linear regression of calories on step_total, weight and height
# (replaces the distance model stored in `model`).
model <- lm(
  formula = calories ~ step_total + weight + height,
  data = fitbit
)
# Coefficients, significance levels and goodness of fit.
summary(model)
##
## Call:
## lm(formula = calories ~ step_total + weight + height, data = fitbit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2062.08 -378.94 31.87 424.18 1738.99
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.225e+03 4.148e+02 7.776 1.96e-14 ***
## step_total 8.382e-02 3.831e-03 21.881 < 2e-16 ***
## weight -4.121e+00 1.405e+00 -2.932 0.003447 **
## height -7.353e+00 2.092e+00 -3.515 0.000461 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 576.8 on 939 degrees of freedom
## Multiple R-squared: 0.3617, Adjusted R-squared: 0.3597
## F-statistic: 177.4 on 3 and 939 DF, p-value: < 2.2e-16
# In the output above all variables have a p-value below 0.05, which indicates
# that they are significant in predicting the dependent variable calories;
# step_total is the most significant. Every additional unit of step_total
# increases the predicted value of the dependent variable (calories) by about
# 0.0838.
# Predict the calories for one example user: 9000 steps, weight 80,
# height 171.
df <- data.frame(step_total = 9000, weight = 80, height = 171)
predictions <- predict(model, newdata = df)
predictions
## 1
## 2392.568
# If we enter 9000 for total steps, 80 for weight and 171 for height, the
# predicted result is about 2392.57 calories burned.