The four pillars:
Predictive analytics seeks to model relationships within data to make predictions about future (or unseen) outcomes.
All models are wrong, but some are useful. - George Box
Let’s use a simple regression model (supervised) and use the interaction of categorical variables plant and weekday.
tb_subset = tb %>%
filter(day >= as.Date("2020/11/01")) %>%
mutate(uid = paste0(plant, "_", machine_id))
tb_formatted_data = tb_subset %>%
group_by(day, plant) %>%
summarize(tot_dt = sum(dt), dt_instances = n()) %>%
ungroup() %>%
group_by(plant) %>%
complete(day = seq(min(tb_subset$day), max(tb_subset$day), by = 'day')) %>%
mutate(wk = epiweek(day)) %>%
mutate(yr = year(day)) %>%
mutate(wid = paste0(yr, "_", wk)) %>%
replace_na(list(tot_dt = 0, dt_instances = 0)) %>%
mutate(wday = weekdays(day))
# tb_weekly_avg = tb_formatted_data %>%
# group_by(plant, wid) %>%
# summarize(
# wk_avg_dt_inst = mean(dt_instances),
# wk_avg_tot_dt = mean(tot_dt)
# ) %>%
# mutate(lagged_wk_avg_inst = lag(wk_avg_dt_inst)) %>%
# mutate(lagged_wk_avg_tot_dt = lag(wk_avg_tot_dt)) %>%
# ungroup()
tb_reg_data = tb_formatted_data %>%
dummy_cols(select_columns = c("plant", "wday")) %>%
select(dt_instances, tot_dt, day, wid, plant, wday, 9:18)
# left_join(tb_weekly_avg, by = c("plant", "wid")) %>%
# filter(wk_avg_dt_inst > 0 & lagged_wk_avg_inst > 0) %>%
# drop_na()
cat_nms = names(tb_reg_data)[7:16]
cat_vars = expand_grid(plant = cat_nms[1:3], wday = cat_nms[4:10]) %>%
mutate(vars = paste0(plant, ":", wday)) %>%
pull(vars) %>%
paste0(collapse = " + ")
tot_inst_formula = paste0(
"dt_instances ~ ", cat_vars, "- 1"
) %>% as.formula()
mod_inst = lm(tot_inst_formula, tb_reg_data)
mod_inst %>% summary()
##
## Call:
## lm(formula = tot_inst_formula, data = tb_reg_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -244.538 -3.647 3.872 22.686 144.487
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## plant_cooking:wday_Friday 179.316 8.238 21.768 < 2e-16 ***
## plant_cooking:wday_Monday 221.256 8.131 27.211 < 2e-16 ***
## plant_cooking:wday_Saturday 105.500 8.238 12.807 < 2e-16 ***
## plant_cooking:wday_Sunday 1.868 8.238 0.227 0.821
## plant_cooking:wday_Thursday 213.513 8.131 26.258 < 2e-16 ***
## plant_cooking:wday_Tuesday 244.538 8.131 30.074 < 2e-16 ***
## plant_cooking:wday_Wednesday 242.077 8.131 29.771 < 2e-16 ***
## wday_Friday:plant_refrigeration 67.158 8.238 8.153 1.40e-15 ***
## wday_Monday:plant_refrigeration 83.692 8.131 10.293 < 2e-16 ***
## wday_Saturday:plant_refrigeration 1.789 8.238 0.217 0.828
## wday_Sunday:plant_refrigeration 0.000 8.238 0.000 1.000
## wday_Thursday:plant_refrigeration 74.385 8.131 9.148 < 2e-16 ***
## wday_Tuesday:plant_refrigeration 95.179 8.131 11.705 < 2e-16 ***
## wday_Wednesday:plant_refrigeration 92.692 8.131 11.400 < 2e-16 ***
## wday_Friday:plant_ventilation 44.684 8.238 5.424 7.73e-08 ***
## wday_Monday:plant_ventilation 50.410 8.131 6.200 9.12e-10 ***
## wday_Saturday:plant_ventilation 6.368 8.238 0.773 0.440
## wday_Sunday:plant_ventilation 0.000 8.238 0.000 1.000
## wday_Thursday:plant_ventilation 53.205 8.131 6.543 1.08e-10 ***
## wday_Tuesday:plant_ventilation 60.000 8.131 7.379 4.05e-13 ***
## wday_Wednesday:plant_ventilation 60.282 8.131 7.414 3.17e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 50.78 on 789 degrees of freedom
## Multiple R-squared: 0.8537, Adjusted R-squared: 0.8498
## F-statistic: 219.2 on 21 and 789 DF, p-value: < 2.2e-16
p = tb_reg_data %>%
mutate(prediction = mod_inst$fitted.values) %>%
ggplot(aes(dt_instances, prediction, color = plant, text = wday)) +
geom_point(alpha = 0.3) +
geom_abline(slope = 1, color = "red") +
labs(x = "downtime instances", title = "actual vs prediction") +
theme_minimal()
ggplotly(p)
Maybe we should drop the data where instances are zero (plant closed, data collection down, etc.)
tb_reg_data2 = tb_reg_data %>%
filter(dt_instances > 5)
mod_inst2 = lm(tot_inst_formula, tb_reg_data2)
mod_inst2 %>% summary()
##
## Call:
## lm(formula = tot_inst_formula, data = tb_reg_data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -204.242 -9.141 1.357 14.342 105.758
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## plant_cooking:wday_Friday 227.133 4.701 48.318 < 2e-16 ***
## plant_cooking:wday_Monday 253.735 4.416 57.462 < 2e-16 ***
## plant_cooking:wday_Saturday 166.958 5.256 31.767 < 2e-16 ***
## plant_cooking:wday_Sunday 35.500 18.206 1.950 0.0517 .
## plant_cooking:wday_Thursday 252.242 4.482 56.278 < 2e-16 ***
## plant_cooking:wday_Tuesday 257.757 4.233 60.894 < 2e-16 ***
## plant_cooking:wday_Wednesday 255.162 4.233 60.281 < 2e-16 ***
## wday_Friday:plant_refrigeration 82.323 4.624 17.802 < 2e-16 ***
## wday_Monday:plant_refrigeration 93.257 4.352 21.428 < 2e-16 ***
## wday_Saturday:plant_refrigeration 15.250 12.874 1.185 0.2367
## wday_Sunday:plant_refrigeration NA NA NA NA
## wday_Thursday:plant_refrigeration 93.581 4.624 20.236 < 2e-16 ***
## wday_Tuesday:plant_refrigeration 97.684 4.177 23.387 < 2e-16 ***
## wday_Wednesday:plant_refrigeration 95.132 4.177 22.776 < 2e-16 ***
## wday_Friday:plant_ventilation 60.643 4.866 12.463 < 2e-16 ***
## wday_Monday:plant_ventilation 63.419 4.624 13.714 < 2e-16 ***
## wday_Saturday:plant_ventilation 48.200 11.515 4.186 3.33e-05 ***
## wday_Sunday:plant_ventilation NA NA NA NA
## wday_Thursday:plant_ventilation 64.844 4.552 14.246 < 2e-16 ***
## wday_Tuesday:plant_ventilation 66.857 4.352 15.362 < 2e-16 ***
## wday_Wednesday:plant_ventilation 67.171 4.352 15.434 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25.75 on 521 degrees of freedom
## Multiple R-squared: 0.9752, Adjusted R-squared: 0.9742
## F-statistic: 1076 on 19 and 521 DF, p-value: < 2.2e-16
p = tb_reg_data2 %>%
mutate(prediction = mod_inst2$fitted.values) %>%
ggplot(aes(dt_instances, prediction, color = plant, text = wday)) +
geom_point(alpha = 0.3) +
geom_abline(slope = 1, color = "red") +
labs(x = "downtime instances", title = "actual vs prediction | linear regression") +
theme_minimal()
ggplotly(p)
Should we incorporate autocorrelation? Autocorrelation is a measurement of the relationship between a time series variable with a lagged version of itself.
cooking_dt_instances = tb_reg_data %>%
filter(plant == "cooking") %>%
pull(dt_instances)
refrigeration_dt_instances = tb_reg_data %>%
filter(plant == "refrigeration") %>%
pull(dt_instances)
ventilation_dt_instances = tb_reg_data %>%
filter(plant == "ventilation") %>%
pull(dt_instances)
# autocorrelation function plots | known as a "correlogram"
# blue lines indicate whether correlations are significantly different than 0
ggplotly(ggAcf(cooking_dt_instances))
ggplotly(ggAcf(refrigeration_dt_instances))
ggplotly(ggAcf(ventilation_dt_instances))
tb_reg_data3 = tb_reg_data %>%
mutate(l1_dt_instances = lag(dt_instances, 1)) %>%
mutate(l7_dt_instances = lag(dt_instances, 7)) %>%
drop_na() %>%
filter(dt_instances > 5)
tot_inst_formula2 = paste0(
"dt_instances ~ ", cat_vars, "+ l1_dt_instances + l7_dt_instances - 1"
) %>% as.formula()
mod_inst3 = lm(tot_inst_formula2, tb_reg_data3)
mod_inst3 %>% summary()
##
## Call:
## lm(formula = tot_inst_formula2, data = tb_reg_data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -205.210 -8.858 1.178 14.194 102.643
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## l1_dt_instances 0.15286 0.03219 4.748 2.67e-06 ***
## l7_dt_instances -0.05433 0.01855 -2.929 0.003546 **
## plant_cooking:wday_Friday 197.84689 10.05140 19.684 < 2e-16 ***
## plant_cooking:wday_Monday 266.09618 5.87140 45.321 < 2e-16 ***
## plant_cooking:wday_Saturday 138.70127 9.06752 15.296 < 2e-16 ***
## plant_cooking:wday_Sunday 3.93526 18.80269 0.209 0.834303
## plant_cooking:wday_Thursday 225.32785 10.36025 21.749 < 2e-16 ***
## plant_cooking:wday_Tuesday 235.85782 9.81593 24.028 < 2e-16 ***
## plant_cooking:wday_Wednesday 230.55194 10.44163 22.080 < 2e-16 ***
## wday_Friday:plant_refrigeration 72.72715 5.48448 13.261 < 2e-16 ***
## wday_Monday:plant_refrigeration 97.79596 4.50690 21.699 < 2e-16 ***
## wday_Saturday:plant_refrigeration 0.61405 12.81312 0.048 0.961796
## wday_Sunday:plant_refrigeration NA NA NA NA
## wday_Thursday:plant_refrigeration 83.00961 5.70368 14.554 < 2e-16 ***
## wday_Tuesday:plant_refrigeration 89.56021 5.21781 17.164 < 2e-16 ***
## wday_Wednesday:plant_refrigeration 85.05679 5.41634 15.704 < 2e-16 ***
## wday_Friday:plant_ventilation 53.30933 5.19764 10.256 < 2e-16 ***
## wday_Monday:plant_ventilation 66.24452 4.57035 14.494 < 2e-16 ***
## wday_Saturday:plant_ventilation 40.16811 11.31573 3.550 0.000421 ***
## wday_Sunday:plant_ventilation NA NA NA NA
## wday_Thursday:plant_ventilation 58.39325 4.95329 11.789 < 2e-16 ***
## wday_Tuesday:plant_ventilation 61.60560 4.69160 13.131 < 2e-16 ***
## wday_Wednesday:plant_ventilation 60.30582 4.83309 12.478 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.87 on 513 degrees of freedom
## Multiple R-squared: 0.9767, Adjusted R-squared: 0.9758
## F-statistic: 1026 on 21 and 513 DF, p-value: < 2.2e-16
p = tb_reg_data3 %>%
mutate(prediction = mod_inst3$fitted.values) %>%
ggplot(aes(dt_instances, prediction, color = plant, text = wday)) +
geom_point(alpha = 0.3) +
geom_abline(slope = 1, color = "red") +
labs(x = "downtime instances", title = "actual vs prediction | linear regression") +
theme_minimal()
ggplotly(p)
Quick check of autocorrelation:
cooking_tot_dt = tb_reg_data %>%
filter(plant == "cooking") %>%
pull(tot_dt)
refrigeration_tot_dt = tb_reg_data %>%
filter(plant == "refrigeration") %>%
pull(tot_dt)
ventilation_tot_dt = tb_reg_data %>%
filter(plant == "ventilation") %>%
pull(tot_dt)
# autocorrelation function plots | known as a "correlogram"
ggplotly(ggAcf(cooking_tot_dt))
ggplotly(ggAcf(refrigeration_tot_dt))
ggplotly(ggAcf(ventilation_tot_dt))
tb_reg_data4 = tb_reg_data %>%
mutate(l1_tot_dt = lag(tot_dt, 1)) %>%
mutate(l7_tot_dt = lag(tot_dt, 7)) %>%
drop_na() %>%
filter(dt_instances > 5)
tot_dt_formula = paste0(
"tot_dt ~ ", cat_vars, "+ l1_tot_dt + l7_tot_dt - 1"
) %>% as.formula()
mod_tot_dt = lm(tot_dt_formula, tb_reg_data4)
mod_tot_dt %>% summary()
##
## Call:
## lm(formula = tot_dt_formula, data = tb_reg_data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7784.6 -657.3 -22.2 609.4 5946.8
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## l1_tot_dt 1.511e-01 4.174e-02 3.621 0.000323 ***
## l7_tot_dt -6.078e-02 2.665e-02 -2.281 0.022969 *
## plant_cooking:wday_Friday 6.852e+03 4.576e+02 14.975 < 2e-16 ***
## plant_cooking:wday_Monday 8.321e+03 2.959e+02 28.121 < 2e-16 ***
## plant_cooking:wday_Saturday 5.355e+03 4.360e+02 12.282 < 2e-16 ***
## plant_cooking:wday_Sunday -1.011e+03 1.035e+03 -0.977 0.328978
## plant_cooking:wday_Thursday 7.529e+03 4.545e+02 16.565 < 2e-16 ***
## plant_cooking:wday_Tuesday 7.728e+03 4.355e+02 17.744 < 2e-16 ***
## plant_cooking:wday_Wednesday 7.204e+03 4.640e+02 15.525 < 2e-16 ***
## wday_Friday:plant_refrigeration 2.646e+03 2.847e+02 9.293 < 2e-16 ***
## wday_Monday:plant_refrigeration 3.384e+03 2.429e+02 13.932 < 2e-16 ***
## wday_Saturday:plant_refrigeration -4.602e+01 6.957e+02 -0.066 0.947284
## wday_Sunday:plant_refrigeration NA NA NA NA
## wday_Thursday:plant_refrigeration 2.946e+03 2.908e+02 10.132 < 2e-16 ***
## wday_Tuesday:plant_refrigeration 3.021e+03 2.667e+02 11.329 < 2e-16 ***
## wday_Wednesday:plant_refrigeration 2.930e+03 2.728e+02 10.743 < 2e-16 ***
## wday_Friday:plant_ventilation 2.488e+03 2.889e+02 8.614 < 2e-16 ***
## wday_Monday:plant_ventilation 3.208e+03 2.536e+02 12.650 < 2e-16 ***
## wday_Saturday:plant_ventilation 1.254e+03 6.190e+02 2.025 0.043361 *
## wday_Sunday:plant_ventilation NA NA NA NA
## wday_Thursday:plant_ventilation 2.576e+03 2.769e+02 9.302 < 2e-16 ***
## wday_Tuesday:plant_ventilation 3.001e+03 2.663e+02 11.268 < 2e-16 ***
## wday_Wednesday:plant_ventilation 2.785e+03 2.752e+02 10.120 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1361 on 513 degrees of freedom
## Multiple R-squared: 0.9405, Adjusted R-squared: 0.9381
## F-statistic: 386.1 on 21 and 513 DF, p-value: < 2.2e-16
p = tb_reg_data4 %>%
mutate(prediction = mod_tot_dt$fitted.values) %>%
mutate(prediction = prediction/60) %>%
mutate(tot_dt = tot_dt/60) %>%
ggplot(aes(tot_dt, prediction, color = plant, text = wday)) +
geom_point(alpha = 0.3) +
geom_abline(slope = 1, color = "red") +
labs(
x = "total downtime (hours)",
y = "prediction (hours)",
title = "actual vs prediction | linear regression"
) +
theme_minimal()
ggplotly(p)
Let’s use markov chains (supervised) to model machine behavior.
unique_states = tb_subset$reason_code_id %>% unique()
unique_machines = tb_subset %>%
group_by(uid) %>%
summarize(n = n()) %>%
filter(n > 100) %>%
pull(uid)
tm_list = lapply(unique_machines, function(u_id) {
state_seq = tb_subset %>%
filter(uid == u_id) %>%
pull(reason_code_id)
mc_fit = markovchainFit(data = state_seq, possibleStates = unique_states)
return(mc_fit$estimate@transitionMatrix)
}) %>% setNames(unique_machines)
# Calculate the distance matrix
tb_machine_dist = expand_grid(row = unique_machines, col = unique_machines) %>%
rowwise() %>%
mutate(d = norm(tm_list[[row]] - tm_list[[col]], 'F'))
tb_subset %>%
group_by(reason_code_id) %>%
slice(1) %>%
select(reason_code_id, reason_code)
## # A tibble: 18 × 2
## # Groups: reason_code_id [18]
## reason_code_id reason_code
## <dbl> <chr>
## 1 1 Tool Change(dull)
## 2 2 Tool Change(special)
## 3 3 Program Error
## 4 4 Sheet Crash
## 5 5 Clamp Failure
## 6 6 Loader Failure
## 7 7 Unloader Failure
## 8 8 Set Up
## 9 9 Load Material
## 10 10 No Work
## 11 11 No Operator
## 12 12 Maintenance
## 13 13 Tool Crib
## 14 14 Tool Other
## 15 15 Hit Counter Due
## 16 18 Maintenance PM
## 17 19 Operator PM
## 18 20 Move to another machine
Let’s look at an example of a markov chain model for one of the machines.
tb_subset %>%
filter(uid == "cooking_130") %>%
slice(1) %>%
select(plant, description, category)
## # A tibble: 1 × 3
## plant description category
## <chr> <chr> <chr>
## 1 cooking Guifil Press Brake Press Brake
state_seq = tb_subset %>%
filter(uid == "cooking_130") %>%
pull(reason_code_id)
mc_fit = markovchainFit(data = state_seq)
plotmat(
t(mc_fit$estimate@transitionMatrix %>% round(3)),
relsize = 0.8,
cex.txt = 0.8,
lwd = 0.5,
box.size = 0.05
)
Now that we have a model of machine behavior, we can see which machines have similar behavior. We’ll use hierarchical clustering (unsupervised learning).
# Create the distance matrix
M = tb_machine_dist %>%
pivot_wider(names_from = col, values_from = d) %>%
select(-row) %>%
as.matrix() %>%
magrittr::set_rownames(colnames(.))
# Get the hierarchical tree
M_dist = as.dist(M)
M_tree = hclust(M_dist)
M_dend = as.dendrogram(M_tree)
nodePar = list(lab.cex = 0.2, pch = c(NA, 19), cex = 0.3)
plot(M_dend, nodePar = nodePar, leaflab = 'none')
K = 4
plot(color_branches(M_dend, k = K), nodePar = nodePar, leaflab = 'none')
clusters = cutree(M_dend, k=K, order_clusters_as_data = FALSE)
tb_clusters = tibble(uid = names(clusters), cluster = clusters)
uid_order = tb_clusters$uid
tb_dist_with_cluster = tb_machine_dist %>%
rename(uid = row) %>%
left_join(tb_clusters, by = 'uid') %>%
mutate(uid = factor(uid, uid_order)) %>%
mutate(col = factor(col, uid_order))
p = tb_dist_with_cluster %>%
ggplot(aes(col, uid, fill = d)) +
geom_tile() +
scale_fill_viridis_c(direction = -1, option = 'A') +
theme(axis.text = element_blank()) +
labs(x = "", y = "", title = 'machine behavior similarity')
ggplotly(p)
machines_to_check = c(
"cooking_79", "ventilation_60", "cooking_78", "ventilation_27", "cooking_139",
"cooking_89", "cooking_114", "cooking_80", "cooking_116", "ventilation_61"
)
tb_subset %>%
filter(uid %in% machines_to_check) %>%
group_by(uid) %>%
slice(1) %>%
select(plant, uid, description, category)
## # A tibble: 10 × 4
## # Groups: uid [10]
## plant uid description category
## <chr> <chr> <chr> <chr>
## 1 cooking cooking_114 G & P SANDER Sander
## 2 cooking cooking_116 Welder Welder
## 3 cooking cooking_139 Welder Welder
## 4 cooking cooking_78 Woodtek Sander Sander
## 5 cooking cooking_79 Woodtek Sander Sander
## 6 cooking cooking_80 Welder Welder
## 7 cooking cooking_89 WELDER Welder
## 8 ventilation ventilation_27 G & P SANDER Sander
## 9 ventilation ventilation_60 MILLER WELDER Welder
## 10 ventilation ventilation_61 MILLER WELDER Welder