The four pillars:

  • Descriptive Analytics
  • Diagnostic Analytics
  • Predictive Analytics
  • Prescriptive Analytics

Predictive Analytics | What will happen?

Predictive analytics seeks to model relationships within data to make predictions about future (or unseen) outcomes.

All models are wrong, but some are useful. - George Box

Predictive models

Can we predict the number of downtime instances per plant?

Let’s use a simple regression model (supervised) and use the interaction of categorical variables plant and weekday.

tb_subset = tb %>% 
  filter(day >= as.Date("2020/11/01")) %>% 
  mutate(uid = paste0(plant, "_", machine_id))

tb_formatted_data = tb_subset %>% 
  group_by(day, plant) %>% 
  summarize(tot_dt = sum(dt), dt_instances = n()) %>% 
  ungroup() %>% 
  group_by(plant) %>% 
  complete(day = seq(min(tb_subset$day), max(tb_subset$day), by = 'day')) %>% 
  mutate(wk = epiweek(day)) %>% 
  mutate(yr = year(day)) %>% 
  mutate(wid = paste0(yr, "_", wk)) %>% 
  replace_na(list(tot_dt = 0, dt_instances = 0)) %>%
  mutate(wday = weekdays(day))

# tb_weekly_avg = tb_formatted_data %>% 
#   group_by(plant, wid) %>% 
#   summarize(
#     wk_avg_dt_inst = mean(dt_instances),
#     wk_avg_tot_dt = mean(tot_dt)
#     ) %>% 
#   mutate(lagged_wk_avg_inst = lag(wk_avg_dt_inst)) %>% 
#   mutate(lagged_wk_avg_tot_dt = lag(wk_avg_tot_dt)) %>% 
#   ungroup()

tb_reg_data = tb_formatted_data %>% 
  dummy_cols(select_columns = c("plant", "wday")) %>% 
  select(dt_instances, tot_dt, day, wid, plant, wday, 9:18)
  # left_join(tb_weekly_avg, by = c("plant", "wid")) %>% 
  # filter(wk_avg_dt_inst > 0 & lagged_wk_avg_inst > 0) %>% 
  # drop_na()

cat_nms = names(tb_reg_data)[7:16]

cat_vars = expand_grid(plant = cat_nms[1:3], wday = cat_nms[4:10]) %>% 
  mutate(vars = paste0(plant, ":", wday)) %>% 
  pull(vars) %>% 
  paste0(collapse = " + ")

tot_inst_formula = paste0(
  "dt_instances ~ ", cat_vars, "- 1"
) %>% as.formula()

mod_inst = lm(tot_inst_formula, tb_reg_data)

mod_inst %>% summary()
## 
## Call:
## lm(formula = tot_inst_formula, data = tb_reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -244.538   -3.647    3.872   22.686  144.487 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## plant_cooking:wday_Friday           179.316      8.238  21.768  < 2e-16 ***
## plant_cooking:wday_Monday           221.256      8.131  27.211  < 2e-16 ***
## plant_cooking:wday_Saturday         105.500      8.238  12.807  < 2e-16 ***
## plant_cooking:wday_Sunday             1.868      8.238   0.227    0.821    
## plant_cooking:wday_Thursday         213.513      8.131  26.258  < 2e-16 ***
## plant_cooking:wday_Tuesday          244.538      8.131  30.074  < 2e-16 ***
## plant_cooking:wday_Wednesday        242.077      8.131  29.771  < 2e-16 ***
## wday_Friday:plant_refrigeration      67.158      8.238   8.153 1.40e-15 ***
## wday_Monday:plant_refrigeration      83.692      8.131  10.293  < 2e-16 ***
## wday_Saturday:plant_refrigeration     1.789      8.238   0.217    0.828    
## wday_Sunday:plant_refrigeration       0.000      8.238   0.000    1.000    
## wday_Thursday:plant_refrigeration    74.385      8.131   9.148  < 2e-16 ***
## wday_Tuesday:plant_refrigeration     95.179      8.131  11.705  < 2e-16 ***
## wday_Wednesday:plant_refrigeration   92.692      8.131  11.400  < 2e-16 ***
## wday_Friday:plant_ventilation        44.684      8.238   5.424 7.73e-08 ***
## wday_Monday:plant_ventilation        50.410      8.131   6.200 9.12e-10 ***
## wday_Saturday:plant_ventilation       6.368      8.238   0.773    0.440    
## wday_Sunday:plant_ventilation         0.000      8.238   0.000    1.000    
## wday_Thursday:plant_ventilation      53.205      8.131   6.543 1.08e-10 ***
## wday_Tuesday:plant_ventilation       60.000      8.131   7.379 4.05e-13 ***
## wday_Wednesday:plant_ventilation     60.282      8.131   7.414 3.17e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.78 on 789 degrees of freedom
## Multiple R-squared:  0.8537, Adjusted R-squared:  0.8498 
## F-statistic: 219.2 on 21 and 789 DF,  p-value: < 2.2e-16
p = tb_reg_data %>%
  mutate(prediction = mod_inst$fitted.values) %>% 
  ggplot(aes(dt_instances, prediction, color = plant, text = wday)) + 
  geom_point(alpha = 0.3) + 
  geom_abline(slope = 1, color = "red") + 
  labs(x = "downtime instances", title = "actual vs prediction") + 
  theme_minimal()
  
  
ggplotly(p)  

Maybe we should drop the data where instances are zero (plant closed, data collection down, etc.)

tb_reg_data2 = tb_reg_data %>% 
  filter(dt_instances > 5)

mod_inst2 = lm(tot_inst_formula, tb_reg_data2)

mod_inst2 %>% summary()
## 
## Call:
## lm(formula = tot_inst_formula, data = tb_reg_data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -204.242   -9.141    1.357   14.342  105.758 
## 
## Coefficients: (2 not defined because of singularities)
##                                    Estimate Std. Error t value Pr(>|t|)    
## plant_cooking:wday_Friday           227.133      4.701  48.318  < 2e-16 ***
## plant_cooking:wday_Monday           253.735      4.416  57.462  < 2e-16 ***
## plant_cooking:wday_Saturday         166.958      5.256  31.767  < 2e-16 ***
## plant_cooking:wday_Sunday            35.500     18.206   1.950   0.0517 .  
## plant_cooking:wday_Thursday         252.242      4.482  56.278  < 2e-16 ***
## plant_cooking:wday_Tuesday          257.757      4.233  60.894  < 2e-16 ***
## plant_cooking:wday_Wednesday        255.162      4.233  60.281  < 2e-16 ***
## wday_Friday:plant_refrigeration      82.323      4.624  17.802  < 2e-16 ***
## wday_Monday:plant_refrigeration      93.257      4.352  21.428  < 2e-16 ***
## wday_Saturday:plant_refrigeration    15.250     12.874   1.185   0.2367    
## wday_Sunday:plant_refrigeration          NA         NA      NA       NA    
## wday_Thursday:plant_refrigeration    93.581      4.624  20.236  < 2e-16 ***
## wday_Tuesday:plant_refrigeration     97.684      4.177  23.387  < 2e-16 ***
## wday_Wednesday:plant_refrigeration   95.132      4.177  22.776  < 2e-16 ***
## wday_Friday:plant_ventilation        60.643      4.866  12.463  < 2e-16 ***
## wday_Monday:plant_ventilation        63.419      4.624  13.714  < 2e-16 ***
## wday_Saturday:plant_ventilation      48.200     11.515   4.186 3.33e-05 ***
## wday_Sunday:plant_ventilation            NA         NA      NA       NA    
## wday_Thursday:plant_ventilation      64.844      4.552  14.246  < 2e-16 ***
## wday_Tuesday:plant_ventilation       66.857      4.352  15.362  < 2e-16 ***
## wday_Wednesday:plant_ventilation     67.171      4.352  15.434  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.75 on 521 degrees of freedom
## Multiple R-squared:  0.9752, Adjusted R-squared:  0.9742 
## F-statistic:  1076 on 19 and 521 DF,  p-value: < 2.2e-16
p = tb_reg_data2 %>%
  mutate(prediction = mod_inst2$fitted.values) %>% 
  ggplot(aes(dt_instances, prediction, color = plant, text = wday)) + 
  geom_point(alpha = 0.3) + 
  geom_abline(slope = 1, color = "red") + 
  labs(x = "downtime instances", title = "actual vs prediction | linear regression") + 
  theme_minimal()
  
  
ggplotly(p)  

Should we incorporate autocorrelation? Autocorrelation is a measurement of the relationship between a time series variable with a lagged version of itself.

cooking_dt_instances = tb_reg_data %>% 
  filter(plant == "cooking") %>% 
  pull(dt_instances)
refrigeration_dt_instances = tb_reg_data %>% 
  filter(plant == "refrigeration") %>% 
  pull(dt_instances)
ventilation_dt_instances = tb_reg_data %>% 
  filter(plant == "ventilation") %>% 
  pull(dt_instances)

# autocorrelation function plots | known as a "correlogram"
# blue lines indicate whether correlations are significantly different than 0
ggplotly(ggAcf(cooking_dt_instances))
ggplotly(ggAcf(refrigeration_dt_instances))
ggplotly(ggAcf(ventilation_dt_instances))
tb_reg_data3 = tb_reg_data %>% 
  mutate(l1_dt_instances = lag(dt_instances, 1)) %>% 
  mutate(l7_dt_instances = lag(dt_instances, 7)) %>% 
  drop_na() %>% 
  filter(dt_instances > 5)

tot_inst_formula2 = paste0(
  "dt_instances ~ ", cat_vars, "+ l1_dt_instances + l7_dt_instances - 1"
) %>% as.formula()

mod_inst3 = lm(tot_inst_formula2, tb_reg_data3)

mod_inst3 %>% summary()
## 
## Call:
## lm(formula = tot_inst_formula2, data = tb_reg_data3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -205.210   -8.858    1.178   14.194  102.643 
## 
## Coefficients: (2 not defined because of singularities)
##                                     Estimate Std. Error t value Pr(>|t|)    
## l1_dt_instances                      0.15286    0.03219   4.748 2.67e-06 ***
## l7_dt_instances                     -0.05433    0.01855  -2.929 0.003546 ** 
## plant_cooking:wday_Friday          197.84689   10.05140  19.684  < 2e-16 ***
## plant_cooking:wday_Monday          266.09618    5.87140  45.321  < 2e-16 ***
## plant_cooking:wday_Saturday        138.70127    9.06752  15.296  < 2e-16 ***
## plant_cooking:wday_Sunday            3.93526   18.80269   0.209 0.834303    
## plant_cooking:wday_Thursday        225.32785   10.36025  21.749  < 2e-16 ***
## plant_cooking:wday_Tuesday         235.85782    9.81593  24.028  < 2e-16 ***
## plant_cooking:wday_Wednesday       230.55194   10.44163  22.080  < 2e-16 ***
## wday_Friday:plant_refrigeration     72.72715    5.48448  13.261  < 2e-16 ***
## wday_Monday:plant_refrigeration     97.79596    4.50690  21.699  < 2e-16 ***
## wday_Saturday:plant_refrigeration    0.61405   12.81312   0.048 0.961796    
## wday_Sunday:plant_refrigeration           NA         NA      NA       NA    
## wday_Thursday:plant_refrigeration   83.00961    5.70368  14.554  < 2e-16 ***
## wday_Tuesday:plant_refrigeration    89.56021    5.21781  17.164  < 2e-16 ***
## wday_Wednesday:plant_refrigeration  85.05679    5.41634  15.704  < 2e-16 ***
## wday_Friday:plant_ventilation       53.30933    5.19764  10.256  < 2e-16 ***
## wday_Monday:plant_ventilation       66.24452    4.57035  14.494  < 2e-16 ***
## wday_Saturday:plant_ventilation     40.16811   11.31573   3.550 0.000421 ***
## wday_Sunday:plant_ventilation             NA         NA      NA       NA    
## wday_Thursday:plant_ventilation     58.39325    4.95329  11.789  < 2e-16 ***
## wday_Tuesday:plant_ventilation      61.60560    4.69160  13.131  < 2e-16 ***
## wday_Wednesday:plant_ventilation    60.30582    4.83309  12.478  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.87 on 513 degrees of freedom
## Multiple R-squared:  0.9767, Adjusted R-squared:  0.9758 
## F-statistic:  1026 on 21 and 513 DF,  p-value: < 2.2e-16
p = tb_reg_data3 %>%
  mutate(prediction = mod_inst3$fitted.values) %>% 
  ggplot(aes(dt_instances, prediction, color = plant, text = wday)) + 
  geom_point(alpha = 0.3) + 
  geom_abline(slope = 1, color = "red") + 
  labs(x = "downtime instances", title = "actual vs prediction | linear regression") + 
  theme_minimal()
  
ggplotly(p)  

What about total downtime?

Quick check of autocorrelation:

cooking_tot_dt = tb_reg_data %>% 
  filter(plant == "cooking") %>% 
  pull(tot_dt)
refrigeration_tot_dt = tb_reg_data %>% 
  filter(plant == "refrigeration") %>% 
  pull(tot_dt)
ventilation_tot_dt = tb_reg_data %>% 
  filter(plant == "ventilation") %>% 
  pull(tot_dt)

# autocorrelation function plots | known as a "correlogram"
ggplotly(ggAcf(cooking_tot_dt))
ggplotly(ggAcf(refrigeration_tot_dt))
ggplotly(ggAcf(ventilation_tot_dt))
tb_reg_data4 = tb_reg_data %>% 
  mutate(l1_tot_dt = lag(tot_dt, 1)) %>% 
  mutate(l7_tot_dt = lag(tot_dt, 7)) %>% 
  drop_na() %>% 
  filter(dt_instances > 5)

tot_dt_formula = paste0(
  "tot_dt ~ ", cat_vars, "+ l1_tot_dt + l7_tot_dt - 1"
) %>% as.formula()

mod_tot_dt = lm(tot_dt_formula, tb_reg_data4)

mod_tot_dt %>% summary()
## 
## Call:
## lm(formula = tot_dt_formula, data = tb_reg_data4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7784.6  -657.3   -22.2   609.4  5946.8 
## 
## Coefficients: (2 not defined because of singularities)
##                                      Estimate Std. Error t value Pr(>|t|)    
## l1_tot_dt                           1.511e-01  4.174e-02   3.621 0.000323 ***
## l7_tot_dt                          -6.078e-02  2.665e-02  -2.281 0.022969 *  
## plant_cooking:wday_Friday           6.852e+03  4.576e+02  14.975  < 2e-16 ***
## plant_cooking:wday_Monday           8.321e+03  2.959e+02  28.121  < 2e-16 ***
## plant_cooking:wday_Saturday         5.355e+03  4.360e+02  12.282  < 2e-16 ***
## plant_cooking:wday_Sunday          -1.011e+03  1.035e+03  -0.977 0.328978    
## plant_cooking:wday_Thursday         7.529e+03  4.545e+02  16.565  < 2e-16 ***
## plant_cooking:wday_Tuesday          7.728e+03  4.355e+02  17.744  < 2e-16 ***
## plant_cooking:wday_Wednesday        7.204e+03  4.640e+02  15.525  < 2e-16 ***
## wday_Friday:plant_refrigeration     2.646e+03  2.847e+02   9.293  < 2e-16 ***
## wday_Monday:plant_refrigeration     3.384e+03  2.429e+02  13.932  < 2e-16 ***
## wday_Saturday:plant_refrigeration  -4.602e+01  6.957e+02  -0.066 0.947284    
## wday_Sunday:plant_refrigeration            NA         NA      NA       NA    
## wday_Thursday:plant_refrigeration   2.946e+03  2.908e+02  10.132  < 2e-16 ***
## wday_Tuesday:plant_refrigeration    3.021e+03  2.667e+02  11.329  < 2e-16 ***
## wday_Wednesday:plant_refrigeration  2.930e+03  2.728e+02  10.743  < 2e-16 ***
## wday_Friday:plant_ventilation       2.488e+03  2.889e+02   8.614  < 2e-16 ***
## wday_Monday:plant_ventilation       3.208e+03  2.536e+02  12.650  < 2e-16 ***
## wday_Saturday:plant_ventilation     1.254e+03  6.190e+02   2.025 0.043361 *  
## wday_Sunday:plant_ventilation              NA         NA      NA       NA    
## wday_Thursday:plant_ventilation     2.576e+03  2.769e+02   9.302  < 2e-16 ***
## wday_Tuesday:plant_ventilation      3.001e+03  2.663e+02  11.268  < 2e-16 ***
## wday_Wednesday:plant_ventilation    2.785e+03  2.752e+02  10.120  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1361 on 513 degrees of freedom
## Multiple R-squared:  0.9405, Adjusted R-squared:  0.9381 
## F-statistic: 386.1 on 21 and 513 DF,  p-value: < 2.2e-16
p = tb_reg_data4 %>%
  mutate(prediction = mod_tot_dt$fitted.values) %>% 
  mutate(prediction = prediction/60) %>% 
  mutate(tot_dt = tot_dt/60) %>% 
  ggplot(aes(tot_dt, prediction, color = plant, text = wday)) + 
  geom_point(alpha = 0.3) + 
  geom_abline(slope = 1, color = "red") + 
  labs(
    x = "total downtime (hours)", 
    y = "prediction (hours)",
    title = "actual vs prediction | linear regression"
    ) + 
  theme_minimal()
  
ggplotly(p)  

Can we model the downtime “behavior” of machines?

Let’s use markov chains (supervised) to model machine behavior.

unique_states = tb_subset$reason_code_id %>% unique()
unique_machines = tb_subset %>% 
  group_by(uid) %>% 
  summarize(n = n()) %>% 
  filter(n > 100) %>% 
  pull(uid)

tm_list = lapply(unique_machines, function(u_id) {
  state_seq = tb_subset %>% 
    filter(uid == u_id) %>% 
    pull(reason_code_id)
  
  mc_fit = markovchainFit(data = state_seq, possibleStates = unique_states)
  
  return(mc_fit$estimate@transitionMatrix)
}) %>% setNames(unique_machines)

# Calculate the distance matrix
tb_machine_dist = expand_grid(row = unique_machines, col = unique_machines) %>% 
  rowwise() %>% 
  mutate(d = norm(tm_list[[row]] - tm_list[[col]], 'F'))

tb_subset %>% 
  group_by(reason_code_id) %>% 
  slice(1) %>% 
  select(reason_code_id, reason_code)
## # A tibble: 18 × 2
## # Groups:   reason_code_id [18]
##    reason_code_id reason_code            
##             <dbl> <chr>                  
##  1              1 Tool Change(dull)      
##  2              2 Tool Change(special)   
##  3              3 Program Error          
##  4              4 Sheet Crash            
##  5              5 Clamp Failure          
##  6              6 Loader Failure         
##  7              7 Unloader Failure       
##  8              8 Set Up                 
##  9              9 Load Material          
## 10             10 No Work                
## 11             11 No Operator            
## 12             12 Maintenance            
## 13             13 Tool Crib              
## 14             14 Tool Other             
## 15             15 Hit Counter Due        
## 16             18 Maintenance PM         
## 17             19 Operator PM            
## 18             20 Move to another machine

Let’s look at an example of a markov chain model for one of the machines.

tb_subset %>% 
  filter(uid == "cooking_130") %>% 
  slice(1) %>% 
  select(plant, description, category)
## # A tibble: 1 × 3
##   plant   description        category   
##   <chr>   <chr>              <chr>      
## 1 cooking Guifil Press Brake Press Brake
state_seq = tb_subset %>% 
  filter(uid == "cooking_130") %>% 
  pull(reason_code_id)

mc_fit = markovchainFit(data = state_seq)

plotmat(
  t(mc_fit$estimate@transitionMatrix %>% round(3)),
  relsize = 0.8,
  cex.txt = 0.8,
  lwd = 0.5,
  box.size = 0.05
)

Now that we have a model of machine behavior, we can see which machines have similar behavior. We’ll use hierarchical clustering (unsupervised learning).

# Create the distance matrix
M = tb_machine_dist %>%
  pivot_wider(names_from = col, values_from = d) %>%
  select(-row) %>%
  as.matrix() %>%
  magrittr::set_rownames(colnames(.))


# Get the hierarchical tree
M_dist = as.dist(M)

M_tree = hclust(M_dist)

M_dend = as.dendrogram(M_tree)

nodePar = list(lab.cex = 0.2, pch = c(NA, 19), cex = 0.3)

plot(M_dend, nodePar = nodePar, leaflab = 'none')

K = 4

plot(color_branches(M_dend, k = K), nodePar = nodePar, leaflab = 'none')

clusters    = cutree(M_dend, k=K, order_clusters_as_data = FALSE)
tb_clusters = tibble(uid = names(clusters), cluster = clusters)

uid_order = tb_clusters$uid

tb_dist_with_cluster = tb_machine_dist %>% 
  rename(uid = row) %>% 
  left_join(tb_clusters, by = 'uid') %>% 
  mutate(uid = factor(uid, uid_order)) %>% 
  mutate(col = factor(col, uid_order))

p = tb_dist_with_cluster %>% 
  ggplot(aes(col, uid, fill = d)) +
  geom_tile() + 
  scale_fill_viridis_c(direction = -1, option = 'A') + 
  theme(axis.text = element_blank()) + 
  labs(x = "", y = "", title = 'machine behavior similarity')

ggplotly(p)
machines_to_check = c(
  "cooking_79", "ventilation_60", "cooking_78", "ventilation_27", "cooking_139",
  "cooking_89", "cooking_114", "cooking_80", "cooking_116", "ventilation_61"
)

tb_subset %>% 
  filter(uid %in% machines_to_check) %>% 
  group_by(uid) %>% 
  slice(1) %>% 
  select(plant, uid, description, category)
## # A tibble: 10 × 4
## # Groups:   uid [10]
##    plant       uid            description    category
##    <chr>       <chr>          <chr>          <chr>   
##  1 cooking     cooking_114    G & P SANDER   Sander  
##  2 cooking     cooking_116    Welder         Welder  
##  3 cooking     cooking_139    Welder         Welder  
##  4 cooking     cooking_78     Woodtek Sander Sander  
##  5 cooking     cooking_79     Woodtek Sander Sander  
##  6 cooking     cooking_80     Welder         Welder  
##  7 cooking     cooking_89     WELDER         Welder  
##  8 ventilation ventilation_27 G & P SANDER   Sander  
##  9 ventilation ventilation_60 MILLER WELDER  Welder  
## 10 ventilation ventilation_61 MILLER WELDER  Welder