The four pillars:

  • Descriptive Analytics
  • Diagnostic Analytics
  • Predictive Analytics
  • Prescriptive Analytics

Diagnostic Analytics | Why did it happen?

Diagnostic analytics seek to explain why something happened in the data. Here, we are looking for relationships within the data.

Questions about the data

What is normal for downtimes per machine category?

machine_categories = tb$category %>% unique()

tb_cat_dist = lapply(machine_categories, function(c) {
  tmp = tb %>% 
    filter(category == c) %>% 
    pull(dt) %>% 
    summary()
  
  tmp %>% 
    matrix(nrow = 1) %>% 
    as_tibble() %>% 
    magrittr::set_colnames(names(tmp))
}) %>% 
  bind_rows() %>% 
  mutate(category = machine_categories) 

tb_cat_dist %>% 
  left_join(
    tb %>% 
      group_by(category) %>% 
      summarize(tot_dt = sum(dt), dt_inst = n()) %>% 
      mutate(p_dt = tot_dt/sum(tot_dt)),
    by = "category"
  ) %>% 
  mutate(Mean = round(Mean, 2)) %>% 
  arrange(desc(tot_dt)) %>% 
  DT::datatable() %>% 
  DT::formatPercentage(c("p_dt"), digits = 2)

What is normal downtimes per reason code?

reason_codes = tb$reason_code %>% unique()

tb_reason_dist = lapply(reason_codes, function(c) {
  tmp = tb %>% 
    filter(reason_code == c) %>% 
    pull(dt) %>% 
    summary()
  
  tmp %>% 
    matrix(nrow = 1) %>% 
    as_tibble() %>% 
    magrittr::set_colnames(names(tmp))
}) %>% 
  bind_rows() %>% 
  mutate(reason_code = reason_codes) 

tb_reason_dist %>% 
  left_join(
    tb %>% 
      group_by(reason_code) %>% 
      summarize(tot_dt = sum(dt), dt_inst = n()) %>% 
      mutate(p_dt = tot_dt/sum(tot_dt)),
    by = "reason_code"
  ) %>% 
  mutate(Mean = round(Mean, 2)) %>% 
  arrange(desc(tot_dt)) %>% 
  DT::datatable() %>% 
  DT::formatPercentage(c("p_dt"), digits = 2)

When do long downtimes occur?

“No Operator” and “No Work” cause ~66% of the total downtime. When are these codes occurring?

clrs = hue_pal()(2)

tb_day_summary_tmp = tb %>% 
  filter(reason_code %in% c("No Operator", "No Work")) %>% 
  group_by(day, reason_code) %>% 
  summarize(
    dt_instances = n(), avg_dt = mean(dt), med_dt = median(dt),
    tot_dt = sum(dt)
    ) %>% 
  ungroup()

tb_day_summary = lapply(c("No Operator", "No Work"), function(i) {
  tb_day_summary_tmp %>% 
    filter(reason_code == i) %>% 
    complete(day = seq(min(day), max(day), by = 'day')) %>% 
    mutate(wday = weekdays(day)) %>% 
    replace_na(list(reason_code = i))
}) %>% bind_rows()

p = tb_day_summary %>% 
  ggplot(aes(day, dt_instances, color = reason_code)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "loess", formula = y ~ x) + 
  labs(color = "") + 
  theme_minimal()

ggplotly(p) %>% 
  layout(
    xaxis = list(title = "day"),
    yaxis = list(title = "downtime instances"),
    title = list(text = paste0(
      "downtime instances per day", 
      "<br>", 
      "<sup>",
      '"No Operator" and "No Work"',
      "</sup>"
      ))
    )
work_week_order = weekdays(x=as.Date(seq(7), origin="1950-01-01"))

p = tb_day_summary %>% 
  replace_na(list(dt_instances = 0)) %>% 
  mutate(wday = factor(wday, work_week_order)) %>% 
  group_by(wday, reason_code) %>% 
  summarize(mu_instances = mean(dt_instances)) %>%
  ggplot(aes(reason_code, mu_instances, fill = wday)) +
  geom_bar(stat = 'identity', position = 'dodge') + 
  scale_fill_npg() + 
  labs(
    fill = "", x = "", y = "average # of instances",
    title = "entire period"
    ) + 
  theme_minimal()

ggplotly(p)
work_week_order = weekdays(x=as.Date(seq(7), origin="1950-01-01"))

p = tb_day_summary %>% 
  filter(day < as.Date("2020/11/01")) %>% 
  replace_na(list(dt_instances = 0)) %>% 
  mutate(wday = factor(wday, work_week_order)) %>% 
  group_by(wday, reason_code) %>% 
  summarize(mu_instances = mean(dt_instances)) %>%
  ggplot(aes(reason_code, mu_instances, fill = wday)) +
  geom_bar(stat = 'identity', position = 'dodge') + 
  scale_fill_npg() + 
  labs(
    fill = "", x = "", y = "average # of instances",
    title = "before November 1, 2020"
    ) + 
  theme_minimal()

ggplotly(p)
work_week_order = weekdays(x=as.Date(seq(7), origin="1950-01-01"))

p = tb_day_summary %>% 
  filter(day >= as.Date("2020/11/01")) %>% 
  replace_na(list(dt_instances = 0)) %>% 
  mutate(wday = factor(wday, work_week_order)) %>% 
  group_by(wday, reason_code) %>% 
  summarize(mu_instances = mean(dt_instances)) %>%
  ggplot(aes(reason_code, mu_instances, fill = wday)) +
  geom_bar(stat = 'identity', position = 'dodge') + 
  scale_fill_npg() + 
  labs(
    fill = "", x = "", y = "average # of instances",
    title = "after (including) November 1, 2020"
    ) + 
  theme_minimal()

ggplotly(p)
work_week_order = weekdays(x=as.Date(seq(7), origin="1950-01-01"))

p = tb_day_summary %>% 
  filter(day >= as.Date("2020/11/01")) %>% 
  replace_na(list(dt_instances = 0)) %>% 
  mutate(wday = factor(wday, rev(work_week_order))) %>% 
  ggplot(aes(wday, dt_instances)) +
  facet_wrap(~reason_code, scales = "free_x") + 
  geom_boxplot() + 
  scale_fill_npg() + 
  labs(fill = "", x = "", y = "# of instances") + 
  coord_flip() + 
  theme_minimal()

ggplotly(p) %>% 
  layout(title = "after (including) November 1, 2020")
tb_day_summary_plant_tmp = tb %>% 
  filter(reason_code %in% c("No Operator", "No Work")) %>% 
  group_by(day, reason_code, plant) %>% 
  summarize(
    dt_instances = n(), avg_dt = mean(dt), med_dt = median(dt),
    tot_dt = sum(dt)
    ) %>% 
  ungroup()

tb_day_plant_summary = lapply(c("No Operator", "No Work"), function(i) {
  tb_day_summary_plant_tmp %>% 
    filter(reason_code == i) %>% 
    group_by(plant) %>% 
    complete(day = seq(min(tb$day), max(tb$day), by = 'day')) %>% 
    mutate(wday = weekdays(day)) %>% 
    replace_na(list(reason_code = i))
}) %>% bind_rows() %>% 
  ungroup()

work_week_order = weekdays(x=as.Date(seq(7), origin="1950-01-01"))

p = tb_day_plant_summary %>% 
  filter(day >= as.Date("2020/11/01")) %>% 
  replace_na(list(dt_instances = 0)) %>% 
  mutate(wday = factor(wday, work_week_order)) %>% 
  group_by(wday, reason_code, plant) %>% 
  summarize(mu_instances = mean(dt_instances)) %>%
  ggplot(aes(reason_code, mu_instances, fill = wday)) +
  geom_bar(stat = 'identity', position = 'dodge') + 
  facet_wrap(~plant) +
  scale_fill_npg() + 
  labs(
    fill = "", x = "", y = "average # of instances",
    title = "after (including) November 1, 2020"
    ) + 
  theme_minimal()

ggplotly(p)
p = tb_day_plant_summary %>% 
  filter(day >= as.Date("2020/11/01")) %>% 
  replace_na(list(tot_dt = 0)) %>% 
  mutate(wday = factor(wday, work_week_order)) %>% 
  group_by(wday, reason_code, plant) %>% 
  summarize(mu_dt = mean(tot_dt)/60) %>%
  ggplot(aes(reason_code, mu_dt, fill = wday)) +
  geom_bar(stat = 'identity', position = 'dodge') + 
  facet_wrap(~plant) +
  scale_fill_npg() + 
  labs(
    fill = "", x = "", y = "average total downtime per day (hours)",
    title = "after (including) November 1, 2020"
    ) + 
  theme_minimal()

ggplotly(p)

We’ll continue our analysis only looking at dates at and beyond November 1, 2020.

Reexamining the “normal”

reason_codes = tb$reason_code %>% unique()

tb_subset = tb %>% 
  filter(day >= as.Date("2020/11/01"))

tb_reason_dist = lapply(reason_codes, function(c) {
  tmp = tb_subset %>% 
    filter(reason_code == c) %>% 
    pull(dt) %>% 
    summary()
  
  tmp %>% 
    matrix(nrow = 1) %>% 
    as_tibble() %>% 
    magrittr::set_colnames(names(tmp))
}) %>% 
  bind_rows() %>% 
  mutate(reason_code = reason_codes) 

tb_reason_dist %>% 
  left_join(
    tb_subset %>% 
      group_by(reason_code) %>% 
      summarize(tot_dt = sum(dt), dt_inst = n()) %>% 
      mutate(p_dt = tot_dt/sum(tot_dt)),
    by = "reason_code"
  ) %>% 
  mutate(Mean = round(Mean, 2)) %>% 
  arrange(desc(tot_dt)) %>% 
  DT::datatable() %>% 
  DT::formatPercentage(c("p_dt"), digits = 2)

What is the relationship between total downtime and downtime instances?

plants = c("all", tb$plant %>% unique())

tb_tmp = lapply(plants, function(p) {
  
  tmp = tb %>% 
    filter(day >= as.Date("2020/11/01")) %>% 
    mutate(m_id = paste0(plant, "_", machine_id)) %>% 
    group_by(m_id, category, plant) %>% 
    summarize(tot_dt = sum(dt)/(60*24), dt_inst = n()) %>% 
    mutate(group = p)
  
  # To help in redraw of visualization
  dummy_data = tibble(category = tb$category %>% unique()) %>% 
    mutate(tot_dt = 0) %>% 
    mutate(dt_inst = 0) %>% 
    mutate(plant = p)
  
  if (p != "all") {
    tmp = tmp %>% filter(plant == p)
  }
  
  tmp %>% 
    select(-plant) %>% 
    rename(plant = group) %>% 
    bind_rows(dummy_data) %>% 
    return()
}) %>% bind_rows()

tb_tmp %>% 
  plot_ly(x = ~tot_dt, y = ~dt_inst, text = ~m_id) %>% 
  add_markers(data = tb_tmp, frame = ~plant, color = ~category, alpha = 0.7) %>% 
  layout(
    xaxis = list(
      title = "total downtime (days)"
      ),
    yaxis = list(
      title = "total downtime instances"
      ),
    title = "after (including) November 1, 2020"
  ) %>% 
  animation_button(visible = F) %>% 
  animation_opts(redraw = F, transition = 0) %>% 
  animation_slider(currentvalue = list(font = list(size = 0, color = "white")))

Are there correlations between reason codes?

tb_reason_code_day_summary_tmp = tb_subset %>% 
  group_by(day, reason_code) %>% 
  summarize(dt_instances = n(), tot_dt = sum(dt)) %>% 
  ungroup()

tb_reason_code_day_summary = lapply(reason_codes, function(i) {
  tb_reason_code_day_summary_tmp %>% 
    filter(reason_code == i) %>% 
    complete(day = seq(min(tb_subset$day), max(tb_subset$day), by = 'day')) %>% 
    replace_na(list(reason_code = i, dt_instances = 0, tot_dt = 0))
}) %>% bind_rows() 

M_reason_instances = tb_reason_code_day_summary %>% 
  select(-tot_dt) %>% 
  pivot_wider(names_from = reason_code, values_from = dt_instances) %>% 
  select(-day) %>% 
  as.matrix()

M_ri_cor = cor(M_reason_instances)

plot_ly(
  x = M_ri_cor %>% rownames(), 
  y = M_ri_cor %>% rownames(), 
  z = M_ri_cor, type = "heatmap"
  ) %>% 
  layout(
    title = "linear correlations in reason code instances"
  )
p = M_reason_instances %>% 
  as_tibble() %>% 
  ggpairs(
    columns = c(1:2, 4, 13), 
    mapping = aes(alpha = 0.5)
    ) + 
  theme_minimal()

ggplotly(p) %>% 
  layout(
    title = list(text = paste0(
      "relationship of select reason codes", 
      "<br>", 
      "<sup>",
      '# downtime instances per day',
      "</sup>"
      ))
  )
M_reason_total = tb_reason_code_day_summary %>% 
  select(-dt_instances) %>% 
  pivot_wider(names_from = reason_code, values_from = tot_dt) %>% 
  select(-day) %>% 
  as.matrix()

M_rt_cor = cor(M_reason_total)

plot_ly(
  x = M_rt_cor %>% rownames(), 
  y = M_rt_cor %>% rownames(), 
  z = M_rt_cor, type = "heatmap"
  ) %>% 
  layout(
    title = "linear correlations in reason code daily total downtime"
  )
p = M_reason_total %>% 
  as_tibble() %>% 
  ggpairs(
    columns = c(1:2, 4, 13), 
    mapping = aes(alpha = 0.5)
    ) + 
  theme_minimal()

ggplotly(p) %>% 
  layout(
    title = list(text = paste0(
      "relationship of select reason codes", 
      "<br>", 
      "<sup>",
      'daily total downtime',
      "</sup>"
      ))
  )

Are there correlations between machine downtimes?

tb_machine_day_summary_tmp = tb_subset %>% 
  mutate(m_id = paste0(plant, "_", machine_id)) %>% 
  group_by(day, m_id) %>% 
  summarize(dt_instances = n(), tot_dt = sum(dt)) %>% 
  ungroup()

m_ids = tb_subset %>% 
  mutate(m_id = paste0(plant, "_", machine_id)) %>% 
  pull(m_id) %>% 
  unique()

tb_machine_day_summary = lapply(m_ids, function(i) {
  tb_machine_day_summary_tmp %>% 
    filter(m_id == i) %>% 
    complete(day = seq(min(tb_subset$day), max(tb_subset$day), by = 'day')) %>% 
    replace_na(list(m_id = i, dt_instances = 0, tot_dt = 0))
}) %>% bind_rows() 

M_machine_instances = tb_machine_day_summary %>% 
  select(-tot_dt) %>% 
  pivot_wider(names_from = m_id, values_from = dt_instances) %>% 
  select(-day) %>% 
  as.matrix()

M_mi_cor = cor(M_machine_instances)

plot_ly(
  x = M_mi_cor %>% rownames(), 
  y = M_mi_cor %>% rownames(), 
  z = M_mi_cor, type = "heatmap"
  ) %>% 
  layout(
    title = "linear correlations in machines daily instances"
  )
p = M_machine_instances %>% 
  as_tibble() %>% 
  ggpairs(
    columns = 30:33, 
    mapping = aes(alpha = 0.5),
    lower = list(continuous=wrap("points", position=position_jitter(height=0.25, width=0.25)))
    ) + 
  theme_minimal()


ggplotly(p) %>% 
  layout(
    title = list(text = paste0(
      "relationship of select machines", 
      "<br>", 
      "<sup>",
      '# downtime instances per day',
      "</sup>"
      ))
  )
tb_subset %>% 
  mutate(m_id = paste0(plant, "_", machine_id)) %>% 
  filter(
    m_id %in%  (M_machine_instances %>% 
      as_tibble() %>% names() %>% magrittr::extract(30:33))
    ) %>% 
  group_by(m_id) %>% 
  slice(1) %>% 
  ungroup() %>% 
  select(plant, machine_id, m_id, description, category)
## # A tibble: 4 × 5
##   plant   machine_id m_id       description    category
##   <chr>        <dbl> <chr>      <chr>          <chr>   
## 1 cooking         18 cooking_18 Woodtek Sander Sander  
## 2 cooking         19 cooking_19 Woodtek Sander Sander  
## 3 cooking         75 cooking_75 Woodtek Sander Sander  
## 4 cooking         77 cooking_77 Woodtek Sander Sander
M_machine_dt = tb_machine_day_summary %>% 
  select(-dt_instances) %>% 
  pivot_wider(names_from = m_id, values_from = tot_dt) %>% 
  select(-day) %>% 
  as.matrix()

M_mt_cor = cor(M_machine_dt)

plot_ly(
  x = M_mt_cor %>% rownames(), 
  y = M_mt_cor %>% rownames(), 
  z = M_mt_cor, type = "heatmap"
  ) %>% 
  layout(
    title = "linear correlations in machines daily total downtime"
  )
p = M_machine_dt %>% 
  as_tibble() %>% 
  ggpairs(
    columns = 121:123, 
    mapping = aes(alpha = 0.5)
    ) + 
  theme_minimal()

ggplotly(p) %>% 
  layout(
    title = list(text = paste0(
      "relationship of select machines", 
      "<br>", 
      "<sup>",
      'daily total downtime',
      "</sup>"
      ))
  )
tb_subset %>% 
  mutate(m_id = paste0(plant, "_", machine_id)) %>% 
  filter(
    m_id %in%  (M_machine_instances %>% 
      as_tibble() %>% names() %>% magrittr::extract(121:123))
    ) %>% 
  group_by(m_id) %>% 
  slice(1) %>% 
  ungroup() %>% 
  select(plant, machine_id, m_id, description, category)
## # A tibble: 3 × 5
##   plant   machine_id m_id       description                   category    
##   <chr>        <dbl> <chr>      <chr>                         <chr>       
## 1 cooking         43 cooking_43 Amada FBD 8025 NT Press Brake Press Brake 
## 2 cooking         54 cooking_54 Amada Vipros 255              Turret Press
## 3 cooking          7 cooking_7  Amada HDS 8025 NT Press Brake Press Brake
sessionInfo()
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] GGally_2.1.2      scales_1.2.1      dendextend_1.16.0 diagram_1.6.5    
##  [5] shape_1.4.6       markovchain_0.9.0 plotly_4.10.1     ggsci_2.9        
##  [9] lubridate_1.8.0   readxl_1.4.1      forcats_0.5.2     stringr_1.5.0    
## [13] dplyr_1.0.10      purrr_1.0.1       readr_2.1.3       tidyr_1.3.0      
## [17] tibble_3.1.8      ggplot2_3.4.0     tidyverse_1.3.2  
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-157        fs_1.5.2            bit64_4.0.5        
##  [4] RColorBrewer_1.1-3  httr_1.4.4          tools_4.2.1        
##  [7] backports_1.4.1     bslib_0.4.1         DT_0.26            
## [10] utf8_1.2.2          R6_2.5.1            mgcv_1.8-40        
## [13] DBI_1.1.3           lazyeval_0.2.2      colorspace_2.0-3   
## [16] withr_2.5.0         tidyselect_1.2.0    gridExtra_2.3      
## [19] bit_4.0.4           compiler_4.2.1      cli_3.4.1          
## [22] rvest_1.0.3         expm_0.999-6        xml2_1.3.3         
## [25] labeling_0.4.2      sass_0.4.2          digest_0.6.30      
## [28] rmarkdown_2.17      pkgconfig_2.0.3     htmltools_0.5.3    
## [31] dbplyr_2.2.1        fastmap_1.1.0       htmlwidgets_1.5.4  
## [34] rlang_1.0.6         rstudioapi_0.14     farver_2.1.1       
## [37] jquerylib_0.1.4     generics_0.1.3      jsonlite_1.8.3     
## [40] crosstalk_1.2.0     vroom_1.6.0         googlesheets4_1.0.1
## [43] magrittr_2.0.3      Matrix_1.5-3        Rcpp_1.0.9         
## [46] munsell_0.5.0       fansi_1.0.3         viridis_0.6.2      
## [49] lifecycle_1.0.3     stringi_1.7.8       yaml_2.3.6         
## [52] plyr_1.8.7          grid_4.2.1          parallel_4.2.1     
## [55] crayon_1.5.2        lattice_0.20-45     splines_4.2.1      
## [58] haven_2.5.1         hms_1.1.2           knitr_1.40         
## [61] pillar_1.8.1        igraph_1.3.5        stats4_4.2.1       
## [64] reprex_2.0.2        glue_1.6.2          evaluate_0.18      
## [67] data.table_1.14.4   RcppParallel_5.1.5  modelr_0.1.9       
## [70] vctrs_0.5.2         tzdb_0.3.0          cellranger_1.1.0   
## [73] gtable_0.3.1        reshape_0.8.9       assertthat_0.2.1   
## [76] cachem_1.0.6        xfun_0.34           broom_1.0.1        
## [79] googledrive_2.0.0   viridisLite_0.4.1   gargle_1.2.1       
## [82] ellipsis_0.3.2