The four pillars:
Descriptive analytics seek to explain what happened in the data via statistical summaries and visualizations.
Let’s look at some example data:
tb %>% str()
## tibble [155,313 × 10] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:155313] 225121 225128 225129 225130 225131 ...
## $ day : POSIXct[1:155313], format: "2020-01-02" "2020-01-02" ...
## $ plant : Named chr [1:155313] "cooking" "cooking" "cooking" "cooking" ...
## ..- attr(*, "names")= chr [1:155313] "C" "C" "C" "C" ...
## $ machine_id : num [1:155313] 1 1 4 4 4 4 4 4 4 4 ...
## $ shift : num [1:155313] 1 1 1 1 1 1 1 1 1 1 ...
## $ dt : num [1:155313] 15 15 20 8 4 3 8 10 5 3 ...
## $ reason_code_id: num [1:155313] 8 19 8 8 8 8 8 8 8 8 ...
## $ description : chr [1:155313] "Amada Shear" "Amada Shear" "Amada Vipros 255" "Amada Vipros 255" ...
## $ category : chr [1:155313] "Shear" "Shear" "Turret Press" "Turret Press" ...
## $ reason_code : chr [1:155313] "Set Up" "Operator PM" "Set Up" "Set Up" ...
tb %>% head(10) %>% DT::datatable()
This data contains 155313 observations. Observations are generated when a machine is taken offline. Each observation of this data provides,
Let’s investigate these variables.
tb$day %>% range()
## [1] "2020-01-02 UTC" "2021-07-29 UTC"
tb$plant %>% unique()
## [1] "cooking" "ventilation" "refrigeration"
tb %>%
mutate(m_id = paste0(machine_id, "_", plant)) %>%
pull(m_id) %>%
unique() %>%
length()
## [1] 165
tb %>%
mutate(m_id = paste0(machine_id, "_", plant)) %>%
group_by(plant, m_id) %>%
slice(1) %>%
ungroup() %>%
group_by(plant) %>%
summarize(count = n())
## # A tibble: 3 × 2
## plant count
## <chr> <int>
## 1 cooking 108
## 2 refrigeration 34
## 3 ventilation 23
tb$dt %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 5.00 10.00 39.12 15.00 4360.00
Let’s look at some histograms.
tb %>%
plot_ly(x = ~dt, type = "histogram") %>%
layout(
xaxis = list(title = "downtime (m)"),
yaxis = list(title = "count")
)
Let’s look at probability.
tb %>%
plot_ly(x = ~dt, type = "histogram", histnorm = "probability") %>%
layout(
xaxis = list(title = "downtime (m)"),
yaxis = list(title = "proportion")
)
Let’s look at the cumulative probability.
f_dt_ecdf = ecdf(tb$dt)
tibble(x = seq(0, max(tb$dt), length.out = 10000)) %>%
mutate(y = f_dt_ecdf(x)) %>%
plot_ly(x = ~x, y = ~y, type = "scatter", mode = "lines") %>%
layout(
xaxis = list(type = "log", title = "downtime (m) | log scaled"),
yaxis = list(title = "probability", tickformat = ".2%")
)
tb %>%
group_by(shift) %>%
summarize(count = n(), total_dt = sum(dt)) %>%
mutate(proportion = count/sum(count)) %>%
mutate(total_dt_proportion = total_dt/sum(total_dt))
## # A tibble: 2 × 5
## shift count total_dt proportion total_dt_proportion
## <dbl> <int> <dbl> <dbl> <dbl>
## 1 1 152191 6025220 0.980 0.992
## 2 2 3122 50132 0.0201 0.00825
tb$reason_code %>% unique()
## [1] "Set Up" "Operator PM"
## [3] "Load Material" "No Operator"
## [5] "Program Error" "Tool Change(special)"
## [7] "Maintenance" "Tool Other"
## [9] "Tool Change(dull)" "Tool Crib"
## [11] "Clamp Failure" "Move to another machine"
## [13] "No Work" "Loader Failure"
## [15] "Sheet Crash" "Unloader Failure"
## [17] "Maintenance PM" "Hit Counter Due"
tb %>%
group_by(reason_code) %>%
summarize(count = n()) %>%
mutate(reason_code = fct_reorder(reason_code, count, .desc = T)) %>%
plot_ly(x = ~reason_code, y = ~count, type = "bar")
p = tb %>%
group_by(reason_code, plant) %>%
summarize(dt_instances = n()) %>%
ungroup() %>%
mutate(reason_code = fct_reorder(reason_code, dt_instances, .desc = F)) %>%
ggplot(aes(dt_instances, reason_code)) +
facet_wrap(~plant) +
geom_bar(stat = 'identity') +
labs(x = 'downtime instances', y = "reason code") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ggplotly(p)
tb %>%
group_by(reason_code) %>%
summarize(`total time (years)` = sum(dt)/(60*24*365)) %>%
mutate(reason_code = fct_reorder(reason_code, `total time (years)`, .desc = T)) %>%
plot_ly(x = ~reason_code, y = ~`total time (years)`, type = "bar") %>%
layout(yaxis = list(title = "total downtime (years)"))
p = tb %>%
group_by(reason_code, plant) %>%
summarize(`total time (years)` = sum(dt)/(60*24*365)) %>%
ungroup() %>%
mutate(reason_code = fct_reorder(reason_code, `total time (years)`, .desc = F)) %>%
ggplot(aes(`total time (years)`, reason_code)) +
facet_wrap(~plant) +
geom_bar(stat = 'identity') +
labs(x = 'total downtime (years)', y = "reason code") +
theme_minimal()
ggplotly(p)
tb %>%
group_by(machine_id, category) %>%
slice(1) %>%
ungroup() %>%
group_by(category) %>%
summarize(count = n()) %>%
mutate(category = fct_reorder(category, count, .fun = sum, .desc = T)) %>%
plot_ly(x = ~category, y = ~count, type = "bar")
tb_cat_count = tb %>%
group_by(machine_id, category) %>%
slice(1) %>%
ungroup() %>%
group_by(category) %>%
summarize(count = n())
tb %>%
group_by(category) %>%
summarize(total_dt = sum(dt)/(60*24)) %>%
left_join(tb_cat_count, by = "category") %>%
mutate(scaled_dt = total_dt/count) %>%
mutate(category = fct_reorder(category, scaled_dt, .desc = T)) %>%
plot_ly(x = ~category, y = ~scaled_dt, type = "bar") %>%
layout(yaxis = list(title = "downtime per machine (days)"))
tb %>%
mutate(m_id = paste0(machine_id, "_", plant)) %>%
group_by(m_id, plant) %>%
summarize(dt_instances = n()) %>%
ungroup() %>%
mutate(m_id = fct_reorder(m_id, dt_instances, .desc = T)) %>%
plot_ly(x = ~m_id, y = ~dt_instances, color = ~plant, type = "bar") %>%
layout(
xaxis = list(showticklabels = FALSE, title = "machine id"),
yaxis = list(title = "downtime instances")
)
tb %>%
mutate(m_id = paste0(machine_id, "_", plant)) %>%
group_by(m_id, category) %>%
summarize(dt_instances = n()) %>%
ungroup() %>%
mutate(m_id = fct_reorder(m_id, dt_instances, .desc = T)) %>%
plot_ly(x = ~m_id, y = ~dt_instances, color = ~category, type = "bar") %>%
layout(
xaxis = list(showticklabels = FALSE, title = "machine id"),
yaxis = list(title = "downtime instances")
)
tb %>%
mutate(m_id = paste0(machine_id, "_", plant)) %>%
group_by(m_id) %>%
summarize(dt_instances = n()) %>%
mutate(m_id = fct_reorder(m_id, dt_instances, .fun = max)) %>%
arrange(desc(m_id)) %>%
mutate(cum_dt = cumsum(dt_instances)) %>%
mutate(p = round(cum_dt/sum(dt_instances), 4)) %>%
rowid_to_column(var = 'n_machines') %>%
mutate(p_machines = scales::percent(n_machines/max(n_machines), accuracy = 0.01)) %>%
plot_ly(
x = ~n_machines, y = ~p, text = ~p_machines, type = 'scatter', mode = "lines",
hovertemplate = paste(
'Percent of total downtime instances: %{y}',
'<br>Number of machines: %{x}',
'<br>Percent of machines: %{text}'
),
name = ''
) %>%
layout(
title = "cumulative % of downtime instances by machines",
yaxis = list(tickformat = ".2%", title = '% total downtime instances'),
xaxis = list(title = "number of machines")
)
tb %>%
mutate(m_id = paste0(machine_id, "_", plant)) %>%
group_by(m_id, plant) %>%
summarize(total_dt = sum(dt)/(60*24)) %>%
ungroup() %>%
mutate(m_id = fct_reorder(m_id, total_dt, .desc = T)) %>%
plot_ly(x = ~m_id, y = ~total_dt, color = ~plant, type = "bar") %>%
layout(
xaxis = list(showticklabels = FALSE, title = "machine id"),
yaxis = list(title = "downtime (days)")
)
tb %>%
mutate(m_id = paste0(machine_id, "_", plant)) %>%
group_by(m_id, category) %>%
summarize(total_dt = sum(dt)/(60*24)) %>%
ungroup() %>%
mutate(m_id = fct_reorder(m_id, total_dt, .desc = T)) %>%
plot_ly(x = ~m_id, y = ~total_dt, color = ~category, type = "bar") %>%
layout(
xaxis = list(showticklabels = FALSE, title = "machine id"),
yaxis = list(title = "downtime (days)")
)
tb %>%
mutate(m_id = paste0(machine_id, "_", plant)) %>%
group_by(m_id) %>%
summarize(tot_dt = sum(dt)) %>%
mutate(m_id = fct_reorder(m_id, tot_dt, .fun = max)) %>%
arrange(desc(m_id)) %>%
mutate(cum_dt = cumsum(tot_dt)) %>%
mutate(p = round(cum_dt/sum(tot_dt), 4)) %>%
rowid_to_column(var = 'n_machines') %>%
mutate(p_machines = scales::percent(n_machines/max(n_machines), accuracy = 0.01)) %>%
plot_ly(
x = ~n_machines, y = ~p, text = ~p_machines, type = 'scatter', mode = "lines",
hovertemplate = paste(
'Percent of total downtime: %{y}',
'<br>Number of machines: %{x}',
'<br>Percent of machines: %{text}'
),
name = ''
) %>%
layout(
title = "cumulative % of total downtime by machines",
yaxis = list(tickformat = ".2%", title = '% total downtime'),
xaxis = list(title = "number of machines")
)
tb_day_summary_tmp = tb %>%
group_by(day, plant) %>%
summarize(
dt_instances = n(), avg_dt = mean(dt), med_dt = median(dt),
tot_dt = sum(dt)
) %>%
ungroup()
tb_day_summary = lapply(c('cooking', 'refrigeration', 'ventilation'), function(i) {
tb_day_summary_tmp %>%
filter(plant == i) %>%
complete(day = seq(min(day), max(day), by = 'day')) %>%
mutate(plant = i) %>%
mutate(wday = weekdays(day))
}) %>% bind_rows()
tb_day_summary %>%
plot_ly(
x = ~day, y = ~dt_instances, color = ~plant, type = "scatter", mode = "lines",
text = ~wday, alpha = 0.8
) %>%
layout(
xaxis = list(title = "day"),
yaxis = list(title = "downtime instances"),
title = "downtime instances per day"
)
tb_day_summary %>%
mutate(tot_dt = tot_dt/(60*24)) %>%
plot_ly(
x = ~day, y = ~tot_dt, color = ~plant, type = "scatter", mode = "lines",
text = ~wday, alpha = 0.8
) %>%
layout(
xaxis = list(title = "day"),
yaxis = list(title = "total downtime (days)"),
title = "total downtime per day"
)
tb_num_machines = tb %>%
mutate(m_id = paste0(machine_id, "_", plant)) %>%
group_by(m_id, plant) %>%
slice(1) %>%
ungroup() %>%
group_by(plant) %>%
summarize(num_machines = n())
tb_day_summary2 = tb_day_summary %>%
left_join(tb_num_machines, by = 'plant') %>%
mutate(dt_instances_per_machine = dt_instances/num_machines) %>%
mutate(tot_dt_per_machine = tot_dt/num_machines)
tb_day_summary2 %>%
plot_ly(
x = ~day, y = ~dt_instances_per_machine, color = ~plant, type = "scatter", mode = "lines",
text = ~wday, alpha = 0.8
) %>%
layout(
xaxis = list(title = "day"),
yaxis = list(title = "downtime instances per machine"),
title = "scaled downtime instances per day"
)
tb_day_summary2 %>%
ggplot(aes(dt_instances_per_machine, plant)) +
geom_boxplot() +
geom_violin(alpha = 0.5, fill = 'black', linewidth = 0) +
labs(
x = "downtime instances per machine, per day",
y = "",
title = "distribution of scaled downtime instances"
) +
theme_minimal()
tb_day_summary2 %>%
plot_ly(
x = ~day, y = ~tot_dt_per_machine, color = ~plant, type = "scatter", mode = "lines",
text = ~wday, alpha = 0.8
) %>%
layout(
xaxis = list(title = "day"),
yaxis = list(title = "total downtime per machine"),
title = "scaled total downtime per day"
)
tb_day_summary2 %>%
ggplot(aes(tot_dt_per_machine, plant)) +
geom_boxplot() +
geom_violin(alpha = 0.5, fill = 'black', linewidth = 0) +
labs(
x = "total downtime per machine, per day",
y = "",
title = "distribution of scaled total downtime"
) +
theme_minimal()
work_week_order = weekdays(x=as.Date(seq(7), origin="1950-01-01"))
p = tb_day_summary2 %>%
mutate(day_of_week = weekdays(day)) %>%
mutate(day_of_week = factor(day_of_week, work_week_order)) %>%
group_by(plant, day_of_week) %>%
summarize(med_dt_instances_per_machine = median(dt_instances_per_machine, na.rm = T)) %>%
ggplot(aes(plant, med_dt_instances_per_machine, fill = day_of_week)) +
geom_bar(stat = 'identity', position = 'dodge') +
scale_fill_npg() +
labs(fill = "", x = "", y = "median downtime instances per machine") +
theme_minimal()
ggplotly(p)
work_week_order = weekdays(x=as.Date(seq(7), origin="1950-01-01"))
p = tb_day_summary2 %>%
mutate(day_of_week = weekdays(day)) %>%
mutate(day_of_week = factor(day_of_week, work_week_order)) %>%
group_by(plant, day_of_week) %>%
summarize(med_dt = median(tot_dt_per_machine, na.rm = T)) %>%
ggplot(aes(plant, med_dt, fill = day_of_week)) +
geom_bar(stat = 'identity', position = 'dodge') +
scale_fill_npg() +
labs(fill = "", x = "", y = "median daily downtime per machine (m)") +
theme_minimal()
ggplotly(p)
sessionInfo()
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19044)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dendextend_1.16.0 diagram_1.6.5 shape_1.4.6 markovchain_0.9.0
## [5] plotly_4.10.1 ggsci_2.9 lubridate_1.8.0 readxl_1.4.1
## [9] forcats_0.5.2 stringr_1.5.0 dplyr_1.0.10 purrr_1.0.1
## [13] readr_2.1.3 tidyr_1.3.0 tibble_3.1.8 ggplot2_3.4.0
## [17] tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] fs_1.5.2 bit64_4.0.5 RColorBrewer_1.1-3
## [4] httr_1.4.4 tools_4.2.1 backports_1.4.1
## [7] bslib_0.4.1 DT_0.26 utf8_1.2.2
## [10] R6_2.5.1 DBI_1.1.3 lazyeval_0.2.2
## [13] colorspace_2.0-3 withr_2.5.0 tidyselect_1.2.0
## [16] gridExtra_2.3 bit_4.0.4 compiler_4.2.1
## [19] cli_3.4.1 rvest_1.0.3 expm_0.999-6
## [22] xml2_1.3.3 labeling_0.4.2 sass_0.4.2
## [25] scales_1.2.1 digest_0.6.30 rmarkdown_2.17
## [28] pkgconfig_2.0.3 htmltools_0.5.3 highr_0.9
## [31] dbplyr_2.2.1 fastmap_1.1.0 htmlwidgets_1.5.4
## [34] rlang_1.0.6 rstudioapi_0.14 farver_2.1.1
## [37] jquerylib_0.1.4 generics_0.1.3 jsonlite_1.8.3
## [40] crosstalk_1.2.0 vroom_1.6.0 googlesheets4_1.0.1
## [43] magrittr_2.0.3 Matrix_1.5-3 Rcpp_1.0.9
## [46] munsell_0.5.0 fansi_1.0.3 viridis_0.6.2
## [49] lifecycle_1.0.3 stringi_1.7.8 yaml_2.3.6
## [52] grid_4.2.1 parallel_4.2.1 crayon_1.5.2
## [55] lattice_0.20-45 haven_2.5.1 hms_1.1.2
## [58] knitr_1.40 pillar_1.8.1 igraph_1.3.5
## [61] stats4_4.2.1 reprex_2.0.2 glue_1.6.2
## [64] evaluate_0.18 data.table_1.14.4 RcppParallel_5.1.5
## [67] modelr_0.1.9 vctrs_0.5.2 tzdb_0.3.0
## [70] cellranger_1.1.0 gtable_0.3.1 assertthat_0.2.1
## [73] cachem_1.0.6 xfun_0.34 broom_1.0.1
## [76] googledrive_2.0.0 viridisLite_0.4.1 gargle_1.2.1
## [79] ellipsis_0.3.2