khs_features.Rmd

---
title: "R Notebook"
output: md_document
---

# Initialization
```{r, message=FALSE, warning=FALSE}
##### Reset Workspace #####
# Remove every object from the current environment so the notebook starts
# from a clean slate.
rm(list = ls())

##### Project Helpers #####
source("./R/utils.R")
source("./R/pc_plots.R")

##### Package Dependencies #####
# load_packages() is not defined in this file; presumably it comes from one
# of the sourced helper scripts above.
required_packages <- c(
  "magrittr", "data.table", "pbapply", "imputeTS",
  "ggfortify", "qs", "feasts", "future"
)
load_packages(required_packages)

##### Free Memory #####
invisible(gc())

```

```{r}
# `processed_data_paths` is otherwise only assigned in chunks marked
# `eval = FALSE`, so this evaluated chunk would fail on a clean run. Fall back
# to the directory used by the KHS chunk when the variable is missing.
# NOTE(review): assumes the M4 file lives in ./Data/Processed - confirm.
if (!exists("processed_data_paths")) {
  processed_data_paths <- list.files("./Data/Processed", full.names = TRUE)
}

# Exactly one processed file is expected to carry the "M4DT" tag.
m4_path <- grep("M4DT", processed_data_paths, value = TRUE)
if (length(m4_path) != 1L) {
  stop(sprintf(
    "Expected exactly one processed file matching 'M4DT', found %d",
    length(m4_path)
  ))
}
M4 <- qread(m4_path, nthreads = getDTthreads())

# Empirical CDF of `n` for the weekly series, with a reference line at 104
# (the cut-off evaluated below).
M4[period == "Weekly", ] %>%
  ggplot(aes(n)) +
  stat_ecdf() +
  scale_x_continuous(breaks = seq(0, 3000, 100)) +
  scale_y_continuous(breaks = seq(0, 1, 0.1)) +
  geom_vline(xintercept = 104)

# Share of series whose `n` exceeds the cut-off: 104 for weekly data and
# 365 * 2 for daily data.
M4[period == "Weekly", sum(n > 104) / .N]
M4[period == "Daily", sum(n > 365 * 2) / .N]
```

# Download Kaggle data
# To use this script, you will need to accept the competition rules at `https://www.kaggle.com/c/<competition-name>/rules`.
```{r, eval = FALSE}
# Run the download shell script through the platform's command interpreter.
# The spaces in the script path are backslash-escaped for the shell.
if (.Platform$OS.type == "unix") {
  system("./Munge/00\\ -\\ Download\\ raw\\ kaggle\\ data.sh")
} else {
  # NOTE(review): shell() hands the command to the Windows command
  # interpreter; confirm the backslash-escaped path and the .sh script
  # actually run there.
  shell("./Munge/00\\ -\\ Download\\ raw\\ kaggle\\ data.sh")
}

####### Prepare Data Information #######
# One row per downloaded competition archive. Every hard-coded column below is
# positional, so it must follow the order in which list.files() returns the
# archives in ./Data/Raw_Data.
data_info <- data.table(
  # Strip only a literal trailing ".zip"; the previous pattern ".zip" was an
  # unanchored regex whose dot matched any character.
  id = sub("\\.zip$", "", list.files("./Data/Raw_Data")),
  raw_data_dir = list.files("./Data/Raw_Data", full.names = TRUE),
  train_data = list(
    "train", "air_visit_data", "train", "train", "train",
    c("train_1", "train_2")
  ),
  test_data = c("test", NA, "test", NA, "test", NA),
  group = list(
    c("store_nbr", "item_nbr"),
    c("air_store_id"),
    c("store"),
    c("store_nbr", "item_nbr"),
    c("store", "dept"),
    c("page")
  ),
  target = c("unit_sales", "visitors", "sales", "units", "weekly_sales", "views"),
  index = c("date", "visit_date", "date", "date", "date", "date"),
  ts_frequency = c(7, 7, 7, 7, 52, 7)
)
```

# Restructure Kaggle Data from Multiple "XX.CSV" files to Single "Combined.RDS" file 
```{r, eval=FALSE}

####### Control Parameters #######
# Not used in this chunk itself; presumably read by the job script, which
# receives this environment through `importEnv = TRUE`.
restructured_write_path <- "./Data/Restructured/"

####### Launch Batch Assembly Job #######
rstudioapi::jobRunScript(
  path = "./Munge/01 - Assemble Kaggle Data to Combined RDS File.R",
  workingDir = getwd(),
  importEnv = TRUE
)
```

# Run Preprocessing Script
```{r, eval=FALSE}

####### Initiate Control Parameters #######
# Attach the restructured file of each competition to its `data_info` row.
# NOTE(review): this relies on list.files() returning the files in the same
# order as the rows of `data_info` - confirm.
data_info[, restructured_data_path := list.files("./Data/Restructured", full.names = TRUE)]
processed_training_save_path <- "./Data/Processed/"
# Imputation function (passed as a function object, not called here).
imputation_method <- imputeTS::na_seasplit

####### Source Preprocessing Script #######
# The job receives the variables defined above through `importEnv = TRUE`.
rstudioapi::jobRunScript(
  path = "./Munge/02 - Preprocess Data.R",
  workingDir = getwd(),
  importEnv = TRUE
)

```

# Install ForeCA (Required for estimation of Spectral Entropy)
```{r, eval=FALSE}
# Source tarballs taken from the CRAN archive.
foreca_url <- "https://cran.r-project.org/src/contrib/Archive/ForeCA/ForeCA_0.2.6.tar.gz"
ifultools_url <- "https://cran.r-project.org/src/contrib/Archive/ifultools/ifultools_2.0-5.tar.gz"
sapa_url <- "https://cran.r-project.org/src/contrib/Archive/sapa/sapa_2.0-2.tar.gz"

# splus2R comes from the configured repository; the archived tarballs are then
# installed one at a time, in this order, directly from their URLs.
install.packages("splus2R")
invisible(lapply(
  c(ifultools_url, sapa_url, foreca_url),
  install.packages,
  repos = NULL
))
```


# Extract Kang, Hyndman, Smith Features from Kaggle and M Competitions
```{r, eval = FALSE}

####### Initiate Control Parameters #######
# These variables are handed to the job scripts through `importEnv = TRUE`.
processed_data_paths <- list.files("./Data/Processed", full.names = TRUE)
khs_feat_save_path <- "./Results/khs_features/"
khs_retry_save_path <- "./Cache/khs_retry/"
khs_number_of_retry <- 3
khs_plot_save_path <- "./Figures/"


# Use adjusted frequency: Daily = 7; Weekly = 52
adjusted_weekly_daily_frequency <- TRUE
# Include `Frequency` in prcomp / plot
include_frequency <- TRUE

####### Extract and Plot KHS Features #######
# NOTE(review): jobRunScript() appears to launch a background job and return
# without waiting. Confirm that script 04 does not start before script 03 has
# written its results, and that the second pair of runs below does not
# overlap with the first.
rstudioapi::jobRunScript(path = "./Munge/03 - Extract KHS Features.R", workingDir = getwd(), importEnv = TRUE)
rstudioapi::jobRunScript(path = "./Munge/04 - Plot KHS Features.R", workingDir = getwd(), importEnv = TRUE)

# With frequency: Daily = 1, Weekly = 1
adjusted_weekly_daily_frequency <- FALSE
# Include `Frequency` in prcomp / plot
include_frequency <- TRUE

rstudioapi::jobRunScript(path = "./Munge/03 - Extract KHS Features.R", workingDir = getwd(), importEnv = TRUE)
rstudioapi::jobRunScript(path = "./Munge/04 - Plot KHS Features.R", workingDir = getwd(), importEnv = TRUE)

```

# SBC Classification
```{r, eval = FALSE}

####### Initiate Control Parameters #######
# NOTE(review): the preprocessing chunk saves to "./Data/Processed/", while
# this chunk reads from "./Cache/processed_training_data/" - confirm which
# directory is current.
processed_data_paths <- list.files("./Cache/processed_training_data/", full.names = TRUE)
sbc_save_path <- "./Results/sbc_class/"
sbc_plot_save_path <- "./Figures/"

####### Source SBC Classification Script #######
rstudioapi::jobRunScript(path = "./Munge/05 - Extract SBC Classification.R", workingDir = getwd(), importEnv = TRUE)


# Stack every result file found in `sbc_save_path` into one table.
# NOTE(review): jobRunScript() appears to return before the job finishes, so
# make sure the job above has completed before running the lines below.
all_sbc_classes <- rbindlist(lapply(list.files(sbc_save_path, full.names = TRUE), readRDS))

# Replace the competition slugs with display names; unknown ids are kept as-is.
all_sbc_classes[, id := dplyr::case_when(id == "favorita-grocery-sales-forecasting" ~ "Corporacion Favorita",
                                         id == "recruit-restaurant-visitor-forecasting" ~ "Recruit Restaurant",
                                         id == "rossmann-store-sales" ~ "Rossmann",
                                         id == "walmart-recruiting-store-sales-forecasting" ~ "Walmart Store Sales",
                                         id == "walmart-recruiting-sales-in-stormy-weather" ~ "Walmart Stormy Weather",
                                         id == "web-traffic-time-series-forecasting" ~ "Wikipedia",
                                         TRUE ~ id)]

#' Scatter plot of ADI against CV2, one panel per `id`
#'
#' Rows containing any missing value are dropped before plotting. Both axes
#' use a log10 scale, and reference lines mark the ADI threshold (1.32) and
#' the CV2 threshold (0.49).
#'
#' @param .data A data.table with (at least) the columns `ADI`, `CV2` and `id`.
#' @return A ggplot object.
sbc_point_plot <- function(.data) {
  .data[complete.cases(.data)] %>%
    ggplot(aes(ADI, CV2)) +
    geom_vline(xintercept = 1.32) + # ADI Threshold
    geom_hline(yintercept = 0.49) + # CV2 Threshold
    # Spelled-out TRUE: the shorthand `T` is an ordinary, reassignable variable.
    geom_point(na.rm = TRUE, alpha = 0.5) +
    scale_y_log10() + scale_x_log10() +
    facet_wrap(~id)
}


sbc_point_plot(all_sbc_classes)

#' Horizontal box plots of ADI, one box per `id`
#'
#' Rows with any missing value are removed first. ADI is drawn on a log10
#' scale and the coordinates are flipped so the boxes lie horizontally.
#'
#' @param .data A data.table with (at least) the columns `ADI` and `id`.
#' @return A ggplot object.
sbc_box_plot <- function(.data) {
  complete_rows <- .data[complete.cases(.data), ]

  # A faceted layout (facet_wrap(~id, scales = "free_y")) was tried here and
  # left disabled.
  ggplot(complete_rows, aes(x = id, y = ADI)) +
    geom_boxplot() +
    scale_y_log10() +
    theme_bw() +
    theme(axis.title.y = element_blank()) +
    labs(y = "Log10( Average Demand Interval )") +
    coord_flip()
}

sbc_box_plot(all_sbc_classes)


# NOTE(review): sbc_plot() is not defined in this file; presumably it comes
# from one of the sourced helper scripts - confirm.
sbc_plot(all_sbc_classes)

#' Faceted box plots of ADI, one panel per `id`
#'
#' Rows with any missing value are removed first. Each panel holds a single
#' box (constant x position) with its own log10 y-axis.
#'
#' NOTE(review): this redefines `sbc_box_plot` and replaces the earlier
#' definition of the same name in this chunk.
#'
#' @param .data A data.table with (at least) the columns `ADI` and `id`.
#' @return A ggplot object.
sbc_box_plot <- function(.data) {
  complete_rows <- .data[complete.cases(.data), ]

  ggplot(complete_rows, aes(x = 1, y = ADI)) +
    geom_boxplot() +
    scale_y_log10() +
    facet_wrap(~id, scales = "free_y") +
    theme_bw() +
    theme(axis.title.x = element_blank()) +
    labs(y = "Log10( Average Demand Interval )")
}

```

# Method to control script sourcing in a separate session
```{r}
library(future)

#' Assert that an object is a list
#'
#' @param x Object to validate.
#' @param recursive If `TRUE`, every direct child of `x` must also be a list.
#' @return `NULL`, invisibly. Called for its side effect of raising an error
#'   when the check fails.
assert_is_list <- function(x, recursive = FALSE) {
  # Label used in the error message. Looking `x` up in the caller's frame
  # reports the expression the caller itself was given when the caller's
  # argument is also called `x`.
  label <- substitute(x, env = parent.frame())
  if (!is.language(label)) {
    # The caller holds `x` as a plain value: fall back to the call-site
    # expression instead of deparsing the data itself.
    label <- substitute(x)
  }
  # deparse() may return several strings for long expressions.
  label <- paste(deparse(label), collapse = " ")

  if (!is.list(x)) {
    stop(sprintf("Failed is.list(%s) : %s must be a list", label, label))
  }

  if (isTRUE(recursive)) {
    # vapply() always yields a logical vector, including for empty input.
    child_is_list <- vapply(x, is.list, logical(1), USE.NAMES = FALSE)
    if (!all(child_is_list)) {
      failed <- which(!child_is_list)
      child_names <- names(x)
      # Report names only when every failing child has one; otherwise the
      # message would contain blank entries, so report positions instead.
      if (!is.null(child_names) && all(nzchar(child_names[failed]))) {
        child_msg <- paste(child_names[failed], collapse = ", ")
        stop(sprintf("Following child names failed is.list() : %s", child_msg))
      }
      child_msg <- paste(failed, collapse = ", ")
      stop(sprintf("Following child indices failed is.list() : %s", child_msg))
    }
  }

  invisible(NULL)
}

#' Validate the entries of a `future_source()` specification
#'
#' Every entry of `x` must be a list with exactly the two elements `FUN`
#' (an unevaluated call) and `args` (a list whose elements are all named;
#' an empty list is accepted).
#'
#' @param x A list of entries, each of the form `list(FUN = <call>, args = <list>)`.
#' @return `NULL`, invisibly. Called for its side effect of raising an error
#'   on the first entry that violates the contract.
assert_arguments <- function(x) {
  expected_names <- c("FUN", "args")

  for (entry in x) {
    entry_names <- names(entry)
    # Both names must be present, with nothing else alongside them.
    if (length(entry_names) != length(expected_names) ||
        !setequal(entry_names, expected_names)) {
      stop("Failed argument check : Arguments must be named 'FUN' and 'args'")
    }

    if (!is.call(entry$FUN)) {
      stop("Failed argument check : Argument 'FUN' must be a call created with 'substitute()'")
    }

    entry_args <- entry$args
    arg_names <- names(entry_args)
    args_named <- is.list(entry_args) &&
      (length(entry_args) == 0L ||
         (!is.null(arg_names) && all(nzchar(arg_names))))
    if (!args_named) {
      stop("Failed argument check : Argument 'args' must be a named list")
    }
  }

  invisible(NULL)
}

# Specification consumed by future_source(): one named entry per script, each
# holding the unevaluated source() call (`FUN`) and the variables that call
# needs (`args`).
# The script numbers follow the numbering used by the other chunks of this
# notebook ("02 - Preprocess Data.R", "05 - Extract SBC Classification.R").
# NOTE(review): confirm these are the file names present in ./Munge.
# NOTE(review): `data_info` and `sbc_classifier` must already exist when this
# chunk runs; neither is created by an evaluated chunk in this file.
sources <- list(
  "(1) Prepare Data Information" = list(
    FUN = substitute(source("./Munge/02 - Preprocess Data.R")),
    args = list(
      data_info = data_info,
      processed_training_save_path = "./Cache/processed_training_data/",
      imputation_method = imputeTS::na_seasplit
    )
  ),
  "(X) Extract SBC Classification" = list(
    FUN = substitute(source("./Munge/05 - Extract SBC Classification.R")),
    args = list(
      data_info = data_info,
      sbc_save_path = "./Results/sbc_class/",
      processed_data_paths = list.files("./Cache/processed_training_data/", full.names = TRUE),
      sbc_classifier = sbc_classifier
    )
  )
)

#' Run a series of calls one after another, each in a fresh `future` session
#'
#' @param x A list of entries of the form `list(FUN = <call>, args = <named
#'   list>)`, validated with `assert_is_list()` and `assert_arguments()`.
#'   `FUN` is evaluated as a future with `args` supplied as its globals.
#' @param workers Passed on to `future::plan(future::cluster, workers = )`.
#' @return `NULL`, invisibly. Errors raised by a call are re-raised here by
#'   `future::value()`.
future_source <- function(x, workers = 1) {
  # requireNamespace() + stop() fails loudly; require() only returned FALSE.
  if (!requireNamespace("future", quietly = TRUE)) {
    stop("Package 'future' is required by future_source()", call. = FALSE)
  }
  assert_is_list(x, recursive = TRUE)
  assert_arguments(x)

  # Return to the sequential plan however the function exits, including when
  # one of the calls fails.
  on.exit(future::plan(future::sequential), add = TRUE)

  step_names <- names(x)
  for (i in seq_along(x)) {
    # Fall back to the position when an entry has no name.
    step_label <- if (is.null(step_names) || !nzchar(step_names[i])) {
      as.character(i)
    } else {
      step_names[i]
    }
    writeLines(sprintf("##### %s #####", step_label))

    # Create the session for this calculation. This now happens for every
    # entry; the previous condition skipped the last (or only) one.
    future::plan(future::cluster, workers = workers)

    # `substitute = FALSE`: `FUN` already is the unevaluated call.
    f <- future::future(x[[i]]$FUN,
                        substitute = FALSE,
                        globals = x[[i]]$args)

    # Poll until the calculation has resolved, then collect its value so that
    # any error from the worker surfaces here.
    while (!future::resolved(f)) {
      Sys.sleep(1)
    }
    future::value(f)

    # Switch back to sequential between calculations so the next one starts
    # from a newly created session.
    # NOTE(review): relies on plan(sequential) shutting down the cluster
    # workers - confirm against the installed version of future.
    future::plan(future::sequential)
  }

  invisible(NULL)
}
```


```{r}


```