04_run-report.Rmd

---
title: "Run Report"
output: html_document
date: '2022-12-19'
---

```{r setup, include=FALSE}
# Packages used throughout the report
library(tidyverse)
library(here)

# Do not echo chunk source code in the rendered report
knitr::opts_chunk$set(echo = FALSE)

# Directory that is searched recursively for run outputs
base_dir <- "MANDIFORE_runs"
```

# Logfile summary

This table lists each ensemble for each site in the run and checks its logfile for ED2 errors and `PEcAn.ED2::model2netcdf.ED2()` errors. Ensembles that did not complete the full run but show no errors were most likely killed when the HPC walltime was reached.

```{r}
# Paths of every logfile found anywhere below base_dir
logs <- list.files(
  path = base_dir,
  pattern = "logfile.txt",
  recursive = TRUE,
  full.names = TRUE
)

# Keep only the last 50 lines of each logfile
log_ends <- map(logs, \(log_path) tail(read_lines(log_path), 50))

# Check the end of each log for common patterns. Each result has one element
# per entry of `log_ends`.

# TRUE when any line contains the "MODEL FINISHED" marker
finished <- log_ends |>
  map_lgl(\(x) any(str_detect(x, "MODEL FINISHED")))

# TRUE when any line contains either of the two error markers
ed_error <- log_ends |>
  map_lgl(\(x) {
    any(str_detect(x, "ERROR IN MODEL RUN")) ||
      any(str_detect(x, "FATAL ERROR"))
  })

# Text following "---> Reason:" and/or the line mentioning SIGABRT;
# NA when neither is present
ed_error_reason <- log_ends |>
  map_chr(\(x) {
    reasons <-
      x[str_detect(x, "(?<=---> Reason:).+") | str_detect(x, "SIGABRT")] |>
      str_remove("---> Reason:") |>
      str_trim()
    if (length(reasons) == 0) {
      return(NA_character_)
    }
    # map_chr() needs exactly one string per log, so several matching lines
    # (e.g. a reason plus a SIGABRT line) are collapsed instead of erroring
    str_c(unique(reasons), collapse = "; ")
  })

# TRUE when any line contains the model2netcdf.ED2 error marker
model2netcdf_error <- log_ends |>
  map_lgl(\(x) any(str_detect(x, "ERROR IN model2netcdf.ED2")))
```

```{r}
# Assemble the report table: one row per logfile, labelled with the site and
# ensemble number parsed from the logfile path
log_report <-
  tibble(
    path = logs,
    finished = finished,
    ed_error = ed_error,
    ed_error_reason = ed_error_reason,
    model2netcdf_error = model2netcdf_error
  ) |>
  mutate(
    ensemble = as.numeric(str_extract(path, "(?<=ENS-)\\d+")),
    site = str_extract(path, "MANDIFORE-(PNW|SEUS)-\\d+")
  ) |>
  select(site, ensemble, everything()) |>
  # Within a site, finished runs are listed first
  arrange(site, desc(finished), ensemble)
```

```{r}
# First and last year-month found in the "analysis-E-YYYY-MM..." .h5 file names
# of each site/ensemble, joined to the logfile checks in `log_report`.
ensemble_report <-
  tibble(path = list.files(base_dir, "analysis-E-.+\\.h5", recursive = TRUE)) |>
  mutate(
    ensemble = path |> str_extract("(?<=ENS-)\\d+") |> as.numeric(),
    site = path |> str_extract("MANDIFORE-(PNW|SEUS)-\\d+"),
    date = path |>
      basename() |>
      str_extract("(?<=E-)\\d{4}-\\d{2}") |>
      lubridate::ym()
  ) |>
  group_by(site, ensemble) |>
  summarize(
    date_start = min(date),
    date_end = max(date),
    # Drop grouping so the result is a plain tibble and no regrouping message
    # is printed into the report
    .groups = "drop"
  ) |>
  # Explicit keys: these are the only columns the two tables share, and naming
  # them silences the "Joining with `by = ...`" message
  full_join(log_report, by = c("site", "ensemble")) |>
  select(-path)
reactable::reactable(ensemble_report, filterable = TRUE)
```

# Ensemble data summary

Get the samples.Rdata for every site run and combine into a dataframe.

```{r}
# Find the samples.Rdata of every site run. The pattern is anchored so that
# other files whose names merely end in "samples.Rdata"-like text
# (e.g. ensemble.samples.*.Rdata) are not picked up.
samples_rdata <- list.files(
  base_dir,
  pattern = "^samples\\.Rdata$",
  recursive = TRUE,
  full.names = TRUE
)
sitenames <- samples_rdata |> str_extract("MANDIFORE-(PNW|SEUS)-\\d+")

# One row per site x PFT x ensemble member, with the sampled inputs as columns
ensemble_inputs <-
  samples_rdata |>
  set_names(sitenames) |>
  map_dfr(\(Rdata) {
    # load() creates `ensemble.samples` inside this function only
    load(Rdata)
    # Drop the "env" element by name. Logical subsetting is used because
    # x[-which(...)] returns an empty list when nothing matches.
    ensemble.samples[names(ensemble.samples) != "env"] |>
      # Number the rows of each table; seq_len() is safe for zero-row tables
      map(\(.x) mutate(.x, ensemble = seq_len(n()))) |>
      bind_rows(.id = "pft_name")
  }, .id = "site") |>
  select(site, ensemble, pft_name, everything())

reactable::reactable(ensemble_inputs, filterable = TRUE)
```

1) load samples.Rdata and create a table for each ensemble member with 1 column per parameter per pft

Join with the ensemble report and the actual data summary, then model whether variables go extreme (as a binary outcome) as a function of the inputs.

The data are in ensemble.samples.NOENSEMBLEID.Rdata, inside the object `ens.samples`.


2) plot distributions for traits in `trait.samples` (in samples.Rdata).  Look for anything extreme or weird

# Posterior summary

A work-in-progress.  Open and visualize posterior distributions with the `distributional` and `ggdist` packages.

```{r}
library(distributional)
library(ggdist)
library(dplyr)
library(glue)
# Paths of every post.distns.Rdata found anywhere below base_dir
posterior_rdata <- list.files(
  base_dir,
  pattern = "post\\.distns\\.Rdata",
  recursive = TRUE,
  full.names = TRUE
)

# Stack the `post.distns` table from each file; the source file goes in `path`
# and the table's row names go in `var`
dists_raw <-
  set_names(posterior_rdata) |>
  map_dfr(\(Rdata) {
    load(Rdata)
    as_tibble(post.distns, rownames = "var")
  }, .id = "path") #|> 
  # group_by(pft_name, var) |> 
  # summarize(across(starts_with("param"), mean))
  # #make distributions for distributional and ggdist packages
  # mutate(dist_string = glue("{distn}({parama}, {paramb})")) |> 
  # parse_dist(dist_string)

# Average the posterior parameters per variable and PFT, then turn each row
# into a distribution specification that ggdist can plot.
dists <-
  dists_raw |>
  mutate(
    site = str_extract(path, "MANDIFORE-(PNW|SEUS)-\\d+"),
    # The PFT name is the path component right after "pft/". Taking everything
    # up to the next "/" keeps names containing digits or hyphens intact.
    pft_name = str_extract(path, "(?<=pft/)[^/]+")
  ) |>
  select(-path) |>
  group_by(var, pft_name) |>
  summarize(
    # NOTE(review): assumes one distribution family per var/PFT; if several
    # occur, this yields one row per family with the same averaged parameters
    distn = unique(distn),
    across(starts_with("param"), mean),
    # Return an ungrouped tibble and avoid the regrouping message
    .groups = "drop"
  ) |>
  # Make distributions for the distributional and ggdist packages
  mutate(dist_string = glue("{distn}({parama}, {paramb})")) |>
  parse_dist(dist_string)

# Plot the distributions in pages of at most five variables each
vars <- unique(dists$var)
varslist <- split(vars, ceiling(seq_along(vars) / 5))
map(varslist, \(page_vars) {
  # Rows belonging to the variables on this page
  page_data <- filter(dists, var %in% page_vars)
  ggplot(page_data) +
    stat_slabinterval(
      aes(dist = .dist, args = .args),
      orientation = "y",
      size = 1
    ) +
    # One row of panels per PFT, one column per variable
    facet_grid(pft_name ~ var, scales = "free")
})

```