_troubleShootingTmp.Rmd

---
title: "Optical Model no AOD"
author: "Simon Topp"
date: "6/7/2019"
output: html_document
editor_options: 
  chunk_output_type: console
---

```{r setup, include=FALSE}
library(tidyverse)
library(feather)
library(googledrive)
library(viridis)
library(knitr)
library(sf)
library(rgdal)
library(maps)
library(magrittr)
library(mlbench)
library(caret)
library(randomForest)
library(doParallel)
library(furrr)
library(lubridate)
library(groupdata2)
library(CAST)
library(mapview)
library(onehot)
library(Metrics)
library(kableExtra)
library(ggpmisc)
library(stlplus)
library(Hmisc)
library(xgboost)
library(scales)
# Global chunk options: error, warning, message and echo are all switched off.
knitr::opts_chunk$set(
  error = FALSE,
  warning = FALSE,
  message = FALSE,
  echo = FALSE
)
```

```{r}
## Table of variables for manuscript

# Raw feature names as one comma-separated string. Nothing else in this chunk
# reads it; varTable below carries the human-readable labels.
features <- "NR, BG, dWL, inStreamCat, PctCarbResidCat, Precip8110Cat, Tmean8110Cat, RunoffCat, ClayCat, SandCat, OmCat, PermCat, RckdepCat, WtDepCat, BFICat, ElevCat, AOD, KffactCat, CatAreaSqKm, PctAg2006Slp10Cat, NPDESDensCat, HydrlCondCat, PctImp2006Cat, pctUrban2006, pctForest2006, PctCrop2006Cat, pctWetland2006, WetIndexCat, areasqkm, meandused"

# One row per predictor: display name plus a one-line description.
# The two vectors are positional pairs (30 entries each), so keep them in the
# same order when editing. Row numbers are referenced by row_spec() further
# down, so inserting or removing rows requires updating those indices too.
varTable <- tibble(
  Variable = c(
    'Lake Area',
    'Lake Depth',
    'NIR/Red',
    'Blue/Green',
    'Forel-Ule Index',
    'Network Status',
    'Percent Carbonate',
    'Mean Precipitation',
    'Mean Temperature',
    'Runoff',
    'Percent Clay',
    'Percent Sand',
    'Percent Organic Matter',
    'Soil Permeability',
    'Bedrock Depth',
    'Water Table Depth',
    'Base Flow Index',
    'Elevation',
    'Atmospheric Optical Depth',
    'Kffactor',
    'Catchment Area',
    'Percent Agriculture',
    'NPDES Density',
    'Hydraulic Conductivity',
    'Percent Impervious Surface',
    'Percent Urban',
    'Percent Forest',
    'Percent Crop',
    'Percent Wetland',
    'Topographic Wetness Index'
  ),
  Description = c(
    'Lake area from NHD v2 (sq. km)',
    'Mean lake depth from NHD v2 (m)',
    'Band Ratio',
    'Band Ratio',
    'Dominant Color Wavelength as defined by Wang et al. (2015)',
    'Binary on/off network status for lake based on NHD flowlines',
    'Percent of catchment area classified as lithology type: carbonate residual material',
    'PRISM climate data - 30-year normal mean precipitation (mm): Annual period: 1981-2010 within the catchment',
    'PRISM climate data - 30-year normal mean temperature (C°): Annual period: 1981-2010 within the catchment',
    'Mean runoff (mm) within catchment',
    'Mean % clay content of soils (STATSGO) within catchment',
    'Mean % sand content of soils (STATSGO) within catchment',
    'Mean organic matter content (% by weight) of soils (STATSGO) within catchment',
    'Mean permeability (cm/hour) of soils (STATSGO) within catchment',
    'Mean depth (cm) to bedrock of soils (STATSGO) within catchment',
    'Mean seasonal water table depth (cm) of soils (STATSGO) within catchment',
    'Base flow is the component of streamflow that can be attributed to ground-water discharge into streams. The BFI is the ratio of base flow to total flow, expressed as a percentage, within catchment',
    'Mean catchment elevation (m)',
    'Atmospheric Optical Depth over observation pulled from MERRA 2 reanalysis data',
    # Previously contained a raw line break and a sentence without a subject.
    'Mean of STATSGO Kffactor raster within catchment. Kffactor is used in the Universal Soil Loss Equation (USLE) and represents a relative index of susceptibility of bare, cultivated soil to particle detachment and transport by rainfall',
    'Area of local catchment (square km)',
    '% of catchment area classified as ag land cover (NLCD 2006 classes 81-82) occurring on slopes > 10%',
    'Density of permitted NPDES (National Pollutant Discharge Elimination System) sites within catchment (sites/square km)',
    'Mean lithological hydraulic conductivity (micrometers per second) content in surface or near surface geology within catchment',
    'Mean imperviousness of anthropogenic surfaces within catchment (NLCD 2006)',
    'Percent of catchment classified as either medium or high density urban (NLCD 2006)',
    'Percent of catchment classified as either deciduous, coniferous, or mixed forest (NLCD 2006)',
    'Percent of catchment area classified as crop land use (NLCD 2006 class 82)',
    'Percent of catchment classified as woody or herbaceous wetland landcover (NLCD 2006)',
    'Mean Composite Topographic Index (CTI)[Wetness Index] within catchment'
  )
)

# Render varTable as a striped, condensed table with a border after the first
# column and a fixed set of rows in bold, then save it as an image.
# (A latex/booktabs variant of the kable() call was tried and left disabled.)
varTable %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "condensed")) %>%
  column_spec(1, border_right = TRUE) %>%
  row_spec(c(2:5, 9:10, 27, 29:30, 16, 20), bold = TRUE) %>%
  save_kable("figures/FigTable.png", height = 9)


## Sample-size summary table: counts before and after QA/QC for each sample.
## sampTable ends up holding the styled kable object, not the raw tibble.
sampTable <- tibble(
  Sample = c(
    "2012 National Lake Assessment",
    "Large Random Sample",
    "Lakes Over 10 sq km",
    "Total Unique Lakes",
    "Total Lakes with >25 years of Observations"
  ),
  Initial.Count = c(1038, 18000, 1170, 19964, 8897),
  Post.QAQC.count = c(1029, 13362, 1105, 14971, 8897)
) %>%
  kable(
    col.names = c("Sample", "Initial Count", "Post QA/QC Count"),
    format.args = list(big.mark = ","),
    align = "c"
  ) %>%
  kable_styling()
```


```{r}
## Compare predictions from full-lake pulls against center-point pulls

# Lakes with areasqkm above 10; lake.join is created outside this chunk.
over10NLA <- filter(lake.join, areasqkm > 10)

# Stack the two prediction files for the current `iteration` and keep only the
# lakes selected above. Only the full-lake rows get lakeSamp assigned here; the
# center-point file is used as read.
pullComp <- bind_rows(
  read_feather(paste0("out/TS_Preds/NLA2012_cntr_", iteration, ".feather")),
  read_feather(paste0("out/TS_Preds/NLA2012_FullLake_", iteration, ".feather")) %>%
    mutate(lakeSamp = "NLA2012_FullLake")
) %>%
  filter(COMID %in% over10NLA$COMID)

# Find the COMID / system.index combinations that occur more than once in
# pullComp, i.e. that have a prediction from both pulls, and tag each with a
# combined id.
counts <- pullComp %>%
  group_by(COMID, system.index) %>%
  summarise(count = n()) %>%
  filter(count > 1) %>%
  mutate(uID = paste0(COMID, system.index))

# Restrict to those paired ids and put the two pulls side by side:
# one row per uID, one column per lakeSamp value.
check <- pullComp %>%
  mutate(uID = paste0(COMID, system.index)) %>%
  filter(uID %in% counts$uID) %>%
  select(uID, value, lakeSamp) %>%
  spread(key = lakeSamp, value = value)

# Scatter of center-point vs full-lake predictions with a 1:1 reference line
# and the adjusted R-squared label of a linear fit pinned to the top of the
# panel.
ggplot(check, aes(NLA2012_cntr, NLA2012_FullLake)) +
  geom_point() +
  geom_abline(color = "red") +
  stat_poly_eq(
    aes(label = paste(stat(adj.rr.label))),
    formula = y ~ x,
    parse = TRUE,
    label.y = Inf,
    vjust = 1.3
  ) +
  labs(title = "Full Lake vs Centerpoint Predictions \nfor lakes over 10 sq km in the NLA2012 sample\n red line is 1:1")

# Median predicted value per lake, year, region and pull type, restricted to
# months 5 through 9. Incomplete obs for 2019, so that year is excluded.
summ.meds <- pullComp %>%
  filter(month %in% 5:9, year < 2019) %>%
  group_by(COMID, year, region, lakeSamp) %>%
  summarise(secchi.med = median(value))

## Count the summary rows available for each lake and attach the lake
## attributes from lake.join. The "> 10" cutoff is currently disabled (see the
## commented-out filter), so no lakes are dropped at this step.
counts <- summ.meds %>%
  group_by(COMID) %>%
  summarise(count = n()) %>%
  # filter(count > 10) %>%
  left_join(lake.join)

#### Wide dataset: one column of median values per year
summ.meds.wide <- summ.meds %>%
  filter(COMID %in% counts$COMID) %>%
  spread(key = year, value = secchi.med)

## Print the number of rows per region as a sanity check
summ.meds.wide %>%
  group_by(region) %>%
  summarise(count = n())

###### Apply 1000 rounds of bootstrapping to yearly summer medians

## For every region / pull-type group, bootstrap the wide table and summarise
## the replicates.
## boot.med (the statistic handed to boot) and boot.summary (turns a boot
## result into a summary table) are defined outside this section.
## boot() is called with its namespace because the boot package is not
## attached in the setup chunk.
bootstrapped.ts <- summ.meds.wide %>%
  group_by(region, lakeSamp) %>%
  nest() %>%
  mutate(
    boot.means = purrr::map(data, ~ boot::boot(., boot.med, R = 1000)),
    boots.summ = purrr::map(boot.means, boot.summary)
  ) %>%
  select(-boot.means, -data) %>%
  unnest(boots.summ)

## Plot the bootstrapped mean by year (years before 2019), one line per pull
## type, one free-scaled panel per region in a two-column layout.
## An se error-bar layer was tried and is left disabled:
##   geom_errorbar(aes(ymin = mean - se, ymax = mean + se), color = 'grey60', alpha = .4)
bootstrapped.ts %>%
  filter(year < 2019) %>%
  ggplot(aes(x = year, y = mean, color = lakeSamp)) +
  geom_line() +
  geom_point() +
  facet_wrap(~region, scales = "free", ncol = 2) +
  theme_bw()
```

```{r}
# Read the Sentinel-2 descending-orbit shapefile (overwritten a few lines down).
check <- st_read(
  "/Users/simontopp/Downloads/Sentinel-2_Descending_Orbits_shape_file/Sentinel-2_Descending_Orbit_Shape_File.shp"
)

# List the layers in the NHDPlus geodatabase, then load the waterbody layer.
st_layers(
  "/Users/simontopp/Downloads/NHDPlusNationalData/NHDPlusV21_National_Seamless_Flattened_Lower48.gdb"
)

check <- st_read(
  "/Users/simontopp/Downloads/NHDPlusNationalData/NHDPlusV21_National_Seamless_Flattened_Lower48.gdb",
  layer = "NHDWaterbody"
)

# Round the three size fields to 5 decimals and store them as formatted text,
# and swap the -99999999 elevation placeholder for -9999 before writing.
check2 <- mutate_at(
  check,
  vars(MeanDepth, LakeVolume, LakeArea),
  ~ format(round(., 5), nsmall = 5)
)
check2$ELEVATION[check2$ELEVATION == -99999999] <- -9999

st_write(check2, "in/NHD/NHDPlusV2_NHDWaterbodies.shp")

# Read back the waterbody shapefile written above. The path now matches the
# st_write() call exactly; it previously differed in letter case
# ("NHDPLusV2"), which fails on case-sensitive file systems.
nhd <- st_read('in/NHD/NHDPlusV2_NHDWaterbodies.shp')

# Keep waterbodies whose area exceeds 62500. LakeArea was stored as formatted
# text before export (and its name shortened to LakeAre in the shapefile), so
# convert it back to a number first; comparing the text directly to 62500
# would be a string comparison. as.character() also covers the case where the
# field is read back as a factor.
swotViz <- nhd %>% filter(as.numeric(as.character(LakeAre)) > 62500)

# Scratch check on a single feature: take its bounding box and build three of
# its corners (elements 1-4 of the bbox are used as xmin, ymin, xmax, ymax) as
# points in CRS 4269.
test <- swotViz[1, ]

check <- test %>% st_bbox(crs = 4269)
points <- st_sfc(st_point(c(check[1], check[2])),
                 st_point(c(check[1], check[4])),
                 st_point(c(check[3], check[2])), crs = 4269)


# Bounding-box dimensions of a geometry.
#   i: a geometry that st_bbox() accepts.
# Builds three bbox corners as points in CRS 4269 and measures two edges with
# st_distance(). Returns an unnamed list: [[1]] the distance between corner
# (bbox[1], bbox[2]) and corner (bbox[3], bbox[2]), [[2]] the distance between
# corner (bbox[1], bbox[2]) and corner (bbox[1], bbox[4]), each rounded to one
# decimal place.
calcDim <- function(i) {
  extent <- st_bbox(i, crs = 4269)
  origin <- st_point(c(extent[1], extent[2]))
  along_y <- st_point(c(extent[1], extent[4]))
  along_x <- st_point(c(extent[3], extent[2]))
  corners <- st_sfc(origin, along_y, along_x, crs = 4269)
  width <- round(st_distance(corners[1], corners[3])[1], 1)
  height <- round(st_distance(corners[1], corners[2])[1], 1)
  list(width, height)
}

# Spot-check calcDim on one geometry.
calcDim(swotViz$geometry[3])

# Compute bounding-box width and height for every waterbody in parallel.
# multisession replaces the multiprocess strategy, which is deprecated and no
# longer accepted by current releases of the future package.
plan(multisession)
test <- swotViz %>%
  mutate(dims = furrr::future_map(geometry, calcDim, .progress = TRUE),
         width = purrr::map_dbl(dims, 1),
         height = purrr::map_dbl(dims, 2)) %>%
  select(-dims)
# Return to sequential evaluation so later code does not keep workers alive.
plan(sequential)

# Keep waterbodies whose bounding box exceeds 300 in both dimensions.
swotViz <- test %>% filter(width > 300 & height > 300)


# Scratch checks on the three corner points built earlier in this chunk.
# The first call previously referenced `point`, which is never defined; the
# object is `points`.
st_distance(points[1], points[2])[1]
st_distance(points[1], points[3])[1]

# Ask sf whether a bare point built from the first two bbox values is treated
# as longitude/latitude.
st_is_longlat(st_point(c(check[1], check[2]), dim = 'XY'))


```