impact-initiatives · yannsay-impact · Jun 19, 2023 · Jun 14, 2023 · Jun 15, 2023 · Jun 16, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -28,3 +28,4 @@ Suggests:
     testthat (>= 3.0.0),
     tibble
 Config/testthat/edition: 3
+LazyData: true
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(add_weights)
 export(adding_analysis_key)
 export(adding_analysis_key_ratio)
 export(adding_group_var_value)

diff --git a/R/add_weights.R b/R/add_weights.R
@@ -0,0 +1,80 @@
+#' Add a weight variable using the sample frame
+#'
+#' Computes one weight per stratum as the stratum's share of the total
+#' population in the sample frame divided by the stratum's share of the rows in
+#' the dataset, then joins that weight onto every row of the dataset. With this
+#' definition the weights sum to the number of rows in the dataset.
+#'
+#' The function stops with an error when a column name argument is missing or
+#' is not a single string, when a named column cannot be found, when the strata
+#' of the dataset and of the sample frame do not match exactly, when
+#' `weight_column` already exists in the dataset, or when any resulting weight
+#' is `NA`.
+#'
+#' @param .dataset the clean dataframe
+#' @param sample_data sample dataframe including population numbers and the
+#'   strata. Several rows may share a stratum: their populations are summed.
+#' @param strata_column_dataset name of strata column in the clean dataframe
+#' @param strata_column_sample name of strata column in the sample dataframe
+#' @param population_column name of population column in the sample dataframe.
+#'   Its values are converted with `as.numeric()` before being summed.
+#' @param weight_column name of the added weight column. By default "weights"
+#'
+#' @return The clean dataset, converted to a data.frame, with 1 new column
+#'   named after `weight_column`.
+#' @export
+#'
+#' @examples
+#' clean_data <- data.frame(uuid = c(1,2,3,4,5,6,7,8),
+#'                          strata = c("strata1","strata2","strata1",
+#'                                     "strata2","strata1","strata2",
+#'                                     "strata1","strata1"))
+#' sample <- data.frame(strata = c("strata1","strata2"),
+#'                      population = c(30000,50000))
+#'
+#' clean_data_weighted <- clean_data %>%
+#'     add_weights(sample,
+#'                 strata_column_dataset = "strata",
+#'                 strata_column_sample = "strata",
+#'                 population_column = "population")
+add_weights <- function(.dataset,
+                        sample_data,
+                        strata_column_dataset = NULL,
+                        strata_column_sample = NULL,
+                        population_column = NULL,
+                        weight_column = "weights") {
+  # The column name arguments default to NULL but are all required. Checking
+  # them first gives a readable error instead of the "argument is of length
+  # zero" failure that `if (!NULL %in% names(...))` would raise further down.
+  column_args <- list(
+    strata_column_dataset = strata_column_dataset,
+    strata_column_sample = strata_column_sample,
+    population_column = population_column,
+    weight_column = weight_column
+  )
+  for (arg_name in names(column_args)) {
+    arg_value <- column_args[[arg_name]]
+    if (!is.character(arg_value) || length(arg_value) != 1L || is.na(arg_value)) {
+      stop("`", arg_name, "` must be a single column name (character string).")
+    }
+  }
+
+  # make dataset a dataframe
+  .dataset <- as.data.frame(.dataset)
+
+  # If strata_column do not exist in sample_data or dataset
+  if (!strata_column_sample %in% names(sample_data)) {
+    stop("Cannot find the defined strata column in the provided sample frame.")
+  }
+  if (!strata_column_dataset %in% names(.dataset)) {
+    stop("Cannot find the defined strata column in the provided dataset.")
+  }
+
+  # The strata must match in both directions: a stratum missing from the sample
+  # frame has no population, and one missing from the dataset has no rows.
+  if (!all(.dataset[[strata_column_dataset]] %in% sample_data[[strata_column_sample]])) {
+    stop("Not all strata from dataset are in sample frame")
+  }
+  if (!all(sample_data[[strata_column_sample]] %in% .dataset[[strata_column_dataset]])) {
+    stop("Not all strata from sample frame are in dataset")
+  }
+
+  # If population_column do not exist in sample_data
+  if (!population_column %in% names(sample_data)) {
+    stop("Cannot find the defined population_column column in the provided sample frame.")
+  }
+
+  # if weight column already exist in dataset
+  if (weight_column %in% names(.dataset)) {
+    stop("Weight column already exists in the dataset. Please input another weights column")
+  }
+
+  # Count number of entries by strata
+  strata_counts <- .dataset %>%
+    dplyr::group_by(!!rlang::sym(strata_column_dataset)) %>%
+    dplyr::summarise(count = dplyr::n())
+
+  # Create a weight table to left_join to the dataset. The sample frame's strata
+  # column is renamed to the dataset's name so both joins share a single key.
+  weights <- sample_data %>%
+    dplyr::rename(!!strata_column_dataset := !!rlang::sym(strata_column_sample)) %>%
+    dplyr::group_by(!!rlang::sym(strata_column_dataset)) %>%
+    dplyr::summarise(population = sum(as.numeric(!!rlang::sym(population_column)))) %>%
+    dplyr::left_join(strata_counts, by = strata_column_dataset) %>%
+    dplyr::mutate(
+      # weight = (share of total population) / (share of dataset rows)
+      !!rlang::sym(weight_column) :=
+        (as.numeric(population) / sum(as.numeric(population))) /
+          (as.numeric(count) / sum(as.numeric(count)))
+    ) %>%
+    dplyr::select(dplyr::all_of(strata_column_dataset), dplyr::all_of(weight_column))
+
+  # join to dataset
+  .dataset <- .dataset %>%
+    dplyr::left_join(weights, by = strata_column_dataset)
+
+  # An NA weight can come from an NA population after the numeric conversion.
+  if (any(is.na(.dataset[[weight_column]]))) {
+    stop("There are NA values in the weights column")
+  }
+
+  .dataset
+}
diff --git a/R/data_set_documentation.R b/R/data_set_documentation.R
@@ -0,0 +1,26 @@
+############################### data documentation #######################
+
+# Roxygen documentation for the datasets stored under data/. Each quoted name
+# below is the object documented by the roxygen block directly above it.
+
+
+# NOTE(review): the roxygen block below is separated from the next one by a
+# blank line and is not directly followed by an object of its own -- confirm
+# that roxygen2 uses it as intended.
+#' Clean data and Sample Frame
+#'
+#' Example datasets shipped with the package.
+#'
+#' @format NULL
+#' @examples
+#' analysistools_clean_data
+#' analysistools_sample_frame
+
+# The package tests for add_weights() read a `neighbourhood` column from this
+# object and use it as the dataset strata column.
+#' @title Clean data
+#' @description Example clean dataset.
+#' @name analysistools_clean_data
+#' @rdname analysistools_clean_data
+#' @format NULL
+"analysistools_clean_data"
+
+# The package tests for add_weights() read the `Neighbourhood` (strata) and
+# `Total.no.of.HH` (population) columns from this object.
+#' @name analysistools_sample_frame
+#' @title Sample frame
+#' @description Example sample frame.
+#' @rdname analysistools_sample_frame
+#' @format NULL
+"analysistools_sample_frame"
+
+
diff --git a/data/analysistools_clean_data.rda b/data/analysistools_clean_data.rda
diff --git a/data/analysistools_sample_frame.rda b/data/analysistools_sample_frame.rda
diff --git a/man/add_weights.Rd b/man/add_weights.Rd
diff --git a/man/analysistools_clean_data.Rd b/man/analysistools_clean_data.Rd
diff --git a/man/analysistools_sample_frame.Rd b/man/analysistools_sample_frame.Rd
diff --git a/tests/testthat/test-add_weights.R b/tests/testthat/test-add_weights.R
@@ -0,0 +1,71 @@
+# Each guard in add_weights() is triggered by exactly one broken input. Every
+# expectation matches the message of the guard it targets, so the test cannot
+# pass because some other, unrelated error was raised.
+testthat::test_that("Error checks", {
+  clean_data <- analysistools::analysistools_clean_data
+  sample_frame <- analysistools::analysistools_sample_frame
+
+  # Calls add_weights() with arguments that are valid for the packaged data, so
+  # each case below only overrides the single input it wants to break.
+  weigh <- function(.dataset = clean_data,
+                    strata_column_dataset = "neighbourhood",
+                    strata_column_sample = "Neighbourhood",
+                    population_column = "Total.no.of.HH") {
+    add_weights(
+      .dataset = .dataset,
+      sample_data = sample_frame,
+      strata_column_dataset = strata_column_dataset,
+      strata_column_sample = strata_column_sample,
+      population_column = population_column
+    )
+  }
+
+  # Baseline: the packaged data with the correct column names is accepted.
+  testthat::expect_no_error(weigh())
+
+  # Unknown strata column in the dataset
+  testthat::expect_error(
+    weigh(strata_column_dataset = "sdsd"),
+    "Cannot find the defined strata column in the provided dataset.",
+    fixed = TRUE
+  )
+
+  # Unknown strata column in the sample frame
+  testthat::expect_error(
+    weigh(strata_column_sample = "Neighbourd"),
+    "Cannot find the defined strata column in the provided sample frame.",
+    fixed = TRUE
+  )
+
+  # Unknown population column in the sample frame
+  testthat::expect_error(
+    weigh(population_column = "Total"),
+    "Cannot find the defined population_column column",
+    fixed = TRUE
+  )
+
+  # The default weight column name is already taken in the dataset
+  already_weighted <- clean_data %>%
+    dplyr::mutate(weights = 1)
+  testthat::expect_error(
+    weigh(.dataset = already_weighted),
+    "Weight column already exists in the dataset.",
+    fixed = TRUE
+  )
+
+  # A stratum present in the dataset but absent from the sample frame
+  extra_stratum <- clean_data
+  extra_stratum$neighbourhood[2] <- "not_applicable"
+  testthat::expect_error(
+    weigh(.dataset = extra_stratum),
+    "Not all strata from dataset are in sample frame",
+    fixed = TRUE
+  )
+
+  # A stratum present in the sample frame but absent from the dataset.
+  # NOTE(review): assumes "oyt" is one of the strata of the packaged sample
+  # frame -- confirm against the data.
+  missing_stratum <- clean_data %>%
+    dplyr::filter(neighbourhood != "oyt")
+  testthat::expect_error(
+    weigh(.dataset = missing_stratum),
+    "Not all strata from sample frame are in dataset",
+    fixed = TRUE
+  )
+})
+
+# End-to-end check on a small hand-computed example: 5 of 8 rows are in strata1
+# (population 30000) and 3 of 8 in strata2 (population 50000), giving weights
+# (30000/80000)/(5/8) = 0.6 and (50000/80000)/(3/8) = 1.67 (rounded).
+testthat::test_that("add_weights works", {
+  test_clean_data <- data.frame(uuid = c(1,2,3,4,5,6,7,8),
+                                strata = c("strata1","strata2","strata1",
+                                           "strata2","strata1","strata2",
+                                           "strata1","strata1"))
+  # Population is passed as character on purpose: add_weights() converts it
+  # with as.numeric().
+  test_sample <- data.frame(strata = c("strata1","strata2"),
+                            population = c("30000","50000"))
+  actual_output <- test_clean_data %>%
+    add_weights(test_sample,
+                strata_column_dataset = "strata",
+                strata_column_sample = "strata",
+                population_column = "population")
+
+  # The weights must sum to the number of rows. The column is looked up by its
+  # exact name; `$weight` would only find it through `$` partial matching on a
+  # data.frame.
+  testthat::expect_equal(sum(actual_output[["weights"]]), nrow(test_clean_data))
+
+  expected_output <- data.frame(uuid = c(1,2,3,4,5,6,7,8),
+                                strata = c("strata1","strata2","strata1",
+                                           "strata2","strata1","strata2",
+                                           "strata1","strata1"),
+                                weights = c(0.60,1.67,0.60,1.67,0.60,1.67,0.60,0.60))
+
+  rounded_output <- actual_output %>%
+    dplyr::mutate(weights = round(weights, 2))
+
+  testthat::expect_equal(rounded_output, expected_output)
+})