From 78ba63c099a35a283f5a97542ce93f235603cc66 Mon Sep 17 00:00:00 2001
From: Stephanie Reinders <reinders.stephanie@gmail.com>
Date: Thu, 5 Dec 2024 15:00:28 -0600
Subject: [PATCH] Created `get_writer_profiles()`

`get_writer_profiles()` calculates the writer profiles (cluster fill rates) for every document in an input folder.
---
 DESCRIPTION                |  1 -
 NAMESPACE                  |  1 +
 NEWS.md                    |  2 +
 R/compare.R                | 94 +++++++++++++++++++++++++++++---------
 man/get_writer_profiles.Rd | 58 +++++++++++++++++++++++
 5 files changed, 134 insertions(+), 22 deletions(-)
 create mode 100644 man/get_writer_profiles.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index ef351ed..da27120 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -16,7 +16,6 @@ Suggests:
     rmarkdown,
     testthat (>= 3.0.0),
     tibble
-VignetteBuilder: knitr
 Depends: 
     R (>= 2.10)
 Imports: 
diff --git a/NAMESPACE b/NAMESPACE
index 50b61e7..1b3c374 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,6 +7,7 @@ export(compare_writer_profiles)
 export(get_cluster_fill_rates)
 export(get_distances)
 export(get_ref_scores)
+export(get_writer_profiles)
 export(interpret_slr)
 export(plot_scores)
 export(plot_writer_profiles)
diff --git a/NEWS.md b/NEWS.md
index 499a379..3e137e9 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,6 +4,8 @@
 
 * Created `compare_documents()` to compare two handwritten documents using either a similarity score or a score-based likelihood ratio as a comparison method.
 
+* Created functions `get_writer_profiles()` and `compare_writer_profiles()` to make experiments on large numbers of documents faster than with `compare_documents()`. `get_writer_profiles()` estimates a writer profile for every handwritten document in a folder. Then `compare_writer_profiles()` calculates either a similarity score or a score-based likelihood ratio for every pair of documents.
+
 * Created new data frames of writer profiles `train`, `validation`, and `test`. Created a new `random_forest` from `train`. Created `ref_scores`, a list of same writer and different writer similarity scores, from `validation`. 
 
 ## Minor improvements and bug fixes
diff --git a/R/compare.R b/R/compare.R
index 784510a..15e1268 100644
--- a/R/compare.R
+++ b/R/compare.R
@@ -106,21 +106,11 @@ compare_documents <- function(sample1,
 
   params <- copy_samples_to_project_dir(params)
 
-  handwriter::process_batch_dir(
-    input_dir = file.path(params$project_dir, "docs"),
-    output_dir = file.path(params$project_dir, "graphs")
-  )
-
-  clusters <- handwriter::get_clusters_batch(
-    template = templateK40,
-    input_dir = file.path(params$project_dir, "graphs"),
-    output_dir = file.path(params$project_dir, "clusters"),
-    num_cores = 1,
-    save_master_file = FALSE
-  )
-
   message("Estimating writer profiles...")
-  profiles <- get_writer_profiles(clusters = clusters)
+  profiles <- get_writer_profiles(input_dir = file.path(params$project_dir, "docs"),
+                                  template = templateK40,
+                                  num_cores = 1,
+                                  output_dir = params$project_dir)
 
   message("Calculating distance between samples...")
   dist_measures <- which_dists(rforest = params$rforest)
@@ -210,6 +200,73 @@ compare_writer_profiles <- function(
 }
 
 
+#' Estimate Writer Profiles
+#'
+#' Estimate writer profiles from handwritten documents scanned and saved as PNG
+#' files. Each file in `input_dir` is split into component shapes called graphs
+#' with [`handwriter::process_batch_dir`]. Then the graphs are sorted into
+#' clusters with similar shapes using the cluster `template` and
+#' [`handwriter::get_clusters_batch`]. An estimate of the writer profile for a
+#' document is the proportion of graphs from that document assigned to each of
+#' the clusters in `template`. The writer profiles are estimated by running
+#' [`handwriter::get_cluster_fill_counts`] and then [`get_cluster_fill_rates`].
+#'
+#' The functions [`handwriter::process_batch_dir`] and
+#' [`handwriter::get_clusters_batch`] take upwards of 30 seconds per document
+#' and the results are saved to RDS files in `output_dir` > graphs and
+#' `output_dir` > clusters, respectively.
+#'
+#' @param input_dir A filepath to a folder containing one or more handwritten
+#'   documents, scanned and saved as PNG file(s).
+#' @param num_cores The number of cores to use for parallel processing. Must be
+#'   an integer greater than or equal to 1.
+#' @param template Optional. A cluster template created with
+#'   [`handwriter::make_clustering_template`]. The default is the cluster
+#'   template `templateK40` included with 'handwriterRF'.
+#' @param output_dir Optional. A filepath to a folder to save the RDS files
+#'   created by [`handwriter::process_batch_dir`] and
+#'   [`handwriter::get_clusters_batch`]. If no folder is supplied, the RDS files
+#'   will be saved to the temporary directory and then deleted before the
+#'   function terminates.
+#'
+#' @return A data frame of the writer profiles of the documents in `input_dir`.
+#' @export
+#'
+#' @examples
+#' \donttest{
+#' docs <- system.file(file.path("extdata", "docs"), package = "handwriterRF")
+#' profiles <- get_writer_profiles(docs)
+#'
+#' plot_writer_profiles(profiles)
+#' }
+#'
+get_writer_profiles <- function(input_dir, num_cores = 1, template = templateK40, output_dir = NULL) {
+  if (is.null(output_dir)) {
+    output_dir <- file.path(tempdir(), "writer_profiles")
+    create_dir(output_dir)
+    # Delete the temporary results on exit, even if a step below fails. Only a
+    # directory created here is removed, never one supplied by the caller.
+    on.exit(unlink(output_dir, recursive = TRUE), add = TRUE)
+  }
+
+  # Split every document in input_dir into graphs, saved as RDS files.
+  handwriter::process_batch_dir(
+    input_dir = input_dir,
+    output_dir = file.path(output_dir, "graphs")
+  )
+
+  clusters <- handwriter::get_clusters_batch(
+    template = template,
+    input_dir = file.path(output_dir, "graphs"),
+    output_dir = file.path(output_dir, "clusters"),
+    num_cores = num_cores,
+    save_master_file = FALSE
+  )
+  counts <- handwriter::get_cluster_fill_counts(clusters)
+  profiles <- get_cluster_fill_rates(counts)
+
+  return(profiles)
+}
 
 # Internal Functions ------------------------------------------------------
 
@@ -245,8 +302,8 @@ handle_samples_w_same_name <- function(params) {
 
   # samples in two different directories CAN have the same filename
   if (!is.null(params$samples) &&
-    (params$samples$original_path1 != params$samples$original_path2) &&
-    (params$samples$name1 == params$samples$name2)) {
+      (params$samples$original_path1 != params$samples$original_path2) &&
+      (params$samples$name1 == params$samples$name2)) {
     message("Samples have the same file name so they will be renamed 'sample1.png' and 'sample2.png'.")
     params$samples$name1 <- "sample1.png"
     params$samples$name2 <- "sample2.png"
@@ -307,11 +364,6 @@ copy_samples_to_project_dir <- function(params) {
   return(params)
 }
 
-get_writer_profiles <- function(clusters) {
-  counts <- handwriter::get_cluster_fill_counts(clusters)
-  profiles <- get_cluster_fill_rates(counts)
-  return(profiles)
-}
 
 get_slr <- function(params) {
   get_slr_for_single_score <- function(score, densities) {
diff --git a/man/get_writer_profiles.Rd b/man/get_writer_profiles.Rd
new file mode 100644
index 0000000..15ac0af
--- /dev/null
+++ b/man/get_writer_profiles.Rd
@@ -0,0 +1,58 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/compare.R
+\name{get_writer_profiles}
+\alias{get_writer_profiles}
+\title{Estimate Writer Profiles}
+\usage{
+get_writer_profiles(
+  input_dir,
+  num_cores = 1,
+  template = templateK40,
+  output_dir = NULL
+)
+}
+\arguments{
+\item{input_dir}{A filepath to a folder containing one or more handwritten
+documents, scanned and saved as PNG file(s).}
+
+\item{num_cores}{The number of cores to use for parallel processing. Must be
+an integer greater than or equal to 1.}
+
+\item{template}{Optional. A cluster template created with
+[`handwriter::make_clustering_template`]. The default is the cluster
+template `templateK40` included with 'handwriterRF'.}
+
+\item{output_dir}{Optional. A filepath to a folder to save the RDS files
+created by [`handwriter::process_batch_dir`] and
+[`handwriter::get_clusters_batch`]. If no folder is supplied, the RDS files
+will be saved to the temporary directory and then deleted before the
+function terminates.}
+}
+\value{
+A data frame of the writer profiles of the documents in `input_dir`.
+}
+\description{
+Estimate writer profiles from handwritten documents scanned and saved as PNG
+files. Each file in `input_dir` is split into component shapes called graphs
+with [`handwriter::process_batch_dir`]. Then the graphs are sorted into
+clusters with similar shapes using the cluster `template` and
+[`handwriter::get_clusters_batch`]. An estimate of the writer profile for a
+document is the proportion of graphs from that document assigned to each of
+the clusters in `template`. The writer profiles are estimated by running
+[`handwriter::get_cluster_fill_counts`] and then [`get_cluster_fill_rates`].
+}
+\details{
+The functions [`handwriter::process_batch_dir`] and
+[`handwriter::get_clusters_batch`] take upwards of 30 seconds per document
+and the results are saved to RDS files in `output_dir` > graphs and
+`output_dir` > clusters, respectively.
+}
+\examples{
+\donttest{
+docs <- system.file(file.path("extdata", "docs"), package = "handwriterRF")
+profiles <- get_writer_profiles(docs)
+
+plot_writer_profiles(profiles)
+}
+
+}