From 78ba63c099a35a283f5a97542ce93f235603cc66 Mon Sep 17 00:00:00 2001 From: Stephanie Reinders Date: Thu, 5 Dec 2024 15:00:28 -0600 Subject: [PATCH] Created `get_writer_profiles()` `get_writer_profiles()` calculates the writer profiles (cluster fill rates) for every document in an input folder. --- DESCRIPTION | 1 - NAMESPACE | 1 + NEWS.md | 2 + R/compare.R | 94 +++++++++++++++++++++++++++++--------- man/get_writer_profiles.Rd | 58 +++++++++++++++++++++++ 5 files changed, 134 insertions(+), 22 deletions(-) create mode 100644 man/get_writer_profiles.Rd diff --git a/DESCRIPTION b/DESCRIPTION index ef351ed..da27120 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -16,7 +16,6 @@ Suggests: rmarkdown, testthat (>= 3.0.0), tibble -VignetteBuilder: knitr Depends: R (>= 2.10) Imports: diff --git a/NAMESPACE b/NAMESPACE index 50b61e7..1b3c374 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(compare_writer_profiles) export(get_cluster_fill_rates) export(get_distances) export(get_ref_scores) +export(get_writer_profiles) export(interpret_slr) export(plot_scores) export(plot_writer_profiles) diff --git a/NEWS.md b/NEWS.md index 499a379..3e137e9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,8 @@ * Created `compare_documents()` to compare two handwritten documents using either a similarity score or a score-based likelihood ratio as a comparison method. +* Created functions `get_writer_profiles()` and `compare_writer_profiles()` to make experiments faster on large numbers of documents compared to `compare_documents()`. `get_writer_profiles()` estimates writer profiles for every handwritten document in a folder. Then `compare_writer_profiles()` calculates either a similarity score or score-based likelihood ratio for every pair of documents. + * Created new data frames of writer profiles `train`, `validation`, and `test`. Created a new `random_forest` from `train`. Created `ref_scores`, a list of same writer and different writer similarity scores, from `validation`. 
## Minor improvements and bug fixes diff --git a/R/compare.R b/R/compare.R index 784510a..15e1268 100644 --- a/R/compare.R +++ b/R/compare.R @@ -106,21 +106,11 @@ compare_documents <- function(sample1, params <- copy_samples_to_project_dir(params) - handwriter::process_batch_dir( - input_dir = file.path(params$project_dir, "docs"), - output_dir = file.path(params$project_dir, "graphs") - ) - - clusters <- handwriter::get_clusters_batch( - template = templateK40, - input_dir = file.path(params$project_dir, "graphs"), - output_dir = file.path(params$project_dir, "clusters"), - num_cores = 1, - save_master_file = FALSE - ) - message("Estimating writer profiles...") - profiles <- get_writer_profiles(clusters = clusters) + profiles <- get_writer_profiles(input_dir = file.path(params$project_dir, "docs"), + template = templateK40, + num_cores = 1, + output_dir = params$project_dir) message("Calculating distance between samples...") dist_measures <- which_dists(rforest = params$rforest) @@ -210,6 +200,73 @@ compare_writer_profiles <- function( } +#' Estimate Writer Profiles +#' +#' Estimate writer profiles from handwritten documents scanned and saved as PNG +#' files. Each file in `input_dir` is split into component shapes called graphs +#' with [`handwriter::process_batch_dir`]. Then the graphs are sorted into +#' clusters with similar shapes using the cluster `template` and +#' [`handwriter::get_clusters_batch`]. An estimate of the writer profile for a +#' document is the proportion of graphs from that document assigned to each of +#' the clusters in `template`. The writer profiles are estimated by running +#' [`handwriter::get_cluster_fill_counts`] and then [`get_cluster_fill_rates`]. +#' +#' The functions [`handwriter::process_batch_dir`] and +#' [`handwriter::get_clusters_batch`] take upwards of 30 seconds per document +#' and the results are saved to RDS files in `output_dir` > graphs and +#' `output_dir` > clusters, respectively. 
+#' +#' @param input_dir A filepath to a folder containing one or more handwritten +#' documents, scanned and saved as PNG file(s). +#' @param num_cores An integer number greater than or equal to 1 of cores to use +#' for parallel processing. +#' @param template Optional. A cluster template created with +#' [`handwriter::make_clustering_template`]. The default is the cluster +#' template `templateK40` included with 'handwriterRF'. +#' @param output_dir Optional. A filepath to a folder to save the RDS files +#' created by [`handwriter::process_batch_dir`] and +#' [`handwriter::get_clusters_batch`]. If no folder is supplied, the RDS files +#' will be saved to the temporary directory and then deleted before the +#' function terminates. +#' +#' @return A data frame +#' @export +#' +#' @examples +#' \donttest{ +#' docs <- system.file(file.path("extdata", "docs"), package = "handwriterRF") +#' profiles <- get_writer_profiles(docs) +#' +#' plot_writer_profiles(profiles) +#' } +#' +get_writer_profiles <- function(input_dir, num_cores = 1, template = templateK40, output_dir = NULL) { + if (is.null(output_dir)) { + output_dir <- file.path(tempdir(), "writer_profiles") + create_dir(output_dir) + } + + handwriter::process_batch_dir( + input_dir = input_dir, + output_dir = file.path(output_dir, "graphs") + ) + + clusters <- handwriter::get_clusters_batch( + template = template, + input_dir = file.path(output_dir, "graphs"), + output_dir = file.path(output_dir, "clusters"), + num_cores = num_cores, + save_master_file = FALSE + ) + counts <- handwriter::get_cluster_fill_counts(clusters) + profiles <- get_cluster_fill_rates(counts) + + if (output_dir == file.path(tempdir(), "writer_profiles")) { + unlink(file.path(tempdir(), "writer_profiles"), recursive = TRUE) + } + + return(profiles) +} # Internal Functions ------------------------------------------------------ @@ -245,8 +302,8 @@ handle_samples_w_same_name <- function(params) { # samples in two different directories CAN have 
the same filename if (!is.null(params$samples) && - (params$samples$original_path1 != params$samples$original_path2) && - (params$samples$name1 == params$samples$name2)) { + (params$samples$original_path1 != params$samples$original_path2) && + (params$samples$name1 == params$samples$name2)) { message("Samples have the same file name so they will be renamed 'sample1.png' and 'sample2.png'.") params$samples$name1 <- "sample1.png" params$samples$name2 <- "sample2.png" @@ -307,11 +364,6 @@ copy_samples_to_project_dir <- function(params) { return(params) } -get_writer_profiles <- function(clusters) { - counts <- handwriter::get_cluster_fill_counts(clusters) - profiles <- get_cluster_fill_rates(counts) - return(profiles) -} get_slr <- function(params) { get_slr_for_single_score <- function(score, densities) { diff --git a/man/get_writer_profiles.Rd b/man/get_writer_profiles.Rd new file mode 100644 index 0000000..15ac0af --- /dev/null +++ b/man/get_writer_profiles.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compare.R +\name{get_writer_profiles} +\alias{get_writer_profiles} +\title{Estimate Writer Profiles} +\usage{ +get_writer_profiles( + input_dir, + num_cores = 1, + template = templateK40, + output_dir = NULL +) +} +\arguments{ +\item{input_dir}{A filepath to a folder containing one or more handwritten +documents, scanned and saved as PNG file(s).} + +\item{num_cores}{An integer number greater than or equal to 1 of cores to use +for parallel processing.} + +\item{template}{Optional. A cluster template created with +[`handwriter::make_clustering_template`]. The default is the cluster +template `templateK40` included with 'handwriterRF'.} + +\item{output_dir}{Optional. A filepath to a folder to save the RDS files +created by [`handwriter::process_batch_dir`] and +[`handwriter::get_clusters_batch`]. 
If no folder is supplied, the RDS files +will be saved to the temporary directory and then deleted before the +function terminates.} +} +\value{ +A data frame +} +\description{ +Estimate writer profiles from handwritten documents scanned and saved as PNG +files. Each file in `input_dir` is split into component shapes called graphs +with [`handwriter::process_batch_dir`]. Then the graphs are sorted into +clusters with similar shapes using the cluster `template` and +[`handwriter::get_clusters_batch`]. An estimate of the writer profile for a +document is the proportion of graphs from that document assigned to each of +the clusters in `template`. The writer profiles are estimated by running +[`handwriter::get_cluster_fill_counts`] and then [`get_cluster_fill_rates`]. +} +\details{ +The functions [`handwriter::process_batch_dir`] and +[`handwriter::get_clusters_batch`] take upwards of 30 seconds per document +and the results are saved to RDS files in `output_dir` > graphs and +`output_dir` > clusters, respectively. +} +\examples{ +\donttest{ +docs <- system.file(file.path("extdata", "docs"), package = "handwriterRF") +profiles <- get_writer_profiles(docs) + +plot_writer_profiles(profiles) +} + +}