Merge pull request #44 from mpjashby/dev

Release v0.9.0
mpjashby · Feb 10, 2025 · ec37981 · ec37981
2 parents 51dc28a + b34282e
commit ec37981
Show file tree

Hide file tree

Showing 84 changed files with 11,742 additions and 1,414 deletions.
diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: sfhotspot
 Title: Hot-Spot Analysis with Simple Features
-Version: 0.8.0
+Version: 0.9.0
 Authors@R: 
     person("Matt", "Ashby", , "matthew.ashby@ucl.ac.uk", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-4201-9239"))
@@ -20,7 +20,7 @@ URL: http://pkgs.lesscrime.info/sfhotspot/
 BugReports: https://github.com/mpjashby/sfhotspot/issues
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Imports:
     ggplot2,
     rlang,
@@ -29,7 +29,7 @@ Imports:
     spdep,
     tibble
 Depends: 
-    R (>= 2.10)
+    R (>= 3.5)
 Suggests:
     testthat (>= 3.0.0),
     lubridate,

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,22 @@
+# sfhotspot 0.9.0
+
+* New dataset `memphis_precincts` showing Memphis Police Department precincts,
+  which is required to test the new functionality of `hotspot_grid()`.
+* `count_points_in_polygons()` now passes through columns in the original 
+  dataset, which makes `hotspot_count()` more useful (#41).
+* `hotspot_grid()`, if provided with polygons, now bases the grid on the
+  boundary of the polygons rather than the convex hull of the boundary (#42).
+* `hotspot_gistar()` now extracts the nearest-neighbour distance from the
+  provided grid and does not wrongly rely on (and report) an automatically
+  generated cell size (#38).
+* Warnings about grids containing very large numbers of cells are now printed
+  before the cells are created, helping explain why code may be running slower
+  than expected (#33).
+* Improved error message produced when point data and provided grid do not
+  overlap (#39).
+* Suppressed progress bar previously included in README (#36).
+
+
 # sfhotspot 0.8.0
 
 * All functions can now handle SF objects in which the geometry column has a

diff --git a/R/count_points_in_polygons.R b/R/count_points_in_polygons.R
@@ -30,6 +30,38 @@ count_points_in_polygons <- function(points, polygons, weights = NULL) {
     }
   }
 
+  # Warn if polygons object contains column names used internally
+  if ("n" %in% names(polygons)) {
+    rlang::warn(c(
+      "Existing column 'n' will be overwritten.",
+      "i" = "Consider renaming the existing column first."
+    ))
+  }
+  if (".polygon_id" %in% names(polygons)) {
+    rlang::warn(c(
+      "Existing column '.polygon_id' will be removed.",
+      "i" = "Consider renaming the existing column first."
+    ))
+  }
+  if ("x" %in% names(polygons)) {
+    rlang::warn(c(
+      "Existing column 'x' will be removed.",
+      "i" = "Consider renaming the existing column first."
+    ))
+  }
+  if (!rlang::is_null(weights) & "sum" %in% names(polygons)) {
+    rlang::warn(c(
+      "Existing column 'sum' will be overwritten.",
+      "i" = "Consider renaming the existing column first."
+    ))
+  } else if ("sum" %in% names(polygons)) {
+    rlang::warn(c(
+      "Existing column 'sum' will be removed.",
+      "i" = "Consider renaming the existing column first."
+    ))
+  }
+  polygons$n <- polygons$x <- polygons$sum <- polygons$`.polygon_id` <- NULL
+
   # Replace name of geometry column in SF objects if necessary
   polygons <- set_geometry_name(polygons)
 
@@ -78,17 +110,13 @@ count_points_in_polygons <- function(points, polygons, weights = NULL) {
   }
 
   # Remove working columns and convert to SF object
-  if (!rlang::is_null(weights)) {
-    counts <- sf::st_as_sf(
-      tibble::as_tibble(counts[, c("n", "sum", "geometry")]),
-      sf_column_name = "geometry"
-    )
-  } else {
-    counts <- sf::st_as_sf(
-      tibble::as_tibble(counts[, c("n", "geometry")]),
-      sf_column_name = "geometry"
-    )
-  }
+  # This also ensures that `geometry` is the last column, as is the convention
+  # for SF objects
+  result_names <- setdiff(names(counts), c("geometry", ".polygon_id", "x"))
+  counts <- sf::st_as_sf(
+    tibble::as_tibble(counts[, c(result_names, "geometry")]),
+    sf_column_name = "geometry"
+  )
 
   counts
 

diff --git a/R/create_grid.R b/R/create_grid.R
@@ -61,17 +61,28 @@ create_grid <- function(
     cell_size <- set_cell_size(data, round = TRUE, quiet = quiet)
 
   # Create buffered hull around data: the boundary of polygon data, or the
   # convex hull of any other geometry type
-  hull <- sf::st_buffer(
-    sf::st_convex_hull(sf::st_union(data)),
-    dist = cell_size / 2
-  )
+  geometry_types <- as.character(sf::st_geometry_type(data))
+  if (all(geometry_types %in% c("POLYGON", "MULTIPOLYGON"))) {
+    hull <- sf::st_buffer(
+      sf::st_cast(sf::st_boundary(sf::st_union(data)), "MULTIPOLYGON"),
+      dist = cell_size / 2
+    )
+  } else {
+    hull <- sf::st_buffer(
+      sf::st_convex_hull(sf::st_union(data)),
+      dist = cell_size / 2
+    )
+  }
 
   # Warn if there will be so many cells that the function will be very slow
   hull_bbox <- sf::st_bbox(hull)
   cells_n_x <- (hull_bbox$xmax - hull_bbox$xmin) / cell_size
   cells_n_y <- (hull_bbox$ymax - hull_bbox$ymin) / cell_size
   if (cells_n_x * cells_n_y > 100000 & quiet == FALSE) {
-    rlang::warn(
+    # Although this is a warning, warnings are only printed when a function
+    # finishes, which is no use. Messages are printed immediately, so this has
+    # to be a message. See https://github.com/mpjashby/sfhotspot/issues/33
+    rlang::inform(
       c(
         "The grid will contain a large number of cells",
         "!" = "This may cause other functions to run slowly or not work",

diff --git a/R/gistar.R b/R/gistar.R
@@ -21,7 +21,7 @@
 #'   counts in neighbouring cells when calculating the values of
 #'   \ifelse{html}{\out{<i>G</i><sub><i>i</i></sub><sup>*</sup>}}{\eqn{G^*_i}}
 #'   (if \code{include_self = TRUE}, the default) or
-#'   \ifelse{html}{\out{<i>G</i><sub><i>i</i></sub><sup>*</sup>}}{\eqn{G_i}}
+#'   \ifelse{html}{\out{<i>G</i><sub><i>i</i></sub>}}{\eqn{G_i}}
 #'   (if \code{include_self = FALSE}) values? You are unlikely to want to change
 #'   the default value.
 #' @param p_adjust_method The method to be used to adjust \emph{p}-values for
@@ -33,12 +33,12 @@
 #'   \code{TRUE}.
 #' @return An \code{\link[sf]{sf}} tibble of regular grid cells with
 #'   corresponding point counts,
-#'   \ifelse{html}{\out{<i>G</i><sub><i>i</i></sub><sup>*</sup>}}{\eqn{G_i}} or
+#'   \ifelse{html}{\out{<i>G</i><sub><i>i</i></sub>}}{\eqn{G_i}} or
 #'   \ifelse{html}{\out{<i>G</i><sub><i>i</i></sub><sup>*</sup>}}{\eqn{G^*_i}}
 #'   values for each cell. Values greater than zero indicate more points than
 #'   would be expected for randomly distributed points and values less than zero
 #'   indicate fewer points. Critical values of
-#'   \ifelse{html}{\out{<i>G</i><sub><i>i</i></sub><sup>*</sup>}}{\eqn{G_i}} and
+#'   \ifelse{html}{\out{<i>G</i><sub><i>i</i></sub>}}{\eqn{G_i}} and
 #'   \ifelse{html}{\out{<i>G</i><sub><i>i</i></sub><sup>*</sup>}}{\eqn{G^*_i}}
 #'   are given in the manual page for \code{\link[spdep]{localG}}.
 #' @noRd
@@ -92,15 +92,27 @@ gistar <- function(
   # Replace name of geometry column in SF objects if necessary
   counts <- set_geometry_name(counts)
 
-  # Set cell size if not specified
-  if (rlang::is_null(nb_dist) & rlang::is_null(cell_size))
-    cell_size <- set_cell_size(counts, round = TRUE, quiet = quiet)
+  # Get centroids
+  centroids <- suppressWarnings(sf::st_centroid(counts))
 
   # Set neighbour distance if not specified
-  if (rlang::is_null(nb_dist)) nb_dist <- cell_size * sqrt(2)
+  if (rlang::is_null(nb_dist)) {
+
+    # Derive cell size from grid cells if required
+    if (rlang::is_null(cell_size)) {
+      cell_size <- as.numeric(mean(sf::st_distance(
+        centroids,
+        centroids[sf::st_nearest_feature(centroids), ],
+        by_element = TRUE
+      )))
+    }
+
+    # Derive neighbour distance from cell size
+    nb_dist <- cell_size * sqrt(2)
+
+  }
 
   # Find neighbours
-  centroids <- suppressWarnings(sf::st_centroid(counts))
   nb <- spdep::dnearneigh(sf::st_coordinates(centroids), 0, nb_dist)
 
   # Determine if each cell should be treated as a neighbour of itself

diff --git a/R/hotspot_gistar.R b/R/hotspot_gistar.R
@@ -180,8 +180,9 @@ hotspot_gistar <- function(
 
   # Set cell size if not specified (do this here because it is needed by both
   # `create_grid()` and `gistar()`)
-  if (rlang::is_null(cell_size))
+  if (rlang::is_null(cell_size) & rlang::is_null(grid)) {
     cell_size <- set_cell_size(data, round = TRUE, quiet = quiet)
+  }
 
   # Create grid
   if (rlang::is_null(grid)) {

diff --git a/R/memphis_precincts.R b/R/memphis_precincts.R
@@ -0,0 +1,16 @@
+#' Memphis Police Department Precincts
+#'
+#' A dataset containing the boundaries of Memphis Police Department precincts.
+#'
+#' @format A simple-features tibble with 9 rows and two variables:
+#' \describe{
+#'   \item{precinct}{the precinct name}
+#'   \item{geometry}{the boundary of each precinct, stored in simple-features
+#'     polygon format}
+#' }
+#'
+#' Licence: Public domain <https://data.memphistn.gov/d/tdws-78iq>
+#'
+#' @source City of Memphis <https://data.memphistn.gov/d/rqqz-pj4u>
+#'
+"memphis_precincts"
diff --git a/R/validate_inputs.R b/R/validate_inputs.R
@@ -65,9 +65,12 @@ validate_inputs <- function(
   if (!rlang::is_null(grid)) {
     if (!inherits(grid, "sf"))
       rlang::abort("`grid` must be either an SF object or `NULL`.", call = call)
-    if (any(!sf::st_is(grid, "POLYGON")))
+    if (any(!sf::st_is(grid, c("POLYGON", "MULTIPOLYGON"))))
       rlang::abort(
-        "`grid` must be `NULL` or an SF object containing polygons.",
+        paste0(
+          "`grid` must be `NULL` or an SF object containing polygons or ",
+          "multipolygons."
+        ),
         call = call
       )
     if (any(sf::st_is_empty(grid))) {
@@ -117,6 +120,26 @@ validate_inputs <- function(
       )
   }
 
+  # Check that data and grid overlap
+  if (!rlang::is_null(grid)) {
+    check_overlap <- sf::st_intersects(
+      sf::st_union(data),
+      sf::st_union(grid),
+      sparse = FALSE
+    )
+    if (rlang::is_false(check_overlap[1, 1])) {
+      rlang::abort(
+        c(
+          "`data` and `grid` must overlap",
+          "i" = paste0(
+            "Check data (e.g. by mapping) to ensure inputs overlap in space."
+          )
+        ),
+        call = call
+      )
+    }
+  }
+
   # Validate `quiet`
   if (!rlang::is_logical(quiet, n = 1))
     rlang::abort("`quiet` must be one of `TRUE` or `FALSE`.", call = call)

diff --git a/README.Rmd b/README.Rmd
@@ -115,6 +115,10 @@ distributed randomly. In this example, the points represent the locations of
 personal robberies in Memphis, which is a dataset included with the package.
 
 ```{r example}
+#| fig.alt: >
+#|   A map showing hotspots of robbery in Memphis, TN created using the
+#|   `hotspot_gistar()` function in the sfhotspot package
+
 # Load packages
 library(sf)
 library(sfhotspot)
@@ -123,19 +127,19 @@ library(tidyverse)
 
 # Transform data to UTM zone 15N so that we can think in metres, not decimal 
 # degrees
-memphis_robberies_utm <- st_transform(memphis_robberies, 32615)
+memphis_robberies_utm <- st_transform(memphis_robberies, "EPSG:32615")
 
 
 # Identify hotspots, set all the parameters automatically by not specifying cell 
 # size, bandwidth, etc.
-memphis_robberies_hotspots <- hotspot_gistar(memphis_robberies_utm)
+memphis_robberies_htspt <- hotspot_gistar(memphis_robberies_utm, quiet = TRUE)
 
 
 # Visualise the hotspots by showing only those cells that have significantly
 # more points than expected by chance. For those cells, show the estimated
 # density of robberies.
-memphis_robberies_hotspots %>% 
-  filter(gistar > 0, pvalue < 0.05) %>% 
+memphis_robberies_htspt |> 
+  filter(gistar > 0, pvalue < 0.05) |> 
   ggplot(aes(colour = kde, fill = kde)) +
   geom_sf() +
   scale_colour_distiller(aesthetics = c("colour", "fill"), direction = 1) +

diff --git a/README.md b/README.md
@@ -35,13 +35,13 @@ sfhotspot has the following functions. All can be used by just supplying
 an SF object containing points, or can be configured using the optional
 arguments to each function.
 
-| name                 | use                                                                                                                                                                                                                                      |
-|:---------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `hotspot_count()`    | Count the number of points in each cell of a regular grid. Cell size can be set by the user or chosen automatically.                                                                                                                     |
-| `hotspot_change()`   | Measure the change in the count of points in each cell between two periods of time.                                                                                                                                                      |
-| `hotspot_kde()`      | Estimate kernel density for each cell in a regular grid. Cell size and bandwidth can be set by the user or chosen automatically.                                                                                                         |
-| `hotspot_dual_kde()` | Compare the kernel density of two layers of points, e.g. to estimate the local risk of an event occurring relative to local population.                                                                                                  |
-| `hotspot_gistar()`   | Calculate the Getis–Ord $G_i^*$ statistic for each cell in a regular grid, while optionally estimating kernel density. Cell size, bandwidth and neighbour distance can be set by the user or chosen automatically.                       |
+| name | use |
+|:---|:---|
+| `hotspot_count()` | Count the number of points in each cell of a regular grid. Cell size can be set by the user or chosen automatically. |
+| `hotspot_change()` | Measure the change in the count of points in each cell between two periods of time. |
+| `hotspot_kde()` | Estimate kernel density for each cell in a regular grid. Cell size and bandwidth can be set by the user or chosen automatically. |
+| `hotspot_dual_kde()` | Compare the kernel density of two layers of points, e.g. to estimate the local risk of an event occurring relative to local population. |
+| `hotspot_gistar()` | Calculate the Getis–Ord $G_i^*$ statistic for each cell in a regular grid, while optionally estimating kernel density. Cell size, bandwidth and neighbour distance can be set by the user or chosen automatically. |
 | `hotspot_classify()` | Classify grid cells according to whether they have had significant clusters of points at different time periods. All parameters can be chosen automatically or be set by the user using the `hotspot_classify_params()` helper function. |
 
 The results produced by `hotspot_count()`, `hotspot_change()`,
@@ -72,10 +72,10 @@ library(sf)
 library(sfhotspot)
 library(tidyverse)
 #> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
-#> ✔ dplyr     1.1.2     ✔ readr     2.1.4
-#> ✔ forcats   1.0.0     ✔ stringr   1.5.0
-#> ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
-#> ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
+#> ✔ dplyr     1.1.4     ✔ readr     2.1.5
+#> ✔ forcats   1.0.0     ✔ stringr   1.5.1
+#> ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
+#> ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
 #> ✔ purrr     1.0.2
 #> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
 #> ✖ dplyr::filter() masks stats::filter()
@@ -85,34 +85,24 @@ library(tidyverse)
 
 # Transform data to UTM zone 15N so that we can think in metres, not decimal 
 # degrees
-memphis_robberies_utm <- st_transform(memphis_robberies, 32615)
+memphis_robberies_utm <- st_transform(memphis_robberies, "EPSG:32615")
 
 
 # Identify hotspots, set all the parameters automatically by not specifying cell 
 # size, bandwidth, etc.
-memphis_robberies_hotspots <- hotspot_gistar(memphis_robberies_utm)
-#> Cell size set to 500 metres automatically
-#> Bandwidth set to 5,592 metres automatically based on rule of thumb
-#> The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
-#> which was just loaded, will retire in October 2023. Please refer to R-spatial
-#> evolution reports for details, especially
-#> https://r-spatial.org/r/2023/05/15/evolution4.html. It may be desirable to make
-#> the sf package available; package maintainers should consider adding sf to
-#> Suggests:. The sp package is now running under evolution status 2 (status 2
-#> uses the sf package in place of rgdal)
-#> Done: [--------------------------------------------------------------------] .Done: [======================================------------------------------] .Done: [=======================================-----------------------------] .Done: [========================================----------------------------] .Done: [=========================================---------------------------] .Done: [==========================================--------------------------] .Done: [===========================================-------------------------] .Done: [============================================------------------------] .Done: [=============================================-----------------------] .Done: [==============================================----------------------] .Done: [===============================================---------------------] .Done: [================================================--------------------] .Done: [=================================================-------------------] .Done: [==================================================------------------] .Done: [===================================================-----------------] .Done: [====================================================----------------] .Done: [=====================================================---------------] .Done: [======================================================--------------] .Done: [=======================================================-------------] .Done: [========================================================------------] .Done: [=========================================================-----------] .Done: [==========================================================----------] .Done: [===========================================================---------] .Done: [============================================================--------] .Done: [=============================================================-------] .Done: 
[==============================================================------] .Done: [===============================================================-----] .Done: [================================================================----] .Done: [=================================================================---] .Done: [==================================================================--] .Done: [===================================================================-] .Done: [====================================================================] .                                                                              
+memphis_robberies_htspt <- hotspot_gistar(memphis_robberies_utm, quiet = TRUE)
 
 
 # Visualise the hotspots by showing only those cells that have significantly
 # more points than expected by chance. For those cells, show the estimated
 # density of robberies.
-memphis_robberies_hotspots %>% 
-  filter(gistar > 0, pvalue < 0.05) %>% 
+memphis_robberies_htspt |> 
+  filter(gistar > 0, pvalue < 0.05) |> 
   ggplot(aes(colour = kde, fill = kde)) +
   geom_sf() +
   scale_colour_distiller(aesthetics = c("colour", "fill"), direction = 1) +
   labs(title = "Density of robberies in Memphis, 2019") +
   theme_void()
 ```
 
-<img src="man/figures/README-example-1.png" width="100%" />
+<img src="man/figures/README-example-1.png" alt="A map showing hotspots of robbery in Memphis, TN created using the `hotspot_gistar()` function in the sfhotspot package" width="100%" />
diff --git a/data-raw/memphis_precincts.R b/data-raw/memphis_precincts.R
@@ -0,0 +1,7 @@
+# This script prepares the `memphis_precincts` dataset
+
+memphis_precincts <- sf::read_sf("https://data.memphistn.gov/resource/rqqz-pj4u.geojson") |>
+  dplyr::group_by(precinct) |>
+  dplyr::summarise()
+
+usethis::use_data(memphis_precincts, overwrite = TRUE)
diff --git a/data/memphis_precincts.rda b/data/memphis_precincts.rda