---
title: "Uniform Crime Reporting (UCR) Program Data: An Opinionated Guide to FBI Data"
author: "Jacob Kaplan, Ph.D."
date: "`r Sys.Date()`"
bibliography: [book.bib]
biblio-style: apalike
link-citations: yes
colorlinks: yes
description: "This is a comprehensive guide to using the FBI's Uniform Crime Reporting Program Data, including the Summary Reporting System (SRS) files and the National Incident-Based Reporting System (NIBRS) files."
url: "https://ucrbook.com"
github-repo: "jacobkap/ucrbook"
site: bookdown::bookdown_site
documentclass: krantz
monofont: "Source Code Pro"
monofontoptions: "Scale=0.7"
graphics: yes
---
```{r include=FALSE, cache=FALSE}
library(groundhog)
# devtools::install_github("wmurphyrd/fiftystater")
library(fiftystater)
packages <- c(
"crimeutils",
"dplyr",
"readr",
"kableExtra",
"knitr",
"scales",
"tidyr",
"ggplot2",
"mapproj",
"lubridate",
"gridExtra",
"priceR",
"blscrapeR",
"janitor",
# "quantmod",
"ggh4x",
"sf",
"tigris"
)
groundhog.library(packages, "2024-08-27")
#library(priceR)
#library(quantmod)
# For agencies with a covered by ORI value, assign the last month reported
# variable to the value the covering agency has
#
# offenses_known_yearly <- readRDS("data/offenses_known_yearly_1960_2023.rds")
# offenses_known_yearly$covered_by_ori <- toupper(offenses_known_yearly$covered_by_ori)
#
# offenses_known_yearly$ori_year <- paste(offenses_known_yearly$ori,
# offenses_known_yearly$year)
# offenses_known_yearly_covered_by <-
# offenses_known_yearly %>%
# filter(!is.na(covered_by_ori))
#
# offenses_known_yearly_covering <-
# offenses_known_yearly %>%
# filter(paste(ori, year) %in% paste(offenses_known_yearly_covered_by$covered_by_ori,
# offenses_known_yearly_covered_by$year))
#
# offenses_known_yearly_covered_by$last_month_reported_old <-
# offenses_known_yearly_covered_by$last_month_reported
# offenses_known_yearly_covered_by$number_of_months_missing_old <-
# offenses_known_yearly_covered_by$number_of_months_missing
# pb <- progress_bar$new(
# format = " [:bar] :current/:total :percent eta: :eta",
# total = nrow(offenses_known_yearly_covered_by), clear = FALSE, width= 60)
# table(offenses_known_yearly_covered_by$last_month_reported)
# table(offenses_known_yearly_covered_by$number_of_months_missing)
# for (i in 1:nrow(offenses_known_yearly_covered_by)) {
# temp <-
# offenses_known_yearly_covering %>%
# filter(ori %in% offenses_known_yearly_covered_by$covered_by_ori[i],
# year %in% offenses_known_yearly_covered_by$year[i])
# if (nrow(temp) > 0) {
# offenses_known_yearly_covered_by$last_month_reported[i] <- temp$last_month_reported
# offenses_known_yearly_covered_by$number_of_months_missing[i] <- temp$number_of_months_missing
# }
# pb$tick()
#
# }
# table(offenses_known_yearly_covered_by$last_month_reported)
# table(offenses_known_yearly_covered_by$number_of_months_missing)
#
# offenses_known_yearly <-
# offenses_known_yearly %>%
# filter(!ori_year %in% offenses_known_yearly_covered_by$ori_year) %>%
# bind_rows(offenses_known_yearly_covered_by)
#
# saveRDS(offenses_known_yearly, "data/offenses_known_yearly_with_covered_by_last_month_reported.rds")
options(tidygeocoder.quiet = TRUE)
options(tidygeocoder.verbose = FALSE)
options(readr.show_col_types = FALSE)
knitr::opts_chunk$set(
comment = "#>",
collapse = TRUE,
out.width = "90%",
fig.align = "center",
fig.width = 15.33333,
fig.asp = (1 / 1.618033988749895), # 1 / phi
fig.show = "hold",
error = TRUE
)
# library(formatR)
# knitr::opts_chunk$set(
# comment = "#",
# # collapse = TRUE,
# fig.align = 'center',
# fig.width = 9,
# fig.asp = 0.618,
# fig.show = "hold",
# #error = TRUE,
# # fig.pos = "!H",
# out.extra = "",
# tidy = "styler",
# out.width = "100%",
# out.height= "45%"
# )
# For each month of a single agency-year, compute what the imputed annual
# total would be if that month were missing (the other 11 months scaled by
# 12/11) and compare it to the actual annual total.
get_replace_single_month <- function(data, crime_col, crime) {
data <- data[match(data$month, month.name), ]
data$imputed_if_missing <- NA
data <- data.frame(data)
for (i in 1:nrow(data)) {
rows <- 1:12
rows <- rows[!rows %in% i]
data$imputed_if_missing[i] <- sum(data[, crime_col][rows]) * (12 / 11)
}
data$imputed_if_missing <- round(data$imputed_if_missing, 0)
data$actual_crimes <- data[, crime_col]
data$annual_crimes <- sum(data$actual_crimes)
data <-
data %>%
select(month, actual_crimes, annual_crimes, imputed_if_missing)
data$percent_change <- get_percent_change(sum(data$actual_crimes), data$imputed_if_missing)
data <-
data %>%
mutate_if(is.numeric, formatC, format = "d", big.mark = ",")
data$actual_crimes <- as.character(data$actual_crimes)
crime_percent <- parse_number(data$actual_crimes) / sum(parse_number(data$actual_crimes)) * 100
crime_percent <- round(crime_percent, 2)
crime_percent <- pad_decimals(crime_percent, 2)
crime_percent <- paste0("(", crime_percent, "%)")
data$actual_crimes <- paste(data$actual_crimes, crime_percent)
names(data) <- c(
"Month",
paste(crime, "That Month"),
paste("Actual Annual", crime),
paste("Imputed Annual", crime),
"Percent Change"
)
return(data)
}
# Simulate imputation when 1-9 months are missing at random: drop the
# months, scale the remaining sum by 12 / (12 - missing), and repeat 10,000
# times to get the mean, median, min, and max imputed annual totals.
get_average_months_missing_simulation <- function(data, variable) {
results_of_months_missing <- data.frame(months_missing = 1:9,
mean = NA,
median = NA,
min = NA,
max = NA)
data$variable <- data[, variable]
actual_value <- data.frame(months_missing = 0,
mean = sum(data$variable),
median = sum(data$variable),
min = sum(data$variable),
max = sum(data$variable))
for (n in 1:9) {
months_missing <- n
final <- vector(mode = "numeric", length = 10000)
set.seed(19104)
for (i in 1:10000) {
temp <- data[-sample(1:12, months_missing, replace = FALSE), ]
final[i] <- sum(temp$variable) * 12 / (12 - months_missing)
}
results_of_months_missing$mean[n] <- mean(final)
results_of_months_missing$median[n] <- median(final)
results_of_months_missing$min[n] <- min(final)
results_of_months_missing$max[n] <- max(final)
}
results_of_months_missing <-
results_of_months_missing %>%
bind_rows(actual_value) %>%
arrange(months_missing) %>%
mutate_if(is.numeric, round, 2)
results_of_months_missing$months_missing[1] <- "Full data"
results_of_months_missing$months_missing[2] <- "1 month"
results_of_months_missing <-
results_of_months_missing %>%
rename(`# of Months Missing` = months_missing,
`Mean Imputed Value` = mean,
`Median Imputed Value` = median,
`Minimum Imputed Value` = min,
`Maximum Imputed Value` = max,
)
return(results_of_months_missing)
}
# Percent change from number1 to number2, formatted to two decimals with a
# leading "+" for non-negative changes.
get_percent_change <- function(number1, number2) {
final <- number2 - number1
final <- final / abs(number1) * 100
final <- round(final, 2)
final <- pad_decimals(final, 2)
final[-grep("-", final)] <- paste0("+", final[-grep("-", final)])
return(final)
}
# Summarize murder counts (mean, median, 90th percentile, min, max) by
# agency population group, collapsing the largest city groups into one.
get_murder_by_pop_group <- function(data) {
data$population_group[data$population_group %in% c("city 1,000,000+",
"city 250,000 thru 499,999",
"city 500,000 thru 999,999")] <- "city 250,000+"
data$population_group[data$population_group %in% "Non-MSA Counties/non-MSA State Police"] <- "non-msa counties and non-msa state police"
data$population_group[data$population_group %in% "MSA Counties/MSA State Police"] <- "msa counties and msa state police"
final <- data.frame(
agency_size = c(
"city under 2,500",
"city 2,500 thru 9,999",
"city 10,000 thru 24,999",
"city 25,000 thru 49,999",
"city 50,000 thru 99,999",
"city 100,000 thru 249,999",
"city 250,000+",
"msa counties and msa state police",
"non-msa counties and non-msa state police"
),
mean_murder = NA,
median_murder = NA,
percentile_90 = NA,
min_murder = NA,
max_murder = NA
)
for (i in 1:nrow(final)) {
temp <- data[data$population_group %in% final$agency_size[i], ]
if (nrow(temp) > 0) {
final$mean_murder[i] <- mean(temp$actual_murder)
final$median_murder[i] <- median(temp$actual_murder)
final$min_murder[i] <- min(temp$actual_murder)
final$percentile_90[i] <- as.numeric(quantile(temp$actual_murder, 0.90))
final$max_murder[i] <- max(temp$actual_murder)
}
}
final <-
final %>%
mutate_if(is.numeric, round, 1) %>%
mutate_if(is.numeric, formatC, format = "d", big.mark = ",") %>%
mutate(agency_size = capitalize_words(agency_size))
final$mean_murder[final$mean_murder == "NA"] <- "-"
final$median_murder[final$median_murder == "NA"] <- "-"
final$min_murder[final$min_murder == "NA"] <- "-"
final$percentile_90[final$percentile_90 == "NA"] <- "-"
final$max_murder[final$max_murder == "NA"] <- "-"
final$agency_size <- gsub(" thru ", "-", final$agency_size, ignore.case = TRUE)
final$agency_size <- gsub("msa", "MSA", final$agency_size, ignore.case = TRUE)
final$agency_size <- gsub("And", "and", final$agency_size, ignore.case = TRUE)
names(final) <- c(
"Population Group",
"Mean Murder",
"Median Murder",
"90th Percentile Murder",
"Minimum Murder",
"Max Murder"
)
return(final)
}
# Frequency table of a column's values: count, percent, and the first year
# each value appears in the data, with a Total row.
make_frequency_table_year <- function(data, column, col_names) {
temp <- unique(data[, column])
temp <- temp[!is.na(temp)]
temp_df <- data.frame(
col1 = temp,
first_year = NA,
number = NA
)
for (i in 1:nrow(temp_df)) {
loop_value <- temp_df$col1[i]
storage <- data[data[, column] %in% loop_value, ]
temp_df$number[i] <- nrow(storage)
temp_df$first_year[i] <- min(storage$year)
}
temp_df <-
temp_df %>%
mutate(percent = number / sum(number)) %>%
arrange(
desc(first_year),
desc(number)
) %>%
mutate(col1 = crimeutils::capitalize_words(col1))
temp_df$percent <- temp_df$percent * 100
temp_df$percent <- round(temp_df$percent, 2)
temp_df$percent <- pad_decimals(temp_df$percent, 2)
temp_df$percent <- paste0(temp_df$percent, "\\%")
total <- data.frame(col1 = "Total", number = sum(temp_df$number), percent = "100\\%")
temp_df <- bind_rows(temp_df, total)
temp_df$number <- formatC(temp_df$number, format = "d", big.mark = ",")
names(temp_df) <- col_names
return(temp_df)
}
# Frequency table of a column's values: count and percent, with a Total row.
make_frequency_table <- function(data, column, col_names) {
temp <- unique(data[, column])
temp <- temp[!is.na(temp)]
temp_df <- data.frame(
col1 = temp,
number = NA
)
for (i in 1:nrow(temp_df)) {
loop_value <- temp_df$col1[i]
storage <- data[data[, column] %in% loop_value, ]
temp_df$number[i] <- nrow(storage)
}
temp_df <-
temp_df %>%
mutate(percent = number / sum(number)) %>%
arrange(desc(number)) %>%
mutate(col1 = crimeutils::capitalize_words(col1))
temp_df$percent <- temp_df$percent * 100
temp_df$percent <- round(temp_df$percent, 2)
temp_df$percent <- pad_decimals(temp_df$percent, 2)
temp_df$percent <- paste0(temp_df$percent, "\\%")
total <- data.frame(col1 = "Total", number = sum(temp_df$number), percent = "100\\%")
temp_df <- bind_rows(temp_df, total)
temp_df$number <- formatC(temp_df$number, format = "d", big.mark = ",")
names(temp_df) <- col_names
return(temp_df)
}
```
```{r setup, include=FALSE}
options(
htmltools.dir.version = FALSE,
formatR.indent = 1,
width = 65,
digits = 4,
warnPartialMatchAttr = FALSE,
warnPartialMatchDollar = FALSE,
echo = FALSE,
warning = FALSE
)
# bookdown::render_book("index.Rmd", "bookdown::gitbook")
# bookdown::render_book("index.Rmd", "bookdown::pdf_book")
```
```{r, echo = FALSE}
knitr::opts_chunk$set(
echo = FALSE,
warning = FALSE,
error = FALSE,
message = FALSE
)
```
```{r}
nyc <- read_csv("data/New_York_New_York_City_Police_Department.csv")
offenses_known_yearly <- readRDS("data/offenses_known_yearly_1960_2023.rds")
SRS_hate_crimes <- readRDS("data/hate_crimes_1991_2023.rds")
```
# (PART) Welcome {-}
# Preface
If you have read an article about crime or arrests in the United States in the last half century, in most cases it was referring to the FBI's Uniform Crime Reporting Program Data, otherwise known as UCR data. UCR data is, with the exception of the more detailed data that covers only murders, a *monthly number of crimes or arrests reported by a single police agency*, which the FBI then gathers into one file that includes all reporting agencies. It is actually a collection of different datasets, all of which have information about crimes and arrests that occur in a particular jurisdiction. Think of your home town. This data will tell you how many crimes were reported in that city each month for a small number of crime categories, and how many people (broken down by age, sex, and race) were arrested for a (larger) set of crime categories. If the city has multiple police agencies, each agency reports the crimes and arrests under its own jurisdiction, though the largest agency - usually the local police department - will cover the vast majority of crimes and arrests in that city.

This is a very broad measure of crime, and its uses in research - or for understanding crime at all - are fairly limited. Yet it has become over much of the last century - and will likely remain among researchers for at least the next decade - the most important crime data in the United States.
UCR data is important for three reasons:
1. The definitions are standard, and all agencies (tend to) follow them so you can compare across agencies and over time.^[We will see many examples of when agencies do not follow the definitions, which really limits this data.]
2. The data is available since 1960 (for most of the datasets) so there is a long period of available data.^[While UCR data was first collected in 1929, machine-readable data is only available since 1960.]
3. The data is available for most of the 18,000 police agencies in the United States so you can compare across agencies.
More than with many other datasets, there will be times when using UCR data that you will think "that is weird." This book will cover this weirdness: when we think the weirdness is just an odd - but acceptable - quirk of the data, and when it is a sign of a big problem in the data or in a particular variable that means we should avoid using it. For most of this book we will be discussing the caveats of the above reasons - or, more directly, why these assumptions are wrong - but these are the reasons why the data is so influential.
## Goal of the book
By the end of each chapter you should have a firm grasp on the dataset that is covered and how to use it properly. However, this book cannot possibly cover every potential use case for the data so make sure to carefully examine the data yourself for your own particular use.
I get a lot of emails from people asking questions about this data, so my own goal is to create a single place that answers as many questions as I can about the data. Again, this is among the most commonly used crime datasets, and there are still many current papers published with incorrect information about the data (including such simple aspects as what geographic and time units the data is in). So hopefully this book will decrease the number of misconceptions about this data, increasing overall research quality.
Since manuals are boring, I will include graphs and images to try to alleviate the boredom. That said, I do not think it is possible to make it too fun, so sorry in advance. This book is a mix of facts about the data, such as how many years are available, and my opinions about it, such as whether it is reliable. In cases of fact I will just make a statement - e.g., "the offenses data is available since 1960." In cases of opinion I will temper the statement by saying something like "in my opinion..." or "I think."
## Structure of the book
This book will be divided into ten chapters: this chapter, an intro chapter briefly summarizing each dataset and going over issues common to all UCR data, and seven chapters each covering one of the seven UCR datasets. The final chapter will cover county-level UCR data, a commonly used but highly flawed aggregation of UCR data that I recommend against using. Each dataset chapter will follow the same format: we will start with a brief summary of the data, such as when it first became available and how it can be used. Next we will look at how many agencies report their data to that dataset, often measuring this reporting rate a couple of different ways. Finally, we will cover the important variables included in the data and how to use them properly (including not using them at all) - this will be the bulk of each chapter.
## Citing this book
If this data was useful in your research, please cite it. To cite this book, please use the below citation:
Kaplan J (2021). *Uniform Crime Reporting (UCR) Program Data: A Practitioner's Guide to FBI Data*. https://ucrbook.com/.
BibTeX format:
```bibtex
@Manual{ucrbook,
title = {Uniform Crime Reporting (UCR) Program Data: A Practitioner's Guide to FBI Data},
author = {{Jacob Kaplan}},
year = {2021},
url = {https://ucrbook.com/},
}
```
## Sources of UCR data

There are a few different sources of UCR data available today.

### My own collection

#### openICPSR

#### [Crimedatatool.com](https://crimedatatool.com/)

### NACJD

First, and probably most commonly used, is the data put together by the [National Archive of Criminal Justice Data (NACJD)](https://www.icpsr.umich.edu/web/pages/NACJD/index.html). This is a team out of the University of Michigan that manages a huge number of criminal justice datasets and makes them available to the public. If you have any questions about crime data - UCR or other crime data - I highly recommend you reach out to them for answers. They have a collection of data and excellent documentation for UCR data available on their site [here](https://www.icpsr.umich.edu/web/NACJD/series/57). One limitation of their data, however, is that each year is released as an individual file, meaning that you will need to concatenate the years together into a single file. Some years also have different column names (generally minor changes, like spelling robbery "rob" one year and "robb" the next), which requires more work to standardize before you can concatenate. They also only have data through 2016, which means that the most recent years (UCR data is available through 2019) are, as of this writing, unavailable.

### FBI (Crime Data Explorer)

Next, and most usable for the general public - but limited for researchers - is the FBI's official website, the [Crime Data Explorer](https://crime-data-explorer.fr.cloud.gov/). On this site you can choose an agency and see annual crime data (remember, UCR data is monthly, so this is less detailed than it could be) for certain crimes - and not even all the crimes actually available in the data. This is fine for the general public but provides only a fraction of the data actually available, so it is really not suitable for researchers.

### FBI (Crimes in the United States report)

It is worth mentioning a final source of UCR information: the annual Crimes in the United States report released by the FBI each year around the start of October. As an example, here is the [website for the 2019 report](https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019). This report contains summarized data, which in most cases estimates missing data, and provides national and subnational (though rarely city-level) crime figures. As with the Crime Data Explorer, it covers only a fraction of the true data available, so it is not a very useful source of crime data for quality research. Still, it is a very common source of information used by researchers.
## Recommended reading
While this book is designed to help researchers use this data, the FBI has an excellent manual on this data designed to help police agencies submit it. That manual, called the "Summary Reporting System (SRS) User Manual," provides excellent definitions and examples of many variables included in the data. In this book, when I quote the FBI - such as when defining a crime - I am quoting from this manual. The manual is available to download as a PDF on the FBI's site, and I have also posted it on my GitHub page [here](https://github.com/jacobkap/ucrbook/blob/main/FBI%20Uniform%20Crime%20Reporting%20(UCR)%20Program%20User%20Manual.pdf) for convenience. I highly recommend that you read this manual before using the data. That manual, alongside this book - which tries to explain when and how agencies do not follow the manual - will provide a solid foundation for your understanding of UCR data.
## How to contribute to this book
If you have any questions, suggestions (such as a topic to cover), or find any issues, please make a post on the [Issues page](https://github.com/jacobkap/ucrbook/issues) for this book on GitHub. On this page you can create a new issue (which is basically just a post on this forum) with a title and a longer description of your issue. You'll need a GitHub account to make a post. Posting here lets me track issues and respond to your message or alert you when the issue is closed (i.e. I have finished or denied the request). Issues are also public so you can see if someone has already posted something similar.
For more minor issues, like typos or grammar mistakes, you can edit the book directly through its GitHub page. That will create an update for me to accept, which will change the book to include your edit. To do that, click the edit button at the top of the site - the button is highlighted in the below figure. You will need to make a GitHub account to make edits. When you click on that button you will be taken to a page that looks like a Word document where you can make edits. Make any edits you want and then scroll to the bottom of the page. There you can write a short (please, no more than a sentence or two) description of what you have done and then submit the changes for me to review.
```{r, echo = FALSE, fig.cap="The edit button for how to make edits of this book."}
knitr::include_graphics('images/edit_button.PNG')
```
Please only use the above two methods to contribute or make suggestions about the book. While it is a bit more work for you to do it this way, since you will need to make a GitHub account if you do not already have one, it helps me organize all the questions in one place and update the book if I decide to add answers to certain questions.
## How to identify a particular agency (ORI codes) {#ori}
In NIBRS and other FBI data sets, agencies are identified using an **OR**iginating Agency **I**dentifier, or ORI - a unique ID code used to identify an agency.^[This is referred to as an "ORI," an "ORI code," and an "ORI number," all of which mean the same thing.] If we used the agency's name we would end up with some duplicates, since there can be multiple agencies in the country (and even within a state, though this is very rare) with the same name. For example, if you looked for the Philadelphia Police Department using the agency name, you would find both the "Philadelphia Police Department" in Pennsylvania and the one in Mississippi. Each NIBRS ORI is a 9-character value starting with the state abbreviation (for some reason the FBI incorrectly uses NB instead of NE as the abbreviation for Nebraska) followed by 7 numbers. In the UCR data (another FBI data set) the ORI is only 7 characters - the state abbreviation followed by 5 numbers instead of 7. So the NIBRS ORI codes are sometimes called ORI9. For nearly all agencies, the only difference between the UCR ORI and the NIBRS ORI is that the NIBRS ORI has "00" at the end, so it is technically 9 characters long but is not any more specific than the 7-character UCR ORI code.
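To make the relationship concrete, here is a minimal sketch in R of converting between the two formats, assuming - as is true for nearly all agencies - that the only difference is the trailing "00":

```r
# Convert a 7-character UCR ORI to a 9-character NIBRS ORI (ORI9) and
# back, assuming ORI9 is just ORI7 plus a trailing "00".
ori7_to_ori9 <- function(ori7) paste0(ori7, "00")
ori9_to_ori7 <- function(ori9) substr(ori9, 1, 7)

ori7_to_ori9("NY03030") # "NY0303000"
```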
When dealing with specific agencies, make sure to use the ORI rather than the agency name to avoid any mistakes. For an easy way to find the ORI number of an agency, use [this page](https://crimedatatool.com/crosswalk.html) on my site. Type an agency name or an ORI code into the search section and it will return everything that is a match.
## The data as you get it from the FBI
We will finish this overview of the SRS data by briefly talking about the format of the data as released by the FBI, before the processing done by myself or [NACJD](https://www.icpsr.umich.edu/web/pages/NACJD/index.html) that converts the data to a type that software like R or Stata or Excel can understand. The FBI releases their data as fixed-width ASCII files, which are basically Excel files with all of the columns squished together. As an example, Figure \@ref(fig:SRSascii) shows what the data looks like as you receive it from the FBI for the Offenses Known and Clearances by Arrest dataset for 1960, the first year with data available. In the figure, it seems like there are multiple rows, but that is just because the software I opened the file in is not wide enough - in reality what is shown is a single row that is extremely wide because there are over 1,500 columns in this data. If you scroll down enough you will see the next row, but that is not shown in the current image. What is shown is a single row with a ton of columns all pushed up next to each other. Since all of the columns are squished together, you need some way to figure out which parts of the data belong to which column (the gaps are just blank spaces because the value there is a space - that does not mean the value is missing, since spaces are possible, meaningful values in this data).
```{r SRSascii, fig.cap="Fixed-width ASCII file for the 1960 Offenses Known and Clearances by Arrest dataset."}
knitr::include_graphics('images/offenses_known_raw_ascii_1960.PNG')
```
```{r ascii, fig.cap="Fixed-width ASCII file for the 1991 National Incident-Based Reporting System (NIBRS) dataset."}
knitr::include_graphics('images/nibrs_ascii.PNG')
```
The "fixed-width" part of the file type is how this works (the ASCII part basically means it is a text file). Each row is the same width - literally the same number of characters, including blank spaces. So you must tell the software you are using to process this file - by literally writing code in something called a "setup file" but is basically just instructions for whatever software you use (R, SPSS, Stata, SAS can all do this) - which characters are certain columns. For example, in this data the first character says which type of SRS data it is (1 means the Offenses Known and Clearances by Arrest data) and the next two characters (in the setup file written as 2-3 since it is characters 2 through 3 [inclusive]) are the state number (01 is the state code for Alabama). So we can read this row as the first column indicating it is an Offenses Known data, the second column indicating that it is for the state of Alabama, and so on for each of the remaining columns. To read in this data you will need a setup file that covers every column in the data (some software, like R, can handle just reading in the specific columns you want and do not need to include every column in the setup file).
The second important thing to know about reading a fixed-width ASCII file is something called a "value label."^[Most fixed-width ASCII files also indicate missing values with a placeholder such as -8, and the setup file instructs the software to convert that placeholder to NA. SRS data, however, does not indicate missing values in this manner.] For example, in the above image we saw that characters 2-3 are the state and in this row have the value "01," which means that the state is "Alabama." Since this type of file tries to be as small and efficient as possible, it often replaces longer values with shorter ones and provides a translation for the software to use when reading the data. "Alabama" is more characters than "01," so it saves space to store "01" and replace it with "Alabama" later on. So "01" is the "value" and "Alabama" is the "label" it is converted to once read.
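In R, applying a value label is essentially a lookup in a named vector, as in this minimal sketch (only the "01" = "Alabama" pair comes from the documentation; a real setup file enumerates every code):

```r
# Map stored values to their labels; a real setup file lists every code.
state_labels <- c("01" = "Alabama")
state_code <- c("01", "01")
unname(state_labels[state_code])
#> [1] "Alabama" "Alabama"
```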
Fixed-width ASCII files may seem awful to you today, and they are awful to use. This appears to have been an efficient way to store data many decades ago when these data releases began, but it is now extremely inefficient - in terms of speed, file size, and ease of use - compared to modern formats, so I am not sure why they *still* release data this way. But they do, and even the more *modern* (if starting in 1991, before I was born, is modern!) NIBRS data comes in this format. For you, however, the important part is not how exactly to read this type of data, but to understand that the people who make this data publicly available (such as myself and the team at NACJD) must go through this conversion process.^[For those interested in reading in this type of data, please see my R package asciiSetupReader.] **This conversion process, from fixed-width ASCII to a useful format, is the most dangerous step taken in using this data - and one that is nearly entirely unseen by researchers.**
Every line of code you write (or, for SPSS users, every click you make) invites the possibility of a mistake.^[Even highly experienced programmers doing something simple can make mistakes. For example, if you type out "2+2" 100 times - something extremely simple that anyone can do - how often will you mistype a character and get a wrong result? I would guess that you would make a mistake at least once.] The FBI does not provide a setup file with the fixed-width ASCII data, so to read in this data you need to make one yourself. Since some SRS data are massive, this involves assigning the column widths for thousands of columns and hundreds of different value labels.^[With the exception of the arrest data and some value label changes in the hate crime and homicide data, the setup files remain consistent, so a single file will work for all years of a given dataset. You do not need to make a setup file for each year.] A typo anywhere could have potentially far-reaching consequences, so this is a crucial weak point in the data cleaning process - and one which I have not seen anything written about before. While I have been diligent in checking the setup files and my code to seek out any issues - and I know that NACJD has a robust checking process for their own work - that does not mean our work is perfect.^[For evidence of this, please see any of the openICPSR pages for my data, as they detail changes I have made such as decisions on what level to aggregate to and mistakes that I made and later found and fixed.] Even with perfect processing of the raw data into useful files, decisions we make (e.g., what level to aggregate to, what counts as an outlier) can affect both what types of questions you can ask of this data and how well you can answer them.
## Common issues
In this section we will discuss issues common to most or all of the SRS datasets. For some of these, we will come back to the issues in more detail in the chapter for the datasets most affected by the problem.
### Population
Each of the SRS datasets includes a population variable with the estimated population under the jurisdiction of that agency.^[Jurisdiction here refers to the boundaries of the local government, not any legal authority for where the officer can make arrests. For example, the Los Angeles Police Department's jurisdiction in this case refers to crimes that happen inside the city or are otherwise investigated by the LAPD - and are not primarily investigated by another agency.] This variable is often used to create crime rates that control for population. In cases where jurisdictions overlap, such as when a city has university police agencies, or county sheriffs in counties where the cities have their own police, SRS data assigns the population covered to the most local agency and zero population to the overlapping agency. So an agency's population is the number of people in its jurisdiction who are not already covered by a different agency.
For example, the city of Los Angeles in California has nearly four million residents according to the US Census. There are multiple police agencies in the city, including the Los Angeles Police Department, the Los Angeles County Sheriff's Office, the California Highway Patrol that operates in the area, airport and port police, and university police departments. If each agency reported the number of people in its jurisdiction - and these jurisdictions all overlap - we would end up with a population far higher than LA's four million people. To prevent double-counting population when agencies' jurisdictions overlap, the non-primary agency reports a population of 0, even though it still reports crime data like normal. As an example, in 2018 the police department for California State University - Los Angeles reported 92 thefts and a population of 0. Those 92 thefts are not counted in the Los Angeles Police Department data, even though the LAPD's population count covers that jurisdiction. To get complete crime counts in Los Angeles, you would need to add up the data from all police agencies within the city; since the population value is 0 for non-LAPD agencies, both the population and the crime sums will be correct.
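As a sketch of what that adding up looks like with the yearly data used throughout this book - where `la_city_oris` is a hypothetical placeholder for the ORIs of every agency inside the city, which you would need to build yourself - the zero-population convention makes both sums come out right:

```r
library(dplyr)

# Citywide totals: non-primary agencies contribute their crimes but a
# population of 0, so nothing is double-counted. `la_city_oris` is a
# hypothetical placeholder for the ORIs of every agency in the city.
la_city_oris <- c("CA01942")
offenses_known_yearly %>%
  filter(ori %in% la_city_oris, year == 2018) %>%
  summarize(
    population = sum(population),
    murders = sum(actual_murder)
  )
```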
The SRS uses this method even when only part of a jurisdiction overlaps. The Los Angeles County Sheriff's Office has a reported population of about one million people, far less than the county's actual population (the number of residents, according to the Census) of about 10 million. This is because the other nine million people are accounted for by other agencies, mainly the local police agencies in the cities that make up Los Angeles County.

The population value counts only the people who reside in that jurisdiction; it does not count people who are in the area but do not live there, such as tourists or people who commute there for work. This means that using the population value to compute a rate can be misleading, as some places have far more non-residents present (e.g., Las Vegas, Washington D.C.) than others.
### Voluntary reporting {#voluntary}
When an agency reports their data to the FBI, they do so voluntarily - there is no national requirement to report.^[Some states do mandate that their agencies report, but this is not always followed.] This means that there is inconsistency in which agencies report, how many months of the year they report for, and which variables they include in their data submissions.
In general, more agencies report their data every year, and once an agency begins reporting it tends to keep reporting. The SRS datasets are a collection of separate, though related, datasets, and an agency can report to as many of them as it wants - reporting to one dataset does not mean an agency reports to the others. Figure \@ref(fig:SRSagenciesReporting) shows the number of agencies that submitted at least one month of data to the Offenses Known and Clearances by Arrest data in a given year. For the first decade of available data, under 8,000 agencies reported, and this grew to over 13,500 by the late 1970s before plateauing for about a decade. The number of reporting agencies actually declined in the 1990s - driven primarily by many Florida agencies temporarily dropping out - before growing steadily to nearly 17,000 agencies in 2010; from there it kept increasing, though more slowly than before.
```{r SRSagenciesReporting, fig.cap="The annual number of agencies reporting to the Offenses Known and Clearances by Arrest dataset. Reporting is based on the agency reporting at least one month of data in that year."}
offenses_known_yearly %>%
dplyr::filter(!last_month_reported %in% "no months reported") %>%
count(year) %>%
ggplot(aes(x = year, y = n)) +
geom_line(linewidth = 1.05) +
xlab("Year") +
ylab("# of Agencies Reporting") +
theme_crim() +
scale_y_continuous(labels = scales::comma, expand = c(0, 0), limits = c(0, NA)) +
expand_limits(y = 0)
```
There are approximately 18,000 police agencies in the United States, so recent data has reports from nearly all agencies while older data has far fewer agencies reporting. When trying to estimate larger geographies, such as the state or national level, later years will be more accurate since you are missing less data. For earlier data, however, you are dealing with a smaller share of agencies, meaning you have a large amount of missing data and a less representative sample.

Figure \@ref(fig:bigAgenciesReporting) repeats the above figure, now including only agencies with 100,000 or more people in their jurisdiction. While these agencies show a far more linear trend than all agencies together, the basic lesson is the same: recent data has most agencies reporting; old data excludes many agencies.
```{r bigAgenciesReporting, fig.cap = "The annual number of agencies with a population of 100,000 or higher reporting to the Offenses Known and Clearances by Arrest dataset. Reporting is based on the agency reporting at least one month of data in that year."}
offenses_known_yearly %>%
dplyr::filter(!last_month_reported %in% "no months reported",
population >= 100000) %>%
count(year) %>%
ggplot(aes(x = year, y = n)) +
geom_line(linewidth = 1.05) +
xlab("Year") +
ylab("# of Agencies Reporting") +
theme_crim() +
scale_y_continuous(labels = scales::comma, expand = c(0, 0), limits = c(0, NA)) +
expand_limits(y = 0)
```
This voluntariness extends beyond whether agencies report at all to which variables they report. While in practice most agencies report every crime when they report any, they do have the choice to report only a subset of offenses. This is especially true for subsets of larger categories - such as gun assaults, a subset of aggravated assaults, or marijuana possession arrests, a subset of drug possession arrests. As an example, Figure \@ref(fig:nycGunAssaults) shows the annual number of aggravated assaults with a gun in New York City. In 2003 the New York Police Department stopped reporting this category of offense, resuming only in 2013. They continued to report the broader aggravated assault category, but not any of the subcategories of aggravated assault that say which weapon was used.
```{r nycGunAssaults, fig.cap = "Annual gun assaults in New York City, 1960-2023."}
nyc_annual <- offenses_known_yearly[offenses_known_yearly$ori %in% "NY03030",]
ggplot(nyc_annual, aes(x = year, y = actual_assault_with_a_gun)) +
geom_line(linewidth = 1.02) +
xlab("Year") +
ylab("Gun Assaults") +
theme_crim() +
scale_y_continuous(labels = scales::comma, expand = c(0, 0), limits = c(0, NA))
```
Given that agencies can join or drop out of the SRS program at will, and report only partial data, it is highly important to carefully examine your data to make sure that there are no issues caused by this.
Even when an agency reports SRS data, and even when they report every crime category, they can report fewer than 12 months of data. In some cases they simply report all of their data in December, or report quarterly or semi-annually, so some months have zero crimes reported while other months include multiple months' worth of data. One example of this is New York City, shown in Figure \@ref(fig:nycMurderMonthly): from the early 2000s to the mid-2010s the police department reported data quarterly instead of monthly.
```{r nycMurderMonthly, fig.cap = "Monthly murders in New York City, 1990-2023. During the 2000s, the police department began reporting quarterly instead of monthly and then resumed monthly reporting."}
nyc %>%
filter(lubridate::year(year) >= 1990) %>%
ggplot(aes(x = year, y = actual_murder)) +
geom_line() +
xlab("Year") +
ylab("Murders") +
theme_crim()
```
When you sum each month into an annual count, as shown in Figure \@ref(fig:nycMurderYearly), the problem disappears, since the zero months are accounted for by the months containing the quarterly data. If you are using monthly data but only examine it at the annual level, you will fall into the trap of having incorrect data that is hidden by the level of aggregation. While cases like NYC are obvious when viewed monthly, for people including thousands of agencies in their data it is infeasible to look at each agency for each crime. This can introduce errors, as the best way to examine the data is manually viewing graphs, and automated methods - looking for outliers through some kind of comparison to expected values - can be incorrect.
```{r nycMurderYearly, fig.cap = "Annual murders in New York City, 1990-2023."}
nyc_annual %>%
filter(year >= 1990) %>%
ggplot(aes(x = year, y = actual_murder)) +
geom_line(linewidth = 1.02) +
xlab("Year") +
ylab("Murders") +
theme_crim() +
scale_y_continuous(labels = scales::comma, expand = c(0, 0), limits = c(0, NA))
```
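The aggregation itself is simple. Here is a minimal sketch using the monthly NYC file from the chunks above, which stores the report date in a column named `year`:

```r
library(dplyr)

# Collapse monthly counts into annual counts; the quarterly "lump" months
# and the zero months between them cancel out at the annual level.
nyc %>%
  group_by(calendar_year = lubridate::year(year)) %>%
  summarize(murders = sum(actual_murder)) %>%
  filter(calendar_year >= 1990)
```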
In other cases when agencies report fewer than 12 months of the year, they simply report partial data and as a result undercount crimes. Figure \@ref(fig:miamiDadeMurderAnnual) shows annual murders in Miami-Dade, Florida, with three years showing this issue. The first two are the years where zero murders are reported - the agency did not report any months of data. The final year is 2018, the last year of data in this graph, where it looks like murders suddenly dropped significantly. That is just because Miami-Dade only reported through June, so half of 2018 is missing.
```{r miamiDadeMurderAnnual, fig.cap = "Annual murders in Miami-Dade, Florida, 1960-2023."}
offenses_known_yearly %>%
dplyr::filter(ori %in% "FL01300") %>%
ggplot(aes(x = year, y = actual_murder)) +
geom_line(linewidth = 1.02) +
xlab("Year") +
ylab("Murders") +
theme_crim() +
scale_y_continuous(labels = scales::comma, expand = c(0, 0), limits = c(0, NA))
```
### Zero crimes vs no reports
When an agency does not report, we see it in the data as zero crimes reported, not as NA or any other indicator that they did not report. In cases where the agency says it did not report that month, we can be fairly sure (though not entirely, since that variable is not always accurate) that the zero crimes reported simply mean the agency did not report. In cases where the agency says it reported that month but reports zero crimes, we cannot be sure whether that is a true zero or the agency not reporting to the SRS. Since agencies can report some crimes but not others in a given month and still be considered as reporting that month, just saying they reported does not mean that a zero is a true zero.
In some cases it is easy to see when a zero crimes reported is actually the agency not reporting. As Figure \@ref(fig:nycGunAssaults) shows with New York City gun assaults, there is a massive and sustained drop-off to zero crimes and then a sudden return years later. Obviously, going from hundreds of crimes to zero is not a matter of crimes no longer occurring; it is a matter of the agency not reporting - and New York City did report other crimes in these years, so in the data it says that they reported every month. So in agencies that tend to report many crimes - and "many" here can be as few as 10 a year, since going from 10 to 0 is a big drop - a sudden report of zero crimes is probably just non-reporting.
Differentiating zero crimes and no reports becomes tricky in agencies that tend to report few crimes, which most small towns do. As an example, Figure \@ref(fig:danvilleRape) shows the annual reports of rape in Danville, California, a city of approximately 45,000 people. The city reports on average 2.8 rapes per year and in five years reported zero rapes. In cases like this it is not clear whether we should consider those zero years as true zeros (that no one was raped or reported their rape to the police) or whether the agency simply did not report rape data that year.
```{r danvilleRape, fig.cap = "Annual rapes reported in Danville, CA, 1960-2023."}
danville <- offenses_known_yearly[offenses_known_yearly$ori %in% "CA00723",]
ggplot(danville, aes(x = year, y = actual_rape_total)) +
geom_line(linewidth = 1.02) +
xlab("Year") +
ylab("Rapes") +
theme_crim()
```
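One automated check, sketched below, is to flag agency-years that report zero of a crime even though the agency's own average is high enough that a true zero is implausible. The threshold of 10 is my assumption based on the rule of thumb above, not an official rule:

```r
library(dplyr)

# Flag likely non-reporting: zero murders in a year from an agency that
# averages 10 or more murders per year across all its years of data.
suspicious_zeros <-
  offenses_known_yearly %>%
  group_by(ori) %>%
  mutate(mean_murder = mean(actual_murder)) %>%
  ungroup() %>%
  filter(actual_murder == 0, mean_murder >= 10)
```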
### Agency data covered by another agency
<!--chapter:end:index.Rmd-->
# About the Author {-}