diff --git a/DESCRIPTION b/DESCRIPTION index a44e9c8..bcfd647 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: repometrics Title: Metrics for Your Code Repository -Version: 0.1.3.020 +Version: 0.1.3.031 Authors@R: person("Mark", "Padgham", , "mark.padgham@email.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-2172-5265")) diff --git a/R/analyse-users.R b/R/analyse-users.R new file mode 100644 index 0000000..becdd26 --- /dev/null +++ b/R/analyse-users.R @@ -0,0 +1,135 @@ +#' Construct user-by-user square matrices of strengths of relation between +#' users. +#' +#' @param user_data Result of `lapply(logins, repometrics_data_user)`. +#' Contains the following fields: +#' \enumerate{ +#' \item general (not considered here) +#' \item commit_cmt Comments on commits +#' \item commits Commits to different repositories +#' \item followers GitHub followers +#' \item following Logins of people/orgs followed by user on GitHub +#' \item issue_cmts Comments on issues +#' \item issues Issues opened by user. +#' } +#' @return A `data.frame` of pairwise user logins, and proportions of overlap +#' between repositories in the six variables described above. 
+#' @noRd +user_relation_matrices <- function (user_data) { + + user_names <- names (user_data) + user_data <- add_user_login_cols (user_data) |> + combine_user_data () + + # Pre-processing to name grouping column "repo" and count column "n": + user_data$commit_cmt$repo <- + paste0 (user_data$commit_cmt$org, user_data$commit_cmt$repo) + + user_data$followers <- + dplyr::rename (user_data$followers, repo = followers) |> + dplyr::mutate (n = 1L) + user_data$following <- + dplyr::rename (user_data$following, repo = following) |> + dplyr::mutate (n = 1L) + + user_data$issue_cmts <- + dplyr::rename (user_data$issue_cmts, repo = org_repo) |> + dplyr::group_by (repo, login) |> + dplyr::summarise (n = sum (num_comments), .groups = "keep") + user_data$issues <- dplyr::rename (user_data$issues, repo = org_repo) |> + dplyr::group_by (repo, login) |> + dplyr::summarise (n = dplyr::n (), .groups = "keep") + + overlap <- lapply (names (user_data), function (n) { + user_relate_fields (user_data, user_names, what = n) + }) + + # Join every per-field table on the login pair (not a hard-coded six): + join_logins <- function (x, y) { + dplyr::left_join (x, y, by = c ("login1", "login2")) + } + res <- Reduce (join_logins, overlap) + + return (res) +} + +#' Add 'login' columns to all user data, so each element can be combined. 
+#' @noRd +add_user_login_cols <- function (user_data) { + + nms <- names (user_data) + res <- lapply (seq_along (user_data), function (u) { + nms_u <- names (user_data [[u]]) + res_u <- lapply (seq_along (user_data [[u]]), function (i) { + ud <- user_data [[u]] [[i]] + if (is.data.frame (ud) && nrow (ud) > 0L) { + ud$login <- nms [u] + } else if (is.character (ud)) { + ud <- data.frame (ud, login = rep (nms [u], length (ud))) # zero-length safe + names (ud) [1] <- nms_u [i] + } + return (ud) + }) + names (res_u) <- nms_u + + return (res_u) + }) + names (res) <- nms + + return (res) +} + +#' Combine all individual elements of 'user_data' for all users. +#' +#' The `add_user_login_cols` enables all data to be `rbind`-ed here. +#' @noRd +combine_user_data <- function (user_data) { + + data <- lapply (names (user_data [[1]]), function (n) { + these <- lapply (user_data, function (i) i [[n]]) + res <- do.call (rbind, these) + rownames (res) <- NULL + return (res) + }) + + names (data) <- names (user_data [[1]]) + data$general <- NULL + + return (data) +} + +user_relate_fields <- function (user_data, user_names, what = "commits") { + + user_combs <- t (combn (user_names, m = 2L)) + if (what == "commits") { + user_data [[what]] <- dplyr::rename (user_data [[what]], n = num_commits) + } else if (what == "commit_cmt") { + user_data$commit_cmt$n <- 1L + } + + res <- apply (user_combs, 1, function (i) { + cmt1 <- dplyr::filter (user_data [[what]], login == i [1]) |> + dplyr::group_by (repo) |> + dplyr::summarise (n1 = sum (n)) + cmt2 <- dplyr::filter (user_data [[what]], login == i [2]) |> + dplyr::group_by (repo) |> + dplyr::summarise (n2 = sum (n)) + overlap <- dplyr::inner_join (cmt1, cmt2, by = "repo") + + res <- 0 + if (nrow (overlap) > 0L) { + res <- (sum (overlap$n1) + sum (overlap$n2)) / + (sum (cmt1$n1) + sum (cmt2$n2)) + } + return (res) + }) + + res <- data.frame ( + login1 = user_combs [, 1], + login2 = user_combs [, 2], + res + ) + names (res) [3] <- 
what + + return (res) +} diff --git a/R/data-gh-user.R b/R/data-gh-user.R index d528631..f6f217c 100644 --- a/R/data-gh-user.R +++ b/R/data-gh-user.R @@ -87,8 +87,8 @@ gh_user_general_internal <- function (login = "", name = org_name, gh_org = org_gh_org, url = org_url, - web_url = org_web_url, - location = org_location, + web_url = null2na_char (org_web_url), + location = null2na_char (org_location), num_members = org_num_members ) diff --git a/codemeta.json b/codemeta.json index 643d4e7..468e962 100644 --- a/codemeta.json +++ b/codemeta.json @@ -8,7 +8,7 @@ "codeRepository": "https://github.com/ropensci-review-tools/repometrics", "issueTracker": "https://github.com/ropensci-review-tools/repometrics/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.1.3.020", + "version": "0.1.3.031", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", diff --git a/tests/testthat/helper-user-relations.R b/tests/testthat/helper-user-relations.R new file mode 100644 index 0000000..838fb9c --- /dev/null +++ b/tests/testthat/helper-user-relations.R @@ -0,0 +1,82 @@ +# Mock version of the data for one user, as constructed in data-gh-user.R +mock_user_rel_data <- function () { + + general <- list ( + user = data.frame ( + login = "me", + name = "me too", + email = "me@here.com", + location = "somewhere", + company = "noway", + bio = NA_character_, + avatarUrl = NA_character_, + num_repositories = 1L, + repos_contributed_to = 2L, + num_starred_repos = 3L + ), + orgs = data.frame ( + name = "org", + gh_org = "org", + url = "https://github.com/org", + web_url = NA_character_, + location = NA_character_, + num_members = 0L + ) + ) + + randchars <- function (len = 6L) { + x <- sample (c (letters, LETTERS), size = len, replace = TRUE) + paste0 (x, collapse = "") + } + followers <- vapply (1:10, function (i) randchars (), character (1L)) + following <- vapply (1:5, function (i) randchars (), character (1L)) + + timestamp <- as.POSIXct ("2024-01-01T00:00:01") + 
timestamp_minus_year <- as.POSIXct ("2023-01-01T00:00:01") + + commits <- data.frame ( + repo = paste0 ("org", c ("one", "two")), + num_commits = 1:2, + date = rep (timestamp, 2L) + ) + + commit_cmt <- data.frame ( + repo = commits$repo, + num_commits = 1:2, + date = rep (timestamp, 2L) + ) + attr (commit_cmt, "started_at") <- timestamp_minus_year + attr (commit_cmt, "ended_at") <- timestamp + + issues <- data.frame ( + opened_at = rep (timestamp, 2L), + closed_at = rep (timestamp, 2L), + org_repo = commits$repo, + issue_num = 1:2, + num_issue_comments = 3:4, + num_issue_participants = 5:6, + num_repo_languages = 7:8, + repo_languages = I (c ("R", "C")) + ) + attr (issues, "started_at") <- timestamp_minus_year + attr (issues, "ended_at") <- timestamp + + issue_cmts <- data.frame ( + org_repo = commits$repo, + issue_num = 1:2, + created_at = rep (timestamp, 2L), + num_comments = 1:2, + num_participants = 3:4 + ) + + # Then assemble all fields into a single named list: + list ( + general = general, + commit_cmt = commit_cmt, + commits = commits, + followers = followers, + following = following, + issue_cmts = issue_cmts, + issues = issues + ) +} diff --git a/tests/testthat/test-rm-data-gh-user.R b/tests/testthat/test-data-rm-gh-user.R similarity index 100% rename from tests/testthat/test-rm-data-gh-user.R rename to tests/testthat/test-data-rm-gh-user.R diff --git a/tests/testthat/test-rm-data-git.R b/tests/testthat/test-data-rm-git.R similarity index 100% rename from tests/testthat/test-rm-data-git.R rename to tests/testthat/test-data-rm-git.R diff --git a/tests/testthat/test-rm-data-github.R b/tests/testthat/test-data-rm-github.R similarity index 100% rename from tests/testthat/test-rm-data-github.R rename to tests/testthat/test-data-rm-github.R diff --git a/tests/testthat/test-rm-data.R b/tests/testthat/test-data-rm.R similarity index 100% rename from tests/testthat/test-rm-data.R rename to tests/testthat/test-data-rm.R diff --git a/tests/testthat/test-data-user.R 
b/tests/testthat/test-data-user.R new file mode 100644 index 0000000..43991a5 --- /dev/null +++ b/tests/testthat/test-data-user.R @@ -0,0 +1,16 @@ +test_that ("user data matrices", { + + user_data <- lapply (1:2, function (i) mock_user_rel_data ()) + names (user_data) <- c ("a", "b") + + mats <- user_relation_matrices (user_data) + + expect_s3_class (mats, "data.frame") + expect_equal (ncol (mats), 8L) + nms <- c ( + "login1", "login2", "commit_cmt", "commits", "followers", "following", + "issue_cmts", "issues" + ) + expect_equal (names (mats), nms) + expect_equal (nrow (mats), 1L) # two users give exactly one pair +})