Merge pull request #58 from ropensci-review-tools/user-connections

start 'R/analyse-users.R'
ropensci-review-tools · Dec 16, 2024 · a234d33 · a234d33
2 parents bcd75d8 + 61e651d
commit a234d33
Show file tree

Hide file tree

Showing 10 changed files with 237 additions and 4 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: repometrics
 Title: Metrics for Your Code Repository
-Version: 0.1.3.020
+Version: 0.1.3.031
 Authors@R: 
     person("Mark", "Padgham", , "mark.padgham@email.com", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0003-2172-5265"))

diff --git a/R/analyse-users.R b/R/analyse-users.R
@@ -0,0 +1,135 @@
+#' Construct a pairwise table of strengths of relation between users.
+#'
+#' A `login` column is first added to the data of each user, and the data of
+#' all users are then combined into one table per field. Each field is
+#' pre-processed to have a grouping column named "repo" and, except for
+#' `commits` and `commit_cmt` which are handled in `user_relate_fields()`, a
+#' count column named "n". One column of overlap is then calculated for each
+#' field, and all of these are joined on the two login columns.
+#'
+#' @param user_data Result of `lapply(logins, repometrics_data_user)`, named
+#' by user login. Contains the following fields:
+#' \enumerate{
+#' \item general (not considered here)
+#' \item commit_cmt Comments on commits
+#' \item commits Commits to different repositories
+#' \item followers GitHub followers
+#' \item following Logins of people/orgs followed by user on GitHub
+#' \item issue_cmts Comments on issues
+#' \item issues Issues opened by user.
+#' }
+#' @return A `data.frame` with one row for each pair of users, with the two
+#' columns `login1` and `login2`, followed by one column for each of the six
+#' variables described above holding the proportion of overlap between the
+#' two users in that variable.
+#' @noRd
+user_relation_matrices <- function (user_data) {
+
+    user_names <- names (user_data)
+    user_data <- add_user_login_cols (user_data) |>
+        combine_user_data ()
+
+    # Pre-processing to name grouping column "repo" and count column "n":
+    # NOTE(review): 'org' and 'repo' are pasted together with no separator;
+    # confirm that distinct org/repo pairs can not collide.
+    user_data$commit_cmt$repo <-
+        paste0 (user_data$commit_cmt$org, user_data$commit_cmt$repo)
+
+    user_data$followers <-
+        dplyr::rename (user_data$followers, repo = followers) |>
+        dplyr::mutate (n = 1L)
+    user_data$following <-
+        dplyr::rename (user_data$following, repo = following) |>
+        dplyr::mutate (n = 1L)
+
+    user_data$issue_cmts <-
+        dplyr::rename (user_data$issue_cmts, repo = org_repo) |>
+        dplyr::group_by (repo, login) |>
+        dplyr::summarise (n = sum (num_comments), .groups = "keep")
+    user_data$issues <- dplyr::rename (user_data$issues, repo = org_repo) |>
+        dplyr::group_by (repo, login) |>
+        dplyr::summarise (n = dplyr::n (), .groups = "keep")
+
+    # One table of (login1, login2, <field>) for each field:
+    overlap <- lapply (names (user_data), function (what) {
+        user_relate_fields (user_data, user_names, what = what)
+    })
+
+    # Join all tables in order, however many fields there are:
+    join_cols <- c ("login1", "login2")
+    res <- Reduce (
+        function (x, y) dplyr::left_join (x, y, by = join_cols),
+        overlap
+    )
+
+    return (res)
+}
+
+#' Add 'login' columns to all user data, so each element can be combined.
+#'
+#' @param user_data A list with one element for each user, named by user
+#' login. Each element is itself a named list of data fields.
+#' @return A list of the same structure and names as `user_data`, in which:
+#' \itemize{
+#' \item every `data.frame` field has an additional `login` column;
+#' \item every character field is converted to a `data.frame` with one column
+#' named after the field plus a `login` column (with no rows if the field is
+#' empty);
+#' \item all other fields are returned unchanged.
+#' }
+#' @noRd
+add_user_login_cols <- function (user_data) {
+
+    logins <- names (user_data)
+    res <- lapply (seq_along (user_data), function (u) {
+        fields <- user_data [[u]]
+        login_u <- logins [u]
+        res_u <- lapply (seq_along (fields), function (i) {
+            ud <- fields [[i]]
+            # The login is repeated to the exact length of each field, so that
+            # empty fields (no rows, or 'character(0)') do not trigger errors
+            # from mis-matched lengths.
+            if (is.data.frame (ud)) {
+                ud$login <- rep (login_u, nrow (ud))
+            } else if (is.character (ud)) {
+                ud <- data.frame (ud, login = rep (login_u, length (ud)))
+                names (ud) [1] <- names (fields) [i]
+            }
+            return (ud)
+        })
+        names (res_u) <- names (fields)
+
+        return (res_u)
+    })
+    names (res) <- logins
+
+    return (res)
+}
+
+#' Stack the data of all users into one object for each field.
+#'
+#' Field names are taken from the first user. Each field is extracted from
+#' every user in turn and `rbind`-ed together, which relies on the 'login'
+#' columns previously added by `add_user_login_cols`. Row names are removed,
+#' and the "general" field is dropped from the result.
+#' @noRd
+combine_user_data <- function (user_data) {
+
+    fields <- names (user_data [[1]])
+
+    combined <- lapply (fields, function (f) {
+        stacked <- do.call (rbind, lapply (user_data, `[[`, f))
+        rownames (stacked) <- NULL
+        stacked
+    })
+    names (combined) <- fields
+
+    combined [["general"]] <- NULL
+
+    return (combined)
+}
+
+#' Calculate strengths of relation between all pairs of users for one field.
+#'
+#' For each pair of users, the values of the count column `n` are summed for
+#' each value of `repo`. The strength of relation is then the sum of both
+#' users' counts within the values of `repo` shared by both users, divided by
+#' the sum of all counts of both users.
+#'
+#' @param user_data List of combined user data, as constructed within
+#' `user_relation_matrices()`. The element named by `what` must have columns
+#' `repo` and `login`, along with a count column `n`. That count column is
+#' created here from `num_commits` for "commits", and set to 1 for
+#' "commit_cmt".
+#' @param user_names Character vector of user logins.
+#' @param what Name of the single element of `user_data` to be analysed.
+#' @return A `data.frame` with one row for each pair of users, and three
+#' columns of `login1`, `login2`, and a numeric column named by the value of
+#' `what`, with values of 0 where two users have no overlap. The result has
+#' no rows if `user_names` holds fewer than two users.
+#' @noRd
+user_relate_fields <- function (user_data, user_names, what = "commits") {
+
+    # 'combn()' errors when there are fewer values than the requested size of
+    # 2, so return an empty result when there are no pairs to compare.
+    if (length (user_names) < 2L) {
+        res <- data.frame (
+            login1 = character (0L),
+            login2 = character (0L),
+            res = numeric (0L)
+        )
+        names (res) [3] <- what
+        return (res)
+    }
+
+    user_combs <- t (combn (user_names, m = 2L))
+    if (what == "commits") {
+        user_data [[what]] <- dplyr::rename (user_data [[what]], n = num_commits)
+    } else if (what == "commit_cmt") {
+        user_data$commit_cmt$n <- 1L
+    }
+    field_data <- user_data [[what]]
+
+    res <- vapply (seq_len (nrow (user_combs)), function (i) {
+        this_login1 <- user_combs [i, 1]
+        this_login2 <- user_combs [i, 2]
+        # Total counts per 'repo' for each of the two users:
+        cmt1 <- dplyr::filter (field_data, login == this_login1) |>
+            dplyr::group_by (repo) |>
+            dplyr::summarise (n1 = sum (n))
+        cmt2 <- dplyr::filter (field_data, login == this_login2) |>
+            dplyr::group_by (repo) |>
+            dplyr::summarise (n2 = sum (n))
+        # Only values of 'repo' common to both users:
+        overlap <- dplyr::inner_join (cmt1, cmt2, by = "repo")
+
+        res_i <- 0
+        if (nrow (overlap) > 0L) {
+            res_i <- (sum (overlap$n1) + sum (overlap$n2)) /
+                (sum (cmt1$n1) + sum (cmt2$n2))
+        }
+        return (res_i)
+    }, numeric (1L))
+
+    res <- data.frame (
+        login1 = user_combs [, 1],
+        login2 = user_combs [, 2],
+        res
+    )
+    names (res) [3] <- what
+
+    return (res)
+}
diff --git a/R/data-gh-user.R b/R/data-gh-user.R
@@ -87,8 +87,8 @@ gh_user_general_internal <- function (login = "",
         name = org_name,
         gh_org = org_gh_org,
         url = org_url,
-        web_url = org_web_url,
-        location = org_location,
+        web_url = null2na_char (org_web_url),
+        location = null2na_char (org_location),
         num_members = org_num_members
     )
 

diff --git a/codemeta.json b/codemeta.json
@@ -8,7 +8,7 @@
   "codeRepository": "https://github.com/ropensci-review-tools/repometrics",
   "issueTracker": "https://github.com/ropensci-review-tools/repometrics/issues",
   "license": "https://spdx.org/licenses/GPL-3.0",
-  "version": "0.1.3.020",
+  "version": "0.1.3.031",
   "programmingLanguage": {
     "@type": "ComputerLanguage",
     "name": "R",

diff --git a/tests/testthat/helper-user-relations.R b/tests/testthat/helper-user-relations.R
@@ -0,0 +1,82 @@
+# Mock version of the data for one user constructed in data-gh-user.R. All
+# values are fixed, except for the logins of followers and following, which
+# are random strings.
+mock_user_rel_data <- function () {
+
+    # Random logins are generated first, followers before following:
+    random_logins <- function (n, len = 6L) {
+        vapply (seq_len (n), function (i) {
+            chars <- sample (c (letters, LETTERS), size = len, replace = TRUE)
+            paste0 (chars, collapse = "")
+        }, character (1L))
+    }
+    followers <- random_logins (10L)
+    following <- random_logins (5L)
+
+    # Values shared between the tables constructed below:
+    repos <- paste0 ("org", c ("one", "two"))
+    t_end <- as.POSIXct ("2024-01-01T00:00:01")
+    t_start <- as.POSIXct ("2023-01-01T00:00:01")
+    times <- rep (t_end, 2L)
+
+    # Attach the start and end times of the period covered by a table:
+    with_period <- function (x) {
+        attr (x, "started_at") <- t_start
+        attr (x, "ended_at") <- t_end
+        x
+    }
+
+    user <- data.frame (
+        login = "me",
+        name = "me too",
+        email = "me@here.com",
+        location = "somewhere",
+        company = "noway",
+        bio = NA_character_,
+        avatarUrl = NA_character_,
+        num_repositories = 1L,
+        repos_contributed_to = 2L,
+        num_starred_repos = 3L
+    )
+    orgs <- data.frame (
+        name = "org",
+        gh_org = "org",
+        url = "https://github.com/org",
+        web_url = NA_character_,
+        location = NA_character_,
+        num_members = 0L
+    )
+
+    commits <- data.frame (repo = repos, num_commits = 1:2, date = times)
+    commit_cmt <- with_period (
+        data.frame (repo = repos, num_commits = 1:2, date = times)
+    )
+
+    issues <- with_period (data.frame (
+        opened_at = times,
+        closed_at = times,
+        org_repo = repos,
+        issue_num = 1:2,
+        num_issue_comments = 3:4,
+        num_issue_participants = 5:6,
+        num_repo_languages = 7:8,
+        repo_languages = I (c ("R", "C"))
+    ))
+
+    issue_cmts <- data.frame (
+        org_repo = repos,
+        issue_num = 1:2,
+        created_at = times,
+        num_comments = 1:2,
+        num_participants = 3:4
+    )
+
+    # Assemble everything in the order of the real user data:
+    list (
+        general = list (user = user, orgs = orgs),
+        commit_cmt = commit_cmt,
+        commits = commits,
+        followers = followers,
+        following = following,
+        issue_cmts = issue_cmts,
+        issues = issues
+    )
+}
diff --git a/tests/testthat/test-rm-data-gh-user.R → tests/testthat/test-data-rm-gh-user.R b/tests/testthat/test-rm-data-gh-user.R → tests/testthat/test-data-rm-gh-user.R
diff --git a/tests/testthat/test-rm-data-git.R → tests/testthat/test-data-rm-git.R b/tests/testthat/test-rm-data-git.R → tests/testthat/test-data-rm-git.R
diff --git a/tests/testthat/test-rm-data-github.R → tests/testthat/test-data-rm-github.R b/tests/testthat/test-rm-data-github.R → tests/testthat/test-data-rm-github.R
diff --git a/tests/testthat/test-rm-data.R → tests/testthat/test-data-rm.R b/tests/testthat/test-rm-data.R → tests/testthat/test-data-rm.R
diff --git a/tests/testthat/test-data-user.R b/tests/testthat/test-data-user.R
@@ -0,0 +1,16 @@
+test_that ("user data matrices", {
+
+    # Two users with identical mock data in all fields except followers and
+    # following, which are random strings generated separately for each user.
+    user_data <- lapply (1:2, function (i) mock_user_rel_data ())
+    names (user_data) <- c ("a", "b")
+
+    mats <- user_relation_matrices (user_data)
+
+    expect_s3_class (mats, "data.frame")
+    expect_equal (ncol (mats), 8L)
+    nms <- c (
+        "login1", "login2", "commit_cmt", "commits", "followers", "following",
+        "issue_cmts", "issues"
+    )
+    expect_equal (names (mats), nms)
+    expect_true (nrow (mats) > 0L)
+
+    # Two users form exactly one pair:
+    expect_equal (nrow (mats), 1L)
+    expect_equal (mats$login1, "a")
+    expect_equal (mats$login2, "b")
+
+    # All relations are proportions:
+    vals <- unlist (mats [, nms [-(1:2)]])
+    expect_true (all (vals >= 0 & vals <= 1))
+
+    # Both users have the same repos and counts in these fields, so their
+    # overlap is complete:
+    for (f in c ("commit_cmt", "commits", "issue_cmts", "issues")) {
+        expect_equal (mats [[f]], 1)
+    }
+})