From 98bc310612c6519f6c9c99f3590b0004f192ead1 Mon Sep 17 00:00:00 2001
From: driazati <driazati@users.noreply.github.com>
Date: Wed, 22 Jun 2022 14:31:17 -0700
Subject: [PATCH] [release] Add script to gather PRs for a release

---
 tests/scripts/release/gather_prs.py | 216 ++++++++++++++++++++++++++++
 1 file changed, 216 insertions(+)
 create mode 100644 tests/scripts/release/gather_prs.py

diff --git a/tests/scripts/release/gather_prs.py b/tests/scripts/release/gather_prs.py
new file mode 100644
index 0000000000000..0720a87d042bc
--- /dev/null
+++ b/tests/scripts/release/gather_prs.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import os
+import pickle
+from pathlib import Path
+import csv
+import sys
+from typing import Callable, Dict, List, Any
+
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent.parent  # four parents up
+sys.path.append(str(REPO_ROOT / "tests" / "scripts"))
+# Imported after the sys.path tweak above; presumably these modules live in tests/scripts
+from git_utils import git, GitHubRepo
+from github_tag_teams import tags_from_title
+# Read at import time: a missing variable raises KeyError before any argument parsing
+GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
+
+# GraphQL: one page of default-branch history; each commit carries its first associated PR
+PRS_QUERY = """
+query ($owner: String!, $name: String!, $after: String, $pageSize: Int!) {
+  repository(owner: $owner, name: $name) {
+    defaultBranchRef {
+      name
+      target {
+        ... on Commit {
+          oid
+          history(after: $after, first: $pageSize) {
+            pageInfo {
+              hasNextPage
+              endCursor
+            }
+            nodes {
+              oid
+              committedDate
+              associatedPullRequests(first: 1) {
+                nodes {
+                  number
+                  additions
+                  changedFiles
+                  deletions
+                  author {
+                    login
+                  }
+                  title
+                  body
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+"""
+
+
+def append_and_save(items, file):
+    """Extend the pickled list stored at ``file`` with ``items``, creating it if absent."""
+    # The whole file is re-read and re-written on each call. pickle.load can run
+    # arbitrary code, so ``file`` must only ever be a cache this script wrote itself.
+    previous = []
+    if file.exists():
+        with open(file, "rb") as src:
+            previous = pickle.load(src)
+    with open(file, "wb") as dst:
+        pickle.dump(previous + list(items), dst)
+
+
+def fetch_pr_data(args, cache):
+    """Page through the default branch's history on GitHub and cache every commit.
+
+    Starting after ``args.from_commit``, each page of commit nodes returned by
+    ``PRS_QUERY`` is appended to the pickle file ``cache`` until a commit whose oid
+    equals ``args.to_commit`` is seen (it is not saved) or no pages remain. Exits
+    with status 1 if either commit is missing. Reads the module-level names
+    ``user``, ``repo`` and ``GITHUB_TOKEN``.
+    """
+    if args.from_commit is None or args.to_commit is None:
+        print("--from-commit and --to-commit must be specified if --skip-query is not used")
+        sys.exit(1)
+
+    github = GitHubRepo(user=user, repo=repo, token=GITHUB_TOKEN)
+    i = 0
+    page_size = 80
+    # The first cursor is built by hand as "<commit oid> <offset>"
+    cursor = f"{args.from_commit} {i}"
+
+    while True:
+        response = github.graphql(
+            query=PRS_QUERY,
+            variables={"owner": user, "name": repo, "after": cursor, "pageSize": page_size},
+        )
+        data = response["data"]["repository"]["defaultBranchRef"]["target"]["history"]
+        cursor = data["pageInfo"]["endCursor"]
+
+        to_add = []
+        stop = False
+        for node in data["nodes"]:
+            if node["oid"] == args.to_commit:
+                print(f"Found {node['oid']}, stopping")
+                stop = True
+                break
+            to_add.append(node)
+
+        print([node["oid"] for node in to_add])
+        append_and_save(to_add, cache)
+        # Check hasNextPage only after the page has been saved; breaking on it
+        # before processing the nodes would silently drop the final page
+        if stop or not data["pageInfo"]["hasNextPage"]:
+            break
+        print(i)
+        i += page_size
+
+
+def write_csv(
+    filename: str, data: List[Dict[str, Any]], filter: Callable[[Dict[str, Any]], bool]
+) -> None:
+    """Write one CSV row for every commit in ``data`` whose PR passes ``filter``.
+
+    Parameters
+    ----------
+    filename : str
+        Path of the CSV file to create or overwrite.
+    data : List[Dict[str, Any]]
+        Commit nodes shaped like the ``history.nodes`` entries of ``PRS_QUERY``.
+    filter : Callable[[Dict[str, Any]], bool]
+        Predicate called with the commit's first PR; falsy results are skipped.
+    """
+    header = ("category", "description", "date", "number", "author")
+    header += ("tags", "title", "additions", "deletions", "changed files")
+    with open(filename, "w", newline="") as csvfile:
+        writer = csv.writer(csvfile, quotechar='"')
+        writer.writerow(header)
+        for item in data:
+            prs = item["associatedPullRequests"]["nodes"]
+            # Commits with no associated PR have no nodes: skip them, don't IndexError
+            if not prs or not filter(prs[0]):
+                continue
+            pr = prs[0]
+            # A tag from tags_from_title may hold several comma-separated names
+            tags = [
+                name.strip().lower()
+                for tag in tags_from_title(pr["title"])
+                for name in tag.split(",")
+            ]
+            # "author" may be null (presumably for deleted accounts); guard the lookup
+            author = pr["author"]["login"] if pr["author"] else ""
+            writer.writerow(
+                (
+                    tags[0] if len(tags) == 1 else "",  # category
+                    "",  # description column is always left empty here
+                    item["committedDate"],
+                    f'https://github.com/apache/tvm/pull/{pr["number"]}',
+                    author,
+                    ", ".join(tags),
+                    pr["title"],
+                    pr["additions"],
+                    pr["deletions"],
+                    pr["changedFiles"],
+                )
+            )
+
+
+if __name__ == "__main__":
+    # Refresh the commit cache from GitHub unless --skip-query, then write both CSVs
+    description = "List out commits with attached PRs since a certain commit"
+    parser = argparse.ArgumentParser(description=description)
+    parser.add_argument("--from-commit", help="commit to start checking PRs from")
+    parser.add_argument("--to-commit", help="commit to stop checking PRs from")
+    parser.add_argument(
+        "--threshold", type=int, default=150, help="sum of additions + deletions to consider large"
+    )
+    parser.add_argument(
+        "--skip-query", action="store_true", help="don't query GitHub and instead use cache file"
+    )
+    args = parser.parse_args()
+    user = "apache"  # user and repo are read as module globals by fetch_pr_data
+    repo = "tvm"
+    threshold = args.threshold
+
+    cache = Path("out.pkl")
+    if not args.skip_query:
+        # append_and_save only extends the cache, so drop results left by earlier runs
+        # (once the arguments are known to be usable) to avoid duplicated CSV rows
+        if cache.exists() and args.from_commit is not None and args.to_commit is not None:
+            cache.unlink()
+        fetch_pr_data(args, cache)
+
+    with open(cache, "rb") as f:
+        data = pickle.load(f)
+
+    print(f"Found {len(data)} PRs")
+
+    def is_large(pr):
+        return pr["additions"] + pr["deletions"] > threshold
+
+    write_csv(filename="out-large.csv", data=data, filter=is_large)
+    write_csv(filename="out-small.csv", data=data, filter=lambda pr: not is_large(pr))