From fe55f8062a3730807e724cd582d065a8aa87a980 Mon Sep 17 00:00:00 2001 From: h1alexbel Date: Thu, 5 Dec 2024 14:03:50 +0300 Subject: [PATCH] feat(#245): junit --- .github/workflows/datasets.yml | 2 +- justfile | 15 ++++++++------- sr-data/resources/pipeline.json | 5 +++++ sr-data/src/sr_data/steps/junit_tests.py | 2 +- sr-data/src/sr_data/steps/numerical.py | 3 ++- sr-data/src/tests/resources/to-numerical.csv | 6 +++--- sr-data/src/tests/test_junit_tests.py | 4 ++-- sr-data/src/tests/test_numerical.py | 3 ++- 8 files changed, 24 insertions(+), 16 deletions(-) diff --git a/.github/workflows/datasets.yml b/.github/workflows/datasets.yml index 295b6255..a7acbdf9 100644 --- a/.github/workflows/datasets.yml +++ b/.github/workflows/datasets.yml @@ -66,7 +66,7 @@ jobs: - name: Create run: | docker run --rm -v "$(pwd)/output:/collection" \ - -e QUERY="stars:>10 language:java,rust size:>=20 mirror:false template:false NOT android" \ + -e QUERY="stars:>10 language:java size:>=20 mirror:false template:false NOT android" \ -e START="${{ inputs.start }}" -e END="${{ inputs.end }}" \ -e COLLECT_TOKEN="${{ secrets.COLLECT_TOKEN_1 }}" \ -e GH_TOKEN="${{ secrets.GITHUB_TOKEN }}" \ diff --git a/justfile b/justfile index 3f626588..c76c7b8b 100644 --- a/justfile +++ b/justfile @@ -198,13 +198,14 @@ dataset folder out: # Cluster repositories. cluster dir out: - cd sr-train && poetry poe cluster --dataset "{{dir}}/d1-scores.csv" --dir {{out}} - cd sr-train && poetry poe cluster --dataset "{{dir}}/d2-sbert.csv" --dir {{out}} - cd sr-train && poetry poe cluster --dataset "{{dir}}/d3-e5.csv" --dir {{out}} - cd sr-train && poetry poe cluster --dataset "{{dir}}/d4-embedv3.csv" --dir {{out}} - cd sr-train && poetry poe cluster --dataset "{{dir}}/d5-scores+sbert.csv" --dir {{out}} - cd sr-train && poetry poe cluster --dataset "{{dir}}/d6-scores+e5.csv" --dir {{out}} - cd sr-train && poetry poe cluster --dataset "{{dir}}/d7-scores+embedv3.csv" --dir {{out}} + cd sr-train && poetry poe cluster --dataset "{{dir}}/d0-numerical.csv" --dir {{out}} +# cd sr-train && poetry poe cluster --dataset "{{dir}}/d1-scores.csv" --dir {{out}} +# cd sr-train && poetry poe cluster --dataset "{{dir}}/d2-sbert.csv" --dir {{out}} +# cd sr-train && poetry poe cluster --dataset "{{dir}}/d3-e5.csv" --dir {{out}} +# cd sr-train && poetry poe cluster --dataset "{{dir}}/d4-embedv3.csv" --dir {{out}} +# cd sr-train && poetry poe cluster --dataset "{{dir}}/d5-scores+sbert.csv" --dir {{out}} +# cd sr-train && poetry poe cluster --dataset "{{dir}}/d6-scores+e5.csv" --dir {{out}} +# cd sr-train && poetry poe cluster --dataset "{{dir}}/d7-scores+embedv3.csv" --dir {{out}} # Statistics about generated clusters. clusterstat out dir="experiment": diff --git a/sr-data/resources/pipeline.json b/sr-data/resources/pipeline.json index c9dabf58..8e9790d0 100644 --- a/sr-data/resources/pipeline.json +++ b/sr-data/resources/pipeline.json @@ -24,5 +24,10 @@ "workflows": { "repos": "@in", "out": "../after-workflows.csv" + }, + "junit": { + "repos": "@in", + "token": "$GH_TOKEN", + "out": "../after-junit.csv" } } diff --git a/sr-data/src/sr_data/steps/junit_tests.py b/sr-data/src/sr_data/steps/junit_tests.py index db499b52..2a194bfb 100644 --- a/sr-data/src/sr_data/steps/junit_tests.py +++ b/sr-data/src/sr_data/steps/junit_tests.py @@ -40,7 +40,7 @@ def main(repos, out, token): frame = pd.read_csv(repos) logger.info(f"Counting JUnit tests in {len(frame)} repositories") for idx, row in frame.iterrows(): - frame.at[idx, "junit_tests"] = count_of_tests( + frame.at[idx, "tests"] = count_of_tests( row["repo"], row["branch"], token diff --git a/sr-data/src/sr_data/steps/numerical.py b/sr-data/src/sr_data/steps/numerical.py index e5a4e3dc..7065c8b4 100644 --- a/sr-data/src/sr_data/steps/numerical.py +++ b/sr-data/src/sr_data/steps/numerical.py @@ -37,7 +37,8 @@ def main(repos, out): "open_issues", "branches", "workflows", - "has_release_workflow" + "has_release_workflow", + "tests" ] ] frame["has_release_workflow"] = frame["has_release_workflow"].astype(int) diff --git a/sr-data/src/tests/resources/to-numerical.csv b/sr-data/src/tests/resources/to-numerical.csv index 153404ce..542c6b86 100644 --- a/sr-data/src/tests/resources/to-numerical.csv +++ b/sr-data/src/tests/resources/to-numerical.csv @@ -1,3 +1,3 @@ -repo,branch,readme,releases,pulls,open_issues,branches,license,workflows,w_jobs,w_oss,w_steps,has_release_workflow -foo/bar,master,"",0,0,1,1,"MIT",1,2,3,4,True -bar/foo,master,"",0,0,1,1,"MIT",1,2,3,4,False \ No newline at end of file +repo,branch,readme,releases,pulls,open_issues,branches,license,workflows,w_jobs,w_oss,w_steps,has_release_workflow,tests +foo/bar,master,"",0,0,1,1,"MIT",1,2,3,4,True,0 +bar/foo,master,"",0,0,1,1,"MIT",1,2,3,4,False,0 \ No newline at end of file diff --git a/sr-data/src/tests/test_junit_tests.py b/sr-data/src/tests/test_junit_tests.py index a135b207..88e059d6 100644 --- a/sr-data/src/tests/test_junit_tests.py +++ b/sr-data/src/tests/test_junit_tests.py @@ -66,5 +66,5 @@ def test_counts_junit_tests_in_csv(self): os.environ["GH_TESTING_TOKEN"] ) frame = pd.read_csv(path) - self.assertEqual(frame.iloc[0]["junit_tests"], 187) - self.assertEqual(frame.iloc[1]["junit_tests"], 0) + self.assertEqual(frame.iloc[0]["tests"], 187) + self.assertEqual(frame.iloc[1]["tests"], 0) diff --git a/sr-data/src/tests/test_numerical.py b/sr-data/src/tests/test_numerical.py index faff0079..6ba65d48 100644 --- a/sr-data/src/tests/test_numerical.py +++ b/sr-data/src/tests/test_numerical.py @@ -55,7 +55,8 @@ def test_creates_csv_with_numerical(self): "open_issues", "branches", "workflows", - "has_release_workflow" + "has_release_workflow", + "tests" ] ), f"Frame {frame.columns} doesn't have expected columns"