From 7d90097ad6e8c582e03e9393692eaf57fe13a231 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Tue, 17 May 2022 16:08:00 -0400 Subject: [PATCH 01/25] initialization graphql stargazers --- tap_github/repository_streams.py | 59 ++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index c05acafa..ba1f0482 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -1517,6 +1517,65 @@ def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: ).to_dict() +class StargazersGraphqlStream(GitHubGraphqlStream): + """Defines 'UserContributedToStream' stream. Warning: this stream 'only' gets the first 100 projects (by stars).""" + + name = "stargazers" + query_jsonpath = "$.data.repositoryStargazers.repository.[*]" + primary_keys = ["user_id", "repo_id"] + replication_key = "starred_at" + parent_stream_type = RepositoryStream + state_partitioning_keys = ["repo_id"] + # The parent repository object changes if the number of stargazers changes. + ignore_parent_replication_key = False + + def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: + """ + Add a user_id top-level field to be used as state replication key. + """ + row["user_id"] = row["user"]["id"] + if context is not None: + row["repo_id"] = context["repo_id"] + return row + + @property + def query(self) -> str: + """Return dynamic GraphQL query.""" + # Graphql id is equivalent to REST node_id. To keep the tap consistent, we rename "id" to "node_id". + return """ + query repositoryStargazers($repo: String! $org: String! $nextPageCursor_0: String) { + repository(name: $repo: owner: $org after: $nextPageCursor_0) { + stargazers(first: 100 orderBy: {field: STARRED_AT direction: DESC}) { + pageInfo { + hasNextPage_0: hasNextPage + startCursor_0: startCursor + endCursor_0: endCursor + } + edges { + node { + user_node_id: id + user_id: databaseId + login: username + } + starredAt + } + } + } + } + """ + + schema = th.PropertiesList( + # Parent Keys + th.Property("repo", th.StringType), + th.Property("org", th.StringType), + th.Property("repo_id", th.IntegerType), + th.Property("user_id", th.IntegerType), + # Stargazer Info + th.Property("starred_at", th.DateTimeType), + th.Property("user", user_object), + ).to_dict() + + class StatsContributorsStream(GitHubRestStream): """ Defines 'StatsContributors' stream. Fetching contributors activity. From 0440ec093037ea645bc6fd367302804f53a68af8 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Tue, 17 May 2022 22:16:08 -0400 Subject: [PATCH 02/25] Fix stargazers query --- tap_github/repository_streams.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index ba1f0482..740ffafa 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -1521,31 +1521,34 @@ class StargazersGraphqlStream(GitHubGraphqlStream): """Defines 'UserContributedToStream' stream. Warning: this stream 'only' gets the first 100 projects (by stars).""" name = "stargazers" - query_jsonpath = "$.data.repositoryStargazers.repository.[*]" + query_jsonpath = "$.data.repository.stargazers.edges.[*]" primary_keys = ["user_id", "repo_id"] replication_key = "starred_at" parent_stream_type = RepositoryStream state_partitioning_keys = ["repo_id"] # The parent repository object changes if the number of stargazers changes. - ignore_parent_replication_key = False + ignore_parent_replication_key = True def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: """ Add a user_id top-level field to be used as state replication key. """ - row["user_id"] = row["user"]["id"] + processed_row: Dict[str, Any] = dict() + processed_row.update(row['node']) + processed_row['starred_at'] = row['starred_at'] if context is not None: - row["repo_id"] = context["repo_id"] - return row + processed_row["repo_id"] = context["repo_id"] + return processed_row + @property def query(self) -> str: """Return dynamic GraphQL query.""" # Graphql id is equivalent to REST node_id. To keep the tap consistent, we rename "id" to "node_id". return """ - query repositoryStargazers($repo: String! $org: String! $nextPageCursor_0: String) { - repository(name: $repo: owner: $org after: $nextPageCursor_0) { - stargazers(first: 100 orderBy: {field: STARRED_AT direction: DESC}) { + query repositoryStargazers($repo: String! $org: String! $nextPageCursor_0: String) { + repository(name: $repo owner: $org) { + stargazers(first: 100 orderBy: {field: STARRED_AT direction: DESC} after: $nextPageCursor_0) { pageInfo { hasNextPage_0: hasNextPage startCursor_0: startCursor @@ -1555,9 +1558,10 @@ def query(self) -> str: node { user_node_id: id user_id: databaseId - login: username + username: login + avatar_url: avatarUrl } - starredAt + starred_at: starredAt } } } @@ -1569,10 +1573,12 @@ def query(self) -> str: th.Property("repo", th.StringType), th.Property("org", th.StringType), th.Property("repo_id", th.IntegerType), - th.Property("user_id", th.IntegerType), # Stargazer Info + th.Property("user_id", th.IntegerType), + th.Property("user_node_id", th.IntegerType), th.Property("starred_at", th.DateTimeType), - th.Property("user", user_object), + th.Property("username", th.StringType), + th.Property("avatar_url", th.StringType), ).to_dict() From 89bb7662483bcafef1bcefc695e75ec98c405d2d Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Tue, 17 May 2022 22:25:52 -0400 Subject: [PATCH 03/25] simplify query --- tap_github/repository_streams.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index 740ffafa..3e822920 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -1533,13 +1533,10 @@ def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: """ Add a user_id top-level field to be used as state replication key. """ - processed_row: Dict[str, Any] = dict() - processed_row.update(row['node']) - processed_row['starred_at'] = row['starred_at'] + row["user_id"] = row["user"]["user_id"] if context is not None: - processed_row["repo_id"] = context["repo_id"] - return processed_row - + row["repo_id"] = context["repo_id"] + return row @property def query(self) -> str: @@ -1555,10 +1552,10 @@ def query(self) -> str: endCursor_0: endCursor } edges { - node { - user_node_id: id - user_id: databaseId - username: login + user: node { + node_id: id + id: databaseId + login avatar_url: avatarUrl } starred_at: starredAt @@ -1575,10 +1572,8 @@ def query(self) -> str: th.Property("repo_id", th.IntegerType), # Stargazer Info th.Property("user_id", th.IntegerType), - th.Property("user_node_id", th.IntegerType), th.Property("starred_at", th.DateTimeType), - th.Property("username", th.StringType), - th.Property("avatar_url", th.StringType), + th.Property("user", user_object), ).to_dict() From 348f30308521af55b694e423a3ecec1a2fa2603a Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Tue, 17 May 2022 22:26:14 -0400 Subject: [PATCH 04/25] Update repository_streams.py --- tap_github/repository_streams.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index 3e822920..5cdf74e2 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -1533,7 +1533,7 @@ def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: """ Add a user_id top-level field to be used as state replication key. """ - row["user_id"] = row["user"]["user_id"] + row["user_id"] = row["user"]["id"] if context is not None: row["repo_id"] = context["repo_id"] return row From 42310c832d82f9ff95d1dd26388cd93e25d256da Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Tue, 17 May 2022 22:45:31 -0400 Subject: [PATCH 05/25] Add early exit with fake "since" --- tap_github/client.py | 5 +++++ tap_github/repository_streams.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/tap_github/client.py b/tap_github/client.py index 0ae8b878..6715976f 100644 --- a/tap_github/client.py +++ b/tap_github/client.py @@ -333,4 +333,9 @@ def get_url_params( params["per_page"] = self.MAX_PER_PAGE if next_page_token: params.update(next_page_token) + + since = self.get_starting_timestamp(context) + if self.replication_key and since: + params["since"] = str(since) + return params diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index 5cdf74e2..466ffb14 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -6,6 +6,9 @@ from singer_sdk import typing as th # JSON Schema typing helpers from singer_sdk.helpers.jsonpath import extract_jsonpath +from dateutil.parser import parse +from urllib.parse import parse_qs, urlparse + from tap_github.client import GitHubGraphqlStream, GitHubRestStream from tap_github.schema_objects import ( user_object, @@ -1538,6 +1541,33 @@ def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: row["repo_id"] = context["repo_id"] return row + def get_next_page_token( + self, response: requests.Response, previous_token: Optional[Any] + ) -> Optional[Any]: + """ + Exit early if a since parameter is provided. + """ + request_parameters = parse_qs(str(urlparse(response.request.url).query)) + + # parse_qs interprets "+" as a space, revert this to keep an aware datetime + try: + since = ( + request_parameters["since"][0].replace(" ", "+") + if "since" in request_parameters + else "" + ) + except IndexError: + since = "" + + # If since parameter is present, try to exit early by looking at the last "starred_at". + # Noting that we are traversing in DESCENDING order by STARRED_AT. + if since: + results = extract_jsonpath(self.query_jsonpath, input=response.json()) + *_, last = results + if parse(last["starred_at"]) < parse(since): + return None + return super().get_next_page_token(response, previous_token) + @property def query(self) -> str: """Return dynamic GraphQL query.""" From ed77f2727f54db37f74b6b4acda7315ed498514f Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Tue, 17 May 2022 22:48:36 -0400 Subject: [PATCH 06/25] Add new stream and order alphabetically --- tap_github/repository_streams.py | 2 +- tap_github/streams.py | 56 +++++++++++++++++--------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index 466ffb14..ff23b3c2 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -1530,7 +1530,7 @@ class StargazersGraphqlStream(GitHubGraphqlStream): parent_stream_type = RepositoryStream state_partitioning_keys = ["repo_id"] # The parent repository object changes if the number of stargazers changes. - ignore_parent_replication_key = True + ignore_parent_replication_key = False def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: """ diff --git a/tap_github/streams.py b/tap_github/streams.py index 2ae5df57..5a265468 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -5,6 +5,9 @@ from tap_github.repository_streams import ( AnonymousContributorsStream, + AssigneesStream, + CollaboratorsStream, + CommitCommentsStream, CommitsStream, CommunityProfileStream, ContributorsStream, @@ -13,26 +16,24 @@ IssueEventsStream, IssuesStream, LanguagesStream, + MilestonesStream, + ProjectCardsStream, + ProjectColumnsStream, + ProjectsStream, + PullRequestCommits, PullRequestsStream, ReadmeHtmlStream, ReadmeStream, + ReleasesStream, RepositoryStream, + ReviewCommentsStream, + ReviewsStream, + StargazersGraphqlStream, StargazersStream, StatsContributorsStream, - AssigneesStream, - CollaboratorsStream, - ReviewsStream, - ReviewCommentsStream, - ProjectsStream, - ProjectColumnsStream, - ProjectCardsStream, - PullRequestCommits, - MilestonesStream, - CommitCommentsStream, - ReleasesStream, - WorkflowsStream, WorkflowRunJobsStream, WorkflowRunsStream, + WorkflowsStream, ) from tap_github.user_streams import ( StarredStream, @@ -41,9 +42,9 @@ ) from tap_github.organization_streams import ( OrganizationStream, - TeamsStream, TeamMembersStream, TeamRolesStream, + TeamsStream, ) @@ -63,34 +64,35 @@ def __init__(self, valid_queries: Set[str], streams: List[Type[Stream]]): {"repositories", "organizations", "searches"}, [ AnonymousContributorsStream, - CommitsStream, + AssigneesStream, + CollaboratorsStream, CommitCommentsStream, + CommitsStream, CommunityProfileStream, ContributorsStream, EventsStream, - MilestonesStream, - ReleasesStream, - CollaboratorsStream, - AssigneesStream, - IssuesStream, IssueCommentsStream, IssueEventsStream, + IssuesStream, LanguagesStream, - PullRequestsStream, + MilestonesStream, + ProjectCardsStream, + ProjectColumnsStream, + ProjectsStream, PullRequestCommits, - ReviewsStream, - ReviewCommentsStream, + PullRequestsStream, ReadmeHtmlStream, ReadmeStream, + ReleasesStream, RepositoryStream, + ReviewCommentsStream, + ReviewsStream, + StargazersGraphqlStream, StargazersStream, StatsContributorsStream, - ProjectsStream, - ProjectColumnsStream, - ProjectCardsStream, - WorkflowsStream, WorkflowRunJobsStream, WorkflowRunsStream, + WorkflowsStream, ], ) USERS = ( @@ -103,7 +105,7 @@ def __init__(self, valid_queries: Set[str], streams: List[Type[Stream]]): ) ORGANIZATIONS = ( {"organizations"}, - [OrganizationStream, TeamsStream, TeamMembersStream, TeamRolesStream], + [OrganizationStream, TeamMembersStream, TeamRolesStream, TeamsStream], ) @classmethod From 3977deeb258a69fff0b992830cad850cf76051a8 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Tue, 17 May 2022 22:54:03 -0400 Subject: [PATCH 07/25] Add missing user params --- tap_github/repository_streams.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index ff23b3c2..a870c048 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -1587,6 +1587,9 @@ def query(self) -> str: id: databaseId login avatar_url: avatarUrl + html_url: url + type: __typename + site_admin: isSiteAdmin } starred_at: starredAt } From eec8d3de0e3ab1e31fc99dc6245a745f2888e0e8 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Tue, 17 May 2022 23:20:47 -0400 Subject: [PATCH 08/25] Test different path for api_calls_tests_cache --- .github/workflows/test_tap.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_tap.yml b/.github/workflows/test_tap.yml index f8e119c0..b7401a90 100644 --- a/.github/workflows/test_tap.yml +++ b/.github/workflows/test_tap.yml @@ -26,13 +26,11 @@ jobs: uses: actions/cache@v2.1.7 with: # must match the path in tests/__init__.py - path: '**/api_calls_tests_cache.sqlite' + path: 'api_calls_tests_cache.sqlite' # github cache expires after 1wk, and we expire the content after 24h # this key should not need to change unless we need to clear the cache key: api-cache-v3 - restore-keys: | - api-cache-v3 - api-cache- + restore-keys: api-cache-v3 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: From 3b8e9b3d2880e232809acaee5e592f20b8850cc0 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Wed, 18 May 2022 00:34:52 -0400 Subject: [PATCH 09/25] Retry pytest once on error --- .github/workflows/test_tap.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_tap.yml b/.github/workflows/test_tap.yml index b7401a90..1a6502c2 100644 --- a/.github/workflows/test_tap.yml +++ b/.github/workflows/test_tap.yml @@ -59,5 +59,10 @@ jobs: run: | poetry run mypy . --ignore-missing-imports - name: Test with pytest - run: | - poetry run pytest --capture=no + uses: nick-fields/retry@v2.7.0 + with: + timeout_seconds: 3600 + max_attempts: 2 + retry_on: error + command: | + poetry run pytest --capture=no From f46c2e10cce7fd92271783f386e99dfd40d4e9aa Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Wed, 18 May 2022 00:38:29 -0400 Subject: [PATCH 10/25] Fix push target in github action --- .github/workflows/test_tap.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_tap.yml b/.github/workflows/test_tap.yml index 1a6502c2..be3b6ecc 100644 --- a/.github/workflows/test_tap.yml +++ b/.github/workflows/test_tap.yml @@ -1,11 +1,11 @@ name: Test tap-github on: - # Run on all pull requests and on pushes to master. + # Run on all pull requests and on pushes to main. pull_request: push: branches: - - master + - main jobs: tests: From 6ee276589c192907140c171b992ac136c09d5506 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Wed, 18 May 2022 01:46:53 -0400 Subject: [PATCH 11/25] Update test_tap.yml --- .github/workflows/test_tap.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_tap.yml b/.github/workflows/test_tap.yml index be3b6ecc..d55dbf17 100644 --- a/.github/workflows/test_tap.yml +++ b/.github/workflows/test_tap.yml @@ -61,7 +61,7 @@ jobs: - name: Test with pytest uses: nick-fields/retry@v2.7.0 with: - timeout_seconds: 3600 + retry_wait_seconds: 3600 max_attempts: 2 retry_on: error command: | From 7d43ea67e8b437ce45940cfa370534cdcde79f70 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Wed, 18 May 2022 01:50:43 -0400 Subject: [PATCH 12/25] Update test_tap.yml --- .github/workflows/test_tap.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_tap.yml b/.github/workflows/test_tap.yml index d55dbf17..32a3e043 100644 --- a/.github/workflows/test_tap.yml +++ b/.github/workflows/test_tap.yml @@ -61,8 +61,9 @@ jobs: - name: Test with pytest uses: nick-fields/retry@v2.7.0 with: - retry_wait_seconds: 3600 - max_attempts: 2 retry_on: error + max_attempts: 2 + retry_wait_seconds: 3600 + timeout_minutes: 70 command: | poetry run pytest --capture=no From 2b433c6436be17fef611882d97116b16f0b727f5 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Wed, 18 May 2022 01:51:03 -0400 Subject: [PATCH 13/25] Update test_tap.yml --- .github/workflows/test_tap.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_tap.yml b/.github/workflows/test_tap.yml index 32a3e043..9b1157f9 100644 --- a/.github/workflows/test_tap.yml +++ b/.github/workflows/test_tap.yml @@ -64,6 +64,6 @@ jobs: retry_on: error max_attempts: 2 retry_wait_seconds: 3600 - timeout_minutes: 70 + timeout_minutes: 80 command: | poetry run pytest --capture=no From 8ad4f696b3b11e391e08c444eb954bb27111fca6 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Wed, 18 May 2022 01:59:54 -0400 Subject: [PATCH 14/25] Update test_tap.yml --- .github/workflows/test_tap.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test_tap.yml b/.github/workflows/test_tap.yml index 9b1157f9..dec3f97b 100644 --- a/.github/workflows/test_tap.yml +++ b/.github/workflows/test_tap.yml @@ -61,7 +61,6 @@ jobs: - name: Test with pytest uses: nick-fields/retry@v2.7.0 with: - retry_on: error max_attempts: 2 retry_wait_seconds: 3600 timeout_minutes: 80 From 5b5b64c1c3f0ec9cb209f96879abfcabe8cc02f5 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Wed, 18 May 2022 02:43:05 -0400 Subject: [PATCH 15/25] Use alternative_sync_chidren in tap_core --- tap_github/tests/fixtures.py | 29 ++++++++++++++++++++++++++++- tap_github/tests/test_core.py | 9 +++++++-- tap_github/tests/test_tap.py | 27 +-------------------------- 3 files changed, 36 insertions(+), 29 deletions(-) diff --git a/tap_github/tests/fixtures.py b/tap_github/tests/fixtures.py index 16bd08ad..822afc50 100644 --- a/tap_github/tests/fixtures.py +++ b/tap_github/tests/fixtures.py @@ -1,3 +1,5 @@ +import os +import logging import datetime import pytest @@ -25,7 +27,7 @@ def repo_list_config(request): """ marker = request.node.get_closest_marker("repo_list") if marker is None: - repo_list = ["octocat/hello-world", "mapswipe/mapswipe"] + repo_list = ["MeltanoLabs/tap-github", "mapswipe/mapswipe"] else: repo_list = marker.args[0] @@ -93,3 +95,28 @@ def organization_list_config(request): "organizations": organization_list, "rate_limit_buffer": 100, } + + +def alternative_sync_chidren(self, child_context: dict) -> None: + """ + Override for Stream._sync_children. + Enabling us to use an ORG_LEVEL_TOKEN for the collaborators sream. + """ + for child_stream in self.child_streams: + # Use org:write access level credentials for collaborators stream + if child_stream.name in ["collaborators"]: + ORG_LEVEL_TOKEN = os.environ.get("ORG_LEVEL_TOKEN") + if not ORG_LEVEL_TOKEN: + logging.warning( + 'No "ORG_LEVEL_TOKEN" found. Skipping collaborators stream sync.' + ) + continue + SAVED_GTHUB_TOKEN = os.environ.get("GITHUB_TOKEN") + os.environ["GITHUB_TOKEN"] = ORG_LEVEL_TOKEN + child_stream.sync(context=child_context) + os.environ["GITHUB_TOKEN"] = SAVED_GTHUB_TOKEN or "" + continue + + # default behavior: + if child_stream.selected or child_stream.has_selected_descendents: + child_stream.sync(context=child_context) diff --git a/tap_github/tests/test_core.py b/tap_github/tests/test_core.py index edbc527a..59847341 100644 --- a/tap_github/tests/test_core.py +++ b/tap_github/tests/test_core.py @@ -3,7 +3,9 @@ import logging from unittest import mock +from unittest.mock import patch +from .fixtures import alternative_sync_chidren from singer_sdk.testing import get_standard_tap_tests from tap_github.tap import TapGitHub @@ -27,8 +29,11 @@ def test_standard_tap_tests_for_search_mode(search_config): def test_standard_tap_tests_for_repo_list_mode(repo_list_config): """Run standard tap tests from the SDK.""" tests = get_standard_tap_tests(TapGitHub, config=repo_list_config) - for test in tests: - test() + with patch( + "singer_sdk.streams.core.Stream._sync_children", alternative_sync_chidren + ): + for test in tests: + test() def test_standard_tap_tests_for_username_list_mode(username_list_config): diff --git a/tap_github/tests/test_tap.py b/tap_github/tests/test_tap.py index bdf6af54..c79c938e 100644 --- a/tap_github/tests/test_tap.py +++ b/tap_github/tests/test_tap.py @@ -9,7 +9,7 @@ from singer_sdk.helpers import _catalog as cat_helpers from tap_github.tap import TapGitHub -from .fixtures import repo_list_config, username_list_config +from .fixtures import alternative_sync_chidren, repo_list_config, username_list_config repo_list_2 = [ "MeltanoLabs/tap-github", @@ -49,31 +49,6 @@ def test_validate_repo_list_config(repo_list_config): assert partitions == repo_list_context -def alternative_sync_chidren(self, child_context: dict) -> None: - """ - Override for Stream._sync_children. - Enabling us to use an ORG_LEVEL_TOKEN for the collaborators sream. - """ - for child_stream in self.child_streams: - # Use org:write access level credentials for collaborators stream - if child_stream.name in ["collaborators"]: - ORG_LEVEL_TOKEN = os.environ.get("ORG_LEVEL_TOKEN") - if not ORG_LEVEL_TOKEN: - logging.warning( - 'No "ORG_LEVEL_TOKEN" found. Skipping collaborators stream sync.' - ) - continue - SAVED_GTHUB_TOKEN = os.environ.get("GITHUB_TOKEN") - os.environ["GITHUB_TOKEN"] = ORG_LEVEL_TOKEN - child_stream.sync(context=child_context) - os.environ["GITHUB_TOKEN"] = SAVED_GTHUB_TOKEN or "" - continue - - # default behavior: - if child_stream.selected or child_stream.has_selected_descendents: - child_stream.sync(context=child_context) - - def run_tap_with_config(capsys, config_obj: dict, skip_stream: Optional[str]) -> str: """ Run the tap with the given config and capture stdout, optionally From 9e0dd6d197e73bbb575bd103560be3167afda3c9 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Wed, 18 May 2022 02:49:05 -0400 Subject: [PATCH 16/25] Update test_tap.yml --- .github/workflows/test_tap.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test_tap.yml b/.github/workflows/test_tap.yml index dec3f97b..386e2845 100644 --- a/.github/workflows/test_tap.yml +++ b/.github/workflows/test_tap.yml @@ -64,5 +64,4 @@ jobs: max_attempts: 2 retry_wait_seconds: 3600 timeout_minutes: 80 - command: | - poetry run pytest --capture=no + command: poetry run pytest --capture=no From 6c58936e49f17a9e0e16e357e947ae63bacc00fe Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Wed, 18 May 2022 02:50:08 -0400 Subject: [PATCH 17/25] Update test_tap.yml --- .github/workflows/test_tap.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_tap.yml b/.github/workflows/test_tap.yml index 386e2845..85f8b040 100644 --- a/.github/workflows/test_tap.yml +++ b/.github/workflows/test_tap.yml @@ -63,5 +63,5 @@ jobs: with: max_attempts: 2 retry_wait_seconds: 3600 - timeout_minutes: 80 + timeout_minutes: 10 command: poetry run pytest --capture=no From 0bb3531d16d6610c857e5d05e2d3ea4bf16579ee Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Wed, 18 May 2022 03:08:07 -0400 Subject: [PATCH 18/25] Update test_tap.yml --- .github/workflows/test_tap.yml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test_tap.yml b/.github/workflows/test_tap.yml index 85f8b040..c7eeb5da 100644 --- a/.github/workflows/test_tap.yml +++ b/.github/workflows/test_tap.yml @@ -59,9 +59,13 @@ jobs: run: | poetry run mypy . --ignore-missing-imports - name: Test with pytest - uses: nick-fields/retry@v2.7.0 - with: - max_attempts: 2 - retry_wait_seconds: 3600 - timeout_minutes: 10 - command: poetry run pytest --capture=no + id: test_pytest + continue-on-error: true + run: | + poetry run pytest --capture=no + - name: Test with pytest (run 2) + id: retry_test_pytest + if: steps.test_pytest.outcome=='failure' # check the step outcome, wait and retry + run: | + sleep 60m + poetry run pytest --capture=no From b58691a573ceed8212c24376abc3800ee9c7892c Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Wed, 18 May 2022 18:55:48 -0400 Subject: [PATCH 19/25] Update tap_github/tests/fixtures.py Co-authored-by: Laurent Savaete --- tap_github/tests/fixtures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tap_github/tests/fixtures.py b/tap_github/tests/fixtures.py index 822afc50..ec431e8e 100644 --- a/tap_github/tests/fixtures.py +++ b/tap_github/tests/fixtures.py @@ -100,7 +100,7 @@ def organization_list_config(request): def alternative_sync_chidren(self, child_context: dict) -> None: """ Override for Stream._sync_children. - Enabling us to use an ORG_LEVEL_TOKEN for the collaborators sream. + Enabling us to use an ORG_LEVEL_TOKEN for the collaborators stream. """ for child_stream in self.child_streams: # Use org:write access level credentials for collaborators stream From 288dd8113669928a17c6520db48e86fe492cd36a Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Thu, 19 May 2022 00:17:41 -0400 Subject: [PATCH 20/25] Override REST stream --- tap_github/repository_streams.py | 55 ++++---------------------------- tap_github/streams.py | 2 -- tap_github/user_streams.py | 2 +- 3 files changed, 8 insertions(+), 51 deletions(-) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index a870c048..ce9cae6a 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -1476,59 +1476,15 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]: ).to_dict() -class StargazersStream(GitHubRestStream): - """Defines 'Stargazers' stream. Warning: this stream does NOT track star deletions.""" - - name = "stargazers" - path = "/repos/{org}/{repo}/stargazers" - primary_keys = ["user_id", "repo", "org"] - parent_stream_type = RepositoryStream - state_partitioning_keys = ["repo", "org"] - replication_key = "starred_at" - # GitHub is missing the "since" parameter on this endpoint. - missing_since_parameter = True - - @property - def http_headers(self) -> dict: - """Return the http headers needed. - - Overridden to use an endpoint which includes starred_at property: - https://docs.github.com/en/rest/reference/activity#custom-media-types-for-starring - """ - headers = super().http_headers - headers["Accept"] = "application/vnd.github.v3.star+json" - return headers - - def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: - """ - Add a user_id top-level field to be used as state replication key. - """ - row["user_id"] = row["user"]["id"] - if context is not None: - row["repo_id"] = context["repo_id"] - return row - - schema = th.PropertiesList( - # Parent Keys - th.Property("repo", th.StringType), - th.Property("org", th.StringType), - th.Property("repo_id", th.IntegerType), - th.Property("user_id", th.IntegerType), - # Stargazer Info - th.Property("starred_at", th.DateTimeType), - th.Property("user", user_object), - ).to_dict() - - -class StargazersGraphqlStream(GitHubGraphqlStream): - """Defines 'UserContributedToStream' stream. Warning: this stream 'only' gets the first 100 projects (by stars).""" +class StargazersStream(GitHubGraphqlStream): + """Defines 'Stargazers' stream.""" name = "stargazers" query_jsonpath = "$.data.repository.stargazers.edges.[*]" - primary_keys = ["user_id", "repo_id"] + primary_keys = ["user_id", "repo", "org"] replication_key = "starred_at" parent_stream_type = RepositoryStream - state_partitioning_keys = ["repo_id"] + state_partitioning_keys = ["repo", "org"] # The parent repository object changes if the number of stargazers changes. ignore_parent_replication_key = False @@ -1538,7 +1494,10 @@ def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: """ row["user_id"] = row["user"]["id"] if context is not None: + print(context) row["repo_id"] = context["repo_id"] + row["repo"] = context["repo"] + row["org"] = context["org"] return row def get_next_page_token( diff --git a/tap_github/streams.py b/tap_github/streams.py index 5a265468..20769cec 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -28,7 +28,6 @@ RepositoryStream, ReviewCommentsStream, ReviewsStream, - StargazersGraphqlStream, StargazersStream, StatsContributorsStream, WorkflowRunJobsStream, @@ -87,7 +86,6 @@ def __init__(self, valid_queries: Set[str], streams: List[Type[Stream]]): RepositoryStream, ReviewCommentsStream, ReviewsStream, - StargazersGraphqlStream, StargazersStream, StatsContributorsStream, WorkflowRunJobsStream, diff --git a/tap_github/user_streams.py b/tap_github/user_streams.py index 971bca19..f1daa5df 100644 --- a/tap_github/user_streams.py +++ b/tap_github/user_streams.py @@ -256,7 +256,7 @@ def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: class UserContributedToStream(GitHubGraphqlStream): - """Defines 'UserContributedToStream' stream. Warning: this stream 'only' gets the first 100 projects (by stars).""" + """Defines 'UserContributedToStream' stream.""" name = "user_contributed_to" query_jsonpath = "$.data.user.repositoriesContributedTo.nodes.[*]" From dc5283013e8758e45d0328e87585b593d6a3d8ca Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Thu, 19 May 2022 00:22:17 -0400 Subject: [PATCH 21/25] Update repository_streams.py --- tap_github/repository_streams.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index ce9cae6a..0e330e13 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -1494,7 +1494,6 @@ def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: """ row["user_id"] = row["user"]["id"] if context is not None: - print(context) row["repo_id"] = context["repo_id"] row["repo"] = context["repo"] row["org"] = context["org"] From 79d19708cb37cd708f62a5e78bf633d7a6548c32 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Thu, 19 May 2022 09:36:45 -0400 Subject: [PATCH 22/25] Update test_tap.yml --- .github/workflows/test_tap.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test_tap.yml b/.github/workflows/test_tap.yml index c7eeb5da..064e718c 100644 --- a/.github/workflows/test_tap.yml +++ b/.github/workflows/test_tap.yml @@ -7,6 +7,10 @@ on: branches: - main +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: tests: From 443825b582eb1505b1efe551995225fb1e38dc61 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Thu, 19 May 2022 14:47:37 -0400 Subject: [PATCH 23/25] Revert "Override REST stream" This reverts commit 288dd8113669928a17c6520db48e86fe492cd36a. --- tap_github/repository_streams.py | 54 ++++++++++++++++++++++++++++---- tap_github/streams.py | 2 ++ tap_github/user_streams.py | 2 +- 3 files changed, 51 insertions(+), 7 deletions(-) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index 0e330e13..a870c048 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -1476,15 +1476,59 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]: ).to_dict() -class StargazersStream(GitHubGraphqlStream): - """Defines 'Stargazers' stream.""" +class StargazersStream(GitHubRestStream): + """Defines 'Stargazers' stream. Warning: this stream does NOT track star deletions.""" name = "stargazers" - query_jsonpath = "$.data.repository.stargazers.edges.[*]" + path = "/repos/{org}/{repo}/stargazers" primary_keys = ["user_id", "repo", "org"] - replication_key = "starred_at" parent_stream_type = RepositoryStream state_partitioning_keys = ["repo", "org"] + replication_key = "starred_at" + # GitHub is missing the "since" parameter on this endpoint. + missing_since_parameter = True + + @property + def http_headers(self) -> dict: + """Return the http headers needed. + + Overridden to use an endpoint which includes starred_at property: + https://docs.github.com/en/rest/reference/activity#custom-media-types-for-starring + """ + headers = super().http_headers + headers["Accept"] = "application/vnd.github.v3.star+json" + return headers + + def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: + """ + Add a user_id top-level field to be used as state replication key. + """ + row["user_id"] = row["user"]["id"] + if context is not None: + row["repo_id"] = context["repo_id"] + return row + + schema = th.PropertiesList( + # Parent Keys + th.Property("repo", th.StringType), + th.Property("org", th.StringType), + th.Property("repo_id", th.IntegerType), + th.Property("user_id", th.IntegerType), + # Stargazer Info + th.Property("starred_at", th.DateTimeType), + th.Property("user", user_object), + ).to_dict() + + +class StargazersGraphqlStream(GitHubGraphqlStream): + """Defines 'UserContributedToStream' stream. Warning: this stream 'only' gets the first 100 projects (by stars).""" + + name = "stargazers" + query_jsonpath = "$.data.repository.stargazers.edges.[*]" + primary_keys = ["user_id", "repo_id"] + replication_key = "starred_at" + parent_stream_type = RepositoryStream + state_partitioning_keys = ["repo_id"] # The parent repository object changes if the number of stargazers changes. ignore_parent_replication_key = False @@ -1495,8 +1539,6 @@ def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: row["user_id"] = row["user"]["id"] if context is not None: row["repo_id"] = context["repo_id"] - row["repo"] = context["repo"] - row["org"] = context["org"] return row def get_next_page_token( diff --git a/tap_github/streams.py b/tap_github/streams.py index 20769cec..5a265468 100644 --- a/tap_github/streams.py +++ b/tap_github/streams.py @@ -28,6 +28,7 @@ RepositoryStream, ReviewCommentsStream, ReviewsStream, + StargazersGraphqlStream, StargazersStream, StatsContributorsStream, WorkflowRunJobsStream, @@ -86,6 +87,7 @@ def __init__(self, valid_queries: Set[str], streams: List[Type[Stream]]): RepositoryStream, ReviewCommentsStream, ReviewsStream, + StargazersGraphqlStream, StargazersStream, StatsContributorsStream, WorkflowRunJobsStream, diff --git a/tap_github/user_streams.py b/tap_github/user_streams.py index f1daa5df..971bca19 100644 --- a/tap_github/user_streams.py +++ b/tap_github/user_streams.py @@ -256,7 +256,7 @@ def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: class UserContributedToStream(GitHubGraphqlStream): - """Defines 'UserContributedToStream' stream.""" + """Defines 'UserContributedToStream' stream. Warning: this stream 'only' gets the first 100 projects (by stars).""" name = "user_contributed_to" query_jsonpath = "$.data.user.repositoriesContributedTo.nodes.[*]" From 1c79c06c1c7289db9d79c1fe936b90fa17378920 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Thu, 19 May 2022 14:52:37 -0400 Subject: [PATCH 24/25] Keep both streams but add warning --- tap_github/repository_streams.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index a870c048..ac40ce03 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -1479,7 +1479,7 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]: class StargazersStream(GitHubRestStream): """Defines 'Stargazers' stream. Warning: this stream does NOT track star deletions.""" - name = "stargazers" + name = "stargazers_rest" path = "/repos/{org}/{repo}/stargazers" primary_keys = ["user_id", "repo", "org"] parent_stream_type = RepositoryStream @@ -1488,6 +1488,13 @@ class StargazersStream(GitHubRestStream): # GitHub is missing the "since" parameter on this endpoint. missing_since_parameter = True + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # TODO - remove warning with next release. + self.logger.warning( + "This stream is deprecated. Please use the Graphql version instead 'stargazers'." + ) + @property def http_headers(self) -> dict: """Return the http headers needed. @@ -1532,6 +1539,14 @@ class StargazersGraphqlStream(GitHubGraphqlStream): # The parent repository object changes if the number of stargazers changes. ignore_parent_replication_key = False + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # TODO - remove warning with next release. + self.logger.warning( + "This stream might conflict with previous implementation of 'stargazer'. " + "Looking for the older version, use 'stargazers_rest'." + ) + def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: """ Add a user_id top-level field to be used as state replication key. From 862e7ca5e2d6c5b91d54a438bac5f729ca95ff47 Mon Sep 17 00:00:00 2001 From: Eric Boucher Date: Thu, 19 May 2022 15:45:11 -0400 Subject: [PATCH 25/25] Update repository_streams.py --- tap_github/repository_streams.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index ac40ce03..5946ccab 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -1492,7 +1492,7 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # TODO - remove warning with next release. self.logger.warning( - "This stream is deprecated. Please use the Graphql version instead 'stargazers'." + "The stream 'stargazers_rest' is deprecated. Please use the Graphql version instead: 'stargazers'." ) @property @@ -1543,8 +1543,8 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # TODO - remove warning with next release. self.logger.warning( - "This stream might conflict with previous implementation of 'stargazer'. " - "Looking for the older version, use 'stargazers_rest'." + "This stream 'stargazers' might conflict with previous implementation. " + "Looking for the older version? Use 'stargazers_rest'." ) def post_process(self, row: dict, context: Optional[Dict] = None) -> dict: