From f9316188ef9cd712af2bd3a59c179e818cf6991e Mon Sep 17 00:00:00 2001 From: Laurent Savaete Date: Mon, 9 May 2022 19:32:15 +0300 Subject: [PATCH 1/3] Fix broken api endpoint --- tap_github/user_streams.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/tap_github/user_streams.py b/tap_github/user_streams.py index 5606d810..44925db3 100644 --- a/tap_github/user_streams.py +++ b/tap_github/user_streams.py @@ -1,8 +1,10 @@ """User Stream types classes for tap-github.""" +import re from typing import Dict, List, Optional, Iterable, Any from singer_sdk import typing as th # JSON Schema typing helpers +from singer_sdk.exceptions import FatalAPIError from tap_github.client import GitHubGraphqlStream, GitHubRestStream from tap_github.schema_objects import user_object @@ -64,13 +66,21 @@ def query(self) -> str: # there is probably some limit to how many items can be requested # in a single query, but it's well above 1k. for i, user in enumerate(self.user_list): + # we use the `repositoryOwner` query which is the only one that + # works on both users and orgs with graphql. REST is less picky + # and the /user endpoint works for all types. chunks.append( - f'user{i}: user(login: "{user}") ' "{ login databaseId }" + f'user{i}: repositoryOwner(login: "{user}") ' + "{ login avatarUrl}" ) return "query {" + " ".join(chunks) + " }" users_with_ids: list = list() temp_stream = TempStream(self._tap, list(user_list)) + + databaseIdPattern: re.Pattern = re.compile( + r"https://avatars.githubusercontent.com/u/(\d+)?.*" + ) # replace manually provided org/repo values by the ones obtained # from github api. This guarantees that case is correct in the output data. # See https://github.com/MeltanoLabs/tap-github/issues/110 @@ -92,9 +102,18 @@ def query(self) -> str: ) ) continue - users_with_ids.append( - {"username": username, "user_id": record[item]["databaseId"]} - ) + # the databaseId (in graphql language) is not available on + # repositoryOwner, so we parse the avatarUrl to get it :/ + m = databaseIdPattern.match(record[item]["avatarUrl"]) + if m is not None: + dbId = m.group(1) + users_with_ids.append({"username": username, "user_id": dbId}) + else: + # If we get here, github's API is not returning what + # we expected, so it's most likely a breaking change on + # their end, and the tap's code needs updating + raise FatalAPIError("Unexpected GitHub API error: Breaking change?") + self.logger.info(f"Running the tap on {len(users_with_ids)} users") self.logger.info(users_with_ids) return users_with_ids From a9886126f3cf9680088c82e69c34d81d6ac9e219 Mon Sep 17 00:00:00 2001 From: Laurent Savaete Date: Mon, 9 May 2022 19:32:27 +0300 Subject: [PATCH 2/3] Remove extraneous logging --- tap_github/repository_streams.py | 2 +- tap_github/user_streams.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index f12ade34..930643de 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -104,7 +104,7 @@ def query(self) -> str: # one of the repos returned `None`, which means it does # not exist, log some details, and move on to the next one repo_full_name = "/".join(repo_list[int(item[4:])]) - self.logger.warn( + self.logger.debug( ( f"Repository not found: {repo_full_name} \t" "Removing it from list" diff --git a/tap_github/user_streams.py b/tap_github/user_streams.py index 44925db3..40791276 100644 --- a/tap_github/user_streams.py +++ b/tap_github/user_streams.py @@ -87,7 +87,6 @@ def query(self) -> str: # Also remove repos which do not exist to avoid crashing further down # the line. for record in temp_stream.request_records({}): - self.logger.error(record) for item in record.keys(): try: username = record[item]["login"] @@ -95,9 +94,9 @@ def query(self) -> str: # one of the usernames returned `None`, which means it does # not exist, log some details, and move on to the next one invalid_username = user_list[int(item[4:])] - self.logger.warn( + self.logger.debug( ( - f"Repository not found: {invalid_username} \t" + f"Username not found: {invalid_username} \t" "Removing it from list" ) ) @@ -115,7 +114,6 @@ def query(self) -> str: raise FatalAPIError("Unexpected GitHub API error: Breaking change?") self.logger.info(f"Running the tap on {len(users_with_ids)} users") - self.logger.info(users_with_ids) return users_with_ids def get_records(self, context: Optional[Dict]) -> Iterable[Dict[str, Any]]: From 32d5813723244bceae1f7c03a1e98883c1b4df03 Mon Sep 17 00:00:00 2001 From: Laurent Savaete Date: Mon, 9 May 2022 19:38:44 +0300 Subject: [PATCH 3/3] Adjust logging level --- tap_github/repository_streams.py | 2 +- tap_github/user_streams.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py index 930643de..c05acafa 100644 --- a/tap_github/repository_streams.py +++ b/tap_github/repository_streams.py @@ -104,7 +104,7 @@ def query(self) -> str: # one of the repos returned `None`, which means it does # not exist, log some details, and move on to the next one repo_full_name = "/".join(repo_list[int(item[4:])]) - self.logger.debug( + self.logger.info( ( f"Repository not found: {repo_full_name} \t" "Removing it from list" diff --git a/tap_github/user_streams.py b/tap_github/user_streams.py index 40791276..3b1af078 100644 --- a/tap_github/user_streams.py +++ b/tap_github/user_streams.py @@ -94,7 +94,7 @@ def query(self) -> str: # one of the usernames returned `None`, which means it does # not exist, log some details, and move on to the next one invalid_username = user_list[int(item[4:])] - self.logger.debug( + self.logger.info( ( f"Username not found: {invalid_username} \t" "Removing it from list"