Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix user verification query and accept organization logins #117

Merged
merged 3 commits into from
May 9, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tap_github/repository_streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def query(self) -> str:
# one of the repos returned `None`, which means it does
# not exist, log some details, and move on to the next one
repo_full_name = "/".join(repo_list[int(item[4:])])
self.logger.warn(
self.logger.info(
(
f"Repository not found: {repo_full_name} \t"
"Removing it from list"
Expand Down
33 changes: 25 additions & 8 deletions tap_github/user_streams.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""User Stream types classes for tap-github."""

import re
from typing import Dict, List, Optional, Iterable, Any

from singer_sdk import typing as th # JSON Schema typing helpers
from singer_sdk.exceptions import FatalAPIError

from tap_github.client import GitHubGraphqlStream, GitHubRestStream
from tap_github.schema_objects import user_object
Expand Down Expand Up @@ -64,39 +66,54 @@ def query(self) -> str:
# there is probably some limit to how many items can be requested
# in a single query, but it's well above 1k.
for i, user in enumerate(self.user_list):
# we use the `repositoryOwner` query which is the only one that
# works on both users and orgs with graphql. REST is less picky
# and the /user endpoint works for all types.
chunks.append(
f'user{i}: user(login: "{user}") ' "{ login databaseId }"
f'user{i}: repositoryOwner(login: "{user}") '
"{ login avatarUrl}"
)
return "query {" + " ".join(chunks) + " }"

users_with_ids: list = list()
temp_stream = TempStream(self._tap, list(user_list))

databaseIdPattern: re.Pattern = re.compile(
r"https://avatars.githubusercontent.com/u/(\d+)?.*"
)
# replace manually provided org/repo values by the ones obtained
# from github api. This guarantees that case is correct in the output data.
# See https://github.com/MeltanoLabs/tap-github/issues/110
# Also remove repos which do not exist to avoid crashing further down
# the line.
for record in temp_stream.request_records({}):
self.logger.error(record)
for item in record.keys():
try:
username = record[item]["login"]
except TypeError:
# one of the usernames returned `None`, which means it does
# not exist, log some details, and move on to the next one
invalid_username = user_list[int(item[4:])]
self.logger.warn(
self.logger.info(
(
f"Repository not found: {invalid_username} \t"
f"Username not found: {invalid_username} \t"
"Removing it from list"
)
)
continue
users_with_ids.append(
{"username": username, "user_id": record[item]["databaseId"]}
)
# the databaseId (in graphql language) is not available on
# repositoryOwner, so we parse the avatarUrl to get it :/
m = databaseIdPattern.match(record[item]["avatarUrl"])
if m is not None:
dbId = m.group(1)
users_with_ids.append({"username": username, "user_id": dbId})
else:
# If we get here, github's API is not returning what
# we expected, so it's most likely a breaking change on
# their end, and the tap's code needs updating
raise FatalAPIError("Unexpected GitHub API error: Breaking change?")

self.logger.info(f"Running the tap on {len(users_with_ids)} users")
self.logger.info(users_with_ids)
return users_with_ids

def get_records(self, context: Optional[Dict]) -> Iterable[Dict[str, Any]]:
Expand Down