LabelBot should take into account all comments on an issue.
* As described in kubeflow#133, as people comment on
  an issue, the label bot should take these additional comments into
  account when predicting labels.

  * Hopefully these additional comments will lead to better predictions
    as they will contain valuable information.

To support this:

* get_issue should get all comments (not just the body)

* We also need to fetch any labels that have been explicitly removed, as well
  as any labels already on the issue, because predictions will now take into
  account multiple comments and not just the first one.

  * Since we are going to add additional labels based on additional comments
    we want to be sure not to add back labels which were explicitly removed.

* issue_label_predictor should filter out labels which have already
  been applied as well as any labels which have been explicitly removed
  (see the sketch after this list).

  This is necessary to ensure we don't spam the issue now that the bot
  can comment in response to additional comments, not just the first one.

* Likewise, we only want to post the comment about not being able to
  label an issue once, so we need to check whether the label bot has
  already commented on the issue.

* Update the readme to account for the new staging and prod environments
  for the front end as described in machine-learning-apps/Issue-Label-Bot#57
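
The filtering described above can be summarized with a minimal sketch (an
illustration of the intended behavior, not the actual issue_label_predictor
code):

```python
# Illustration only -- not the repo's actual implementation.
def filter_predictions(predicted, current_labels, removed_labels):
    """Keep only predicted labels that are safe to add to the issue.

    Labels already on the issue and labels a human explicitly removed are
    never re-applied, so repeated predictions don't spam the issue.
    """
    blocked = set(current_labels) | set(removed_labels)
    return [label for label in predicted if label not in blocked]
```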
Jeremy Lewi committed May 3, 2020
1 parent 18449a2 commit bd8de6b
Showing 8 changed files with 388 additions and 142 deletions.
12 changes: 8 additions & 4 deletions Label_Microservice/README.md
@@ -62,7 +62,7 @@ The following describes the GCP projects and clusters where the two services are
- **repository**: [machine-learning-apps/Issue-Label-Bot](https://github.com/machine-learning-apps/Issue-Label-Bot)
- **GCP project**: github-probots
- **cluster**: kf-ci-ml
- **namespace**: mlapp
- **namespace**: label-bot-prod
- **yaml files**: [deployment](https://github.com/machine-learning-apps/Issue-Label-Bot/tree/master/deployment)

1. Repo-specific label microservice
@@ -76,9 +76,9 @@ The following describes the GCP projects and clusters where the two services are

1. The flask app
- **repository**: [machine-learning-apps/Issue-Label-Bot](https://github.com/machine-learning-apps/Issue-Label-Bot)
- **GCP project**: issue-label-bot-dev
- **cluster**: github-mlapp-test
- **namespace**: mlapp
- **GCP project**: github-probots
- **cluster**: kf-ci-ml
- **namespace**: label-bot-dev
- **yaml files**: [deployment](https://github.com/machine-learning-apps/Issue-Label-Bot/tree/master/deployment)

1. Repo-specific label microservice
@@ -88,6 +88,10 @@ The following describes the GCP projects and clusters where the two services are
- **namespace**: default
- **yaml files**: [Label\_Microservice/deployment](https://github.com/kubeflow/code-intelligence/tree/master/Label_Microservice/deployment)

1. GitHub bot - **kf-label-bot-dev**

- see [kubeflow/code-intelligence#84](https://github.com/kubeflow/code-intelligence/issues/84) for information on the setup
- see [machine-learning-apps/Issue-Label-Bot#57](https://github.com/machine-learning-apps/Issue-Label-Bot/issues/57)

## Instructions

59 changes: 3 additions & 56 deletions py/code_intelligence/embeddings.py
@@ -42,6 +42,7 @@ def get_issue_text(num, idx, owner, repo, skip_issue=True):
dict
{'title':str, 'body':str}
"""
logging.warning("get_issue_text is deprecated; use github_util.get_issue")
url = f'https://github.com/{owner}/{repo}/issues/{num}'
status_code = requests.head(url).status_code
if status_code != 200:
@@ -73,60 +74,6 @@ def get_issue_text(num, idx, owner, repo, skip_issue=True):
'labels': labels,
'num': num}

# TODO(https://github.com/kubeflow/code-intelligence/issues/126): This function should replace
# get_issue_text
def get_issue(url, gh_client):
"""Fetch the issue data using GraphQL
Args:
      url: URL of the GitHub issue to fetch
gh_client: GitHub GraphQl client.
Returns
------
dict
{'title':str, 'body':str}
"""
issue_query = """query getIssue($url: URI!) {
resource(url: $url) {
__typename
... on Issue {
author {
__typename
... on User {
login
}
... on Bot {
login
}
}
id
title
body
url
state
labels(first: 30) {
totalCount
edges {
node {
name
}
}
}
}
}
}"""

variables = {
"url": url,
}
issue_results = gh_client.run_query(issue_query, variables)

if "errors" in issue_results:
logging.error(f"There was a problem running the github query; {issue_results['errors']}")
raise ValueError(f"There was a problem running the github query: {issue_results['errors']}")
return issue_results["data"]["resource"]

def get_all_issue_text(owner, repo, inf_wrapper, workers=64):
"""
Prepare embedding features of all issues in a given repository.
@@ -191,9 +138,9 @@ def load_model_artifact(model_url, local_dir=None):
if not local_dir:
home = str(Path.home())
local_dir = os.path.join(home, "model_files")

full_path = os.path.join(local_dir, 'model.pkl')

if not full_path.exists():
logging.info('Loading model.')
path.mkdir(exist_ok=True)
247 changes: 201 additions & 46 deletions py/code_intelligence/github_util.py
@@ -1,57 +1,212 @@
import fire
import os
import logging
from code_intelligence import github_app
import typing
import yaml

def get_issue_handle(installation_id, username, repository, number):
"get an issue object."
ghapp = github_app.GitHubApp.create_from_env()
install = ghapp.get_installation(installation_id)
return install.issue(username, repository, number)
"get an issue object."
ghapp = github_app.GitHubApp.create_from_env()
install = ghapp.get_installation(installation_id)
return install.issue(username, repository, number)

def get_yaml(owner, repo, ghapp=None):
"""
Looks for the yaml file in a /.github directory.
yaml file must be named issue_label_bot.yaml
"""

if not ghapp:
# TODO(jlewi): Should we deprecate this code path and always pass
# in the github app?
ghapp = github_app.GitHubApp.create_from_env()

try:
# get the app installation handle
inst_id = ghapp.get_installation_id(owner=owner, repo=repo)
inst = ghapp.get_installation(installation_id=inst_id)
# get the repo handle, which allows you got get the file contents
repo = inst.repository(owner=owner, repository=repo)
results = repo.file_contents('.github/issue_label_bot.yaml').decoded
# TODO(jlewi): We should probably catching more narrow exceptions and
# not swallowing all exceptions. The exceptions we should swallow are
# the ones related to the configuration file not existing.
except Exception as e:
logging.info(f"Exception occured getting .github/issue_label_bot.yaml: {e}")
return None

return yaml.safe_load(results)
"""
Looks for the yaml file in a /.github directory.
yaml file must be named issue_label_bot.yaml
"""

if not ghapp:
# TODO(jlewi): Should we deprecate this code path and always pass
# in the github app?
ghapp = github_app.GitHubApp.create_from_env()

try:
# get the app installation handle
inst_id = ghapp.get_installation_id(owner=owner, repo=repo)
inst = ghapp.get_installation(installation_id=inst_id)
        # get the repo handle, which allows you to get the file contents
repo = inst.repository(owner=owner, repository=repo)
results = repo.file_contents('.github/issue_label_bot.yaml').decoded
    # TODO(jlewi): We should probably catch narrower exceptions and
    # not swallow all exceptions. The only exceptions we should swallow are
    # the ones related to the configuration file not existing.
except Exception as e:
logging.info(f"Exception occured getting .github/issue_label_bot.yaml: {e}")
return None

return yaml.safe_load(results)
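
A hedged usage sketch (the owner and repo below are placeholders): get_yaml
returns the parsed config as a dict, or None when the repo has no
.github/issue_label_bot.yaml.

```python
# Hypothetical usage; "kubeflow/examples" is only an example repo.
config = get_yaml(owner="kubeflow", repo="examples")
if config is None:
    # No .github/issue_label_bot.yaml (or it could not be fetched);
    # proceed with the default behavior.
    pass
```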

def build_issue_doc(org:str, repo:str, title:str, text:typing.List[str]):
"""Build a document string out of various github features.
Args:
org: The organization the issue belongs in
repo: The repository.
title: Issue title
text: List of contents of the comments on the issue
Returns:
content: The document to classify
"""
pieces = [title]
pieces.append(f"{org.lower()}_{repo.lower()}")
pieces.extend(text)
content = "\n".join(pieces)
return content
"""Build a document string out of various github features.
Args:
org: The organization the issue belongs in
repo: The repository.
title: Issue title
text: List of contents of the comments on the issue
Returns:
content: The document to classify
"""
pieces = [title]
pieces.append(f"{org.lower()}_{repo.lower()}")
pieces.extend(text)
content = "\n".join(pieces)
return content
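
For example, with illustrative values, build_issue_doc joins the title, the
lowercased org_repo token, and the comment texts with newlines:

```python
doc = build_issue_doc("kubeflow", "examples", "Notebook crashes on GPU",
                      ["Body of the issue...", "A follow-up comment..."])
# doc == ("Notebook crashes on GPU\n"
#         "kubeflow_examples\n"
#         "Body of the issue...\n"
#         "A follow-up comment...")
```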

# TODO(https://github.com/kubeflow/code-intelligence/issues/126): This function should replace
# get_issue_text
def get_issue(url, gh_client):
"""Fetch the issue data using GraphQL.
Args:
      url: URL of the GitHub issue to fetch
gh_client: GitHub GraphQl client.
Returns
------
dict
{'title':str,
'comments':List[str]
'labels': List[str]
'removed_labels': List[str]}
comments is a list of comments. The first one will be the body of the issue.
labels: Labels currently on the issue
removed_labels: Labels that have been removed
"""

# The "!" means the variable can't be null. We allow the cursors
# to be null so that on the first call we fetch the first couple items.
issue_query = """query getIssue($url: URI!, $labelCursor: String, $timelineCursor: String, $commentsCursor: String) {
resource(url: $url) {
__typename
... on Issue {
author {
__typename
... on User {
login
}
... on Bot {
login
}
}
id
title
body
url
state
comments(first: 100, after: $commentsCursor) {
totalCount
edges {
node {
author {
login
}
body
}
}
pageInfo {
hasNextPage
endCursor
}
}
timelineItems(first: 100, itemTypes: [UNLABELED_EVENT], after: $timelineCursor) {
totalCount
edges {
node {
__typename
... on UnlabeledEvent {
createdAt
label {
name
}
}
}
}
pageInfo {
hasNextPage
endCursor
}
}
labels(first: 100, after: $labelCursor) {
totalCount
pageInfo {
hasNextPage
endCursor
}
edges {
node {
name
}
}
}
}
}
}"""

variables = {
"url": url,
"labelCursor": None,
"commentsCursor": None,
"timelineCurosr": None,
}

has_more = True

result = {
"title": None,
"comments": [],
"comment_authors": [],
"labels": set(),
"removed_labels": set(),
}
while has_more:
issue_results = gh_client.run_query(issue_query, variables)

if "errors" in issue_results:
logging.error(f"There was a problem running the github query; {issue_results['errors']}")
raise ValueError(f"There was a problem running the github query: {issue_results['errors']}")

issue = issue_results["data"]["resource"]

# Only set the title once on the first call
if not result["title"]:
result["title"] = issue["title"]

if not result["comments"]:
result["comments"].append(issue["body"])
result["comment_authors"].append(issue["author"]["login"])

for e in issue["comments"]["edges"]:
node = e["node"]
result["comments"].append(node["body"])
result["comment_authors"].append(node["author"]["login"])

for e in issue["labels"]["edges"]:
node = e["node"]
result["labels"].add(node["name"])

for e in issue["timelineItems"]["edges"]:
node = e["node"]
result["removed_labels"].add(node["label"]["name"])

has_more = False

for f in ["comments", "labels", "timelineItems"]:
has_more = has_more or issue[f].get("pageInfo").get("hasNextPage")

variables["labelCursor"] = issue["labels"]["pageInfo"]["endCursor"]
variables["commentsCursor"] = issue["comments"]["pageInfo"]["endCursor"]
variables["timelineCursor"] = issue["timelineItems"]["pageInfo"]["endCursor"]

# For removed_labels we only want labels that were permanently removed
result["removed_labels"] = result["removed_labels"] - result["labels"]

result["labels"] = list(result["labels"])
result["removed_labels"] = list(result["removed_labels"])
return result
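
A minimal sketch of how the new result shape might be consumed downstream,
assuming gh_client is a GraphQL client exposing run_query as used above; the
issue URL and candidate labels are placeholders:

```python
# Sketch only; gh_client is assumed to expose run_query as used above.
issue = get_issue("https://github.com/kubeflow/examples/issues/123", gh_client)

# Build the document to classify from the title and all comments.
doc = build_issue_doc("kubeflow", "examples", issue["title"], issue["comments"])

# Never re-apply labels already on the issue or explicitly removed by a human.
predicted = {"kind/bug", "area/jupyter"}  # placeholder model output
to_add = predicted - set(issue["labels"]) - set(issue["removed_labels"])
```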