
Commit b542df5

oavdeev authored and khorshuheng committed
add get and list jobs interface to launcher and implement it for EMR
Signed-off-by: Oleg Avdeev <oleg.v.avdeev@gmail.com>
1 parent: d6bf41e

5 files changed: +133 −23 lines

sdk/python/feast/pyspark/abc.py (+8)

@@ -470,3 +470,11 @@ def stage_dataframe(
             FileSource: representing the uploaded dataframe.
         """
         raise NotImplementedError
+
+    @abc.abstractmethod
+    def get_job_by_id(self, job_id: str) -> SparkJob:
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def list_jobs(self, include_terminated: bool) -> List[SparkJob]:
+        raise NotImplementedError
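
The two new abstract methods give every launcher a uniform job-lookup interface. Below is a minimal usage sketch, assuming both names are importable from feast.pyspark.abc as the file path suggests; the helper functions themselves are illustrative, not part of the commit.

from typing import List

from feast.pyspark.abc import JobLauncher, SparkJob


def print_jobs(launcher: JobLauncher, include_terminated: bool = False) -> None:
    # With include_terminated=False, only jobs still active on the backend
    # are expected back from list_jobs.
    jobs: List[SparkJob] = launcher.list_jobs(include_terminated=include_terminated)
    for job in jobs:
        print(job.get_id(), job.get_status())


def find_job(launcher: JobLauncher, job_id: str) -> SparkJob:
    # The EMR implementation raises KeyError when no job matches the id.
    return launcher.get_job_by_id(job_id)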

sdk/python/feast/pyspark/launchers/aws/emr.py (+81 −6)

@@ -5,6 +5,7 @@

 import boto3
 import pandas
+from botocore.config import Config as BotoConfig

 from feast.data_format import ParquetFormat
 from feast.data_source import FileSource
@@ -14,6 +15,7 @@
     JobLauncher,
     RetrievalJob,
     RetrievalJobParameters,
+    SparkJob,
     SparkJobFailure,
     SparkJobStatus,
     StreamIngestionJob,
@@ -22,13 +24,19 @@

 from .emr_utils import (
     FAILED_STEP_STATES,
+    HISTORICAL_RETRIEVAL_JOB_TYPE,
     IN_PROGRESS_STEP_STATES,
+    OFFLINE_TO_ONLINE_JOB_TYPE,
+    STREAM_TO_ONLINE_JOB_TYPE,
     SUCCEEDED_STEP_STATES,
     TERMINAL_STEP_STATES,
     EmrJobRef,
+    JobInfo,
     _cancel_job,
     _get_job_state,
     _historical_retrieval_step,
+    _job_ref_to_str,
+    _list_jobs,
     _load_new_cluster_template,
     _random_string,
     _s3_upload,
@@ -50,7 +58,7 @@ def __init__(self, emr_client, job_ref: EmrJobRef):
         self._emr_client = emr_client

     def get_id(self) -> str:
-        return f'{self._job_ref.cluster_id}:{self._job_ref.step_id or ""}'
+        return _job_ref_to_str(self._job_ref)

     def get_status(self) -> SparkJobStatus:
         emr_state = _get_job_state(self._emr_client, self._job_ref)
@@ -164,7 +172,10 @@ def __init__(
         self._region = region

     def _emr_client(self):
-        return boto3.client("emr", region_name=self._region)
+
+        # Use an increased number of retries since DescribeStep calls have a pretty low rate limit.
+        config = BotoConfig(retries={"max_attempts": 10, "mode": "standard"})
+        return boto3.client("emr", region_name=self._region, config=config)

     def _submit_emr_job(self, step: Dict[str, Any]) -> EmrJobRef:
         """
@@ -211,15 +222,15 @@ def historical_feature_retrieval(
         )

         step = _historical_retrieval_step(
-            pyspark_script_path, args=job_params.get_arguments()
+            pyspark_script_path,
+            args=job_params.get_arguments(),
+            output_file_uri=job_params.get_destination_path(),
         )

         job_ref = self._submit_emr_job(step)

         return EmrRetrievalJob(
-            self._emr_client(),
-            job_ref,
-            os.path.join(job_params.get_destination_path()),
+            self._emr_client(), job_ref, job_params.get_destination_path(),
         )

     def offline_to_online_ingestion(
@@ -297,3 +308,67 @@ def stage_dataframe(
             file_format=ParquetFormat(),
             file_url=file_url,
         )
+
+    def _job_from_job_info(self, job_info: JobInfo) -> SparkJob:
+        if job_info.job_type == HISTORICAL_RETRIEVAL_JOB_TYPE:
+            assert job_info.output_file_uri is not None
+            return EmrRetrievalJob(
+                emr_client=self._emr_client(),
+                job_ref=job_info.job_ref,
+                output_file_uri=job_info.output_file_uri,
+            )
+        elif job_info.job_type == OFFLINE_TO_ONLINE_JOB_TYPE:
+            return EmrBatchIngestionJob(
+                emr_client=self._emr_client(), job_ref=job_info.job_ref,
+            )
+        elif job_info.job_type == STREAM_TO_ONLINE_JOB_TYPE:
+            return EmrStreamIngestionJob(
+                emr_client=self._emr_client(), job_ref=job_info.job_ref,
+            )
+        else:
+            # We should never get here
+            raise ValueError(f"Unknown job type {job_info.job_type}")
+
+    def list_jobs(self, include_terminated: bool) -> List[SparkJob]:
+        """
+        List EMR jobs.
+
+        Args:
+            include_terminated: whether to include terminated jobs.
+
+        Returns:
+            A list of SparkJob instances.
+        """
+
+        jobs = _list_jobs(
+            emr_client=self._emr_client(),
+            job_type=None,
+            table_name=None,
+            active_only=not include_terminated,
+        )
+
+        result = []
+        for job_info in jobs:
+            result.append(self._job_from_job_info(job_info))
+        return result
+
+    def get_job_by_id(self, job_id: str) -> SparkJob:
+        """
+        Find an EMR job by its string id. Note that this will also return terminated jobs.
+
+        Raises:
+            KeyError if the job is not found.
+        """
+        # FIXME: this doesn't have to be a linear search but that'll do for now
+        jobs = _list_jobs(
+            emr_client=self._emr_client(),
+            job_type=None,
+            table_name=None,
+            active_only=False,
+        )
+
+        for job_info in jobs:
+            if _job_ref_to_str(job_info.job_ref) == job_id:
+                return self._job_from_job_info(job_info)
+        else:
+            raise KeyError(f"Job not found {job_id}")
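
As the EMR implementation shows, job ids round-trip through _job_ref_to_str, which encodes a reference as "emr:<cluster_id>:<step_id>", with an empty step id for on-demand clusters. A hypothetical inverse (not part of this commit) makes the format concrete:

from typing import NamedTuple, Optional


class EmrJobRef(NamedTuple):
    # Mirrors the EmrJobRef defined in emr_utils.py below.
    cluster_id: str
    step_id: Optional[str]


def job_ref_from_str(job_id: str) -> EmrJobRef:
    # Inverse of _job_ref_to_str: "emr:<cluster_id>:<step_id or empty>".
    prefix, cluster_id, step_id = job_id.split(":", 2)
    if prefix != "emr":
        raise ValueError(f"Not an EMR job id: {job_id}")
    # An empty step id maps back to None (on-demand clusters have a single step).
    return EmrJobRef(cluster_id, step_id or None)


assert job_ref_from_str("emr:j-ABC123:") == EmrJobRef("j-ABC123", None)
assert job_ref_from_str("emr:j-ABC123:s-XYZ789") == EmrJobRef("j-ABC123", "s-XYZ789")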

sdk/python/feast/pyspark/launchers/aws/emr_utils.py (+28 −16)
@@ -165,15 +165,27 @@ def _sync_offline_to_online_step(
     }


+class EmrJobRef(NamedTuple):
+    """EMR job reference. step_id can be None when using on-demand clusters; in that
+    case each cluster has only one step."""
+
+    cluster_id: str
+    step_id: Optional[str]
+
+
+def _job_ref_to_str(job_ref: EmrJobRef) -> str:
+    return ":".join(["emr", job_ref.cluster_id, job_ref.step_id or ""])
+
+
 class JobInfo(NamedTuple):
+    job_ref: EmrJobRef
     job_type: str
-    cluster_id: str
-    step_id: str
-    table_name: str
     state: str
+    table_name: Optional[str]
+    output_file_uri: Optional[str]


-def list_jobs(
+def _list_jobs(
     emr_client, job_type: Optional[str], table_name: Optional[str], active_only=True
 ) -> List[JobInfo]:
     """
@@ -212,6 +224,10 @@ def list_jobs(
         ) or props.get("feast.step_metadata.offline_to_online.table_name")
         step_job_type = props["feast.step_metadata.job_type"]

+        output_file_uri = props.get(
+            "feast.step_metadata.historical_retrieval.output_file_uri"
+        )
+
         if table_name and step_table_name != table_name:
             continue

@@ -221,32 +237,24 @@
             res.append(
                 JobInfo(
                     job_type=step_job_type,
-                    cluster_id=cluster_id,
-                    step_id=step["Id"],
+                    job_ref=EmrJobRef(cluster_id, step["Id"]),
                     state=step["Status"]["State"],
                     table_name=step_table_name,
+                    output_file_uri=output_file_uri,
                 )
             )
     return res


 def _get_stream_to_online_job(emr_client, table_name: str) -> List[JobInfo]:
-    return list_jobs(
+    return _list_jobs(
         emr_client,
         job_type=STREAM_TO_ONLINE_JOB_TYPE,
         table_name=table_name,
         active_only=True,
     )


-class EmrJobRef(NamedTuple):
-    """ EMR job reference. step_id can be None when using on-demand clusters, in that case each
-    cluster has only one step """
-
-    cluster_id: str
-    step_id: Optional[str]
-
-
 def _get_first_step_id(emr_client, cluster_id: str) -> str:
     response = emr_client.list_steps(ClusterId=cluster_id,)
     assert len(response["Steps"]) == 1
@@ -329,7 +337,7 @@ def _upload_dataframe(s3prefix: str, df: pandas.DataFrame) -> str:


 def _historical_retrieval_step(
-    pyspark_script_path: str, args: List[str],
+    pyspark_script_path: str, args: List[str], output_file_uri: str,
 ) -> Dict[str, Any]:

     return {
@@ -340,6 +348,10 @@ def _historical_retrieval_step(
                     "Key": "feast.step_metadata.job_type",
                     "Value": HISTORICAL_RETRIEVAL_JOB_TYPE,
                 },
+                {
+                    "Key": "feast.step_metadata.historical_retrieval.output_file_uri",
+                    "Value": output_file_uri,
+                },
             ],
             "Args": ["spark-submit", pyspark_script_path] + args,
             "Jar": "command-runner.jar",

sdk/python/feast/pyspark/launchers/gcloud/dataproc.py (+8 −1)

@@ -1,6 +1,6 @@
 import os
 import uuid
-from typing import cast
+from typing import List, cast
 from urllib.parse import urlparse

 from google.api_core.operation import Operation
@@ -14,6 +14,7 @@
     JobLauncher,
     RetrievalJob,
     RetrievalJobParameters,
+    SparkJob,
     SparkJobFailure,
     SparkJobParameters,
     SparkJobStatus,
@@ -173,3 +174,9 @@ def stage_dataframe(
         self, df, event_timestamp_column: str, created_timestamp_column: str,
     ):
         raise NotImplementedError
+
+    def get_job_by_id(self, job_id: str) -> SparkJob:
+        raise NotImplementedError
+
+    def list_jobs(self, include_terminated: bool) -> List[SparkJob]:
+        raise NotImplementedError

sdk/python/feast/pyspark/launchers/standalone/local.py (+8)

@@ -3,6 +3,7 @@
 import subprocess
 import uuid
 from contextlib import closing
+from typing import List

 import requests
 from requests.exceptions import RequestException
@@ -13,6 +14,7 @@
     JobLauncher,
     RetrievalJob,
     RetrievalJobParameters,
+    SparkJob,
     SparkJobFailure,
     SparkJobParameters,
     SparkJobStatus,
@@ -226,3 +228,9 @@ def stage_dataframe(
         self, df, event_timestamp_column: str, created_timestamp_column: str,
     ):
         raise NotImplementedError
+
+    def get_job_by_id(self, job_id: str) -> SparkJob:
+        raise NotImplementedError
+
+    def list_jobs(self, include_terminated: bool) -> List[SparkJob]:
+        raise NotImplementedError
