Added inference

dmmiller612 · Dec 4, 2019 · 1b855f9 · 1b855f9
1 parent 8bee217
commit 1b855f9
Show file tree

Hide file tree

Showing 4 changed files with 106 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -185,7 +185,7 @@ Then you can perform predictions, etc with:
 predictions = p.transform(df)
 ```
 
-#### Getting the pytorch model from the training session
+#### Getting the Pytorch model from the training session
 
 If you just want to get the Pytorch model after training, you can execute the following code:
 
@@ -202,6 +202,24 @@ stm = SparkTorch(
 py_model = stm.getPytorchModel()
 ```
 
+
+#### Using a pretrained Pytorch model for inference
+
+If you already have a trained Pytorch model, you can attach it to your existing pipeline by directly creating a SparkTorchModel.
+This can be done by running the following:
+
+```python
+from sparktorch import create_spark_torch_model
+
+net = ... # Pretrained Network
+
+spark_torch_model = create_spark_torch_model(
+    net, 
+    inputCol='features',
+    predictionCol='predictions'
+)
+```
+
 ## Running
 
 One big thing to remember is to add the `--executor-cores 1` option to spark to ensure

diff --git a/sparktorch/__init__.py b/sparktorch/__init__.py
@@ -1,3 +1,4 @@
 from sparktorch.util import serialize_torch_obj, serialize_torch_obj_lazy
 from sparktorch.torch_distributed import SparkTorch
 from sparktorch.pipeline_util import PysparkPipelineWrapper
+from sparktorch.inference import create_spark_torch_model
diff --git a/sparktorch/inference.py b/sparktorch/inference.py
@@ -0,0 +1,61 @@
+from sparktorch.torch_distributed import SparkTorchModel
+import torch.nn as nn
+import codecs
+import dill
+from pyspark.ml.pipeline import PipelineModel
+
+
+def convert_to_serialized_torch(network: nn.Module) -> str:
+    """
+    Converts an existing torch network to a serialized string.
+
+    :param network: a nn.Module that you want to serialize
+    :return: Returns the serialized torch model.
+    """
+    return codecs.encode(dill.dumps(network), "base64").decode()
+
+
+def create_spark_torch_model(
+    network: nn.Module,
+    inputCol: str = 'features',
+    predictionCol: str = 'predicted',
+    useVectorOut: bool = False
+) -> SparkTorchModel:
+    """
+    Creates a spark SparkTorchModel from an already trained network. Useful for running inference on large datasets.
+
+    :param network: an already trained network
+    :param inputCol: The spark dataframe input column
+    :param predictionCol: The spark dataframe prediction column
+    :param useVectorOut: Determines whether the output should return a spark vector
+    :return: Returns a SparkTorchModel
+    """
+
+    return SparkTorchModel(
+        inputCol=inputCol,
+        predictionCol=predictionCol,
+        modStr=convert_to_serialized_torch(network),
+        useVectorOut=useVectorOut
+    )
+
+
+def attach_pytorch_model_to_pipeline(
+    network: nn.Module,
+    pipeline_model: PipelineModel,
+    inputCol: str = 'features',
+    predictionCol: str = 'predicted',
+    useVectorOut: bool = False
+) -> PipelineModel:
+    """
+    Attaches a pytorch model to an existing pyspark pipeline.
+
+    :param network: Pytorch Network
+    :param pipeline_model: An existing spark pipeline model (This is a fitted pipeline)
+    :param inputCol: The input column to the dataframe for the pytorch network
+    :param predictionCol: The prediction column.
+    :param useVectorOut: option to use a vector output.
+    :return: a spark PipelineModel
+    """
+
+    spark_model = create_spark_torch_model(network, inputCol, predictionCol, useVectorOut)
+    return PipelineModel(stages=[pipeline_model, spark_model])
diff --git a/sparktorch/tests/test_sparktorch.py b/sparktorch/tests/test_sparktorch.py
@@ -4,6 +4,7 @@
 from pyspark.ml.linalg import Vectors
 import torch.nn as nn
 import torch
+from sparktorch.inference import create_spark_torch_model
 from sparktorch.util import serialize_torch_obj, serialize_torch_obj_lazy
 from sparktorch.torch_distributed import SparkTorch
 from sparktorch.tests.simple_net import Net, AutoEncoder, ClassificationNet, NetworkWithParameters
@@ -79,6 +80,29 @@ def test_model_parameters(data, network_with_params):
     assert py_model.fc2 is not None
 
 
+def test_inference(lazy_model, data):
+    stm = SparkTorch(
+        inputCol='features',
+        labelCol='label',
+        predictionCol='predictions',
+        torchObj=lazy_model,
+        verbose=1,
+        iters=10
+    ).fit(data)
+
+    first_res = stm.transform(data).take(1)
+
+    res = stm.getPytorchModel()
+    spark_model = create_spark_torch_model(
+        res,
+        'features',
+        'predictions'
+    )
+
+    res = spark_model.transform(data).take(1)
+    assert first_res == res
+
+
 def test_lazy(lazy_model, data):
     stm = SparkTorch(
         inputCol='features',
@@ -259,3 +283,4 @@ def test_validation_pct(data, general_model):
 
     res = stm.transform(data).take(1)
     assert 'predictions' in res[0]
+