Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Setup CI workflow #3

Merged
merged 6 commits into from
Oct 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Continuous-integration workflow: runs the test and lint suite on pull requests.
name: CI

on:
  pull_request:
    # Only trigger when library code or this workflow definition changes.
    paths:
      - carling/**
      - .github/workflows/ci.yml

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # NOTE(review): unquoted numbers are parsed as YAML floats — fine for
        # 3.7-3.9, but a future "3.10" must be quoted or it truncates to 3.1.
        python: [3.7, 3.8, 3.9]
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          # Interpreter version supplied by the matrix above.
          python-version: ${{ matrix.python }}
      - name: Install dependencies
        run: |
          python -m pip install poetry tox
      - name: Test with tox
        # Runs the unit tests plus black/flake8/isort style checks as tox envs.
        run: poetry run tox -e py,black,flake8,isort
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

conf-platform

Running tox -e <platform_name> will run commands for a particular platform and skip the rest

61 changes: 45 additions & 16 deletions carling/__init__.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,59 @@
# isort: skip_file
__version__ = "0.3.1"

from .categorical import (
CreateCategoricalDicts,
DigestCategoricalColumns,
PairWithIndexNumber,
ReplaceCategoricalColumns,
)
from .group import (
UniqueOnly,
SingletonOnly,
Intersection,
DifferencePerKey,
FilterByKey,
FilterByKeyUsingSideInput,
DifferencePerKey,
Intersection,
MaxSelectPerKey,
PartitionRowsContainingNone,
SingletonOnly,
UniqueOnly,
)
from .mapping import (
Label,
Select,
Project,
Exclude,
IndexBy,
Stringify,
IndexBySingle,
Label,
Project,
RenameFromTo,
Exclude,
Select,
Stringify,
)
from .categorical import (
PairWithIndexNumber,
DigestCategoricalColumns,
CreateCategoricalDicts,
ReplaceCategoricalColumns,
from .util import LogSample, MemoizedValueProviderWrapper, ReifyMultiValueOption

__all__ = (
# categorical
"CreateCategoricalDicts",
"DigestCategoricalColumns",
"PairWithIndexNumber",
"ReplaceCategoricalColumns",
# group
"DifferencePerKey",
"FilterByKey",
"FilterByKeyUsingSideInput",
"Intersection",
"MaxSelectPerKey",
"PartitionRowsContainingNone",
"SingletonOnly",
"UniqueOnly",
# mapping
"Exclude",
"IndexBy",
"IndexBySingle",
"Label",
"Project",
"RenameFromTo",
"Select",
"Stringify",
# util
"LogSample",
"MemoizedValueProviderWrapper",
"ReifyMultiValueOption",
)
from .util import LogSample, ReifyMultiValueOption, MemoizedValueProviderWrapper
24 changes: 12 additions & 12 deletions carling/categorical.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from copy import deepcopy

import apache_beam as beam
from apache_beam.coders import VarIntCoder
from apache_beam.transforms.userstate import CombiningValueStateSpec


class _PairWithIndexNumberDoFn(beam.DoFn):
Expand Down Expand Up @@ -52,9 +50,9 @@ def _CreateCategoricalDict(pcoll, existing_dict_pairs):
"""
existing_max_value = (
existing_dict_pairs
| f"just values" >> beam.Map(lambda r: r[1])
| f"get max" >> beam.combiners.Top.Of(1)
| f"extract" >> beam.FlatMap(lambda r: r)
| "just values" >> beam.Map(lambda r: r[1])
| "get max" >> beam.combiners.Top.Of(1)
| "extract" >> beam.FlatMap(lambda r: r)
)

new_pairs = (
Expand Down Expand Up @@ -101,7 +99,8 @@ def CreateCategoricalDicts(pcoll, cat_cols, existing_dict_rows):

It then creates a transform which takes a pcollection and
- looks at the input pcoll for unseen values in each categorical column
- creates new unique integers for each distinct unseen value, starting at max(previous value for column)+1
- creates new unique integers for each distinct unseen value, starting at
max(previous value for column)+1
- amends the existing mappings with (col, unseen_value, new_unique_int)

Output is:
Expand All @@ -115,9 +114,7 @@ def CreateCategoricalDicts(pcoll, cat_cols, existing_dict_rows):

existing_dicts = (
existing_dict_rows
| beam.Map(
lambda r: beam.pvalue.TaggedOutput(r[0], (r[1], r[2]))
).with_outputs()
| beam.Map(lambda r: beam.pvalue.TaggedOutput(r[0], (r[1], r[2]))).with_outputs()
)

for column in cat_cols:
Expand All @@ -131,16 +128,19 @@ def CreateCategoricalDicts(pcoll, cat_cols, existing_dict_rows):
# value by reference.
>> beam.Map(lambda r, column=column: r[column])
| _CreateCategoricalDict(existing_dict_pairs)
| f"re-append column [{column}]"
>> beam.Map(lambda r, column=column: (column, *r))
| f"re-append column [{column}]" >> beam.Map(lambda r, column=column: (column, *r))
)

return acc | beam.Flatten()


@beam.ptransform_fn
def ReplaceCategoricalColumns(
inputs, cat_cols, categorical_dict_rows, default_unseen=None, features_key=None
inputs,
cat_cols,
categorical_dict_rows,
default_unseen=None,
features_key=None,
):
"""
Utilizes the "categorical dictionary rows" generated by
Expand Down
37 changes: 8 additions & 29 deletions carling/group.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,9 @@
"""
Generic grouping transform utils

Author: Tsuyoki Kumazaki (tsuyoki@mcdigital.jp)
"""
from functools import reduce

import apache_beam as beam

from carling.iter_utils import is_none, is_some, take_as_singleton, take_top, unwrap_or_none
from carling.mapping import IndexBy
from carling.iter_utils import (
take_top,
is_none,
is_some,
unwrap,
unwrap_or_none,
take_as_singleton,
)


def _merge_two(x, y):
Expand Down Expand Up @@ -50,7 +38,6 @@ def expand(self, pcoll):


class UniqueOnly(beam.PTransform):

"""Produces elements that are the only elements per key after deduplication.

Given a `PCollection` of `(K, V)`,
Expand All @@ -76,7 +63,6 @@ def expand(self, pcoll):


class SingletonOnly(beam.PTransform):

"""Produces elements that are the only elements per key.

Given a `PCollection` of `(K, V)`,
Expand All @@ -93,8 +79,7 @@ def expand(self, pcoll):
return (
pcoll
| "Group" >> beam.GroupByKey()
| "Remove Non-singleton Elements"
>> beam.Map(lambda kv: take_as_singleton(kv[1]))
| "Remove Non-singleton Elements" >> beam.Map(lambda kv: take_as_singleton(kv[1]))
| "Remove None" >> beam.Filter(lambda v: len(v) > 0)
| "Unwrap Values" >> beam.Map(lambda v: v[0])
)
Expand All @@ -115,7 +100,6 @@ def process(self, row):


class Intersection(beam.PTransform):

"""Produces the intersection of given `PCollection`s.

Given a list of `PCollection`s,
Expand Down Expand Up @@ -152,7 +136,6 @@ def process(self, row):


class FilterByKey(beam.PTransform):

"""Filters elements by their keys.

The constructor receives one or more `PCollection`s of `K`s,
Expand All @@ -179,8 +162,7 @@ def expand(self, pcoll):

@beam.ptransform_fn
def FilterByKeyUsingSideInput(pcoll, lookup_entries, filter_key):
"""
Filters a single collection by a single lookup collection, using a common key.
"""Filters a single collection by a single lookup collection, using a common key.

Given:
- a `PCollection` (lookup_entries) of `(V)`, as a lookup collection
Expand Down Expand Up @@ -307,7 +289,6 @@ def process(self, row):


class DifferencePerKey(beam.PTransform):

"""Produces the difference per key between two `PCollection`s.

Given two `PCollection`s of `V`,
Expand Down Expand Up @@ -351,20 +332,18 @@ def MaxSelectPerKey(pcoll, index_keys, sort_key_fn, reverse=False):
return (
pcoll
| f"Index by {index_keys}" >> IndexBy(*index_keys)
| f"Top 1 per key"
>> beam.combiners.Top.PerKey(1, key=sort_key_fn, reverse=reverse)
| "Top 1 per key" >> beam.combiners.Top.PerKey(1, key=sort_key_fn, reverse=reverse)
| "De-Index" >> beam.Map(lambda k_v: k_v[1][0])
)


@beam.ptransform_fn
def PartitionRowsContainingNone(pcoll):
"""
Emits two tagged pcollections:
"""Emits two tagged pcollections:

- None: Default emitted collection.
Rows are guaranteed not to have any `None` values
- contains_none: At least one column in the row had a `None` value
- None: Default emitted collection.
Rows are guaranteed not to have any `None` values
- contains_none: At least one column in the row had a `None` value
"""

def _separator(row):
Expand Down
7 changes: 5 additions & 2 deletions carling/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#!/usr/bin/env python3
from .avro_schema import generate_avro_schema_from_template, load_avro_schema

from .avro_schema import load_avro_schema, generate_avro_schema_from_template
__all__ = (
"generate_avro_schema_from_template",
"load_avro_schema",
)
16 changes: 9 additions & 7 deletions carling/iter_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from .iter_utils import (
take_top,
is_some,
is_none,
unwrap,
unwrap_or_none,
take_as_singleton,
from .iter_utils import is_none, is_some, take_as_singleton, take_top, unwrap, unwrap_or_none

__all__ = (
"is_none",
"is_some",
"take_as_singleton",
"take_top",
"unwrap",
"unwrap_or_none",
)
5 changes: 0 additions & 5 deletions carling/iter_utils/iter_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,3 @@
"""
Generic iter utils
Author: Tsuyoki Kumazaki (tsuyoki@mcdigital.jp)
"""

import itertools


Expand Down
24 changes: 6 additions & 18 deletions carling/mapping.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,20 @@
"""
Generic mapping transform utils

Author: Tsuyoki Kumazaki (tsuyoki@mcdigital.jp)
"""

import json

import apache_beam as beam


def Label(**labels):
"""Labels all elements.
"""
"""Labels all elements."""
return "Label" >> beam.Map(lambda r: {**r, **labels})


def Select(*keys):
"""Removes all columns which are not specified in `*keys`.
"""
"""Removes all columns which are not specified in `*keys`."""
return "Select" >> beam.Map(lambda r: {k: r[k] for k in keys})


def Project(*keys):
"""Transforms each element into a tuple of values of the specified columns.
"""
"""Transforms each element into a tuple of values of the specified columns."""
return "Project" >> beam.Map(lambda r: tuple(r[k] for k in keys))


Expand All @@ -45,8 +36,7 @@ def _decimal_default_proc(obj):


def Stringify():
"""Transforms each element into its JSON representation.
"""
"""Transforms each element into its JSON representation."""

def s(obj):
return json.dumps(obj, default=_decimal_default_proc)
Expand All @@ -66,8 +56,7 @@ def IndexBySingle(key):


def RenameFromTo(from_to_key_mapping):
"""Rename columns according to `from_to_key_mapping`.
"""
"""Rename columns according to `from_to_key_mapping`."""

def rename(row):
res = dict(row)
Expand All @@ -81,8 +70,7 @@ def rename(row):


def Exclude(*keys):
"""Removes all columns specified in `*keys`.
"""
"""Removes all columns specified in `*keys`."""

def exclude(row):
res = dict(row)
Expand Down
Empty file added carling/py.typed
Empty file.
2 changes: 2 additions & 0 deletions carling/test_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
from .test_utils import pprint_equal_to

__all__ = ("pprint_equal_to",)
Loading