Skip to content

Commit

Permalink
Merge pull request #13 from jabardigitalservice/feature/config_conver…
Browse files Browse the repository at this point in the history
…ter_spreadsheet

Feature/config converter spreadsheet
  • Loading branch information
pipinfitriadi authored Oct 30, 2023
2 parents 5a72827 + 490cc7d commit 0a6ad73
Show file tree
Hide file tree
Showing 13 changed files with 233 additions and 34 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ jobs:
. venv/bin/activate
echo PATH=$PATH >> $GITHUB_ENV
- name: Install package
run: pip install '.[converter,s3]'
run: pip install '.[converter,s3,gsheet]'
- name: Test
run: python -m coverage run -m unittest
- name: Display coverage
Expand Down Expand Up @@ -145,7 +145,7 @@ jobs:
- name: Generate docs
run: |
. venv/bin/activate
pip install '.[converter,s3]'
pip install '.[converter,s3,gsheet]'
python -m pdoc \
datasae \
--html \
Expand Down
2 changes: 1 addition & 1 deletion .vscode/tasks.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"label": "Python: Package install",
"type": "shell",
"command": "${command:python.interpreterPath}",
"args": ["-m", "pip", "install", "'.[converter,s3]'"]
"args": ["-m", "pip", "install", "'.[converter,s3,gsheet]'"]
},
{
"label": "Python: Package show",
Expand Down
26 changes: 24 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ Data Quality Framework provided by Jabar Digital Service

## Converter

[https://github.com/jabardigitalservice/DataSae/blob/be7cb20fbd54293bae8a4949bf4bb6a1da1b87f6/tests/data/config.json#L1-L8](https://github.com/jabardigitalservice/DataSae/blob/be7cb20fbd54293bae8a4949bf4bb6a1da1b87f6/tests/data/config.json#L1-L8)
[https://github.com/jabardigitalservice/DataSae/blob/29d18db7d7660fadc88e8f9ef12902f604b20161/tests/data/config.json#L1-L12](https://github.com/jabardigitalservice/DataSae/blob/29d18db7d7660fadc88e8f9ef12902f604b20161/tests/data/config.json#L1-L12)

[https://github.com/jabardigitalservice/DataSae/blob/be7cb20fbd54293bae8a4949bf4bb6a1da1b87f6/tests/data/config.yaml#L1-L5](https://github.com/jabardigitalservice/DataSae/blob/be7cb20fbd54293bae8a4949bf4bb6a1da1b87f6/tests/data/config.yaml#L1-L5)
[https://github.com/jabardigitalservice/DataSae/blob/29d18db7d7660fadc88e8f9ef12902f604b20161/tests/data/config.yaml#L1-L8](https://github.com/jabardigitalservice/DataSae/blob/29d18db7d7660fadc88e8f9ef12902f604b20161/tests/data/config.yaml#L1-L8)

### S3

Expand All @@ -45,3 +45,25 @@ df = s3('bucket_name', 'path/file_name.json')
df = s3('bucket_name', 'path/file_name.parquet')
df = s3('bucket_name', 'path/file_name.xlsx', sheet_name='Sheet1')
```

### Google Spreadsheet

[https://github.com/jabardigitalservice/DataSae/blob/4308324d066c6627936773ab2d5b990adaa60100/tests/data/creds.json#L1-L12](https://github.com/jabardigitalservice/DataSae/blob/4308324d066c6627936773ab2d5b990adaa60100/tests/data/creds.json#L1-L12)

```sh
pip install 'DataSae[converter,gsheet]'
```

```py
from datasae.converter import Config

# From JSON
config = Config('DataSae/tests/data/config.json')
gsheet = config('test_gsheet')
df = gsheet('gsheet_id', 'sheet_name')

# From YAML
config = Config('DataSae/tests/data/config.yaml')
gsheet = config('test_gsheet')
df = gsheet('gsheet_id', 'sheet_name')
```
12 changes: 10 additions & 2 deletions datasae/converter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class DataSourceType(CaseInsensitiveEnum):
sources.
"""

GSHEET = 'gsheet'
S3 = 's3'


Expand Down Expand Up @@ -222,8 +223,15 @@ def __call__(self, name: str) -> DataSource:
for key, value in config.get(name, {}).items()
}
source_type: DataSourceType = data_source['type']
func: Callable = lambda **_: None

if source_type is DataSourceType.S3:
if source_type is DataSourceType.GSHEET:
from .gsheet import GSheet

func = GSheet
elif source_type is DataSourceType.S3:
from .s3 import S3

return S3(**data_source)
func = S3

return func(**data_source)
67 changes: 67 additions & 0 deletions datasae/converter/gsheet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/usr/bin/env python3

# Copyright (c) Free Software Foundation, Inc. All rights reserved.
# Licensed under the AGPL-3.0-only License. See LICENSE in the project root
# for license information.

"""Google Spreadsheet library."""

from __future__ import annotations
from dataclasses import dataclass
import warnings

from google.oauth2.service_account import Credentials
import gspread
from pandas import DataFrame

from . import DataSource


@dataclass(repr=False)
class GSheet(DataSource):
    """
    Data source that reads worksheets from a Google Spreadsheet.

    Args:
        client_secret_file (str): Path to the service account credentials
            file used to authorise against Google Spreadsheet.
    """

    client_secret_file: str

    @property
    def connection(self) -> Credentials:
        """
        Build the service account credentials for Google Spreadsheet.

        The credentials are loaded from ``client_secret_file`` on every
        access and are requested with the spreadsheets scope.

        Returns:
            Credentials: Credentials of the Google service account.
        """
        scopes = ['https://www.googleapis.com/auth/spreadsheets']

        return Credentials.from_service_account_file(
            self.client_secret_file, scopes=scopes
        )

    def __call__(
        self, gsheet_id: str, sheet_name: str,
    ) -> DataFrame:
        """
        Load one worksheet of a Google Spreadsheet into a Pandas DataFrame.

        Args:
            gsheet_id (str): The spreadsheet id taken from its URL.
            sheet_name (str): The name of the worksheet to read.

        Returns:
            DataFrame: The records of the worksheet.
        """
        # Warnings raised inside this block are recorded instead of being
        # shown, which keeps them out of the caller's output.
        with warnings.catch_warnings(record=True):
            warnings.simplefilter('always')
            client = gspread.authorize(self.connection)
            spreadsheet = client.open_by_key(gsheet_id)
            worksheet: gspread.Worksheet = spreadsheet.worksheet(sheet_name)

            # By default the row at index 0 becomes the column names.
            return DataFrame(worksheet.get_all_records())
2 changes: 1 addition & 1 deletion datasae/converter/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@

from __future__ import annotations
from dataclasses import dataclass
from pandas import DataFrame
from urllib3 import BaseHTTPResponse

from minio import Minio
from pandas import DataFrame

from . import DataSource, FileType

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ dynamic = ['version']
[project.optional-dependencies]
converter = ['pyyaml', 'fastparquet', 'openpyxl']
s3 = ['minio']
gsheet = ['google-api-python-client', 'gspread==5.12.0']

[project.urls]
Docs = 'https://jabardigitalservice.github.io/DataSae/'
Expand Down
4 changes: 4 additions & 0 deletions tests/data/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,9 @@
"endpoint": "play.min.io",
"access_key": "Q3AM3UQ867SPQQA43P2F",
"secret_key": "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG"
},
"test_gsheet": {
"type": "gsheet",
"client_secret_file": "tests/data/creds.json"
}
}
3 changes: 3 additions & 0 deletions tests/data/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@ test_s3:
endpoint: play.min.io
access_key: Q3AM3UQ867SPQQA43P2F
secret_key: zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG
test_gsheet:
type: gsheet
client_secret_file: tests/data/creds.json
12 changes: 12 additions & 0 deletions tests/data/creds.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"type": "service_account",
"project_id": "project_id",
"private_key_id": "private_key_id",
"private_key": "-----BEGIN PRIVATE KEY----------END PRIVATE KEY-----\n",
"client_email": "name@email.com",
"client_id": "client_id",
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
"token_uri": "https://oauth2.googleapis.com/token",
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
"client_x509_cert_url": "client_x509_cert_url"
}
26 changes: 26 additions & 0 deletions tests/test_converter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@
"""test_converter."""

from os import path
from string import ascii_lowercase
import unittest

from pandas import DataFrame
from pandas.testing import assert_frame_equal

from datasae.converter import Config, FileType

PATH: str = path.join('tests', 'data')
Expand All @@ -23,3 +27,25 @@ def test_case_insensitive_enum(self):
"""test_case_insensitive_enum."""
self.assertEqual('.JSON', FileType.JSON)
self.assertIs(FileType('.JSON'), FileType.JSON)


class DataFrameTestCase(unittest.TestCase):
    """
    Base test case that lets ``assertEqual`` compare Pandas DataFrames.

    Attributes:
        DATA (DataFrame): Expected frame with a single ``alphabet`` column
            holding the lowercase ASCII letters.
    """

    DATA: DataFrame = DataFrame({'alphabet': list(ascii_lowercase)})

    def assertDataframeEqual(self, a, b, msg=None):
        """
        Fail the test when two DataFrames are not equal.

        Args:
            a (DataFrame): First frame to compare.
            b (DataFrame): Second frame to compare.
            msg (str, optional): Custom message added to the failure text.

        Raises:
            AssertionError: ``self.failureException`` when the frames
                differ, carrying the difference reported by pandas.
        """
        try:
            assert_frame_equal(a, b)
        except AssertionError as e:
            # Keep the difference reported by pandas in the failure text;
            # raising only ``msg`` yields the text "None" when the caller
            # gave no custom message.
            raise self.failureException(
                self._formatMessage(msg, str(e))
            ) from e

    def setUp(self):
        """Register the DataFrame comparison used by ``assertEqual``."""
        self.addTypeEqualityFunc(DataFrame, self.assertDataframeEqual)

    def test_assertion_error(self):
        """Check that comparing different DataFrames fails the test."""
        with self.assertRaises(AssertionError):
            self.assertEqual(DataFrame({'a': [1]}), DataFrame())
76 changes: 76 additions & 0 deletions tests/test_converter/test_gsheet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env python3

# Copyright (c) Free Software Foundation, Inc. All rights reserved.
# Licensed under the AGPL-3.0-only License. See LICENSE in the project root
# for license information.

"""test_gsheet."""

import csv
from os import path
from unittest.mock import patch

from . import CONFIG_JSON, CONFIG_YAML, DataFrameTestCase, PATH
from datasae.converter import DataSourceType


class MockCreds:
    """Stand-in for the Google service account credentials."""

    def __init__(self, filename, **kwargs):
        """Accept the credentials file name and options; store nothing."""

    @property
    def project_id(self):
        """Return no project id; the attribute only has to exist."""
        return None


class GSheetTest(DataFrameTestCase):
    """Tests for the Google Spreadsheet data source."""

    def __init__(self, methodName: str = 'runTest'):
        """
        Prepare the data source name and the data source under test.

        Args:
            methodName (str): Name of the test method to run.
        """
        super().__init__(methodName)
        self.NAME: str = 'test_gsheet'
        self.gsheet = CONFIG_JSON(self.NAME)

    def test_config(self):
        """Check the data source loaded from the JSON and YAML configs."""
        expected_file: str = path.join(PATH, 'creds.json')

        for config in (CONFIG_JSON, CONFIG_YAML):
            source = config(self.NAME)
            self.assertIs(source.type, DataSourceType.GSHEET)
            self.assertEqual(source.client_secret_file, expected_file)

    @patch(
        'google.oauth2.service_account.Credentials.from_service_account_file',
        side_effect=MockCreds
    )
    def test_connection(self, _):
        """Check that the connection exposes a ``project_id`` attribute."""
        connection = self.gsheet.connection
        self.assertTrue(hasattr(connection, 'project_id'))

    @patch('gspread.authorize')
    @patch(
        'google.oauth2.service_account.Credentials.from_service_account_file',
        side_effect=MockCreds
    )
    def test_convert(self, _, mock_gspread):
        """Check that the worksheet records are converted to a DataFrame."""
        client = mock_gspread.return_value
        worksheet = client.open_by_key.return_value.worksheet.return_value

        with open(path.join(PATH, 'data.csv')) as file:
            # Columns without a header name are left out of every record.
            records = [
                {name: cell for name, cell in row.items() if name}
                for row in csv.DictReader(file)
            ]

        worksheet.get_all_records.return_value = records
        self.assertEqual(self.DATA, self.gsheet('gsheet_id', 'sheet_name'))
32 changes: 6 additions & 26 deletions tests/test_converter/test_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,10 @@

"""test_s3."""

from string import ascii_lowercase
from os import path
import unittest
from unittest.mock import patch

from pandas import DataFrame
from pandas.testing import assert_frame_equal

from . import CONFIG_JSON, CONFIG_YAML, PATH
from . import CONFIG_JSON, CONFIG_YAML, DataFrameTestCase, PATH
from datasae.converter import DataSourceType


Expand Down Expand Up @@ -45,20 +40,9 @@ def __init__(self, bucket_name: str, object_name: str):
}


class S3Test(unittest.TestCase):
class S3Test(DataFrameTestCase):
"""S3Test."""

def assertDataframeEqual(self, a, b, msg):
"""assertDataframeEqual."""
try:
assert_frame_equal(a, b)
except AssertionError as e:
raise self.failureException(msg) from e

def setUp(self):
"""Set up method."""
self.addTypeEqualityFunc(DataFrame, self.assertDataframeEqual)

def __init__(self, methodName: str = 'runTest'):
"""__init__."""
super().__init__(methodName)
Expand All @@ -84,27 +68,23 @@ def test_connection(self):
def test_convert(self, _):
"""test_convert."""
BUCKET_NAME: str = 'datasae'
DATA: DataFrame = DataFrame({'alphabet': list(ascii_lowercase)})

with self.assertRaises(AssertionError):
self.assertEqual(DATA, DataFrame())

self.assertEqual(
DATA,
self.DATA,
self.s3(
BUCKET_NAME, 'data.csv'
).drop('Unnamed: 0', axis='columns')
)
self.assertEqual(
DATA,
self.DATA,
self.s3(BUCKET_NAME, 'data.json').sort_index()
)
self.assertEqual(
DATA,
self.DATA,
self.s3(BUCKET_NAME, 'data.parquet')
)
self.assertEqual(
DATA,
self.DATA,
self.s3(
BUCKET_NAME, 'data.xlsx', sheet_name='Sheet1'
).drop('Unnamed: 0', axis='columns')
Expand Down

0 comments on commit 0a6ad73

Please sign in to comment.