Skip to content

Data Quality Framework provides by Jabar Digital Service

License

Notifications You must be signed in to change notification settings

jabardigitalservice/DataSae

Repository files navigation

DataSae

Docs License PyPI - Python Version PyPI - Version GitHub Action Coverage

Data Quality Framework provides by Jabar Digital Service

Configuration Files

{
"test_gsheet": {
"type": "gsheet",
"client_secret_file": "tests/data/creds.json",
"gsheet_id": "gsheet_id",
"checker": [
{
"sheet_name": "Sheet1",
"column": {
"alphabet": {
"string": {
"is_lowercase": null,
"regex_contain": "[a-z]"
}
},
"ALPHABET": {
"datasae.string.String": {
"is_uppercase": null,
"regex_contain": "[A-Z]"
}
}
}
}
]
},
"test_local": {
"type": "local",
"checker": [
{
"file_path": "tests/data/data.csv",
"sep": ",",
"column": {
"alphabet": {
"string": {
"is_lowercase": null,
"regex_contain": "[a-z]"
}
},
"ALPHABET": {
"datasae.string.String": {
"is_uppercase": null,
"regex_contain": "[A-Z]"
}
}
}
},
{
"file_path": "tests/data/data.xlsx",
"column": {
"alphabet": {
"string": {
"is_lowercase": null,
"regex_contain": "[a-z]"
}
},
"ALPHABET": {
"datasae.string.String": {
"is_uppercase": null,
"regex_contain": "[A-Z]"
}
}
}
}
]
},
"test_mariadb_or_mysql": {
"type": "sql",
"drivername": "mysql+pymysql",
"username": "root",
"password": "testpassword",
"host": "localhost",
"port": 3306,
"database": "mysql",
"checker": [
{
"query": "tests/data/query.sql",
"column": {
"column_name": {
"integer": {
"equal_to": 1,
"less_than": 3,
"in_range": {
"lower_limit": 0,
"upper_limit": 2
}
}
},
"another_column_name": {
"datasae.integer.Integer": {
"equal_to": 5,
"less_than": 10,
"in_range": [4, 6]
}
},
"boolean_column_name": {
"datasae.boolean.Boolean": {"is_bool": null}
}
}
}
]
},
"test_postgresql": {
"type": "datasae.converter.sql.Sql",
"drivername": "postgresql",
"username": "postgres",
"password": "testpassword",
"host": "localhost",
"port": 5432,
"database": "postgres",
"checker": [
{
"query": "select 1 column_name, 5 another_column_name, false boolean_column_name;",
"column": {
"column_name": {
"integer": {
"equal_to": 1,
"less_than": 3,
"in_range": {
"lower_limit": 0,
"upper_limit": 2
}
}
},
"another_column_name": {
"datasae.integer.Integer": {
"equal_to": 5,
"less_than": 10,
"in_range": [4, 6]
}
},
"boolean_column_name": {
"boolean": {"is_bool": null}
}
}
}
]
},
"test_s3": {
"type": "s3",
"endpoint": "play.min.io",
"access_key": "Q3AM3UQ867SPQQA43P2F",
"secret_key": "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG",
"bucket_name": "bucket_name",
"checker": [
{
"object_name": "data.csv",
"column": {
"alphabet": {
"string": {
"is_lowercase": null,
"regex_contain": "[a-z]"
}
},
"ALPHABET": {
"datasae.string.String": {
"is_uppercase": null,
"regex_contain": "[A-Z]"
}
}
}
},
{
"object_name": "data.xlsx",
"bucket_name": "another_bucket_name",
"sheet_name": "Sheet1",
"column": {
"alphabet": {
"string": {
"is_lowercase": null,
"regex_contain": "[a-z]"
}
},
"ALPHABET": {
"datasae.string.String": {
"is_uppercase": null,
"regex_contain": "[A-Z]"
}
}
}
}
]
}
}

test_gsheet:
type: gsheet
client_secret_file: tests/data/creds.json
gsheet_id: gsheet_id
checker:
- sheet_name: Sheet1
column:
alphabet:
string:
is_lowercase:
regex_contain: '[a-z]'
ALPHABET:
datasae.string.String:
is_uppercase:
regex_contain: '[A-Z]'
test_local:
type: local
checker:
- file_path: tests/data/data.csv
sep: ','
column:
alphabet:
string:
is_lowercase:
regex_contain: '[a-z]'
ALPHABET:
datasae.string.String:
is_uppercase:
regex_contain: '[A-Z]'
- file_path: tests/data/data.xlsx
column:
alphabet:
string:
is_lowercase:
regex_contain: '[a-z]'
ALPHABET:
datasae.string.String:
is_uppercase:
regex_contain: '[A-Z]'
test_mariadb_or_mysql:
type: sql
drivername: mysql+pymysql
username: root
password: testpassword
host: localhost
port: 3306
database: mysql
checker:
- query: tests/data/query.sql
column:
column_name:
integer:
equal_to: 1
less_than: 3
in_range:
lower_limit: 0
upper_limit: 2
another_column_name:
datasae.integer.Integer:
equal_to: 5
less_than: 10
in_range: [4, 6]
boolean_column_name:
datasae.boolean.Boolean:
is_bool:
test_postgresql:
type: datasae.converter.sql.Sql
drivername: postgresql
username: postgres
password: testpassword
host: localhost
port: 5432
database: postgres
checker:
- query: select 1 column_name, 5 another_column_name, false boolean_column_name;
column:
column_name:
integer:
equal_to: 1
less_than: 3
in_range:
lower_limit: 0
upper_limit: 2
another_column_name:
datasae.integer.Integer:
equal_to: 5
less_than: 10
in_range: [4, 6]
boolean_column_name:
boolean:
is_bool:
test_s3:
type: s3
endpoint: play.min.io
access_key: Q3AM3UQ867SPQQA43P2F
secret_key: zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG
bucket_name: bucket_name
checker:
- object_name: data.csv
column:
alphabet:
string:
is_lowercase:
regex_contain: '[a-z]'
ALPHABET:
datasae.string.String:
is_uppercase:
regex_contain: '[A-Z]'
- object_name: data.xlsx
bucket_name: another_bucket_name
sheet_name: Sheet1
column:
alphabet:
string:
is_lowercase:
regex_contain: '[a-z]'
ALPHABET:
datasae.string.String:
is_uppercase:
regex_contain: '[A-Z]'

Checker for Data Quality

Note

You can use DataSae Column's Function Based on Data Type for adding column checker function data quality in the config file.

pip install 'DataSae[converter,gsheet,s3,sql]'

Command Line Interface (CLI)

datasae --help
 
 Usage: datasae [OPTIONS] FILE_PATH
 
 Checker command.
 Creates checker result based on the configuration provided in the checker section of the data source's configuration file.
╭─ Arguments ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ *    file_path      TEXT  The source path of the .json or .yaml file [default: None] [required]                                    │
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
╭─ Options ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ --config-name                       TEXT  If the config name is not set, it will create all of the checker results [default: None] │
│ --yaml-display    --json-display          [default: yaml-display]                                                                  │
│ --save-to-file-path                 TEXT  [default: None]                                                                          │
│ --help                                    Show this message and exit.                                                              │
╰────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯

Example commands:

datasae DataSae/tests/data/config.yaml # Check all data qualities on configuration
datasae DataSae/tests/data/config.yaml --config-name test_local # Check data quality by config name

Tip

Actually, we have example for DataSae implementation in Apache Airflow, but for now it is for private use only. Internal developer can see it at this git repository.

Example results:

{
"test_gsheet": [
{
"sheet_name": "Sheet1",
"column": {
"alphabet": {
"string": {
"is_lowercase": {
"params": null,
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
},
"regex_contain": {
"params": "[a-z]",
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
}
}
},
"ALPHABET": {
"datasae.string.String": {
"is_uppercase": {
"params": null,
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
},
"regex_contain": {
"params": "[A-Z]",
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
}
}
}
}
}
],
"test_local": [
{
"file_path": "tests/data/data.csv",
"sep": ",",
"column": {
"alphabet": {
"string": {
"is_lowercase": {
"params": null,
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
},
"regex_contain": {
"params": "[a-z]",
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
}
}
},
"ALPHABET": {
"datasae.string.String": {
"is_uppercase": {
"params": null,
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
},
"regex_contain": {
"params": "[A-Z]",
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
}
}
}
}
},
{
"file_path": "tests/data/data.xlsx",
"column": {
"alphabet": {
"string": {
"is_lowercase": {
"params": null,
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
},
"regex_contain": {
"params": "[a-z]",
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
}
}
},
"ALPHABET": {
"datasae.string.String": {
"is_uppercase": {
"params": null,
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
},
"regex_contain": {
"params": "[A-Z]",
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
}
}
}
}
}
],
"test_mariadb_or_mysql": [
{
"query": "tests/data/query.sql",
"column": {
"column_name": {
"integer": {
"equal_to": {
"params": 1,
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
},
"less_than": {
"params": 3,
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
},
"in_range": {
"params": {
"lower_limit": 0,
"upper_limit": 2
},
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
}
}
},
"another_column_name": {
"datasae.integer.Integer": {
"equal_to": {
"params": 5,
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
},
"less_than": {
"params": 10,
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
},
"in_range": {
"params": [
4,
6
],
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
}
}
},
"boolean_column_name": {
"datasae.boolean.Boolean": {
"is_bool": {
"params": null,
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
}
}
}
}
}
],
"test_postgresql": [
{
"query": "select 1 column_name, 5 another_column_name, false boolean_column_name;",
"column": {
"column_name": {
"integer": {
"equal_to": {
"params": 1,
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
},
"less_than": {
"params": 3,
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
},
"in_range": {
"params": {
"lower_limit": 0,
"upper_limit": 2
},
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
}
}
},
"another_column_name": {
"datasae.integer.Integer": {
"equal_to": {
"params": 5,
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
},
"less_than": {
"params": 10,
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
},
"in_range": {
"params": [
4,
6
],
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
}
}
},
"boolean_column_name": {
"boolean": {
"is_bool": {
"params": null,
"result": {
"score": 1.0,
"valid": 1,
"invalid": 0,
"warning": {}
}
}
}
}
}
}
],
"test_s3": [
{
"object_name": "data.csv",
"column": {
"alphabet": {
"string": {
"is_lowercase": {
"params": null,
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
},
"regex_contain": {
"params": "[a-z]",
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
}
}
},
"ALPHABET": {
"datasae.string.String": {
"is_uppercase": {
"params": null,
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
},
"regex_contain": {
"params": "[A-Z]",
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
}
}
}
}
},
{
"object_name": "data.xlsx",
"bucket_name": "another_bucket_name",
"sheet_name": "Sheet1",
"column": {
"alphabet": {
"string": {
"is_lowercase": {
"params": null,
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
},
"regex_contain": {
"params": "[a-z]",
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
}
}
},
"ALPHABET": {
"datasae.string.String": {
"is_uppercase": {
"params": null,
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
},
"regex_contain": {
"params": "[A-Z]",
"result": {
"score": 1.0,
"valid": 26,
"invalid": 0,
"warning": {}
}
}
}
}
}
}
]
}

Python Code

from datasae.converter import Config

# From JSON
config = Config('DataSae/tests/data/config.json')

# From YAML
config = Config('DataSae/tests/data/config.yaml')

# Check all data qualities on configuration
config.checker  # dict result

# Check data quality by config name
config('test_local').checker  # list of dict result
config('test_gsheet').checker  # list of dict result
config('test_s3').checker  # list of dict result
config('test_mariadb_or_mysql').checker  # list of dict result
config('test_postgresql').checker  # list of dict result

Converter from Any Data Source to Pandas's DataFrame

Note

Currently support to convert from CSV, JSON, Parquet, Excel, Google Spreadsheet, and SQL.

pip install 'DataSae[converter]'

Local Computer

from datasae.converter import Config

# From JSON
config = Config('DataSae/tests/data/config.json')

# From YAML
config = Config('DataSae/tests/data/config.yaml')

# Local computer file to DataFrame
local = config('test_local')

df = local('path/file_name.csv', sep=',')
df = local('path/file_name.json')
df = local('path/file_name.parquet')
df = local('path/file_name.xlsx', sheet_name='Sheet1')

df = local('path/file_name.csv')  # Default: sep = ','
df = local('path/file_name.json')
df = local('path/file_name.parquet')
df = local('path/file_name.xlsx')  # Default: sheet_name = 'Sheet1'

Google Spreadsheet

{
"type": "service_account",
"project_id": "project_id",
"private_key_id": "private_key_id",
"private_key": "-----BEGIN PRIVATE KEY----------END PRIVATE KEY-----\n",
"client_email": "name@email.com",
"client_id": "client_id",
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
"token_uri": "https://oauth2.googleapis.com/token",
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
"client_x509_cert_url": "client_x509_cert_url"
}

pip install 'DataSae[converter,gsheet]'
from datasae.converter import Config

# From JSON
config = Config('DataSae/tests/data/config.json')

# From YAML
config = Config('DataSae/tests/data/config.yaml')

# Google Spreadsheet to DataFrame
gsheet = config('test_gsheet')
df = gsheet('Sheet1')
df = gsheet('Sheet1', 'gsheet_id')

S3

pip install 'DataSae[converter,s3]'
from datasae.converter import Config

# From JSON
config = Config('DataSae/tests/data/config.json')

# From YAML
config = Config('DataSae/tests/data/config.yaml')

# S3 object to DataFrame
s3 = config('test_s3')

df = s3('path/file_name.csv', sep=',')
df = s3('path/file_name.json')
df = s3('path/file_name.parquet')
df = s3('path/file_name.xlsx', sheet_name='Sheet1')

df = s3('path/file_name.csv', 'bucket_name')  # Default: sep = ','
df = s3('path/file_name.json', 'bucket_name')
df = s3('path/file_name.parquet', 'bucket_name')
df = s3('path/file_name.xlsx', 'bucket_name')  # Default: sheet_name = 'Sheet1'

SQL

pip install 'DataSae[converter,sql]'

Important

For MacOS users, if pip install failed at mysqlclient, please run this and retry to install again after that.

brew install mysql

MariaDB or MySQL

from datasae.converter import Config

# From JSON
config = Config('DataSae/tests/data/config.json')

# From YAML
config = Config('DataSae/tests/data/config.yaml')

# MariaDB or MySQL to DataFrame
mariadb_or_mysql = config('test_mariadb_or_mysql')
df = mariadb_or_mysql('select 1 column_name from schema_name.table_name;')
df = mariadb_or_mysql('path/file_name.sql')

PostgreSQL

from datasae.converter import Config

# From JSON
config = Config('DataSae/tests/data/config.json')

# From YAML
config = Config('DataSae/tests/data/config.yaml')

# PostgreSQL to DataFrame
postgresql = config('test_postgresql')
df = postgresql('select 1 column_name from schema_name.table_name;')
df = postgresql('path/file_name.sql')