Skip to content

Commit

Permalink
TDL-14475 added unsupported feature and unittests (#47)
Browse files Browse the repository at this point in the history
* added unsupported feature and unittests

* added code comments

* fixed indent

* fixed indentation

* resolved a bug of writing md when 2 consecutive empty headers

* updated the logic for consecutive empty headers

* resolved comments

* added test case for consecutive empty headers

* added comments

* resolved circleci errors

* resolved comments

Co-authored-by: namrata270998 <namrata.brahmbhatt@crestdatasystems.com>
Co-authored-by: prijendev <prijen.khokhani@crestdatasys.com>
  • Loading branch information
3 people authored Dec 13, 2021
1 parent 98e032a commit d76ae96
Show file tree
Hide file tree
Showing 3 changed files with 212 additions and 5 deletions.
16 changes: 15 additions & 1 deletion tap_google_sheets/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,8 @@ def get_sheet_schema_columns(sheet):
skipped = skipped + 1
column_index_str = str(column_index).zfill(2)
column_name = '__sdc_skip_col_{}'.format(column_index_str)
col_properties = {'type': ['null', 'string']}
# unsupported field description if the field is to be skipped
col_properties = {'type': ['null', 'string'], 'description': 'Column is unsupported and would be skipped because header is not available'}
column_gs_type = 'stringValue'
LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
sheet_title, column_name, column_letter))
Expand All @@ -185,6 +186,11 @@ def get_sheet_schema_columns(sheet):
# skipped = 2 consecutive skipped headers
# Remove prior_header column_name
sheet_json_schema['properties'].pop(prior_header, None)
# prior_index is the index of the column immediately before the current column
prior_index = column_index - 1
# Add a boolean key `prior_column_skipped` to mark the column as one of the two columns with consecutive empty headers,
# because when two consecutive headers are empty, neither column should be included in the schema or in the metadata
columns[prior_index-1]['prior_column_skipped'] = True
LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
sheet_title, column_name, column_letter))
break
Expand Down Expand Up @@ -306,6 +312,14 @@ def get_schemas(client, spreadsheet_id):
valid_replication_keys=None,
replication_method='FULL_TABLE'
)
# For each column, check whether `columnSkipped` is true and `prior_column_skipped` is false or None
# in the column's dict. `prior_column_skipped` is true only for the first of two consecutive
# empty-header columns. If the check passes, update the inclusion property to `unsupported`.
for column in columns:
if column.get('columnSkipped') and not column.get('prior_column_skipped'):
mdata = metadata.to_map(sheet_mdata)
sheet_mdata = metadata.write(mdata, ('properties', column.get('columnName')), 'inclusion', 'unsupported')
sheet_mdata = metadata.to_list(mdata)
field_metadata[sheet_title] = sheet_mdata

return schemas, field_metadata
10 changes: 6 additions & 4 deletions tests/test_google_sheets_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,13 @@ def test_run(self):
expected_automatic_fields.remove('modifiedTime')
self.assertSetEqual(expected_automatic_fields, actual_automatic_fields)

# verify columns missing headers or missing values where __sdc_row = 2
# verify missing values where __sdc_row = 2
# are marked with inclusion of unsupported
# BUG_TDL-14475 | https://jira.talendforge.org/browse/TDL-14475
failing_streams = {'sadsheet-column-skip-bug', 'Item Master'} # BUG_TDL-14475
if stream not in failing_streams: # BUG_TDL-14475
# The card TDL-14475 was only about adding unsupported
# inclusion property for empty header values. The sheet
# `Item Master` has columns with empty row values
failing_streams = {'Item Master'}
if stream not in failing_streams:
self.assertSetEqual(expected_unsupported_fields, actual_unsupported_fields)

# verify that all other fields have inclusion of available
Expand Down
191 changes: 191 additions & 0 deletions tests/unittests/test_unsupported_fields.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
from tap_google_sheets.client import GoogleClient
import unittest
from unittest import mock
from tap_google_sheets import schema

class TestUnsupportedFields(unittest.TestCase):
    """Unit tests for how columns with empty headers appear in schema and metadata."""

    @mock.patch('tap_google_sheets.GoogleClient.get')
    def test_unsupported_fields(self, mocked_get):
        """
        Test whether the inclusion property for the skipped property is changed to `unsupported`
        and the description is added in the schema
        """
        # Mocked API response: the first header cell is empty, the next two have headers
        sheet = {
            "sheets": [{
                "properties": {
                    "sheetId": 0,
                    "title": "Sheet1",
                    "index": 0,
                    "sheetType": "GRID",
                    "gridProperties": {
                        "rowCount": 3,
                        "columnCount": 26
                    }
                },
                "data": [
                    {
                        "rowData": [
                            {
                                "values": [
                                    {},
                                    {
                                        "formattedValue": "abc"
                                    },
                                    {
                                        "formattedValue": "def"
                                    }
                                ]
                            },
                            {
                                "values": [
                                    {
                                        "formattedValue": "A"
                                    },
                                    {
                                        "formattedValue": "B"
                                    },
                                    {
                                        "formattedValue": "4"
                                    }
                                ]
                            }
                        ]
                    }
                ]}
            ]
        }
        expected_schema = {
            "type": "object",
            "additionalProperties": False,
            "properties": {
                "__sdc_spreadsheet_id": {
                    "type": [
                        "null",
                        "string"
                    ]
                },
                "__sdc_sheet_id": {
                    "type": [
                        "null",
                        "integer"
                    ]
                },
                "__sdc_row": {
                    "type": [
                        "null",
                        "integer"
                    ]
                },
                "__sdc_skip_col_01": {
                    "type": [
                        "null",
                        "string"
                    ],
                    "description": "Column is unsupported and would be skipped because header is not available"
                },
                "abc": {
                    "type": [
                        "null",
                        "string"
                    ]
                },
                "def": {
                    "type": [
                        "null",
                        "string"
                    ]
                }
            }
        }
        mocked_get.return_value = sheet
        schemas, field_metadata = schema.get_schemas(
            GoogleClient('client_id', 'client_secret', 'refresh_token'), 'sheet_id')
        # check if the schemas are equal, hence verifying if the description is present
        self.assertEqual(expected_schema, schemas["Sheet1"])

        # Collect the metadata entries for the skipped column first, so that the
        # test fails (instead of passing vacuously) when no such entry exists.
        skipped_entries = [
            entry for entry in field_metadata["Sheet1"]
            if entry["breadcrumb"] and '__sdc_skip_col_01' in entry["breadcrumb"]
        ]
        self.assertTrue(
            skipped_entries,
            "no metadata entry found for the skipped column '__sdc_skip_col_01'")
        for entry in skipped_entries:
            # check if the inclusion property is updated to `unsupported`
            self.assertEqual(entry["metadata"]["inclusion"], "unsupported")

    def test_two_skipped_columns(self):
        """
        Test whether the columns has the `prior_column_skipped` key which is True as there are 2 consecutive empty headers
        and the `sheet_json_schema` has only 3 keys
        """
        # Sheet whose first two header cells are empty (two consecutive skipped columns)
        sheet = {
            "properties": {
                "sheetId": 1825500887,
                "title": "Sheet11",
                "index": 2,
                "sheetType": "GRID",
                "gridProperties": {
                    "rowCount": 1000,
                    "columnCount": 26
                }
            },
            "data": [
                {
                    "rowData": [
                        {
                            "values": [
                                {},
                                {},
                                {
                                    "formattedValue": "abd",
                                }
                            ]
                        },
                        {
                            "values": [
                                {
                                    "formattedValue": "1",
                                },
                                {
                                    "formattedValue": "3",
                                },
                                {
                                    "formattedValue": "45",
                                }
                            ]
                        }
                    ],
                    "rowMetadata": [
                        {
                            "pixelSize": 21
                        }
                    ],
                    "columnMetadata": [
                        {
                            "pixelSize": 100
                        },
                    ]
                }
            ]
        }
        # Only the first skipped column is expected in `columns`, flagged with `prior_column_skipped`
        expected_columns = [
            {
                'columnIndex': 1,
                'columnLetter': 'A',
                'columnName': '__sdc_skip_col_01',
                'columnType': 'stringValue',
                'columnSkipped': True,
                'prior_column_skipped': True
            }
        ]
        # Only the three `__sdc_*` bookkeeping properties are expected in the schema
        expected_schema = {
            'type': 'object',
            'additionalProperties': False,
            'properties': {
                '__sdc_spreadsheet_id': {
                    'type': ['null', 'string']
                },
                '__sdc_sheet_id': {
                    'type': ['null', 'integer']
                },
                '__sdc_row': {
                    'type': ['null', 'integer']
                }
            }
        }
        sheet_json_schema, columns = schema.get_sheet_schema_columns(sheet)
        self.assertEqual(sheet_json_schema, expected_schema)  # test the schema is as expected
        self.assertEqual(columns, expected_columns)  # test if the columns is as expected

0 comments on commit d76ae96

Please sign in to comment.