TDL-14475 added unsupported feature and unittests (#47)

* added unsupported feature and unittests * added code comments * fixed indent * fixed indentation * resolved a bug of writing md when 2 consecutive empty headers * updated the logic for consecutive empty headers * rsolved comments * added test case for consecutive empty headers * added comments * resolved circleci errors * resolved comments Co-authored-by: namrata270998 <namrata.brahmbhatt@crestdatasystems.com> Co-authored-by: prijendev <prijen.khokhani@crestdatasys.com>
singer-io · Dec 13, 2021 · d76ae96 · d76ae96
1 parent 98e032a
commit d76ae96
Show file tree

Hide file tree

Showing 3 changed files with 212 additions and 5 deletions.
diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py
@@ -175,7 +175,8 @@ def get_sheet_schema_columns(sheet):
             skipped = skipped + 1
             column_index_str = str(column_index).zfill(2)
             column_name = '__sdc_skip_col_{}'.format(column_index_str)
-            col_properties = {'type': ['null', 'string']}
+            # unsupported field description if the field is to be skipped
+            col_properties = {'type': ['null', 'string'], 'description': 'Column is unsupported and would be skipped because header is not available'}
             column_gs_type = 'stringValue'
             LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. SHEET: {}, COL: {}, CELL: {}1'.format(
                 sheet_title, column_name, column_letter))
@@ -185,6 +186,11 @@ def get_sheet_schema_columns(sheet):
             # skipped = 2 consecutive skipped headers
             # Remove prior_header column_name
             sheet_json_schema['properties'].pop(prior_header, None)
+            # prior_index is the index of the column immediately before the current column
+            prior_index = column_index - 1
+            # Flag the prior column with a boolean key `prior_column_skipped`: it is the first of two columns with
+            # consecutive empty headers, and neither of those columns should appear in the schema or in the metadata
+            columns[prior_index-1]['prior_column_skipped'] = True
             LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format(
                 sheet_title, column_name, column_letter))
             break
@@ -306,6 +312,14 @@ def get_schemas(client, spreadsheet_id):
                             valid_replication_keys=None,
                             replication_method='FULL_TABLE'
                         )
+                        # For each column, set the inclusion property to `unsupported` when `columnSkipped` is true
+                        # and `prior_column_skipped` is false or None. `prior_column_skipped` is true only for the
+                        # first of two consecutive empty-header columns, which must not be given metadata here.
+                        for column in columns:
+                            if column.get('columnSkipped') and not column.get('prior_column_skipped'):
+                                mdata = metadata.to_map(sheet_mdata)
+                                sheet_mdata = metadata.write(mdata, ('properties', column.get('columnName')), 'inclusion', 'unsupported')
+                                sheet_mdata = metadata.to_list(mdata)
                         field_metadata[sheet_title] = sheet_mdata
 
     return schemas, field_metadata
diff --git a/tests/test_google_sheets_discovery.py b/tests/test_google_sheets_discovery.py
@@ -118,11 +118,13 @@ def test_run(self):
                     expected_automatic_fields.remove('modifiedTime')
                 self.assertSetEqual(expected_automatic_fields, actual_automatic_fields)
 
-                # verify columns missing headers or missing values where __sdc_row = 2
+                # verify missing values where __sdc_row = 2
                 # are marked with inclusion of unsupported
-                # BUG_TDL-14475 | https://jira.talendforge.org/browse/TDL-14475
-                failing_streams = {'sadsheet-column-skip-bug', 'Item Master'}  # BUG_TDL-14475
-                if stream not in failing_streams:  # BUG_TDL-14475
+                # The card TDL-14475 was only about adding unsupported 
+                # inclusion property for empty header values. The sheet 
+                # `Item Master` has columns with empty row values
+                failing_streams = {'Item Master'}
+                if stream not in failing_streams:
                     self.assertSetEqual(expected_unsupported_fields, actual_unsupported_fields)
 
                 # verify that all other fields have inclusion of available

diff --git a/tests/unittests/test_unsupported_fields.py b/tests/unittests/test_unsupported_fields.py
@@ -0,0 +1,191 @@
+from tap_google_sheets.client import GoogleClient
+import unittest
+from unittest import mock
+from tap_google_sheets import schema
+
+class TestUnsupportedFields(unittest.TestCase):
+    @mock.patch('tap_google_sheets.GoogleClient.get')
+    def test_unsupported_fields(self, mocked_get):
+        """
+        Test whether the inclusion property for the skipped column is changed to `unsupported`
+        and the description is added to the schema
+        """
+        sheet = {
+            "sheets": [{
+                "properties": {
+                    "sheetId": 0,
+                    "title": "Sheet1",
+                    "index": 0,
+                    "sheetType": "GRID",
+                    "gridProperties": {
+                        "rowCount": 3,
+                        "columnCount": 26
+                    }
+                },
+                "data": [
+                    {
+                        "rowData": [
+                            {
+                                "values": [
+                                    {},
+                                    {
+                                        "formattedValue": "abc"
+                                    },
+                                    {
+                                        "formattedValue": "def"
+                                    }
+                                ]
+                            },
+                            {
+                                "values": [
+                                    {
+                                        "formattedValue": "A"
+                                    },
+                                    {
+                                        "formattedValue": "B"
+                                    },
+                                    {
+                                        "formattedValue": "4"
+                                    }
+                                ]
+                            }
+                        ]
+                    }
+                ]}
+            ]
+        }
+        expected_schema = {
+            "type": "object",
+            "additionalProperties":False,
+            "properties": {
+                "__sdc_spreadsheet_id": {
+                    "type": [
+                        "null",
+                        "string"
+                    ]
+                },
+                "__sdc_sheet_id": {
+                    "type": [
+                        "null",
+                        "integer"
+                    ]
+                },
+                "__sdc_row": {
+                    "type": [
+                        "null",
+                        "integer"
+                    ]
+                },
+                "__sdc_skip_col_01": {
+                    "type": [
+                        "null",
+                        "string"
+                    ],
+                    "description": "Column is unsupported and would be skipped because header is not available"
+                },
+                "abc": {
+                    "type": [
+                        "null",
+                        "string"
+                    ]
+                },
+                "def": {
+                    "type": [
+                        "null",
+                        "string"
+                    ]
+                }
+            }
+        }
+        mocked_get.return_value = sheet
+        schemas, field_metadata = schema.get_schemas(GoogleClient('client_id', 'client_secret', 'refresh_token'), 'sheet_id')
+        # check if the schemas are equal, hence verifying if the description is present
+        self.assertEqual(expected_schema, schemas["Sheet1"])
+        for each in field_metadata["Sheet1"]:
+            if each["breadcrumb"] and '__sdc_skip_col_01' in each["breadcrumb"]:
+                # check if the inclusion property is updated to `unsupported`
+                self.assertEqual(each["metadata"]["inclusion"], "unsupported")
+
+    def test_two_skipped_columns(self):
+        """
+        Test that the returned column carries `prior_column_skipped: True` because there are 2 consecutive
+        empty headers, and that `sheet_json_schema` keeps only its 3 `__sdc_*` properties
+        """
+        sheet = {
+            "properties":{
+                "sheetId":1825500887,
+                "title":"Sheet11",
+                "index":2,
+                "sheetType":"GRID",
+                "gridProperties":{
+                    "rowCount":1000,
+                    "columnCount":26
+                }
+            },
+            "data":[
+                {
+                    "rowData":[
+                        {
+                        "values":[
+                            {},
+                            {},
+                            {
+                                "formattedValue":"abd",
+                            }
+                        ]
+                        },
+                        {
+                        "values":[
+                            {
+                                "formattedValue":"1",
+                            },
+                            {
+                                "formattedValue":"3",
+                            },
+                            {  
+                                "formattedValue":"45",
+                            }
+                        ]
+                        }
+                    ],
+                    "rowMetadata":[
+                        {
+                        "pixelSize":21
+                        }
+                    ],
+                    "columnMetadata":[
+                        {
+                        "pixelSize":100
+                        },
+                    ]
+                }
+            ]
+        }
+        expected_columns = [
+            {
+                'columnIndex': 1,
+                'columnLetter': 'A',
+                'columnName': '__sdc_skip_col_01',
+                'columnType': 'stringValue',
+                'columnSkipped': True,
+                'prior_column_skipped': True
+            }
+        ]
+        expected_schema = {
+            'type': 'object',
+            'additionalProperties': False,
+            'properties': {
+                '__sdc_spreadsheet_id': {
+                    'type': ['null', 'string']
+                    },
+                '__sdc_sheet_id': {
+                    'type': ['null', 'integer']
+                    },
+                '__sdc_row': {
+                    'type': ['null', 'integer']
+                }
+            }
+        }
+        sheet_json_schema, columns = schema.get_sheet_schema_columns(sheet)
+        self.assertEqual(sheet_json_schema, expected_schema) # test the schema is as expected
+        self.assertEqual(columns, expected_columns) # test if the columns is as expected