diff --git a/.circleci/config.yml b/.circleci/config.yml index dd82fae..897d521 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -13,7 +13,7 @@ jobs: command: | python3 -m venv /usr/local/share/virtualenvs/tap-google-sheets source /usr/local/share/virtualenvs/tap-google-sheets/bin/activate - pip install .[dev] + pip install .[test] - run: name: 'pylint' command: | @@ -25,6 +25,17 @@ jobs: source /usr/local/share/virtualenvs/tap-tester/bin/activate stitch-validate-json tap_google_sheets/schemas/*.json - add_ssh_keys + - run: + name: 'Unit Tests' + command: | + source /usr/local/share/virtualenvs/tap-google-sheets/bin/activate + pip install nose coverage + nosetests --with-coverage --cover-erase --cover-package=tap_google_sheets --cover-html-dir=htmlcov tests/unittests + coverage html + - store_test_results: + path: test_output/report.xml + - store_artifacts: + path: htmlcov - run: name: 'Integration Tests' command: | diff --git a/README.md b/README.md index 9470411..b813b5f 100644 --- a/README.md +++ b/README.md @@ -112,12 +112,13 @@ The [**Google Sheets Setup & Authentication**](https://drive.google.com/open?id= "refresh_token": "YOUR_REFRESH_TOKEN", "spreadsheet_id": "YOUR_GOOGLE_SPREADSHEET_ID", "start_date": "2019-01-01T00:00:00Z", - "user_agent": "tap-google-sheets " + "user_agent": "tap-google-sheets ", + "request_timeout": 300 } ``` Optionally, also create a `state.json` file. `currently_syncing` is an optional attribute used for identifying the last object to be synced in case the job is interrupted mid-stream. The next run would begin where the last job left off. - Only the `performance_reports` uses a bookmark. The date-time bookmark is stored in a nested structure based on the endpoint, site, and sub_type. + Only the `performance_reports` uses a bookmark. The date-time bookmark is stored in a nested structure based on the endpoint, site, and sub_type. The `request_timeout` is an optional parameter to set a timeout for requests. 
Default: 300 seconds ```json { diff --git a/config.json.example b/config.json.example index 159aafb..5c07bce 100644 --- a/config.json.example +++ b/config.json.example @@ -4,5 +4,6 @@ "refresh_token": "YOUR_REFRESH_TOKEN", "spreadsheet_id": "YOUR_GOOGLE_SPREADSHEET_ID", "start_date": "2019-01-01T00:00:00Z", - "user_agent": "tap-google-search-console " + "user_agent": "tap-google-search-console ", + "request_timeout": 300 } diff --git a/setup.py b/setup.py index 5c0e9e9..0ad7e67 100644 --- a/setup.py +++ b/setup.py @@ -11,13 +11,15 @@ install_requires=[ 'backoff==1.8.0', 'requests==2.22.0', - 'singer-python==5.9.0' + 'singer-python==5.12.2' ], extras_require={ - 'dev': [ - 'ipdb==0.11', + 'test': [ 'pylint', 'nose' + ], + 'dev': [ + 'ipdb==0.11', ] }, entry_points=''' diff --git a/tap_google_sheets/__init__.py b/tap_google_sheets/__init__.py index f97d4b8..c74f745 100644 --- a/tap_google_sheets/__init__.py +++ b/tap_google_sheets/__init__.py @@ -36,7 +36,9 @@ def main(): with GoogleClient(parsed_args.config['client_id'], parsed_args.config['client_secret'], parsed_args.config['refresh_token'], - parsed_args.config['user_agent']) as client: + parsed_args.config.get('request_timeout'), + parsed_args.config['user_agent'] + ) as client: state = {} if parsed_args.state: diff --git a/tap_google_sheets/client.py b/tap_google_sheets/client.py index bce00ce..9dc1b13 100644 --- a/tap_google_sheets/client.py +++ b/tap_google_sheets/client.py @@ -5,11 +5,12 @@ import singer from singer import metrics from singer import utils +from requests.exceptions import Timeout, ConnectionError BASE_URL = 'https://www.googleapis.com' GOOGLE_TOKEN_URI = 'https://oauth2.googleapis.com/token' LOGGER = singer.get_logger() - +REQUEST_TIMEOUT = 300 class Server5xxError(Exception): pass @@ -131,6 +132,7 @@ def __init__(self, client_id, client_secret, refresh_token, + request_timeout=REQUEST_TIMEOUT, user_agent=None): self.__client_id = client_id self.__client_secret = client_secret @@ -140,8 
+142,19 @@ def __init__(self, self.__expires = None self.__session = requests.Session() self.base_url = None - - + # if request_timeout is other than 0,"0" or "" then use request_timeout + if request_timeout and float(request_timeout): + request_timeout = float(request_timeout) + else: # If value is 0,"0" or "" then set default to 300 seconds. + request_timeout = REQUEST_TIMEOUT + self.request_timeout = request_timeout + + # Backoff request for 5 times at an interval of 10 seconds in case of Timeout or Connection error + @backoff.on_exception(backoff.constant, + (Timeout, ConnectionError), + max_tries=5, + interval=10, + jitter=None) # Interval value not consistent if jitter not None def __enter__(self): self.get_access_token() return self @@ -149,7 +162,6 @@ def __enter__(self): def __exit__(self, exception_type, exception_value, traceback): self.__session.close() - @backoff.on_exception(backoff.expo, Server5xxError, max_tries=5, @@ -172,7 +184,8 @@ def get_access_token(self): 'client_id': self.__client_id, 'client_secret': self.__client_secret, 'refresh_token': self.__refresh_token, - }) + }, + timeout=self.request_timeout) if response.status_code >= 500: raise Server5xxError() @@ -186,6 +199,12 @@ def get_access_token(self): LOGGER.info('Authorized, token expires = {}'.format(self.__expires)) + # Backoff request for 5 times at an interval of 10 seconds when we get Timeout error + @backoff.on_exception(backoff.constant, + (Timeout), + max_tries=5, + interval=10, + jitter=None) # Interval value not consistent if jitter not None # Rate Limit: https://developers.google.com/sheets/api/limits # 100 request per 100 seconds per User @backoff.on_exception(backoff.expo, @@ -221,7 +240,8 @@ def request(self, method, path=None, url=None, api=None, **kwargs): kwargs['headers']['Content-Type'] = 'application/json' with metrics.http_request_timer(endpoint) as timer: - response = self.__session.request(method, url, **kwargs) + + response = self.__session.request(method, url, 
timeout=self.request_timeout, **kwargs) timer.tags[metrics.Tag.http_status_code] = response.status_code if response.status_code >= 500: diff --git a/tap_google_sheets/schema.py b/tap_google_sheets/schema.py index 2806522..37f7fb1 100644 --- a/tap_google_sheets/schema.py +++ b/tap_google_sheets/schema.py @@ -65,13 +65,18 @@ def get_sheet_schema_columns(sheet): prior_header = None i = 0 skipped = 0 + + # if no headers are present, log the message that sheet is skipped + if not headers: + LOGGER.warn('SKIPPING THE SHEET AS HEADERS ROW IS EMPTY. SHEET: {}'.format(sheet_title)) + # Read column headers until end or 2 consecutive skipped headers for header in headers: # LOGGER.info('header = {}'.format(json.dumps(header, indent=2, sort_keys=True))) column_index = i + 1 column_letter = colnum_string(column_index) header_value = header.get('formattedValue') - if header_value: # NOT skipped + if header_value: # if the column is NOT to be skipped column_is_skipped = False skipped = 0 column_name = '{}'.format(header_value) @@ -170,12 +175,13 @@ def get_sheet_schema_columns(sheet): LOGGER.info('WARNING: UNSUPPORTED 2ND ROW VALUE: SHEET: {}, COL: {}, CELL: {}2, TYPE: {}'.format( sheet_title, column_name, column_letter, column_effective_value_type)) LOGGER.info('Converting to string.') - else: # skipped + else: # if the column is to be skipped column_is_skipped = True skipped = skipped + 1 column_index_str = str(column_index).zfill(2) column_name = '__sdc_skip_col_{}'.format(column_index_str) - col_properties = {'type': ['null', 'string']} + # unsupported field description if the field is to be skipped + col_properties = {'type': ['null', 'string'], 'description': 'Column is unsupported and would be skipped because header is not available'} column_gs_type = 'stringValue' LOGGER.info('WARNING: SKIPPED COLUMN; NO COLUMN HEADER. 
SHEET: {}, COL: {}, CELL: {}1'.format( sheet_title, column_name, column_letter)) @@ -184,12 +190,20 @@ def get_sheet_schema_columns(sheet): if skipped >= 2: # skipped = 2 consecutive skipped headers # Remove prior_header column_name + # stop scanning the sheet and break sheet_json_schema['properties'].pop(prior_header, None) + # prior index is the index of the column prior to the currently column + prior_index = column_index - 1 + # added a new boolean key `prior_column_skipped` to check if the column is one of the two columns with consecutive headers + # as due to consecutive empty headers both the columns should not be included in the schema as well as the metadata + columns[prior_index-1]['prior_column_skipped'] = True LOGGER.info('TWO CONSECUTIVE SKIPPED COLUMNS. STOPPING SCAN AT: SHEET: {}, COL: {}, CELL {}1'.format( sheet_title, column_name, column_letter)) break else: + # skipped < 2 prepare `columns` dictionary with index, letter, column name, column type and + # if the column is to be skipped or not for each column in the list column = {} column = { 'columnIndex': column_index, @@ -204,10 +218,10 @@ def get_sheet_schema_columns(sheet): col_properties = { 'anyOf': [ col_properties, - {'type': ['null', 'string']} + {'type': ['null', 'string']} # all the date, time has string types in schema ] } - + # add the column properties in the `properties` in json schema for the respective column name sheet_json_schema['properties'][column_name] = col_properties prior_header = column_name @@ -231,8 +245,10 @@ def get_sheet_metadata(sheet, spreadsheet_id, client): params = stream_metadata.get('params', {}) sheet_title_encoded = urllib.parse.quote_plus(sheet_title) sheet_title_escaped = re.escape(sheet_title) + # create querystring for preparing the request querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in \ params.items()]).replace('{sheet_title}', sheet_title_encoded) + # create path for preparing the request path = 
'{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ spreadsheet_id), querystring) @@ -280,7 +296,9 @@ def get_schemas(client, spreadsheet_id): if stream_name == 'spreadsheet_metadata': api = stream_metadata.get('api', 'sheets') params = stream_metadata.get('params', {}) + # prepare the query string for the request querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()]) + # prepare the path for request path = '{}?{}'.format(stream_metadata.get('path').replace('{spreadsheet_id}', \ spreadsheet_id), querystring) @@ -306,6 +324,14 @@ def get_schemas(client, spreadsheet_id): valid_replication_keys=None, replication_method='FULL_TABLE' ) + # for each column check if the `columnSkipped` value is true and the `prior_column_skipped` is false or None + # in the columns dict. The `prior_column_skipped` would be true when it is the first column of the two + # consecutive empty headers column if true: update the incusion property to `unsupported` + for column in columns: + if column.get('columnSkipped') and not column.get('prior_column_skipped'): + mdata = metadata.to_map(sheet_mdata) + sheet_mdata = metadata.write(mdata, ('properties', column.get('columnName')), 'inclusion', 'unsupported') + sheet_mdata = metadata.to_list(mdata) field_metadata[sheet_title] = sheet_mdata return schemas, field_metadata diff --git a/tap_google_sheets/schemas/file_metadata.json b/tap_google_sheets/schemas/file_metadata.json index 03fefc6..7daf396 100644 --- a/tap_google_sheets/schemas/file_metadata.json +++ b/tap_google_sheets/schemas/file_metadata.json @@ -35,7 +35,7 @@ "displayName": { "type": ["null", "string"] }, - "emailAdress": { + "emailAddress": { "type": ["null", "string"] } } diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index fd06701..8670887 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py @@ -514,7 +514,18 @@ def sync(client, config, catalog, state): from_row=from_row, columns=columns, 
sheet_data_rows=sheet_data_rows) - if row_num < to_row: + + # Here row_num is the addition of from_row and total records get in response(per batch). + # Condition row_num < to_row was checking that if records on the current page are less than expected(to_row) or not. + # If the condition returns true then it was breaking the loop. + # API does not return the last empty rows in response. + # For example, rows 199 and 200 are empty, and a total of 400 rows are there in the sheet. So, in 1st iteration, + # to_row = 200, from_row = 2, row_num = 2(from_row) + 197 = 199(1st row contain header value) + # So, the above condition become true and breaks the loop without syncing records from 201 to 400. + # sheet_data_rows is no of records return in the current page. If it's a whole blank page then stop looping. + # So, in the above case, it syncs records 201 to 400 also even if rows 199 and 200 are blank. + # Then when the next batch 401 to 600 is empty, it breaks the loop. + if not sheet_data_rows: # If a whole blank page found, then stop looping. is_last_row = True # Process records, send batch of records to target diff --git a/tests/test_google_sheets_all_fields.py b/tests/test_google_sheets_all_fields.py index 8783bb0..d50ee66 100644 --- a/tests/test_google_sheets_all_fields.py +++ b/tests/test_google_sheets_all_fields.py @@ -80,7 +80,16 @@ def test_run(self): self.assertGreater(len(expected_all_keys), len(expected_automatic_keys)) self.assertTrue(expected_automatic_keys.issubset(expected_all_keys), msg=f'{expected_automatic_keys-expected_all_keys} is not in "expected_all_keys"') if stream == "file_metadata": - # BUG | below keys are not synced https://jira.talendforge.org/browse/TDL-14409 + + # As per google documentation https://developers.google.com/drive/api/v3/reference/files `teamDriveId` is deprecated. There is mentioned that use `driveId` instead. + # `driveId` is populated from items in the team shared drives. 
But stitch integration does not support shared team drive. So replicating driveId is not possible. + # So, these two fields will not be synced. expected_all_keys.remove('teamDriveId') expected_all_keys.remove('driveId') + # Earlier field `emailAddress` was defined as `emailAdress`(typo mismatch) in file_metadata.json. + # So, this particular field was not collected, because the API response contains the `emailAddress` field. + # Now, the typo has been corrected, and we verify that the `emailAddress` field is collected. + lastModifyingUser_fields = set(messages['messages'][0].get('data', {}).get('lastModifyingUser', {}).keys()) # Get `lastModifyingUser` from file_metadata records + # Verify that the `emailAddress` field under `lastModifyingUser` is collected. + self.assertTrue({'emailAddress'}.issubset(lastModifyingUser_fields), msg="emailAddress does not found in lastModifyingUser") self.assertSetEqual(expected_all_keys, actual_all_keys) diff --git a/tests/test_google_sheets_discovery.py b/tests/test_google_sheets_discovery.py index 40f11f4..8a614f3 100644 --- a/tests/test_google_sheets_discovery.py +++ b/tests/test_google_sheets_discovery.py @@ -118,11 +118,13 @@ def test_run(self): expected_automatic_fields.remove('modifiedTime') self.assertSetEqual(expected_automatic_fields, actual_automatic_fields) - # verify columns missing headers or missing values where __sdc_row = 2 + # verify missing values where __sdc_row = 2 # are marked with inclusion of unsupported - # BUG_TDL-14475 | https://jira.talendforge.org/browse/TDL-14475 - failing_streams = {'sadsheet-column-skip-bug', 'Item Master'} # BUG_TDL-14475 - if stream not in failing_streams: # BUG_TDL-14475 + # The card TDL-14475 was only about adding unsupported + # inclusion property for empty header values. 
The sheet + # `Item Master` has columns with empty row values + failing_streams = {'Item Master'} + if stream not in failing_streams: self.assertSetEqual(expected_unsupported_fields, actual_unsupported_fields) # verify that all other fields have inclusion of available diff --git a/tests/test_google_sheets_pagination.py b/tests/test_google_sheets_pagination.py index fcd4d3e..82dd481 100644 --- a/tests/test_google_sheets_pagination.py +++ b/tests/test_google_sheets_pagination.py @@ -6,11 +6,6 @@ from base import GoogleSheetsBaseTest -# BUG_TDL-14376 | https://jira.talendforge.org/browse/TDL-14376 -# Expectation: Tap will pick up next page (200 rows) iff there is a non-null value on that page -# We observed a BUG where the tap does not paginate properly on sheets where the last two rows in a batch -# are empty values. The tap does not capture anything on the subsequent pages when this happens. -# class PaginationTest(GoogleSheetsBaseTest): @@ -25,6 +20,7 @@ def test_run(self): Verify that for each stream you can get multiple pages of data and that when all fields are selected more than the automatic fields are replicated. + Verify by primary keys that data is unique for page PREREQUISITE This test relies on the existence of a specific sheet with the name Pagination that has a column called 'id' with values 1 -> 238. @@ -43,9 +39,8 @@ def test_run(self): record_count_by_stream = self.run_and_verify_sync(conn_id) synced_records = runner.get_records_from_target_output() - for stream in testable_streams.difference({ - 'sadsheet-pagination', # BUG TDL-14376 - }): + # Added back `sadsheet-pagination` to testable_streams as # BUG TDL-14376 resolved. 
+ for stream in testable_streams: with self.subTest(stream=stream): our_fake_pk = 'id' @@ -61,6 +56,22 @@ def test_run(self): # verify that we can paginate with all fields selected self.assertGreater(record_count_by_stream.get(stream, 0), self.API_LIMIT) + + if stream == "sadsheet-pagination": + # verify the data for the "sadsheet-pagination" stream is free of any duplicates or breaks by checking + # our fake pk value ('id') + expected_pk_list = list(range(1, 238)) + expected_pk_list = [x for x in expected_pk_list if x not in [198, 199]] + self.assertEqual(expected_pk_list, fake_pk_list) + + # verify the data for the "sadsheet-pagination" stream is free of any duplicates or breaks by checking + # the actual primary key values (__sdc_row) + expected_pk_list = list(range(2, 239)) + expected_pk_list = [x for x in expected_pk_list if x not in [199, 200]] + self.assertEqual(expected_pk_list, actual_pk_list) + + continue + # verify the data for the "Pagination" stream is free of any duplicates or breaks by checking # our fake pk value ('id') # THIS ASSERTION CAN BE MADE BECAUSE WE SETUP DATA IN A SPECIFIC WAY. DONT COPY THIS diff --git a/tests/unittests/test_logger.py b/tests/unittests/test_logger.py new file mode 100644 index 0000000..af7c62f --- /dev/null +++ b/tests/unittests/test_logger.py @@ -0,0 +1,77 @@ +import unittest +from unittest import mock +import tap_google_sheets +from tap_google_sheets import schema + + +class TestLogger(unittest.TestCase): + @mock.patch('tap_google_sheets.schema.LOGGER.warn') + def test_logger_message(self, mocked_logger): + """ + Test if the logger statement is printed when the header row is empty and the sheet is being skipped. 
+ """ + sheet_data = { + "properties":{ + "sheetId":2074712559, + "title":"Sheet5", + "index":1, + "sheetType":"GRID", + "gridProperties":{ + "rowCount":1000, + "columnCount":26 + } + }, + "data":[ + { + "rowData":[ + {}, + { + "values":[ + { + "userEnteredValue":{ + "numberValue":1 + }, + "effectiveValue":{ + "numberValue":1 + }, + "formattedValue":"1", + }, + { + "userEnteredValue":{ + "numberValue":2 + }, + "effectiveValue":{ + "numberValue":2 + }, + "formattedValue":"2" + }, + { + "userEnteredValue":{ + "numberValue":3 + }, + "effectiveValue":{ + "numberValue":3 + }, + "formattedValue":"3", + } + ] + } + ], + "rowMetadata":[ + { + "pixelSize":21 + } + ], + "columnMetadata":[ + { + "pixelSize":100 + } + ] + } + ] + } + # retrieve the sheet title from the `sheet_data` + sheet_title = sheet_data.get('properties', {}).get('title') + sheet_schema, columns = schema.get_sheet_schema_columns(sheet_data) + # check if the logger is called with correct logger message + mocked_logger.assert_called_with('SKIPPING THE SHEET AS HEADERS ROW IS EMPTY. SHEET: {}'.format(sheet_title)) diff --git a/tests/unittests/test_request_timeout.py b/tests/unittests/test_request_timeout.py new file mode 100644 index 0000000..7c25ee5 --- /dev/null +++ b/tests/unittests/test_request_timeout.py @@ -0,0 +1,158 @@ +from tap_google_sheets.client import GoogleClient +import unittest +from unittest import mock +from unittest.case import TestCase +from requests.exceptions import Timeout, ConnectionError + +class TestBackoffError(unittest.TestCase): + ''' + Test that backoff logic works properly. + ''' + @mock.patch('tap_google_sheets.client.requests.Session.request') + @mock.patch('tap_google_sheets.client.GoogleClient.get_access_token') + def test_request_timeout_and_backoff(self, mock_get_token, mock_request): + """ + Check whether the request backoffs properly for request() for 5 times in case of Timeout error. 
+ """ + mock_request.side_effect = Timeout + client = GoogleClient("dummy_client_id", "dummy_client_secret", "dummy_refresh_token", 300) + with self.assertRaises(Timeout): + client.request("GET") + self.assertEquals(mock_request.call_count, 5) + + @mock.patch('tap_google_sheets.client.requests.Session.request') + def test_get_access_token_timeout_and_backoff(self, mocked_request): + """ + Check whether the request backoffs properly for __enter__() for 5 times in case of Timeout error. + """ + mocked_request.side_effect = Timeout + + config = { + "client_id": "dummy_ci", + "client_secret": "dummy_cs", + "refresh_token": "test_rt", + "user_agent": "test_ua", + } + # initialize 'GoogleClient' + try: + with GoogleClient(config['client_id'], + config['client_secret'], + config['refresh_token'], + config.get('request_timeout'), + config['user_agent']) as client: + pass + except Timeout: + pass + + # verify that we backoff for 5 times + self.assertEquals(mocked_request.call_count, 5) + + @mock.patch('tap_google_sheets.client.requests.Session.request') + def test_check_access_token_connection_error_and_backoff(self, mocked_request): + """ + Check whether the request backoffs properly for __enter__() for 5 times in case of Timeout error. 
+ """ + mocked_request.side_effect = ConnectionError + + config = { + "client_id": "dummy_ci", + "client_secret": "dummy_cs", + "refresh_token": "test_rt", + "user_agent": "test_ua", + } + # initialize 'GoogleClient' + try: + with GoogleClient(config['client_id'], + config['client_secret'], + config['refresh_token'], + config.get('request_timeout'), + config['user_agent']) as client: + pass + except ConnectionError: + pass + + # verify that we backoff for 5 times + self.assertEquals(mocked_request.call_count, 5) + +class MockResponse(): + ''' + Mock response object for the requests call + ''' + def __init__(self, resp, status_code, content=[""], headers=None, raise_error=False, text={}): + self.json_data = resp + self.status_code = status_code + self.content = content + self.headers = headers + self.raise_error = raise_error + self.text = text + self.reason = "error" + + def prepare(self): + return (self.json_data, self.status_code, self.content, self.headers, self.raise_error) + + def json(self, object_pairs_hook): + return self.text + +class TestRequestTimeoutValue(unittest.TestCase): + ''' + Test that request timeout parameter works properly in various cases + ''' + @mock.patch('tap_google_sheets.client.requests.Session.request', return_value = MockResponse("", status_code=200)) + @mock.patch('tap_google_sheets.client.GoogleClient.get_access_token') + def test_config_provided_request_timeout(self, mock_get, mock_request): + """ + Unit tests to ensure that request timeout is set based on config value + """ + config = { "refresh_token": "dummy_token", "client_id": "dummy_client_id", "client_secret": "dummy_client_secret", "user_agent": "dummy_ua", "request_timeout": 100} + client = GoogleClient(**config) + client.request("GET", "dummy_path") + + mock_request.assert_called_with('GET', 'https://sheets.googleapis.com/v4/dummy_path', headers={'Authorization': 'Bearer None', 'User-Agent': 'dummy_ua'}, timeout=100.0) + + 
@mock.patch('tap_google_sheets.client.requests.Session.request', return_value = MockResponse("", status_code=200)) + @mock.patch('tap_google_sheets.client.GoogleClient.get_access_token') + def test_default_value_request_timeout(self, mock_get, mock_request): + """ + Unit tests to ensure that request timeout is set based default value + """ + config = { "refresh_token": "dummy_token", "client_id": "dummy_client_id", "client_secret": "dummy_client_secret", "user_agent": "dummy_ua"} + client = GoogleClient(**config) + client.request("GET", "dummy_path") + + mock_request.assert_called_with('GET', 'https://sheets.googleapis.com/v4/dummy_path', headers={'Authorization': 'Bearer None', 'User-Agent': 'dummy_ua'}, timeout=300.0) + + @mock.patch('tap_google_sheets.client.requests.Session.request', return_value = MockResponse("", status_code=200)) + @mock.patch('tap_google_sheets.client.GoogleClient.get_access_token') + def test_config_provided_empty_request_timeout(self, mock_get, mock_request): + """ + Unit tests to ensure that request timeout is set based on default value if empty value is given in config + """ + config = { "refresh_token": "dummy_token", "client_id": "dummy_client_id", "client_secret": "dummy_client_secret", "user_agent": "dummy_ua", "request_timeout": ""} + client = GoogleClient(**config) + client.request("GET", "dummy_path") + + mock_request.assert_called_with('GET', 'https://sheets.googleapis.com/v4/dummy_path', headers={'Authorization': 'Bearer None', 'User-Agent': 'dummy_ua'}, timeout=300) + + @mock.patch('tap_google_sheets.client.requests.Session.request', return_value = MockResponse("", status_code=200)) + @mock.patch('tap_google_sheets.client.GoogleClient.get_access_token') + def test_config_provided_string_request_timeout(self, mock_get, mock_request): + """ + Unit tests to ensure that request timeout is set based on config string value + """ + config = { "refresh_token": "dummy_token", "client_id": "dummy_client_id", "client_secret": 
"dummy_client_secret", "user_agent": "dummy_ua", "request_timeout": "100"} + client = GoogleClient(**config) + client.request("GET", "dummy_path") + + mock_request.assert_called_with('GET', 'https://sheets.googleapis.com/v4/dummy_path', headers={'Authorization': 'Bearer None', 'User-Agent': 'dummy_ua'}, timeout=100.0) + + @mock.patch('tap_google_sheets.client.requests.Session.request', return_value = MockResponse("", status_code=200)) + @mock.patch('tap_google_sheets.client.GoogleClient.get_access_token') + def test_config_provided_float_request_timeout(self, mock_get, mock_request): + """ + Unit tests to ensure that request timeout is set based on config float value + """ + config = { "refresh_token": "dummy_token", "client_id": "dummy_client_id", "client_secret": "dummy_client_secret", "user_agent": "dummy_ua", "request_timeout": 100.8} + client = GoogleClient(**config) + client.request("GET", "dummy_path") + + mock_request.assert_called_with('GET', 'https://sheets.googleapis.com/v4/dummy_path', headers={'Authorization': 'Bearer None', 'User-Agent': 'dummy_ua'}, timeout=100.8) diff --git a/tests/unittests/test_unsupported_fields.py b/tests/unittests/test_unsupported_fields.py new file mode 100644 index 0000000..761545e --- /dev/null +++ b/tests/unittests/test_unsupported_fields.py @@ -0,0 +1,191 @@ +from tap_google_sheets.client import GoogleClient +import unittest +from unittest import mock +from tap_google_sheets import schema + +class TestUnsupportedFields(unittest.TestCase): + @mock.patch('tap_google_sheets.GoogleClient.get') + def test_unsupported_fields(self, mocked_get): + """ + Test whether the incusion property for the skipped property is changed to `unsupported` + and the description is added in the schema + """ + sheet = { + "sheets": [{ + "properties": { + "sheetId": 0, + "title": "Sheet1", + "index": 0, + "sheetType": "GRID", + "gridProperties": { + "rowCount": 3, + "columnCount": 26 + } + }, + "data": [ + { + "rowData": [ + { + "values": [ + {}, + { 
+ "formattedValue": "abc" + }, + { + "formattedValue": "def" + } + ] + }, + { + "values": [ + { + "formattedValue": "A" + }, + { + "formattedValue": "B" + }, + { + "formattedValue": "4" + } + ] + } + ] + } + ]} + ] + } + expected_schema = { + "type": "object", + "additionalProperties":False, + "properties": { + "__sdc_spreadsheet_id": { + "type": [ + "null", + "string" + ] + }, + "__sdc_sheet_id": { + "type": [ + "null", + "integer" + ] + }, + "__sdc_row": { + "type": [ + "null", + "integer" + ] + }, + "__sdc_skip_col_01": { + "type": [ + "null", + "string" + ], + "description": "Column is unsupported and would be skipped because header is not available" + }, + "abc": { + "type": [ + "null", + "string" + ] + }, + "def": { + "type": [ + "null", + "string" + ] + } + } + } + mocked_get.return_value = sheet + schemas, field_metadata = schema.get_schemas(GoogleClient('client_id', 'client_secret', 'refresh_token'), 'sheet_id') + # check if the schemas are equal, hence verifying if the description is present + self.assertEqual(expected_schema, schemas["Sheet1"]) + for each in field_metadata["Sheet1"]: + if each["breadcrumb"] and '__sdc_skip_col_01' in each["breadcrumb"]: + # check if the inclusion property is updated to `unsupported` + self.assertEqual(each["metadata"]["inclusion"], "unsupported") + + def test_two_skipped_columns(self): + """ + Test whether the columns has the `prior_column_skipped` key which is True as there ar 2 consecutive empty headers + and the `sheet_json_schema` has only 3 keys + """ + sheet = { + "properties":{ + "sheetId":1825500887, + "title":"Sheet11", + "index":2, + "sheetType":"GRID", + "gridProperties":{ + "rowCount":1000, + "columnCount":26 + } + }, + "data":[ + { + "rowData":[ + { + "values":[ + {}, + {}, + { + "formattedValue":"abd", + } + ] + }, + { + "values":[ + { + "formattedValue":"1", + }, + { + "formattedValue":"3", + }, + { + "formattedValue":"45", + } + ] + } + ], + "rowMetadata":[ + { + "pixelSize":21 + } + ], + 
"columnMetadata":[ + { + "pixelSize":100 + }, + ] + } + ] + } + expected_columns = [ + { + 'columnIndex': 1, + 'columnLetter': 'A', + 'columnName': '__sdc_skip_col_01', + 'columnType': 'stringValue', + 'columnSkipped': True, + 'prior_column_skipped': True + } + ] + expected_schema = { + 'type': 'object', + 'additionalProperties': False, + 'properties': { + '__sdc_spreadsheet_id': { + 'type': ['null', 'string'] + }, + '__sdc_sheet_id': { + 'type': ['null', 'integer'] + }, + '__sdc_row': { + 'type': ['null', 'integer'] + } + } + } + sheet_json_schema, columns = schema.get_sheet_schema_columns(sheet) + self.assertEqual(sheet_json_schema, expected_schema) # test the schema is as expected + self.assertEqual(columns, expected_columns) # test if the columns is as expected