From 70b81a967438325cacbb33a28ac511d654c3f29d Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Thu, 29 Aug 2024 10:53:02 +0000 Subject: [PATCH 01/12] added condition to check stream metadata --- tap_google_sheets/sync.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index eb0942c..7b6597e 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py @@ -14,6 +14,8 @@ def sync(client, config, catalog, state): "sheets_loaded" & "sheet_metadata" -> get the data lists from the "spreadsheet_metadata" stream and sync the records if selected """ last_stream = singer.get_currently_syncing(state) + # preset to none + file_modified_time = None LOGGER.info("last/currently syncing stream: %s", last_stream) selected_streams = [] @@ -61,7 +63,8 @@ def sync(client, config, catalog, state): stream_obj.sync(catalog, state, sheets_loaded_records) # sync file metadata - elif stream_name == "file_metadata": + elif stream_name == "file_metadata" and "file_metadata" in selected_streams: + LOGGER.warning("This Stream might not work, please de-select if you face any issues") file_changed, file_modified_time = stream_obj.sync(catalog, state, selected_streams) if not file_changed: break @@ -70,4 +73,5 @@ def sync(client, config, catalog, state): # write "file_metadata" bookmark, as we have successfully synced all the sheet's records # it will force to re-sync of there is any interrupt between the sync - write_bookmark(state, 'file_metadata', strftime(file_modified_time)) + if file_modified_time: + write_bookmark(state, 'file_metadata', strftime(file_modified_time)) From 892d3f62be0d06c6cdc5b68d82fb182718144e2b Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Fri, 30 Aug 2024 07:06:19 +0000 Subject: [PATCH 02/12] Doc changes --- CHANGELOG.md | 3 +++ setup.py | 2 +- tap_google_sheets/sync.py | 11 ++++++++--- 3 files changed, 12 
insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ab2220..e57631e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## 3.1.0 + * Updates Sync condition and exception handling for file_metadata stream[#96](https://github.com/singer-io/tap-google-sheets/pull/95) + ## 3.0.0 * Remove support for date datatype [#95](https://github.com/singer-io/tap-google-sheets/pull/95) diff --git a/setup.py b/setup.py index 3f42238..67f69b1 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup(name='tap-google-sheets', - version='3.0.0', + version='3.1.0', description='Singer.io tap for extracting data from the Google Sheets v4 API', author='jeff.huth@bytecode.io', classifiers=['Programming Language :: Python :: 3 :: Only'], diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index 7b6597e..c50259c 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py @@ -1,5 +1,6 @@ import singer from tap_google_sheets.streams import STREAMS, SheetsLoadData, write_bookmark, strftime +from tap_google_sheets.client import GoogleForbiddenError LOGGER = singer.get_logger() @@ -65,9 +66,13 @@ def sync(client, config, catalog, state): # sync file metadata elif stream_name == "file_metadata" and "file_metadata" in selected_streams: LOGGER.warning("This Stream might not work, please de-select if you face any issues") - file_changed, file_modified_time = stream_obj.sync(catalog, state, selected_streams) - if not file_changed: - break + try: + file_changed, file_modified_time = stream_obj.sync(catalog, state, selected_streams) + if not file_changed: + break + except GoogleForbiddenError as err: + LOGGER.info("Stream file_metadata cannot be synced due to insufficeint permissions, please de-select it") + raise GoogleForbiddenError("Stream file_metadata cannot be synced due to insufficeint permissions, please de-select it") LOGGER.info("FINISHED Syncing: %s", stream_name) From 
a55a103f340b7c45ac4393457c25bacce911a4c3 Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Fri, 30 Aug 2024 07:06:57 +0000 Subject: [PATCH 03/12] updated changelog link --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e57631e..d607961 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Changelog ## 3.1.0 - * Updates Sync condition and exception handling for file_metadata stream[#96](https://github.com/singer-io/tap-google-sheets/pull/95) + * Updates Sync condition and exception handling for file_metadata stream[#96](https://github.com/singer-io/tap-google-sheets/pull/96) ## 3.0.0 * Remove support for date datatype [#95](https://github.com/singer-io/tap-google-sheets/pull/95) From e8fa9e977fe7adfaadad8b840bbb9ab8f029fb9e Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Fri, 30 Aug 2024 07:13:49 +0000 Subject: [PATCH 04/12] updated log statement --- tap_google_sheets/sync.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index c50259c..c8ec7ff 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py @@ -71,7 +71,6 @@ def sync(client, config, catalog, state): if not file_changed: break except GoogleForbiddenError as err: - LOGGER.info("Stream file_metadata cannot be synced due to insufficeint permissions, please de-select it") raise GoogleForbiddenError("Stream file_metadata cannot be synced due to insufficeint permissions, please de-select it") LOGGER.info("FINISHED Syncing: %s", stream_name) From de26fb69a708090f5a356c75e124d17787b15bbf Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Thu, 5 Sep 2024 00:49:06 +0000 Subject: [PATCH 05/12] remove stream --- CHANGELOG.md | 2 +- tap_google_sheets/streams.py | 44 -------- tap_google_sheets/sync.py | 18 +--- tap_google_sheets/transform.py | 14 --- tests/base.py | 5 
- tests/test_google_sheets_bookmarks.py | 142 +++++++++++++------------- 6 files changed, 72 insertions(+), 153 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d607961..20a85ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Changelog ## 3.1.0 - * Updates Sync condition and exception handling for file_metadata stream[#96](https://github.com/singer-io/tap-google-sheets/pull/96) + * Remove deprecated stream file_metadata [#96](https://github.com/singer-io/tap-google-sheets/pull/96) ## 3.0.0 * Remove support for date datatype [#95](https://github.com/singer-io/tap-google-sheets/pull/95) diff --git a/tap_google_sheets/streams.py b/tap_google_sheets/streams.py index 721c8fd..69f1c83 100644 --- a/tap_google_sheets/streams.py +++ b/tap_google_sheets/streams.py @@ -243,49 +243,6 @@ def sync_stream(self, records, catalog, time_extracted=None): LOGGER.info('FINISHED Syncing {}, Total Records: {}'.format(self.stream_name, record_count)) update_currently_syncing(self.state, None) -class FileMetadata(GoogleSheets): - stream_name = "file_metadata" - api = "files" - path = "files/{spreadsheet_id}" - key_properties = ["id"] - replication_method = "INCREMENTAL" - replication_keys = ["modifiedTime"] - params = { - "fields": "id,name,createdTime,modifiedTime,version,teamDriveId,driveId,lastModifyingUser", - "supportsAllDrives": True - } - - def sync(self, catalog, state, selected_streams): - """ - sync file's metadata - """ - self.state = state - # variable to check if file is changed or not - - # get date to start sync from, ie. 
start date or bookmark date - start_date = strptime_to_utc(get_bookmark(state, self.stream_name, self.config_start_date)) - - LOGGER.info("GET file_metadata") - file_metadata, time_extracted = self.get_data(stream_name=self.stream_name) - LOGGER.info("Transform file_metadata") - - file_modified_time = strptime_to_utc(file_metadata.get("modifiedTime")) - LOGGER.info("last_datetime = {}, file_modified_time = {}".format(start_date, file_modified_time)) - if file_modified_time <= start_date: - # if file is not changed, update the variable - LOGGER.info("file_modified_time <= last_datetime, FILE NOT CHANGED. EXITING.") - # return and stop syncing the next streams, as the file is not changed - return False, file_modified_time - - # only perform sync if file metadata stream is selected and file is changed - if self.stream_name in selected_streams: - # transform file metadata records - file_metadata_transformed = internal_transform.transform_file_metadata(file_metadata) - # do sync - self.sync_stream(file_metadata_transformed, catalog, time_extracted) - - return True, file_modified_time - class SpreadSheetMetadata(GoogleSheets): stream_name = "spreadsheet_metadata" api = "sheets" @@ -645,7 +602,6 @@ def sync(self, catalog, state, sheets_loaded_records): # "spreadsheet_metadata" -> get sheets in the spreadsheet and load sheet's records # and prepare records for "sheet_metadata" and "sheets_loaded" streams STREAMS = OrderedDict() -STREAMS['file_metadata'] = FileMetadata STREAMS['spreadsheet_metadata'] = SpreadSheetMetadata STREAMS['sheet_metadata'] = SheetMetadata STREAMS['sheets_loaded'] = SheetsLoaded diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index c8ec7ff..f89c1cd 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py @@ -7,7 +7,6 @@ def sync(client, config, catalog, state): """ Sync the streams, loop over STREAMS - "file_metadata" -> get the file's metadata and if the spreadsheet file is updated then continue the sync else stop 
the sync "spreadsheet_metadata" -> get the spreadsheet's metadata - sync the spreadsheet_metadata stream if selected - get the sheets in the spreadsheet and loop over the sheets and sync the sheet's records if selected @@ -63,19 +62,4 @@ def sync(client, config, catalog, state): else: stream_obj.sync(catalog, state, sheets_loaded_records) - # sync file metadata - elif stream_name == "file_metadata" and "file_metadata" in selected_streams: - LOGGER.warning("This Stream might not work, please de-select if you face any issues") - try: - file_changed, file_modified_time = stream_obj.sync(catalog, state, selected_streams) - if not file_changed: - break - except GoogleForbiddenError as err: - raise GoogleForbiddenError("Stream file_metadata cannot be synced due to insufficeint permissions, please de-select it") - - LOGGER.info("FINISHED Syncing: %s", stream_name) - - # write "file_metadata" bookmark, as we have successfully synced all the sheet's records - # it will force to re-sync of there is any interrupt between the sync - if file_modified_time: - write_bookmark(state, 'file_metadata', strftime(file_modified_time)) + LOGGER.info("FINISHED Syncing: %s", stream_name) \ No newline at end of file diff --git a/tap_google_sheets/transform.py b/tap_google_sheets/transform.py index cebeca2..9fd74eb 100644 --- a/tap_google_sheets/transform.py +++ b/tap_google_sheets/transform.py @@ -33,20 +33,6 @@ def transform_spreadsheet_metadata(spreadsheet_metadata): spreadsheet_metadata_arr.append(spreadsheet_metadata_tf) return spreadsheet_metadata_arr -# Tranform file_metadata: remove nodes from lastModifyingUser, format as array -def transform_file_metadata(file_metadata): - # Convert to dict - file_metadata_tf = json.loads(json.dumps(file_metadata)) - # Remove keys - if file_metadata_tf.get('lastModifyingUser'): - file_metadata_tf['lastModifyingUser'].pop('photoLink', None) - file_metadata_tf['lastModifyingUser'].pop('me', None) - 
file_metadata_tf['lastModifyingUser'].pop('permissionId', None) - # Add record to an array of 1 - file_metadata_arr = [] - file_metadata_arr.append(file_metadata_tf) - return file_metadata_arr - # Convert Excel Date Serial Number (excel_date_sn) to datetime string # timezone_str: defaults to UTC (which we assume is the timezone for ALL datetimes) def excel_to_dttm_str(string_value, excel_date_sn, timezone_str=None): diff --git a/tests/base.py b/tests/base.py index 42db120..e4f6c0f 100644 --- a/tests/base.py +++ b/tests/base.py @@ -72,11 +72,6 @@ def expected_metadata(self): # self.REPLICATION_KEYS: {"modified_at"} } return { - "file_metadata": { - self.PRIMARY_KEYS: {"id", }, - self.REPLICATION_METHOD: self.INCREMENTAL, - self.REPLICATION_KEYS: {"modifiedTime"} - }, "sheet_metadata": { self.PRIMARY_KEYS: {"sheetId"}, # "spreadsheetId"}, # BUG? | This is not in the real tap, "spreadsheetId"}, self.REPLICATION_METHOD: self.FULL_TABLE, diff --git a/tests/test_google_sheets_bookmarks.py b/tests/test_google_sheets_bookmarks.py index 26e4761..484615f 100644 --- a/tests/test_google_sheets_bookmarks.py +++ b/tests/test_google_sheets_bookmarks.py @@ -17,78 +17,76 @@ class BookmarksTest(GoogleSheetsBaseTest): def name(): return "tap_tester_google_sheets_bookmarks" - def test_run(self): - """ - Run check mode, perform table and field selection, and run a sync. - Replication can be triggered by pushing back state to prior 'file_metadata' state. - Run a second sync after not updating state to verify no streams are being synced - Run a 3rd sync and ensure full table streams are triggered by the simulated bookmark value. 
- - - Verify initial sync message actions include activate versions and the upserts - - Verify no streams are synced when 'file_metadata' bookmark does not change - - Verify that the third sync with the updated simulated bookmark has the same synced streams as the first sync - - Verify that streams will sync based off of 'file_metadata' even when it is not selected - """ - skipped_streams = {stream - for stream in self.expected_streams() - if stream.startswith('sadsheet')}.union({ - 'file_metadata' # testing case without file_metadata selected, but still providing bookmark - }) - self.expected_test_streams = self.expected_streams() - skipped_streams - - # Grab connection, and run discovery and initial sync - self.starter() - - synced_records_1 = runner.get_records_from_target_output() - - # Grab state to be updated later - state = menagerie.get_state(self.conn_id) - - # BUG full table streams are saving bookmarks unnecessarily https://jira.talendforge.org/browse/TDL-14343 - - # BUG there are no activate version messages in the sheet_metadata, spreadsheet_metadata - # or sheets_loaded streams, even though they are full table https://jira.talendforge.org/browse/TDL-14346 - # verify message actions are correct - for stream in self.expected_test_streams.difference({'sheet_metadata', 'spreadsheet_metadata', 'sheets_loaded'}): - with self.subTest(stream=stream): - sync1_message_actions = [message['action'] for message in synced_records_1[stream]['messages']] - self.assertEqual('activate_version', sync1_message_actions[0]) - self.assertEqual('activate_version', sync1_message_actions[-1]) - self.assertSetEqual({'upsert'}, set(sync1_message_actions[1:-1])) - - # run a sync again, this time we shouldn't get any records back - sync_job_name = runner.run_sync_mode(self, self.conn_id) - exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name) - menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) - record_count_by_stream_2 = 
runner.examine_target_output_file( - self, self.conn_id, self.expected_streams(), self.expected_primary_keys()) - - # verify we do not sync any unexpected streams - self.assertSetEqual(set(), set(record_count_by_stream_2.keys())) - - # verify no records were synced for our expected streams - for stream in self.expected_test_streams: - with self.subTest(stream=stream): - self.assertEqual(0, record_count_by_stream_2.get(stream, 0)) - - # roll back the state of the file_metadata stream to ensure that we sync sheets - # based off of this state - file_metadata_stream = 'file_metadata' - file_metadata_bookmark = state['bookmarks'][file_metadata_stream] - bookmark_datetime = datetime.datetime.strptime(file_metadata_bookmark, self.BOOKMARK_COMPARISON_FORMAT) - target_datetime = bookmark_datetime + datetime.timedelta(days=-1) - target_bookmark = datetime.datetime.strftime(target_datetime, self.BOOKMARK_COMPARISON_FORMAT) - - new_state = copy.deepcopy(state) - new_state['bookmarks'][file_metadata_stream] = target_bookmark - - menagerie.set_state(self.conn_id, new_state) - - record_count_by_stream_3 = self.run_and_verify_sync(self.conn_id) - synced_records_3 = runner.get_records_from_target_output() - - # verify we sync sheets based off the state of file_metadata - self.assertDictEqual(self.record_count_by_stream_1, record_count_by_stream_3) + # def test_run(self): + # """ + # Run check mode, perform table and field selection, and run a sync. + # Replication can be triggered by pushing back state to prior 'file_metadata' state. + # Run a second sync after not updating state to verify no streams are being synced + # Run a 3rd sync and ensure full table streams are triggered by the simulated bookmark value. 
+ + # - Verify initial sync message actions include activate versions and the upserts + # - Verify no streams are synced when 'file_metadata' bookmark does not change + # - Verify that the third sync with the updated simulated bookmark has the same synced streams as the first sync + # - Verify that streams will sync based off of 'file_metadata' even when it is not selected + # """ + # skipped_streams = {stream + # for stream in self.expected_streams() + # if stream.startswith('sadsheet')} + # self.expected_test_streams = self.expected_streams() - skipped_streams + + # # Grab connection, and run discovery and initial sync + # self.starter() + + # synced_records_1 = runner.get_records_from_target_output() + + # # Grab state to be updated later + # state = menagerie.get_state(self.conn_id) + + # # BUG full table streams are saving bookmarks unnecessarily https://jira.talendforge.org/browse/TDL-14343 + + # # BUG there are no activate version messages in the sheet_metadata, spreadsheet_metadata + # # or sheets_loaded streams, even though they are full table https://jira.talendforge.org/browse/TDL-14346 + # # verify message actions are correct + # for stream in self.expected_test_streams.difference({'sheet_metadata', 'spreadsheet_metadata', 'sheets_loaded'}): + # with self.subTest(stream=stream): + # sync1_message_actions = [message['action'] for message in synced_records_1[stream]['messages']] + # self.assertEqual('activate_version', sync1_message_actions[0]) + # self.assertEqual('activate_version', sync1_message_actions[-1]) + # self.assertSetEqual({'upsert'}, set(sync1_message_actions[1:-1])) + + # # run a sync again, this time we shouldn't get any records back + # sync_job_name = runner.run_sync_mode(self, self.conn_id) + # exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name) + # menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) + # record_count_by_stream_2 = runner.examine_target_output_file( + # self, self.conn_id, 
self.expected_streams(), self.expected_primary_keys()) + + # # verify we do not sync any unexpected streams + # self.assertSetEqual(set(), set(record_count_by_stream_2.keys())) + + # # verify no records were synced for our expected streams + # for stream in self.expected_test_streams: + # with self.subTest(stream=stream): + # self.assertEqual(0, record_count_by_stream_2.get(stream, 0)) + + # # roll back the state of the file_metadata stream to ensure that we sync sheets + # # based off of this state + # file_metadata_stream = 'file_metadata' + # file_metadata_bookmark = state['bookmarks'][file_metadata_stream] + # bookmark_datetime = datetime.datetime.strptime(file_metadata_bookmark, self.BOOKMARK_COMPARISON_FORMAT) + # target_datetime = bookmark_datetime + datetime.timedelta(days=-1) + # target_bookmark = datetime.datetime.strftime(target_datetime, self.BOOKMARK_COMPARISON_FORMAT) + + # new_state = copy.deepcopy(state) + # new_state['bookmarks'][file_metadata_stream] = target_bookmark + + # menagerie.set_state(self.conn_id, new_state) + + # record_count_by_stream_3 = self.run_and_verify_sync(self.conn_id) + # synced_records_3 = runner.get_records_from_target_output() + + # # verify we sync sheets based off the state of file_metadata + # self.assertDictEqual(self.record_count_by_stream_1, record_count_by_stream_3) def starter(self): """ From 9cfcd5e1c1d154fe997116b188f0a3bfb67e841b Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Thu, 5 Sep 2024 01:21:23 +0000 Subject: [PATCH 06/12] updated bookmarks test --- tap_google_sheets/sync.py | 3 +- tests/test_google_sheets_bookmarks.py | 117 +++++++++----------------- 2 files changed, 41 insertions(+), 79 deletions(-) diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index f89c1cd..7e3204f 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py @@ -1,6 +1,5 @@ import singer from tap_google_sheets.streams import STREAMS, SheetsLoadData, write_bookmark, 
strftime -from tap_google_sheets.client import GoogleForbiddenError LOGGER = singer.get_logger() @@ -62,4 +61,4 @@ def sync(client, config, catalog, state): else: stream_obj.sync(catalog, state, sheets_loaded_records) - LOGGER.info("FINISHED Syncing: %s", stream_name) \ No newline at end of file + LOGGER.info("FINISHED Syncing: %s", stream_name) diff --git a/tests/test_google_sheets_bookmarks.py b/tests/test_google_sheets_bookmarks.py index 484615f..11a526c 100644 --- a/tests/test_google_sheets_bookmarks.py +++ b/tests/test_google_sheets_bookmarks.py @@ -8,7 +8,7 @@ class BookmarksTest(GoogleSheetsBaseTest): """Ensure all sheets streams will replicate based off of the most recent bookmarked state for 'file_metadata'""" - + conn_id = "" expected_test_streams = "" record_count_by_stream_1 = "" @@ -17,76 +17,43 @@ class BookmarksTest(GoogleSheetsBaseTest): def name(): return "tap_tester_google_sheets_bookmarks" - # def test_run(self): - # """ - # Run check mode, perform table and field selection, and run a sync. - # Replication can be triggered by pushing back state to prior 'file_metadata' state. - # Run a second sync after not updating state to verify no streams are being synced - # Run a 3rd sync and ensure full table streams are triggered by the simulated bookmark value. 
- - # - Verify initial sync message actions include activate versions and the upserts - # - Verify no streams are synced when 'file_metadata' bookmark does not change - # - Verify that the third sync with the updated simulated bookmark has the same synced streams as the first sync - # - Verify that streams will sync based off of 'file_metadata' even when it is not selected - # """ - # skipped_streams = {stream - # for stream in self.expected_streams() - # if stream.startswith('sadsheet')} - # self.expected_test_streams = self.expected_streams() - skipped_streams - - # # Grab connection, and run discovery and initial sync - # self.starter() - - # synced_records_1 = runner.get_records_from_target_output() - - # # Grab state to be updated later - # state = menagerie.get_state(self.conn_id) - - # # BUG full table streams are saving bookmarks unnecessarily https://jira.talendforge.org/browse/TDL-14343 - - # # BUG there are no activate version messages in the sheet_metadata, spreadsheet_metadata - # # or sheets_loaded streams, even though they are full table https://jira.talendforge.org/browse/TDL-14346 - # # verify message actions are correct - # for stream in self.expected_test_streams.difference({'sheet_metadata', 'spreadsheet_metadata', 'sheets_loaded'}): - # with self.subTest(stream=stream): - # sync1_message_actions = [message['action'] for message in synced_records_1[stream]['messages']] - # self.assertEqual('activate_version', sync1_message_actions[0]) - # self.assertEqual('activate_version', sync1_message_actions[-1]) - # self.assertSetEqual({'upsert'}, set(sync1_message_actions[1:-1])) - - # # run a sync again, this time we shouldn't get any records back - # sync_job_name = runner.run_sync_mode(self, self.conn_id) - # exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name) - # menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) - # record_count_by_stream_2 = runner.examine_target_output_file( - # self, self.conn_id, 
self.expected_streams(), self.expected_primary_keys()) - - # # verify we do not sync any unexpected streams - # self.assertSetEqual(set(), set(record_count_by_stream_2.keys())) - - # # verify no records were synced for our expected streams - # for stream in self.expected_test_streams: - # with self.subTest(stream=stream): - # self.assertEqual(0, record_count_by_stream_2.get(stream, 0)) - - # # roll back the state of the file_metadata stream to ensure that we sync sheets - # # based off of this state - # file_metadata_stream = 'file_metadata' - # file_metadata_bookmark = state['bookmarks'][file_metadata_stream] - # bookmark_datetime = datetime.datetime.strptime(file_metadata_bookmark, self.BOOKMARK_COMPARISON_FORMAT) - # target_datetime = bookmark_datetime + datetime.timedelta(days=-1) - # target_bookmark = datetime.datetime.strftime(target_datetime, self.BOOKMARK_COMPARISON_FORMAT) - - # new_state = copy.deepcopy(state) - # new_state['bookmarks'][file_metadata_stream] = target_bookmark - - # menagerie.set_state(self.conn_id, new_state) - - # record_count_by_stream_3 = self.run_and_verify_sync(self.conn_id) - # synced_records_3 = runner.get_records_from_target_output() - - # # verify we sync sheets based off the state of file_metadata - # self.assertDictEqual(self.record_count_by_stream_1, record_count_by_stream_3) + def test_run(self): + """ + Run check mode, perform table and field selection, and run a sync. 
+ + - Verify initial sync message actions include activate versions and the upserts + """ + skipped_streams = {stream + for stream in self.expected_streams() + if stream.startswith('sadsheet')} + self.expected_test_streams = self.expected_streams() - skipped_streams + + # Grab connection, and run discovery and initial sync + self.starter() + + synced_records_1 = runner.get_records_from_target_output() + + # Grab state to be updated later + state = menagerie.get_state(self.conn_id) + + # BUG full table streams are saving bookmarks unnecessarily https://jira.talendforge.org/browse/TDL-14343 + + # BUG there are no activate version messages in the sheet_metadata, spreadsheet_metadata + # or sheets_loaded streams, even though they are full table https://jira.talendforge.org/browse/TDL-14346 + # verify message actions are correct + for stream in self.expected_test_streams.difference({'sheet_metadata', 'spreadsheet_metadata', 'sheets_loaded'}): + with self.subTest(stream=stream): + sync1_message_actions = [message['action'] for message in synced_records_1[stream]['messages']] + self.assertEqual('activate_version', sync1_message_actions[0]) + self.assertEqual('activate_version', sync1_message_actions[-1]) + self.assertSetEqual({'upsert'}, set(sync1_message_actions[1:-1])) + + # run a sync again, this time we shouldn't get any records back + sync_job_name = runner.run_sync_mode(self, self.conn_id) + exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name) + menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) + record_count_by_stream_2 = runner.examine_target_output_file( + self, self.conn_id, self.expected_streams(), self.expected_primary_keys()) def starter(self): """ @@ -100,7 +67,7 @@ def starter(self): ### Instantiate connection ########################################################################## self.conn_id = connections.ensure_connection(self) - + ########################################################################## ### 
Discovery without the backoff ########################################################################## @@ -116,7 +83,7 @@ def starter(self): self.assertSetEqual(self.expected_streams(), found_catalog_names, msg="discovered schemas do not match") LOGGER.info("discovered schemas are OK") - + # table and field selection test_catalogs = [catalog for catalog in found_catalogs if catalog.get('stream_name') in self.expected_test_streams] @@ -141,7 +108,3 @@ def starter(self): msg="failed to replicate any data: {}".format(self.record_count_by_stream_1) ) LOGGER.info("total replicated row count: %s", sum(self.record_count_by_stream_1.values())) - - - - From 447605dcc40b3538c083be87b3918b02215d0145 Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Thu, 5 Sep 2024 01:38:00 +0000 Subject: [PATCH 07/12] removed unused vars --- tap_google_sheets/sync.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tap_google_sheets/sync.py b/tap_google_sheets/sync.py index 7e3204f..561d9dc 100644 --- a/tap_google_sheets/sync.py +++ b/tap_google_sheets/sync.py @@ -13,8 +13,6 @@ def sync(client, config, catalog, state): "sheets_loaded" & "sheet_metadata" -> get the data lists from the "spreadsheet_metadata" stream and sync the records if selected """ last_stream = singer.get_currently_syncing(state) - # preset to none - file_modified_time = None LOGGER.info("last/currently syncing stream: %s", last_stream) selected_streams = [] From 5d7d3880ff564714032bf8752685dad1708676cf Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Thu, 5 Sep 2024 01:42:20 +0000 Subject: [PATCH 08/12] updated test --- tests/test_google_sheets_bookmarks.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_google_sheets_bookmarks.py b/tests/test_google_sheets_bookmarks.py index 11a526c..5110d22 100644 --- a/tests/test_google_sheets_bookmarks.py +++ b/tests/test_google_sheets_bookmarks.py @@ -48,13 +48,6 @@ def test_run(self): 
self.assertEqual('activate_version', sync1_message_actions[-1]) self.assertSetEqual({'upsert'}, set(sync1_message_actions[1:-1])) - # run a sync again, this time we shouldn't get any records back - sync_job_name = runner.run_sync_mode(self, self.conn_id) - exit_status = menagerie.get_exit_status(self.conn_id, sync_job_name) - menagerie.verify_sync_exit_status(self, exit_status, sync_job_name) - record_count_by_stream_2 = runner.examine_target_output_file( - self, self.conn_id, self.expected_streams(), self.expected_primary_keys()) - def starter(self): """ Instantiate connection, run discovery, and initial sync. From c5d5da8ffc24b6b424b7aae489e0347dbd568262 Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Thu, 5 Sep 2024 07:49:29 +0000 Subject: [PATCH 09/12] update test bookmark --- tests/test_google_sheets_bookmarks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_google_sheets_bookmarks.py b/tests/test_google_sheets_bookmarks.py index 5110d22..2ecd4f2 100644 --- a/tests/test_google_sheets_bookmarks.py +++ b/tests/test_google_sheets_bookmarks.py @@ -7,7 +7,7 @@ class BookmarksTest(GoogleSheetsBaseTest): - """Ensure all sheets streams will replicate based off of the most recent bookmarked state for 'file_metadata'""" + """Ensure all sheets streams will replicate full table """ conn_id = "" expected_test_streams = "" @@ -20,7 +20,6 @@ def name(): def test_run(self): """ Run check mode, perform table and field selection, and run a sync. 
- - Verify initial sync message actions include activate versions and the upserts """ skipped_streams = {stream @@ -47,6 +46,7 @@ def test_run(self): self.assertEqual('activate_version', sync1_message_actions[0]) self.assertEqual('activate_version', sync1_message_actions[-1]) self.assertSetEqual({'upsert'}, set(sync1_message_actions[1:-1])) + self.assertIn(stream, state["bookmark"].keys()) def starter(self): """ From 9626d4ea4544fabfe43ce552815c9521c9b6a0ad Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Thu, 5 Sep 2024 08:11:28 +0000 Subject: [PATCH 10/12] update key --- tests/test_google_sheets_bookmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_google_sheets_bookmarks.py b/tests/test_google_sheets_bookmarks.py index 2ecd4f2..1566a55 100644 --- a/tests/test_google_sheets_bookmarks.py +++ b/tests/test_google_sheets_bookmarks.py @@ -46,7 +46,7 @@ def test_run(self): self.assertEqual('activate_version', sync1_message_actions[0]) self.assertEqual('activate_version', sync1_message_actions[-1]) self.assertSetEqual({'upsert'}, set(sync1_message_actions[1:-1])) - self.assertIn(stream, state["bookmark"].keys()) + self.assertIn(stream, state["bookmarks"].keys()) def starter(self): """ From bb8e62d374c04a9d8cd3ba6963083d0b153e93e1 Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Thu, 5 Sep 2024 08:32:40 +0000 Subject: [PATCH 11/12] removed refereces to file metadata --- README.md | 7 +------ state.json.example | 5 +---- tap_google_sheets/streams.py | 1 - tests/base.py | 5 +---- tests/test_google_sheets_all_fields.py | 15 +-------------- tests/test_google_sheets_automatic_fields.py | 4 ---- tests/test_google_sheets_bookmarks.py | 1 + tests/test_google_sheets_discovery.py | 2 -- 8 files changed, 5 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 772c28d..8d79e24 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,6 @@ This tap: ## 
Authentication The [**Google Sheets Setup & Authentication**](https://drive.google.com/open?id=1FojlvtLwS0-BzGS37R0jEXtwSHqSiO1Uw-7RKQQO-C4) Google Doc provides instructions show how to configure the Google Cloud API credentials to enable Google Drive and Google Sheets APIs, configure Google Cloud to authorize/verify your domain ownership, generate an API key (client_id, client_secret), authenticate and generate a refresh_token, and prepare your tap config.json with the necessary parameters. -- Enable Googe Drive APIs and Authorization Scope: https://www.googleapis.com/auth/drive.metadata.readonly - Enable Google Sheets API and Authorization Scope: https://www.googleapis.com/auth/spreadsheets.readonly - Tap config.json parameters: - client_id: identifies your application @@ -122,10 +121,7 @@ The [**Google Sheets Setup & Authentication**](https://drive.google.com/open?id= ```json { - "currently_syncing": "file_metadata", - "bookmarks": { - "file_metadata": "2019-09-27T22:34:39.000000Z" - } + "currently_syncing": "sheet_metadata", } ``` @@ -185,7 +181,6 @@ The [**Google Sheets Setup & Authentication**](https://drive.google.com/open?id= +----------------------+---------+---------+ | stream | records | schemas | +----------------------+---------+---------+ - | file_metadata | 1 | 1 | | spreadsheet_metadata | 1 | 1 | | Test-1 | 9 | 1 | | Test 2 | 2 | 1 | diff --git a/state.json.example b/state.json.example index a5ccda7..8599a7f 100644 --- a/state.json.example +++ b/state.json.example @@ -1,6 +1,3 @@ { - "currently_syncing": "file_metadata", - "bookmarks": { - "file_metadata": "2019-09-27T22:34:39.000000Z" - } + "currently_syncing": "sheet_metadata" } diff --git a/tap_google_sheets/streams.py b/tap_google_sheets/streams.py index 69f1c83..75f3ad4 100644 --- a/tap_google_sheets/streams.py +++ b/tap_google_sheets/streams.py @@ -598,7 +598,6 @@ def sync(self, catalog, state, sheets_loaded_records): # create OrderDict, as the order matters for syncing the streams -# 
"file_metadata" -> do not sync other streams, if file is not changed # "spreadsheet_metadata" -> get sheets in the spreadsheet and load sheet's records # and prepare records for "sheet_metadata" and "sheets_loaded" streams STREAMS = OrderedDict() diff --git a/tests/base.py b/tests/base.py index e4f6c0f..0ad6ecf 100644 --- a/tests/base.py +++ b/tests/base.py @@ -290,9 +290,6 @@ def perform_and_verify_table_and_field_selection(self, # Verify only automatic fields are selected expected_automatic_fields = self.expected_automatic_fields().get(cat['stream_name']) selected_fields = self.get_selected_fields_from_metadata(catalog_entry['metadata']) - # BUG TDL-14241 | Replication keys are not automatic - if cat['stream_name'] == "file_metadata": - expected_automatic_fields.remove('modifiedTime') self.assertEqual(expected_automatic_fields, selected_fields) @staticmethod @@ -368,7 +365,7 @@ def timedelta_formatted(self, dtime, days=0): ########################################################################## def is_sheet(self, stream): - non_sheets_streams = {'sheet_metadata', 'file_metadata', 'sheets_loaded', 'spreadsheet_metadata'} + non_sheets_streams = {'sheet_metadata', 'sheets_loaded', 'spreadsheet_metadata'} return stream in self.expected_streams().difference(non_sheets_streams) def undiscoverable_sheets(self): diff --git a/tests/test_google_sheets_all_fields.py b/tests/test_google_sheets_all_fields.py index f2edde2..836dcb1 100644 --- a/tests/test_google_sheets_all_fields.py +++ b/tests/test_google_sheets_all_fields.py @@ -79,17 +79,4 @@ def test_run(self): # verify all fields for a stream were replicated self.assertGreater(len(expected_all_keys), len(expected_automatic_keys)) self.assertTrue(expected_automatic_keys.issubset(expected_all_keys), msg=f'{expected_automatic_keys-expected_all_keys} is not in "expected_all_keys"') - if stream == "file_metadata": - - # As per google documentation https://developers.google.com/drive/api/v3/reference/files `teamDriveId` is 
deprecated. There is mentioned that use `driveId` instead. - # `driveId` is populated from items in the team shared drives. But stitch integration does not support shared team drive. So replicating driveid is not possible. - # So, these two fields will not be synced. - expected_all_keys.remove('teamDriveId') - expected_all_keys.remove('driveId') - # Earlier field `emailAddress` was defined as `emailAddress`(typo mismatch) in file_metadata.json. - # So, this particular field did not collected. Because API response contain `emailAddress` field. - # Now, typo has been corrected and verifying that `emailAddress` field collected. - lastModifyingUser_fields = set(messages['messages'][0].get('data', {}).get('lastModifyingUser', {}).keys()) # Get `lastModifyingUser` from file_metadata records - # Verify that `emailAddress` field under `lastModifyingUser` collected. - self.assertTrue({'emailAddress'}.issubset(lastModifyingUser_fields), msg="emailAddress does not found in lastModifyingUser") - self.assertSetEqual(expected_all_keys, actual_all_keys) + self.assertSetEqual(expected_all_keys, actual_all_keys) diff --git a/tests/test_google_sheets_automatic_fields.py b/tests/test_google_sheets_automatic_fields.py index dcb6f9c..e3de71a 100644 --- a/tests/test_google_sheets_automatic_fields.py +++ b/tests/test_google_sheets_automatic_fields.py @@ -58,9 +58,5 @@ def test_run(self): # Verify that you get some records for each stream self.assertGreater(record_count_by_stream.get(stream, -1), 0) - # Verify that only the automatic fields are sent to the target - # BUG TDL-14241 | Replication keys are not automatic - if stream == "file_metadata": - expected_keys.remove('modifiedTime') for actual_keys in record_messages_keys: self.assertSetEqual(expected_keys, actual_keys) diff --git a/tests/test_google_sheets_bookmarks.py b/tests/test_google_sheets_bookmarks.py index 1566a55..5574e12 100644 --- a/tests/test_google_sheets_bookmarks.py +++ b/tests/test_google_sheets_bookmarks.py @@ -21,6 
+21,7 @@ def test_run(self): """ Run check mode, perform table and field selection, and run a sync. - Verify initial sync message actions include activate versions and the upserts + - Verify that the state bookmarks include an entry for each selected stream """ skipped_streams = {stream for stream in self.expected_streams() diff --git a/tests/test_google_sheets_discovery.py b/tests/test_google_sheets_discovery.py index 3f6b6dc..e58c92c 100644 --- a/tests/test_google_sheets_discovery.py +++ b/tests/test_google_sheets_discovery.py @@ -123,8 +123,6 @@ def test_run(self): # verify that primary keys and replication keys # are given the inclusion of automatic in metadata. # BUG TDL-14241 | Replication keys are not automatic - if stream == 'file_metadata': - expected_automatic_fields.remove('modifiedTime') self.assertSetEqual(expected_automatic_fields, actual_automatic_fields) # verify missing values where __sdc_row = 2 From 583a79795b51ca1aac15b51aaab4f5c353f95bba Mon Sep 17 00:00:00 2001 From: Vi6hal <20889199+Vi6hal@users.noreply.github.com> Date: Thu, 5 Sep 2024 08:53:20 +0000 Subject: [PATCH 12/12] updated bookmark test docs --- tests/test_google_sheets_bookmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_google_sheets_bookmarks.py b/tests/test_google_sheets_bookmarks.py index 5574e12..e0d8cda 100644 --- a/tests/test_google_sheets_bookmarks.py +++ b/tests/test_google_sheets_bookmarks.py @@ -7,7 +7,7 @@ class BookmarksTest(GoogleSheetsBaseTest): - """Ensure all sheets streams will replicate full table """ + """Ensure all sheets streams will replicate in full table mode and create appropriate bookmarks""" conn_id = "" expected_test_streams = ""