Skip to content

Commit

Permalink
TDL-19253 Crest work (#81)
Browse files Browse the repository at this point in the history
* TDL-7096 used effective format to check the type instead of effective value (#76)

* TDL-7096 used eff format to check the type instead of eff value

* added integration test for the updated code

* added new sheet in schema

* TDL-18745 changed to formatted, and made 2 API calls for datetime values (#75)

* TDL-18745 changed to formatted, and made 2 API calls for datetime values

* added cftime in setup.py

* added extra check for currency

* added unittests for currency format and fixed cci

* added comments back

* resolved PR comments

* transform into datetime, if error, return string values

* added unittests

* resolved PR comments

* resolved review comments

* added sadsheet-date in sync

* added sadsheet-effective-format to base.py for schema

* added new test sheet

* added sheet to skip

* fixed unittests

* TDL-14454 Changed multipleof to singer.decimal (#74)

* changed multipleof to singer.decimal

* fixed cci issues

* fixed cci failures

* converted int to str list

* fixed circlci

* removed unused import

* resolved comments

* added sheets in base

* skipped streams due to errors

* added test-sheet-number in base

* TDL-14448 Numbertype falls to string for boolean values (#73)

* initial commit

* unittests for number transform

* integration test update

* deleted test.py

* resolve PR comments

* resolved cci failures

* changed currency to string as it would now come as a string
  • Loading branch information
namrata270998 authored May 31, 2022
1 parent 15e9b4e commit 5df2c75
Show file tree
Hide file tree
Showing 17 changed files with 760 additions and 86 deletions.
3 changes: 2 additions & 1 deletion tap_google_sheets/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,8 @@ def get_access_token(self):
@backoff.on_exception(backoff.expo,
(Server5xxError, ConnectionError, Server429Error),
max_tries=7,
factor=3)
factor=3,
jitter=None)
@utils.ratelimit(100, 100)
def request(self, method, path=None, url=None, api=None, **kwargs):
self.get_access_token()
Expand Down
30 changes: 17 additions & 13 deletions tap_google_sheets/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,10 +96,13 @@ def get_sheet_schema_columns(sheet):

col_val = None
if column_effective_value == {}:
column_effective_value_type = 'stringValue'
LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
sheet_title, column_name, column_letter))
LOGGER.info(' Setting column datatype to STRING')
if ("numberFormat" in first_value.get('effectiveFormat', {})):
column_effective_value_type = "numberValue"
else:
column_effective_value_type = 'stringValue'
LOGGER.info('WARNING: NO VALUE IN 2ND ROW FOR HEADER. SHEET: {}, COL: {}, CELL: {}2.'.format(
sheet_title, column_name, column_letter))
LOGGER.info(' Setting column datatype to STRING')
else:
for key, val in column_effective_value.items():
if key in ('numberValue', 'stringValue', 'boolValue'):
Expand All @@ -125,13 +128,8 @@ def get_sheet_schema_columns(sheet):
# https://developers.google.com/sheets/api/reference/rest/v4/spreadsheets/cells#NumberFormatType
#
column_format = None # Default
if column_effective_value == {}:
col_properties = {'type': ['null', 'string']}
column_gs_type = 'stringValue'
LOGGER.info('WARNING: 2ND ROW VALUE IS BLANK: SHEET: {}, COL: {}, CELL: {}2'.format(
sheet_title, column_name, column_letter))
LOGGER.info(' Setting column datatype to STRING')
elif column_effective_value_type == 'stringValue':

if column_effective_value_type == 'stringValue':
col_properties = {'type': ['null', 'string']}
column_gs_type = 'stringValue'
elif column_effective_value_type == 'boolValue':
Expand Down Expand Up @@ -159,10 +157,16 @@ def get_sheet_schema_columns(sheet):
elif column_number_format_type == 'TEXT':
col_properties = {'type': ['null', 'string']}
column_gs_type = 'stringValue'
elif column_number_format_type == 'CURRENCY':
col_properties = {'type': ['null', 'string']}
column_gs_type = 'stringValue'
else:
# Interesting - order in the anyOf makes a difference.
# Number w/ multipleOf must be listed last, otherwise errors occur.
col_properties = {'type': 'number', 'multipleOf': 1e-15}
# Number w/ singer.decimal must be listed last, otherwise errors occur.
col_properties = {
'type': ['null', 'string'],
'format': 'singer.decimal'
}
column_gs_type = 'numberType'
# Catch-all to deal with other types and set to string
# column_effective_value_type: formulaValue, errorValue, or other
Expand Down
122 changes: 115 additions & 7 deletions tap_google_sheets/streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
import os
import time
import re
import simplejson as json
from collections import OrderedDict
import urllib.parse
import singer
from singer import metrics, metadata, Transformer, utils
import decimal
from singer import metrics, metadata, Transformer, utils, messages
from singer.utils import strptime_to_utc, strftime
from singer.messages import RecordMessage
from singer.transform import SchemaKey
import tap_google_sheets.transform as internal_transform
import tap_google_sheets.schema as schema

Expand Down Expand Up @@ -105,6 +108,14 @@ def get_selected_fields(catalog, stream_name):
pass
return selected_fields

def new_format_message(message):
    """Serialize a singer message to a JSON string, keeping non-ASCII characters.

    Used in place of the ``format_message`` function of singer's ``messages``
    module so that ``ensure_ascii`` is disabled; otherwise currency symbols
    were written out as ASCII values. ``use_decimal=True`` is passed through
    to the JSON encoder as well.
    """
    payload = message.asdict()
    return json.dumps(payload, ensure_ascii=False, use_decimal=True)

# Currency symbols were being written as ASCII values when records were emitted,
# so replace format_message in singer's messages module with new_format_message,
# which serializes with ensure_ascii=False.
messages.format_message = new_format_message

class GoogleSheets:
stream_name = None
api = None
Expand Down Expand Up @@ -342,17 +353,101 @@ def sync(self, catalog, state, spreadsheet_metadata, time_extracted):
# Sync spreadsheet_metadata if selected
self.sync_stream(spreadsheet_metadata_transformed, catalog, time_extracted)

def new_transform(self, data, typ, schema, path):
    """Coerce ``data`` to the schema type ``typ``; replaces ``Transformer._transform``.

    The function is assigned onto singer's ``Transformer`` class, so ``self``
    is the transformer instance and must provide ``pre_hook``,
    ``_transform_datetime``, ``_transform_object`` and ``_transform_array``.

    Args:
        self: The transformer instance the function is bound to.
        data: The raw value to convert.
        typ: The schema type name being attempted ("null", "string",
            "integer", "number", "boolean", "object", "array").
        schema: The schema dict for this value; its "format", "properties",
            pattern-properties and "items" entries are consulted.
        path: Passed through unchanged to the object/array helpers.

    Returns:
        For scalar types, a ``(success, value)`` tuple where ``value`` is
        ``None`` whenever ``success`` is ``False``. For "object" and "array"
        the result of the corresponding helper is returned as-is.

    Conversion failures are reported as ``(False, None)``. Only ``Exception``
    subclasses are treated as conversion failures; ``KeyboardInterrupt``,
    ``SystemExit`` and other ``BaseException`` subclasses propagate instead of
    being silently swallowed.
    """
    if self.pre_hook:
        data = self.pre_hook(data, typ, schema)

    if typ == "null":
        # An empty string is treated the same as a missing value.
        if data is None or data == "":
            return True, None
        return False, None

    # The schema "format" takes precedence over the plain type name.
    schema_format = schema.get("format")

    if schema_format == "date-time":
        data = self._transform_datetime(data)
        if data is None:
            return False, None
        return True, data

    if schema_format == "singer.decimal":
        if data is None:
            return False, None

        if isinstance(data, (str, float, int)):
            try:
                # Go through str() so floats are not expanded to their
                # full binary representation by Decimal.
                return True, str(decimal.Decimal(str(data)))
            except Exception:
                return False, None

        if isinstance(data, decimal.Decimal):
            try:
                # Signaling NaN is emitted as a plain 'NaN' string.
                return True, ('NaN' if data.is_snan() else str(data))
            except Exception:
                return False, None

        return False, None

    if typ == "object":
        # Objects do not necessarily specify properties
        return self._transform_object(data,
                                      schema.get("properties", {}),
                                      path,
                                      schema.get(SchemaKey.pattern_properties))

    if typ == "array":
        return self._transform_array(data, schema["items"], path)

    if typ == "string":
        if data is None:
            return False, None
        try:
            return True, str(data)
        except Exception:
            return False, None

    if typ == "integer":
        # Strip thousands separators from string input before parsing.
        if isinstance(data, str):
            data = data.replace(",", "")
        try:
            return True, int(data)
        except Exception:
            return False, None

    if typ == "number":
        # Strip thousands separators from string input before parsing.
        if isinstance(data, str):
            data = data.replace(",", "")
        try:
            return True, float(data)
        except Exception:
            return False, None

    if typ == "boolean":
        # Return the data as a string itself if the value is of type string,
        # rather than collapsing every non-empty string to True.
        if isinstance(data, str):
            return True, data
        try:
            return True, bool(data)
        except Exception:
            return False, None

    # Unknown type name: nothing to convert to.
    return False, None

# Replace the _transform method of singer's Transformer class with new_transform
# so that boolean values are cast differently: string input is returned as-is
# instead of being converted with bool().
Transformer._transform = new_transform

class SheetsLoadData(GoogleSheets):
api = "sheets"
path = "spreadsheets/{spreadsheet_id}/values/'{sheet_title}'!{range_rows}"
data_key = "values"
key_properties = ["spreadsheetId", "sheetId", "loadDate"]
replication_method = "FULL_TABLE"
params = {
"dateTimeRenderOption": "SERIAL_NUMBER",
"valueRenderOption": "UNFORMATTED_VALUE",
"majorDimension": "ROWS"
}
params = {}

def load_data(self, catalog, state, selected_streams, sheets, spreadsheet_time_extracted):
"""
Expand Down Expand Up @@ -427,10 +522,22 @@ def load_data(self, catalog, state, selected_streams, sheets, spreadsheet_time_e
while not is_last_row and from_row < sheet_max_row and to_row <= sheet_max_row:
range_rows = 'A{}:{}{}'.format(from_row, sheet_last_col_letter, to_row)

self.params = {
"dateTimeRenderOption": "SERIAL_NUMBER",
"valueRenderOption": "FORMATTED_VALUE",
"majorDimension": "ROWS"
}
# GET sheet_data for a worksheet tab
sheet_data, time_extracted = self.get_data(stream_name=sheet_title, range_rows=range_rows)
# Data is returned as a list of arrays, an array of values for each row
sheet_data_rows = sheet_data.get('values', [])
self.params = {
"dateTimeRenderOption": "SERIAL_NUMBER",
"valueRenderOption": "UNFORMATTED_VALUE",
"majorDimension": "ROWS"
}
unformatted_sheet_data, _ = self.get_data(stream_name=sheet_title, range_rows=range_rows)
unformatted_sheet_data_rows = unformatted_sheet_data.get('values', [])

# Transform batch of rows to JSON with keys for each column
sheet_data_transformed, row_num = internal_transform.transform_sheet_data(
Expand All @@ -439,7 +546,8 @@ def load_data(self, catalog, state, selected_streams, sheets, spreadsheet_time_e
sheet_title=sheet_title,
from_row=from_row,
columns=columns,
sheet_data_rows=sheet_data_rows)
sheet_data_rows=sheet_data_rows,
unformatted_rows = unformatted_sheet_data_rows)

# Here row_num is the addition of from_row and total records get in response(per batch).
# Condition row_num < to_row was checking that if records on the current page are less than expected(to_row) or not.
Expand Down
Loading

0 comments on commit 5df2c75

Please sign in to comment.