From 2e83f76ae3a95215fd50d66278ab6467dd4e462f Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Tue, 4 Jun 2024 20:00:47 +0200 Subject: [PATCH] Export to parquet instead of CSV --- df_to_azure/adf.py | 13 +++---------- df_to_azure/export.py | 4 ++-- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/df_to_azure/adf.py b/df_to_azure/adf.py index b1c08c0..884dbd7 100644 --- a/df_to_azure/adf.py +++ b/df_to_azure/adf.py @@ -19,6 +19,7 @@ Factory, LinkedServiceReference, LinkedServiceResource, + ParquetFormat, PipelineResource, SecureString, SqlServerStoredProcedureActivity, @@ -177,16 +178,8 @@ def create_input_blob(self): ds_azure_blob = AzureBlobDataset( linked_service_name=ds_ls, folder_path=f"dftoazure/{self.table_name}", - file_name=f"{self.table_name}.csv", - format={ - "type": "TextFormat", - "columnDelimiter": "^", - "rowDelimiter": "\n", - "treatEmptyAsNull": "true", - "skipLineCount": 0, - "firstRowAsHeader": "true", - "quoteChar": '"', - }, + file_name=f"{self.table_name}.parquet", # Changed to parquet + format=ParquetFormat(), # Changed format to ParquetFormat ) ds_azure_blob = DatasetResource(properties=ds_azure_blob) self.adf_client.datasets.create_or_update(self.rg_name, self.df_name, ds_name, ds_azure_blob) diff --git a/df_to_azure/export.py b/df_to_azure/export.py index 8805b8d..2ac478d 100644 --- a/df_to_azure/export.py +++ b/df_to_azure/export.py @@ -183,10 +183,10 @@ def upload_to_blob(self): blob_client = self.blob_service_client() blob_client = blob_client.get_blob_client( container="dftoazure", - blob=f"{self.table_name}/{self.table_name}.csv", + blob=f"{self.table_name}/{self.table_name}.parquet", ) - data = self.df.to_csv(index=False, sep="^", quotechar='"', lineterminator="\n") + data = self.df.to_parquet(index=False) blob_client.upload_blob(data, overwrite=True) def create_schema(self):