From a17b4df4a5874a9abaa99992d1ad61f7d916d427 Mon Sep 17 00:00:00 2001 From: Menglin Wang Date: Wed, 15 Nov 2023 17:24:30 -0800 Subject: [PATCH] resolved conflict with dev, added more tests --- dataprofiler/data_readers/data_utils.py | 2 +- .../tests/data_readers/test_parquet_data.py | 38 +++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index 79b5b531..d255f3c3 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -1,8 +1,8 @@ """Contains functions for data readers.""" import json -import random import logging import os +import random import re import urllib from collections import OrderedDict diff --git a/dataprofiler/tests/data_readers/test_parquet_data.py b/dataprofiler/tests/data_readers/test_parquet_data.py index fdabd413..2c885882 100644 --- a/dataprofiler/tests/data_readers/test_parquet_data.py +++ b/dataprofiler/tests/data_readers/test_parquet_data.py @@ -102,6 +102,16 @@ def test_specifying_data_type(self): input_data_obj = Data(input_file["path"], data_type="parquet") self.assertEqual(input_data_obj.data_type, "parquet") + def test_specifying_data_type_when_sampled(self): + """ + Determine if the parquet file can be loaded with manual data_type setting when sampled + """ + for input_file in self.file_or_buf_list: + input_data_obj = Data( + input_file["path"], data_type="parquet", options={"sample_nrows": 100} + ) + self.assertEqual(input_data_obj.data_type, "parquet") + def test_reload_data(self): """ Determine if the parquet file can be reloaded @@ -112,6 +122,16 @@ def test_reload_data(self): self.assertEqual(input_data_obj.data_type, "parquet") self.assertEqual(input_file["path"], input_data_obj.input_file_path) + def test_reload_data_when_sampled(self): + """ + Determine if the parquet file can be reloaded when sampled + """ + for input_file in self.file_or_buf_list: + input_data_obj = Data(input_file["path"], options={"sample_nrows": 100}) + input_data_obj.reload(input_file["path"], options={"sample_nrows": 100}) + self.assertEqual(input_data_obj.data_type, "parquet") + self.assertEqual(input_file["path"], input_data_obj.input_file_path) + def test_data_formats(self): """ Determine if the parquet file data_formats can be used @@ -130,6 +150,24 @@ def test_data_formats(self): self.assertIsInstance(data, list) self.assertIsInstance(data[0], str) + def test_data_formats_when_sampled(self): + """ + Determine if the parquet file data_formats can be used when sampled + """ + for input_file in self.file_or_buf_list: + input_data_obj = Data(input_file["path"], options={"sample_nrows": 100}) + for data_format in list(input_data_obj._data_formats.keys()): + input_data_obj.data_format = data_format + self.assertEqual(input_data_obj.data_format, data_format) + data = input_data_obj.data + if data_format == "dataframe": + import pandas as pd + + self.assertIsInstance(data, pd.DataFrame) + elif data_format in ["records", "json"]: + self.assertIsInstance(data, list) + self.assertIsInstance(data[0], str) + def test_mixed_string_col(self): """ Determine if parquet can handle mixed string column types.