diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py
index 79b5b531..d255f3c3 100644
--- a/dataprofiler/data_readers/data_utils.py
+++ b/dataprofiler/data_readers/data_utils.py
@@ -1,8 +1,8 @@
 """Contains functions for data readers."""
 import json
-import random
 import logging
 import os
+import random
 import re
 import urllib
 from collections import OrderedDict
diff --git a/dataprofiler/tests/data_readers/test_parquet_data.py b/dataprofiler/tests/data_readers/test_parquet_data.py
index fdabd413..2c885882 100644
--- a/dataprofiler/tests/data_readers/test_parquet_data.py
+++ b/dataprofiler/tests/data_readers/test_parquet_data.py
@@ -102,6 +102,16 @@ def test_specifying_data_type(self):
             input_data_obj = Data(input_file["path"], data_type="parquet")
             self.assertEqual(input_data_obj.data_type, "parquet")
 
+    def test_specifying_data_type_when_sampled(self):
+        """
+        Determine if the parquet file can be loaded with manual data_type setting when sampled
+        """
+        for input_file in self.file_or_buf_list:
+            input_data_obj = Data(
+                input_file["path"], data_type="parquet", options={"sample_nrows": 100}
+            )
+            self.assertEqual(input_data_obj.data_type, "parquet")
+
     def test_reload_data(self):
         """
         Determine if the parquet file can be reloaded
@@ -112,6 +122,16 @@ def test_reload_data(self):
             self.assertEqual(input_data_obj.data_type, "parquet")
             self.assertEqual(input_file["path"], input_data_obj.input_file_path)
 
+    def test_reload_data_when_sampled(self):
+        """
+        Determine if the parquet file can be reloaded when sampled
+        """
+        for input_file in self.file_or_buf_list:
+            input_data_obj = Data(input_file["path"], options={"sample_nrows": 100})
+            input_data_obj.reload(input_file["path"], options={"sample_nrows": 100})
+            self.assertEqual(input_data_obj.data_type, "parquet")
+            self.assertEqual(input_file["path"], input_data_obj.input_file_path)
+
     def test_data_formats(self):
         """
         Determine if the parquet file data_formats can be used
@@ -130,6 +150,24 @@ def test_data_formats(self):
                     self.assertIsInstance(data, list)
                     self.assertIsInstance(data[0], str)
 
+    def test_data_formats_when_sampled(self):
+        """
+        Determine if the parquet file data_formats can be used when sampled
+        """
+        for input_file in self.file_or_buf_list:
+            input_data_obj = Data(input_file["path"], options={"sample_nrows": 100})
+            for data_format in list(input_data_obj._data_formats.keys()):
+                input_data_obj.data_format = data_format
+                self.assertEqual(input_data_obj.data_format, data_format)
+                data = input_data_obj.data
+                if data_format == "dataframe":
+                    import pandas as pd
+
+                    self.assertIsInstance(data, pd.DataFrame)
+                elif data_format in ["records", "json"]:
+                    self.assertIsInstance(data, list)
+                    self.assertIsInstance(data[0], str)
+
     def test_mixed_string_col(self):
         """
         Determine if parquet can handle mixed string column types.