Skip to content

Commit

Permalink
resolved conflict with dev, added more tests
Browse files: browse the repository at this point in the history
  • Loading branch information
menglinw committed Nov 16, 2023
1 parent 09a868d commit a17b4df
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 1 deletion.
2 changes: 1 addition & 1 deletion dataprofiler/data_readers/data_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""Contains functions for data readers."""
import json
import random
import logging
import os
import random
import re
import urllib
from collections import OrderedDict
Expand Down
38 changes: 38 additions & 0 deletions dataprofiler/tests/data_readers/test_parquet_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,16 @@ def test_specifying_data_type(self):
input_data_obj = Data(input_file["path"], data_type="parquet")
self.assertEqual(input_data_obj.data_type, "parquet")

def test_specifying_data_type_when_sampled(self):
    """
    Check that a sampled parquet load keeps a manually specified data_type
    """
    for file_entry in self.file_or_buf_list:
        parquet_path = file_entry["path"]
        # Request the reader explicitly while asking for a 100-row sample.
        sampled = Data(
            parquet_path, data_type="parquet", options={"sample_nrows": 100}
        )
        self.assertEqual(sampled.data_type, "parquet")

def test_reload_data(self):
"""
Determine if the parquet file can be reloaded
Expand All @@ -112,6 +122,16 @@ def test_reload_data(self):
self.assertEqual(input_data_obj.data_type, "parquet")
self.assertEqual(input_file["path"], input_data_obj.input_file_path)

def test_reload_data_when_sampled(self):
    """
    Check that a parquet file loaded with sampling can be reloaded with sampling
    """
    for file_entry in self.file_or_buf_list:
        parquet_path = file_entry["path"]
        sampled = Data(parquet_path, options={"sample_nrows": 100})
        # Reload from the same path, again passing the sampling option.
        sampled.reload(parquet_path, options={"sample_nrows": 100})
        self.assertEqual(sampled.data_type, "parquet")
        self.assertEqual(parquet_path, sampled.input_file_path)

def test_data_formats(self):
"""
Determine if the parquet file data_formats can be used
Expand All @@ -130,6 +150,24 @@ def test_data_formats(self):
self.assertIsInstance(data, list)
self.assertIsInstance(data[0], str)

def test_data_formats_when_sampled(self):
    """
    Check that every registered data_format can be selected on sampled parquet data
    """
    for file_entry in self.file_or_buf_list:
        sampled = Data(file_entry["path"], options={"sample_nrows": 100})
        # Snapshot the format names before switching formats on the object.
        format_names = tuple(sampled._data_formats.keys())
        for fmt in format_names:
            sampled.data_format = fmt
            self.assertEqual(sampled.data_format, fmt)
            loaded = sampled.data

            if fmt == "dataframe":
                import pandas as pd

                self.assertIsInstance(loaded, pd.DataFrame)
                continue

            if fmt in ("records", "json"):
                # These formats are expected to yield a list of strings.
                self.assertIsInstance(loaded, list)
                self.assertIsInstance(loaded[0], str)

def test_mixed_string_col(self):
"""
Determine if parquet can handle mixed string column types.
Expand Down

0 comments on commit a17b4df

Please sign in to comment.