Skip to content

Commit

Permalink
Fix duplicated column in feature type detector. Fix docs
Browse files: browse the repository at this point in the history
  • Loading branch information
ThomasMeissnerDS committed Aug 15, 2024
1 parent 953c07b commit 45efab4
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 9 deletions.
4 changes: 2 additions & 2 deletions bluecast/eda/analyse.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def univariate_plots(df: pd.DataFrame) -> None:
Expects numeric columns only.
"""
for col in df.columns:
for col in df.columns.to_list():
plt.figure(figsize=(8, 4))

# Histogram
Expand Down Expand Up @@ -661,7 +661,7 @@ def plot_null_percentage(dataframe: pd.DataFrame) -> None:


def check_unique_values(
df: pd.DataFrame, columns: List[Union[str, int, float]], threshold: float
df: pd.DataFrame, columns: List[Union[str, int, float]], threshold: float = 0.9
) -> List[Union[str, int, float]]:
"""
Check if the columns have an amount of unique values that is almost the number of total rows (being above the defined threshold)
Expand Down
15 changes: 10 additions & 5 deletions bluecast/preprocessing/feature_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,8 @@ def identify_num_columns(self, df: pd.DataFrame) -> pd.DataFrame:
for vartype in self.num_dtypes:
num_cols = df.select_dtypes(include=[vartype]).columns
for col in num_cols:
num_col_list.append(col)
if col not in num_col_list:
num_col_list.append(col)

for col in df.columns.to_list():
max_length = df[col].astype(str).str.len().max()
Expand All @@ -161,14 +162,16 @@ def identify_num_columns(self, df: pd.DataFrame) -> pd.DataFrame:
if self.check_if_column_is_float_from_string(df[col]):
df[col] = df[col].astype(float)
self.detected_col_types[col] = "float"
num_col_list.append(col)
if col not in num_col_list:
num_col_list.append(col)
elif (
self.check_if_column_is_int_from_string(df[col])
and df[col].nunique() > 2
):
df[col] = df[col].astype("Int64")
self.detected_col_types[col] = "Int64"
num_col_list.append(col)
if col not in num_col_list:
num_col_list.append(col)
except Exception:
pass
self.num_columns = num_col_list
Expand Down Expand Up @@ -248,13 +251,15 @@ def cast_rest_columns_to_object(
if col in self.cat_columns:
df[col] = df[col].astype(str)
self.detected_col_types[col] = "object"
cat_columns.append(col)
if col not in cat_columns:
cat_columns.append(col)
if col in self.num_columns:
pass
else:
df[col] = df[col].astype(str)
self.detected_col_types[col] = "object"
cat_columns.append(col)
if col not in cat_columns:
cat_columns.append(col)
self.cat_columns = cat_columns
return df

Expand Down
4 changes: 2 additions & 2 deletions docs/source/EDA.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ from bluecast.eda.analyse import (
plot_tsne,
check_unique_values,
plot_null_percentage,
mutual_info_to_target.
mutual_info_to_target,
plot_pie_chart,
)

Expand All @@ -69,7 +69,7 @@ feat_type_detector = FeatureTypeDetector()
train_data = feat_type_detector.fit_transform_feature_types(train_data)

# detect columns with a very high share of unique values
many_unique_cols = check_unique_values(train_data, feat_type_detector.cat_columns)
many_unique_cols = check_unique_values(train_data, train_data.columns.to_list())
```
## Pie chart
Expand Down

0 comments on commit 45efab4

Please sign in to comment.