-
Notifications
You must be signed in to change notification settings - Fork 0
/
df_analysis.py
122 lines (103 loc) · 4.18 KB
/
df_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Define Base URL for API
BASE_URL = "http://127.0.0.1:8000/api"
def fetch_data(endpoint: str):
"""
Fetches data from the given API endpoint and returns it as a DataFrame.
"""
try:
response = requests.get(f"{BASE_URL}/{endpoint}")
response.raise_for_status()
data = response.json()
print(f"Fetched data from {endpoint}: {data}")
records = [
{**item['fields'], 'id': item['pk']} for item in data[endpoint]
]
return pd.DataFrame(records)
except requests.exceptions.RequestException as e:
print(f"Error fetching data from API: {e}")
return pd.DataFrame()
except KeyError as e:
print(f"Key error: {e} - Check the API response structure.")
return pd.DataFrame()
def clean_data(df: pd.DataFrame):
"""
Fills or drops missing values in the DataFrame.
"""
for column in df.columns:
if df[column].dtype == 'object':
df[column] = df[column].fillna('Unknown')
else:
df[column] = df[column].fillna(0)
return df
def assign_random_ids(df: pd.DataFrame, column: str, source_ids: list):
"""
Assigns random IDs to a column in a DataFrame where the column has null values.
"""
null_count = df[column].isnull().sum()
if null_count > 0:
random_ids = np.random.choice(source_ids, size=null_count, replace=True)
df.loc[df[column].isnull(), column] = random_ids
return df
def export_dataframes(dataframes: dict):
"""
Exports a dictionary of DataFrames to CSV files.
"""
for name, df in dataframes.items():
filename = f"{name}.csv"
df.to_csv(filename, index=False)
print(f"Exported {name} to {filename}")
# 2. Fetch Data
properties_df = fetch_data("properties")
tenants_df = fetch_data("tenants")
# 3. Preprocess and Clean Data
# Handle empty or missing data
if properties_df.empty:
print("Error: Properties DataFrame is empty. Check the API or database.")
properties_df = pd.DataFrame(columns=["id", "property_name", "address", "owner", "rent_amount", "monthly_expenses", "status", "tenant_id"])
if tenants_df.empty:
print("Warning: Tenants DataFrame is empty. Assigning placeholder values.")
tenants_df = pd.DataFrame(columns=["id", "name", "email", "phone_number"])
if "tenant_id" not in properties_df.columns:
properties_df["tenant_id"] = np.nan
if not tenants_df.empty and "id" in tenants_df.columns:
properties_df = assign_random_ids(properties_df, "tenant_id", tenants_df["id"].tolist())
properties_df["tenant_id"] = properties_df["tenant_id"].astype(int)
else:
print("Warning: No valid tenant IDs to assign to properties.")
properties_df = clean_data(properties_df)
tenants_df = clean_data(tenants_df)
merged_df = pd.merge(properties_df, tenants_df, left_on="tenant_id", right_on="id", how="left")
merged_df["rent_amount"] = pd.to_numeric(merged_df["rent_amount"], errors='coerce')
merged_df["monthly_expenses"] = pd.to_numeric(merged_df["monthly_expenses"], errors='coerce')
merged_df["net_income"] = merged_df["rent_amount"] - merged_df["monthly_expenses"]
merged_df["rent_expense_ratio"] = merged_df["monthly_expenses"] / (merged_df["rent_amount"] + 1e-9)
def compute_and_plot_statistics(df: pd.DataFrame, column: str):
"""
Computes statistics for a specific column and plots its histogram and correlation matrix.
"""
print(f"Statistics for {column}:")
print(df[column].describe())
plt.figure(figsize=(8, 4))
sns.histplot(df[column], bins=20, kde=True)
plt.title(f"Distribution of {column}")
plt.xlabel(column)
plt.ylabel("Frequency")
plt.show()
numerical_cols = df.select_dtypes(include=[np.number])
correlation_matrix = numerical_cols.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Matrix")
plt.show()
compute_and_plot_statistics(merged_df, "net_income")
export_dataframes({
"properties_data": properties_df,
"tenants_data": tenants_df,
"merged_data": merged_df
})
print("Script completed successfully.")