Skip to content

Commit

Permalink
change ec2
Browse files Browse the repository at this point in the history
  • Loading branch information
pohaoc2 committed Feb 9, 2025
1 parent ad06048 commit adbc8de
Show file tree
Hide file tree
Showing 4 changed files with 5,615 additions and 31 deletions.
9 changes: 6 additions & 3 deletions sandbox/src/approximate_bayesian.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ def main():
_, bins, patch = ax[0].hist(y_sims, bins=20)
ax[0].set_title("Prior - Activity")
ax[0].set_xlim([-1, 1])

ax[0].set_xlabel("Activity")
ax[0].set_ylabel("Number of samples")
y_obs = 0.25
print(f"Number of samples: {len(data)}")
epsilon = 0.25
Expand All @@ -61,15 +62,17 @@ def main():
print(f"Number of accepted samples: {len(posterior_samples)}")
# Plot the accepted samples
ax[1].hist(posterior_samples, bins=bins)
ax[1].set_title("Posterior - Activity")
ax[1].set_title("Posterior - Activity (ABC)")
ax[1].axvline(x=y_obs, color="red", linestyle="--", label="Observed")
# Plot the epsilon bounds around the observed value
ax[1].axvline(x=y_obs + epsilon, color="black", linestyle="--", label="Epsilon")
ax[1].axvline(x=y_obs - epsilon, color="black", linestyle="--")
ax[1].legend()
ax[1].set_xlim([-1, 1])
ax[1].set_xlabel("Activity")

plt.tight_layout()
plt.savefig("posterior_samples.png")
plt.savefig("posterior_abc.png")

if __name__ == "__main__":
main()
67 changes: 46 additions & 21 deletions sandbox/src/gp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

def plot_parity_with_uncertainty(
y_true_train, y_pred_train, y_std_train,
Expand Down Expand Up @@ -88,24 +89,6 @@ def plot_parity_with_uncertainty(
plt.savefig(filename)


def clean_data(full_data, response):
    """Return a cleaned copy of ``full_data`` for modelling ``response``.

    Three filters are applied in order:

    1. Keep only rows whose ``COMPONENTS`` value equals 1.
    2. Drop rows whose ``response`` value is NaN, ``inf`` or ``-inf``.
    3. Drop every numeric column that still contains a NaN or infinite
       value in the remaining rows.

    Non-numeric columns are never inspected for bad values and are always
    retained. The input frame is not modified.

    Args:
        full_data: DataFrame containing a ``COMPONENTS`` column, the
            ``response`` column and the feature columns.
        response: Name of the response column used for the row filter.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    # Remove rows with multiple components.
    cleaned = full_data[full_data["COMPONENTS"] == 1].reset_index(drop=True)

    # Remove rows whose response is missing or infinite.
    bad_response = cleaned[response].isin([np.nan, np.inf, -np.inf])
    cleaned = cleaned.loc[~bad_response].reset_index(drop=True)

    # Remove numeric feature columns with bad values. The check is limited
    # to the numeric columns, and the offending columns are dropped by name
    # so that the presence of non-numeric columns cannot break alignment
    # between a numeric-only mask and the full set of columns.
    numeric = cleaned.select_dtypes(include=[np.number])
    bad_cells = numeric.isna() | np.isinf(numeric)
    bad_columns = numeric.columns[bad_cells.any(axis=0).to_numpy()]
    return cleaned.drop(columns=bad_columns)

OUTPUT_MAPPING = {"ACTIVITY": 0, "GROWTH": 1, "SYMMETRY": 2}

# Load data
Expand All @@ -120,6 +103,7 @@ def clean_data(full_data, response):
"AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
"AVG_BETWEENNESS", "AVG_CORENESS"
]
features = ["RADIUS"]
spatial_features = [
"RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
"NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
Expand Down Expand Up @@ -182,7 +166,7 @@ def clean_data(full_data, response):
y_train = scaler.fit_transform(y_train)
y_test = scaler.transform(y_test)
if train:
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=3e-1)
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, alpha=3e-6)
gp.fit(X_train, y_train)
else:
gp = joblib.load('gp.pkl')
Expand All @@ -191,7 +175,48 @@ def clean_data(full_data, response):

y_pred, y_pred_std = gp.predict(X_test, return_std=True)
y_pred_train, y_pred_std_train = gp.predict(X_train, return_std=True)
# Convert back to original scale
# Plot GP prediction function with uncertainty
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
plot_pca = False
if plot_pca:
pca = PCA(n_components=1) # Focus on PC1 for plotting
pca_X_train = pca.fit_transform(X_train)
pca_X_test = pca.transform(X_test)
# Generate uniform points in the PC1 space
pc1_min, pc1_max = pca_X_train.min(), pca_X_train.max()
pc1_uniform = np.linspace(pc1_min, pc1_max, 50).reshape(-1, 1)
# Map uniform points back to the original feature space
X_uniform = pca.inverse_transform(pc1_uniform)
# Get GP predictions (mean and standard deviation) for the uniform points
y_p, y_std = gp.predict(X_uniform, return_std=True)

# Plot GP prediction with uncertainty
# Scatter plot for training data in PC1
print(pca_X_train.shape, y_train.shape)
ax.scatter(pca_X_train[:, 0], y_train[:, 0], label="Train Data", color="blue", alpha=0.6)
ax.scatter(pca_X_test[:, 0], y_test[:, 0], label="Test Data", color="green", alpha=0.6)

# GP prediction mean
ax.scatter(pc1_uniform, y_p[:, 0], label="GP Prediction", color="red", linewidth=2)
ax.plot(pc1_uniform, y_p[:, 0], label="GP Prediction", color="red", linewidth=2)
# Customize the plot
ax.set_title("GP Prediction with Uncertainty")
ax.set_xlabel("Principal Component 1 (PC1)")
ax.set_ylabel("Prediction")
else:
x = np.linspace(-3, 3, 1000).reshape(-1, 1)
y_p = gp.predict(x)
ax.scatter(X_train, y_train[:, 0], label="Train")
ax.scatter(X_test, y_test[:, 0], label="Test")
ax.plot(x, y_p[:, 0], label="Prediction", color="red")
ax.set_title("GP Prediction")
ax.set_xlabel("RADIUS")
ax.set_ylabel("Prediction")

ax.legend()

plt.tight_layout()
plt.savefig("gp.png")
"""
y_pred = scaler.inverse_transform(y_pred)
y_pred_std = scaler.inverse_transform(y_pred_std)
Expand Down
83 changes: 76 additions & 7 deletions sandbox/src/mcmc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,18 @@
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

column_names = [
"KEY",
"RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
"NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
"AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES",
"AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
"AVG_BETWEENNESS", "AVG_CORENESS"
]

# Define distance function based on the paper
def distance_function(y_obs, y_sim, weight=1.0):
Expand Down Expand Up @@ -61,22 +73,27 @@ def mcmc(data, y_sims, y_obs, n_iterations, proposal_std=1.0):
samples.append(np.append(current_theta, proposal_y_sim))
# Remove duplicates in the samples
#samples = list(set(tuple(row) for row in samples))
return pd.DataFrame(samples, columns=["NODES", "EDGES", "GRADIUS", "ACTIVITY"])
return pd.DataFrame(samples, columns= column_names + ["ACTIVITY"])

def main():
# Load ABM data
data_path = "../../data/ARCADE/C-feature_0.0_metric_15-04032023.csv"
data_path = "../../data/ARCADE/C-feature_15.0_metric_15-04032023.csv"
data = pd.read_csv(data_path)
data = data[data["COMPONENTS"] == 1]
threshold = 0.2
columns_to_drop = [col for col in data.columns if ((data[col] == np.inf) | (data[col] == -np.inf)).mean() >= threshold]
data = data.drop(columns=columns_to_drop)

# Extract inputs (theta) and outputs (y)
input_feature_names = ["NODES", "EDGES", "GRADIUS"]
input_feature_names = column_names #["NODES", "EDGES", "GRADIUS"]
# input_feature_names = ["ACTIVITY"]
predicted_output = ["ACTIVITY"]#, "GROWTH", "SYMMETRY"]
input_features = data[input_feature_names].values

y_sims = data[predicted_output].values

# Observed value
y_obs = [1]#, -10, 0]
y_obs = [0.25]#, -10, 0]

# Run MCMC
n_iterations = 10000
Expand All @@ -89,12 +106,64 @@ def main():
print(f"Number of samples: {len(posterior_samples)}")
print(posterior_samples.describe())
# Plot the accepted samples activity
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
fig, ax = plt.subplots(1, 3, figsize=(15, 5))
_, bins, patch = ax[0].hist(y_sims, bins=20)
ax[0].set_title("Prior - Activity")

ax[0].set_xlim([-1, 1])
ax[0].set_xlabel("Activity")
ax[0].set_ylabel("Number of samples")
ax[1].hist(posterior_samples["ACTIVITY"], bins=bins)
ax[1].set_title("Posterior - Activity")
ax[1].set_title("Posterior - Activity (MCMC)")
ax[1].set_xlim([-1, 1])
ax[1].set_xlabel("Activity")
ax[1].axvline(y_obs[0], color="red", linestyle="--", label="Target activity")
ax[1].legend()

pca = PCA(n_components=2)
scaler = StandardScaler()
features = scaler.fit_transform(input_features[:, 1:])
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(input_features[:, 0])
reduced_features = pca.fit_transform(features)
categories = label_encoder.classes_
markers = ['o', 's', 'D', '^', 'v', '<', '>', 'p', '*', 'h', 'H', '+', 'x', 'd', '|', '_']
unique_labels = np.unique(labels)
cmap = plt.cm.viridis
# drop duplicates
posterior_samples = posterior_samples.drop_duplicates(subset=input_feature_names)
posterior_reduced_features = pca.transform(scaler.transform(posterior_samples[input_feature_names].values[:, 1:]))
posterior_labels = label_encoder.transform(posterior_samples[input_feature_names].values[:, 0])

for i, label in enumerate(unique_labels):
ax[2].scatter(reduced_features[labels == label, 0],
reduced_features[labels == label, 1],
marker=markers[i % len(markers)],
label=f"{categories[label]}",
facecolors='none',
edgecolors=cmap(i / len(unique_labels))
)
ax[2].scatter(posterior_reduced_features[posterior_labels == label, 0],
posterior_reduced_features[posterior_labels == label, 1],
marker=markers[i % len(markers)],
facecolors=cmap(i / len(unique_labels)),
edgecolors='none', alpha=0.8
)

# Create custom legends
handles1 = [plt.Line2D([0], [0], marker=markers[i % len(markers)], color='w', label=categories[label],
markerfacecolor='none', markeredgecolor=cmap(i / len(unique_labels)))
for i, label in enumerate(unique_labels)]
handles2 = [plt.Line2D([0], [0], marker='o', color='w', label='Prior', markerfacecolor='none', markeredgecolor='k'),
plt.Line2D([0], [0], marker='o', color='w', label='Posterior', markerfacecolor='k', markeredgecolor='none', alpha=0.5)]

legend1 = ax[2].legend(handles=handles1, title="Vasculature type", loc='upper right')
ax[2].add_artist(legend1)
ax[2].legend(handles=handles2, title="Distribution", loc='lower right')
ax[2].set_title("PCA - Vasculature distribution")
ax[2].set_xlabel("PC1")
ax[2].set_ylabel("PC2")
plt.tight_layout()

plt.savefig("posterior_mcmc.png")

if __name__ == "__main__":
Expand Down
Loading

0 comments on commit adbc8de

Please sign in to comment.