-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathebg_rb_correlation.py
42 lines (34 loc) · 2.03 KB
/
ebg_rb_correlation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
"""Per-dataset correlation between EBG predictions and target support values.

The script:

1. Loads the EBG test predictions and the target table and merges them on
   ``dataset`` and ``branchId``.
2. Computes, for every dataset, the Pearson correlation between
   ``prediction_median`` and ``support`` (scaled by 100), its p-value and
   the mean support, and writes the table to ``correlation_results.csv``.
3. Averages the correlations via Fisher's z-transformation and prints the
   mean and the standard deviation.
4. Merges the table with the MSA difficulty values, prints the datasets whose
   correlation is at most 0.7, then merges the MSA feature table and prints
   the resulting shape.
"""
import os

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

# Correlations are clipped to [-1 + eps, 1 - eps] before arctanh so that a
# perfect (rounded) correlation of +/-1 maps to a finite Fisher z score.
FISHER_CLIP_EPS = 1e-8

# NOTE(review): the default is a machine-specific absolute path. It is kept
# unchanged for backward compatibility, but can be overridden through the
# MSA_FEATURES_CSV environment variable on other machines.
MSA_FEATURES_CSV = os.environ.get(
    "MSA_FEATURES_CSV",
    "/Users/juliuswiegert/Repositories/placement_difficulty_prediction/data/processed/features/msa_features.csv",
)


def _correlation_summary(group):
    """Return rounded Pearson r, rounded p-value and mean support of a group.

    Groups with fewer than two rows get NaN for the correlation and p-value,
    because ``pearsonr`` raises ``ValueError`` for such input.
    """
    support = group["support"]
    if len(group) < 2:
        r_value, p_value = np.nan, np.nan
    else:
        # Compute the statistic once and unpack both results.
        r_value, p_value = pearsonr(group["prediction_ebg_tool"], support)
    return pd.Series({
        "correlation": round(r_value, 2),
        "p_value": round(p_value, 3),
        "mean_support": support.mean(),
    })


df_ebg_prediction = pd.read_csv(os.path.join(os.pardir, "data/comparison/predictions/ebg_prediction_test.csv"))
# regex=False: strip the literal ".csv"; as a regex the dot would match any
# character (the default on pandas < 2.0).
df_ebg_prediction["dataset"] = df_ebg_prediction["dataset"].str.replace(".csv", "", regex=False)
df_ebg_prediction["prediction_ebg_tool"] = df_ebg_prediction["prediction_median"]

df_ground_truth = pd.read_csv(os.path.join(os.pardir, "data/processed/target/target.csv"))
df_merged = df_ebg_prediction.merge(df_ground_truth, on=["dataset", "branchId"])
df_merged["support"] = df_merged["support"] * 100

# Restrict the groups to the two columns that are actually used, so the
# result does not depend on how pandas passes the grouping column to apply.
correlation_df = (
    df_merged.groupby("dataset")[["prediction_ebg_tool", "support"]]
    .apply(_correlation_summary)
    .reset_index()
)
correlation_df.to_csv("correlation_results.csv", index=False)

# Convert Pearson correlation coefficients to Fisher's z scores. The clipping
# is applied to a temporary copy: the 'correlation' column itself must stay
# untouched, otherwise the "<= 0.7" filter below would miss values of 0.7.
clipped_correlation = correlation_df["correlation"].clip(
    lower=-1 + FISHER_CLIP_EPS, upper=1 - FISHER_CLIP_EPS
)
correlation_df["fishers_z"] = np.arctanh(clipped_correlation)

# Average in z space, then convert the mean back to a Pearson correlation.
mean_fishers_z = correlation_df["fishers_z"].mean()
mean_corr = np.tanh(mean_fishers_z)
print("Mean Pearson correlation:", mean_corr)

std_corr = correlation_df["correlation"].std()
print(f"Mean Pearson correlation: {mean_corr}, Std.: {std_corr}")

df_msa = pd.read_csv(os.path.join(os.pardir, "data/processed/msa_difficulty.csv"))
df_merged = correlation_df.merge(df_msa, how="inner", on=["dataset"])

# Datasets with a correlation of at most 0.7.
filtered_rows = df_merged[df_merged["correlation"] <= 0.7]
print(filtered_rows[["dataset", "difficulty", "mean_support", "correlation"]])

msa_feats = pd.read_csv(MSA_FEATURES_CSV)
df_merged = df_merged.merge(msa_feats, how="inner", on=["dataset"])
print(df_merged.shape)