Skip to content

Commit

Permalink
Update repo to latest version (#1)
Browse files Browse the repository at this point in the history
  • Loading branch information
RicardoBrancas committed Dec 23, 2022
1 parent ec7b737 commit 12a655e
Show file tree
Hide file tree
Showing 382 changed files with 6,906 additions and 1,147,013 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ You can use `--help` to see other configuration options available in the accurac

# Final notes

- The file `cubes_500.instances` contains the list of instances used for evaluation in the parts where only a subset of instances was used. This file can be passed as the `--instances` argument to the benchmark script.
- The file `groups/cubes_500.instances` contains the list of instances used for evaluation in the parts where only a subset of instances was used. This file can be passed as the `--instances` argument to the benchmark script.

- The folder `analysis` contains all code use to analyse the results obtained and produced all graphs included in the paper.
- The folder `analysis` contains all code used to analyse the results obtained and produce all graphs included in the paper.

- The folders `analysis/data` and `analysis/fuzzy` come pre-packaged with all logs used in the evaluation section of the paper.
1 change: 1 addition & 0 deletions analysis/.gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
data/
fuzzy/
plots/
tables/
.RData
Expand Down
10 changes: 10 additions & 0 deletions analysis/dists.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Exploratory check of which distribution family the instance sizes follow.
library(fitdistrplus)
library(logspline)
library(readr)

# Load the per-instance metrics and pick out the column being modelled.
instances <- read.csv("instances.csv")

# Top-level expression: auto-prints the input_cells column when run.
instances$input_cells

# Skewness/kurtosis summary plot of input_cells against common families.
descdist(instances$input_cells)
170 changes: 170 additions & 0 deletions analysis/dists.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
#!/usr/bin/python3

import pandas as pd

from scipy.stats import kstest
from scipy.stats import ks_2samp
# from scipy.stats import epps_singleton_2samp
import numpy as np
import scipy.stats as st
import warnings

warnings.simplefilter('ignore')


# Create models from data
def best_fit_distribution(data, bins=200, ax=None):
"""Model data by finding best fit distribution to data"""
# Get histogram of original data
print('Building...')

y, x = np.histogram(data, bins=bins, density=True)
x = (x + np.roll(x, -1))[:-1] / 2.0

print('Start checking...')

dist_names = ['alpha', 'anglit', 'arcsine', 'beta', 'betaprime', 'bradford', 'burr', 'cauchy', 'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'erlang', 'expon', 'exponweib', 'exponpow', 'f', 'fatiguelife', 'fisk',
'foldcauchy', 'foldnorm', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', 'gamma', 'gengamma', 'genhalflogistic', 'gilbrat', 'gompertz', 'gumbel_r',
'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant', 'invgamma', 'invgauss', 'invweibull', 'johnsonsb', 'johnsonsu', 'ksone', 'kstwobign', 'laplace', 'logistic', 'loggamma', 'loglaplace',
'lognorm', 'lomax', 'maxwell', 'mielke', 'nakagami', 'ncx2', 'ncf', 'nct', 'norm', 'pareto', 'pearson3', 'powerlaw', 'powerlognorm', 'rdist', 'reciprocal', 'rayleigh', 'rice',
'recipinvgauss', 'semicircular', 't', 'triang', 'truncexpon', 'truncnorm', 'tukeylambda', 'uniform', 'vonmises', 'wald', 'weibull_min', 'weibull_max'] # , 'wrapcauchy']

# Best holders
dist_results = []
best_distribution = st.norm
best_params = (0.0, 1.0)
best_sse = np.inf

# Estimate distribution parameters from data
for dist_name in dist_names:
distribution = getattr(st, dist_name)

# Ignore warnings from data that can't be fit
with warnings.catch_warnings():
warnings.filterwarnings('ignore')

# fit dist to data
params = distribution.fit(data)
# print(str(params))

# Separate parts of parameters
arg = params[:-2]
loc = params[-2]
scale = params[-1]

# print("Arg: " + str(arg))
# print("Loc: " + str(loc))
# print("Scale: " + str(scale))

# Calculate fitted PDF and error with fit in distribution
pdf = distribution.pdf(x, loc=loc, scale=scale, *arg)
sse = np.sum(np.power(y - pdf, 2.0))

size_data = len(data)
if len(arg) == 0:
dist_data = distribution.rvs(loc=loc, size=size_data, scale=scale)
elif len(arg) == 1:
dist_data = distribution.rvs(arg[0], loc=loc, size=size_data, scale=scale)
elif len(arg) == 2:
dist_data = distribution.rvs(arg[0], arg[1], loc=loc, size=size_data, scale=scale)
elif len(arg) == 3:
dist_data = distribution.rvs(arg[0], arg[1], arg[2], loc=loc, size=size_data, scale=scale)
elif len(arg) == 4:
dist_data = distribution.rvs(arg[0], arg[1], arg[2], arg[3], loc=loc, size=size_data, scale=scale)
else:
dist_data = distribution.rvs(loc=loc, size=size_data, scale=scale)
print(distribution.name + ": Data generated")
# print(str(dist_data))
dist_data2 = []
for i in dist_data:
dist_data2.append(int(round(i)))
test_stat = ks_2samp(data, dist_data)
print(str(test_stat))
test_stat = ks_2samp(data, dist_data2)
print(str(test_stat))

print(distribution.name + '\t sse: ' + str(sse), flush=True)
print("\n")
if sse > 0:
dist_results.append((distribution.name, sse, test_stat.pvalue))

# identify if this distribution is better
if best_sse > sse > 0:
best_distribution = distribution
best_params = params
best_sse = sse

# sort results by sse
sorted_dist_results = sorted(dist_results, key=lambda x: x[1])

# print top 5 by sse
print("Top 5 (sse):")
print(sorted_dist_results[:5])
print(sorted_dist_results)
print(" ")

# sort results by pvalue
sorted_dist_results = sorted(dist_results, key=lambda x: x[2], reverse=True)

# print top 5 by pvalue
print("Top 5 (pvalue):")
print(sorted_dist_results[:5])
print(sorted_dist_results)
print(" ")

return best_distribution.name, best_params


def get_best_distribution(data):
dist_names = ['alpha', 'anglit', 'arcsine', 'beta', 'betaprime', 'bradford', 'burr', 'cauchy', 'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'erlang', 'expon', 'exponweib', 'exponpow', 'f', 'fatiguelife', 'fisk',
'foldcauchy', 'foldnorm', 'frechet_r', 'frechet_l', 'genlogistic', 'genpareto', 'genexpon', 'genextreme', 'gausshyper', 'gamma', 'gengamma', 'genhalflogistic', 'gilbrat', 'gompertz', 'gumbel_r',
'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant', 'invgamma', 'invgauss', 'invweibull', 'johnsonsb', 'johnsonsu', 'ksone', 'kstwobign', 'laplace', 'logistic', 'loggamma', 'loglaplace',
'lognorm', 'lomax', 'maxwell', 'mielke', 'nakagami', 'ncx2', 'ncf', 'nct', 'norm', 'pareto', 'pearson3', 'powerlaw', 'powerlognorm', 'rdist', 'reciprocal', 'rayleigh', 'rice',
'recipinvgauss', 'semicircular', 't', 'triang', 'truncexpon', 'truncnorm', 'tukeylambda', 'uniform', 'vonmises', 'wald', 'weibull_min', 'weibull_max', 'wrapcauchy']
dist_results = []
params = {}
for dist_name in dist_names:
dist = getattr(st, dist_name)
param = dist.fit(data)

params[dist_name] = param
# Applying the Kolmogorov-Smirnov test
D, p = st.kstest(data, dist_name, args=param)
print("p value for " + dist_name + " = " + str(p))
dist_results.append((dist_name, p))

# sort results
sorted_dist_results = sorted(dist_results, key=lambda x: -x[1])

# print top 5
print("Top 5:")
print(sorted_dist_results[:5])
print(sorted_dist_results)
print(" ")

best_dist, best_p = (max(dist_results, key=lambda item: item[1]))
# store the name of the best fit and its p value

print("Best fitting distribution: " + str(best_dist))
print("Best p value: " + str(best_p))
print("Parameters for the best fit: " + str(params[best_dist]))

return best_dist, best_p, params[best_dist]


# ---------------------------------------------------------------------------
# Script entry point: load the instance sizes and fit a distribution using
# one histogram bucket per integer value in the observed range.
# ---------------------------------------------------------------------------
df = pd.read_csv('instances.csv')

lst = df.input_cells

# Builtin min()/max() replace the original manual scan over the series.
m = min(lst)
M = max(lst)
print('Buckets: ' + str(M - m + 1))
best = best_fit_distribution(lst, M - m + 1)

print(best)
Loading

0 comments on commit 12a655e

Please sign in to comment.