-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhistogram_plot.py
89 lines (65 loc) · 2.57 KB
/
histogram_plot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn import datasets
# Print every column of a DataFrame instead of eliding the middle ones
# (None removes pandas' column-count limit for display).
pd.set_option('display.max_columns', None)
# Switch matplotlib over to seaborn's default theme for all figures below.
sns.set()
def sklearn_to_df(sklearn_dataset):
    """Convert a scikit-learn style dataset object into a pandas DataFrame.

    The object is expected to expose ``data`` (the feature matrix),
    ``feature_names`` (one label per feature column) and ``target``
    (one value per row).

    Returns a DataFrame with one column per feature plus a final
    ``'target'`` column holding the values of ``sklearn_dataset.target``.
    """
    frame = pd.DataFrame(
        data=sklearn_dataset.data,
        columns=sklearn_dataset.feature_names,
    )
    labels = pd.Series(sklearn_dataset.target)
    frame['target'] = labels
    return frame
def ecdf(data):
    """Compute the empirical cumulative distribution function (ECDF).

    Parameters
    ----------
    data : array-like
        Measurements. Any array-like is accepted (list, NumPy array,
        pandas Series, ...); input that is not one-dimensional is
        flattened so that every value counts as one measurement.

    Returns
    -------
    x : numpy.ndarray
        The measurements sorted in ascending order.
    y : numpy.ndarray
        For each entry of ``x``, the fraction of measurements that are
        less than or equal to it: ``1/n, 2/n, ..., 1``. Empty input
        yields two empty arrays.
    """
    # Normalise to a flat array first: ``len()`` only counts the first axis,
    # so for multi-dimensional input x and y would end up with different
    # shapes, and 0-d input would raise.
    values = np.asarray(data).ravel()

    # Number of data points: n
    n = values.size

    # x-data for the ECDF: the sorted measurements
    x = np.sort(values)

    # y-data for the ECDF: evenly spaced steps ending at exactly 1
    y = np.arange(1, n + 1) / n

    return x, y
# Exploratory plots of iris petal lengths: a histogram, a swarm plot and
# ECDFs, each drawn on its own numbered matplotlib figure.

# Load the data and build a DataFrame with one row per flower.
iris = datasets.load_iris()
print(iris.target_names)
iris_as_df = sklearn_to_df(iris)

# Replace the integer class codes with the species names.
# Assign the whole column instead of writing through ``.loc[:, 'target']``:
# an in-place write of strings into the integer column is deprecated in
# pandas >= 2.1 (incompatible-dtype FutureWarning) and is slated to raise.
iris_as_df['target'] = [iris.target_names[i] for i in iris_as_df['target']]

versicolor_petal_length = iris_as_df.loc[iris_as_df['target'] == 'versicolor', "petal length (cm)"]

# Figure 1: histogram of versicolor petal lengths.
# The "square root rule" is a commonly-used rule of thumb for choosing number of bins
n_bins = int(np.sqrt(len(versicolor_petal_length)))
_ = plt.figure(1)
_ = plt.hist(versicolor_petal_length, bins=n_bins)
_ = plt.xlabel("petal length (cm)")
_ = plt.ylabel("count")

# Figure 2: swarm plot of petal length for every species.
_ = plt.figure(2)
_ = sns.swarmplot(x='target', y='petal length (cm)', data=iris_as_df)
# Label the axes
_ = plt.xlabel('species')
_ = plt.ylabel('petal length (cm)')

# Figure 3: ECDF of the versicolor data.
# Compute ECDF for versicolor data: x_vers, y_vers
x_vers, y_vers = ecdf(versicolor_petal_length)
# Generate plot
_ = plt.figure(3)
_ = plt.plot(x_vers, y_vers, marker='.', linestyle='none')
# Author's reading of the plot:
# 72% of versicolor have a petal length of ~4.5cm or less
# Label the axes
_ = plt.xlabel("petal length (cm)")
_ = plt.ylabel("ECDF")

# Figure 4: ECDFs of all three species on shared axes.
# Compute ECDFs (create the setosa and virginica data); the versicolor ECDF
# computed for figure 3 is reused rather than recomputed.
setosa_petal_length = iris_as_df.loc[iris_as_df['target'] == 'setosa', "petal length (cm)"]
virginica_petal_length = iris_as_df.loc[iris_as_df['target'] == 'virginica', "petal length (cm)"]
x_set, y_set = ecdf(setosa_petal_length)
x_virg, y_virg = ecdf(virginica_petal_length)
# Plot all ECDFs on the same plot. Each series carries its own label so the
# legend cannot drift out of sync with the plotting order.
_ = plt.figure(4)
_ = plt.plot(x_set, y_set, marker='.', linestyle='none', label='setosa')
_ = plt.plot(x_vers, y_vers, marker='.', linestyle='none', label='versicolor')
_ = plt.plot(x_virg, y_virg, marker='.', linestyle='none', label='virginica')
# Annotate the plot
# The ECDFs expose clear differences among the species
# Setosa is much shorter, also with less absolute variability in petal length
# than versicolor and virginica
_ = plt.legend(loc='lower right')
_ = plt.xlabel('petal length (cm)')
_ = plt.ylabel('ECDF')

# Display the plot
plt.show()