-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhacker_stats.py
158 lines (116 loc) · 5.52 KB
/
hacker_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
def perform_bernoulli_trials(n, p):
    """Perform n Bernoulli trials and return the number of successes.

    Each trial draws a uniform random number in [0, 1) from NumPy's global
    random state and counts as a success when the draw is less than ``p``.

    Parameters
    ----------
    n : int
        Number of trials. Zero or a negative value performs no trials.
    p : float
        Success probability of a single trial.

    Returns
    -------
    int
        Number of trials that were successes.
    """
    # No trials requested: nothing to draw (a negative size would be rejected by NumPy)
    if n <= 0:
        return 0
    # Draw all n uniform numbers in one vectorized call instead of a Python-level loop
    draws = np.random.random(n)
    # A trial succeeds when its draw is less than p; report the count as a plain int
    return int(np.count_nonzero(draws < p))
def ecdf(data):
    """Return the empirical CDF of a one-dimensional set of measurements.

    The result is a pair ``(x, y)``: ``x`` holds the measurements in ascending
    order and ``y`` holds the matching cumulative fractions 1/n, 2/n, ..., 1,
    where n is the number of measurements.
    """
    # Sorted measurements give the x-coordinates of the ECDF
    ordered = np.sort(data)
    # Number of measurements
    count = len(data)
    # The k-th smallest measurement sits at height k / count
    heights = np.arange(1, count + 1) / count
    return ordered, heights
# Seed the random number generator so the run is reproducible
np.random.seed(42)

# Draw 100,000 uniform random numbers on [0, 1) in a single vectorized call
# rather than filling a pre-allocated array one element at a time
random_numbers = np.random.random(size=100000)

# Plot a histogram of the draws
_ = plt.hist(random_numbers)

# Show the plot
plt.show()
# How many defaults might we expect?
# Suppose a bank made 100 mortgage loans; anywhere between 0 and 100 of them may be defaulted upon.
# We want the probability of each possible number of defaults when a single loan defaults with
# probability p = 0.05. To estimate it we simulate: one experiment is 100 Bernoulli trials run with
# perform_bernoulli_trials(), where a "success" means the loan recipient defaulted, and we record the
# number of defaults. Repeating the experiment 1000 times and plotting a histogram of the recorded
# counts describes the probability of each number of defaults.

# Seed random number generator
np.random.seed(42)

# Number of defaults in each of the 1000 simulated 100-loan portfolios (stored as floats): n_defaults
n_defaults = np.array(
    [perform_bernoulli_trials(100, 0.05) for _ in range(1000)],
    dtype=float,
)

# Normalized histogram with the default number of bins, with labeled axes
_ = plt.hist(n_defaults, density=True)
_ = plt.xlabel('number of defaults out of 100 loans')
_ = plt.ylabel('probability')

# Show the plot
plt.show()

# Print summary statistics of the simulated default counts
print(stats.describe(n_defaults))
# Empirical CDF of the simulated default counts: x, y
x, y = ecdf(n_defaults)

# Draw the ECDF as unconnected points and label both axes
_ = plt.plot(x, y, linestyle='none', marker='.')
_ = plt.xlabel("defaults")
_ = plt.ylabel("ECDF")

# Show the plot
plt.show()
# If interest rates are such that the bank loses money when 10 or more of its loans are defaulted
# upon, what is the probability that the bank will lose money?

# Count the 100-loan simulations that ended with 10 or more defaults: n_lose_money
n_lose_money = (n_defaults >= 10).sum()

# The fraction of such simulations estimates the probability of losing money
print('Probability of losing money =', n_lose_money / len(n_defaults))
# The number r of successes in n Bernoulli trials with probability p of success is Binomially
# distributed, so equivalent samples could be drawn directly with
# np.random.binomial(100, 0.05, size=10000)

# Plotting the Binomial PMF using a histogram: the trick is setting up the bin edges passed to
# plt.hist() via the bins keyword argument. We want the bins centered on the integers, so the edges
# run in unit steps from (smallest count - 0.5) to (largest count + 0.5). np.arange() over
# [smallest count, largest count + 2) followed by subtracting 0.5 produces exactly those edges.

# Compute bin edges: bins
bins = np.arange(n_defaults.min(), n_defaults.max() + 2) - 0.5
print(bins)

# Generate the normalized histogram on the integer-centered bins
_ = plt.hist(n_defaults, bins=bins, density=True)

# Label axes
_ = plt.xlabel("number of successes (defaults)")
_ = plt.ylabel("probability")

# Show the plot
plt.show()
# The Poisson distribution is a limit of the Binomial distribution for rare events.
# This is the same kind of story as a website that gets on average 6 hits per hour; here the
# Poisson mean is 10, and every (n, p) pair below is chosen so that n * p is also 10.

# Draw 10,000 samples out of Poisson distribution: samples_poisson
samples_poisson = np.random.poisson(10, size=10000)

# Print the mean and standard deviation
print('Poisson: ', np.mean(samples_poisson), np.std(samples_poisson))

# Specify values of n and p to consider for Binomial: n, p
n = [20, 100, 1000]
p = [0.5, 0.1, 0.01]

# Draw 10,000 samples for each n,p pair: samples_binomial
# Iterating over the paired values avoids indexing both lists with a hard-coded length.
for n_trials, p_success in zip(n, p):
    samples_binomial = np.random.binomial(n_trials, p_success, size=10000)

    # Print results
    # The standard deviation of the Binomial distribution gets closer and closer to that of the
    # Poisson distribution as the probability p gets lower and lower.
    print('n =', n_trials, 'Binom:', np.mean(samples_binomial), np.std(samples_binomial))
# 1990 and 2015 featured the most no-hitters of any season of baseball (there were seven). Given
# that there are on average 251/115 no-hitters per season, what is the probability of having seven
# or more in a season?
# Mean/average: there were 251 no-hitter games (games in which a batting team recorded no hits in
# 9 innings) in 115 seasons.

# Draw 10,000 samples out of Poisson distribution: n_nohitters
n_nohitters = np.random.poisson(251/115, size=10000)

# Compute number of samples that are seven or greater: n_large
n_large = np.sum(n_nohitters >= 7)

# Compute probability of getting seven or more: p_large
# Divide by the actual number of samples so the estimate stays right if the draw size changes.
p_large = n_large / len(n_nohitters)

# Print the result
print('Probability of seven or more no-hitters:', p_large)