stats_intro.py

#

Simulation and basis statistics in Python

#

Original code here.

import random
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
#

Import secret.py which contains the success_rate for instiating FlipSim in the examples below

import secret
import scipy.stats as stats
#

Simulation of Bernoulli trials

#

Class for simulating Bernoulli trials

class FlipSim(object):
#
    def __init__(self, success_rate):
        self.success_rate = success_rate
        self.trials = []
#
    def get_success_rate(self):
        return self.success_rate
#

Returns the result of a single 'flip'

    def flip(self):
#
        if random.random() <= self.success_rate:
            return True
        else:
            return False
#

Returns a list of a single trial of num_flips flips

    def flips(self, num_flips):
#
        return [self.flip() for i in range(num_flips)]
#

Returns a list of num_trials trials of flips_per_trials each

    def flip_trials(self, flips_per_trial, num_trials):
#
        self.trials = [self.flips(flips_per_trial) for i in range(num_trials)]
        return self.trials
#

Is it fair or loaded simulated coin?

sim = FlipSim(secret.value)
#

10,000 trials of 1,000 flips each. Experiment with different combinations of flips_per_trial and num_trials.

trials = sim.flip_trials(1000, 10000)
#

Descriptive stats

means = [np.mean(trial) for trial in trials]
mean = np.mean(means)
sigma = np.std(means)
#

Results should be similar to:
The mean is 0.5300304
The standard deviation is 0.0157946977128

print('The mean is', mean)
print('The standard deviation is', sigma)
#

68% of the values lie between 0.514235702287 and 0.545825097713
95% of the values lie between 0.498441004574 and 0.561619795426
99.7% of the values lie between 0.482646306861 and 0.577414493139 Thus, on the basis of this particular simulation we can say with 95% confidence that secret.value lies between 0.498 and 0.562.

print('68% of the values lie between', mean - sigma, 'and', mean + sigma)
print('95% of the values lie between', mean - 2*sigma, 'and', mean + 2*sigma)
print('99.7% of the values lie between', mean - 3*sigma, 'and', mean + 3*sigma)
#

Plot the simulation

#

numpy.linspace returns evenly spaced numbers over a specified interval. In this case the range of x is defined by the mean plus/minus three standard deviations.

x = np.linspace(mean - 3 * sigma, mean + 3 * sigma, 100)
#

Create a histogram of the means

plt.hist(means, bins = 200)
#

Draw the probability distribution function (pdf) of the normal distribuation with mean secret.value and standard deviation of sigma within the range of x.

plt.plot(x, 10 * mlab.normpdf(x, secret.value, sigma))
#

Show the plot

plt.show()
#

Using scipy.stats

#

Create three different Bernoulli trials of 1,000 flips each with different success rates

x1 = FlipSim(0.5).flips(1000)
x2 = FlipSim(0.52).flips(1000)
x3 = FlipSim(0.57).flips(1000)
#

Use Independent Two-Sampled T-test to test the Null Hypothesis that the means of two independent samples are the same. Results should be similar though not identical to: Ttest_indResult(statistic=0.53653387639428951, pvalue=0.59164936190673534) Null hypothesis is not rejected

print(stats.ttest_ind(x1, x2))
#

Ttest_indResult(statistic=-1.2543243107046314, pvalue=0.20987087051240788) Null hypothesis is rejected

print(stats.ttest_ind(x1, x3))
#

Use ANOVA to test the Null Hypothesis that the means of more than two independent samples are all the same. Results should be similar to:
F_onewayResult(statistic=9.1459470595680799, pvalue=0.0001096574199724701) Null Hypothesis is rejected

print(stats.f_oneway(x1, x2, x3))
#

Standard error of the means

#

Example from Temperley, Music and Probability, ch. 2

sample = [1] * 599 + [0] * 401
#

The standard deviation doesn't tell us what we want to know, because we are dealing with a single sample rather than the means of many samples.

mu = np.mean(sample)
sigma = np.std(sample)
print('mu = {}, std = {}'.format(mu, sigma))
plt.hist(sample)
plt.show()
#

To estimate the mean from a single sample, use the Standard Error of the Means (SEM)

SEM = sigma / (len(sample) ** 0.5)
#

standard error of the means is 0.0154983547514

print('standard error of the means is', SEM)
x = np.linspace(mu - 3 * SEM, mu + 3 * SEM, 100)
plt.plot(x, mlab.normpdf(x, mu, SEM))
plt.show()
#

SEM is implemented directly in scipy.stats

SEM = stats.sem(sample)
#

95% confidence that population mean is between 0.5679877805090032 and 0.6300122194909967

print('95% confidence that population mean is between {} and {}'.
      format(mu - 2 * SEM, mu + 2 * SEM))
#

Use the One-sample T-test to compare the mean of a sample with an expected mean

test_mean = 0.5 #
statistic, p_value = stats.ttest_1samp(sample, test_mean)
#

For one-sample T-test with given sample and a mean of 0.5 p-value is 2.6279100509e-10

print('For one-sample T-test with given sample and a mean of',
      test_mean, 'p-value is', p_value)
#

Geometric distributions

#

The following code is based on the ending conditions for Ash's algorithmic piece. Once the piece is at least 32 notes long, the next C4 produced initiates the ending of the piece. How many notes beyond 32 is it expected before C4 occurs?

#
def trial():
#

keep count of the notes produced before C4

    count = 0
    while True:
#

G3 = 0, A3 = 0, ..., C5 = 10

        value = random.randint(0, 10)
#

Stop trial if value is C4

        if value == 3:
            break
        count += 1
    return count
#

Generate counts for 10,000 trials

N = 10000
trials = [trial() for _ in range(N)]
counts = Counter(trials)
#

Create x and y values

xs = range(1, max(counts) + 1)
vals = [counts[x] for x in xs]
ys = np.array(vals) / N
#

Plot the probability mass function of the resulting geometric distribution

plt.title('Probability mass function\ngeometric distribution')
plt.ylabel('Probability')
plt.xlabel('Random selections until success')
plt.plot(xs, ys)
plt.show()
#

Plot the cumulative distribution function

ys = np.cumsum(vals) / N
plt.title('Cumulative distribution function\ngeometric distribution')
plt.ylabel('Cumulative probability')
plt.show()