Skip to content
This repository was archived by the owner on Jan 2, 2021. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 18 additions & 18 deletions bootstrapped/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.
'''Functions that allow one to create bootstrapped confidence intervals'''
"""Functions that allow one to create bootstrapped confidence intervals"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
Expand Down Expand Up @@ -53,11 +53,11 @@ def __rmul__(self, other):
return self._apply(float(other), lambda x, other: x * other)

def error_width(self):
'''Returns: upper_bound - lower_bound'''
"""Returns: upper_bound - lower_bound"""
return self.upper_bound - self.lower_bound

def error_fraction(self):
'''Returns the error_width / value'''
"""Returns the error_width / value"""
if self.value == 0:
return _np.inf
else:
Expand All @@ -67,16 +67,16 @@ def is_significant(self):
return _np.sign(self.upper_bound) == _np.sign(self.lower_bound)

def get_result(self):
'''Returns:
"""Returns:
-1 if statistically significantly negative
+1 if statistically significantly positive
0 otherwise
'''
"""
return int(self.is_significant()) * _np.sign(self.value)


def _get_confidence_interval(bootstrap_dist, stat_val, alpha, is_pivotal):
'''Get the bootstrap confidence interval for a given distribution.
"""Get the bootstrap confidence interval for a given distribution.
Args:
bootstrap_distribution: numpy array of bootstrap results from
bootstrap_distribution() or bootstrap_ab_distribution()
Expand All @@ -85,7 +85,7 @@ def _get_confidence_interval(bootstrap_dist, stat_val, alpha, is_pivotal):
alpha: The alpha value for the confidence intervals.
is_pivotal: if true, use the pivotal method. if false, use the
percentile method.
'''
"""
if is_pivotal:
low = 2 * stat_val - _np.percentile(bootstrap_dist, 100 * (1 - alpha / 2.))
val = stat_val
Expand Down Expand Up @@ -192,9 +192,9 @@ def _generate_distributions(values_lists, num_iterations):

def _bootstrap_sim(values_lists, stat_func_lists, num_iterations,
iteration_batch_size, seed):
'''Returns simulated bootstrap distribution.
See bootstrap() funciton for arg descriptions.
'''
"""Returns simulated bootstrap distribution.
See bootstrap() function for arg descriptions.
"""

if seed is not None:
_np.random.seed(seed)
Expand All @@ -218,7 +218,7 @@ def _bootstrap_sim(values_lists, stat_func_lists, num_iterations,
def _bootstrap_distribution(values_lists, stat_func_lists,
num_iterations, iteration_batch_size, num_threads):

'''Returns the simulated bootstrap distribution. The idea is to sample the same
"""Returns the simulated bootstrap distribution. The idea is to sample the same
indexes in a bootstrap re-sample across all arrays passed into values_lists.

This is especially useful when you want to co-sample records in a ratio metric.
Expand All @@ -244,8 +244,8 @@ def _bootstrap_distribution(values_lists, stat_func_lists,
multiprocessing.cpu_count() is used instead.
Returns:
The set of bootstrap resamples where each stat_function is applied on
the bootsrapped values.
'''
the bootstrapped values.
"""

_validate_arrays(values_lists)

Expand Down Expand Up @@ -285,7 +285,7 @@ def _bootstrap_distribution(values_lists, stat_func_lists,
def bootstrap(values, stat_func, denominator_values=None, alpha=0.05,
num_iterations=10000, iteration_batch_size=None, is_pivotal=True,
num_threads=1, return_distribution=False):
'''Returns bootstrap estimate.
"""Returns bootstrap estimate.
Args:
values: numpy array (or scipy.sparse.csr_matrix) of values to bootstrap
stat_func: statistic to bootstrap. We provide several default functions:
Expand Down Expand Up @@ -324,7 +324,7 @@ def bootstrap(values, stat_func, denominator_values=None, alpha=0.05,
multiprocessing.cpu_count() is used instead.
Returns:
BootstrapResults representing CI and estimated value.
'''
"""
if denominator_values is None:
values_lists = [values]
stat_func_lists = [stat_func]
Expand Down Expand Up @@ -361,7 +361,7 @@ def bootstrap_ab(test, ctrl, stat_func, compare_func, test_denominator=None,
ctrl_denominator=None, alpha=0.05, num_iterations=10000,
iteration_batch_size=None, scale_test_by=1.0,
is_pivotal=True, num_threads=1, return_distribution=False):
'''Returns bootstrap confidence intervals for an A/B test.
"""Returns bootstrap confidence intervals for an A/B test.
Args:
test: numpy array (or scipy.sparse.csr_matrix) of test results
ctrl: numpy array (or scipy.sparse.csr_matrix) of ctrl results
Expand Down Expand Up @@ -400,12 +400,12 @@ def bootstrap_ab(test, ctrl, stat_func, compare_func, test_denominator=None,
50/50 split. Defaults to 1.0.
is_pivotal: if true, use the pivotal method for bootstrapping confidence
intervals. If false, use the percentile method.
num_threads: The number of therads to use. This speeds up calculation of
num_threads: The number of threads to use. This speeds up calculation of
the bootstrap. Defaults to 1. If -1 is specified then
multiprocessing.cpu_count() is used instead.
Returns:
BootstrapResults representing CI and estimated value.
'''
"""

both_denominators = test_denominator is not None and \
ctrl_denominator is not None
Expand Down
8 changes: 4 additions & 4 deletions bootstrapped/compare_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.
'''Various comparison functions for use in bootstrap a/b tests'''
"""Various comparison functions for use in bootstrap a/b tests"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
Expand All @@ -19,7 +19,7 @@ def difference(test_stat, ctrl_stat):
Returns:
test_stat - ctrl_stat
"""
return (test_stat - ctrl_stat)
return test_stat - ctrl_stat


def percent_change(test_stat, ctrl_stat):
Expand All @@ -30,7 +30,7 @@ def percent_change(test_stat, ctrl_stat):
Returns:
(test_stat - ctrl_stat) / ctrl_stat * 100
"""
return (test_stat - ctrl_stat) * 100.0 / ctrl_stat
return (test_stat - ctrl_stat) * 100. / ctrl_stat


def ratio(test_stat, ctrl_stat):
Expand All @@ -53,4 +53,4 @@ def percent_difference(test_stat, ctrl_stat):
Returns:
(test_stat - ctrl_stat) / ((test_stat + ctrl_stat) / 2.0) * 100.0
"""
return (test_stat - ctrl_stat) / ((test_stat + ctrl_stat) / 2.0) * 100.0
return (test_stat - ctrl_stat) / ((test_stat + ctrl_stat) / 2.) * 100.
61 changes: 24 additions & 37 deletions bootstrapped/permutation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,47 +4,40 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.
'''Functions that allow one to run a permutation shuffle test'''
"""Functions that allow one to run a permutation shuffle test"""
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals

import numpy as _np
import multiprocessing as _multiprocessing
from warnings import warn
import numpy as _np

MAX_ITER = 10000
MAX_ARRAY_SIZE = 10000


# Randomized permutation shuffle test
def _get_permutation_result(permutation_dist, stat_val):
'''Get the permutation test result for a given distribution.
"""Get the permutation test result for a given distribution.
Args:
permutation_distribution: numpy array of permutation shuffle results
permutation_dist: numpy array of permutation shuffle results
from permutation_distribution()
stat_val: The overall statistic that this method is attempting to
calculate error bars for.
'''

denom = len(permutation_dist)

pct = (
len(permutation_dist[_np.where(permutation_dist >= abs(stat_val))]) +
len(permutation_dist[_np.where(permutation_dist <= -abs(stat_val))])
) / denom

"""
pct = _np.mean(_np.logical_or(permutation_dist <= -abs(stat_val), permutation_dist >= abs(stat_val)))
return pct


def _validate_arrays(values_lists):
t = values_lists[0]
t_type = type(t)
if not isinstance(t, _np.ndarray):
raise ValueError(('The arrays must be of type numpy.array'))
raise ValueError('The arrays must be of type numpy.array')

for _, values in enumerate(values_lists[1:]):
for values in values_lists[1:]:
if not isinstance(values, t_type):
raise ValueError('The arrays must all be of the same type')

Expand All @@ -54,11 +47,9 @@ def _validate_arrays(values_lists):

def _generate_distributions(values_lists, num_iterations=0):
values_shape = values_lists[0].shape[0]
ids = []

for _ in range(num_iterations):
ids.append(_np.random.choice(values_shape, values_shape, replace=False))

ids = [_np.random.choice(values_shape, values_shape, replace=False)
for _ in range(num_iterations)]
ids = _np.array(ids)

results = [values[ids] for values in values_lists]
Expand All @@ -67,19 +58,17 @@ def _generate_distributions(values_lists, num_iterations=0):

def _permutation_sim(test_lists, ctrl_lists, stat_func_lists, num_iterations,
iteration_batch_size, seed):
'''Returns simulated permutation distribution.
"""Returns simulated permutation distribution.
See permutation() function for arg descriptions.
'''
"""

if seed is not None:
_np.random.seed(seed)

num_iterations = int(num_iterations)
iteration_batch_size = int(iteration_batch_size)

values_lists = []
for i in range(len(test_lists)):
values_lists.append(_np.append(test_lists[i], ctrl_lists[i]))
values_lists = [_np.append(test_lists[i], ctrl_lists[i]) for i in range(len(test_lists))]

test_results = [[] for _ in test_lists]
ctrl_results = [[] for _ in ctrl_lists]
Expand All @@ -92,7 +81,7 @@ def _permutation_sim(test_lists, ctrl_lists, stat_func_lists, num_iterations,
values_sims = _generate_distributions(values_lists, max_rng)
for i, result in enumerate(values_sims):
for j in result:
test_sims[i].append(j[0:len(test_lists[0])])
test_sims[i].append(j[:len(test_lists[0])])
ctrl_sims[i].append(j[len(test_lists[0]):])

for i, test_sim_stat_func in enumerate(zip(test_sims, stat_func_lists)):
Expand All @@ -110,7 +99,7 @@ def _permutation_distribution(test_lists, ctrl_lists, stat_func_lists,
num_iterations, iteration_batch_size,
num_threads):

'''Returns the simulated permutation distribution. The idea is to sample the
"""Returns the simulated permutation distribution. The idea is to sample the
same indexes in a permutation shuffle across all arrays passed into
values_lists.

Expand All @@ -135,11 +124,10 @@ def _permutation_distribution(test_lists, ctrl_lists, stat_func_lists,
num_threads: The number of threads to use. This speeds up calculation of
the shuffle. Defaults to 1. If -1 is specified then
multiprocessing.cpu_count() is used instead.
exact: True to run an exact permutation shuffle test.
Returns:
The set of permutation shuffle samples where each stat_function is
applied on the shuffled values.
'''
"""
_validate_arrays(test_lists)
_validate_arrays(ctrl_lists)

Expand All @@ -161,11 +149,11 @@ def _permutation_distribution(test_lists, ctrl_lists, stat_func_lists,
else:
pool = _multiprocessing.Pool(num_threads)

iter_per_job = _np.ceil(num_iterations * 1.0 / num_threads)
iter_per_job = _np.ceil(num_iterations * 1. / num_threads)

test_results = []
ctrl_results = []
for seed in _np.random.randint(0, 2**32 - 1, num_threads):
for seed in _np.random.randint(0, 2 ** 32 - 1, num_threads):
job_args = (test_lists, ctrl_lists, stat_func_lists, iter_per_job,
iteration_batch_size, seed)
t, c = pool.apply_async(_permutation_sim, job_args)
Expand All @@ -184,7 +172,7 @@ def permutation_test(test, ctrl, stat_func, compare_func, test_denominator=None,
ctrl_denominator=None, num_iterations=10000,
iteration_batch_size=None, num_threads=1,
return_distribution=False):
'''Returns bootstrap confidence intervals for an A/B test.
"""Returns bootstrap confidence intervals for an A/B test.
Args:
test: numpy array (or scipy.sparse.csr_matrix) of test results
ctrl: numpy array (or scipy.sparse.csr_matrix) of ctrl results
Expand Down Expand Up @@ -216,14 +204,14 @@ def permutation_test(test, ctrl, stat_func, compare_func, test_denominator=None,
iteration_batch_size: The bootstrap sample can generate very large
arrays. This function iteration_batch_size limits the memory
footprint by batching bootstrap rounds.
num_threads: The number of therads to use. This speeds up calculation of
num_threads: The number of threads to use. This speeds up calculation of
the bootstrap. Defaults to 1. If -1 is specified then
multiprocessing.cpu_count() is used instead.
Returns:
percentage representing the percentage of permutation distribution
values that are more extreme than the original distribution.
'''
is_large_array = (len(test) >= MAX_ARRAY_SIZE or len(ctrl) >= MAX_ARRAY_SIZE)
"""
is_large_array = len(test) >= MAX_ARRAY_SIZE or len(ctrl) >= MAX_ARRAY_SIZE
if is_large_array and num_iterations > MAX_ITER:
warning_text = ("Maximum array length of {} exceeded, "
"limiting num_iterations to {}")
Expand Down Expand Up @@ -273,6 +261,5 @@ def do_division(num, denom):

if return_distribution:
return test_ctrl_dist
else:
test_ctrl_val = compare_func(test_val, ctrl_val)
return _get_permutation_result(test_ctrl_dist, test_ctrl_val)
test_ctrl_val = compare_func(test_val, ctrl_val)
return _get_permutation_result(test_ctrl_dist, test_ctrl_val)
Loading