diff --git a/bootstrapped/bootstrap.py b/bootstrapped/bootstrap.py index 807aa4b..3595c04 100644 --- a/bootstrapped/bootstrap.py +++ b/bootstrapped/bootstrap.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. An additional grant # of patent rights can be found in the PATENTS file in the same directory. -'''Functions that allow one to create bootstrapped confidence intervals''' +"""Functions that allow one to create bootstrapped confidence intervals""" from __future__ import print_function from __future__ import absolute_import from __future__ import division @@ -53,11 +53,11 @@ def __rmul__(self, other): return self._apply(float(other), lambda x, other: x * other) def error_width(self): - '''Returns: upper_bound - lower_bound''' + """Returns: upper_bound - lower_bound""" return self.upper_bound - self.lower_bound def error_fraction(self): - '''Returns the error_width / value''' + """Returns the error_width / value""" if self.value == 0: return _np.inf else: @@ -67,16 +67,16 @@ def is_significant(self): return _np.sign(self.upper_bound) == _np.sign(self.lower_bound) def get_result(self): - '''Returns: + """Returns: -1 if statistically significantly negative +1 if statistically significantly positive 0 otherwise - ''' + """ return int(self.is_significant()) * _np.sign(self.value) def _get_confidence_interval(bootstrap_dist, stat_val, alpha, is_pivotal): - '''Get the bootstrap confidence interval for a given distribution. + """Get the bootstrap confidence interval for a given distribution. Args: bootstrap_distribution: numpy array of bootstrap results from bootstrap_distribution() or bootstrap_ab_distribution() @@ -85,7 +85,7 @@ def _get_confidence_interval(bootstrap_dist, stat_val, alpha, is_pivotal): alpha: The alpha value for the confidence intervals. is_pivotal: if true, use the pivotal method. if false, use the percentile method. - ''' + """ if is_pivotal: low = 2 * stat_val - _np.percentile(bootstrap_dist, 100 * (1 - alpha / 2.)) val = stat_val @@ -192,9 +192,9 @@ def _generate_distributions(values_lists, num_iterations): def _bootstrap_sim(values_lists, stat_func_lists, num_iterations, iteration_batch_size, seed): - '''Returns simulated bootstrap distribution. - See bootstrap() funciton for arg descriptions. - ''' + """Returns simulated bootstrap distribution. + See bootstrap() function for arg descriptions. + """ if seed is not None: _np.random.seed(seed) @@ -218,7 +218,7 @@ def _bootstrap_sim(values_lists, stat_func_lists, num_iterations, def _bootstrap_distribution(values_lists, stat_func_lists, num_iterations, iteration_batch_size, num_threads): - '''Returns the simulated bootstrap distribution. The idea is to sample the same + """Returns the simulated bootstrap distribution. The idea is to sample the same indexes in a bootstrap re-sample across all arrays passed into values_lists. This is especially useful when you want to co-sample records in a ratio metric. @@ -244,8 +244,8 @@ def _bootstrap_distribution(values_lists, stat_func_lists, multiprocessing.cpu_count() is used instead. Returns: The set of bootstrap resamples where each stat_function is applied on - the bootsrapped values. - ''' + the bootstrapped values. + """ _validate_arrays(values_lists) @@ -285,7 +285,7 @@ def _bootstrap_distribution(values_lists, stat_func_lists, def bootstrap(values, stat_func, denominator_values=None, alpha=0.05, num_iterations=10000, iteration_batch_size=None, is_pivotal=True, num_threads=1, return_distribution=False): - '''Returns bootstrap estimate. + """Returns bootstrap estimate. Args: values: numpy array (or scipy.sparse.csr_matrix) of values to bootstrap stat_func: statistic to bootstrap. We provide several default functions: @@ -324,7 +324,7 @@ def bootstrap(values, stat_func, denominator_values=None, alpha=0.05, multiprocessing.cpu_count() is used instead. Returns: BootstrapResults representing CI and estimated value. - ''' + """ if denominator_values is None: values_lists = [values] stat_func_lists = [stat_func] @@ -361,7 +361,7 @@ def bootstrap_ab(test, ctrl, stat_func, compare_func, test_denominator=None, ctrl_denominator=None, alpha=0.05, num_iterations=10000, iteration_batch_size=None, scale_test_by=1.0, is_pivotal=True, num_threads=1, return_distribution=False): - '''Returns bootstrap confidence intervals for an A/B test. + """Returns bootstrap confidence intervals for an A/B test. Args: test: numpy array (or scipy.sparse.csr_matrix) of test results ctrl: numpy array (or scipy.sparse.csr_matrix) of ctrl results @@ -400,12 +400,12 @@ def bootstrap_ab(test, ctrl, stat_func, compare_func, test_denominator=None, 50/50 split. Defaults to 1.0. is_pivotal: if true, use the pivotal method for bootstrapping confidence intervals. If false, use the percentile method. - num_threads: The number of therads to use. This speeds up calculation of + num_threads: The number of threads to use. This speeds up calculation of the bootstrap. Defaults to 1. If -1 is specified then multiprocessing.cpu_count() is used instead. Returns: BootstrapResults representing CI and estimated value. - ''' + """ both_denominators = test_denominator is not None and \ ctrl_denominator is not None diff --git a/bootstrapped/compare_functions.py b/bootstrapped/compare_functions.py index 6593a12..3c2438e 100644 --- a/bootstrapped/compare_functions.py +++ b/bootstrapped/compare_functions.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. An additional grant # of patent rights can be found in the PATENTS file in the same directory. -'''Various comparison functions for use in bootstrap a/b tests''' +"""Various comparison functions for use in bootstrap a/b tests""" from __future__ import print_function from __future__ import absolute_import from __future__ import division @@ -19,7 +19,7 @@ def difference(test_stat, ctrl_stat): Returns: test_stat - ctrl_stat """ - return (test_stat - ctrl_stat) + return test_stat - ctrl_stat def percent_change(test_stat, ctrl_stat): @@ -30,7 +30,7 @@ def percent_change(test_stat, ctrl_stat): Returns: (test_stat - ctrl_stat) / ctrl_stat * 100 """ - return (test_stat - ctrl_stat) * 100.0 / ctrl_stat + return (test_stat - ctrl_stat) * 100. / ctrl_stat def ratio(test_stat, ctrl_stat): @@ -53,4 +53,4 @@ def percent_difference(test_stat, ctrl_stat): Returns: (test_stat - ctrl_stat) / ((test_stat + ctrl_stat) / 2.0) * 100.0 """ - return (test_stat - ctrl_stat) / ((test_stat + ctrl_stat) / 2.0) * 100.0 + return (test_stat - ctrl_stat) / ((test_stat + ctrl_stat) / 2.) * 100. diff --git a/bootstrapped/permutation.py b/bootstrapped/permutation.py index 6a4d2ec..0d07aa6 100644 --- a/bootstrapped/permutation.py +++ b/bootstrapped/permutation.py @@ -4,15 +4,15 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. An additional grant # of patent rights can be found in the PATENTS file in the same directory. -'''Functions that allow one to run a permutation shuffle test''' +"""Functions that allow one to run a permutation shuffle test""" from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals -import numpy as _np import multiprocessing as _multiprocessing from warnings import warn +import numpy as _np MAX_ITER = 10000 MAX_ARRAY_SIZE = 10000 @@ -20,21 +20,14 @@ # Randomized permutation shuffle test def _get_permutation_result(permutation_dist, stat_val): - '''Get the permutation test result for a given distribution. + """Get the permutation test result for a given distribution. Args: - permutation_distribution: numpy array of permutation shuffle results + permutation_dist: numpy array of permutation shuffle results from permutation_distribution() stat_val: The overall statistic that this method is attempting to calculate error bars for. - ''' - - denom = len(permutation_dist) - - pct = ( - len(permutation_dist[_np.where(permutation_dist >= abs(stat_val))]) + - len(permutation_dist[_np.where(permutation_dist <= -abs(stat_val))]) - ) / denom - + """ + pct = _np.mean(_np.logical_or(permutation_dist <= -abs(stat_val), permutation_dist >= abs(stat_val))) return pct @@ -42,9 +35,9 @@ def _validate_arrays(values_lists): t = values_lists[0] t_type = type(t) if not isinstance(t, _np.ndarray): - raise ValueError(('The arrays must be of type numpy.array')) + raise ValueError('The arrays must be of type numpy.array') - for _, values in enumerate(values_lists[1:]): + for values in values_lists[1:]: if not isinstance(values, t_type): raise ValueError('The arrays must all be of the same type') @@ -54,11 +47,9 @@ def _validate_arrays(values_lists): def _generate_distributions(values_lists, num_iterations=0): values_shape = values_lists[0].shape[0] - ids = [] - - for _ in range(num_iterations): - ids.append(_np.random.choice(values_shape, values_shape, replace=False)) + ids = [_np.random.choice(values_shape, values_shape, replace=False) + for _ in range(num_iterations)] ids = _np.array(ids) results = [values[ids] for values in values_lists] @@ -67,9 +58,9 @@ def _generate_distributions(values_lists, num_iterations=0): def _permutation_sim(test_lists, ctrl_lists, stat_func_lists, num_iterations, iteration_batch_size, seed): - '''Returns simulated permutation distribution. + """Returns simulated permutation distribution. See permutation() function for arg descriptions. - ''' + """ if seed is not None: _np.random.seed(seed) @@ -77,9 +68,7 @@ def _permutation_sim(test_lists, ctrl_lists, stat_func_lists, num_iterations, num_iterations = int(num_iterations) iteration_batch_size = int(iteration_batch_size) - values_lists = [] - for i in range(len(test_lists)): - values_lists.append(_np.append(test_lists[i], ctrl_lists[i])) + values_lists = [_np.append(test_lists[i], ctrl_lists[i]) for i in range(len(test_lists))] test_results = [[] for _ in test_lists] ctrl_results = [[] for _ in ctrl_lists] @@ -92,7 +81,7 @@ def _permutation_sim(test_lists, ctrl_lists, stat_func_lists, num_iterations, values_sims = _generate_distributions(values_lists, max_rng) for i, result in enumerate(values_sims): for j in result: - test_sims[i].append(j[0:len(test_lists[0])]) + test_sims[i].append(j[:len(test_lists[0])]) ctrl_sims[i].append(j[len(test_lists[0]):]) for i, test_sim_stat_func in enumerate(zip(test_sims, stat_func_lists)): @@ -110,7 +99,7 @@ def _permutation_distribution(test_lists, ctrl_lists, stat_func_lists, num_iterations, iteration_batch_size, num_threads): - '''Returns the simulated permutation distribution. The idea is to sample the + """Returns the simulated permutation distribution. The idea is to sample the same indexes in a permutation shuffle across all arrays passed into values_lists. @@ -135,11 +124,10 @@ def _permutation_distribution(test_lists, ctrl_lists, stat_func_lists, num_threads: The number of threads to use. This speeds up calculation of the shuffle. Defaults to 1. If -1 is specified then multiprocessing.cpu_count() is used instead. - exact: True to run an exact permutation shuffle test. Returns: The set of permutation shuffle samples where each stat_function is applied on the shuffled values. - ''' + """ _validate_arrays(test_lists) _validate_arrays(ctrl_lists) @@ -161,11 +149,11 @@ def _permutation_distribution(test_lists, ctrl_lists, stat_func_lists, else: pool = _multiprocessing.Pool(num_threads) - iter_per_job = _np.ceil(num_iterations * 1.0 / num_threads) + iter_per_job = _np.ceil(num_iterations * 1. / num_threads) test_results = [] ctrl_results = [] - for seed in _np.random.randint(0, 2**32 - 1, num_threads): + for seed in _np.random.randint(0, 2 ** 32 - 1, num_threads): job_args = (test_lists, ctrl_lists, stat_func_lists, iter_per_job, iteration_batch_size, seed) t, c = pool.apply_async(_permutation_sim, job_args) @@ -184,7 +172,7 @@ def permutation_test(test, ctrl, stat_func, compare_func, test_denominator=None, ctrl_denominator=None, num_iterations=10000, iteration_batch_size=None, num_threads=1, return_distribution=False): - '''Returns bootstrap confidence intervals for an A/B test. + """Returns bootstrap confidence intervals for an A/B test. Args: test: numpy array (or scipy.sparse.csr_matrix) of test results ctrl: numpy array (or scipy.sparse.csr_matrix) of ctrl results @@ -216,14 +204,14 @@ def permutation_test(test, ctrl, stat_func, compare_func, test_denominator=None, iteration_batch_size: The bootstrap sample can generate very large arrays. This function iteration_batch_size limits the memory footprint by batching bootstrap rounds. - num_threads: The number of therads to use. This speeds up calculation of + num_threads: The number of threads to use. This speeds up calculation of the bootstrap. Defaults to 1. If -1 is specified then multiprocessing.cpu_count() is used instead. Returns: percentage representing the percentage of permutation distribution values that are more extreme than the original distribution. - ''' - is_large_array = (len(test) >= MAX_ARRAY_SIZE or len(ctrl) >= MAX_ARRAY_SIZE) + """ + is_large_array = len(test) >= MAX_ARRAY_SIZE or len(ctrl) >= MAX_ARRAY_SIZE if is_large_array and num_iterations > MAX_ITER: warning_text = ("Maximum array length of {} exceeded, " "limiting num_iterations to {}") @@ -273,6 +261,5 @@ def do_division(num, denom): if return_distribution: return test_ctrl_dist - else: - test_ctrl_val = compare_func(test_val, ctrl_val) - return _get_permutation_result(test_ctrl_dist, test_ctrl_val) + test_ctrl_val = compare_func(test_val, ctrl_val) + return _get_permutation_result(test_ctrl_dist, test_ctrl_val) diff --git a/bootstrapped/power.py b/bootstrapped/power.py index 6f1ac71..3d0645b 100644 --- a/bootstrapped/power.py +++ b/bootstrapped/power.py @@ -4,18 +4,18 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. An additional grant # of patent rights can be found in the PATENTS file in the same directory. -'Functions that allow one to perform power analysis' +"""Functions that allow one to perform power analysis""" from __future__ import print_function from __future__ import absolute_import from __future__ import division from __future__ import unicode_literals -import pandas as _pd -import numpy as _np import warnings as _warnings +import numpy as _np +import pandas as _pd def _get_power_df(bootstrap_result_list): - '''Returns a dataframe with importat statistics for power analysis + """Returns a dataframe with important statistics for power analysis Args: bootstrap_result_list: list of BootstrapResults @@ -32,7 +32,7 @@ def _get_power_df(bootstrap_result_list): power_df = bootstrap.get_power_df(results) - ''' + """ if len(bootstrap_result_list) < 3000: _warnings.warn(('bootstrap_result_list has very few examples. ' 'A general heuristic is to have at least 3k values. ' @@ -48,7 +48,7 @@ def _get_power_df(bootstrap_result_list): df['is_significant'] = is_sig df['test_result'] = df['upper_bound'].apply(_np.sign).astype(int) \ - * df['is_significant'].apply(lambda x: 1 if x else 0) + * df['is_significant'].apply(bool) result_cols = [ 'negative_significant', @@ -66,12 +66,12 @@ def _get_power_df(bootstrap_result_list): def power_stats(bootstrap_result_list): - '''Returns summary statistics about a power_df + """Returns summary statistics about a power_df Args: - power_df: get_power_df([BootstrapResult, ...]) + bootstrap_result_list: list of BootstrapResults Returns: A dataframe with summary statistics about the power of the simulation. - ''' + """ power_df = _get_power_df(bootstrap_result_list) pcnt_results = power_df.test_result.value_counts() * 100 / len(power_df) @@ -90,9 +90,9 @@ def power_stats(bootstrap_result_list): def plot_power(bootstrap_result_list, insignificant_color='blue', significant_color='orange', trend_color='black', zero_color='black'): - ''' + """ Args: - power_df: get_power_df([BootstrapResult, ...]) + bootstrap_result_list: list of BootstrapResults Example: @@ -108,7 +108,7 @@ def plot_power(bootstrap_result_list, insignificant_color='blue', bootstrap.power_stats(power_df) bootstrap.plot_power(power_df) - ''' + """ import matplotlib.pyplot as plt power_df = _get_power_df(bootstrap_result_list) diff --git a/bootstrapped/stats_functions.py b/bootstrapped/stats_functions.py index a490351..056373a 100644 --- a/bootstrapped/stats_functions.py +++ b/bootstrapped/stats_functions.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. An additional grant # of patent rights can be found in the PATENTS file in the same directory. -'''Various comparison statistics functions to run on bootstrap simulations''' +"""Various comparison statistics functions to run on bootstrap simulations""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -14,34 +14,32 @@ def mean(values, axis=1): - '''Returns the mean of each row of a matrix''' + """Returns the mean of each row of a matrix""" if isinstance(values, _sparse.csr_matrix): ret = values.mean(axis=axis) return ret.A1 - else: - return _np.mean(_np.asmatrix(values), axis=axis).A1 + return _np.mean(_np.asmatrix(values), axis=axis).A1 + def sum(values, axis=1): - '''Returns the sum of each row of a matrix''' + """Returns the sum of each row of a matrix""" if isinstance(values, _sparse.csr_matrix): ret = values.sum(axis=axis) return ret.A1 - else: - return _np.sum(_np.asmatrix(values), axis=axis).A1 + return _np.sum(_np.asmatrix(values), axis=axis).A1 def median(values, axis=1): - '''Returns the sum of each row of a matrix''' + """Returns the sum of each row of a matrix""" if isinstance(values, _sparse.csr_matrix): ret = values.median(axis=axis) return ret.A1 - else: - return _np.median(_np.asmatrix(values), axis=axis).A1 + return _np.median(_np.asmatrix(values), axis=axis).A1 + def std(values, axis=1): - '''Returns the std of each row of a matrix''' + """Returns the std of each row of a matrix""" if isinstance(values, _sparse.csr_matrix): ret = values.std(axis=axis) return ret.A1 - else: - return _np.std(_np.asmatrix(values), axis=axis).A1 + return _np.std(_np.asmatrix(values), axis=axis).A1 diff --git a/setup.py b/setup.py index ad66936..2faaa68 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ from setuptools import setup -readme = open('README.rst').read() +with open('README.rst') as f: + readme = f.read() setup( name="bootstrapped",