facebookarchive · bfgray3 · Jan 1, 2019
diff --git a/bootstrapped/bootstrap.py b/bootstrapped/bootstrap.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree. An additional grant
 # of patent rights can be found in the PATENTS file in the same directory.
-'''Functions that allow one to create bootstrapped confidence intervals'''
+"""Functions that allow one to create bootstrapped confidence intervals"""
 from __future__ import print_function
 from __future__ import absolute_import
 from __future__ import division
@@ -53,11 +53,11 @@ def __rmul__(self, other):
         return self._apply(float(other), lambda x, other: x * other)
 
     def error_width(self):
-        '''Returns: upper_bound - lower_bound'''
+        """Returns: upper_bound - lower_bound"""
         return self.upper_bound - self.lower_bound
 
     def error_fraction(self):
-        '''Returns the error_width / value'''
+        """Returns the error_width / value"""
         if self.value == 0:
             return _np.inf
         else:
@@ -67,16 +67,16 @@ def is_significant(self):
         return _np.sign(self.upper_bound) == _np.sign(self.lower_bound)
 
     def get_result(self):
-        '''Returns:
+        """Returns:
             -1 if statistically significantly negative
             +1 if statistically significantly positive
             0 otherwise
-        '''
+        """
         return int(self.is_significant()) * _np.sign(self.value)
 
 
 def _get_confidence_interval(bootstrap_dist, stat_val, alpha, is_pivotal):
-    '''Get the bootstrap confidence interval for a given distribution.
+    """Get the bootstrap confidence interval for a given distribution.
     Args:
         bootstrap_distribution: numpy array of bootstrap results from
             bootstrap_distribution() or bootstrap_ab_distribution()
@@ -85,7 +85,7 @@ def _get_confidence_interval(bootstrap_dist, stat_val, alpha, is_pivotal):
         alpha: The alpha value for the confidence intervals.
         is_pivotal: if true, use the pivotal method. if false, use the
             percentile method.
-    '''
+    """
     if is_pivotal:
         low = 2 * stat_val - _np.percentile(bootstrap_dist, 100 * (1 - alpha / 2.))
         val = stat_val
@@ -192,9 +192,9 @@ def _generate_distributions(values_lists, num_iterations):
 
 def _bootstrap_sim(values_lists, stat_func_lists, num_iterations,
                    iteration_batch_size, seed):
-    '''Returns simulated bootstrap distribution.
-    See bootstrap() funciton for arg descriptions.
-    '''
+    """Returns simulated bootstrap distribution.
+    See bootstrap() function for arg descriptions.
+    """
 
     if seed is not None:
         _np.random.seed(seed)
@@ -218,7 +218,7 @@ def _bootstrap_sim(values_lists, stat_func_lists, num_iterations,
 def _bootstrap_distribution(values_lists, stat_func_lists,
                             num_iterations, iteration_batch_size, num_threads):
 
-    '''Returns the simulated bootstrap distribution. The idea is to sample the same
+    """Returns the simulated bootstrap distribution. The idea is to sample the same
         indexes in a bootstrap re-sample across all arrays passed into values_lists.
 
         This is especially useful when you want to co-sample records in a ratio metric.
@@ -244,8 +244,8 @@ def _bootstrap_distribution(values_lists, stat_func_lists,
             multiprocessing.cpu_count() is used instead.
     Returns:
         The set of bootstrap resamples where each stat_function is applied on
-        the bootsrapped values.
-    '''
+        the bootstrapped values.
+    """
 
     _validate_arrays(values_lists)
 
@@ -285,7 +285,7 @@ def _bootstrap_distribution(values_lists, stat_func_lists,
 def bootstrap(values, stat_func, denominator_values=None, alpha=0.05,
               num_iterations=10000, iteration_batch_size=None, is_pivotal=True,
               num_threads=1, return_distribution=False):
-    '''Returns bootstrap estimate.
+    """Returns bootstrap estimate.
     Args:
         values: numpy array (or scipy.sparse.csr_matrix) of values to bootstrap
         stat_func: statistic to bootstrap. We provide several default functions:
@@ -324,7 +324,7 @@ def bootstrap(values, stat_func, denominator_values=None, alpha=0.05,
             multiprocessing.cpu_count() is used instead.
     Returns:
         BootstrapResults representing CI and estimated value.
-    '''
+    """
     if denominator_values is None:
         values_lists = [values]
         stat_func_lists = [stat_func]
@@ -361,7 +361,7 @@ def bootstrap_ab(test, ctrl, stat_func, compare_func, test_denominator=None,
                  ctrl_denominator=None, alpha=0.05, num_iterations=10000,
                  iteration_batch_size=None, scale_test_by=1.0,
                  is_pivotal=True, num_threads=1, return_distribution=False):
-    '''Returns bootstrap confidence intervals for an A/B test.
+    """Returns bootstrap confidence intervals for an A/B test.
     Args:
         test: numpy array (or scipy.sparse.csr_matrix) of test results
         ctrl: numpy array (or scipy.sparse.csr_matrix) of ctrl results
@@ -400,12 +400,12 @@ def bootstrap_ab(test, ctrl, stat_func, compare_func, test_denominator=None,
             50/50 split. Defaults to 1.0.
         is_pivotal: if true, use the pivotal method for bootstrapping confidence
             intervals. If false, use the percentile method.
-        num_threads: The number of therads to use. This speeds up calculation of
+        num_threads: The number of threads to use. This speeds up calculation of
             the bootstrap. Defaults to 1. If -1 is specified then
             multiprocessing.cpu_count() is used instead.
     Returns:
         BootstrapResults representing CI and estimated value.
-    '''
+    """
 
     both_denominators = test_denominator is not None and \
             ctrl_denominator is not None

diff --git a/bootstrapped/compare_functions.py b/bootstrapped/compare_functions.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree. An additional grant
 # of patent rights can be found in the PATENTS file in the same directory.
-'''Various comparison functions for use in bootstrap a/b tests'''
+"""Various comparison functions for use in bootstrap a/b tests"""
 from __future__ import print_function
 from __future__ import absolute_import
 from __future__ import division
@@ -19,7 +19,7 @@ def difference(test_stat, ctrl_stat):
     Returns:
         test_stat - ctrl_stat
     """
-    return (test_stat - ctrl_stat)
+    return test_stat - ctrl_stat
 
 
 def percent_change(test_stat, ctrl_stat):
@@ -30,7 +30,7 @@ def percent_change(test_stat, ctrl_stat):
     Returns:
         (test_stat - ctrl_stat) / ctrl_stat * 100
     """
-    return (test_stat - ctrl_stat) * 100.0 / ctrl_stat
+    return (test_stat - ctrl_stat) * 100. / ctrl_stat
 
 
 def ratio(test_stat, ctrl_stat):
@@ -53,4 +53,4 @@ def percent_difference(test_stat, ctrl_stat):
     Returns:
         (test_stat - ctrl_stat) / ((test_stat + ctrl_stat) / 2.0) * 100.0
     """
-    return (test_stat - ctrl_stat) / ((test_stat + ctrl_stat) / 2.0) * 100.0
+    return (test_stat - ctrl_stat) / ((test_stat + ctrl_stat) / 2.) * 100.
diff --git a/bootstrapped/permutation.py b/bootstrapped/permutation.py
@@ -4,47 +4,40 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree. An additional grant
 # of patent rights can be found in the PATENTS file in the same directory.
-'''Functions that allow one to run a permutation shuffle test'''
+"""Functions that allow one to run a permutation shuffle test"""
 from __future__ import print_function
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import unicode_literals
 
-import numpy as _np
 import multiprocessing as _multiprocessing
 from warnings import warn
+import numpy as _np
 
 MAX_ITER = 10000
 MAX_ARRAY_SIZE = 10000
 
 
 # Randomized permutation shuffle test
 def _get_permutation_result(permutation_dist, stat_val):
-    '''Get the permutation test result for a given distribution.
+    """Get the permutation test result for a given distribution.
     Args:
-        permutation_distribution: numpy array of permutation shuffle results
+        permutation_dist: numpy array of permutation shuffle results
             from permutation_distribution()
         stat_val: The overall statistic that this method is attempting to
             calculate error bars for.
-    '''
-
-    denom = len(permutation_dist)
-
-    pct = (
-        len(permutation_dist[_np.where(permutation_dist >= abs(stat_val))]) +
-        len(permutation_dist[_np.where(permutation_dist <= -abs(stat_val))])
-    ) / denom
-
+    """
+    pct = _np.mean(_np.logical_or(permutation_dist <= -abs(stat_val), permutation_dist >= abs(stat_val)))
     return pct
 
 
 def _validate_arrays(values_lists):
     t = values_lists[0]
     t_type = type(t)
     if not isinstance(t, _np.ndarray):
-        raise ValueError(('The arrays must be of type numpy.array'))
+        raise ValueError('The arrays must be of type numpy.array')
 
-    for _, values in enumerate(values_lists[1:]):
+    for values in values_lists[1:]:
         if not isinstance(values, t_type):
             raise ValueError('The arrays must all be of the same type')
 
@@ -54,11 +47,9 @@ def _validate_arrays(values_lists):
 
 def _generate_distributions(values_lists, num_iterations=0):
     values_shape = values_lists[0].shape[0]
-    ids = []
-
-    for _ in range(num_iterations):
-        ids.append(_np.random.choice(values_shape, values_shape, replace=False))
 
+    ids = [_np.random.choice(values_shape, values_shape, replace=False)
+           for _ in range(num_iterations)]
     ids = _np.array(ids)
 
     results = [values[ids] for values in values_lists]
@@ -67,19 +58,17 @@ def _generate_distributions(values_lists, num_iterations=0):
 
 def _permutation_sim(test_lists, ctrl_lists, stat_func_lists, num_iterations,
                      iteration_batch_size, seed):
-    '''Returns simulated permutation distribution.
+    """Returns simulated permutation distribution.
     See permutation() function for arg descriptions.
-    '''
+    """
 
     if seed is not None:
         _np.random.seed(seed)
 
     num_iterations = int(num_iterations)
     iteration_batch_size = int(iteration_batch_size)
 
-    values_lists = []
-    for i in range(len(test_lists)):
-        values_lists.append(_np.append(test_lists[i], ctrl_lists[i]))
+    values_lists = [_np.append(test_lists[i], ctrl_lists[i]) for i in range(len(test_lists))]
 
     test_results = [[] for _ in test_lists]
     ctrl_results = [[] for _ in ctrl_lists]
@@ -92,7 +81,7 @@ def _permutation_sim(test_lists, ctrl_lists, stat_func_lists, num_iterations,
         values_sims = _generate_distributions(values_lists, max_rng)
         for i, result in enumerate(values_sims):
             for j in result:
-                test_sims[i].append(j[0:len(test_lists[0])])
+                test_sims[i].append(j[:len(test_lists[0])])
                 ctrl_sims[i].append(j[len(test_lists[0]):])
 
         for i, test_sim_stat_func in enumerate(zip(test_sims, stat_func_lists)):
@@ -110,7 +99,7 @@ def _permutation_distribution(test_lists, ctrl_lists, stat_func_lists,
                               num_iterations, iteration_batch_size,
                               num_threads):
 
-    '''Returns the simulated permutation distribution. The idea is to sample the
+    """Returns the simulated permutation distribution. The idea is to sample the
         same indexes in a permutation shuffle across all arrays passed into
         values_lists.
 
@@ -135,11 +124,10 @@ def _permutation_distribution(test_lists, ctrl_lists, stat_func_lists,
         num_threads: The number of threads to use. This speeds up calculation of
             the shuffle. Defaults to 1. If -1 is specified then
             multiprocessing.cpu_count() is used instead.
-        exact: True to run an exact permutation shuffle test.
     Returns:
         The set of permutation shuffle samples where each stat_function is
         applied on the shuffled values.
-    '''
+    """
     _validate_arrays(test_lists)
     _validate_arrays(ctrl_lists)
 
@@ -161,11 +149,11 @@ def _permutation_distribution(test_lists, ctrl_lists, stat_func_lists,
     else:
         pool = _multiprocessing.Pool(num_threads)
 
-        iter_per_job = _np.ceil(num_iterations * 1.0 / num_threads)
+        iter_per_job = _np.ceil(num_iterations * 1. / num_threads)
 
         test_results = []
         ctrl_results = []
-        for seed in _np.random.randint(0, 2**32 - 1, num_threads):
+        for seed in _np.random.randint(0, 2 ** 32 - 1, num_threads):
             job_args = (test_lists, ctrl_lists, stat_func_lists, iter_per_job,
                         iteration_batch_size, seed)
             t, c = pool.apply_async(_permutation_sim, job_args)
@@ -184,7 +172,7 @@ def permutation_test(test, ctrl, stat_func, compare_func, test_denominator=None,
                      ctrl_denominator=None, num_iterations=10000,
                      iteration_batch_size=None, num_threads=1,
                      return_distribution=False):
-    '''Returns bootstrap confidence intervals for an A/B test.
+    """Returns the permutation shuffle test result for an A/B test.
     Args:
         test: numpy array (or scipy.sparse.csr_matrix) of test results
         ctrl: numpy array (or scipy.sparse.csr_matrix) of ctrl results
@@ -216,14 +204,14 @@ def permutation_test(test, ctrl, stat_func, compare_func, test_denominator=None,
         iteration_batch_size: The bootstrap sample can generate very large
             arrays. This function iteration_batch_size limits the memory
             footprint by batching bootstrap rounds.
-        num_threads: The number of therads to use. This speeds up calculation of
+        num_threads: The number of threads to use. This speeds up calculation of
             the bootstrap. Defaults to 1. If -1 is specified then
             multiprocessing.cpu_count() is used instead.
     Returns:
         percentage representing the percentage of permutation distribution
             values that are more extreme than the original distribution.
-    '''
-    is_large_array = (len(test) >= MAX_ARRAY_SIZE or len(ctrl) >= MAX_ARRAY_SIZE)
+    """
+    is_large_array = len(test) >= MAX_ARRAY_SIZE or len(ctrl) >= MAX_ARRAY_SIZE
     if is_large_array and num_iterations > MAX_ITER:
         warning_text = ("Maximum array length of {} exceeded, "
                         "limiting num_iterations to {}")
@@ -273,6 +261,5 @@ def do_division(num, denom):
 
     if return_distribution:
         return test_ctrl_dist
-    else:
-        test_ctrl_val = compare_func(test_val, ctrl_val)
-        return _get_permutation_result(test_ctrl_dist, test_ctrl_val)
+    test_ctrl_val = compare_func(test_val, ctrl_val)
+    return _get_permutation_result(test_ctrl_dist, test_ctrl_val)