From 9e9d6dc225cc30659d699a72dacfdf5be94679b4 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Wed, 24 Nov 2021 13:33:39 -0800 Subject: [PATCH 1/8] bootstrap ci math helper --- src/common/math.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 src/common/math.py diff --git a/src/common/math.py b/src/common/math.py new file mode 100644 index 00000000..d0890eae --- /dev/null +++ b/src/common/math.py @@ -0,0 +1,50 @@ +""" +Helper math functions +""" +import os +import argparse +import logging +import numpy as np + +def bootstrap_ci(data, iterations=1000, operators={'mean':np.mean}, alpha=0.05): + """ + Args: + data (np.array) : input data + iterations (int) : how many bootstrapped samples to generate + operators (Dict[str->func]) : map of functions to produce CI for + + Returns: + bootstrap_cis: Dict[str->tuple] + """ + # values will be stored in a dict + bootstrap_runs = {} + for operator_key in operators.keys(): + bootstrap_runs[operator_key] = [] + + sample_size = len(data) + for _ in range(iterations): + bootstrap = np.random.choice(data, size=sample_size, replace=True) + for operator_key, operator_func in operators.items(): + bootstrap_runs[operator_key].append(operator_func(bootstrap)) + + operators_ci = {} + for operator_key in operators.keys(): + values = np.array(bootstrap_runs[operator_key]) + ci_left = np.percentile(values, round(alpha/2*100)) + ci_right = np.percentile(values, round(100-alpha/2*100)) + ci_mean = np.mean(values) # just for fun + operators_ci[operator_key] = (ci_left, ci_mean, ci_right) + + return(operators_ci) + +if __name__ == "__main__": + sample_data = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 5.0]) + print(bootstrap_ci( + sample_data, + iterations=1000, + operators={ + 'mean':np.mean, + 'p90': (lambda x : np.percentile(x, 90)), + 'p99': (lambda x : np.percentile(x, 99)), + } + )) \ No newline at end of file From 520c905c773be4ad892169b5c82ef0a011effe22 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Wed, 24 Nov 2021 13:39:13 -0800 Subject: [PATCH 2/8] use confidence level instead of alpha --- src/common/math.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/common/math.py b/src/common/math.py index d0890eae..4f63f35b 100644 --- a/src/common/math.py +++ b/src/common/math.py @@ -6,15 +6,16 @@ import logging import numpy as np -def bootstrap_ci(data, iterations=1000, operators={'mean':np.mean}, alpha=0.05): +def bootstrap_ci(data, iterations=1000, operators={'mean':np.mean}, confidence_level=0.95): """ Args: data (np.array) : input data iterations (int) : how many bootstrapped samples to generate operators (Dict[str->func]) : map of functions to produce CI for + confidence_level (float) : confidence_level = 1-alpha Returns: - bootstrap_cis: Dict[str->tuple] + operators_ci: Dict[str->tuple] """ # values will be stored in a dict bootstrap_runs = {} @@ -30,8 +31,8 @@ def bootstrap_ci(data, iterations=1000, operators={'mean':np.mean}, alpha=0.05): operators_ci = {} for operator_key in operators.keys(): values = np.array(bootstrap_runs[operator_key]) - ci_left = np.percentile(values, round(alpha/2*100)) - ci_right = np.percentile(values, round(100-alpha/2*100)) + ci_left = np.percentile(values, ((1-confidence_level)/2*100)) + ci_right = np.percentile(values, (100-(1-confidence_level)/2*100)) ci_mean = np.mean(values) # just for fun operators_ci[operator_key] = (ci_left, ci_mean, ci_right) From b288e428e5c3489e6365874b2baf47726eb53d68 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Wed, 24 Nov 2021 13:53:55 -0800 Subject: [PATCH 3/8] add unit tests --- src/common/math.py | 14 +----------- tests/common/test_math.py | 47 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 13 deletions(-) create mode 100644 tests/common/test_math.py diff --git a/src/common/math.py b/src/common/math.py index 4f63f35b..8defb88f 100644 --- a/src/common/math.py +++ b/src/common/math.py @@ -6,7 +6,7 @@ import logging import numpy as np -def bootstrap_ci(data, iterations=1000, operators={'mean':np.mean}, confidence_level=0.95): +def bootstrap_ci(data, iterations=1000, operators={'mean':np.mean}, confidence_level=0.95, seed=None): """ Args: data (np.array) : input data @@ -37,15 +37,3 @@ def bootstrap_ci(data, iterations=1000, operators={'mean':np.mean}, confidence_l operators_ci[operator_key] = (ci_left, ci_mean, ci_right) return(operators_ci) - -if __name__ == "__main__": - sample_data = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 5.0]) - print(bootstrap_ci( - sample_data, - iterations=1000, - operators={ - 'mean':np.mean, - 'p90': (lambda x : np.percentile(x, 90)), - 'p99': (lambda x : np.percentile(x, 99)), - } - )) \ No newline at end of file diff --git a/tests/common/test_math.py b/tests/common/test_math.py new file mode 100644 index 00000000..84dbed9b --- /dev/null +++ b/tests/common/test_math.py @@ -0,0 +1,47 @@ +"""Tests src/common/math.py""" +import os +import pytest +import numpy as np + +from common.math import bootstrap_ci + +def test_bootstrap_ci(): + """Testing the bootstrap_ci method, but we can't have a non-deterministic test here. """ + sample_data = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 5.0]) + operators={ + 'mean':np.mean, + 'p90': (lambda x : np.percentile(x, 90)), + 'p99': (lambda x : np.percentile(x, 99)), + } + + np.random.seed(404) # fixed const + + # because we're fixing the seed, we can actually go deeper + expected_values = { + 'mean': (0.30000000000000004, 0.99395, 2.1624999999999996), + 'p90': (0.5, 2.36413, 5.0), + 'p99': (0.593, 3.469213, 5.0), + } + + returned_values = bootstrap_ci( + sample_data, + iterations=1000, + operators=operators + ) + + for key in operators: + # check type + assert key in returned_values + assert isinstance(returned_values[key], tuple) + assert len(returned_values[key]) == 3 + + # basic interval over + ci_left, ci_mean, ci_right = returned_values[key] + assert ci_left <= ci_mean + assert ci_mean <= ci_right + + # because it's a bootstrap, these are supposed to be true + assert min(sample_data) <= ci_left + assert ci_right <= max(sample_data) + + assert returned_values == expected_values \ No newline at end of file From e1fede7aa6c2337f0e2ed58f02e8b63edfba386f Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Wed, 24 Nov 2021 13:56:21 -0800 Subject: [PATCH 4/8] add unit tests --- tests/common/test_math.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/common/test_math.py b/tests/common/test_math.py index 84dbed9b..3956be3a 100644 --- a/tests/common/test_math.py +++ b/tests/common/test_math.py @@ -5,7 +5,7 @@ from common.math import bootstrap_ci -def test_bootstrap_ci(): +def test_bootstrap_ci_fixed_seed(): """Testing the bootstrap_ci method, but we can't have a non-deterministic test here. """ sample_data = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 5.0]) operators={ @@ -29,6 +29,26 @@ def test_bootstrap_ci(): operators=operators ) + assert returned_values == expected_values + + +def test_bootstrap_ci_no_seed(): + """Testing the bootstrap_ci method, but we can't have a non-deterministic test here. """ + sample_data = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 5.0]) + operators={ + 'mean':np.mean, + 'p90': (lambda x : np.percentile(x, 90)), + 'p99': (lambda x : np.percentile(x, 99)), + } + + np.random.seed(None) # not const + + returned_values = bootstrap_ci( + sample_data, + iterations=1000, + operators=operators + ) + for key in operators: # check type assert key in returned_values @@ -43,5 +63,3 @@ def test_bootstrap_ci(): # because it's a bootstrap, these are supposed to be true assert min(sample_data) <= ci_left assert ci_right <= max(sample_data) - - assert returned_values == expected_values \ No newline at end of file From 9e956b810b25689e96e925afa25e4a25ffd390db Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Wed, 24 Nov 2021 13:57:41 -0800 Subject: [PATCH 5/8] add unit tests --- tests/common/test_math.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/common/test_math.py b/tests/common/test_math.py index 3956be3a..eb7cec2c 100644 --- a/tests/common/test_math.py +++ b/tests/common/test_math.py @@ -34,14 +34,15 @@ def test_bootstrap_ci_fixed_seed(): def test_bootstrap_ci_no_seed(): """Testing the bootstrap_ci method, but we can't have a non-deterministic test here. """ - sample_data = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 5.0]) + np.random.seed(None) # not const + + sample_data = np.random.rand(100) operators={ 'mean':np.mean, 'p90': (lambda x : np.percentile(x, 90)), 'p99': (lambda x : np.percentile(x, 99)), } - np.random.seed(None) # not const returned_values = bootstrap_ci( sample_data, From 86a012188f43cbc38c0dfa9e5f7c6986b41124f4 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Wed, 24 Nov 2021 14:00:02 -0800 Subject: [PATCH 6/8] add unit tests --- tests/common/test_math.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/common/test_math.py b/tests/common/test_math.py index eb7cec2c..6de360c6 100644 --- a/tests/common/test_math.py +++ b/tests/common/test_math.py @@ -64,3 +64,8 @@ def test_bootstrap_ci_no_seed(): # because it's a bootstrap, these are supposed to be true assert min(sample_data) <= ci_left assert ci_right <= max(sample_data) + + # tests that are specific to the operators + assert returned_values['p90'][0] <= returned_values['p99'][0] # p90 < p99 so left CI also + assert returned_values['p90'][1] <= returned_values['p99'][1] # p90 < p99 so mean also + assert returned_values['p90'][2] <= returned_values['p99'][2] # p90 < p99 so right CI also From 34ca1341083c70c44f92a27c19bf2f68dc455984 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Wed, 24 Nov 2021 14:00:44 -0800 Subject: [PATCH 7/8] add unit tests --- tests/common/test_math.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/common/test_math.py b/tests/common/test_math.py index 6de360c6..53609c4e 100644 --- a/tests/common/test_math.py +++ b/tests/common/test_math.py @@ -26,7 +26,8 @@ def test_bootstrap_ci_fixed_seed(): returned_values = bootstrap_ci( sample_data, iterations=1000, - operators=operators + operators=operators, + confidence_level=0.95 ) assert returned_values == expected_values @@ -47,7 +48,8 @@ def test_bootstrap_ci_no_seed(): returned_values = bootstrap_ci( sample_data, iterations=1000, - operators=operators + operators=operators, + confidence_level=0.95 ) for key in operators: From e01e03ababa4c949ec96f68b35d24bfc0296959d Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Wed, 24 Nov 2021 14:06:30 -0800 Subject: [PATCH 8/8] add unit tests --- tests/common/test_math.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/common/test_math.py b/tests/common/test_math.py index 53609c4e..75aaab59 100644 --- a/tests/common/test_math.py +++ b/tests/common/test_math.py @@ -44,7 +44,6 @@ def test_bootstrap_ci_no_seed(): 'p99': (lambda x : np.percentile(x, 99)), } - returned_values = bootstrap_ci( sample_data, iterations=1000, @@ -58,7 +57,7 @@ def test_bootstrap_ci_no_seed(): assert isinstance(returned_values[key], tuple) assert len(returned_values[key]) == 3 - # basic interval over + # basic interval ordering ci_left, ci_mean, ci_right = returned_values[key] assert ci_left <= ci_mean assert ci_mean <= ci_right