diff --git a/.gitignore b/.gitignore index 1c4e86d..e41947a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ build/* dist/* +doc/build/* *.egg-info __pycache__ .ruff_cache diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..be9d2e1 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,24 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +buildapi: + sphinx-apidoc -efM ../squigglepy -o source/reference + @echo "Auto-generation of API documentation finished. " \ + "The generated files are in 'api/'" diff --git a/doc/make.bat b/doc/make.bat new file mode 100644 index 0000000..543c6b1 --- /dev/null +++ b/doc/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/doc/source/conf.py b/doc/source/conf.py new file mode 100644 index 0000000..ebbe282 --- /dev/null +++ b/doc/source/conf.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = "Squigglepy" +copyright = "2023, Peter Wildeford" +author = "Peter Wildeford" + +# The short X.Y version +version = "" +# The full version, including alpha/beta/rc tags +release = "" + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.imgmath", + "sphinx.ext.viewcode", + "numpydoc", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix(es) of source filenames. 
+# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = ".rst" + +# The master toctree document. +master_doc = "index" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = "en" + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "pydata_sphinx_theme" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. 
+htmlhelp_basename = "Squigglepydoc" + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, "Squigglepy.tex", "Squigglepy Documentation", "Peter Wildeford", "manual"), +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [(master_doc, "squigglepy", "Squigglepy Documentation", [author], 1)] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + "Squigglepy", + "Squigglepy Documentation", + author, + "Squigglepy", + "One line description of project.", + "Miscellaneous", + ), +] + + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. 
+epub_exclude_files = ["search.html"] + + +# -- Extension configuration ------------------------------------------------- + +numpydoc_class_members_toctree = False diff --git a/doc/source/index.rst b/doc/source/index.rst new file mode 100644 index 0000000..6a637ce --- /dev/null +++ b/doc/source/index.rst @@ -0,0 +1,52 @@ +Squigglepy: Implementation of Squiggle in Python +================================================ + +`Squiggle `__ is a "simple +programming language for intuitive probabilistic estimation". It serves +as its own standalone programming language with its own syntax, but it +is implemented in JavaScript. I like the features of Squiggle and intend +to use it frequently, but I also sometimes want to use similar +functionalities in Python, especially alongside other Python statistical +programming packages like Numpy, Pandas, and Matplotlib. The +**squigglepy** package here implements many Squiggle-like +functionalities in Python. + +.. toctree:: + :maxdepth: 2 + :caption: Contents + + Installation + Usage + Numeric Distributions + API Reference + +Disclaimers +----------- + +This package is unofficial and supported by Peter Wildeford and Rethink +Priorities. It is not affiliated with or associated with the Quantified +Uncertainty Research Institute, which maintains the Squiggle language +(in JavaScript). + +This package is also new and not yet in a stable production version, so +you may encounter bugs and other errors. Please report those so they can +be fixed. It’s also possible that future versions of the package may +introduce breaking changes. + +This package is available under an MIT License. + +Acknowledgements +---------------- + +- The primary author of this package is Peter Wildeford. Agustín + Covarrubias and Bernardo Baron contributed several key features and + developments. +- Thanks to Ozzie Gooen and the Quantified Uncertainty Research + Institute for creating and maintaining the original Squiggle + language. 
+- Thanks to Dawn Drescher for helping me implement math between + distributions. +- Thanks to Dawn Drescher for coming up with the idea to use ``~`` as a + shorthand for ``sample``, as well as helping me implement it. + +.. autosummary:: diff --git a/doc/source/installation.rst b/doc/source/installation.rst new file mode 100644 index 0000000..fb5e8d9 --- /dev/null +++ b/doc/source/installation.rst @@ -0,0 +1,12 @@ +Installation +============ + +.. code:: shell + + pip install squigglepy + +For plotting support, you can also use the ``plots`` extra: + +.. code:: shell + + pip install squigglepy[plots] diff --git a/doc/source/numeric_distributions.rst b/doc/source/numeric_distributions.rst new file mode 100644 index 0000000..f2e3f4e --- /dev/null +++ b/doc/source/numeric_distributions.rst @@ -0,0 +1,190 @@ +Numeric Distributions +===================== + +A ``NumericDistribution`` represents a probability distribution as a histogram +of values along with the probability mass near each value. + +A ``NumericDistribution`` is functionally equivalent to a Monte Carlo +simulation where you generate infinitely many samples and then group the +samples into finitely many bins, keeping track of the proportion of samples +in each bin (a.k.a. the probability mass) and the average value for each +bin. + +Compared to a Monte Carlo simulation, ``NumericDistribution`` can represent +information much more densely by grouping together nearby values (although +some information is lost in the grouping). The benefit of this is most +obvious in fat-tailed distributions. In a Monte Carlo simulation, perhaps 1 +in 1000 samples account for 10% of the expected value, but a +``NumericDistribution`` (with the right bin sizing method, see +:any:`BinSizing`) can easily track the probability mass of large values. + +Accuracy +-------- + +The construction of ``NumericDistribution`` ensures that its expected value +is always close to 100% accurate. 
The higher moments (standard deviation, +skewness, etc.) and percentiles are less accurate, but still almost always +more accurate than Monte Carlo in practice. + +We are probably most interested in the accuracy of percentiles. Consider a +simulation that applies binary operations to combine ``m`` different +``NumericDistribution`` s, each of which has ``n`` bins. The relative error of +estimated percentiles grows with approximately :math:`O(m / n^{1.5})` or better. +That is, the error is proportional to the number of operations and inversely +proportional to the number of bins to the power of 1.5. This is only an +approximation, and the exact error rate depends on the shapes of the underlying +distributions. + +Compare this to the relative error of percentiles for a Monte Carlo (MC) +simulation over a log-normal distribution. MC relative error grows with +:math:`O(1 / n)` [1], given the assumption that if our +``NumericDistribution`` has ``n`` bins, then our MC simulation runs ``n^2`` +samples (because that way both have a runtime of approximately :math:`O(n^2)`). +So MC scales worse with ``n``, but better with ``m``. + +I tested accuracy across a range of percentiles for a variety of values of ``m`` +and ``n``. Although MC scales better with ``m`` than ``NumericDistribution``, MC +does not achieve lower error rates until ``m = 500`` or so when using 200 +bins/40,000 MC samples (depending on the distribution shape). Few models will +use 500 distinct variables, so ``NumericDistribution`` should nearly always +perform better in practice. + +We are also interested in the accuracy of the standard deviation. The error of +``NumericDistribution``'s estimated standard deviation scales with approximately +:math:`O(\sqrt[4]{m} / n^{1.5})`. The error of Monte Carlo standard deviation +scales with :math:`O(\sqrt{m} / n)`. + +Where possible, ``NumericDistribution`` uses `Richardson extrapolation +<https://en.wikipedia.org/wiki/Richardson_extrapolation>`_ over the bins to +improve accuracy. 
When done correctly, Richardson extrapolation significantly +reduces the big-O error of an operation. But doing Richardson extrapolation +correctly requires setting a rate parameter correctly, and the rate parameter +depends on the shape of the distribution and is not knowable in general. +``NumericDistribution`` simply uses the same rate parameter for all +distributions, which somewhat reduces error, but might not reduce the big-O +error. + +[1] Goodman (1983). Accuracy and Efficiency of Monte Carlo Method. +https://inis.iaea.org/collection/NCLCollectionStore/_Public/19/047/19047359.pdf + +Speed +----- +I tested ``NumericDistribution`` versus Monte Carlo on two fairly complex +example models (originally written by Laura Duffy to evaluate hypothetical +animal welfare and x-risk interventions). On the first model, +``NumericDistribution`` performed ~300x better at the same level of accuracy. +The second model used some operations for which I cannot reliably assess the +accuracy, but my best guess is that ``NumericDistribution`` performs ~10x +better. + +``NumericDistribution`` may be +slower at the same level of accuracy for a model that uses sufficiently many +operations that ``NumericDistribution`` handles relatively poorly. The most +inaccurate operations are ``log`` and ``exp`` because they significantly alter +the shape of the distribution. + +The biggest performance upside to Monte Carlo is that it is trivial to +parallelize by simply generating samples on separate threads. +``NumericDistribution`` does not currently support multithreading. + +Some more details about the runtime performance of ``NumericDistribution``: + +Where ``n`` is the number of bins, constructing a ``NumericDistribution`` or +performing a unary operation has runtime :math:`O(n)`. A binary operation (such +as addition or multiplication) has a runtime close to :math:`O(n^2)`. 
To be +precise, the runtime is :math:`O(n^2 \log(n))` because the :math:`n^2` results +of a binary operation must be partitioned into :math:`n` ordered bins. In +practice, this partitioning operation takes up a fairly small portion of the +runtime for ``n = 200`` (which is the default bin count), and takes up ~half the +runtime for ``n = 1000``. + +For ``n = 200``, a binary operation takes about twice as long as constructing a +``NumericDistribution``. Even though construction is :math:`O(n)` and binary +operations are :math:`O(n^2)`, construction has a much worse constant factor for +normal and log-normal distributions because construction requires repeatedly +computing the `error function <https://en.wikipedia.org/wiki/Error_function>`_, +which is fairly slow. + +The accuracy to speed ratio gets worse as ``n`` increases, so you typically +don't want to use bin counts larger than the default unless you're particularly +concerned about accuracy. + +Implementation Details +====================== + +On setting values within bins +----------------------------- +Whenever possible, NumericDistribution assigns the value of each bin as the +average value between the two edges (weighted by mass). You can think of +this as the result you'd get if you generated infinitely many Monte Carlo +samples and grouped them into bins, setting the value of each bin as the +average of the samples. You might call this the expected value (EV) method, +in contrast to two methods described below. + +The EV method guarantees that, whenever the histogram width covers the full +support of the distribution, the histogram's expected value exactly equals +the expected value of the true distribution (modulo floating point rounding +errors). + +There are some other methods we could use, which are generally worse: + +1. Set the value of each bin to the average of the two edges (the +"trapezoid rule"). 
The purpose of using the trapezoid rule is that we don't +know the probability mass within a bin (perhaps the CDF is too hard to +evaluate) so we have to estimate it. But whenever we *do* know the CDF, we +can calculate the probability mass exactly, so we don't need to use the +trapezoid rule. + +2. Set the value of each bin to the center of the probability mass (the +"mass method"). This is equivalent to generating infinitely many Monte +Carlo samples and grouping them into bins, setting the value of each bin as +the **median** of the samples. This approach does not particularly help us +because we don't care about the median of every bin. We might care about +the median of the distribution, but we can calculate that near-exactly +regardless of what value-setting method we use by looking at the value in +the bin where the probability mass crosses 0.5. And the mass method will +systematically underestimate (the absolute value of) EV because the +definition of expected value places larger weight on larger (absolute) +values, and the mass method does not. + +Although the EV method perfectly measures the expected value of a distribution, +it systematically underestimates the variance. To see this, consider that it is +possible to define the variance of a random variable X as :math:`E[X^2] - +E[X]^2`. The EV method correctly estimates :math:`E[X]`, so it also correctly +estimates :math:`E[X]^2`. However, it systematically underestimates +:math:`E[X^2]` because :math:`E[X^2]` places more weight on larger values. But +an alternative method that accurately estimated variance would necessarily +overestimate :math:`E[X]`. It's possible to force both mean and variance to be +exactly correct by adjusting the value of each bin according to its z-score, but +this could make other summary statistics less accurate. 
+ +On bin sizing for two-sided distributions +----------------------------------------- +:any:`squigglepy.numeric_distribution.BinSizing` describes the various +bin-sizing methods and their properties. ``BinSizing.ev`` is usually preferred. +``BinSizing.ev`` allocates bins such that each bin contributes equally to the +distribution's expected value. This has the nice property that bins are narrower +in the regions that matter more for the distribution's EV, for example, in +fat-tailed distributions it puts more bins in the tail(s) and fewer bins in the +center. + +The interpretation of the EV bin-sizing method is slightly non-obvious +for two-sided distributions because we must decide how to interpret bins +with negative expected value. + +The EV method arranges values into bins such that: + * The negative side has the correct negative contribution to EV and the + positive side has the correct positive contribution to EV. + * Every negative bin has equal contribution to EV and every positive bin + has equal contribution to EV. + * If a side has nonzero probability mass, then it has at least one bin, + regardless of how small its mass. + * The number of negative and positive bins is chosen such that the + absolute contribution to EV for negative bins is as close as possible + to the absolute contribution to EV for positive bins given the above + constraints. + +This binning method means that the distribution EV is exactly preserved +and there is no bin that contains the value zero. However, the positive +and negative bins do not necessarily have equal contribution to EV, and +the magnitude of the error is at most ``1 / num_bins / 2``. diff --git a/doc/source/reference/modules.rst b/doc/source/reference/modules.rst new file mode 100644 index 0000000..57192bd --- /dev/null +++ b/doc/source/reference/modules.rst @@ -0,0 +1,7 @@ +squigglepy +========== + +.. 
toctree:: + :maxdepth: 4 + + squigglepy diff --git a/doc/source/reference/squigglepy.bayes.rst b/doc/source/reference/squigglepy.bayes.rst new file mode 100644 index 0000000..146fe05 --- /dev/null +++ b/doc/source/reference/squigglepy.bayes.rst @@ -0,0 +1,7 @@ +squigglepy.bayes module +======================= + +.. automodule:: squigglepy.bayes + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/reference/squigglepy.correlation.rst b/doc/source/reference/squigglepy.correlation.rst new file mode 100644 index 0000000..15304d7 --- /dev/null +++ b/doc/source/reference/squigglepy.correlation.rst @@ -0,0 +1,7 @@ +squigglepy.correlation module +============================= + +.. automodule:: squigglepy.correlation + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/reference/squigglepy.distributions.rst b/doc/source/reference/squigglepy.distributions.rst new file mode 100644 index 0000000..46cf84b --- /dev/null +++ b/doc/source/reference/squigglepy.distributions.rst @@ -0,0 +1,7 @@ +squigglepy.distributions module +=============================== + +.. automodule:: squigglepy.distributions + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/reference/squigglepy.numbers.rst b/doc/source/reference/squigglepy.numbers.rst new file mode 100644 index 0000000..9640c0f --- /dev/null +++ b/doc/source/reference/squigglepy.numbers.rst @@ -0,0 +1,7 @@ +squigglepy.numbers module +========================= + +.. automodule:: squigglepy.numbers + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/reference/squigglepy.numeric_distribution.rst b/doc/source/reference/squigglepy.numeric_distribution.rst new file mode 100644 index 0000000..2016c6e --- /dev/null +++ b/doc/source/reference/squigglepy.numeric_distribution.rst @@ -0,0 +1,7 @@ +squigglepy.numeric\_distribution module +======================================= + +.. 
automodule:: squigglepy.numeric_distribution + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/reference/squigglepy.rng.rst b/doc/source/reference/squigglepy.rng.rst new file mode 100644 index 0000000..052b5a9 --- /dev/null +++ b/doc/source/reference/squigglepy.rng.rst @@ -0,0 +1,7 @@ +squigglepy.rng module +===================== + +.. automodule:: squigglepy.rng + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/reference/squigglepy.rst b/doc/source/reference/squigglepy.rst new file mode 100644 index 0000000..134cc43 --- /dev/null +++ b/doc/source/reference/squigglepy.rst @@ -0,0 +1,23 @@ +squigglepy package +================== + +.. automodule:: squigglepy + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + + squigglepy.bayes + squigglepy.correlation + squigglepy.distributions + squigglepy.numbers + squigglepy.numeric_distribution + squigglepy.rng + squigglepy.samplers + squigglepy.utils + squigglepy.version + diff --git a/doc/source/reference/squigglepy.samplers.rst b/doc/source/reference/squigglepy.samplers.rst new file mode 100644 index 0000000..cbd3be1 --- /dev/null +++ b/doc/source/reference/squigglepy.samplers.rst @@ -0,0 +1,7 @@ +squigglepy.samplers module +========================== + +.. automodule:: squigglepy.samplers + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/reference/squigglepy.utils.rst b/doc/source/reference/squigglepy.utils.rst new file mode 100644 index 0000000..1638b74 --- /dev/null +++ b/doc/source/reference/squigglepy.utils.rst @@ -0,0 +1,7 @@ +squigglepy.utils module +======================= + +.. 
automodule:: squigglepy.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/reference/squigglepy.version.rst b/doc/source/reference/squigglepy.version.rst new file mode 100644 index 0000000..fe983ba --- /dev/null +++ b/doc/source/reference/squigglepy.version.rst @@ -0,0 +1,7 @@ +squigglepy.version module +========================= + +.. automodule:: squigglepy.version + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/usage.rst b/doc/source/usage.rst new file mode 100644 index 0000000..3e03a70 --- /dev/null +++ b/doc/source/usage.rst @@ -0,0 +1,476 @@ +Examples +======== + +Piano tuners example +~~~~~~~~~~~~~~~~~~~~ + +Here’s the Squigglepy implementation of `the example from Squiggle +Docs `__: + +.. code:: python + + import squigglepy as sq + import numpy as np + import matplotlib.pyplot as plt + from squigglepy.numbers import K, M + from pprint import pprint + + pop_of_ny_2022 = sq.to(8.1*M, 8.4*M) # This means that you're 90% confident the value is between 8.1 and 8.4 Million. + pct_of_pop_w_pianos = sq.to(0.2, 1) * 0.01 # We assume there are almost no people with multiple pianos + pianos_per_piano_tuner = sq.to(2*K, 50*K) + piano_tuners_per_piano = 1 / pianos_per_piano_tuner + total_tuners_in_2022 = pop_of_ny_2022 * pct_of_pop_w_pianos * piano_tuners_per_piano + samples = total_tuners_in_2022 @ 1000 # Note: `@ 1000` is shorthand to get 1000 samples + + # Get mean and SD + print('Mean: {}, SD: {}'.format(round(np.mean(samples), 2), + round(np.std(samples), 2))) + + # Get percentiles + pprint(sq.get_percentiles(samples, digits=0)) + + # Histogram + plt.hist(samples, bins=200) + plt.show() + + # Shorter histogram + total_tuners_in_2022.plot() + +And the version from the Squiggle doc that incorporates time: + +.. 
code:: python + + import squigglepy as sq + from squigglepy.numbers import K, M + + pop_of_ny_2022 = sq.to(8.1*M, 8.4*M) + pct_of_pop_w_pianos = sq.to(0.2, 1) * 0.01 + pianos_per_piano_tuner = sq.to(2*K, 50*K) + piano_tuners_per_piano = 1 / pianos_per_piano_tuner + + def pop_at_time(t): # t = Time in years after 2022 + avg_yearly_pct_change = sq.to(-0.01, 0.05) # We're expecting NYC to continuously grow with an mean of roughly between -1% and +4% per year + return pop_of_ny_2022 * ((avg_yearly_pct_change + 1) ** t) + + def total_tuners_at_time(t): + return pop_at_time(t) * pct_of_pop_w_pianos * piano_tuners_per_piano + + # Get total piano tuners at 2030 + sq.get_percentiles(total_tuners_at_time(2030-2022) @ 1000) + +**WARNING:** Be careful about dividing by ``K``, ``M``, etc. ``1/2*K`` = +500 in Python. Use ``1/(2*K)`` instead to get the expected outcome. + +**WARNING:** Be careful about using ``K`` to get sample counts. Use +``sq.norm(2, 3) @ (2*K)``\ … ``sq.norm(2, 3) @ 2*K`` will return only +two samples, multiplied by 1000. + +Distributions +~~~~~~~~~~~~~ + +.. 
code:: python + + import squigglepy as sq + + # Normal distribution + sq.norm(1, 3) # 90% interval from 1 to 3 + + # Distribution can be sampled with mean and sd too + sq.norm(mean=0, sd=1) + + # Shorthand to get one sample + ~sq.norm(1, 3) + + # Shorthand to get more than one sample + sq.norm(1, 3) @ 100 + + # Longhand version to get more than one sample + sq.sample(sq.norm(1, 3), n=100) + + # Nice progress reporter + sq.sample(sq.norm(1, 3), n=1000, verbose=True) + + # Other distributions exist + sq.lognorm(1, 10) + sq.tdist(1, 10, t=5) + sq.triangular(1, 2, 3) + sq.pert(1, 2, 3, lam=2) + sq.binomial(p=0.5, n=5) + sq.beta(a=1, b=2) + sq.bernoulli(p=0.5) + sq.poisson(10) + sq.chisquare(2) + sq.gamma(3, 2) + sq.pareto(1) + sq.exponential(scale=1) + sq.geometric(p=0.5) + + # Discrete sampling + sq.discrete({'A': 0.1, 'B': 0.9}) + + # Can return integers + sq.discrete({0: 0.1, 1: 0.3, 2: 0.3, 3: 0.15, 4: 0.15}) + + # Alternate format (also can be used to return more complex objects) + sq.discrete([[0.1, 0], + [0.3, 1], + [0.3, 2], + [0.15, 3], + [0.15, 4]]) + + sq.discrete([0, 1, 2]) # No weights assumes equal weights + + # You can mix distributions together + sq.mixture([sq.norm(1, 3), + sq.norm(4, 10), + sq.lognorm(1, 10)], # Distributions to mix + [0.3, 0.3, 0.4]) # These are the weights on each distribution + + # This is equivalent to the above, just a different way of doing the notation + sq.mixture([[0.3, sq.norm(1,3)], + [0.3, sq.norm(4,10)], + [0.4, sq.lognorm(1,10)]]) + + # Make a zero-inflated distribution + # 60% chance of returning 0, 40% chance of sampling from `norm(1, 2)`. + sq.zero_inflated(0.6, sq.norm(1, 2)) + +Additional features +~~~~~~~~~~~~~~~~~~~ + +.. 
code:: python + + import squigglepy as sq + + # You can add and subtract distributions + (sq.norm(1,3) + sq.norm(4,5)) @ 100 + (sq.norm(1,3) - sq.norm(4,5)) @ 100 + (sq.norm(1,3) * sq.norm(4,5)) @ 100 + (sq.norm(1,3) / sq.norm(4,5)) @ 100 + + # You can also do math with numbers + ~((sq.norm(sd=5) + 2) * 2) + ~(-sq.lognorm(0.1, 1) * sq.pareto(1) / 10) + + # You can change the CI from 90% (default) to 80% + sq.norm(1, 3, credibility=80) + + # You can clip + sq.norm(0, 3, lclip=0, rclip=5) # Sample norm with a 90% CI from 0-3, but anything lower than 0 gets clipped to 0 and anything higher than 5 gets clipped to 5. + + # You can also clip with a function, and use pipes + sq.norm(0, 3) >> sq.clip(0, 5) + + # You can correlate continuous distributions + a, b = sq.uniform(-1, 1), sq.to(0, 3) + a, b = sq.correlate((a, b), 0.5) # Correlate a and b with a correlation of 0.5 + # You can even pass your own correlation matrix! + a, b = sq.correlate((a, b), [[1, 0.5], [0.5, 1]]) + +Example: Rolling a die +^^^^^^^^^^^^^^^^^^^^^^ + +An example of how to use distributions to build tools: + +.. code:: python + + import squigglepy as sq + + def roll_die(sides, n=1): + return sq.discrete(list(range(1, sides + 1))) @ n if sides > 0 else None + + roll_die(sides=6, n=10) + # [2, 6, 5, 2, 6, 2, 3, 1, 5, 2] + +This is already included standard in the utils of this package. Use +``sq.roll_die``. + +Bayesian inference +~~~~~~~~~~~~~~~~~~ + +1% of women at age forty who participate in routine screening have +breast cancer. 80% of women with breast cancer will get positive +mammographies. 9.6% of women without breast cancer will also get +positive mammographies. + +A woman in this age group had a positive mammography in a routine +screening. What is the probability that she actually has breast cancer? + +We can approximate the answer with a Bayesian network (uses rejection +sampling): + +.. 
code:: python + + import squigglepy as sq + from squigglepy import bayes + from squigglepy.numbers import M + + def mammography(has_cancer): + return sq.event(0.8 if has_cancer else 0.096) + + def define_event(): + cancer = ~sq.bernoulli(0.01) + return({'mammography': mammography(cancer), + 'cancer': cancer}) + + bayes.bayesnet(define_event, + find=lambda e: e['cancer'], + conditional_on=lambda e: e['mammography'], + n=1*M) + # 0.07723995880535531 + +Or if we have the information immediately on hand, we can directly +calculate it. Though this doesn’t work for very complex stuff. + +.. code:: python + + from squigglepy import bayes + bayes.simple_bayes(prior=0.01, likelihood_h=0.8, likelihood_not_h=0.096) + # 0.07763975155279504 + +You can also make distributions and update them: + +.. code:: python + + import matplotlib.pyplot as plt + import squigglepy as sq + from squigglepy import bayes + from squigglepy.numbers import K + import numpy as np + + print('Prior') + prior = sq.norm(1,5) + prior_samples = prior @ (10*K) + plt.hist(prior_samples, bins = 200) + plt.show() + print(sq.get_percentiles(prior_samples)) + print('Prior Mean: {} SD: {}'.format(np.mean(prior_samples), np.std(prior_samples))) + print('-') + + print('Evidence') + evidence = sq.norm(2,3) + evidence_samples = evidence @ (10*K) + plt.hist(evidence_samples, bins = 200) + plt.show() + print(sq.get_percentiles(evidence_samples)) + print('Evidence Mean: {} SD: {}'.format(np.mean(evidence_samples), np.std(evidence_samples))) + print('-') + + print('Posterior') + posterior = bayes.update(prior, evidence) + posterior_samples = posterior @ (10*K) + plt.hist(posterior_samples, bins = 200) + plt.show() + print(sq.get_percentiles(posterior_samples)) + print('Posterior Mean: {} SD: {}'.format(np.mean(posterior_samples), np.std(posterior_samples))) + + print('Average') + average = bayes.average(prior, evidence) + average_samples = average @ (10*K) + plt.hist(average_samples, bins = 200) + plt.show() + 
print(sq.get_percentiles(average_samples)) + print('Average Mean: {} SD: {}'.format(np.mean(average_samples), np.std(average_samples))) + +Example: Alarm net +^^^^^^^^^^^^^^^^^^ + +This is the alarm network from `Bayesian Artificial Intelligence - +Section +2.5.1 `__: + + Assume your house has an alarm system against burglary. + + You live in the seismically active area and the alarm system can get + occasionally set off by an earthquake. + + You have two neighbors, Mary and John, who do not know each other. If + they hear the alarm they call you, but this is not guaranteed. + + The chance of a burglary on a particular day is 0.1%. The chance of + an earthquake on a particular day is 0.2%. + + The alarm will go off 95% of the time with both a burglary and an + earthquake, 94% of the time with just a burglary, 29% of the time + with just an earthquake, and 0.1% of the time with nothing (total + false alarm). + + John will call you 90% of the time when the alarm goes off. But on 5% + of the days, John will just call to say “hi”. Mary will call you 70% + of the time when the alarm goes off. But on 1% of the days, Mary will + just call to say “hi”. + +.. 
code:: python + + import squigglepy as sq + from squigglepy import bayes + from squigglepy.numbers import M + + def p_alarm_goes_off(burglary, earthquake): + if burglary and earthquake: + return 0.95 + elif burglary and not earthquake: + return 0.94 + elif not burglary and earthquake: + return 0.29 + elif not burglary and not earthquake: + return 0.001 + + def p_john_calls(alarm_goes_off): + return 0.9 if alarm_goes_off else 0.05 + + def p_mary_calls(alarm_goes_off): + return 0.7 if alarm_goes_off else 0.01 + + def define_event(): + burglary_happens = sq.event(p=0.001) + earthquake_happens = sq.event(p=0.002) + alarm_goes_off = sq.event(p_alarm_goes_off(burglary_happens, earthquake_happens)) + john_calls = sq.event(p_john_calls(alarm_goes_off)) + mary_calls = sq.event(p_mary_calls(alarm_goes_off)) + return {'burglary': burglary_happens, + 'earthquake': earthquake_happens, + 'alarm_goes_off': alarm_goes_off, + 'john_calls': john_calls, + 'mary_calls': mary_calls} + + # What are the chances that both John and Mary call if an earthquake happens? + bayes.bayesnet(define_event, + n=1*M, + find=lambda e: (e['mary_calls'] and e['john_calls']), + conditional_on=lambda e: e['earthquake']) + # Result will be ~0.19, though it varies because it is based on a random sample. + # This also may take a minute to run. + + # If both John and Mary call, what is the chance there's been a burglary? + bayes.bayesnet(define_event, + n=1*M, + find=lambda e: e['burglary'], + conditional_on=lambda e: (e['mary_calls'] and e['john_calls'])) + # Result will be ~0.27, though it varies because it is based on a random sample. + # This will run quickly because there is a built-in cache. + # Use `cache=False` to not build a cache and `reload_cache=True` to recalculate the cache. + +Note that the amount of Bayesian analysis that squigglepy can do is +pretty limited. For more complex bayesian analysis, consider +`sorobn `__, +`pomegranate `__, +`bnlearn `__, or +`pyMC `__. 
+ +Example: A demonstration of the Monty Hall Problem +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code:: python + + import squigglepy as sq + from squigglepy import bayes + from squigglepy.numbers import K, M, B, T + + + def monte_hall(door_picked, switch=False): + doors = ['A', 'B', 'C'] + car_is_behind_door = ~sq.discrete(doors) + reveal_door = ~sq.discrete([d for d in doors if d != door_picked and d != car_is_behind_door]) + + if switch: + old_door_picked = door_picked + door_picked = [d for d in doors if d != old_door_picked and d != reveal_door][0] + + won_car = (car_is_behind_door == door_picked) + return won_car + + + def define_event(): + door = ~sq.discrete(['A', 'B', 'C']) + switch = sq.event(0.5) + return {'won': monte_hall(door_picked=door, switch=switch), + 'switched': switch} + + RUNS = 10*K + r = bayes.bayesnet(define_event, + find=lambda e: e['won'], + conditional_on=lambda e: e['switched'], + verbose=True, + n=RUNS) + print('Win {}% of the time when switching'.format(int(r * 100))) + + r = bayes.bayesnet(define_event, + find=lambda e: e['won'], + conditional_on=lambda e: not e['switched'], + verbose=True, + n=RUNS) + print('Win {}% of the time when not switching'.format(int(r * 100))) + + # Win 66% of the time when switching + # Win 34% of the time when not switching + +Example: More complex coin/dice interactions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + Imagine that I flip a coin. If heads, I take a random die out of my + blue bag. If tails, I take a random die out of my red bag. The blue + bag contains only 6-sided dice. The red bag contains a 4-sided die, a + 6-sided die, a 10-sided die, and a 20-sided die. I then roll the + random die I took. What is the chance that I roll a 6? + +.. 
code:: python + + import squigglepy as sq + from squigglepy.numbers import K, M, B, T + from squigglepy import bayes + + def define_event(): + if sq.flip_coin() == 'heads': # Blue bag + return sq.roll_die(6) + else: # Red bag + return sq.discrete([4, 6, 10, 20]) >> sq.roll_die + + + bayes.bayesnet(define_event, + find=lambda e: e == 6, + verbose=True, + n=100*K) + # This run for me returned 0.12306 which is pretty close to the correct answer of 0.12292 + +Kelly betting +~~~~~~~~~~~~~ + +You can use probability generated, combine with a bankroll to determine +bet sizing using `Kelly +criterion `__. + +For example, if you want to Kelly bet and you’ve… + +- determined that your price (your probability of the event in question + happening / the market in question resolving in your favor) is $0.70 + (70%) +- see that the market is pricing at $0.65 +- you have a bankroll of $1000 that you are willing to bet + +You should bet as follows: + +.. code:: python + + import squigglepy as sq + kelly_data = sq.kelly(my_price=0.70, market_price=0.65, bankroll=1000) + kelly_data['kelly'] # What fraction of my bankroll should I bet on this? + # 0.143 + kelly_data['target'] # How much money should be invested in this? + # 142.86 + kelly_data['expected_roi'] # What is the expected ROI of this bet? + # 0.077 + +More examples +~~~~~~~~~~~~~ + +You can see more examples of squigglepy in action +`here `__. + +Run tests +--------- + +Use ``black .`` for formatting. + +Run +``ruff check . && pytest && pip3 install . && python3 tests/integration.py`` diff --git a/poetry.lock b/poetry.lock index 86c4e93..42f9236 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,30 @@ # This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. 
+[[package]] +name = "accessible-pygments" +version = "0.0.4" +description = "A collection of accessible pygments styles" +optional = false +python-versions = "*" +files = [ + {file = "accessible-pygments-0.0.4.tar.gz", hash = "sha256:e7b57a9b15958e9601c7e9eb07a440c813283545a20973f2574a5f453d0e953e"}, + {file = "accessible_pygments-0.0.4-py2.py3-none-any.whl", hash = "sha256:416c6d8c1ea1c5ad8701903a20fcedf953c6e720d64f33dc47bfb2d3f2fa4e8d"}, +] + +[package.dependencies] +pygments = ">=1.5" + +[[package]] +name = "alabaster" +version = "0.7.13" +description = "A configurable sidebar-enabled Sphinx theme" +optional = false +python-versions = ">=3.6" +files = [ + {file = "alabaster-0.7.13-py3-none-any.whl", hash = "sha256:1ee19aca801bbabb5ba3f5f258e4422dfa86f82f3e9cefb0859b283cdd7f62a3"}, + {file = "alabaster-0.7.13.tar.gz", hash = "sha256:a27a4a084d5e690e16e01e03ad2b2e552c61a65469419b907243193de1a84ae2"}, +] + [[package]] name = "ansi2html" version = "1.8.0" @@ -33,6 +58,38 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib- tests = ["attrs[tests-no-zope]", "zope-interface"] tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +[[package]] +name = "babel" +version = "2.13.1" +description = "Internationalization utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "Babel-2.13.1-py3-none-any.whl", hash = "sha256:7077a4984b02b6727ac10f1f7294484f737443d7e2e66c5e4380e41a3ae0b4ed"}, + {file = "Babel-2.13.1.tar.gz", hash = "sha256:33e0952d7dd6374af8dbf6768cc4ddf3ccfefc244f9986d4074704f2fbd18900"}, +] + +[package.extras] +dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] + +[[package]] +name = "beautifulsoup4" +version = "4.12.2" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = 
"sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"}, + {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "black" version = "23.3.0" @@ -438,6 +495,17 @@ files = [ [package.extras] graph = ["objgraph (>=1.7.2)"] +[[package]] +name = "docutils" +version = "0.20.1" +description = "Docutils -- Python Documentation Utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6"}, + {file = "docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b"}, +] + [[package]] name = "exceptiongroup" version = "1.1.1" @@ -602,6 +670,17 @@ files = [ {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, ] +[[package]] +name = "imagesize" +version = "1.4.1" +description = "Getting image size from png/jpeg/jpeg2000/gif file" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b"}, + {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, +] + [[package]] name = "importlib-metadata" version = "6.7.0" @@ -1369,6 +1448,33 @@ files = [ [package.extras] test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +[[package]] +name = "pydata-sphinx-theme" +version = "0.14.3" +description = "Bootstrap-based Sphinx theme from the PyData community" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pydata_sphinx_theme-0.14.3-py3-none-any.whl", hash = 
"sha256:b7e40cd75a20449adfe2d7525be379b9fe92f6d31e5233e449fa34ddcd4398d9"}, + {file = "pydata_sphinx_theme-0.14.3.tar.gz", hash = "sha256:bd474f347895f3fc5b6ce87390af64330ee54f11ebf9660d5bc3f87d532d4e5c"}, +] + +[package.dependencies] +accessible-pygments = "*" +Babel = "*" +beautifulsoup4 = "*" +docutils = "!=0.17.0" +packaging = "*" +pygments = ">=2.7" +sphinx = ">=5.0" +typing-extensions = "*" + +[package.extras] +a11y = ["pytest-playwright"] +dev = ["nox", "pre-commit", "pydata-sphinx-theme[doc,test]", "pyyaml"] +doc = ["ablog (>=0.11.0rc2)", "colorama", "ipykernel", "ipyleaflet", "jupyter_sphinx", "jupyterlite-sphinx", "linkify-it-py", "matplotlib", "myst-parser", "nbsphinx", "numpy", "numpydoc", "pandas", "plotly", "rich", "sphinx-autoapi (>=3.0.0)", "sphinx-copybutton", "sphinx-design", "sphinx-favicon (>=1.0.1)", "sphinx-sitemap", "sphinx-togglebutton", "sphinxcontrib-youtube (<1.4)", "sphinxext-rediraffe", "xarray"] +test = ["pytest", "pytest-cov", "pytest-regressions"] + [[package]] name = "pygments" version = "2.15.1" @@ -1659,6 +1765,17 @@ files = [ {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] +[[package]] +name = "snowballstemmer" +version = "2.2.0" +description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." 
+optional = false +python-versions = "*" +files = [ + {file = "snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"}, + {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -1670,6 +1787,156 @@ files = [ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] +[[package]] +name = "soupsieve" +version = "2.5" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, + {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, +] + +[[package]] +name = "sphinx" +version = "7.2.6" +description = "Python documentation generator" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinx-7.2.6-py3-none-any.whl", hash = "sha256:1e09160a40b956dc623c910118fa636da93bd3ca0b9876a7b3df90f07d691560"}, + {file = "sphinx-7.2.6.tar.gz", hash = "sha256:9a5160e1ea90688d5963ba09a2dcd8bdd526620edbb65c328728f1b2228d5ab5"}, +] + +[package.dependencies] +alabaster = ">=0.7,<0.8" +babel = ">=2.9" +colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} +docutils = ">=0.18.1,<0.21" +imagesize = ">=1.3" +importlib-metadata = {version = ">=4.8", markers = "python_version < \"3.10\""} +Jinja2 = ">=3.0" +packaging = ">=21.0" +Pygments = ">=2.14" +requests = ">=2.25.0" +snowballstemmer = ">=2.0" +sphinxcontrib-applehelp = "*" +sphinxcontrib-devhelp = "*" +sphinxcontrib-htmlhelp = ">=2.0.0" +sphinxcontrib-jsmath = "*" +sphinxcontrib-qthelp = "*" +sphinxcontrib-serializinghtml = ">=1.1.9" + +[package.extras] +docs = ["sphinxcontrib-websupport"] 
+lint = ["docutils-stubs", "flake8 (>=3.5.0)", "flake8-simplify", "isort", "mypy (>=0.990)", "ruff", "sphinx-lint", "types-requests"] +test = ["cython (>=3.0)", "filelock", "html5lib", "pytest (>=4.6)", "setuptools (>=67.0)"] + +[[package]] +name = "sphinxcontrib-applehelp" +version = "1.0.7" +description = "sphinxcontrib-applehelp is a Sphinx extension which outputs Apple help books" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_applehelp-1.0.7-py3-none-any.whl", hash = "sha256:094c4d56209d1734e7d252f6e0b3ccc090bd52ee56807a5d9315b19c122ab15d"}, + {file = "sphinxcontrib_applehelp-1.0.7.tar.gz", hash = "sha256:39fdc8d762d33b01a7d8f026a3b7d71563ea3b72787d5f00ad8465bd9d6dfbfa"}, +] + +[package.dependencies] +Sphinx = ">=5" + +[package.extras] +lint = ["docutils-stubs", "flake8", "mypy"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-devhelp" +version = "1.0.5" +description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp documents" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_devhelp-1.0.5-py3-none-any.whl", hash = "sha256:fe8009aed765188f08fcaadbb3ea0d90ce8ae2d76710b7e29ea7d047177dae2f"}, + {file = "sphinxcontrib_devhelp-1.0.5.tar.gz", hash = "sha256:63b41e0d38207ca40ebbeabcf4d8e51f76c03e78cd61abe118cf4435c73d4212"}, +] + +[package.dependencies] +Sphinx = ">=5" + +[package.extras] +lint = ["docutils-stubs", "flake8", "mypy"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-htmlhelp" +version = "2.0.4" +description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_htmlhelp-2.0.4-py3-none-any.whl", hash = "sha256:8001661c077a73c29beaf4a79968d0726103c5605e27db92b9ebed8bab1359e9"}, + {file = "sphinxcontrib_htmlhelp-2.0.4.tar.gz", hash = "sha256:6c26a118a05b76000738429b724a0568dbde5b72391a688577da08f11891092a"}, +] + +[package.dependencies] +Sphinx = ">=5" 
+ +[package.extras] +lint = ["docutils-stubs", "flake8", "mypy"] +test = ["html5lib", "pytest"] + +[[package]] +name = "sphinxcontrib-jsmath" +version = "1.0.1" +description = "A sphinx extension which renders display math in HTML via JavaScript" +optional = false +python-versions = ">=3.5" +files = [ + {file = "sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"}, + {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"}, +] + +[package.extras] +test = ["flake8", "mypy", "pytest"] + +[[package]] +name = "sphinxcontrib-qthelp" +version = "1.0.6" +description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp documents" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_qthelp-1.0.6-py3-none-any.whl", hash = "sha256:bf76886ee7470b934e363da7a954ea2825650013d367728588732c7350f49ea4"}, + {file = "sphinxcontrib_qthelp-1.0.6.tar.gz", hash = "sha256:62b9d1a186ab7f5ee3356d906f648cacb7a6bdb94d201ee7adf26db55092982d"}, +] + +[package.dependencies] +Sphinx = ">=5" + +[package.extras] +lint = ["docutils-stubs", "flake8", "mypy"] +test = ["pytest"] + +[[package]] +name = "sphinxcontrib-serializinghtml" +version = "1.1.9" +description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)" +optional = false +python-versions = ">=3.9" +files = [ + {file = "sphinxcontrib_serializinghtml-1.1.9-py3-none-any.whl", hash = "sha256:9b36e503703ff04f20e9675771df105e58aa029cfcbc23b8ed716019b7416ae1"}, + {file = "sphinxcontrib_serializinghtml-1.1.9.tar.gz", hash = "sha256:0c64ff898339e1fac29abd2bf5f11078f3ec413cfe9c046d3120d7ca65530b54"}, +] + +[package.dependencies] +Sphinx = ">=5" + +[package.extras] +lint = ["docutils-stubs", "flake8", "mypy"] +test = ["pytest"] + [[package]] name = "tenacity" version = "8.2.2" @@ -1809,4 +2076,4 @@ plots 
= ["matplotlib"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.12" -content-hash = "5625f6f3eec6de2121f93d9cb0591c23402a9c232f824ce8353c50b7441e3a9d" +content-hash = "f9f1beabf7d339e2173f854e8559f6bd9f1fbb28d711802c3d0d831a7b1fa2b5" diff --git a/pyproject.toml b/pyproject.toml index d4e93d2..b35de76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ pathos = "^0.3.0" msgspec = "^0.15.1" matplotlib = { version = "^3.7.1", optional = true } pandas = { version = "^2.0.2", optional = true } +pydata-sphinx-theme = "^0.14.3" [tool.poetry.group.dev.dependencies] diff --git a/squigglepy/__init__.py b/squigglepy/__init__.py index 9ac7350..4a61a20 100644 --- a/squigglepy/__init__.py +++ b/squigglepy/__init__.py @@ -1,5 +1,6 @@ from .distributions import * # noqa ignore=F405 from .numbers import * # noqa ignore=F405 +from .numeric_distribution import * # noqa ignore=F405 from .samplers import * # noqa ignore=F405 from .utils import * # noqa ignore=F405 from .rng import * # noqa ignore=F405 diff --git a/squigglepy/bayes.py b/squigglepy/bayes.py index 415f29a..e3fec17 100644 --- a/squigglepy/bayes.py +++ b/squigglepy/bayes.py @@ -1,3 +1,7 @@ +""" +This modules includes functions for Bayesian inference. 
+""" + import os import time import math diff --git a/squigglepy/correlation.py b/squigglepy/correlation.py index abd20b2..0108940 100644 --- a/squigglepy/correlation.py +++ b/squigglepy/correlation.py @@ -107,9 +107,10 @@ def correlate( >>> solar_radiation, temperature = sq.gamma(300, 100), sq.to(22, 28) >>> solar_radiation, temperature = sq.correlate((solar_radiation, temperature), 0.7) >>> print(np.corrcoef(solar_radiation @ 1000, temperature @ 1000)[0, 1]) - 0.6975960649767123 + 0.6975960649767123 Or you could pass a correlation matrix: + >>> funding_gap, cost_per_delivery, effect_size = ( sq.to(20_000, 80_000), sq.to(30, 80), sq.beta(2, 5) ) @@ -118,9 +119,9 @@ def correlate( [[1, 0.6, -0.5], [0.6, 1, -0.2], [-0.5, -0.2, 1]] ) >>> print(np.corrcoef(funding_gap @ 1000, cost_per_delivery @ 1000, effect_size @ 1000)) - array([[ 1. , 0.580520 , -0.480149], - [ 0.580962, 1. , -0.187831], - [-0.480149, -0.187831 , 1. ]]) + array([[ 1. , 0.580520 , -0.480149], + [ 0.580962, 1. , -0.187831], + [-0.480149, -0.187831 , 1. ]]) """ if not isinstance(variables, tuple): diff --git a/squigglepy/distributions.py b/squigglepy/distributions.py index 413e749..67b2082 100644 --- a/squigglepy/distributions.py +++ b/squigglepy/distributions.py @@ -1,11 +1,24 @@ -import operator +""" +A collection of probability distributions and functions to operate on them. 
+""" + import math import numpy as np +from numpy import exp, log, pi, sqrt +import operator import scipy.stats +from scipy import special +from scipy.special import erfc, erfinv +import warnings from typing import Optional, Union -from .utils import _process_weights_values, _is_numpy, is_dist, _round +from .utils import ( + _process_weights_values, + _is_numpy, + is_dist, + _round, +) from .version import __version__ from .correlation import CorrelationGroup @@ -58,6 +71,64 @@ def __repr__(self): return self.__str__() + f" (version {self._version})" +class IntegrableEVDistribution(ABC): + """ + A base class for distributions that can be integrated to find the + contribution to expected value. + """ + + @abstractmethod + def contribution_to_ev(self, x: Union[np.ndarray, float], normalized: bool = True): + """Find the fraction of this distribution's absolute expected value + given by the portion of the distribution that lies to the left of x. + For a distribution with support on [a, b], ``contribution_to_ev(a) = + 0`` and ``contribution_to_ev(b) = 1``. + + ``contribution_to_ev(x, normalized=False)`` is defined as + + .. math:: \\int_a^x |t| f(t) dt + + where `f(t)` is the PDF of the normal distribution. ``normalized=True`` + divides this result by ``contribution_to_ev(b, normalized=False)``. + + Note that this is different from the partial expected value, which is + defined as + + .. math:: \\int_{x}^\\infty t f_X(t | X > x) dt + + This function does not respect lclip/rclip. + + Parameters + ---------- + x : array-like + The value(s) to find the contribution to expected value for. + normalized : bool (default=True) + If True, normalize the result such that the return value is a + fraction (between 0 and 1). If False, return the raw integral + value. 
+ + """ + # TODO: can compute this via numeric integration for any scipy + # distribution using something like + # + # scipy_dist.expect(lambda x: abs(x), ub=x, loc=loc, scale=scale) + # + # This is equivalent to contribution_to_ev(x, normalized=False). + # I tested that this works correctly for normal and lognormal dists. + ... + + @abstractmethod + def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]): + """For a given fraction of expected value, find the number such that + that fraction lies to the left of that number. The inverse of + :func:`contribution_to_ev`. + + This function is analogous to scipy.stats.ppf except that + the integrand is :math:`|x| f(x) dx` instead of :math:`f(x) dx`. + """ + ... + + class OperableDistribution(BaseDistribution): def __init__(self): super().__init__() @@ -652,7 +723,7 @@ def const(x): return ConstantDistribution(x) -class UniformDistribution(ContinuousDistribution): +class UniformDistribution(ContinuousDistribution, IntegrableEVDistribution): def __init__(self, x, y): super().__init__() self.x = x @@ -662,6 +733,40 @@ def __init__(self, x, y): def __str__(self): return " uniform({}, {})".format(self.x, self.y) + def contribution_to_ev(self, x: Union[np.ndarray, float], normalized=True): + x = np.asarray(x) + a = self.x + b = self.y + + x = np.where(x < a, a, x) + x = np.where(x > b, b, x) + + fraction = np.squeeze((x**2 * np.sign(x) - a**2 * np.sign(a)) / (2 * (b - a))) + if not normalized: + return fraction + normalizer = self.contribution_to_ev(b, normalized=False) + return fraction / normalizer + + def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]): + if isinstance(fraction, float) or isinstance(fraction, int): + fraction = np.array([fraction]) + + if any(fraction < 0) or any(fraction > 1): + raise ValueError(f"fraction must be >= 0 and <= 1, not {fraction}") + + a = self.x + b = self.y + + pos_sol = np.sqrt( + np.abs((1 - fraction) * a**2 * np.sign(a) + fraction * b**2 * np.sign(b)) + ) + 
neg_sol = -pos_sol + + if fraction < self.contribution_to_ev(0): + return neg_sol + else: + return pos_sol + def uniform(x, y): """ @@ -686,8 +791,17 @@ def uniform(x, y): return UniformDistribution(x=x, y=y) -class NormalDistribution(ContinuousDistribution): - def __init__(self, x=None, y=None, mean=None, sd=None, credibility=90, lclip=None, rclip=None): +class NormalDistribution(ContinuousDistribution, IntegrableEVDistribution): + def __init__( + self, + x=None, + y=None, + mean=None, + sd=None, + credibility=90, + lclip=None, + rclip=None, + ): super().__init__() self.x = x self.y = y @@ -722,6 +836,119 @@ def __str__(self): out += ")" return out + def contribution_to_ev(self, x: Union[np.ndarray, float], normalized=True): + x = np.asarray(x) + mu = self.mean + sigma = self.sd + sigma_scalar = sigma / sqrt(2 * pi) + + # The definite integral from the formula for EV, evaluated from -inf to + # x. Evaluating from -inf to +inf would give the EV. This number alone + # doesn't tell us the contribution to EV because it is negative for x < + # 0. Note: This computes ``erf((x - mu) / (sigma * sqrt(2))) - + # erf(-inf)`` as ``erfc(-(x - mu) / (sigma * sqrt(2)))`` to avoid + # floating point rounding issues. + normal_integral = 0.5 * mu * erfc((mu - x) / (sigma * sqrt(2))) - sigma_scalar * ( + exp(-((x - mu) ** 2) / sigma**2 / 2) + ) + + # The absolute value of the integral from -infinity to 0. When + # evaluating the formula for normal dist EV, all the values up to zero + # contribute negatively to EV, so we flip the sign on these. + zero_term = -( + 0.5 * mu * erfc(mu / (sigma * sqrt(2))) + - sigma_scalar * (exp(-((-mu) ** 2) / sigma**2 / 2)) + ) + + # Fix floating point rounding issue where if x is very small, these + # values can end up on the wrong side of zero. 
This happens when mu is + # big and x is far out on the tail, so erf_term(x) is close enough to + # 0.5 * mu that the difference cannot be faithfully represented as a + # float, and it ends up exaggerating the difference. + + # When x >= 0, add zero_term to get contribution_to_ev(0) up to 0. Then + # add zero_term again because that's how much of the contribution to EV + # we already integrated. When x < 0, we don't need to adjust + # normal_integral for the negative left values, but normal_integral is + # negative so we flip the sign. + contribution = np.where(x >= 0, 2 * zero_term + normal_integral, -normal_integral) + + # Normalize by the total integral over abs(x) * PDF(x). Note: We cannot + # use scipy.stats.foldnorm because its scale parameter is not the same + # thing as sigma, and I don't know how to translate. + abs_mean = mu + 2 * zero_term + return np.squeeze(contribution) / (abs_mean if normalized else 1) + + def _derivative_contribution_to_ev(self, x: np.ndarray): + mu = self.mean + sigma = self.sd + deriv = x * exp(-((mu - abs(x)) ** 2) / (2 * sigma**2)) / (sigma * sqrt(2 * pi)) + return deriv + + def inv_contribution_to_ev( + self, fraction: Union[np.ndarray, float], full_output: bool = False + ): + if isinstance(fraction, float) or isinstance(fraction, int): + fraction = np.array([fraction]) + mu = self.mean + sigma = self.sd + tolerance = 1e-8 + + if any(fraction < 0) or any(fraction > 1): + raise ValueError(f"fraction must be >= 0 and <= 1, not {fraction}") + + # Approximate using Newton's method. Sometimes this has trouble + # converging b/c it diverges or gets caught in a cycle, so use binary + # search as a fallback. + guess = np.full_like(fraction, mu) + max_iter = 10 + newton_iter = 0 + binary_iter = 0 + converged = False + + # Catch warnings because Newton's method often causes divisions by + # zero. If that does happen, we will just fall back to binary search. 
+ with warnings.catch_warnings(): + warnings.simplefilter("ignore") + for newton_iter in range(max_iter): + root = self.contribution_to_ev(guess) - fraction + if all(abs(root) < tolerance): + converged = True + break + deriv = self._derivative_contribution_to_ev(guess) + if all(deriv == 0): + break + guess = np.where(abs(deriv) == 0, guess, guess - root / deriv) + + if not converged: + # Approximate using binary search (RIP) + lower = np.full_like(fraction, scipy.stats.norm.ppf(1e-10, mu, scale=sigma)) + upper = np.full_like(fraction, scipy.stats.norm.ppf(1 - 1e-10, mu, scale=sigma)) + guess = scipy.stats.norm.ppf(fraction, mu, scale=sigma) + max_iter = 50 + for binary_iter in range(max_iter): + y = self.contribution_to_ev(guess) + diff = y - fraction + if all(abs(diff) < tolerance): + converged = True + break + lower = np.where(diff < 0, guess, lower) + upper = np.where(diff > 0, guess, upper) + guess = (lower + upper) / 2 + + if full_output: + return ( + np.squeeze(guess), + { + "success": converged, + "newton_iterations": newton_iter, + "binary_search_iterations": binary_iter, + "used_binary_search": binary_iter > 0, + }, + ) + else: + return np.squeeze(guess) + def norm( x=None, y=None, credibility=90, mean=None, sd=None, lclip=None, rclip=None @@ -762,11 +989,17 @@ def norm( norm(mean=1, sd=2) """ return NormalDistribution( - x=x, y=y, credibility=credibility, mean=mean, sd=sd, lclip=lclip, rclip=rclip + x=x, + y=y, + credibility=credibility, + mean=mean, + sd=sd, + lclip=lclip, + rclip=rclip, ) -class LognormalDistribution(ContinuousDistribution): +class LognormalDistribution(ContinuousDistribution, IntegrableEVDistribution): def __init__( self, x=None, @@ -817,21 +1050,24 @@ def __init__( self.lognorm_mean = 1 if self.x is not None: - self.norm_mean = (np.log(self.x) + np.log(self.y)) / 2 + self.norm_mean = (log(self.x) + log(self.y)) / 2 cdf_value = 0.5 + 0.5 * (self.credibility / 100) normed_sigma = scipy.stats.norm.ppf(cdf_value) - self.norm_sd = 
(np.log(self.y) - self.norm_mean) / normed_sigma + self.norm_sd = (log(self.y) - self.norm_mean) / normed_sigma if self.lognorm_sd is None: - self.lognorm_mean = np.exp(self.norm_mean + self.norm_sd**2 / 2) + self.lognorm_mean = exp(self.norm_mean + self.norm_sd**2 / 2) self.lognorm_sd = ( - (np.exp(self.norm_sd**2) - 1) * np.exp(2 * self.norm_mean + self.norm_sd**2) + (exp(self.norm_sd**2) - 1) * exp(2 * self.norm_mean + self.norm_sd**2) ) ** 0.5 elif self.norm_sd is None: - self.norm_mean = np.log( - (self.lognorm_mean**2 / np.sqrt(self.lognorm_sd**2 + self.lognorm_mean**2)) + self.norm_mean = log( + (self.lognorm_mean**2 / sqrt(self.lognorm_sd**2 + self.lognorm_mean**2)) ) - self.norm_sd = np.sqrt(np.log(1 + self.lognorm_sd**2 / self.lognorm_mean**2)) + self.norm_sd = sqrt(log(1 + self.lognorm_sd**2 / self.lognorm_mean**2)) + + # Cached value for calculating ``contribution_to_ev`` + self._EV_DENOM = sqrt(2) * self.norm_sd def __str__(self): out = " lognorm(lognorm_mean={}, lognorm_sd={}, norm_mean={}, norm_sd={}" @@ -848,6 +1084,39 @@ def __str__(self): out += ")" return out + def contribution_to_ev(self, x, normalized=True): + x = np.asarray(x) + mu = self.norm_mean + sigma = self.norm_sd + + with np.errstate(divide="ignore"): + # Note: erf can have floating point rounding errors when x is very + # small because erf(inf) = 1. erfc is better because erfc(inf) = + # 0 so floats can represent the result with more significant digits. + inner = -erfc((-log(x) + mu + sigma**2) / self._EV_DENOM) + + return -0.5 * np.squeeze(inner) * (1 if normalized else self.lognorm_mean) + + def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]): + """For a given fraction of expected value, find the number such that + that fraction lies to the left of that number. The inverse of + `contribution_to_ev`. + + This function is analogous to `lognorm.ppf` except that + the integrand is `x * f(x) dx` instead of `f(x) dx`. 
+ """ + if isinstance(fraction, float): + fraction = np.array([fraction]) + if any(fraction < 0) or any(fraction > 1): + raise ValueError(f"fraction must be >= 0 and <= 1, not {fraction}") + + mu = self.norm_mean + sigma = self.norm_sd + y = fraction * self.lognorm_mean + return np.squeeze( + exp(mu + sigma**2 - sqrt(2) * sigma * erfinv(1 - 2 * exp(-mu - sigma**2 / 2) * y)) + ) + def lognorm( x=None, @@ -993,15 +1262,35 @@ def binomial(n, p): return BinomialDistribution(n=n, p=p) -class BetaDistribution(ContinuousDistribution): +class BetaDistribution(ContinuousDistribution, IntegrableEVDistribution): def __init__(self, a, b): super().__init__() self.a = a self.b = b + self.mean = a / (a + b) def __str__(self): return " beta(a={}, b={})".format(self.a, self.b) + def contribution_to_ev(self, x: Union[np.ndarray, float], normalized=True): + x = np.asarray(x) + a = self.a + b = self.b + + res = special.betainc(a + 1, b, x) * special.beta(a + 1, b) / special.beta(a, b) + return np.squeeze(res) / (self.mean if normalized else 1) + + def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]): + if isinstance(fraction, float) or isinstance(fraction, int): + fraction = np.array([fraction]) + if any(fraction < 0) or any(fraction > 1): + raise ValueError(f"fraction must be >= 0 and <= 1, not {fraction}") + + a = self.a + b = self.b + y = fraction * self.mean * special.beta(a, b) / special.beta(a + 1, b) + return np.squeeze(special.betaincinv(a + 1, b, y)) + def beta(a, b): """ @@ -1476,13 +1765,14 @@ def exponential(scale, lclip=None, rclip=None): return ExponentialDistribution(scale=scale, lclip=lclip, rclip=rclip) -class GammaDistribution(ContinuousDistribution): +class GammaDistribution(ContinuousDistribution, IntegrableEVDistribution): def __init__(self, shape, scale=1, lclip=None, rclip=None): super().__init__() self.shape = shape self.scale = scale self.lclip = lclip self.rclip = rclip + self.mean = shape * scale def __str__(self): out = " gamma(shape={}, 
class ParetoDistribution(ContinuousDistribution, IntegrableEVDistribution):
    """Standard Pareto distribution with minimum value 1 and the given
    ``shape`` parameter."""

    def __init__(self, shape):
        super().__init__()
        self.shape = shape
        # Mean of a standard (x_m = 1) Pareto; infinite when shape <= 1.
        self.mean = np.inf if shape <= 1 else shape / (shape - 1)

    def __str__(self):
        return " pareto({})".format(self.shape)

    def contribution_to_ev(self, x: Union[np.ndarray, float], normalized: bool = True):
        """Contribution to expected value from the region left of ``x``: the
        integral of ``t * pdf(t)`` from the lower bound (1) up to ``x``. When
        ``normalized`` is True, the result is a fraction of the total
        expected value.
        """
        x = np.asarray(x)
        a = self.shape
        # The support starts at 1, so everything at or below 1 contributes 0.
        # Bug fix: np.where evaluates both branches eagerly, so x == 0 used to
        # emit a spurious "divide by zero" RuntimeWarning from 0 ** (1 - a).
        # Suppress it -- the x <= 1 branch discards that value anyway.
        with np.errstate(divide="ignore"):
            res = np.where(x <= 1, 0, a / (a - 1) * (1 - x ** (1 - a)))
        # NOTE(review): a == 1 still raises ZeroDivisionError in a / (a - 1);
        # pre-existing behavior, left unchanged (the mean is infinite there).
        return np.squeeze(res) / (self.mean if normalized else 1)

    def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]):
        """Inverse of :any:`contribution_to_ev` for fractions in [0, 1).

        Raises
        ------
        ValueError
            If any ``fraction`` is outside [0, 1).
        """
        if isinstance(fraction, float) or isinstance(fraction, int):
            fraction = np.array([fraction])
        if any(fraction < 0) or any(fraction >= 1):
            raise ValueError(f"fraction must be >= 0 and < 1, not {fraction}")

        a = self.shape
        # Solve 1 - x**(1 - a) = fraction for x.
        x = (1 - fraction) ** (1 / (1 - a))
        return np.squeeze(x)
ParetoDistribution(shape=shape) class MixtureDistribution(CompositeDistribution): - def __init__(self, dists, weights=None, relative_weights=None, lclip=None, rclip=None): + def __init__( + self, + dists, + weights=None, + relative_weights=None, + lclip=None, + rclip=None, + ): super().__init__() weights, dists = _process_weights_values(weights, relative_weights, dists) self.dists = dists diff --git a/squigglepy/numeric_distribution.py b/squigglepy/numeric_distribution.py new file mode 100644 index 0000000..4e2a4d3 --- /dev/null +++ b/squigglepy/numeric_distribution.py @@ -0,0 +1,2394 @@ +from abc import ABC, abstractmethod +from enum import Enum +from numbers import Real +import numpy as np +from scipy import stats +from scipy.interpolate import CubicSpline, PchipInterpolator +from typing import Callable, List, Optional, Tuple, Union +import warnings + +from .distributions import ( + BaseDistribution, + BernoulliDistribution, + BetaDistribution, + ChiSquareDistribution, + ComplexDistribution, + ConstantDistribution, + ExponentialDistribution, + GammaDistribution, + LognormalDistribution, + MixtureDistribution, + NormalDistribution, + ParetoDistribution, + PERTDistribution, + UniformDistribution, +) +from .version import __version__ + + +class BinSizing(str, Enum): + """An enum for the different methods of sizing histogram bins. A histogram + with finitely many bins can only contain so much information about the + shape of a distribution; the choice of bin sizing determines what + information to prioritize. + """ + + uniform = "uniform" + """Divides the distribution into bins of equal width. For distributions + with infinite support (such as normal distributions), it chooses a total + width to roughly minimize total error, considering both intra-bin error and + error due to the excluded tails. 
+ """ + + log_uniform = "log-uniform" + """Divides the distribution into bins with exponentially increasing width, + so that the logarithms of the bin edges are uniformly spaced. + + Log-uniform bin sizing is to uniform bin sizing as a log-normal + distribution is to a normal distribution. That is, if you generated a + NumericDistribution from a log-normal distribution with log-uniform bin + sizing, and then took the log of each bin, you'd get a normal distribution + with uniform bin sizing. + """ + + ev = "ev" + """Divides the distribution into bins such that each bin has equal + contribution to expected value (see + :any:`IntegrableEVDistribution.contribution_to_ev`). + """ + + mass = "mass" + """Divides the distribution into bins such that each bin has equal + probability mass. This method is generally not recommended because it puts + too much probability mass near the center of the distribution, where + precision is the least useful. + """ + + fat_hybrid = "fat-hybrid" + """A hybrid method designed for fat-tailed distributions. Uses mass bin + sizing close to the center and log-uniform bin siding on the right tail. + Empirically, this combination provides a good balance for the accuracy of + fat-tailed distributions at the center and at the tails. + """ + + bin_count = "bin-count" + """Shortens a vector of bins by merging every ``len(vec)/num_bins`` bins + together. Can only be used for resizing existing NumericDistributions, not + for initializing new ones. + """ + + +def _bump_indexes(indexes, length): + """Given an ordered list of indexes, ensure that every index is unique. If + any index is not unique, increment or decrement it until it's unique. + + This function does not guarantee that every index gets moved to the closest + unique index, but it does guarantee that every index gets moved to the + closest unique index in a certain direction. 
+ """ + for i in range(1, len(indexes)): + if indexes[i] <= indexes[i - 1]: + indexes[i] = min(length - 1, indexes[i - 1] + 1) + + for i in reversed(range(len(indexes) - 1)): + if indexes[i] >= indexes[i + 1]: + indexes[i] = max(0, indexes[i + 1] - 1) + + return indexes + + +def _support_for_bin_sizing(dist, bin_sizing, num_bins): + """Return where to set the bounds for a bin sizing method with fixed + bounds, or None if the given dist/bin sizing does not require finite + bounds. + """ + # For norm/lognorm, wider domain increases error within each bin, and + # narrower domain increases error at the tails. Inter-bin error is + # proportional to width^3 / num_bins^2 and tail error is proportional to + # something like exp(-width^2). Setting width using the formula below + # balances these two sources of error. ``scale`` has an upper bound because + # excessively large values result in floating point rounding errors. + if isinstance(dist, NormalDistribution) and bin_sizing == BinSizing.uniform: + scale = max(6.5, 4.5 + np.log(num_bins) ** 0.5) + return (dist.mean - scale * dist.sd, dist.mean + scale * dist.sd) + if isinstance(dist, LognormalDistribution) and bin_sizing == BinSizing.log_uniform: + scale = max(6.5, 4.5 + np.log(num_bins) ** 0.5) + return np.exp( + (dist.norm_mean - scale * dist.norm_sd, dist.norm_mean + scale * dist.norm_sd) + ) + + # Uniform bin sizing is not gonna be very accurate for a lognormal + # distribution no matter how you set the bounds. + if isinstance(dist, LognormalDistribution) and bin_sizing == BinSizing.uniform: + scale = 6 + return np.exp( + (dist.norm_mean - scale * dist.norm_sd, dist.norm_mean + scale * dist.norm_sd) + ) + + # Compute the upper bound numerically because there is no good closed-form + # expression (that I could find) that reliably captures almost all of the + # mass without making the bins overly wide. 
# Keys also double as the set of analytic distribution types that
# ``NumericDistribution.from_distribution`` accepts directly (it rejects any
# ``type(dist)`` not present here).
DEFAULT_BIN_SIZING = {
    BetaDistribution: BinSizing.mass,
    ChiSquareDistribution: BinSizing.ev,
    ExponentialDistribution: BinSizing.ev,
    GammaDistribution: BinSizing.ev,
    LognormalDistribution: BinSizing.ev,
    NormalDistribution: BinSizing.ev,
    ParetoDistribution: BinSizing.ev,
    PERTDistribution: BinSizing.mass,
    UniformDistribution: BinSizing.uniform,
}
"""
Default bin sizing method for each distribution type. Chosen based on
empirical tests of which method best balances the accuracy across summary
statistics and across operations (addition, multiplication, left/right clip,
etc.).
"""
+""" + +CACHED_LOGNORM_CDFS = {} +CACHED_LOGNORM_PPFS = {} + + +def cached_lognorm_cdfs(num_bins): + if num_bins in CACHED_LOGNORM_CDFS: + return CACHED_LOGNORM_CDFS[num_bins] + support = _support_for_bin_sizing( + LognormalDistribution(norm_mean=0, norm_sd=1), BinSizing.log_uniform, num_bins + ) + values = np.exp(np.linspace(np.log(support[0]), np.log(support[1]), num_bins + 1)) + cdfs = stats.lognorm.cdf(values, 1) + CACHED_LOGNORM_CDFS[num_bins] = cdfs + return cdfs + + +def cached_lognorm_ppf_zscore(num_bins): + if num_bins in CACHED_LOGNORM_PPFS: + return CACHED_LOGNORM_PPFS[num_bins] + cdfs = np.linspace(0, 1, num_bins + 1) + ppfs = _log(stats.lognorm.ppf(cdfs, 1)) + CACHED_LOGNORM_PPFS[num_bins] = (cdfs, ppfs) + return (cdfs, ppfs) + + +def _narrow_support( + support: Tuple[float, float], new_support: Tuple[Optional[float], Optional[float]] +): + """Narrow the support to the intersection of ``support`` and ``new_support``.""" + if new_support[0] is not None: + support = (max(support[0], new_support[0]), support[1]) + if new_support[1] is not None: + support = (support[0], min(support[1], new_support[1])) + return support + + +def _log(x): + with np.errstate(divide="ignore"): + return np.log(x) + + +class BaseNumericDistribution(ABC): + """BaseNumericDistribution + + An abstract base class for numeric distributions. For more documentation, + see :class:`NumericDistribution` and :class:`ZeroNumericDistribution`. + + """ + + def __repr__(self): + return ( + f"<{type(self).__name__}(mean={self.mean()}, sd={self.sd()}, " + f"num_bins={len(self)}, bin_sizing={self.bin_sizing}) at {hex(id(self))}>" + ) + + def __str__(self): + return f"{type(self).__name__}(mean={self.mean()}, sd={self.sd()})" + + def mean(self, axis=None, dtype=None, out=None): + """Mean of the distribution. May be calculated using a stored exact + value or the histogram data. 
+ + Parameters + ---------- + None of the parameters do anything, they're only there so that + ``numpy.mean()`` can be called on a ``BaseNumericDistribution``. + """ + if self.exact_mean is not None: + return self.exact_mean + return self.est_mean() + + def sd(self): + """Standard deviation of the distribution. May be calculated using a + stored exact value or the histogram data.""" + if self.exact_sd is not None: + return self.exact_sd + return self.est_sd() + + def std(self, axis=None, dtype=None, out=None): + """Standard deviation of the distribution. May be calculated using a + stored exact value or the histogram data. + + Parameters + ---------- + None of the parameters do anything, they're only there so that + ``numpy.std()`` can be called on a ``BaseNumericDistribution``. + """ + return self.sd() + + def quantile(self, q): + """Estimate the value of the distribution at quantile ``q`` by + interpolating between known values. + + The accuracy at different quantiles depends on the bin sizing method + used. :any:`BinSizing.mass` will produce bins that are evenly spaced + across quantiles. :any:``BinSizing.ev`` for fat-tailed distributions + will be very inaccurate at lower quantiles in exchange for greater + accuracy on the right tail. + + The quantile value at the median of a bin will exactly equal the value + in the bin. Other quantile values are interpolated using scipy's + ``PchipInterpolator``, which fits points to a piecewise cubic function + with the restriction that values between points are monotonic. + + Parameters + ---------- + q : number or array_like + The quantile or quantiles for which to determine the value(s). + + Returns + ------- + quantiles: number or array-like + The estimated value at the given quantile(s). + + """ + return self.ppf(q) + + @abstractmethod + def ppf(self, q): + """Percent point function/inverse CD. An alias for :any:`quantile`.""" + ... 
    def condition_on_success(
        self,
        event: Union["BaseNumericDistribution", float],
        failure_outcome: Optional[Union["BaseNumericDistribution", float]] = 0,
    ):
        """``event`` is a probability distribution over a probability for some
        binary outcome. If the event succeeds, the result is the random
        variable defined by ``self``. If the event fails, the result is zero.
        Or, if ``failure_outcome`` is provided, the result is
        ``failure_outcome``.

        This function's return value represents the probability
        distribution over outcomes in this scenario.

        The return value is equivalent to the result of this procedure:

        1. Generate a probability ``p`` according to the distribution defined
           by ``event``.
        2. Generate a Bernoulli random variable with probability ``p``.
        3. If success, generate a random outcome according to the distribution
           defined by ``self``.
        4. Otherwise, generate a random outcome according to the distribution
           defined by ``failure_outcome``.

        Raises
        ------
        NotImplementedError
            If ``failure_outcome`` is anything other than 0 (not yet
            supported).
        TypeError
            If ``event`` is neither a number nor a
            ``BaseNumericDistribution``.
        """
        if failure_outcome != 0:
            # TODO: you can't just do a sum. I think what you want to do is
            # scale the masses and then smush the bins together
            raise NotImplementedError
        if isinstance(event, Real):
            p_success = event
        elif isinstance(event, BaseNumericDistribution):
            # The overall success probability of the two-stage process is the
            # expectation of ``p``, so only the mean of ``event`` is needed.
            p_success = event.mean()
        else:
            raise TypeError(f"Cannot condition on type {type(event)}")
        # NOTE(review): relies on ZeroNumericDistribution.wrap assigning the
        # residual mass ``1 - p_success`` to an outcome of zero -- confirm
        # against that class.
        return ZeroNumericDistribution.wrap(self, 1 - p_success)
+ + """ + + def __init__( + self, + values: np.ndarray, + masses: np.ndarray, + zero_bin_index: int, + neg_ev_contribution: float, + pos_ev_contribution: float, + exact_mean: Optional[float], + exact_sd: Optional[float], + bin_sizing: Optional[BinSizing] = None, + min_bins_per_side: Optional[int] = 2, + richardson_extrapolation_enabled: Optional[bool] = True, + ): + """Create a probability mass histogram. You should usually not call + this constructor directly; instead, use :func:`from_distribution`. + + Parameters + ---------- + values : np.ndarray + The values of the distribution. + masses : np.ndarray + The probability masses of the values. + zero_bin_index : int + The index of the smallest bin that contains positive values + (0 if all bins are positive). + bin_sizing : :any:`BinSizing` + The method used to size the bins. + neg_ev_contribution : float + The (absolute value of) contribution to expected value from the negative + portion of the distribution. + pos_ev_contribution : float + The contribution to expected value from the positive portion of the distribution. + exact_mean : Optional[float] + The exact mean of the distribution, if known. + exact_sd : Optional[float] + The exact standard deviation of the distribution, if known. + bin_sizing : Optional[BinSizing] + The bin sizing method used to construct the distribution, if any. + richardson_extrapolation_enabled : Optional[bool] = True + If True, use Richardson extrapolation over the number of bins to + improve the accuracy of unary and binary operations. 
+ """ + assert len(values) == len(masses) + self._version = __version__ + self.values = values + self.masses = masses + self.num_bins = len(values) + self.zero_bin_index = zero_bin_index + self.neg_ev_contribution = neg_ev_contribution + self.pos_ev_contribution = pos_ev_contribution + self.exact_mean = exact_mean + self.exact_sd = exact_sd + self.bin_sizing = bin_sizing + self.min_bins_per_side = min_bins_per_side + self.richardson_extrapolation_enabled = richardson_extrapolation_enabled + + # These are computed lazily + self.interpolate_cdf = None + self.interpolate_ppf = None + self.interpolate_cev = None + self.interpolate_inv_cev = None + + @classmethod + def _construct_edge_values( + cls, + num_bins, + support, + max_support, + dist, + cdf, + ppf, + bin_sizing, + ): + """Construct a list of bin edge values. Helper function for + :func:`from_distribution`; do not call this directly. + + Parameters + ---------- + num_bins : int + The number of bins to use. + support : Tuple[float, float] + The support of the distribution. + max_support : Tuple[float, float] + The maximum support of the distribution, after clipping but before + narrowing due to limitations of certain bin sizing methods. Namely, + uniform and log-uniform bin sizing is undefined for infinite bounds, + so ``support`` is narrowed to finite bounds, but ``max_support`` is + not. + dist : BaseDistribution + The distribution to convert to a NumericDistribution. + cdf : Callable[[np.ndarray], np.ndarray] + The CDF of the distribution. + ppf : Callable[[np.ndarray], np.ndarray] + The inverse CDF of the distribution. + bin_sizing : BinSizing + The bin sizing method to use. + + Returns + ------- + edge_values : np.ndarray + The value of each bin edge. + edge_cdfs : Optional[np.ndarray] + The CDF at each bin edge. Only provided as a performance + optimization if the CDFs are either required to determine bin edge + values or if they can be pulled from a cache. 
Otherwise, the parent + caller is responsible for calculating the CDFs. + + """ + edge_cdfs = None + if bin_sizing == BinSizing.uniform: + edge_values = np.linspace(support[0], support[1], num_bins + 1) + + elif bin_sizing == BinSizing.log_uniform: + log_support = (_log(support[0]), _log(support[1])) + log_edge_values = np.linspace(log_support[0], log_support[1], num_bins + 1) + edge_values = np.exp(log_edge_values) + if ( + isinstance(dist, LognormalDistribution) + and dist.lclip is None + and dist.rclip is None + ): + # Edge CDFs are the same regardless of the mean and SD of the + # distribution, so we can cache them + edge_cdfs = cached_lognorm_cdfs(num_bins) + + elif bin_sizing == BinSizing.ev: + if not hasattr(dist, "inv_contribution_to_ev"): + raise ValueError( + f"Bin sizing {bin_sizing} requires an inv_contribution_to_ev " + f"method, but {type(dist)} does not have one." + ) + left_prop = dist.contribution_to_ev(support[0]) + right_prop = dist.contribution_to_ev(support[1]) + edge_values = np.concatenate( + ( + # Don't call inv_contribution_to_ev on the left and right + # edges because it's undefined for 0 and 1 + [support[0]], + np.atleast_1d( + dist.inv_contribution_to_ev( + np.linspace(left_prop, right_prop, num_bins + 1)[1:-1] + ) + ) + if num_bins > 1 + else [], + [support[1]], + ) + ) + + elif bin_sizing == BinSizing.mass: + if ( + isinstance(dist, LognormalDistribution) + and dist.lclip is None + and dist.rclip is None + ): + edge_cdfs, edge_zscores = cached_lognorm_ppf_zscore(num_bins) + edge_values = np.exp(dist.norm_mean + dist.norm_sd * edge_zscores) + else: + edge_cdfs = np.linspace(cdf(support[0]), cdf(support[1]), num_bins + 1) + edge_values = ppf(edge_cdfs) + + elif bin_sizing == BinSizing.fat_hybrid: + # Use a combination of mass and log-uniform + logu_support = _support_for_bin_sizing(dist, BinSizing.log_uniform, num_bins) + logu_support = _narrow_support(support, logu_support) + logu_edge_values, logu_edge_cdfs = 
cls._construct_edge_values( + num_bins, logu_support, max_support, dist, cdf, ppf, BinSizing.log_uniform + ) + mass_edge_values, mass_edge_cdfs = cls._construct_edge_values( + num_bins, support, max_support, dist, cdf, ppf, BinSizing.mass + ) + + if logu_edge_cdfs is not None and mass_edge_cdfs is not None: + edge_cdfs = np.where( + logu_edge_values > mass_edge_values, logu_edge_cdfs, mass_edge_cdfs + ) + edge_values = np.where( + logu_edge_values > mass_edge_values, logu_edge_values, mass_edge_values + ) + + else: + raise ValueError(f"Unsupported bin sizing method: {bin_sizing}") + + return (edge_values, edge_cdfs) + + @classmethod + def _construct_bins( + cls, + num_bins, + support, + max_support, + dist, + cdf, + ppf, + bin_sizing, + warn, + is_reversed, + ): + """Construct a list of bin masses and values. Helper function for + :func:`from_distribution`; do not call this directly. + + Parameters + ---------- + num_bins : int + The number of bins to use. + support : Tuple[float, float] + The support of the distribution. + max_support : Tuple[float, float] + The maximum support of the distribution, after clipping but before + narrowing due to limitations of certain bin sizing methods. Namely, + uniform and log-uniform bin sizing is undefined for infinite bounds, + so ``support`` is narrowed to finite bounds, but ``max_support`` is + not. + dist : BaseDistribution + The distribution to convert to a NumericDistribution. + cdf : Callable[[np.ndarray], np.ndarray] + The CDF of the distribution. + ppf : Callable[[np.ndarray], np.ndarray] + The inverse CDF of the distribution. + bin_sizing : BinSizing + The bin sizing method to use. + warn : bool + If True, raise warnings about bins with zero mass. + + Returns + ------- + masses : np.ndarray + The probability mass of each bin. + values : np.ndarray + The value of each bin. 
+ """ + if num_bins <= 0: + return (np.array([]), np.array([])) + + edge_values, edge_cdfs = cls._construct_edge_values( + num_bins, support, max_support, dist, cdf, ppf, bin_sizing + ) + + # Avoid re-calculating CDFs if we can because it's really slow. + if edge_cdfs is None: + edge_cdfs = cdf(edge_values) + + masses = np.diff(edge_cdfs) + + # Note: Re-calculating this for BinSize.ev appears to add ~zero + # performance penalty. Perhaps Python is caching the result somehow? + edge_ev_contributions = dist.contribution_to_ev(edge_values, normalized=False) + bin_ev_contributions = np.diff(edge_ev_contributions) + + # For sufficiently large edge values, CDF rounds to 1 which makes the + # mass 0. Values can also be 0 due to floating point rounding if + # support is very small. Remove any 0s. + mass_zeros = [i for i in range(len(masses)) if masses[i] == 0] + ev_zeros = [i for i in range(len(bin_ev_contributions)) if bin_ev_contributions[i] == 0] + non_monotonic = [] + + if len(mass_zeros) == 0: + # Set the value of each bin to equal the average value within the + # bin. + values = bin_ev_contributions / masses + + # Values can be non-monotonic if there are rounding errors when + # calculating EV contribution. Look at the bottom and top separately + # because on the bottom, the lower value will be the incorrect one, and + # on the top, the upper value will be the incorrect one. 
+ sign = -1 if is_reversed else 1 + bot_diffs = sign * np.diff(values[: (num_bins // 10)]) + top_diffs = sign * np.diff(values[-(num_bins // 10) :]) + non_monotonic = [i for i in range(len(bot_diffs)) if bot_diffs[i] < 0] + [ + i + 1 + num_bins - len(top_diffs) + for i in range(len(top_diffs)) + if top_diffs[i] < 0 + ] + + bad_indexes = set(mass_zeros + ev_zeros + non_monotonic) + + if len(bad_indexes) > 0: + good_indexes = [i for i in range(num_bins) if i not in set(bad_indexes)] + bin_ev_contributions = bin_ev_contributions[good_indexes] + masses = masses[good_indexes] + values = bin_ev_contributions / masses + + messages = [] + if len(mass_zeros) > 0: + messages.append(f"{len(mass_zeros) + 1} neighboring values had equal CDFs") + if len(ev_zeros) == 1: + messages.append( + "1 bin had zero expected value, most likely because it was too small" + ) + elif len(ev_zeros) > 1: + messages.append( + f"{len(ev_zeros)} bins had zero expected value, most likely " + "because they were too small" + ) + if len(non_monotonic) > 0: + messages.append(f"{len(non_monotonic) + 1} neighboring values were non-monotonic") + joint_message = "; and".join(messages) + + if warn: + warnings.warn( + f"When constructing NumericDistribution, {joint_message}.", + RuntimeWarning, + ) + + return (masses, values) + + @classmethod + def from_distribution( + cls, + dist: Union[BaseDistribution, BaseNumericDistribution, Real], + num_bins: Optional[int] = None, + bin_sizing: Optional[str] = None, + warn: bool = True, + ): + """Create a probability mass histogram from the given distribution. + + Parameters + ---------- + dist : BaseDistribution | BaseNumericDistribution | Real + A distribution from which to generate numeric values. If the + provided value is a :any:`BaseNumericDistribution`, simply return + it. + num_bins : Optional[int] (default = ref:``DEFAULT_NUM_BINS``) + The number of bins for the numeric distribution to use. 
The time to + construct a NumericDistribution is linear with ``num_bins``, and + the time to run a binary operation on two distributions with the + same number of bins is approximately quadratic. 100 bins provides a + good balance between accuracy and speed. 1000 bins provides greater + accuracy and is fast for small models, but for large models there + will be some noticeable slowdown in binary operations. + bin_sizing : Optional[str] + The bin sizing method to use, which affects the accuracy of the + bins. If none is given, a default will be chosen from + :any:`DEFAULT_BIN_SIZING` based on the distribution type of + ``dist``. It is recommended to use the default bin sizing method + most of the time. See + :any:`squigglepy.numeric_distribution.BinSizing` for a list of + valid options and explanations of their behavior. warn : + Optional[bool] (default = True) If True, raise warnings about bins + with zero mass. + warn : Optional[bool] (default = True) + If True, raise warnings about bins with zero mass. + + Returns + ------- + result : NumericDistribution | ZeroNumericDistribution + The generated numeric distribution that represents ``dist``. 
+ + """ + + # ---------------------------- + # Handle special distributions + # ---------------------------- + + if isinstance(dist, BaseNumericDistribution): + return dist + if isinstance(dist, ConstantDistribution) or isinstance(dist, Real): + x = dist if isinstance(dist, Real) else dist.x + return cls( + values=np.array([x]), + masses=np.array([1]), + zero_bin_index=0 if x >= 0 else 1, + neg_ev_contribution=0 if x >= 0 else -x, + pos_ev_contribution=x if x >= 0 else 0, + exact_mean=x, + exact_sd=0, + bin_sizing=bin_sizing, + ) + if isinstance(dist, BernoulliDistribution): + return cls.from_distribution(1, num_bins, bin_sizing, warn).condition_on_success( + dist.p + ) + if isinstance(dist, MixtureDistribution): + return cls.mixture( + dist.dists, + dist.weights, + lclip=dist.lclip, + rclip=dist.rclip, + num_bins=num_bins, + bin_sizing=bin_sizing, + warn=warn, + ) + if isinstance(dist, ComplexDistribution): + left = dist.left + right = dist.right + if isinstance(left, BaseDistribution): + left = cls.from_distribution(left, num_bins, bin_sizing, warn) + if isinstance(right, BaseDistribution): + right = cls.from_distribution(right, num_bins, bin_sizing, warn) + if right is None: + return dist.fn(left).clip(dist.lclip, dist.rclip) + return dist.fn(left, right).clip(dist.lclip, dist.rclip) + + # ------------ + # Basic checks + # ------------ + + if type(dist) not in DEFAULT_BIN_SIZING: + raise ValueError(f"Unsupported distribution type: {type(dist)}") + + num_bins = num_bins or DEFAULT_NUM_BINS[type(dist)] + bin_sizing = BinSizing(bin_sizing or DEFAULT_BIN_SIZING[type(dist)]) + + if num_bins % 2 != 0: + raise ValueError(f"num_bins must be even, not {num_bins}") + + # ------------------------------------------------------------------ + # Handle distributions that are special cases of other distributions + # ------------------------------------------------------------------ + + if isinstance(dist, ChiSquareDistribution): + return cls.from_distribution( + 
GammaDistribution(shape=dist.df / 2, scale=2, lclip=dist.lclip, rclip=dist.rclip), + num_bins=num_bins, + bin_sizing=bin_sizing, + warn=warn, + ) + + if isinstance(dist, ExponentialDistribution): + return cls.from_distribution( + GammaDistribution(shape=1, scale=dist.scale, lclip=dist.lclip, rclip=dist.rclip), + num_bins=num_bins, + bin_sizing=bin_sizing, + warn=warn, + ) + if isinstance(dist, PERTDistribution): + # PERT is a generalization of Beta. We can generate a PERT by + # generating a Beta and then scaling and shifting it. + if dist.lclip is not None or dist.rclip is not None: + raise ValueError("PERT distribution with lclip or rclip is not supported.") + r = dist.right - dist.left + alpha = 1 + dist.lam * (dist.mode - dist.left) / r + beta = 1 + dist.lam * (dist.right - dist.mode) / r + beta_dist = cls.from_distribution( + BetaDistribution( + a=alpha, + b=beta, + ), + num_bins=num_bins, + bin_sizing=bin_sizing, + warn=warn, + ) + # Note: There are formulas for the exact mean/SD of a PERT, but + # scaling/shifting will correctly produce the exact mean/SD so we + # don't need to set them manually. + return dist.left + r * beta_dist + + # ------------------------------------------------------------------- + # Set up required parameters based on dist type and bin sizing method + # ------------------------------------------------------------------- + + max_support = { + # These are the widest possible supports, but they maybe narrowed + # later by lclip/rclip or by some bin sizing methods. 
+ BetaDistribution: (0, 1), + GammaDistribution: (0, np.inf), + LognormalDistribution: (0, np.inf), + NormalDistribution: (-np.inf, np.inf), + ParetoDistribution: (1, np.inf), + UniformDistribution: (dist.x, dist.y), + }[type(dist)] + support = max_support + ppf = { + BetaDistribution: lambda p: stats.beta.ppf(p, dist.a, dist.b), + GammaDistribution: lambda p: stats.gamma.ppf(p, dist.shape, scale=dist.scale), + LognormalDistribution: lambda p: stats.lognorm.ppf( + p, dist.norm_sd, scale=np.exp(dist.norm_mean) + ), + NormalDistribution: lambda p: stats.norm.ppf(p, loc=dist.mean, scale=dist.sd), + ParetoDistribution: lambda p: stats.pareto.ppf(p, dist.shape), + UniformDistribution: lambda p: stats.uniform.ppf(p, loc=dist.x, scale=dist.y - dist.x), + }[type(dist)] + cdf = { + BetaDistribution: lambda x: stats.beta.cdf(x, dist.a, dist.b), + GammaDistribution: lambda x: stats.gamma.cdf(x, dist.shape, scale=dist.scale), + LognormalDistribution: lambda x: stats.lognorm.cdf( + x, dist.norm_sd, scale=np.exp(dist.norm_mean) + ), + NormalDistribution: lambda x: stats.norm.cdf(x, loc=dist.mean, scale=dist.sd), + ParetoDistribution: lambda x: stats.pareto.cdf(x, dist.shape), + UniformDistribution: lambda x: stats.uniform.cdf(x, loc=dist.x, scale=dist.y - dist.x), + }[type(dist)] + + # ----------- + # Set support + # ----------- + + dist_bin_sizing_supported = False + new_support = _support_for_bin_sizing(dist, bin_sizing, num_bins) + + if new_support is not None: + support = _narrow_support(support, new_support) + dist_bin_sizing_supported = True + elif bin_sizing == BinSizing.uniform: + if isinstance(dist, BetaDistribution) or isinstance(dist, UniformDistribution): + dist_bin_sizing_supported = True + elif bin_sizing == BinSizing.log_uniform: + if isinstance(dist, BetaDistribution): + dist_bin_sizing_supported = True + elif bin_sizing == BinSizing.ev: + dist_bin_sizing_supported = True + elif bin_sizing == BinSizing.mass: + dist_bin_sizing_supported = True + elif bin_sizing == 
BinSizing.fat_hybrid: + if isinstance(dist, GammaDistribution) or isinstance(dist, LognormalDistribution): + dist_bin_sizing_supported = True + + if not dist_bin_sizing_supported: + raise ValueError(f"Unsupported bin sizing method {bin_sizing} for {type(dist)}.") + + # ---------------------------- + # Adjust support based on clip + # ---------------------------- + + support = _narrow_support(support, (dist.lclip, dist.rclip)) + + # --------------------------- + # Set exact_mean and exact_sd + # --------------------------- + + if dist.lclip is None and dist.rclip is None: + if isinstance(dist, BetaDistribution): + exact_mean = stats.beta.mean(dist.a, dist.b) + exact_sd = stats.beta.std(dist.a, dist.b) + elif isinstance(dist, GammaDistribution): + exact_mean = stats.gamma.mean(dist.shape, scale=dist.scale) + exact_sd = stats.gamma.std(dist.shape, scale=dist.scale) + elif isinstance(dist, LognormalDistribution): + exact_mean = dist.lognorm_mean + exact_sd = dist.lognorm_sd + elif isinstance(dist, NormalDistribution): + exact_mean = dist.mean + exact_sd = dist.sd + elif isinstance(dist, ParetoDistribution): + if dist.shape <= 1: + raise ValueError( + "NumericDistribution does not support Pareto distributions " + "with shape <= 1 because they have infinite mean." + ) + # exact_mean = 1 / (dist.shape - 1) # Lomax + exact_mean = dist.shape / (dist.shape - 1) + if dist.shape <= 2: + exact_sd = np.inf + else: + exact_sd = np.sqrt(dist.shape / ((dist.shape - 1) ** 2 * (dist.shape - 2))) + elif isinstance(dist, UniformDistribution): + exact_mean = (dist.x + dist.y) / 2 + exact_sd = np.sqrt(1 / 12) * (dist.y - dist.x) + else: + if ( + isinstance(dist, BetaDistribution) + or isinstance(dist, GammaDistribution) + or isinstance(dist, LognormalDistribution) + ): + # For one-sided distributions without a known formula for + # truncated mean, compute the mean using + # ``contribution_to_ev``. 
+ contribution_to_ev = dist.contribution_to_ev( + support[1], normalized=False + ) - dist.contribution_to_ev(support[0], normalized=False) + mass = cdf(support[1]) - cdf(support[0]) + exact_mean = contribution_to_ev / mass + exact_sd = None # unknown + elif isinstance(dist, NormalDistribution): + a = (support[0] - dist.mean) / dist.sd + b = (support[1] - dist.mean) / dist.sd + exact_mean = stats.truncnorm.mean(a, b, dist.mean, dist.sd) + exact_sd = stats.truncnorm.std(a, b, dist.mean, dist.sd) + elif isinstance(dist, UniformDistribution): + exact_mean = (support[0] + support[1]) / 2 + exact_sd = np.sqrt(1 / 12) * (support[1] - support[0]) + + # ----------------------------------------------------------------- + # Split dist into negative and positive sides and generate bins for + # each side + # ----------------------------------------------------------------- + + total_ev_contribution = dist.contribution_to_ev( + support[1], normalized=False + ) - dist.contribution_to_ev(support[0], normalized=False) + neg_ev_contribution = max( + 0, + dist.contribution_to_ev(max(0, support[0]), normalized=False) + - dist.contribution_to_ev(support[0], normalized=False), + ) + if dist.lclip is not None or dist.rclip is not None: + total_mass = cdf(support[1]) - cdf(support[0]) + total_ev_contribution /= total_mass + neg_ev_contribution /= total_mass + pos_ev_contribution = total_ev_contribution - neg_ev_contribution + + if bin_sizing == BinSizing.uniform: + if support[0] > 0: + neg_prop = 0 + pos_prop = 1 + elif support[1] < 0: + neg_prop = 1 + pos_prop = 0 + else: + width = support[1] - support[0] + neg_prop = -support[0] / width + pos_prop = support[1] / width + elif bin_sizing == BinSizing.log_uniform: + neg_prop = 0 + pos_prop = 1 + elif bin_sizing == BinSizing.ev: + neg_prop = neg_ev_contribution / total_ev_contribution + pos_prop = pos_ev_contribution / total_ev_contribution + elif bin_sizing == BinSizing.mass: + neg_mass = max(0, cdf(0) - cdf(support[0])) + pos_mass = 
max(0, cdf(support[1]) - cdf(0)) + total_mass = neg_mass + pos_mass + neg_prop = neg_mass / total_mass + pos_prop = pos_mass / total_mass + elif bin_sizing == BinSizing.fat_hybrid: + neg_prop = 0 + pos_prop = 1 + else: + raise ValueError(f"Unsupported bin sizing method: {bin_sizing}") + + # Divide up bins such that each bin has as close as possible to equal + # contribution. If one side has very small but nonzero contribution, + # still give it two bins. + num_neg_bins, num_pos_bins = cls._num_bins_per_side(num_bins, neg_prop, pos_prop, 2) + neg_masses, neg_values = cls._construct_bins( + num_neg_bins, + (support[0], min(0, support[1])), + (max_support[0], min(0, max_support[1])), + dist, + cdf, + ppf, + bin_sizing, + warn, + is_reversed=True, + ) + neg_values = -neg_values + pos_masses, pos_values = cls._construct_bins( + num_pos_bins, + (max(0, support[0]), support[1]), + (max(0, max_support[0]), max_support[1]), + dist, + cdf, + ppf, + bin_sizing, + warn, + is_reversed=False, + ) + + # Resize in case some bins got removed due to having zero mass/EV + if len(neg_values) < num_neg_bins: + neg_ev_contribution = abs(np.sum(neg_masses * neg_values)) + num_neg_bins = len(neg_values) + if len(pos_values) < num_pos_bins: + pos_ev_contribution = np.sum(pos_masses * pos_values) + num_pos_bins = len(pos_values) + + masses = np.concatenate((neg_masses, pos_masses)) + values = np.concatenate((neg_values, pos_values)) + + # Normalize masses to sum to 1 in case the distribution is clipped, but + # don't do this until after setting values because values depend on the + # mass relative to the full distribution, not the clipped distribution. 
        masses /= np.sum(masses)

        return cls(
            values=np.array(values),
            masses=np.array(masses),
            zero_bin_index=num_neg_bins,
            neg_ev_contribution=neg_ev_contribution,
            pos_ev_contribution=pos_ev_contribution,
            exact_mean=exact_mean,
            exact_sd=exact_sd,
            bin_sizing=bin_sizing,
        )

    @classmethod
    def mixture(
        cls,
        dists: List[Union[BaseDistribution, BaseNumericDistribution]],
        weights: List[float],
        lclip: Optional[float] = None,
        rclip: Optional[float] = None,
        num_bins: Optional[int] = None,
        bin_sizing: Optional[BinSizing] = None,
        warn: bool = True,
    ):
        """Construct a ``NumericDistribution`` as a mixture of the
        distributions in ``dists``, weighted by ``weights``.

        Parameters
        ----------
        dists : List[BaseDistribution | BaseNumericDistribution]
            The distributions to mix.
        weights : List[float]
            The weights of each distribution. Must sum to 1.
        lclip : Optional[float]
            The left clip of the resulting mixture distribution. Note that a
            clip value on an entry in ``dists`` will only clip that entry,
            while this parameter clips every entry.
        rclip : Optional[float]
            The right clip of the resulting mixture distribution. Note that a
            clip value on an entry in ``dists`` will only clip that entry,
            while this parameter clips every entry.
        num_bins : Optional[int] (default = :any:`DEFAULT_NUM_BINS`)
            The number of bins for each distribution to use. If not given, each
            distribution will use the default number of bins for its type as
            given by :any:`DEFAULT_NUM_BINS`.
        bin_sizing : Optional[BinSizing]
            The bin sizing method to use. If not given, each distribution will
            use the default bin sizing method for its type as given by
            :any:`DEFAULT_BIN_SIZING`.
        warn : Optional[bool] (default = True)
            If True, raise warnings about bins with zero mass.

        """
        # This function replicates how MixtureDistribution handles lclip/rclip:
        # it clips the sub-distributions based on their own lclip/rclip, then
        # takes the mixture sample, then clips the mixture sample based on the
        # mixture lclip/rclip.
        # NOTE(review): ``mixture_num_bins`` is only bound when ``num_bins is
        # None``. The later ``num_bins or mixture_num_bins`` would raise
        # NameError if a caller passed ``num_bins=0`` — TODO confirm 0 is not a
        # legal bin count.
        if num_bins is None:
            mixture_num_bins = DEFAULT_NUM_BINS[MixtureDistribution]

        # Convert any Squigglepy dists into NumericDistributions
        dists = [
            NumericDistribution.from_distribution(dist, num_bins, bin_sizing) for dist in dists
        ]

        # Concatenate all sub-distributions' bins, weighting each mass vector
        # by its mixture weight.
        value_vectors = [d.values for d in dists]
        weighted_mass_vectors = [d.masses * w for d, w in zip(dists, weights)]
        extended_values = np.concatenate(value_vectors)
        extended_masses = np.concatenate(weighted_mass_vectors)

        # Sort by value so the combined bins can be split at zero. Timsort
        # ("mergesort" in numpy) is fast on the mostly-sorted runs here.
        sorted_indexes = np.argsort(extended_values, kind="mergesort")
        extended_values = extended_values[sorted_indexes]
        extended_masses = extended_masses[sorted_indexes]
        zero_index = np.searchsorted(extended_values, 0)
        # EV contributions of a mixture are the weighted sums of the
        # components' EV contributions.
        neg_ev_contribution = sum(d.neg_ev_contribution * w for d, w in zip(dists, weights))
        pos_ev_contribution = sum(d.pos_ev_contribution * w for d, w in zip(dists, weights))

        mixture = cls._resize_bins(
            extended_neg_values=extended_values[:zero_index],
            extended_neg_masses=extended_masses[:zero_index],
            extended_pos_values=extended_values[zero_index:],
            extended_pos_masses=extended_masses[zero_index:],
            num_bins=num_bins or mixture_num_bins,
            neg_ev_contribution=neg_ev_contribution,
            pos_ev_contribution=pos_ev_contribution,
            bin_sizing=BinSizing.ev,
            is_sorted=True,
        )
        # NOTE(review): when ``bin_sizing`` was not supplied this assigns None,
        # overwriting whatever ``_resize_bins`` set — confirm intentional.
        mixture.bin_sizing = bin_sizing
        if all(d.exact_mean is not None for d in dists):
            mixture.exact_mean = sum(d.exact_mean * w for d, w in zip(dists, weights))
        if all(d.exact_sd is not None and d.exact_mean is not None for d in dists):
            # Mixture variance via the second moment: E[X^2] - (E[X])^2.
            second_moment = sum(
                (d.exact_mean**2 + d.exact_sd**2) * w for d, w in zip(dists, weights)
            )
            mixture.exact_sd = np.sqrt(second_moment - mixture.exact_mean**2)

        return mixture.clip(lclip, rclip)

    def given_value_satisfies(self, condition: Callable[[float], bool]):
        """Return a new distribution conditioned on the value of the random
        variable satisfying ``condition``.

        Bins whose value fails ``condition`` are dropped and the remaining
        masses are renormalized to sum to 1. The result's exact_mean/exact_sd
        are unknown (None).

        Parameters
        ----------
        condition : Callable[[float], bool]
        """
        good_indexes = np.where(np.vectorize(condition)(self.values))
        values = self.values[good_indexes]
        masses = self.masses[good_indexes]
        masses /= np.sum(masses)
        zero_bin_index = np.searchsorted(values, 0, side="left")
        neg_ev_contribution = -np.sum(masses[:zero_bin_index] * values[:zero_bin_index])
        pos_ev_contribution = np.sum(masses[zero_bin_index:] * values[zero_bin_index:])
        return NumericDistribution(
            values=values,
            masses=masses,
            zero_bin_index=zero_bin_index,
            neg_ev_contribution=neg_ev_contribution,
            pos_ev_contribution=pos_ev_contribution,
            exact_mean=None,
            exact_sd=None,
        )

    def probability_value_satisfies(self, condition: Callable[[float], bool]):
        """Return the probability that the random variable satisfies
        ``condition``.

        Parameters
        ----------
        condition : Callable[[float], bool]
        """
        # Sum the masses of every bin whose value passes ``condition``.
        return np.sum(self.masses[np.where(np.vectorize(condition)(self.values))])

    def __len__(self):
        # A NumericDistribution's length is its bin count.
        return self.num_bins

    def positive_everywhere(self):
        """Return True if the distribution is positive everywhere."""
        # zero_bin_index counts the negative bins, so 0 means none are negative.
        return self.zero_bin_index == 0

    def negative_everywhere(self):
        """Return True if the distribution is negative everywhere."""
        return self.zero_bin_index == self.num_bins

    def is_one_sided(self):
        """Return True if the histogram contains only positive or negative values."""
        return self.positive_everywhere() or self.negative_everywhere()

    def is_two_sided(self):
        """Return True if the histogram contains both positive and negative values."""
        return not self.is_one_sided()

    def num_neg_bins(self):
        """Return the number of bins containing negative values."""
        return self.zero_bin_index

    def est_mean(self):
        """Estimated mean of the distribution, calculated using the histogram
        data."""
        return np.sum(self.masses * self.values)

    def est_sd(self):
        """Estimated standard deviation of the distribution, calculated using
        the histogram data.
        """
        # NOTE(review): ``self.mean()`` is not defined in this chunk;
        # ``est_mean()`` computes the same mass-weighted mean — confirm that a
        # ``mean`` method exists elsewhere on the class.
        mean = self.mean()
        return np.sqrt(np.sum(self.masses * (self.values - mean) ** 2))

    def _init_interpolate_cdf(self):
        # Lazily build a monotone (PCHIP) interpolator from bin values to
        # cumulative mass; reused by ``cdf``.
        if self.interpolate_cdf is None:
            # Subtracting 0.5 * masses because eg the first out of 100 values
            # represents the 0.5th percentile, not the 1st percentile
            cum_mass = np.cumsum(self.masses) - 0.5 * self.masses
            self.interpolate_cdf = PchipInterpolator(self.values, cum_mass)

    def _init_interpolate_ppf(self):
        # Lazily build the inverse interpolator (cumulative mass -> value);
        # reused by ``ppf`` and ``sample``.
        if self.interpolate_ppf is None:
            cum_mass = np.cumsum(self.masses) - 0.5 * self.masses

            # Mass diffs can be 0 if a mass is very small and gets rounded off.
            # The interpolator doesn't like this, so remove these values.
            # Keep the first point plus every point whose cumulative mass
            # strictly increased, so the interpolator's x-values are strictly
            # monotonic.
            nonzero_indexes = [0] + [i + 1 for (i, d) in enumerate(np.diff(cum_mass)) if d > 0]
            cum_mass = cum_mass[nonzero_indexes]
            values = self.values[nonzero_indexes]
            self.interpolate_ppf = PchipInterpolator(cum_mass, values)

    def cdf(self, x):
        """Estimate the proportion of the distribution that lies below ``x``."""
        self._init_interpolate_cdf()
        return self.interpolate_cdf(x)

    def ppf(self, q):
        """Estimate the value at quantile ``q`` by interpolating between bin
        values (inverse of :any:`cdf`)."""
        self._init_interpolate_ppf()
        return self.interpolate_ppf(q)

    def clip(self, lclip, rclip):
        """Return a new distribution clipped to the given bounds.

        It is strongly recommended that, whenever possible, you construct a
        ``NumericDistribution`` by supplying a ``Distribution`` that has
        lclip/rclip defined on it, rather than calling
        ``NumericDistribution.clip``. ``NumericDistribution.clip`` works by
        deleting bins, which can greatly decrease accuracy.

        Parameters
        ----------
        lclip : Optional[float]
            The new lower bound of the distribution, or None if the lower bound
            should not change.
        rclip : Optional[float]
            The new upper bound of the distribution, or None if the upper bound
            should not change.

        Returns
        -------
        clipped : NumericDistribution
            A new distribution clipped to the given bounds.

        """
        # No clipping requested: return a copy with identical data.
        if lclip is None and rclip is None:
            return NumericDistribution(
                values=self.values,
                masses=self.masses,
                zero_bin_index=self.zero_bin_index,
                neg_ev_contribution=self.neg_ev_contribution,
                pos_ev_contribution=self.pos_ev_contribution,
                exact_mean=self.exact_mean,
                exact_sd=self.exact_sd,
                bin_sizing=self.bin_sizing,
            )

        if lclip is None:
            lclip = -np.inf
        if rclip is None:
            rclip = np.inf

        if lclip >= rclip:
            raise ValueError(f"lclip ({lclip}) must be less than rclip ({rclip})")

        # Splines from (fractional) bin index to cumulative mass / cumulative EV,
        # used below to apportion partially clipped edge bins.
        indexes = np.array(range(len(self) + 1))
        cum_mass_at_index = CubicSpline(indexes, np.concatenate(([0], np.cumsum(self.masses))))
        cum_ev_at_index = CubicSpline(
            indexes, np.concatenate(([0], np.cumsum(self.masses * self.values)))
        )

        # If lclip/rclip is outside the bounds of the known values, use linear
        # interpolation because cubic spline is sometimes non-monotonic.
        # Otherwise, use cubic spline because it's more accurate. There are
        # more accurate methods to extrapolate outside known values, but
        # they're more complicated and this case should be rare.
        # NOTE(review): the conditions below look inverted relative to the
        # comment above — the NaN-returning spline (extrapolate=False) is used
        # exactly when the clip is OUTSIDE the data range, and only lands on
        # the right answer because ``max(0, nan)``/``min(n, nan)`` fall back to
        # the bound. Inside the range the linear estimate is used. Confirm
        # intended behavior before changing.
        index_of_value = CubicSpline(self.values, indexes[1:] - 0.5, extrapolate=False)
        linear_indexes = np.interp([lclip, rclip], self.values, indexes[1:] - 0.5)
        lclip_index = max(
            0, index_of_value(lclip) if lclip < self.values[0] else linear_indexes[0]
        )
        rclip_index = min(
            len(self.values),
            index_of_value(rclip) if rclip > self.values[-1] else linear_indexes[1],
        )
        new_masses = np.array(self.masses[int(lclip_index) : int(np.ceil(rclip_index))])
        new_values = np.array(self.values[int(lclip_index) : int(np.ceil(rclip_index))])

        # If a bin is partially clipped, interpolate how much of the bin
        # remains.
        if lclip_index != int(lclip_index):
            # lclip is between two bins
            left_mass = cum_mass_at_index(int(lclip_index) + 1) - cum_mass_at_index(lclip_index)
            if left_mass > 0:
                # Recenter the edge bin's value on the surviving mass's EV.
                left_value = (
                    cum_ev_at_index(int(lclip_index) + 1) - cum_ev_at_index(lclip_index)
                ) / left_mass
                new_masses[0] = left_mass
                new_values[0] = left_value
        if rclip_index != int(rclip_index):
            # rclip is between two bins
            right_mass = cum_mass_at_index(rclip_index) - cum_mass_at_index(int(rclip_index))
            if right_mass > 0:
                right_value = (
                    cum_ev_at_index(rclip_index) - cum_ev_at_index(int(rclip_index))
                ) / right_mass
                new_masses[-1] = right_mass
                new_values[-1] = right_value

        # Renormalize the surviving mass to sum to 1 and recompute the
        # per-side EV contributions.
        clipped_mass = np.sum(new_masses)
        new_masses /= clipped_mass
        zero_bin_index = np.searchsorted(new_values, 0)
        neg_ev_contribution = -np.sum(new_masses[:zero_bin_index] * new_values[:zero_bin_index])
        pos_ev_contribution = np.sum(new_masses[zero_bin_index:] * new_values[zero_bin_index:])

        return NumericDistribution(
            values=new_values,
            masses=new_masses,
            zero_bin_index=zero_bin_index,
            neg_ev_contribution=neg_ev_contribution,
            pos_ev_contribution=pos_ev_contribution,
            exact_mean=None,
            exact_sd=None,
            bin_sizing=self.bin_sizing,
        )

    def sample(self, n=1):
        """Generate ``n`` random samples from the distribution. The samples are
        generated by interpolating between bin values in the same manner as
        :any:`ppf`."""
        return self.ppf(np.random.uniform(size=n))

    def contribution_to_ev(self, x: Union[np.ndarray, float]):
        """Estimate the fraction of the (absolute) expected value contributed
        by values at or below ``x``, via a lazily built PCHIP interpolator."""
        if self.interpolate_cev is None:
            bin_evs = self.masses * abs(self.values)
            fractions_of_ev = (np.cumsum(bin_evs) - 0.5 * bin_evs) / np.sum(bin_evs)
            self.interpolate_cev = PchipInterpolator(self.values, fractions_of_ev)
        return self.interpolate_cev(x)

    def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]):
        """Return the value such that ``fraction`` of the contribution to
        expected value lies to the left of that value.
+ """ + if self.interpolate_inv_cev is None: + bin_evs = self.masses * abs(self.values) + fractions_of_ev = (np.cumsum(bin_evs) - 0.5 * bin_evs) / np.sum(bin_evs) + self.interpolate_inv_cev = PchipInterpolator(fractions_of_ev, self.values) + return self.interpolate_inv_cev(fraction) + + def plot(self, scale="linear"): + from matplotlib import pyplot as plt + + # matplotlib.use('GTK3Agg') + # matplotlib.use('Qt5Agg') + values_for_widths = np.concatenate(([0], self.values)) + widths = np.diff(values_for_widths[1:]) + densities = self.masses / widths + values, densities, widths = zip( + *[ + (v, d, w) + for v, d, w in zip(list(values_for_widths), list(densities), list(widths)) + if d > 0.001 + ] + ) + if scale == "log": + plt.xscale("log") + plt.bar(values, densities, width=widths, align="edge") + plt.savefig("/tmp/plot.png") + plt.show() + + def richardson(r: float, correct_ev: bool = True): + """A decorator that applies Richardson extrapolation to a + NumericDistribution method to improve its accuracy. + + This decorator uses the following procedure (this procedure assumes a + binary operation, but it works the same for any number of arguments): + + 1. Evaluate ``z = func(x, y)`` where ``x`` and ``y`` are + ``NumericDistribution`` objects. + 2. Construct a new ``x2`` and ``y2`` which are identical to ``x`` + and ``y`` except that they use half as many bins. + 3. Evaluate ``z2 = func(x2, y2)``. + 4. Apply Richardson extrapolation: ``res = (2^r * z - z2) / (2^r - 1)`` + for some constant exponent ``r``, chosen to maximize accuracy. + + Parameters + ---------- + r : float + The (positive) exponent to use in Richardson extrapolation. This + should equal the rate at which the error shrinks as the number of + bins increases. A higher ``r`` results in slower extrapolation. + correct_ev : bool = True + If True, adjust the negative and positive EV contributions to be + exactly correct. 
            The caller should set ``correct_ev`` to False if
            the function being decorated does produce exactly correct values
            for ``(neg|pos)_contribution_to_ev``.

        Returns
        -------
        res : Callable
            A decorator function that takes a function ``func`` and returns a
            new function that applies Richardson extrapolation to ``func``.

        """

        def sum_pairs(arr):
            """Sum every pair of values in ``arr``."""
            return arr.reshape(-1, 2).sum(axis=1)

        def decorator(func):
            def inner(*hists):
                # Richardson extrapolation only applies when every operand
                # opts in.
                if not all(x.richardson_extrapolation_enabled for x in hists):
                    return func(*hists)

                # Richardson extrapolation often runs into issues due to wonky
                # bin sizing if the inputs don't have the same number of bins.
                if len(set(x.num_bins for x in hists)) != 1:
                    return func(*hists)

                # Empirically, BinSizing.ev and BinSizing.mass error shrinks at
                # a consistent rate r for all bins (except for the outermost
                # bins, which are more unpredictable). BinSizing.log_uniform
                # and BinSizing.uniform error growth rate isn't consistent
                # across bins, so Richardson extrapolation doesn't work well
                # (and often makes the result worse).
                if not all(x.bin_sizing in [BinSizing.ev, BinSizing.mass] for x in hists):
                    return func(*hists)

                # Construct half_hists as identical to hists but with half as
                # many bins
                half_hists = []
                for x in hists:
                    if len(x.masses) % 2 != 0:
                        # If the number of bins is odd, we can't halve the
                        # number of bins, so just return the original result.
                        return func(*hists)

                    # Merge adjacent bin pairs, preserving each pair's mass and
                    # EV (the merged value is the mass-weighted mean).
                    halfx_masses = sum_pairs(x.masses)
                    halfx_evs = sum_pairs(x.values * x.masses)
                    halfx_values = halfx_evs / halfx_masses
                    zero_bin_index = np.searchsorted(halfx_values, 0)
                    halfx = NumericDistribution(
                        values=halfx_values,
                        masses=halfx_masses,
                        zero_bin_index=zero_bin_index,
                        neg_ev_contribution=np.sum(
                            halfx_masses[:zero_bin_index] * -halfx_values[:zero_bin_index]
                        ),
                        pos_ev_contribution=np.sum(
                            halfx_masses[zero_bin_index:] * halfx_values[zero_bin_index:]
                        ),
                        exact_mean=x.exact_mean,
                        exact_sd=x.exact_sd,
                        min_bins_per_side=1,
                    )
                    half_hists.append(halfx)

                half_res = func(*half_hists)
                full_res = func(*hists)
                if 2 * half_res.zero_bin_index != full_res.zero_bin_index:
                    # In some edge cases, full_res has very small negative mass
                    # and half_res has no negative mass (ex: norm(mean=0, sd=2)
                    # + norm(mean=9, sd=1)), so the bins aren't lined up
                    # correctly for Richardson extrapolation. These edge cases
                    # are rare, so it's not a big deal to bail out on
                    # Richardson extrapolation in those cases.
                    return full_res

                # Pair up full_res bins so they align 1:1 with half_res bins.
                paired_full_masses = sum_pairs(full_res.masses)
                paired_full_evs = sum_pairs(full_res.values * full_res.masses)
                paired_full_values = paired_full_evs / paired_full_masses
                k = 2 ** (-r)
                # Extrapolated mass per paired bin: (k*coarse - fine)/(k - 1).
                richardson_masses = (k * half_res.masses - paired_full_masses) / (k - 1)
                if any(richardson_masses < 0):
                    # TODO: delete me when you're confident that this doesn't
                    # happen anymore
                    return full_res

                # Scale each fine bin's mass by its pair's extrapolated/actual
                # ratio (NaN ratios from zero-mass pairs are left unscaled).
                mass_adjustment = np.repeat(richardson_masses / paired_full_masses, 2)
                new_masses = full_res.masses * np.where(
                    np.isnan(mass_adjustment), 1, mass_adjustment
                )

                # method 1: adjust EV
                # full_evs = full_res.values * full_res.masses
                # half_evs = half_res.values * half_res.masses
                # richardson_evs = (k * half_evs - paired_full_evs) / (k - 1)
                # ev_adjustment = np.repeat(richardson_evs / paired_full_evs, 2)
                # new_evs = full_evs * np.where(np.isnan(ev_adjustment), 1, ev_adjustment)
                # new_values = new_evs / new_masses

                # method 2: adjust values directly...
                richardson_values = (k * half_res.values - paired_full_values) / (k - 1)
                value_adjustment = np.repeat(richardson_values / paired_full_values, 2)
                new_values = full_res.values * np.where(
                    np.isnan(value_adjustment), 1, value_adjustment
                )
                # ...then adjust EV to be exactly correct
                if correct_ev:
                    new_neg_ev_contribution = np.sum(
                        new_masses[: full_res.zero_bin_index]
                        * -new_values[: full_res.zero_bin_index]
                    )
                    new_pos_ev_contribution = np.sum(
                        new_masses[full_res.zero_bin_index :]
                        * new_values[full_res.zero_bin_index :]
                    )
                    # Rescale each side so its EV contribution matches
                    # full_res exactly (no-op if a side has zero EV).
                    new_values[: full_res.zero_bin_index] *= (
                        1
                        if new_neg_ev_contribution == 0
                        else full_res.neg_ev_contribution / new_neg_ev_contribution
                    )
                    new_values[full_res.zero_bin_index :] *= (
                        1
                        if new_pos_ev_contribution == 0
                        else full_res.pos_ev_contribution / new_pos_ev_contribution
                    )

                # Mutate full_res in place with the extrapolated bins.
                full_res.masses = new_masses
                full_res.values = new_values
                full_res.zero_bin_index = np.searchsorted(new_values, 0)
                if len(np.unique([x.bin_sizing for x in hists])) == 1:
                    full_res.bin_sizing = hists[0].bin_sizing
                return full_res

            return inner

        return decorator

    @classmethod
    def _num_bins_per_side(
        cls, num_bins, neg_contribution, pos_contribution, min_bins_per_side, allowance=0
    ):
        """Determine how many bins to allocate to the positive and negative
        sides of the distribution.

        The negative and positive sides will get a number of bins approximately
        proportional to `neg_contribution` and `pos_contribution` respectively.

        Ordinarily, a domain gets its own bin if it represents greater than `1
        / num_bins / 2` of the total contribution. But if one side of the
        distribution has less than that, it will still get one bin if it has
        greater than `allowance * 1 / num_bins / 2` of the total contribution.
        `allowance = 0` means both sides get a bin as long as they have any
        contribution.

        If one side has less than that but still nonzero contribution, that
        side will be allocated zero bins and that side's contribution will be
        dropped, which means `neg_contribution` and `pos_contribution` may need
        to be adjusted.

        Parameters
        ----------
        num_bins : int
            Total number of bins across the distribution.
        neg_contribution : float
            The total contribution of value from the negative side, using
            whatever measure of value determines bin sizing.
        pos_contribution : float
            The total contribution of value from the positive side.
        min_bins_per_side : int
            The minimum number of bins a side receives if it receives any at
            all; side bin counts are rounded to multiples of this value.
        allowance : float (default = 0)
            The fraction of ``1 / num_bins / 2`` (scaled by
            ``min_bins_per_side``) of total contribution that a side must
            exceed to be allocated any bins.

        Returns
        -------
        (num_neg_bins, num_pos_bins) : (int, int)
            Number of bins assigned to the negative/positive side of the
            distribution.

        """
        min_prop_cutoff = min_bins_per_side * allowance * 1 / num_bins / 2
        total_contribution = neg_contribution + pos_contribution
        # Proportional split, rounded to a multiple of min_bins_per_side.
        num_neg_bins = min_bins_per_side * int(
            np.round(num_bins * neg_contribution / total_contribution / min_bins_per_side)
        )
        num_pos_bins = num_bins - num_neg_bins

        # Guarantee each side with enough contribution at least
        # min_bins_per_side bins; a side below the cutoff gets none.
        if neg_contribution / total_contribution > min_prop_cutoff:
            num_neg_bins = max(min_bins_per_side, num_neg_bins)
            num_pos_bins = num_bins - num_neg_bins
        else:
            num_neg_bins = 0
            num_pos_bins = num_bins

        if pos_contribution / total_contribution > min_prop_cutoff:
            num_pos_bins = max(min_bins_per_side, num_pos_bins)
            num_neg_bins = num_bins - num_pos_bins
        else:
            num_pos_bins = 0
            num_neg_bins = num_bins

        return (num_neg_bins, num_pos_bins)

    @classmethod
    def _resize_pos_bins(
        cls,
        extended_values,
        extended_masses,
        num_bins,
        ev,
        bin_sizing=BinSizing.bin_count,
        is_sorted=False,
    ):
        """Given two arrays of values and masses representing the result of a
        binary operation on two positive-everywhere distributions, compress the
        arrays down to ``num_bins`` bins and return the new values and masses of
        the bins.

        Parameters
        ----------
        extended_values : np.ndarray
            The values of the distribution. The values must all be non-negative.
        extended_masses : np.ndarray
            The probability masses of the values.
        num_bins : int
            The number of bins to compress the distribution into.
        ev : float
            The expected value of the distribution.
        is_sorted : bool
            If True, assume that ``extended_values`` and ``extended_masses`` are
            already sorted in ascending order. This provides a significant
            performance improvement (~3x).

        Returns
        -------
        values : np.ndarray
            The values of the bins.
        masses : np.ndarray
            The probability masses of the bins.

        """
        # TODO: This whole method is messy, it could use a refactoring
        if num_bins == 0:
            return (np.array([]), np.array([]))

        # bin_evs/masses stay None unless a fast vectorized path fills them;
        # the generic per-boundary summation at the bottom is the fallback.
        bin_evs = None
        masses = None

        if bin_sizing == BinSizing.bin_count:
            if len(extended_values) == 1 and num_bins == 2:
                # Duplicate a lone value so it can fill two bins with half the
                # mass each.
                extended_values = np.repeat(extended_values, 2)
                extended_masses = np.repeat(extended_masses, 2) / 2
            elif len(extended_values) < num_bins:
                # Fewer inputs than requested bins: nothing to compress.
                return (extended_values, extended_masses)

            # Equal-count boundaries: every bin gets (roughly) the same number
            # of input values.
            boundary_indexes = np.round(
                np.linspace(0, len(extended_values), num_bins + 1)
            ).astype(int)

            if not is_sorted:
                # Partition such that the values in one bin are all less than
                # or equal to the values in the next bin. Values within bins
                # don't need to be sorted, and partitioning is ~10% faster than
                # timsort.
                partitioned_indexes = extended_values.argpartition(boundary_indexes[1:-1])
                extended_values = extended_values[partitioned_indexes]
                extended_masses = extended_masses[partitioned_indexes]

            extended_evs = extended_values * extended_masses
            if len(extended_masses) % num_bins == 0:
                # Vectorize when possible for better performance
                bin_evs = extended_evs.reshape((num_bins, -1)).sum(axis=1)
                masses = extended_masses.reshape((num_bins, -1)).sum(axis=1)

        elif bin_sizing == BinSizing.ev:
            if not is_sorted:
                sorted_indexes = extended_values.argsort(kind="mergesort")
                extended_values = extended_values[sorted_indexes]
                extended_masses = extended_masses[sorted_indexes]

            extended_evs = extended_values * extended_masses
            cumulative_evs = np.concatenate(([0], np.cumsum(extended_evs)))

            # Using cumulative_evs[-1] as the upper bound can create rounding
            # errors. For example, if there are 100 bins with equal EV,
            # boundary_evs will be slightly smaller than cumulative_evs until
            # near the end, which will duplicate the first bin and skip a bin
            # near the end. Slightly increasing the upper bound fixes this.
            upper_bound = cumulative_evs[-1] * (1 + 1e-6)

            # Bin boundaries at equal increments of cumulative EV.
            boundary_evs = np.linspace(0, upper_bound, num_bins + 1)
            boundary_indexes = np.searchsorted(cumulative_evs, boundary_evs, side="right") - 1
            # Fix bin boundaries where boundary[i] == boundary[i+1]
            if any(boundary_indexes[:-1] == boundary_indexes[1:]):
                boundary_indexes = _bump_indexes(boundary_indexes, len(extended_values))

        elif bin_sizing == BinSizing.log_uniform:
            # ``bin_count`` puts too much mass in the bins on the left and
            # right tails, but it's still more accurate than log-uniform
            # sizing, I don't know why.
            assert num_bins % 2 == 0
            assert len(extended_values) == num_bins**2

            use_pyramid_method = False
            # NOTE(review): the pyramid branch is dead code while the flag is
            # hardcoded False. If it were enabled, the ``searchsorted`` on
            # ``boundary_values`` below would raise NameError (and clobber the
            # pyramid's ``boundary_indexes``) because ``boundary_values`` is
            # only defined in the else branch — confirm before enabling.
            if use_pyramid_method:
                # method 1: size bins in a pyramid shape. this preserves
                # log-uniform bin sizing but it makes the bin widths unnecessarily
                # large
                ascending_indexes = 2 * np.array(range(num_bins // 2 + 1)) ** 2
                descending_indexes = np.flip(num_bins**2 - ascending_indexes)
                boundary_indexes = np.concatenate((ascending_indexes, descending_indexes[1:]))

            else:
                # method 2: size bins by going out a fixed number of log-standard
                # deviations in each direction
                log_mean = np.average(np.log(extended_values), weights=extended_masses)
                log_sd = np.sqrt(
                    np.average((np.log(extended_values) - log_mean) ** 2, weights=extended_masses)
                )
                scale = 6.5
                log_left_bound = log_mean - scale * log_sd
                log_right_bound = log_mean + scale * log_sd
                log_boundary_values = np.linspace(log_left_bound, log_right_bound, num_bins + 1)
                boundary_values = np.exp(log_boundary_values)

            if not is_sorted:
                sorted_indexes = extended_values.argsort(kind="mergesort")
                extended_values = extended_values[sorted_indexes]
                extended_masses = extended_masses[sorted_indexes]

            boundary_indexes = np.searchsorted(extended_values, boundary_values)

            # Compute sums one at a time instead of using ``cumsum`` because
            # ``cumsum`` produces non-trivial rounding errors.
            extended_evs = extended_values * extended_masses
        else:
            raise ValueError(f"resize_pos_bins: Unsupported bin sizing method: {bin_sizing}")

        # Generic fallback: sum EV and mass between consecutive boundaries.
        if bin_evs is None:
            bin_evs = np.array(
                [
                    np.sum(extended_evs[i:j])
                    for (i, j) in zip(boundary_indexes[:-1], boundary_indexes[1:])
                ]
            )
        if masses is None:
            masses = np.array(
                [
                    np.sum(extended_masses[i:j])
                    for (i, j) in zip(boundary_indexes[:-1], boundary_indexes[1:])
                ]
            )

        # Each bin's value is its mass-weighted mean, preserving per-bin EV.
        values = bin_evs / masses
        return (values, masses)

    @classmethod
    def _resize_bins(
        cls,
        extended_neg_values: np.ndarray,
        extended_neg_masses: np.ndarray,
        extended_pos_values: np.ndarray,
        extended_pos_masses: np.ndarray,
        num_bins: int,
        neg_ev_contribution: float,
        pos_ev_contribution: float,
        bin_sizing: BinSizing = BinSizing.bin_count,
        min_bins_per_side: int = 2,
        is_sorted: bool = False,
    ):
        """Given two arrays of values and masses representing the result of a
        binary operation on two distributions, compress the arrays down to
        ``num_bins`` bins and return the new values and masses of the bins.

        Parameters
        ----------
        extended_neg_values : np.ndarray
            The values of the negative side of the distribution. The values must
            all be negative.
        extended_neg_masses : np.ndarray
            The probability masses of the negative side of the distribution.
        extended_pos_values : np.ndarray
            The values of the positive side of the distribution. The values must
            all be positive.
        extended_pos_masses : np.ndarray
            The probability masses of the positive side of the distribution.
        num_bins : int
            The number of bins to compress the distribution into.
        neg_ev_contribution : float
            The expected value of the negative side of the distribution.
        pos_ev_contribution : float
            The expected value of the positive side of the distribution.
+ is_sorted : bool + If True, assume that ``extended_neg_values``, + ``extended_neg_masses``, ``extended_pos_values``, and + ``extended_pos_masses`` are already sorted in ascending order. This + provides a significant performance improvement (~3x). + + Returns + ------- + values : np.ndarray + The values of the bins. + masses : np.ndarray + The probability masses of the bins. + + """ + num_neg_bins, num_pos_bins = cls._num_bins_per_side( + num_bins, neg_ev_contribution, pos_ev_contribution, min_bins_per_side + ) + + total_ev = pos_ev_contribution - neg_ev_contribution + if num_neg_bins == 0: + neg_ev_contribution = 0 + pos_ev_contribution = total_ev + if num_pos_bins == 0: + neg_ev_contribution = -total_ev + pos_ev_contribution = 0 + + # Collect extended_values and extended_masses into the correct number + # of bins. Make ``extended_values`` positive because ``_resize_bins`` + # can only operate on non-negative values. Making them positive means + # they're now reverse-sorted, so reverse them. + neg_values, neg_masses = cls._resize_pos_bins( + extended_values=np.flip(-extended_neg_values), + extended_masses=np.flip(extended_neg_masses), + num_bins=num_neg_bins, + ev=neg_ev_contribution, + bin_sizing=bin_sizing, + is_sorted=is_sorted, + ) + + # ``_resize_bins`` returns positive values, so negate and reverse them. + neg_values = np.flip(-neg_values) + neg_masses = np.flip(neg_masses) + + # Collect extended_values and extended_masses into the correct number + # of bins, for the positive values this time. + pos_values, pos_masses = cls._resize_pos_bins( + extended_values=extended_pos_values, + extended_masses=extended_pos_masses, + num_bins=num_pos_bins, + ev=pos_ev_contribution, + bin_sizing=bin_sizing, + is_sorted=is_sorted, + ) + + # Construct the resulting ``NumericDistribution`` object. 
+ values = np.concatenate((neg_values, pos_values)) + masses = np.concatenate((neg_masses, pos_masses)) + return NumericDistribution( + values=values, + masses=masses, + zero_bin_index=len(neg_masses), + neg_ev_contribution=neg_ev_contribution, + pos_ev_contribution=pos_ev_contribution, + exact_mean=None, + exact_sd=None, + ) + + def __eq__(x, y): + return x.values == y.values and x.masses == y.masses + + def __add__(x, y): + if isinstance(y, Real): + return x.shift_by(y) + elif isinstance(y, ZeroNumericDistribution): + return y.__radd__(x) + elif not isinstance(y, NumericDistribution): + raise TypeError(f"Cannot add types {type(x)} and {type(y)}") + + cls = x + num_bins = max(len(x), len(y)) + + # Add every pair of values and find the joint probabilty mass for every + # sum. + extended_values = np.add.outer(x.values, y.values).reshape(-1) + extended_masses = np.outer(x.masses, y.masses).reshape(-1) + + # Sort so we can split the values into positive and negative sides. + # Use timsort (called 'mergesort' by the numpy API) because + # ``extended_values`` contains many sorted runs. And then pass + # `is_sorted` down to `_resize_bins` so it knows not to sort again. + sorted_indexes = extended_values.argsort(kind="mergesort") + extended_values = extended_values[sorted_indexes] + extended_masses = extended_masses[sorted_indexes] + zero_index = np.searchsorted(extended_values, 0) + is_sorted = True + + # Find how much of the EV contribution is on the negative side vs. the + # positive side. 
+ neg_ev_contribution = -np.sum(extended_values[:zero_index] * extended_masses[:zero_index]) + pos_ev_contribution = np.sum(extended_values[zero_index:] * extended_masses[zero_index:]) + + res = cls._resize_bins( + extended_neg_values=extended_values[:zero_index], + extended_neg_masses=extended_masses[:zero_index], + extended_pos_values=extended_values[zero_index:], + extended_pos_masses=extended_masses[zero_index:], + num_bins=num_bins, + neg_ev_contribution=neg_ev_contribution, + pos_ev_contribution=pos_ev_contribution, + is_sorted=is_sorted, + min_bins_per_side=x.min_bins_per_side, + ) + + if x.exact_mean is not None and y.exact_mean is not None: + res.exact_mean = x.exact_mean + y.exact_mean + if x.exact_sd is not None and y.exact_sd is not None: + res.exact_sd = np.sqrt(x.exact_sd**2 + y.exact_sd**2) + return res + + def shift_by(self, scalar): + """Shift the distribution over by a constant factor.""" + values = self.values + scalar + zero_bin_index = np.searchsorted(values, 0) + return NumericDistribution( + values=values, + masses=self.masses, + zero_bin_index=zero_bin_index, + neg_ev_contribution=-np.sum(values[:zero_bin_index] * self.masses[:zero_bin_index]), + pos_ev_contribution=np.sum(values[zero_bin_index:] * self.masses[zero_bin_index:]), + exact_mean=self.exact_mean + scalar if self.exact_mean is not None else None, + exact_sd=self.exact_sd, + ) + + def __neg__(self): + return NumericDistribution( + values=np.flip(-self.values), + masses=np.flip(self.masses), + zero_bin_index=len(self.values) - self.zero_bin_index, + neg_ev_contribution=self.pos_ev_contribution, + pos_ev_contribution=self.neg_ev_contribution, + exact_mean=-self.exact_mean if self.exact_mean is not None else None, + exact_sd=self.exact_sd, + ) + + def __abs__(self): + values = abs(self.values) + masses = self.masses + sorted_indexes = np.argsort(values, kind="mergesort") + values = values[sorted_indexes] + masses = masses[sorted_indexes] + return NumericDistribution( + values=values, 
+ masses=masses, + zero_bin_index=0, + neg_ev_contribution=0, + pos_ev_contribution=np.sum(values * masses), + exact_mean=None, + exact_sd=None, + ) + + def __mul__(x, y): + if isinstance(y, Real): + return x.scale_by(y) + elif isinstance(y, ZeroNumericDistribution): + return y.__rmul__(x) + elif not isinstance(y, NumericDistribution): + raise TypeError(f"Cannot add types {type(x)} and {type(y)}") + + return x._inner_mul(y) + + @richardson(r=1.5) + def _inner_mul(x, y): + cls = x + num_bins = max(len(x), len(y)) + + # If xpos is the positive part of x and xneg is the negative part, then + # resultpos = (xpos * ypos) + (xneg * yneg) and resultneg = (xpos * yneg) + (xneg * + # ypos). We perform this calculation by running these steps: + # + # 1. Multiply the four pairs of one-sided distributions, + # producing n^2 bins. + # 2. Add the two positive results and the two negative results into + # an array of positive values and an array of negative values. + # 3. Run the binning algorithm on both arrays to compress them into + # a total of n bins. + # 4. Join the two arrays into a new histogram. + xneg_values = x.values[: x.zero_bin_index] + xneg_masses = x.masses[: x.zero_bin_index] + xpos_values = x.values[x.zero_bin_index :] + xpos_masses = x.masses[x.zero_bin_index :] + yneg_values = y.values[: y.zero_bin_index] + yneg_masses = y.masses[: y.zero_bin_index] + ypos_values = y.values[y.zero_bin_index :] + ypos_masses = y.masses[y.zero_bin_index :] + + # Calculate the four products. 
+ extended_neg_values = np.concatenate( + ( + np.outer(xneg_values, ypos_values).reshape(-1), + np.outer(xpos_values, yneg_values).reshape(-1), + ) + ) + extended_neg_masses = np.concatenate( + ( + np.outer(xneg_masses, ypos_masses).reshape(-1), + np.outer(xpos_masses, yneg_masses).reshape(-1), + ) + ) + extended_pos_values = np.concatenate( + ( + np.outer(xneg_values, yneg_values).reshape(-1), + np.outer(xpos_values, ypos_values).reshape(-1), + ) + ) + extended_pos_masses = np.concatenate( + ( + np.outer(xneg_masses, yneg_masses).reshape(-1), + np.outer(xpos_masses, ypos_masses).reshape(-1), + ) + ) + + # Set the number of bins per side to be approximately proportional to + # the EV contribution, but make sure that if a side has non-trivial EV + # contribution, it gets at least one bin, even if it's less + # contribution than an average bin. + neg_ev_contribution = ( + x.neg_ev_contribution * y.pos_ev_contribution + + x.pos_ev_contribution * y.neg_ev_contribution + ) + pos_ev_contribution = ( + x.neg_ev_contribution * y.neg_ev_contribution + + x.pos_ev_contribution * y.pos_ev_contribution + ) + + res = cls._resize_bins( + extended_neg_values=extended_neg_values, + extended_neg_masses=extended_neg_masses, + extended_pos_values=extended_pos_values, + extended_pos_masses=extended_pos_masses, + num_bins=num_bins, + neg_ev_contribution=neg_ev_contribution, + pos_ev_contribution=pos_ev_contribution, + min_bins_per_side=x.min_bins_per_side, + is_sorted=False, + ) + if x.exact_mean is not None and y.exact_mean is not None: + res.exact_mean = x.exact_mean * y.exact_mean + if x.exact_sd is not None and y.exact_sd is not None: + res.exact_sd = np.sqrt( + (x.exact_sd * y.exact_mean) ** 2 + + (y.exact_sd * x.exact_mean) ** 2 + + (x.exact_sd * y.exact_sd) ** 2 + ) + return res + + def __pow__(x, y): + """Raise the distribution to a power. + + Note: x * x does not give the same result as x ** 2 because + multiplication assumes that the two distributions are independent. 
+ """ + if isinstance(y, Real) or isinstance(y, NumericDistribution): + return (x.log() * y).exp() + else: + raise TypeError(f"Cannot compute x**y for types {type(x)} and {type(y)}") + + def __rpow__(x, y): + # Compute y**x + if isinstance(y, Real): + return (x * np.log(y)).exp() + else: + raise TypeError(f"Cannot compute x**y for types {type(x)} and {type(y)}") + + def scale_by(self, scalar): + """Scale the distribution by a constant factor.""" + if scalar < 0: + return -self * -scalar + return NumericDistribution( + values=self.values * scalar, + masses=self.masses, + zero_bin_index=self.zero_bin_index, + neg_ev_contribution=self.neg_ev_contribution * scalar, + pos_ev_contribution=self.pos_ev_contribution * scalar, + exact_mean=self.exact_mean * scalar if self.exact_mean is not None else None, + exact_sd=self.exact_sd * scalar if self.exact_sd is not None else None, + ) + + def reciprocal(self): + """Return the reciprocal of the distribution. + + Warning: The result can be very inaccurate for certain distributions + and bin sizing methods. Specifically, if the distribution is fat-tailed + and does not use log-uniform bin sizing, the reciprocal will be + inaccurate for small values. Most bin sizing methods on fat-tailed + distributions maximize information at the tails in exchange for less + accuracy on [0, 1], which means the reciprocal will contain very little + information about the tails. Log-uniform bin sizing is most accurate + because it is invariant with reciprocation. + """ + values = 1 / self.values + sorted_indexes = values.argsort() + values = values[sorted_indexes] + masses = self.masses[sorted_indexes] + + # Re-calculate EV contribution manually. 
+ neg_ev_contribution = np.sum(values[: self.zero_bin_index] * masses[: self.zero_bin_index]) + pos_ev_contribution = np.sum(values[self.zero_bin_index :] * masses[self.zero_bin_index :]) + + return NumericDistribution( + values=values, + masses=masses, + zero_bin_index=self.zero_bin_index, + neg_ev_contribution=neg_ev_contribution, + pos_ev_contribution=pos_ev_contribution, + # There is no general formula for the mean and SD of the + # reciprocal of a random variable. + exact_mean=None, + exact_sd=None, + ) + + @richardson(r=0.66, correct_ev=False) + def exp(self): + """Return the exponential of the distribution.""" + # Note: This code naively sets the average value within each bin to + # e^x, which is wrong because we want E[e^X], not e^E[X]. An + # alternative method, which you might expect to be more accurate, is to + # interpolate the edges of each bin and then set each bin's value to + # the result of the integral + # + # .. math:: + # \int_{lb}^{ub} \frac{1}{ub - lb} exp(x) dx + # + # However, this method turns out to be less accurate overall (although + # it's more accurate in the tails of the distribution). + # + # Where the underlying distribution is normal, the naive e^E[X] method + # systematically underestimates expected value (by something like 0.1% + # with num_bins=200), and the integration method overestimates expected + # value by about 3x as much. Both methods mis-estimate the standard + # deviation in the same direction as they mis-estimate the mean but by + # a somewhat larger margin, with the naive method again having the + # better estimate. + # + # Another method would be not to interpolate the edge values, and + # instead record the true edge values when the numeric distribution is + # generated and carry them through mathematical operations by + # re-calculating them. But this method would be much more complicated, + # and we'd need to lazily compute the edge values to avoid a ~2x + # performance penalty. 
+ values = np.exp(self.values) + return NumericDistribution( + values=values, + masses=self.masses, + zero_bin_index=0, + neg_ev_contribution=0, + pos_ev_contribution=np.sum(values * self.masses), + exact_mean=None, + exact_sd=None, + ) + + @richardson(r=0.66, correct_ev=False) + def log(self): + """Return the natural log of the distribution.""" + # See :any:`exp`` for some discussion of accuracy. For ``log` on a + # log-normal distribution, both the naive method and the integration + # method tend to overestimate the true mean, but the naive method + # overestimates it by less. + if self.zero_bin_index != 0: + raise ValueError("Cannot take the log of a distribution with non-positive values") + + values = np.log(self.values) + return NumericDistribution( + values=values, + masses=self.masses, + zero_bin_index=np.searchsorted(values, 0), + neg_ev_contribution=np.sum( + values[: self.zero_bin_index] * self.masses[: self.zero_bin_index] + ), + pos_ev_contribution=np.sum( + values[self.zero_bin_index :] * self.masses[self.zero_bin_index :] + ), + exact_mean=None, + exact_sd=None, + ) + + def __hash__(self): + return hash(repr(self.values) + "," + repr(self.masses)) + + +class ZeroNumericDistribution(BaseNumericDistribution): + """ + A :any:`NumericDistribution` with a point mass at zero. 
+ """ + + def __init__(self, dist: NumericDistribution, zero_mass: float): + if not isinstance(dist, NumericDistribution): + raise TypeError(f"dist must be a NumericDistribution, got {type(dist)}") + if not isinstance(zero_mass, Real): + raise TypeError(f"zero_mass must be a Real, got {type(zero_mass)}") + + self._version = __version__ + self.dist = dist + self.zero_mass = zero_mass + self.nonzero_mass = 1 - zero_mass + self.exact_mean = None + self.exact_sd = None + self.exact_2nd_moment = None + + if dist.exact_mean is not None: + self.exact_mean = dist.exact_mean * self.nonzero_mass + if dist.exact_sd is not None: + nonzero_moment2 = dist.exact_mean**2 + dist.exact_sd**2 + moment2 = self.nonzero_mass * nonzero_moment2 + variance = moment2 - self.exact_mean**2 + self.exact_sd = np.sqrt(variance) + + self._neg_mass = np.sum(dist.masses[: dist.zero_bin_index]) * self.nonzero_mass + + # To be computed lazily + self.interpolate_ppf = None + + @classmethod + def wrap(cls, dist: BaseNumericDistribution, zero_mass: float): + if isinstance(dist, ZeroNumericDistribution): + return cls(dist.dist, zero_mass + dist.zero_mass * (1 - zero_mass)) + return cls(dist, zero_mass) + + def given_value_satisfies(self, condition): + nonzero_dist = self.dist.given_value_satisfies(condition) + if condition(0): + nonzero_mass = np.sum( + [x for i, x in enumerate(self.dist.masses) if condition(self.dist.values[i])] + ) + zero_mass = self.zero_mass + total_mass = nonzero_mass + zero_mass + scaled_zero_mass = zero_mass / total_mass + return ZeroNumericDistribution(nonzero_dist, scaled_zero_mass) + return nonzero_dist + + def probability_value_satisfies(self, condition): + return ( + self.dist.probability_value_satisfies(condition) * self.nonzero_mass + + condition(0) * self.zero_mass + ) + + def est_mean(self): + return self.dist.est_mean() * self.nonzero_mass + + def est_sd(self): + mean = self.mean() + nonzero_variance = ( + np.sum(self.dist.masses * (self.dist.values - mean) ** 2) * 
self.nonzero_mass + ) + zero_variance = self.zero_mass * mean**2 + variance = nonzero_variance + zero_variance + return np.sqrt(variance) + + def ppf(self, q): + if not isinstance(q, Real): + return np.array([self.ppf(x) for x in q]) + + if q < 0 or q > 1: + raise ValueError(f"q must be between 0 and 1, got {q}") + + if q <= self._neg_mass: + return self.dist.ppf(q / self.nonzero_mass) + elif q < self._neg_mass + self.zero_mass: + return 0 + else: + return self.dist.ppf((q - self.zero_mass) / self.nonzero_mass) + + def __eq__(x, y): + return x.zero_mass == y.zero_mass and x.dist == y.dist + + def __add__(x, y): + if isinstance(y, NumericDistribution): + return x + ZeroNumericDistribution(y, 0) + elif isinstance(y, Real): + return x.shift_by(y) + elif not isinstance(y, ZeroNumericDistribution): + raise ValueError(f"Cannot add types {type(x)} and {type(y)}") + nonzero_sum = (x.dist + y.dist) * x.nonzero_mass * y.nonzero_mass + extra_x = x.dist * x.nonzero_mass * y.zero_mass + extra_y = y.dist * x.zero_mass * y.nonzero_mass + zero_mass = x.zero_mass * y.zero_mass + return ZeroNumericDistribution(nonzero_sum + extra_x + extra_y, zero_mass) + + def shift_by(self, scalar): + old_zero_index = self.dist.zero_bin_index + shifted_dist = self.dist.shift_by(scalar) + scaled_masses = shifted_dist.masses * self.nonzero_mass + values = np.insert(shifted_dist.values, old_zero_index, scalar) + masses = np.insert(scaled_masses, old_zero_index, self.zero_mass) + exact_mean = None + if self.exact_mean is not None: + exact_mean = self.exact_mean + scalar + exact_sd = self.exact_sd + + return NumericDistribution( + values=values, + masses=masses, + zero_bin_index=shifted_dist.zero_bin_index, + neg_ev_contribution=shifted_dist.neg_ev_contribution * self.nonzero_mass + + min(0, -scalar) * self.zero_mass, + pos_ev_contribution=shifted_dist.pos_ev_contribution * self.nonzero_mass + + min(0, scalar) * self.zero_mass, + exact_mean=exact_mean, + exact_sd=exact_sd, + ) + + def __neg__(self): + 
return ZeroNumericDistribution(-self.dist, self.zero_mass) + + def __abs__(self): + return ZeroNumericDistribution(abs(self.dist), self.zero_mass) + + def exp(self): + # TODO: exponentiate the wrapped dist, then do something like shift_by + # to insert a 1 into the bins + return NotImplementedError + + def log(self): + raise ValueError("Cannot take the logarithm of a distribution with non-positive values") + + def __mul__(x, y): + if isinstance(y, NumericDistribution): + return x * ZeroNumericDistribution(y, 0) + if isinstance(y, Real): + return x.scale_by(y) + dist = x.dist * y.dist + nonzero_mass = x.nonzero_mass * y.nonzero_mass + return ZeroNumericDistribution(dist, 1 - nonzero_mass) + + def scale_by(self, scalar): + return ZeroNumericDistribution(self.dist.scale_by(scalar), self.zero_mass) + + def reciprocal(self): + raise ValueError( + "Reciprocal is undefined for probability distributions " + "with non-infinitesimal mass at zero" + ) + + def __hash__(self): + return 33 * hash(repr(self.zero_mass)) + hash(self.dist) + + +def numeric( + dist: Union[BaseDistribution, BaseNumericDistribution], + num_bins: Optional[int] = None, + bin_sizing: Optional[str] = None, + warn: bool = True, +): + """Create a probability mass histogram from the given distribution. + + Parameters + ---------- + dist : BaseDistribution | BaseNumericDistribution + A distribution from which to generate numeric values. If the + provided value is a :any:`BaseNumericDistribution`, simply return + it. + num_bins : Optional[int] (default = :any:``DEFAULT_NUM_BINS``) + The number of bins for the numeric distribution to use. The time to + construct a NumericDistribution is linear with ``num_bins``, and + the time to run a binary operation on two distributions with the + same number of bins is approximately quadratic with ``num_bins``. + 100 bins provides a good balance between accuracy and speed. + bin_sizing : Optional[str] + The bin sizing method to use, which affects the accuracy of the bins. 
+        If none is given, a default will be chosen from
+        :any:`DEFAULT_BIN_SIZING` based on the distribution type of ``dist``.
+        It is recommended to use the default bin sizing method most of the
+        time. See :any:`BinSizing` for a list of valid options and explanations
+        of their behavior.
+
+    warn : Optional[bool] (default = True)
+        If True, raise warnings about bins with zero mass.
+
+    Returns
+    -------
+    result : NumericDistribution | ZeroNumericDistribution
+        The generated numeric distribution that represents ``dist``.
+
+    """
+    return NumericDistribution.from_distribution(dist, num_bins, bin_sizing, warn)
diff --git a/squigglepy/samplers.py b/squigglepy/samplers.py
index e88df0b..983204f 100644
--- a/squigglepy/samplers.py
+++ b/squigglepy/samplers.py
@@ -47,6 +47,7 @@
     UniformDistribution,
     const,
 )
+from .numeric_distribution import NumericDistribution
 
 _squigglepy_internal_sample_caches = {}
 
@@ -465,13 +466,14 @@
     Returns
     -------
     int
-        A random number sampled from an pareto distribution.
+        A random number sampled from a pareto distribution.
Examples -------- >>> set_seed(42) >>> pareto_sample(1) 10.069666324736094 + """ return _simplify(_get_rng().pareto(shape, samples)) @@ -1107,6 +1109,9 @@ def run_dist(dist, pbar=None, tick=1): if is_dist(samples) or callable(samples): samples = sample(samples, n=n) + elif isinstance(dist, NumericDistribution): + samples = dist.sample(n=n) + else: raise ValueError("{} sampler not found".format(type(dist))) diff --git a/squigglepy/utils.py b/squigglepy/utils.py index 03e02dc..4124371 100644 --- a/squigglepy/utils.py +++ b/squigglepy/utils.py @@ -436,12 +436,28 @@ def get_percentiles( """ percentiles = percentiles if isinstance(percentiles, list) else [percentiles] percentile_labels = list(reversed(percentiles)) if reverse else percentiles - percentiles = np.percentile(data, percentiles) - percentiles = [_round(p, digits) for p in percentiles] + + if ( + type(data).__name__ == "NumericDistribution" + or type(data).__name__ == "ZeroNumericDistribution" + ): + if len(percentiles) == 1: + values = [data.percentile(percentiles[0])] + else: + values = data.percentile(percentiles) + else: + values = np.percentile(data, percentiles) + values = [_round(p, digits) for p in values] if len(percentile_labels) == 1: - return percentiles[0] + return values[0] else: - return dict(list(zip(percentile_labels, percentiles))) + return dict(list(zip(percentile_labels, values))) + + +def mean(x): + if type(x).__name__ == "NumericDistribution" or type(x).__name__ == "ZeroNumericDistribution": + return x.mean() + return np.mean(x) def get_log_percentiles( @@ -912,25 +928,25 @@ def kelly(my_price, market_price, deference=0, bankroll=1, resolve_date=None, cu ------- dict A dict of values specifying: - * ``my_price`` - * ``market_price`` - * ``deference`` - * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken - into account. - * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``. 
- * ``adj_delta_price`` : the absolute difference between ``adj_price`` and - ``market_price``. - * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll`` - you should bet. - * ``target`` : the target amount of money you should have invested - * ``current`` - * ``delta`` : the amount of money you should invest given what you already - have invested - * ``max_gain`` : the amount of money you would gain if you win - * ``modeled_gain`` : the expected value you would win given ``adj_price`` - * ``expected_roi`` : the expected return on investment - * ``expected_arr`` : the expected ARR given ``resolve_date`` - * ``resolve_date`` + * ``my_price`` + * ``market_price`` + * ``deference`` + * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken + into account. + * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``. + * ``adj_delta_price`` : the absolute difference between ``adj_price`` and + ``market_price``. + * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll`` + you should bet. + * ``target`` : the target amount of money you should have invested + * ``current`` + * ``delta`` : the amount of money you should invest given what you already + have invested + * ``max_gain`` : the amount of money you would gain if you win + * ``modeled_gain`` : the expected value you would win given ``adj_price`` + * ``expected_roi`` : the expected return on investment + * ``expected_arr`` : the expected ARR given ``resolve_date`` + * ``resolve_date`` Examples -------- @@ -1000,25 +1016,25 @@ def full_kelly(my_price, market_price, bankroll=1, resolve_date=None, current=0) ------- dict A dict of values specifying: - * ``my_price`` - * ``market_price`` - * ``deference`` - * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken - into account. - * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``. 
- * ``adj_delta_price`` : the absolute difference between ``adj_price`` and - ``market_price``. - * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll`` - you should bet. - * ``target`` : the target amount of money you should have invested - * ``current`` - * ``delta`` : the amount of money you should invest given what you already - have invested - * ``max_gain`` : the amount of money you would gain if you win - * ``modeled_gain`` : the expected value you would win given ``adj_price`` - * ``expected_roi`` : the expected return on investment - * ``expected_arr`` : the expected ARR given ``resolve_date`` - * ``resolve_date`` + * ``my_price`` + * ``market_price`` + * ``deference`` + * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken + into account. + * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``. + * ``adj_delta_price`` : the absolute difference between ``adj_price`` and + ``market_price``. + * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll`` + you should bet. + * ``target`` : the target amount of money you should have invested + * ``current`` + * ``delta`` : the amount of money you should invest given what you already + have invested + * ``max_gain`` : the amount of money you would gain if you win + * ``modeled_gain`` : the expected value you would win given ``adj_price`` + * ``expected_roi`` : the expected return on investment + * ``expected_arr`` : the expected ARR given ``resolve_date`` + * ``resolve_date`` Examples -------- @@ -1062,25 +1078,25 @@ def half_kelly(my_price, market_price, bankroll=1, resolve_date=None, current=0) ------- dict A dict of values specifying: - * ``my_price`` - * ``market_price`` - * ``deference`` - * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken - into account. - * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``. 
- * ``adj_delta_price`` : the absolute difference between ``adj_price`` and - ``market_price``. - * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll`` - you should bet. - * ``target`` : the target amount of money you should have invested - * ``current`` - * ``delta`` : the amount of money you should invest given what you already - have invested - * ``max_gain`` : the amount of money you would gain if you win - * ``modeled_gain`` : the expected value you would win given ``adj_price`` - * ``expected_roi`` : the expected return on investment - * ``expected_arr`` : the expected ARR given ``resolve_date`` - * ``resolve_date`` + * ``my_price`` + * ``market_price`` + * ``deference`` + * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken + into account. + * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``. + * ``adj_delta_price`` : the absolute difference between ``adj_price`` and + ``market_price``. + * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll`` + you should bet. + * ``target`` : the target amount of money you should have invested + * ``current`` + * ``delta`` : the amount of money you should invest given what you already + have invested + * ``max_gain`` : the amount of money you would gain if you win + * ``modeled_gain`` : the expected value you would win given ``adj_price`` + * ``expected_roi`` : the expected return on investment + * ``expected_arr`` : the expected ARR given ``resolve_date`` + * ``resolve_date`` Examples -------- @@ -1124,25 +1140,25 @@ def quarter_kelly(my_price, market_price, bankroll=1, resolve_date=None, current ------- dict A dict of values specifying: - * ``my_price`` - * ``market_price`` - * ``deference`` - * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken - into account. - * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``. 
- * ``adj_delta_price`` : the absolute difference between ``adj_price`` and - ``market_price``. - * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll`` - you should bet. - * ``target`` : the target amount of money you should have invested - * ``current`` - * ``delta`` : the amount of money you should invest given what you already - have invested - * ``max_gain`` : the amount of money you would gain if you win - * ``modeled_gain`` : the expected value you would win given ``adj_price`` - * ``expected_roi`` : the expected return on investment - * ``expected_arr`` : the expected ARR given ``resolve_date`` - * ``resolve_date`` + * ``my_price`` + * ``market_price`` + * ``deference`` + * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken + into account. + * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``. + * ``adj_delta_price`` : the absolute difference between ``adj_price`` and + ``market_price``. + * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll`` + you should bet. + * ``target`` : the target amount of money you should have invested + * ``current`` + * ``delta`` : the amount of money you should invest given what you already + have invested + * ``max_gain`` : the amount of money you would gain if you win + * ``modeled_gain`` : the expected value you would win given ``adj_price`` + * ``expected_roi`` : the expected return on investment + * ``expected_arr`` : the expected ARR given ``resolve_date`` + * ``resolve_date`` Examples -------- @@ -1191,3 +1207,7 @@ def extremize(p, e): return 1 - ((1 - p) ** e) else: return p**e + + +class ConvergenceWarning(RuntimeWarning): + ... 
diff --git a/tests/test_accuracy.py b/tests/test_accuracy.py new file mode 100644 index 0000000..8968d43 --- /dev/null +++ b/tests/test_accuracy.py @@ -0,0 +1,788 @@ +from functools import reduce +import numpy as np +from scipy import optimize, stats +import warnings + +from ..squigglepy.distributions import ( + GammaDistribution, + NormalDistribution, + LognormalDistribution, + mixture, +) +from ..squigglepy.numeric_distribution import numeric +from ..squigglepy import samplers + + +RUN_PRINT_ONLY_TESTS = False +"""Some tests print information but don't assert anything. This flag determines +whether to run those tests.""" + +RUN_MONTE_CARLO_TESTS = False +"""Some tests compare the accuracy of NumericDistribution to Monte Carlo. They +can occasionally fail due to Monte Carlo getting lucky, so you might not want +to run them. +""" + + +def relative_error(x, y): + if x == 0 and y == 0: + return 0 + if x == 0: + return abs(y) + if y == 0: + return abs(x) + return max(x / y, y / x) - 1 + + +def print_accuracy_ratio(x, y, extra_message=None): + ratio = relative_error(x, y) + if extra_message is not None: + extra_message += " " + else: + extra_message = "" + direction_off = "small" if x < y else "large" + if ratio > 1: + print(f"{extra_message}Ratio: {direction_off} by a factor of {ratio + 1:.1f}") + else: + print(f"{extra_message}Ratio: {direction_off} by {100 * ratio:.4f}%") + + +def fmt(x): + return f"{(100*x):.4f}%" + + +def get_mc_accuracy(exact_sd, num_samples, dists, operation): + # Run multiple trials because NumericDistribution should usually beat MC, + # but sometimes MC wins by luck. Even though NumericDistribution wins a + # large percentage of the time, this test suite does a lot of runs, so the + # chance of MC winning at least once is fairly high. 
+ mc_abs_error = [] + for i in range(10): + mcs = [samplers.sample(dist, num_samples) for dist in dists] + mc = reduce(operation, mcs) + mc_abs_error.append(abs(np.std(mc) - exact_sd)) + + mc_abs_error.sort() + + # Small numbers are good. A smaller index in mc_abs_error has a better + # accuracy + return mc_abs_error[-5] + + +def test_norm_sd_bin_sizing_accuracy(): + # Accuracy order is ev > uniform > mass + dist = NormalDistribution(mean=0, sd=1) + ev_hist = numeric(dist, bin_sizing="ev", warn=False) + mass_hist = numeric(dist, bin_sizing="mass", warn=False) + uniform_hist = numeric(dist, bin_sizing="uniform", warn=False) + + sd_errors = [ + relative_error(uniform_hist.est_sd(), dist.sd), + relative_error(ev_hist.est_sd(), dist.sd), + relative_error(mass_hist.est_sd(), dist.sd), + ] + assert all(np.diff(sd_errors) >= 0) + + +def test_norm_product_bin_sizing_accuracy(): + dist = NormalDistribution(mean=2, sd=1) + uniform_hist = numeric(dist, bin_sizing="uniform", warn=False) + uniform_hist = uniform_hist * uniform_hist + ev_hist = numeric(dist, bin_sizing="ev", warn=False) + ev_hist = ev_hist * ev_hist + mass_hist = numeric(dist, bin_sizing="mass", warn=False) + mass_hist = mass_hist * mass_hist + + # uniform and log-uniform should have small errors and the others should be + # pretty much perfect + mean_errors = np.array( + [ + relative_error(mass_hist.est_mean(), ev_hist.exact_mean), + relative_error(ev_hist.est_mean(), ev_hist.exact_mean), + relative_error(uniform_hist.est_mean(), ev_hist.exact_mean), + ] + ) + assert all(mean_errors <= 1e-6) + + sd_errors = [ + relative_error(ev_hist.est_sd(), ev_hist.exact_sd), + relative_error(mass_hist.est_sd(), ev_hist.exact_sd), + relative_error(uniform_hist.est_sd(), ev_hist.exact_sd), + ] + assert all(np.diff(sd_errors) >= 0) + + +def test_lognorm_product_bin_sizing_accuracy(): + dist = LognormalDistribution(norm_mean=np.log(1e6), norm_sd=1) + uniform_hist = numeric(dist, bin_sizing="uniform", warn=False) + uniform_hist 
= uniform_hist * uniform_hist + log_uniform_hist = numeric(dist, bin_sizing="log-uniform", warn=False) + log_uniform_hist = log_uniform_hist * log_uniform_hist + ev_hist = numeric(dist, bin_sizing="ev", warn=False) + ev_hist = ev_hist * ev_hist + mass_hist = numeric(dist, bin_sizing="mass", warn=False) + mass_hist = mass_hist * mass_hist + fat_hybrid_hist = numeric(dist, bin_sizing="fat-hybrid", warn=False) + fat_hybrid_hist = fat_hybrid_hist * fat_hybrid_hist + dist_prod = LognormalDistribution( + norm_mean=2 * dist.norm_mean, norm_sd=np.sqrt(2) * dist.norm_sd + ) + + mean_errors = np.array( + [ + relative_error(mass_hist.est_mean(), dist_prod.lognorm_mean), + relative_error(ev_hist.est_mean(), dist_prod.lognorm_mean), + relative_error(fat_hybrid_hist.est_mean(), dist_prod.lognorm_mean), + relative_error(uniform_hist.est_mean(), dist_prod.lognorm_mean), + relative_error(log_uniform_hist.est_mean(), dist_prod.lognorm_mean), + ] + ) + assert all(mean_errors <= 1e-6) + + sd_errors = [ + relative_error(fat_hybrid_hist.est_sd(), dist_prod.lognorm_sd), + relative_error(log_uniform_hist.est_sd(), dist_prod.lognorm_sd), + relative_error(ev_hist.est_sd(), dist_prod.lognorm_sd), + relative_error(mass_hist.est_sd(), dist_prod.lognorm_sd), + relative_error(uniform_hist.est_sd(), dist_prod.lognorm_sd), + ] + assert all(np.diff(sd_errors) >= 0) + + +def test_lognorm_clip_center_bin_sizing_accuracy(): + dist1 = LognormalDistribution(norm_mean=-1, norm_sd=0.5, lclip=0, rclip=1) + dist2 = LognormalDistribution(norm_mean=0, norm_sd=1, lclip=0, rclip=2 * np.e) + true_mean1 = stats.lognorm.expect( + lambda x: x, + args=(dist1.norm_sd,), + scale=np.exp(dist1.norm_mean), + lb=dist1.lclip, + ub=dist1.rclip, + conditional=True, + ) + true_sd1 = np.sqrt( + stats.lognorm.expect( + lambda x: (x - true_mean1) ** 2, + args=(dist1.norm_sd,), + scale=np.exp(dist1.norm_mean), + lb=dist1.lclip, + ub=dist1.rclip, + conditional=True, + ) + ) + true_mean2 = stats.lognorm.expect( + lambda x: x, + 
args=(dist2.norm_sd,), + scale=np.exp(dist2.norm_mean), + lb=dist2.lclip, + ub=dist2.rclip, + conditional=True, + ) + true_sd2 = np.sqrt( + stats.lognorm.expect( + lambda x: (x - true_mean2) ** 2, + args=(dist2.norm_sd,), + scale=np.exp(dist2.norm_mean), + lb=dist2.lclip, + ub=dist2.rclip, + conditional=True, + ) + ) + true_mean = true_mean1 * true_mean2 + true_sd = np.sqrt( + true_sd1**2 * true_mean2**2 + + true_mean1**2 * true_sd2**2 + + true_sd1**2 * true_sd2**2 + ) + + uniform_hist = numeric(dist1, bin_sizing="uniform", warn=False) * numeric( + dist2, bin_sizing="uniform", warn=False + ) + log_uniform_hist = numeric(dist1, bin_sizing="log-uniform", warn=False) * numeric( + dist2, bin_sizing="log-uniform", warn=False + ) + ev_hist = numeric(dist1, bin_sizing="ev", warn=False) * numeric( + dist2, bin_sizing="ev", warn=False + ) + mass_hist = numeric(dist1, bin_sizing="mass", warn=False) * numeric( + dist2, bin_sizing="mass", warn=False + ) + fat_hybrid_hist = numeric(dist1, bin_sizing="fat-hybrid", warn=False) * numeric( + dist2, bin_sizing="fat-hybrid", warn=False + ) + + mean_errors = np.array( + [ + relative_error(ev_hist.est_mean(), true_mean), + relative_error(mass_hist.est_mean(), true_mean), + relative_error(uniform_hist.est_mean(), true_mean), + relative_error(fat_hybrid_hist.est_mean(), true_mean), + relative_error(log_uniform_hist.est_mean(), true_mean), + ] + ) + assert all(mean_errors <= 1e-6) + + # Uniform does poorly in general with fat-tailed dists, but it does well + # with a center clip because most of the mass is in the center + sd_errors = [ + relative_error(mass_hist.est_mean(), true_mean), + relative_error(uniform_hist.est_sd(), true_sd), + relative_error(ev_hist.est_sd(), true_sd), + relative_error(fat_hybrid_hist.est_sd(), true_sd), + relative_error(log_uniform_hist.est_sd(), true_sd), + ] + assert all(np.diff(sd_errors) >= 0) + + +def test_lognorm_clip_tail_bin_sizing_accuracy(): + # cut off 99% of mass and 95% of mass, respectively + 
dist1 = LognormalDistribution(norm_mean=0, norm_sd=1, lclip=10) + dist2 = LognormalDistribution(norm_mean=0, norm_sd=2, rclip=27) + true_mean1 = stats.lognorm.expect( + lambda x: x, + args=(dist1.norm_sd,), + scale=np.exp(dist1.norm_mean), + lb=dist1.lclip, + ub=dist1.rclip, + conditional=True, + ) + true_sd1 = np.sqrt( + stats.lognorm.expect( + lambda x: (x - true_mean1) ** 2, + args=(dist1.norm_sd,), + scale=np.exp(dist1.norm_mean), + lb=dist1.lclip, + ub=dist1.rclip, + conditional=True, + ) + ) + true_mean2 = stats.lognorm.expect( + lambda x: x, + args=(dist2.norm_sd,), + scale=np.exp(dist2.norm_mean), + lb=dist2.lclip, + ub=dist2.rclip, + conditional=True, + ) + true_sd2 = np.sqrt( + stats.lognorm.expect( + lambda x: (x - true_mean2) ** 2, + args=(dist2.norm_sd,), + scale=np.exp(dist2.norm_mean), + lb=dist2.lclip, + ub=dist2.rclip, + conditional=True, + ) + ) + true_mean = true_mean1 * true_mean2 + true_sd = np.sqrt( + true_sd1**2 * true_mean2**2 + + true_mean1**2 * true_sd2**2 + + true_sd1**2 * true_sd2**2 + ) + + uniform_hist = numeric(dist1, bin_sizing="uniform", warn=False) * numeric( + dist2, bin_sizing="uniform", warn=False + ) + log_uniform_hist = numeric(dist1, bin_sizing="log-uniform", warn=False) * numeric( + dist2, bin_sizing="log-uniform", warn=False + ) + ev_hist = numeric(dist1, bin_sizing="ev", warn=False) * numeric( + dist2, bin_sizing="ev", warn=False + ) + mass_hist = numeric(dist1, bin_sizing="mass", warn=False) * numeric( + dist2, bin_sizing="mass", warn=False + ) + fat_hybrid_hist = numeric(dist1, bin_sizing="fat-hybrid", warn=False) * numeric( + dist2, bin_sizing="fat-hybrid", warn=False + ) + + mean_errors = np.array( + [ + relative_error(mass_hist.est_mean(), true_mean), + relative_error(uniform_hist.est_mean(), true_mean), + relative_error(ev_hist.est_mean(), true_mean), + relative_error(fat_hybrid_hist.est_mean(), true_mean), + relative_error(log_uniform_hist.est_mean(), true_mean), + ] + ) + assert all(mean_errors <= 1e-5) + + 
sd_errors = [ + relative_error(fat_hybrid_hist.est_sd(), true_sd), + relative_error(log_uniform_hist.est_sd(), true_sd), + relative_error(ev_hist.est_sd(), true_sd), + relative_error(uniform_hist.est_sd(), true_sd), + relative_error(mass_hist.est_sd(), true_sd), + ] + assert all(np.diff(sd_errors) >= 0) + + +def test_gamma_bin_sizing_accuracy(): + dist1 = GammaDistribution(shape=1, scale=5) + dist2 = GammaDistribution(shape=10, scale=1) + + uniform_hist = numeric(dist1, bin_sizing="uniform") * numeric(dist2, bin_sizing="uniform") + log_uniform_hist = numeric(dist1, bin_sizing="log-uniform") * numeric( + dist2, bin_sizing="log-uniform" + ) + ev_hist = numeric(dist1, bin_sizing="ev") * numeric(dist2, bin_sizing="ev") + mass_hist = numeric(dist1, bin_sizing="mass") * numeric(dist2, bin_sizing="mass") + fat_hybrid_hist = numeric(dist1, bin_sizing="fat-hybrid") * numeric( + dist2, bin_sizing="fat-hybrid" + ) + + true_mean = uniform_hist.exact_mean + true_sd = uniform_hist.exact_sd + + mean_errors = np.array( + [ + relative_error(mass_hist.est_mean(), true_mean), + relative_error(uniform_hist.est_mean(), true_mean), + relative_error(ev_hist.est_mean(), true_mean), + relative_error(log_uniform_hist.est_mean(), true_mean), + relative_error(fat_hybrid_hist.est_mean(), true_mean), + ] + ) + assert all(mean_errors <= 1e-6) + + sd_errors = [ + relative_error(ev_hist.est_sd(), true_sd), + relative_error(uniform_hist.est_sd(), true_sd), + relative_error(fat_hybrid_hist.est_sd(), true_sd), + relative_error(log_uniform_hist.est_sd(), true_sd), + relative_error(mass_hist.est_sd(), true_sd), + ] + assert all(np.diff(sd_errors) >= 0) + + +def test_norm_product_sd_accuracy_vs_monte_carlo(): + """Test that PMH SD is more accurate than Monte Carlo SD both for initial + distributions and when multiplying up to 8 distributions together. 
+ """ + if not RUN_MONTE_CARLO_TESTS: + return None + # Time complexity for binary operations is roughly O(n^2) for PMH and O(n) + # for MC, so let MC have num_bins^2 samples. + num_bins = 100 + num_samples = 100**2 + dists = [NormalDistribution(mean=i, sd=0.5 + i / 4) for i in range(9)] + hists = [numeric(dist, num_bins=num_bins, warn=False) for dist in dists] + hist = reduce(lambda acc, hist: acc * hist, hists) + dist_abs_error = abs(hist.est_sd() - hist.exact_sd) + + mc_abs_error = get_mc_accuracy(hist.exact_sd, num_samples, dists, lambda acc, mc: acc * mc) + assert dist_abs_error < mc_abs_error + + +def test_lognorm_product_sd_accuracy_vs_monte_carlo(): + """Test that PMH SD is more accurate than Monte Carlo SD both for initial + distributions and when multiplying up to 16 distributions together.""" + if not RUN_MONTE_CARLO_TESTS: + return None + num_bins = 100 + num_samples = 100**2 + dists = [LognormalDistribution(norm_mean=i, norm_sd=0.5 + i / 4) for i in range(9)] + hists = [numeric(dist, num_bins=num_bins, warn=False) for dist in dists] + hist = reduce(lambda acc, hist: acc * hist, hists) + dist_abs_error = abs(hist.est_sd() - hist.exact_sd) + + mc_abs_error = get_mc_accuracy(hist.exact_sd, num_samples, dists, lambda acc, mc: acc * mc) + assert dist_abs_error < mc_abs_error + + +def test_norm_sum_sd_accuracy_vs_monte_carlo(): + """Test that PMH SD is more accurate than Monte Carlo SD both for initial + distributions and when multiplying up to 8 distributions together. + + Note: With more multiplications, MC has a good chance of being more + accurate, and is significantly more accurate at 16 multiplications. 
+ """ + if not RUN_MONTE_CARLO_TESTS: + return None + num_bins = 1000 + num_samples = num_bins**2 + dists = [NormalDistribution(mean=i, sd=0.5 + i / 4) for i in range(9)] + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + hists = [numeric(dist, num_bins=num_bins, bin_sizing="uniform") for dist in dists] + hist = reduce(lambda acc, hist: acc + hist, hists) + dist_abs_error = abs(hist.est_sd() - hist.exact_sd) + + mc_abs_error = get_mc_accuracy(hist.exact_sd, num_samples, dists, lambda acc, mc: acc + mc) + assert dist_abs_error < mc_abs_error + + +def test_lognorm_sum_sd_accuracy_vs_monte_carlo(): + """Test that PMH SD is more accurate than Monte Carlo SD both for initial + distributions and when multiplying up to 16 distributions together.""" + if not RUN_MONTE_CARLO_TESTS: + return None + num_bins = 100 + num_samples = 100**2 + dists = [LognormalDistribution(norm_mean=i, norm_sd=0.5 + i / 4) for i in range(17)] + hists = [numeric(dist, num_bins=num_bins, warn=False) for dist in dists] + hist = reduce(lambda acc, hist: acc + hist, hists) + dist_abs_error = abs(hist.est_sd() - hist.exact_sd) + + mc_abs_error = get_mc_accuracy(hist.exact_sd, num_samples, dists, lambda acc, mc: acc + mc) + assert dist_abs_error < mc_abs_error + + +def test_quantile_accuracy(): + if not RUN_PRINT_ONLY_TESTS: + return None + props = np.array([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999]) + # props = np.array([0.05, 0.1, 0.25, 0.75, 0.9, 0.95, 0.99, 0.999]) + dist = LognormalDistribution(norm_mean=0, norm_sd=1) + true_quantiles = stats.lognorm.ppf(props, dist.norm_sd, scale=np.exp(dist.norm_mean)) + # dist = NormalDistribution(mean=0, sd=1) + # true_quantiles = stats.norm.ppf(props, dist.mean, dist.sd) + num_bins = 100 + num_mc_samples = num_bins**2 + + # Formula from Goodman, "Accuracy and Efficiency of Monte Carlo Method." + # https://inis.iaea.org/collection/NCLCollectionStore/_Public/19/047/19047359.pdf + # Figure 20 on page 434. 
+ mc_error = ( + np.sqrt(props * (1 - props)) + * np.sqrt(2 * np.pi) + * dist.norm_sd + * np.exp(0.5 * (np.log(true_quantiles) - dist.norm_mean) ** 2 / dist.norm_sd**2) + / np.sqrt(num_mc_samples) + ) + + print("\n") + print( + f"MC error: average {fmt(np.mean(mc_error))}, " + f"median {fmt(np.median(mc_error))}, " + f"max {fmt(np.max(mc_error))}" + ) + + for bin_sizing in ["log-uniform", "mass", "ev", "fat-hybrid"]: + # for bin_sizing in ["uniform", "mass", "ev"]: + hist = numeric(dist, bin_sizing=bin_sizing, warn=False, num_bins=num_bins) + linear_quantiles = np.interp( + props, np.cumsum(hist.masses) - 0.5 * hist.masses, hist.values + ) + hist_quantiles = hist.quantile(props) + linear_error = abs(true_quantiles - linear_quantiles) / abs(true_quantiles) + hist_error = abs(true_quantiles - hist_quantiles) / abs(true_quantiles) + print(f"\n{bin_sizing}") + print( + f"\tLinear error: average {fmt(np.mean(linear_error))}, " + f"median {fmt(np.median(linear_error))}, " + f"max {fmt(np.max(linear_error))}" + ) + print( + f"\tHist error : average {fmt(np.mean(hist_error))}, " + f"median {fmt(np.median(hist_error))}, " + f"max {fmt(np.max(hist_error))}" + ) + print( + f"\tHist / MC : average {fmt(np.mean(hist_error / mc_error))}, " + f"median {fmt(np.median(hist_error / mc_error))}, " + f"max {fmt(np.max(hist_error / mc_error))}" + ) + + +def test_quantile_product_accuracy(): + if not RUN_PRINT_ONLY_TESTS: + return None + props = np.array([0.5, 0.75, 0.9, 0.95, 0.99, 0.999]) # EV + # props = np.array([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999]) # lognorm + # props = np.array([0.05, 0.1, 0.25, 0.75, 0.9, 0.95, 0.99, 0.999]) # norm + num_bins = 200 + print("\n") + + bin_sizing = "log-uniform" + hists = [] + # for bin_sizing in ["log-uniform", "mass", "ev", "fat-hybrid"]: + for num_products in [2, 8, 32, 128, 512]: + dist1 = LognormalDistribution(norm_mean=0, norm_sd=1 / np.sqrt(num_products)) + dist = LognormalDistribution( + norm_mean=dist1.norm_mean * 
num_products, norm_sd=dist1.norm_sd * np.sqrt(num_products) + ) + true_quantiles = stats.lognorm.ppf(props, dist.norm_sd, scale=np.exp(dist.norm_mean)) + num_mc_samples = num_bins**2 + + # I'm not sure how to prove this, but empirically, it looks like the error + # for MC(x) * MC(y) is the same as the error for MC(x * y). + mc_error = ( + np.sqrt(props * (1 - props)) + * np.sqrt(2 * np.pi) + * dist.norm_sd + * np.exp(0.5 * (np.log(true_quantiles) - dist.norm_mean) ** 2 / dist.norm_sd**2) + / np.sqrt(num_mc_samples) + ) + + hist1 = numeric(dist1, bin_sizing=bin_sizing, warn=False, num_bins=num_bins) + hist = reduce(lambda acc, x: acc * x, [hist1] * num_products) + oneshot = numeric(dist, bin_sizing=bin_sizing, warn=False, num_bins=num_bins) + hist_quantiles = hist.quantile(props) + hist_error = abs(true_quantiles - hist_quantiles) / abs(true_quantiles) + oneshot_error = abs(true_quantiles - oneshot.quantile(props)) / abs(true_quantiles) + hists.append(hist) + + print(f"{num_products}") + print( + f"\tHist error : average {fmt(np.mean(hist_error))}, " + f"median {fmt(np.median(hist_error))}, " + f"max {fmt(np.max(hist_error))}" + ) + print( + f"\tHist / MC : average {fmt(np.mean(hist_error / mc_error))}, " + f"median {fmt(np.median(hist_error / mc_error))}, " + f"max {fmt(np.max(hist_error / mc_error))}" + ) + print( + f"\tHist / 1shot: average {fmt(np.mean(hist_error / oneshot_error))}, " + f"median {fmt(np.median(hist_error / oneshot_error))}, " + f"max {fmt(np.max(hist_error / oneshot_error))}" + ) + + +def test_individual_bin_accuracy(): + if not RUN_PRINT_ONLY_TESTS: + return None + num_bins = 200 + bin_sizing = "ev" + print("") + bin_errs = [] + num_products = 16 + bin_sizes = 40 * np.arange(1, 11) + for num_bins in bin_sizes: + operation = "mul" + if operation == "mul": + true_dist_type = "lognorm" + true_dist = LognormalDistribution(norm_mean=0, norm_sd=1) + dist1 = LognormalDistribution(norm_mean=0, norm_sd=1 / np.sqrt(num_products)) + hist1 = numeric(dist1, 
bin_sizing=bin_sizing, num_bins=num_bins, warn=False) + hist = reduce(lambda acc, x: acc * x, [hist1] * num_products) + elif operation == "add": + true_dist_type = "norm" + true_dist = NormalDistribution(mean=0, sd=1) + dist1 = NormalDistribution(mean=0, sd=1 / np.sqrt(num_products)) + hist1 = numeric(dist1, bin_sizing=bin_sizing, num_bins=num_bins, warn=False) + hist = reduce(lambda acc, x: acc + x, [hist1] * num_products) + elif operation == "exp": + true_dist_type = "lognorm" + true_dist = LognormalDistribution(norm_mean=0, norm_sd=1) + dist1 = NormalDistribution(mean=0, sd=1) + hist1 = numeric(dist1, bin_sizing=bin_sizing, num_bins=num_bins, warn=False) + hist = hist1.exp() + + cum_mass = np.cumsum(hist.masses) + cum_cev = np.cumsum(hist.masses * abs(hist.values)) + cum_cev_frac = cum_cev / cum_cev[-1] + if true_dist_type == "lognorm": + expected_cum_mass = stats.lognorm.cdf( + true_dist.inv_contribution_to_ev(cum_cev_frac), + true_dist.norm_sd, + scale=np.exp(true_dist.norm_mean), + ) + elif true_dist_type == "norm": + expected_cum_mass = stats.norm.cdf( + true_dist.inv_contribution_to_ev(cum_cev_frac), true_dist.mean, true_dist.sd + ) + + # Take only every nth value where n = num_bins/40 + cum_mass = cum_mass[:: num_bins // 40] + expected_cum_mass = expected_cum_mass[:: num_bins // 40] + bin_errs.append(abs(cum_mass - expected_cum_mass) / expected_cum_mass) + + bin_errs = np.array(bin_errs) + + best_fits = [] + for i in range(40): + try: + best_fit = optimize.curve_fit( + lambda x, a, r: a * x**r, bin_sizes, bin_errs[:, i], p0=[1, 2] + )[0] + best_fits.append(best_fit) + print(f"{i:2d} {best_fit[0]:9.3f} * x ^ {best_fit[1]:.3f}") + except RuntimeError: + # optimal parameters not found + print(f"{i:2d} ? 
?") + + print("") + print(f"Average: {np.mean(best_fits, axis=0)}\nMedian: {np.median(best_fits, axis=0)}") + + meta_fit = np.polynomial.polynomial.Polynomial.fit( + np.array(range(len(best_fits))) / len(best_fits), np.array(best_fits)[:, 1], 2 + ) + print(f"\nMeta fit: {meta_fit}") + + +def test_richardson_product(): + if not RUN_PRINT_ONLY_TESTS: + return None + print("") + num_bins = 200 + num_products = 2 + bin_sizing = "ev" + # mixture_ratio = [0.035, 0.965] + mixture_ratio = [0, 1] + # mixture_ratio = [0.3, 0.7] + bin_sizes = 40 * np.arange(1, 11) + product_nums = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024] + err_rates = [] + for num_products in product_nums: + # for num_bins in bin_sizes: + true_mixture_ratio = reduce( + lambda acc, x: (acc[0] * x[1] + acc[1] * x[0], acc[0] * x[0] + acc[1] * x[1]), + [(mixture_ratio) for _ in range(num_products)], + ) + one_sided_dist = LognormalDistribution(norm_mean=0, norm_sd=1) + true_dist = mixture([-one_sided_dist, one_sided_dist], true_mixture_ratio) + true_hist = numeric(true_dist, bin_sizing=bin_sizing, num_bins=num_bins, warn=False) + one_sided_dist1 = LognormalDistribution(norm_mean=0, norm_sd=1 / np.sqrt(num_products)) + dist1 = mixture([-one_sided_dist1, one_sided_dist1], mixture_ratio) + hist1 = numeric(dist1, bin_sizing=bin_sizing, num_bins=num_bins, warn=False) + hist = reduce(lambda acc, x: acc * x, [hist1] * num_products) + + test_mode = "ppf" + if test_mode == "cev": + true_answer = ( + one_sided_dist.contribution_to_ev( + stats.lognorm.ppf( + 2 * hist.masses[50:100].sum(), + one_sided_dist.norm_sd, + scale=np.exp(one_sided_dist.norm_mean), + ), + False, + ) + / 2 + ) + est_answer = (hist.masses * abs(hist.values))[50:100].sum() + print_accuracy_ratio(est_answer, true_answer, f"CEV({num_products:3d})") + elif test_mode == "sd": + mcs = [samplers.sample(dist, num_bins**2) for dist in [dist1] * num_products] + true_answer = true_hist.exact_sd + est_answer = hist.est_sd() + print_accuracy_ratio(est_answer, 
true_answer, f"SD({num_products:3d}, {num_bins:3d})") + mc = reduce(lambda acc, x: acc * x, mcs) + mc_answer = np.std(mc) + print_accuracy_ratio(mc_answer, true_answer, f"MC({num_products:3d}, {num_bins:3d})") + err_rates.append(abs(est_answer - true_answer)) + elif test_mode == "ppf": + fracs = [0.5, 0.75, 0.9, 0.97, 0.99] + frac_errs = [] + # mc_errs = [] + for frac in fracs: + true_answer = stats.lognorm.ppf( + (frac - true_mixture_ratio[0]) / true_mixture_ratio[1], + one_sided_dist.norm_sd, + scale=np.exp(one_sided_dist.norm_mean), + ) + est_answer = hist.ppf(frac) + frac_errs.append(abs(est_answer - true_answer) / true_answer) + median_err = np.median(frac_errs) + print(f"ppf ({num_products:3d}, {num_bins:3d}): {median_err * 100:.3f}%") + err_rates.append(median_err) + + if num_bins == bin_sizes[-1]: + best_fit = optimize.curve_fit(lambda x, a, r: a * x**r, bin_sizes, err_rates, p0=[1, 2])[ + 0 + ] + print(f"\nBest fit: {best_fit}") + else: + best_fit = optimize.curve_fit( + lambda x, a, r: a * x**r, product_nums, err_rates, p0=[1, 2] + )[0] + print(f"\nBest fit: {best_fit}") + + +def test_richardson_sum(): + if not RUN_PRINT_ONLY_TESTS: + return None + print("") + num_bins = 200 + num_sums = 2 + bin_sizing = "ev" + bin_sizes = 40 * np.arange(1, 11) + err_rates = [] + # for num_sums in [2, 4, 8, 16, 32, 64]: + for num_bins in bin_sizes: + true_dist = NormalDistribution(mean=0, sd=1) + true_hist = numeric(true_dist, bin_sizing=bin_sizing, num_bins=num_bins, warn=False) + dist1 = NormalDistribution(mean=0, sd=1 / np.sqrt(num_sums)) + hist1 = numeric(dist1, bin_sizing=bin_sizing, num_bins=num_bins, warn=False) + hist = reduce(lambda acc, x: acc + x, [hist1] * num_sums) + + test_mode = "ppf" + if test_mode == "cev": + true_answer = ( + true_dist.contribution_to_ev( + stats.lognorm.ppf( + 2 * hist.masses[50:100].sum(), + true_dist.norm_sd, + scale=np.exp(true_dist.norm_mean), + ), + False, + ) + / 2 + ) + est_answer = (hist.masses * abs(hist.values))[50:100].sum() 
+ print_accuracy_ratio(est_answer, true_answer, f"CEV({num_sums:3d})") + elif test_mode == "sd": + true_answer = true_hist.exact_sd + est_answer = hist.est_sd() + print_accuracy_ratio(est_answer, true_answer, f"SD({num_sums}, {num_bins:3d})") + err_rates.append(abs(est_answer - true_answer)) + elif test_mode == "ppf": + fracs = [0.75, 0.9, 0.95, 0.98, 0.99] + frac_errs = [] + for frac in fracs: + true_answer = stats.norm.ppf(frac, true_dist.mean, true_dist.sd) + est_answer = hist.ppf(frac) + frac_errs.append(abs(est_answer - true_answer) / true_answer) + median_err = np.median(frac_errs) + print(f"ppf ({num_sums:3d}, {num_bins:3d}): {median_err * 100:.3f}%") + err_rates.append(median_err) + + if len(err_rates) == len(bin_sizes): + best_fit = optimize.curve_fit(lambda x, a, r: a * x**r, bin_sizes, err_rates, p0=[1, 2])[ + 0 + ] + print(f"\nBest fit: {best_fit}") + + +def test_richardson_exp(): + if not RUN_PRINT_ONLY_TESTS: + return None + print("") + bin_sizing = "ev" + bin_sizes = 200 * np.arange(1, 11) + err_rates = [] + for num_bins in bin_sizes: + true_dist = LognormalDistribution(norm_mean=0, norm_sd=1) + true_hist = numeric(true_dist, bin_sizing=bin_sizing, num_bins=num_bins, warn=False) + dist1 = NormalDistribution(mean=0, sd=1) + hist1 = numeric(dist1, bin_sizing=bin_sizing, num_bins=num_bins, warn=False) + hist = hist1.exp() + + test_mode = "sd" + if test_mode == "sd": + true_answer = true_hist.exact_sd + est_answer = hist.est_sd() + print_accuracy_ratio(est_answer, true_answer, f"SD({num_bins:3d})") + err_rates.append(abs(est_answer - true_answer)) + elif test_mode == "ppf": + fracs = [0.5, 0.75, 0.9, 0.97, 0.99] + frac_errs = [] + for frac in fracs: + true_answer = stats.lognorm.ppf( + frac, true_dist.norm_sd, scale=np.exp(true_dist.norm_mean) + ) + est_answer = hist.ppf(frac) + frac_errs.append(abs(est_answer - true_answer) / true_answer) + median_err = np.median(frac_errs) + print(f"ppf ({num_bins:4d}): {median_err * 100:.5f}%") + 
err_rates.append(median_err) + + if len(err_rates) == len(bin_sizes): + best_fit = optimize.curve_fit(lambda x, a, r: a * x**r, bin_sizes, err_rates, p0=[1, 2])[ + 0 + ] + print(f"\nBest fit: {best_fit}") diff --git a/tests/test_contribution_to_ev.py b/tests/test_contribution_to_ev.py new file mode 100644 index 0000000..ec988ff --- /dev/null +++ b/tests/test_contribution_to_ev.py @@ -0,0 +1,225 @@ +import hypothesis.strategies as st +import numpy as np +from pytest import approx +from scipy import stats +from hypothesis import assume, example, given, settings + +from ..squigglepy.distributions import ( + BetaDistribution, + GammaDistribution, + LognormalDistribution, + NormalDistribution, + ParetoDistribution, + UniformDistribution, +) + + +@given( + norm_mean=st.floats(min_value=np.log(0.01), max_value=np.log(1e6)), + norm_sd=st.floats(min_value=0.1, max_value=2.5), + ev_fraction=st.floats(min_value=0.01, max_value=0.99), +) +@settings(max_examples=1000) +def test_lognorm_inv_contribution_to_ev_inverts_contribution_to_ev( + norm_mean, norm_sd, ev_fraction +): + dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd) + assert dist.contribution_to_ev(dist.inv_contribution_to_ev(ev_fraction)) == approx( + ev_fraction, 2e-5 / ev_fraction + ) + + +def test_lognorm_basic(): + dist = LognormalDistribution(lognorm_mean=2, lognorm_sd=1.0) + ev_fraction = 0.25 + assert dist.contribution_to_ev(dist.inv_contribution_to_ev(ev_fraction)) == approx(ev_fraction) + + +def test_norm_basic(): + dist = NormalDistribution(mean=100, sd=11.97) + assert dist.contribution_to_ev(-1e-14) >= 0 + + +@given( + mu=st.floats(min_value=-10, max_value=10), + sigma=st.floats(min_value=0.01, max_value=100), +) +@example(mu=0, sigma=1) +def test_norm_contribution_to_ev(mu, sigma): + dist = NormalDistribution(mean=mu, sd=sigma) + + assert dist.contribution_to_ev(mu + 99 * sigma) == approx(1) + assert dist.contribution_to_ev(mu - 99 * sigma) == approx(0) + + # midpoint represents less than 
half the EV if mu > 0 b/c the larger + # values are weighted heavier, and vice versa if mu < 0 + if mu > 1e-6: + assert dist.contribution_to_ev(mu) < 0.5 + elif mu < -1e-6: + assert dist.contribution_to_ev(mu) > 0.5 + elif mu == 0: + assert dist.contribution_to_ev(mu) == approx(0.5) + + # contribution_to_ev should be monotonic + assert dist.contribution_to_ev(mu - 2 * sigma) < dist.contribution_to_ev(mu - 1 * sigma) + assert dist.contribution_to_ev(mu - sigma) < dist.contribution_to_ev(mu) + assert dist.contribution_to_ev(mu) < dist.contribution_to_ev(mu + sigma) + assert dist.contribution_to_ev(mu + sigma) < dist.contribution_to_ev(mu + 2 * sigma) + + +@given( + mu=st.floats(min_value=-10, max_value=10), + sigma=st.floats(min_value=0.01, max_value=10), +) +def test_norm_inv_contribution_to_ev(mu, sigma): + dist = NormalDistribution(mean=mu, sd=sigma) + + assert dist.inv_contribution_to_ev(1 - 1e-9) > mu + 3 * sigma + assert dist.inv_contribution_to_ev(1e-9) < mu - 3 * sigma + + # midpoint represents less than half the EV if mu > 0 b/c the larger + # values are weighted heavier, and vice versa if mu < 0 + if mu > 1e-6: + assert dist.inv_contribution_to_ev(0.5) > mu + elif mu < -1e-6: + assert dist.inv_contribution_to_ev(0.5) < mu + elif mu == 0: + assert dist.inv_contribution_to_ev(0.5) == approx(mu) + + # inv_contribution_to_ev should be monotonic + assert dist.inv_contribution_to_ev(0.05) < dist.inv_contribution_to_ev(0.25) + assert dist.inv_contribution_to_ev(0.25) < dist.inv_contribution_to_ev(0.5) + assert dist.inv_contribution_to_ev(0.5) < dist.inv_contribution_to_ev(0.75) + assert dist.inv_contribution_to_ev(0.75) < dist.inv_contribution_to_ev(0.95) + + +@given( + mu=st.floats(min_value=-10, max_value=10), + sigma=st.floats(min_value=0.01, max_value=10), + ev_fraction=st.floats(min_value=0.0001, max_value=0.9999), +) +def test_norm_inv_contribution_to_ev_inverts_contribution_to_ev(mu, sigma, ev_fraction): + dist = NormalDistribution(mean=mu, sd=sigma) + 
    assert dist.contribution_to_ev(dist.inv_contribution_to_ev(ev_fraction)) == approx(
        ev_fraction, abs=1e-8
    )


def test_uniform_contribution_to_ev_basic():
    # Endpoints contribute 0%/100% of EV; the midpoint of a symmetric
    # uniform contributes exactly half.
    dist = UniformDistribution(-1, 1)
    assert dist.contribution_to_ev(-1) == approx(0)
    assert dist.contribution_to_ev(1) == approx(1)
    assert dist.contribution_to_ev(0) == approx(0.5)


@given(prop=st.floats(min_value=0, max_value=1))
def test_standard_uniform_contribution_to_ev(prop):
    # For U(0, 1), the unnormalized contribution up to x is the integral of
    # t dt from 0 to x = x^2 / 2.
    dist = UniformDistribution(0, 1)
    assert dist.contribution_to_ev(prop, False) == approx(0.5 * prop**2)


@given(
    a=st.floats(min_value=-10, max_value=10),
    b=st.floats(min_value=-10, max_value=10),
)
def test_uniform_contribution_to_ev(a, b):
    if a > b:
        a, b = b, a
    assume(abs(a - b) > 1e-4)
    dist = UniformDistribution(x=a, y=b)
    # Normalized contribution is 0 at (or below) the left edge and 1 at (or
    # above) the right edge.
    assert dist.contribution_to_ev(a) == approx(0)
    assert dist.contribution_to_ev(b) == approx(1)
    assert dist.contribution_to_ev(a - 1) == approx(0)
    assert dist.contribution_to_ev(b + 1) == approx(1)

    assert dist.contribution_to_ev(a, normalized=False) == approx(0)
    if not (a < 0 and b > 0):
        # Same-sign support: total unnormalized contribution is |mean|.
        assert dist.contribution_to_ev(b, normalized=False) == approx(abs(a + b) / 2)
    else:
        # Support straddles zero: positive and negative parts add in
        # absolute value.
        total_contribution = (a**2 + b**2) / 2 / (b - a)
        assert dist.contribution_to_ev(b, normalized=False) == approx(total_contribution)


@given(
    a=st.floats(min_value=-10, max_value=10),
    b=st.floats(min_value=-10, max_value=10),
    prop=st.floats(min_value=0, max_value=1),
)
def test_uniform_inv_contribution_to_ev_inverts_contribution_to_ev(a, b, prop):
    if a > b:
        a, b = b, a
    assume(abs(a - b) > 1e-4)
    dist = UniformDistribution(x=a, y=b)
    assert dist.contribution_to_ev(dist.inv_contribution_to_ev(prop)) == approx(prop)


@given(
    ab=st.floats(min_value=0.01, max_value=10),
)
@example(ab=1)
def test_beta_contribution_to_ev_basic(ab):
    # Symmetric Beta(ab, ab) has mean 0.5, so the total unnormalized
    # contribution equals 0.5.
    dist = BetaDistribution(ab, ab)
    assert dist.contribution_to_ev(0) == approx(0)
    assert dist.contribution_to_ev(1) == approx(1)
    assert dist.contribution_to_ev(1, normalized=False) == approx(0.5)
    assert dist.contribution_to_ev(0.5) > 0
    assert dist.contribution_to_ev(0.5) <= 0.5


@given(
    a=st.floats(min_value=0.5, max_value=10),
    b=st.floats(min_value=0.5, max_value=10),
    fraction=st.floats(min_value=0, max_value=1),
)
def test_beta_inv_contribution_ev_inverts_contribution_to_ev(a, b, fraction):
    # Note: The answers do become a bit off for small fractional values of a, b
    dist = BetaDistribution(a, b)
    tolerance = 1e-6 if a < 1 or b < 1 else 1e-8
    assert dist.contribution_to_ev(dist.inv_contribution_to_ev(fraction)) == approx(
        fraction, rel=tolerance
    )


@given(
    shape=st.floats(min_value=0.1, max_value=100),
    scale=st.floats(min_value=0.1, max_value=100),
    x=st.floats(min_value=0, max_value=100),
)
def test_gamma_contribution_to_ev_basic(shape, scale, x):
    dist = GammaDistribution(shape, scale)
    assert dist.contribution_to_ev(0) == approx(0)
    # Monotone nondecreasing (allowing both to be exactly 0 far in the tail).
    assert dist.contribution_to_ev(1) < dist.contribution_to_ev(2) or (
        dist.contribution_to_ev(1) == 0 and dist.contribution_to_ev(2) == 0
    )
    if shape * scale <= 1:
        # Contribution up to the mean is bounded by the CDF at the mean when
        # the mean is <= 1.
        assert dist.contribution_to_ev(shape * scale, normalized=False) < stats.gamma.cdf(
            shape * scale, shape, scale=scale
        )
    # NOTE(review): original indentation was lost in this chunk; this final
    # assert is reconstructed as outside the `if` above (it holds for all
    # shape/scale) — confirm against the repository copy.
    assert dist.contribution_to_ev(100 * shape * scale, normalized=False) == approx(
        shape * scale, rel=1e-3
    )


@given(
    shape=st.floats(min_value=0.1, max_value=100),
    scale=st.floats(min_value=0.1, max_value=100),
    fraction=st.floats(min_value=0, max_value=1 - 1e-6),
)
@example(shape=1, scale=2, fraction=0.5)
def test_gamma_inv_contribution_ev_inverts_contribution_to_ev(shape, scale, fraction):
    dist = GammaDistribution(shape, scale)
    tolerance = 1e-6 if shape < 1 or scale < 1 else 1e-8
    assert dist.contribution_to_ev(dist.inv_contribution_to_ev(fraction)) == approx(
        fraction, rel=tolerance
    )


@given(
    shape=st.floats(min_value=1.1, max_value=100),
    fraction=st.floats(min_value=0, max_value=1 - 1e-6),
)
def test_pareto_inv_contribution_to_ev_inverts_contribution_to_ev(shape, fraction):
    dist = ParetoDistribution(shape)
    assert dist.contribution_to_ev(dist.inv_contribution_to_ev(fraction)) == approx(
        fraction, rel=1e-8
    )
diff --git a/tests/test_numeric_distribution.py b/tests/test_numeric_distribution.py
new file mode 100644
index 0000000..973f624
--- /dev/null
+++ b/tests/test_numeric_distribution.py
@@ -0,0 +1,1678 @@
from functools import reduce
from hypothesis import assume, example, given, settings
import hypothesis.strategies as st
import numpy as np
import operator
from pytest import approx
from scipy import integrate, stats
import sys
from unittest.mock import patch, Mock
import warnings

from ..squigglepy.distributions import (
    BernoulliDistribution,
    BetaDistribution,
    ChiSquareDistribution,
    ComplexDistribution,
    ConstantDistribution,
    ExponentialDistribution,
    GammaDistribution,
    LognormalDistribution,
    MixtureDistribution,
    NormalDistribution,
    ParetoDistribution,
    PERTDistribution,
    UniformDistribution,
    mixture,
)
from ..squigglepy.numeric_distribution import numeric, NumericDistribution, _bump_indexes
from ..squigglepy import utils

# There are a lot of functions testing various combinations of behaviors with
# no obvious way to order them. These functions are ordered basically like this:
#
# 1. helper functions
# 2. tests for constructors for norm and lognorm
# 3. tests for basic operations on norm and lognorm, in the order
#    product > sum > negation > subtraction
# 4. tests for other distributions
# 5. tests for non-operation functions such as cdf/quantile
# 6. special tests, such as profiling tests
#
# Tests with `basic` in the name use hard-coded values to ensure basic
# functionality. Other tests use values generated by the hypothesis library.
+ + +def relative_error(x, y): + if x == 0 and y == 0: + return 0 + if x == 0: + return abs(y) + if y == 0: + return abs(x) + return max(x / y, y / x) - 1 + + +def fix_ordering(a, b): + """ + Check that a and b are ordered correctly and that they're not tiny enough + to mess up floating point calculations. + """ + if a > b: + a, b = b, a + assume(a != b) + assume(((b - a) / (50 * (abs(a) + abs(b)))) ** 2 > sys.float_info.epsilon) + assume(a == 0 or abs(a) > sys.float_info.epsilon) + assume(b == 0 or abs(b) > sys.float_info.epsilon) + return a, b + + +@given( + mean1=st.floats(min_value=-1e5, max_value=1e5), + mean2=st.floats(min_value=-1e5, max_value=1e5), + sd1=st.floats(min_value=0.1, max_value=100), + sd2=st.floats(min_value=0.001, max_value=1000), +) +@example(mean1=0, mean2=-8, sd1=1, sd2=1) +def test_sum_exact_summary_stats(mean1, mean2, sd1, sd2): + """Test that the formulas for exact moments are implemented correctly.""" + dist1 = NormalDistribution(mean=mean1, sd=sd1) + dist2 = NormalDistribution(mean=mean2, sd=sd2) + hist1 = numeric(dist1, warn=False) + hist2 = numeric(dist2, warn=False) + hist_prod = hist1 + hist2 + assert hist_prod.exact_mean == approx( + stats.norm.mean(mean1 + mean2, np.sqrt(sd1**2 + sd2**2)) + ) + assert hist_prod.exact_sd == approx( + stats.norm.std( + mean1 + mean2, + np.sqrt(sd1**2 + sd2**2), + ) + ) + + +@given( + norm_mean1=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)), + norm_mean2=st.floats(min_value=-np.log(1e5), max_value=np.log(1e5)), + norm_sd1=st.floats(min_value=0.1, max_value=3), + norm_sd2=st.floats(min_value=0.001, max_value=3), +) +def test_lognorm_product_exact_summary_stats(norm_mean1, norm_mean2, norm_sd1, norm_sd2): + """Test that the formulas for exact moments are implemented correctly.""" + dist1 = LognormalDistribution(norm_mean=norm_mean1, norm_sd=norm_sd1) + dist2 = LognormalDistribution(norm_mean=norm_mean2, norm_sd=norm_sd2) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + 
hist1 = numeric(dist1, warn=False) + hist2 = numeric(dist2, warn=False) + hist_prod = hist1 * hist2 + assert hist_prod.exact_mean == approx( + stats.lognorm.mean( + np.sqrt(norm_sd1**2 + norm_sd2**2), scale=np.exp(norm_mean1 + norm_mean2) + ) + ) + assert hist_prod.exact_sd == approx( + stats.lognorm.std( + np.sqrt(norm_sd1**2 + norm_sd2**2), scale=np.exp(norm_mean1 + norm_mean2) + ) + ) + + +@given( + mean=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)), + sd=st.floats(min_value=0.001, max_value=100), +) +def test_norm_basic(mean, sd): + dist = NormalDistribution(mean=mean, sd=sd) + hist = numeric(dist, bin_sizing="uniform", warn=False) + assert hist.est_mean() == approx(mean) + assert hist.est_sd() == approx(sd, rel=0.01) + + +@given( + norm_mean=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)), + norm_sd=st.floats(min_value=0.001, max_value=3), + bin_sizing=st.sampled_from(["uniform", "log-uniform", "ev", "mass"]), +) +@example(norm_mean=0, norm_sd=1, bin_sizing="log-uniform") +def test_lognorm_mean(norm_mean, norm_sd, bin_sizing): + dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd) + hist = numeric(dist, bin_sizing=bin_sizing, warn=False) + if bin_sizing == "ev": + tolerance = 1e-6 + elif bin_sizing == "log-uniform": + tolerance = 1e-2 + else: + tolerance = 0.01 if dist.norm_sd < 3 else 0.1 + assert hist.est_mean() == approx( + stats.lognorm.mean(dist.norm_sd, scale=np.exp(dist.norm_mean)), + rel=tolerance, + ) + + +@given( + norm_mean=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)), + norm_sd=st.floats(min_value=0.01, max_value=3), +) +def test_lognorm_sd(norm_mean, norm_sd): + dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd) + hist = numeric(dist, bin_sizing="log-uniform", warn=False) + + def true_variance(left, right): + return integrate.quad( + lambda x: (x - dist.lognorm_mean) ** 2 + * stats.lognorm.pdf(x, dist.norm_sd, scale=np.exp(dist.norm_mean)), + left, + right, + )[0] + + def 
observed_variance(left, right): + return np.sum(hist.masses[left:right] * (hist.values[left:right] - hist.est_mean()) ** 2) + + assert hist.est_sd() == approx(dist.lognorm_sd, rel=0.01 + 0.1 * norm_sd) + + +@given( + mean=st.floats(min_value=-10, max_value=10), + sd=st.floats(min_value=0.01, max_value=10), + clip_zscore=st.floats(min_value=-4, max_value=4), +) +@example(mean=0.25, sd=6, clip_zscore=0) +def test_norm_one_sided_clip(mean, sd, clip_zscore): + # TODO: changing allowance from 0 to 0.25 made this start failing. The + # problem is that deleting a bit of mass on one side is shifting the EV by + # more than the tolerance. + tolerance = 1e-3 if abs(clip_zscore) > 3 else 1e-5 + clip = mean + clip_zscore * sd + dist = NormalDistribution(mean=mean, sd=sd, lclip=clip) + hist = numeric(dist, warn=False) + # The exact mean can still be a bit off because uniform bin_sizing doesn't + # cover the full domain + assert hist.exact_mean == approx( + stats.truncnorm.mean(clip_zscore, np.inf, loc=mean, scale=sd), rel=1e-5, abs=1e-9 + ) + + assert hist.est_mean() == approx( + stats.truncnorm.mean(clip_zscore, np.inf, loc=mean, scale=sd), rel=tolerance, abs=tolerance + ) + + dist = NormalDistribution(mean=mean, sd=sd, rclip=clip) + hist = numeric(dist, warn=False) + assert hist.est_mean() == approx( + stats.truncnorm.mean(-np.inf, clip_zscore, loc=mean, scale=sd), + rel=tolerance, + abs=tolerance, + ) + assert hist.exact_mean == approx( + stats.truncnorm.mean(-np.inf, clip_zscore, loc=mean, scale=sd), rel=1e-5, abs=1e-6 + ) + + +@given( + mean=st.floats(min_value=-1, max_value=1), + sd=st.floats(min_value=0.01, max_value=10), + lclip_zscore=st.floats(min_value=-4, max_value=4), + rclip_zscore=st.floats(min_value=-4, max_value=4), +) +def test_norm_clip(mean, sd, lclip_zscore, rclip_zscore): + tolerance = 1e-3 if max(abs(lclip_zscore), abs(rclip_zscore)) > 3 else 1e-5 + if lclip_zscore > rclip_zscore: + lclip_zscore, rclip_zscore = rclip_zscore, lclip_zscore + 
assume(abs(rclip_zscore - lclip_zscore) > 0.01) + lclip = mean + lclip_zscore * sd + rclip = mean + rclip_zscore * sd + dist = NormalDistribution(mean=mean, sd=sd, lclip=lclip, rclip=rclip) + hist = numeric(dist, warn=False) + + assert hist.est_mean() == approx( + stats.truncnorm.mean(lclip_zscore, rclip_zscore, loc=mean, scale=sd), rel=tolerance + ) + assert hist.est_mean() == approx(hist.exact_mean, rel=tolerance) + assert hist.exact_mean == approx( + stats.truncnorm.mean(lclip_zscore, rclip_zscore, loc=mean, scale=sd), rel=1e-6, abs=1e-10 + ) + assert hist.exact_sd == approx( + stats.truncnorm.std(lclip_zscore, rclip_zscore, loc=mean, scale=sd), rel=1e-6, abs=1e-10 + ) + + +@given( + a=st.floats(-100, -1), + b=st.floats(1, 100), + lclip=st.floats(-100, -1), + rclip=st.floats(1, 100), +) +def test_uniform_clip(a, b, lclip, rclip): + dist = UniformDistribution(a, b) + dist.lclip = lclip + dist.rclip = rclip + narrow_dist = UniformDistribution(max(a, lclip), min(b, rclip)) + hist = numeric(dist) + narrow_hist = numeric(narrow_dist) + + assert hist.est_mean() == approx(narrow_hist.exact_mean) + assert hist.est_mean() == approx(narrow_hist.est_mean()) + assert hist.values[0] == approx(narrow_hist.values[0]) + assert hist.values[-1] == approx(narrow_hist.values[-1]) + + +@given( + norm_mean=st.floats(min_value=0.1, max_value=10), + norm_sd=st.floats(min_value=0.5, max_value=3), + clip_zscore=st.floats(min_value=-2, max_value=2), +) +def test_lognorm_clip_and_sum(norm_mean, norm_sd, clip_zscore): + clip = np.exp(norm_mean + norm_sd * clip_zscore) + left_dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd, rclip=clip) + right_dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd, lclip=clip) + left_hist = numeric(left_dist, warn=False) + right_hist = numeric(right_dist, warn=False) + left_mass = stats.lognorm.cdf(clip, norm_sd, scale=np.exp(norm_mean)) + right_mass = 1 - left_mass + true_mean = stats.lognorm.mean(norm_sd, 
scale=np.exp(norm_mean)) + sum_exact_mean = left_mass * left_hist.exact_mean + right_mass * right_hist.exact_mean + sum_hist_mean = left_mass * left_hist.est_mean() + right_mass * right_hist.est_mean() + + # TODO: the error margin is surprisingly large + assert sum_exact_mean == approx(true_mean, rel=1e-3, abs=1e-6) + assert sum_hist_mean == approx(true_mean, rel=1e-3, abs=1e-6) + + +@given( + mean1=st.floats(min_value=-1000, max_value=0.01), + mean2=st.floats(min_value=0.01, max_value=1000), + mean3=st.floats(min_value=0.01, max_value=1000), + sd1=st.floats(min_value=0.1, max_value=10), + sd2=st.floats(min_value=0.1, max_value=10), + sd3=st.floats(min_value=0.1, max_value=10), + bin_sizing=st.sampled_from(["ev", "mass", "uniform"]), +) +@example(mean1=5, mean2=5, mean3=4, sd1=1, sd2=1, sd3=1, bin_sizing="ev") +@example(mean1=9, mean2=9, mean3=9, sd1=1, sd2=1, sd3=1, bin_sizing="ev") +def test_norm_product(mean1, mean2, mean3, sd1, sd2, sd3, bin_sizing): + dist1 = NormalDistribution(mean=mean1, sd=sd1) + dist2 = NormalDistribution(mean=mean2, sd=sd2) + dist3 = NormalDistribution(mean=mean3, sd=sd3) + mean_tolerance = 1e-5 + sd_tolerance = 0.2 if bin_sizing == "uniform" else 1 + + hist1 = numeric(dist1, num_bins=40, bin_sizing=bin_sizing, warn=False) + hist2 = numeric(dist2, num_bins=40, bin_sizing=bin_sizing, warn=False) + hist3 = numeric(dist3, num_bins=40, bin_sizing=bin_sizing, warn=False) + hist_prod = hist1 * hist2 + assert hist_prod.est_mean() == approx(dist1.mean * dist2.mean, rel=mean_tolerance, abs=1e-8) + assert hist_prod.est_sd() == approx( + np.sqrt( + (dist1.sd**2 + dist1.mean**2) * (dist2.sd**2 + dist2.mean**2) + - dist1.mean**2 * dist2.mean**2 + ), + rel=sd_tolerance, + ) + hist3_prod = hist_prod * hist3 + assert hist3_prod.est_mean() == approx( + dist1.mean * dist2.mean * dist3.mean, rel=mean_tolerance, abs=1e-8 + ) + + +@given( + mean=st.floats(min_value=-10, max_value=10), + sd=st.floats(min_value=0.001, max_value=100), + 
num_bins=st.sampled_from([40, 100]), + bin_sizing=st.sampled_from(["ev", "mass", "uniform"]), +) +@settings(max_examples=100) +def test_norm_mean_error_propagation(mean, sd, num_bins, bin_sizing): + """ "Test how quickly the error in the mean grows as distributions are + multiplied.""" + dist = NormalDistribution(mean=mean, sd=sd) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + hist = numeric(dist, num_bins=num_bins, bin_sizing=bin_sizing) + hist_base = numeric(dist, num_bins=num_bins, bin_sizing=bin_sizing) + tolerance = 1e-10 if bin_sizing == "ev" else 1e-5 + + for i in range(1, 17): + true_mean = mean**i + true_sd = np.sqrt((dist.sd**2 + dist.mean**2) ** i - dist.mean ** (2 * i)) + if true_sd > 1e15: + break + assert hist.est_mean() == approx( + true_mean, abs=tolerance ** (1 / i), rel=tolerance ** (1 / i) + ), f"On iteration {i}" + hist = hist * hist_base + + +@given( + mean1=st.floats(min_value=-100, max_value=100), + mean2=st.floats(min_value=-np.log(1e5), max_value=np.log(1e5)), + mean3=st.floats(min_value=-100, max_value=100), + sd1=st.floats(min_value=0.001, max_value=100), + sd2=st.floats(min_value=0.001, max_value=3), + sd3=st.floats(min_value=0.001, max_value=100), + # num_bins1=st.sampled_from([40, 100]), + # num_bins2=st.sampled_from([40, 100]), + num_bins1=st.sampled_from([100]), + num_bins2=st.sampled_from([100]), +) +@example(mean1=99, mean2=0, mean3=-1e-16, sd1=1.5, sd2=3, sd3=0.5, num_bins1=100, num_bins2=100) +def test_norm_lognorm_product_sum(mean1, mean2, mean3, sd1, sd2, sd3, num_bins1, num_bins2): + dist1 = NormalDistribution(mean=mean1, sd=sd1) + dist2 = LognormalDistribution(norm_mean=mean2, norm_sd=sd2) + dist3 = NormalDistribution(mean=mean3, sd=sd3) + hist1 = numeric(dist1, num_bins=num_bins1, warn=False) + hist2 = numeric(dist2, num_bins=num_bins2, warn=False) + hist3 = numeric(dist3, num_bins=num_bins1, warn=False) + hist_prod = hist1 * hist2 + assert all(np.diff(hist_prod.values) >= 0) + assert 
hist_prod.est_mean() == approx(hist_prod.exact_mean, abs=1e-5, rel=1e-5) + + # SD is pretty inaccurate + sd_tolerance = 1 if num_bins1 == 100 and num_bins2 == 100 else 2 + assert hist_prod.est_sd() == approx(hist_prod.exact_sd, rel=sd_tolerance) + + hist_sum = hist_prod + hist3 + assert hist_sum.est_mean() == approx(hist_sum.exact_mean, abs=1e-5, rel=1e-5) + + +@given( + norm_mean=st.floats(min_value=np.log(1e-6), max_value=np.log(1e6)), + norm_sd=st.floats(min_value=0.001, max_value=2), + num_bins=st.sampled_from([40, 100]), + bin_sizing=st.sampled_from(["ev", "log-uniform", "fat-hybrid"]), +) +@settings(max_examples=10) +def test_lognorm_mean_error_propagation(norm_mean, norm_sd, num_bins, bin_sizing): + dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd) + hist = numeric(dist, num_bins=num_bins, bin_sizing=bin_sizing, warn=False) + hist_base = numeric(dist, num_bins=num_bins, bin_sizing=bin_sizing, warn=False) + inv_tolerance = 1 - 1e-12 if bin_sizing == "ev" else 0.98 + + for i in range(1, 13): + true_mean = stats.lognorm.mean(np.sqrt(i) * norm_sd, scale=np.exp(i * norm_mean)) + if bin_sizing == "ev": + # log-uniform can have out-of-order values due to the masses at the + # end being very small + assert all(np.diff(hist.values) >= 0), f"On iteration {i}: {hist.values}" + assert hist.est_mean() == approx( + true_mean, rel=1 - inv_tolerance**i + ), f"On iteration {i}" + hist = hist * hist_base + + +@given(bin_sizing=st.sampled_from(["ev", "log-uniform"])) +def test_lognorm_sd_error_propagation(bin_sizing): + verbose = False + dist = LognormalDistribution(norm_mean=0, norm_sd=0.1) + num_bins = 100 + hist = numeric(dist, num_bins=num_bins, bin_sizing=bin_sizing, warn=False) + abs_error = [] + rel_error = [] + + if verbose: + print("") + for i in [1, 2, 4, 8, 16, 32]: + oneshot = numeric( + LognormalDistribution(norm_mean=0, norm_sd=0.1 * np.sqrt(i)), + num_bins=num_bins, + bin_sizing=bin_sizing, + warn=False, + ) + true_sd = hist.exact_sd + 
abs_error.append(abs(hist.est_sd() - true_sd)) + rel_error.append(relative_error(hist.est_sd(), true_sd)) + if verbose: + print(f"i={i:2d}: Hist error : {rel_error[-1] * 100:.4f}%") + print( + "i={:2d}: Hist / 1shot: {:.0f}%".format( + i, ((rel_error[-1] / relative_error(oneshot.est_sd(), true_sd)) * 100) + ) + ) + hist = hist * hist + + expected_error_pcts = ( + [0.9, 2.8, 9.9, 40.7, 211, 2678] + if bin_sizing == "ev" + else [12, 26.3, 99.8, 733, 32000, 1e9] + ) + + for i in range(len(expected_error_pcts)): + assert rel_error[i] < expected_error_pcts[i] / 100 + + +@given( + norm_mean1=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)), + norm_mean2=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)), + norm_sd1=st.floats(min_value=0.1, max_value=3), + norm_sd2=st.floats(min_value=0.1, max_value=3), + bin_sizing=st.sampled_from(["ev", "log-uniform", "fat-hybrid"]), +) +@example(norm_mean1=0, norm_mean2=0, norm_sd1=1, norm_sd2=1, bin_sizing="fat-hybrid") +def test_lognorm_product(norm_mean1, norm_sd1, norm_mean2, norm_sd2, bin_sizing): + dists = [ + LognormalDistribution(norm_mean=norm_mean2, norm_sd=norm_sd2), + LognormalDistribution(norm_mean=norm_mean1, norm_sd=norm_sd1), + ] + dist_prod = LognormalDistribution( + norm_mean=norm_mean1 + norm_mean2, norm_sd=np.sqrt(norm_sd1**2 + norm_sd2**2) + ) + hists = [numeric(dist, bin_sizing=bin_sizing, warn=False) for dist in dists] + hist_prod = reduce(lambda acc, hist: acc * hist, hists) + + # Lognorm width grows with e**norm_sd**2, so error tolerance grows the same way + sd_tolerance = 1.05 ** (1 + (norm_sd1 + norm_sd2) ** 2) - 1 + mean_tolerance = 1e-3 if bin_sizing == "log-uniform" else 1e-6 + assert hist_prod.est_mean() == approx(dist_prod.lognorm_mean, rel=mean_tolerance) + assert hist_prod.est_sd() == approx(dist_prod.lognorm_sd, rel=sd_tolerance) + + +@given( + mean1=st.floats(-1e5, 1e5), + mean2=st.floats(min_value=-1e5, max_value=1e5), + sd1=st.floats(min_value=0.001, max_value=1e5), + 
sd2=st.floats(min_value=0.001, max_value=1e5), + num_bins=st.sampled_from([40, 100]), + bin_sizing=st.sampled_from(["ev", "uniform"]), +) +@example(mean1=0, mean2=0, sd1=1, sd2=16, num_bins=40, bin_sizing="ev") +@example(mean1=0, mean2=0, sd1=7, sd2=1, num_bins=40, bin_sizing="ev") +def test_norm_sum(mean1, mean2, sd1, sd2, num_bins, bin_sizing): + dist1 = NormalDistribution(mean=mean1, sd=sd1) + dist2 = NormalDistribution(mean=mean2, sd=sd2) + hist1 = numeric(dist1, num_bins=num_bins, bin_sizing=bin_sizing, warn=False) + hist2 = numeric(dist2, num_bins=num_bins, bin_sizing=bin_sizing, warn=False) + hist_sum = hist1 + hist2 + + # The further apart the means are, the less accurate the SD estimate is + distance_apart = abs(mean1 - mean2) / hist_sum.exact_sd + sd_tolerance = 2 + 0.5 * distance_apart + mean_tolerance = 1e-10 + 1e-10 * distance_apart + + assert all(np.diff(hist_sum.values) >= 0) + assert hist_sum.est_mean() == approx(hist_sum.exact_mean, abs=mean_tolerance, rel=1e-5) + assert hist_sum.est_sd() == approx(hist_sum.exact_sd, rel=sd_tolerance) + + +@given( + norm_mean1=st.floats(min_value=-np.log(1e6), max_value=np.log(1e6)), + norm_mean2=st.floats(min_value=-np.log(1e6), max_value=np.log(1e6)), + norm_sd1=st.floats(min_value=0.1, max_value=3), + norm_sd2=st.floats(min_value=0.01, max_value=3), + bin_sizing=st.sampled_from(["ev", "log-uniform"]), +) +def test_lognorm_sum(norm_mean1, norm_mean2, norm_sd1, norm_sd2, bin_sizing): + dist1 = LognormalDistribution(norm_mean=norm_mean1, norm_sd=norm_sd1) + dist2 = LognormalDistribution(norm_mean=norm_mean2, norm_sd=norm_sd2) + hist1 = numeric(dist1, bin_sizing=bin_sizing, warn=False) + hist2 = numeric(dist2, bin_sizing=bin_sizing, warn=False) + hist_sum = hist1 + hist2 + assert all(np.diff(hist_sum.values) >= 0), hist_sum.values + mean_tolerance = 1e-3 if bin_sizing == "log-uniform" else 1e-6 + assert hist_sum.est_mean() == approx(hist_sum.exact_mean, rel=mean_tolerance) + + # SD is very inaccurate because adding 
lognormals produces some large but + # very low-probability values on the right tail and the only approach is to + # either downweight them or make the histogram much wider. + assert hist_sum.est_sd() > min(hist1.est_sd(), hist2.est_sd()) + assert hist_sum.est_sd() == approx(hist_sum.exact_sd, rel=2) + + +@given( + mean1=st.floats(min_value=-100, max_value=100), + mean2=st.floats(min_value=-np.log(1e5), max_value=np.log(1e5)), + sd1=st.floats(min_value=0.001, max_value=100), + sd2=st.floats(min_value=0.001, max_value=3), + lognorm_bin_sizing=st.sampled_from(["ev", "log-uniform", "fat-hybrid"]), +) +@example(mean1=-68, mean2=0, sd1=1, sd2=2.9, lognorm_bin_sizing="log-uniform") +def test_norm_lognorm_sum(mean1, mean2, sd1, sd2, lognorm_bin_sizing): + dist1 = NormalDistribution(mean=mean1, sd=sd1) + dist2 = LognormalDistribution(norm_mean=mean2, norm_sd=sd2) + hist1 = numeric(dist1, warn=False) + hist2 = numeric(dist2, bin_sizing=lognorm_bin_sizing, warn=False) + hist_sum = hist1 + hist2 + mean_tolerance = 0.005 if lognorm_bin_sizing == "log-uniform" else 1e-6 + sd_tolerance = 0.5 + assert all(np.diff(hist_sum.values) >= 0), hist_sum.values + assert hist_sum.est_mean() == approx( + hist_sum.exact_mean, abs=mean_tolerance, rel=mean_tolerance + ) + assert hist_sum.est_sd() == approx(hist_sum.exact_sd, rel=sd_tolerance) + + +@given( + norm_mean=st.floats(min_value=-10, max_value=10), + norm_sd=st.floats(min_value=0.1, max_value=2), +) +def test_lognorm_to_const_power(norm_mean, norm_sd): + # If you make the power bigger, mean stays pretty accurate but SD gets + # pretty far off (>100%) for high-variance dists + power = 1.5 + dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd) + hist = numeric(dist, bin_sizing="log-uniform", warn=False) + + hist_pow = hist**power + true_dist_pow = LognormalDistribution(norm_mean=power * norm_mean, norm_sd=power * norm_sd) + assert hist_pow.est_mean() == approx(true_dist_pow.lognorm_mean, rel=0.005) + assert 
hist_pow.est_sd() == approx(true_dist_pow.lognorm_sd, rel=0.5) + + +@given( + norm_mean=st.floats(min_value=-1e6, max_value=1e6), + norm_sd=st.floats(min_value=0.001, max_value=3), + num_bins=st.sampled_from([40, 100]), + bin_sizing=st.sampled_from(["ev", "uniform"]), +) +def test_norm_negate(norm_mean, norm_sd, num_bins, bin_sizing): + dist = NormalDistribution(mean=0, sd=1) + hist = numeric(dist, warn=False) + neg_hist = -hist + assert neg_hist.est_mean() == approx(-hist.est_mean()) + assert neg_hist.est_sd() == approx(hist.est_sd()) + + +@given( + norm_mean=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)), + norm_sd=st.floats(min_value=0.001, max_value=3), + num_bins=st.sampled_from([40, 100]), + bin_sizing=st.sampled_from(["ev", "log-uniform"]), +) +def test_lognorm_negate(norm_mean, norm_sd, num_bins, bin_sizing): + dist = LognormalDistribution(norm_mean=0, norm_sd=1) + hist = numeric(dist, warn=False) + neg_hist = -hist + assert neg_hist.est_mean() == approx(-hist.est_mean()) + assert neg_hist.est_sd() == approx(hist.est_sd()) + + +@given( + type_and_size=st.sampled_from(["norm-ev", "norm-uniform", "lognorm-ev"]), + mean1=st.floats(min_value=-1e6, max_value=1e6), + mean2=st.floats(min_value=-100, max_value=100), + sd1=st.floats(min_value=0.001, max_value=1000), + sd2=st.floats(min_value=0.1, max_value=5), + num_bins=st.sampled_from([30, 100]), +) +def test_sub(type_and_size, mean1, mean2, sd1, sd2, num_bins): + dist1 = NormalDistribution(mean=mean1, sd=sd1) + dist2_type, bin_sizing = type_and_size.split("-") + + if dist2_type == "norm": + dist2 = NormalDistribution(mean=mean2, sd=sd2) + neg_dist = NormalDistribution(mean=-mean2, sd=sd2) + elif dist2_type == "lognorm": + dist2 = LognormalDistribution(norm_mean=mean2, norm_sd=sd2) + # We can't negate a lognormal distribution by changing the params + neg_dist = None + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + hist1 = numeric(dist1, num_bins=num_bins, bin_sizing=bin_sizing) + 
hist2 = numeric(dist2, num_bins=num_bins, bin_sizing=bin_sizing) + hist_diff = hist1 - hist2 + backward_diff = hist2 - hist1 + assert not any(np.isnan(hist_diff.values)) + assert all(np.diff(hist_diff.values) >= 0) + assert hist_diff.est_mean() == approx(-backward_diff.est_mean(), rel=0.01) + assert hist_diff.est_sd() == approx(backward_diff.est_sd(), rel=0.05) + + if neg_dist: + neg_hist = numeric(neg_dist, num_bins=num_bins, bin_sizing=bin_sizing) + hist_sum = hist1 + neg_hist + assert hist_diff.est_mean() == approx(hist_sum.est_mean(), rel=0.01) + assert hist_diff.est_sd() == approx(hist_sum.est_sd(), rel=0.05) + + +def test_lognorm_sub(): + dist = LognormalDistribution(norm_mean=0, norm_sd=1) + hist = numeric(dist, warn=False) + hist_diff = 0.97 * hist - 0.03 * hist + assert not any(np.isnan(hist_diff.values)) + assert all(np.diff(hist_diff.values) >= 0) + assert hist_diff.est_mean() == approx(0.94 * dist.lognorm_mean, rel=0.001) + assert hist_diff.est_sd() == approx(hist_diff.exact_sd, rel=0.05) + + +@given( + mean=st.floats(min_value=-100, max_value=100), + sd=st.floats(min_value=0.001, max_value=1000), + scalar=st.floats(min_value=-100, max_value=100), +) +def test_scale(mean, sd, scalar): + assume(scalar != 0) + dist = NormalDistribution(mean=mean, sd=sd) + hist = numeric(dist, warn=False) + scaled_hist = scalar * hist + assert scaled_hist.est_mean() == approx(scalar * hist.est_mean(), abs=1e-6, rel=1e-6) + assert scaled_hist.est_sd() == approx(abs(scalar) * hist.est_sd(), abs=1e-6, rel=1e-6) + assert scaled_hist.exact_mean == approx(scalar * hist.exact_mean) + assert scaled_hist.exact_sd == approx(abs(scalar) * hist.exact_sd) + + +@given( + mean=st.floats(min_value=-100, max_value=100), + sd=st.floats(min_value=0.001, max_value=1000), + scalar=st.floats(min_value=-100, max_value=100), +) +def test_shift_by(mean, sd, scalar): + dist = NormalDistribution(mean=mean, sd=sd) + hist = numeric(dist, warn=False) + shifted_hist = hist + scalar + assert 
shifted_hist.est_mean() == approx(hist.est_mean() + scalar, abs=1e-6, rel=1e-6) + assert shifted_hist.est_sd() == approx(hist.est_sd(), abs=1e-6, rel=1e-6) + assert shifted_hist.exact_mean == approx(hist.exact_mean + scalar) + assert shifted_hist.exact_sd == approx(hist.exact_sd) + assert shifted_hist.pos_ev_contribution - shifted_hist.neg_ev_contribution == approx( + shifted_hist.exact_mean + ) + if shifted_hist.zero_bin_index < len(shifted_hist.values): + assert shifted_hist.values[shifted_hist.zero_bin_index] > 0 + if shifted_hist.zero_bin_index > 0: + assert shifted_hist.values[shifted_hist.zero_bin_index - 1] < 0 + + +@given( + norm_mean=st.floats(min_value=-10, max_value=10), + norm_sd=st.floats(min_value=0.01, max_value=2.5), +) +def test_lognorm_reciprocal(norm_mean, norm_sd): + dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd) + reciprocal_dist = LognormalDistribution(norm_mean=-norm_mean, norm_sd=norm_sd) + hist = numeric(dist, bin_sizing="log-uniform", warn=False) + reciprocal_hist = 1 / hist + true_reciprocal_hist = numeric(reciprocal_dist, bin_sizing="log-uniform", warn=False) + + # Taking the reciprocal does lose a good bit of accuracy because bin values + # are set as the expected value of the bin, and the EV of 1/X is pretty + # different. Could improve accuracy by writing + # reciprocal_contribution_to_ev functions for every distribution type, but + # that's probably not worth it. 
+ assert reciprocal_hist.est_mean() == approx(reciprocal_dist.lognorm_mean, rel=0.05) + assert reciprocal_hist.est_sd() == approx(reciprocal_dist.lognorm_sd, rel=0.2) + assert reciprocal_hist.neg_ev_contribution == 0 + assert reciprocal_hist.pos_ev_contribution == approx( + true_reciprocal_hist.pos_ev_contribution, rel=0.05 + ) + + +@given( + norm_mean1=st.floats(min_value=-10, max_value=10), + norm_mean2=st.floats(min_value=-10, max_value=10), + norm_sd1=st.floats(min_value=0.01, max_value=2), + norm_sd2=st.floats(min_value=0.01, max_value=2), + bin_sizing1=st.sampled_from(["ev", "log-uniform"]), +) +def test_lognorm_quotient(norm_mean1, norm_mean2, norm_sd1, norm_sd2, bin_sizing1): + dist1 = LognormalDistribution(norm_mean=norm_mean1, norm_sd=norm_sd1) + dist2 = LognormalDistribution(norm_mean=norm_mean2, norm_sd=norm_sd2) + hist1 = numeric(dist1, bin_sizing=bin_sizing1, warn=False) + hist2 = numeric(dist2, bin_sizing="log-uniform", warn=False) + quotient_hist = hist1 / hist2 + true_quotient_dist = LognormalDistribution( + norm_mean=norm_mean1 - norm_mean2, norm_sd=np.sqrt(norm_sd1**2 + norm_sd2**2) + ) + true_quotient_hist = numeric(true_quotient_dist, bin_sizing="log-uniform", warn=False) + + assert quotient_hist.est_mean() == approx(true_quotient_hist.est_mean(), rel=0.05) + assert quotient_hist.est_sd() == approx(true_quotient_hist.est_sd(), rel=0.2) + assert quotient_hist.neg_ev_contribution == approx( + true_quotient_hist.neg_ev_contribution, rel=0.01 + ) + assert quotient_hist.pos_ev_contribution == approx( + true_quotient_hist.pos_ev_contribution, rel=0.01 + ) + + +@given( + mean=st.floats(min_value=-20, max_value=20), + sd=st.floats(min_value=0.1, max_value=1), +) +@example(mean=0, sd=2) +def test_norm_exp(mean, sd): + dist = NormalDistribution(mean=mean, sd=sd) + hist = numeric(dist) + exp_hist = hist.exp() + true_exp_dist = LognormalDistribution(norm_mean=mean, norm_sd=sd) + + # TODO: previously with richardson, mean was accurate to 0.005, and sd to + 
# 0.1, but now it's worse b/c it was using uniform before and now it's + # using ev + # assert exp_hist.est_mean() == approx(true_exp_dist.lognorm_mean, rel=0.005) + # assert exp_hist.est_sd() == approx(true_exp_dist.lognorm_sd, rel=0.1) + assert exp_hist.est_mean() == approx(true_exp_dist.lognorm_mean, rel=0.02) + assert exp_hist.est_sd() == approx(true_exp_dist.lognorm_sd, rel=0.5) + + +@given( + loga=st.floats(min_value=-5, max_value=5), + logb=st.floats(min_value=0, max_value=10), +) +@example(loga=0, logb=0.001) +@example(loga=0, logb=2) +@example(loga=-5, logb=10) +@settings(max_examples=1) +def test_uniform_exp(loga, logb): + loga, logb = fix_ordering(loga, logb) + dist = UniformDistribution(loga, logb) + hist = numeric(dist) + exp_hist = hist.exp() + a = np.exp(loga) + b = np.exp(logb) + true_mean = (b - a) / np.log(b / a) + true_sd = np.sqrt((b**2 - a**2) / (2 * np.log(b / a)) - ((b - a) / (np.log(b / a))) ** 2) + assert exp_hist.est_mean() == approx(true_mean, rel=0.02) + if not np.isnan(true_sd): + # variance can be slightly negative due to rounding errors + assert exp_hist.est_sd() == approx(true_sd, rel=0.2, abs=1e-5) + + +@given( + mean=st.floats(min_value=-20, max_value=20), + sd=st.floats(min_value=0.1, max_value=2), +) +def test_lognorm_log(mean, sd): + dist = LognormalDistribution(norm_mean=mean, norm_sd=sd) + hist = numeric(dist, warn=False) + log_hist = hist.log() + true_log_dist = NormalDistribution(mean=mean, sd=sd) + true_log_hist = numeric(true_log_dist, warn=False) + assert log_hist.est_mean() == approx(true_log_hist.exact_mean, rel=0.01, abs=1) + assert log_hist.est_sd() == approx(true_log_hist.exact_sd, rel=0.3) + + +@given( + mean=st.floats(min_value=-20, max_value=20), + sd=st.floats(min_value=0.1, max_value=10), +) +@settings(max_examples=10) +def test_norm_abs(mean, sd): + dist = NormalDistribution(mean=mean, sd=sd) + hist = abs(numeric(dist)) + shape = abs(mean) / sd + scale = sd + true_mean = stats.foldnorm.mean(shape, loc=0, 
scale=scale) + true_sd = stats.foldnorm.std(shape, loc=0, scale=scale) + assert hist.est_mean() == approx(true_mean, rel=0.001) + assert hist.est_sd() == approx(true_sd, rel=0.01) + + +@given( + mean=st.floats(min_value=-20, max_value=20), + sd=st.floats(min_value=0.1, max_value=10), + left_zscore=st.floats(min_value=-2, max_value=2), + zscore_width=st.floats(min_value=2, max_value=5), +) +@settings(max_examples=10) +@example(mean=-2, sd=1, left_zscore=2, zscore_width=2) +def test_given_value_satisfies(mean, sd, left_zscore, zscore_width): + right_zscore = left_zscore + zscore_width + left = mean + left_zscore * sd + right = mean + right_zscore * sd + dist = NormalDistribution(mean=mean, sd=sd) + hist = numeric(dist).given_value_satisfies(lambda x: x >= left and x <= right) + true_mean = stats.truncnorm.mean(left_zscore, right_zscore, loc=mean, scale=sd) + true_sd = stats.truncnorm.std(left_zscore, right_zscore, loc=mean, scale=sd) + assert hist.est_mean() == approx(true_mean, rel=0.1, abs=0.05) + assert hist.est_sd() == approx(true_sd, rel=0.1, abs=0.1) + + +def test_probability_value_satisfies(): + dist = NormalDistribution(mean=0, sd=1) + hist = numeric(dist) + prob1 = hist.probability_value_satisfies(lambda x: x >= 0) + assert prob1 == approx(0.5, rel=0.01) + prob2 = hist.probability_value_satisfies(lambda x: x < 1) + assert prob2 == approx(stats.norm.cdf(1), rel=0.01) + + +@given( + a=st.floats(min_value=1e-6, max_value=1), + b=st.floats(min_value=1e-6, max_value=1), +) +@example(a=1, b=1e-5) +def test_mixture(a, b): + if a + b > 1: + scale = a + b + a /= scale + b /= scale + c = max(0, 1 - a - b) # do max to fix floating point rounding + dist1 = NormalDistribution(mean=0, sd=5) + dist2 = NormalDistribution(mean=5, sd=3) + dist3 = NormalDistribution(mean=-1, sd=1) + mixture = MixtureDistribution([dist1, dist2, dist3], [a, b, c]) + hist = numeric(mixture, bin_sizing="uniform") + assert hist.est_mean() == approx(a * dist1.mean + b * dist2.mean + c * dist3.mean, 
@given(lclip=st.integers(-4, 4), width=st.integers(1, 4))
@example(lclip=-1, width=2)
@example(lclip=4, width=1)
def test_numeric_clip(lclip, width):
    """Clipping a numeric standard normal should match scipy's truncated normal."""
    rclip = lclip + width
    unclipped = numeric(
        NormalDistribution(mean=0, sd=1), num_bins=200, bin_sizing="uniform", warn=False
    )
    clipped = unclipped.clip(lclip, rclip)
    truncated_mean = stats.truncnorm.mean(lclip, rclip)
    assert clipped.est_mean() == approx(truncated_mean, rel=0.001)
    # Adding the clipped and unclipped histograms should add their means.
    combined = clipped + unclipped
    assert combined.est_mean() == approx(truncated_mean + stats.norm.mean(), rel=0.001)
@given(
    a=st.floats(min_value=1e-3, max_value=1),
    b=st.floats(min_value=1e-3, max_value=1),
    lclip=st.sampled_from([-1, 1, None]),
    clip_width=st.sampled_from([1, 3, None]),
    bin_sizing=st.sampled_from(["uniform", "ev"]),
    # Only clip inner or outer dist b/c clipping both makes it hard to
    # calculate what the mean should be
    clip_inner=st.booleans(),
)
@example(a=0.0625, b=0.015625, lclip=1, clip_width=1, bin_sizing="ev", clip_inner=False)
# NOTE(review): lclip=-2 is outside the sampled_from domain above; hypothesis
# allows explicit examples outside the strategy, so this case still runs.
@example(a=0.0625, b=0.015625, lclip=-2, clip_width=1, bin_sizing="ev", clip_inner=False)
def test_sum3_clipped(a, b, lclip, clip_width, bin_sizing, clip_inner):
    """Check the mean of a weighted sum of three normals where clipping is
    applied either to the first summand (clip_inner) or to the whole sum
    (clip_outer), against scipy's truncated-normal mean."""
    # Clipped sum accuracy really benefits from more bins.
    num_bins = 500 if not clip_inner else 100
    clip_outer = not clip_inner
    # Normalize the weights so that a + b + c sums to 1.
    if a + b > 1:
        scale = a + b
        a /= scale
        b /= scale
    c = max(0, 1 - a - b)  # do max to fix floating point rounding
    # Decode (lclip, clip_width) into concrete bounds; +/- infinity = no clip.
    rclip = lclip + clip_width if lclip is not None and clip_width is not None else np.inf
    if lclip is None:
        lclip = -np.inf
    dist1 = NormalDistribution(
        mean=0,
        sd=1,
        lclip=lclip if clip_inner else None,
        rclip=rclip if clip_inner else None,
    )
    dist2 = NormalDistribution(mean=1, sd=2)
    dist3 = NormalDistribution(mean=-1, sd=0.75)
    dist_sum = a * dist1 + b * dist2 + c * dist3
    if clip_outer:
        dist_sum.lclip = lclip
        dist_sum.rclip = rclip

    hist = numeric(dist_sum, num_bins=num_bins, bin_sizing=bin_sizing, warn=False)
    if clip_inner:
        # Only dist1 is truncated; dist2 and dist3 contribute their plain means.
        true_mean = a * stats.truncnorm.mean(lclip, rclip, 0, 1) + b * dist2.mean + c * dist3.mean
        tolerance = 0.01
    else:
        # The unclipped sum of normals is normal, so clipping the sum yields a
        # truncated normal; express the clip bounds as z-scores of that normal.
        mixed_mean = a * dist1.mean + b * dist2.mean + c * dist3.mean
        mixed_sd = np.sqrt(
            a**2 * dist1.sd**2 + b**2 * dist2.sd**2 + c**2 * dist3.sd**2
        )
        lclip_zscore = (lclip - mixed_mean) / mixed_sd
        rclip_zscore = (rclip - mixed_mean) / mixed_sd
        true_mean = stats.truncnorm.mean(
            lclip_zscore,
            rclip_zscore,
            mixed_mean,
            mixed_sd,
        )
        tolerance = 0.05
    assert hist.est_mean() == approx(true_mean, rel=tolerance, abs=tolerance / 10)
def test_shift_with_zeros():
    """Shifting a histogram with a point mass at 0 moves that mass to the shift amount."""
    base = numeric(NormalDistribution(mean=1, sd=1), warn=False)
    halved = base.condition_on_success(0.5)
    shifted = halved + 2
    expected_mean = 2.5  # 0.5 * (1 + 2) success mass + 0.5 * 2 failure mass
    assert shifted.exact_mean == approx(expected_mean)
    assert shifted.est_mean() == approx(expected_mean, rel=1e-5)
    # The 50% point mass that sat at 0 now sits at exactly 2.
    assert shifted.masses[np.searchsorted(shifted.values, 2)] == approx(0.5)
    # Shifting must not change the spread.
    assert shifted.est_sd() == approx(halved.est_sd(), rel=1e-3)
    assert shifted.est_sd() == approx(shifted.exact_sd, rel=1e-3)
def test_probability_value_satisfies_with_zeros():
    """With 50% failure mass at 0, predicate probabilities split accordingly."""
    hist = numeric(NormalDistribution(mean=0, sd=1)).condition_on_success(0.5)
    cases = [
        (lambda x: x == 0, 0.5),   # all of the failure mass sits exactly at 0
        (lambda x: x != 0, 0.5),   # the surviving normal mass
        (lambda x: x > 0, 0.25),   # half of the surviving normal mass
        (lambda x: x >= 0, 0.75),  # point mass at 0 plus the positive half
    ]
    for predicate, expected in cases:
        assert hist.probability_value_satisfies(predicate) == approx(expected)
def test_uniform_sum_basic():
    """Sums of standard uniforms follow the Irwin-Hall distribution.

    https://en.wikipedia.org/wiki/Irwin%E2%80%93Hall_distribution
    """
    dist = UniformDistribution(0, 1)
    addend = numeric(dist)
    total = numeric(dist)
    total = total + addend
    # Irwin-Hall(n): mean = n/2, variance = n/12.
    assert total.exact_mean == approx(1)
    assert total.exact_sd == approx(np.sqrt(2 / 12))
    assert total.est_mean() == approx(1)
    assert total.est_sd() == approx(np.sqrt(2 / 12), rel=0.005)
    for n in (3, 4):
        total = total + addend
        assert total.est_mean() == approx(n / 2)
        assert total.est_sd() == approx(np.sqrt(n / 12), rel=0.005)
@given(
    a1=st.floats(min_value=-1000, max_value=0.001),
    b1=st.floats(min_value=0.001, max_value=1000),
    a2=st.floats(min_value=0, max_value=1000),
    b2=st.floats(min_value=1, max_value=10000),
    flip2=st.booleans(),
)
@settings(max_examples=10)
def test_uniform_prod(a1, b1, a2, b2, flip2):
    """The product of two uniform histograms should match its exact moments."""
    if flip2:
        # Mirror the second distribution onto the negative axis.
        a2, b2 = -b2, -a2
    a1, b1 = fix_ordering(a1, b1)
    a2, b2 = fix_ordering(a2, b2)
    with warnings.catch_warnings():
        # hypothesis can generate extremely tiny params that trigger warnings.
        warnings.simplefilter("ignore")
        left = numeric(UniformDistribution(x=a1, y=b1))
        right = numeric(UniformDistribution(x=a2, y=b2))
    product = left * right
    assert product.est_mean() == approx(product.exact_mean, abs=1e-6, rel=1e-6)
    assert product.est_sd() == approx(product.exact_sd, rel=0.01)
@given(
    a=st.floats(min_value=1, max_value=100),
    b=st.floats(min_value=1, max_value=100),
    mean=st.floats(-10, 10),
    sd=st.floats(0.1, 10),
)
@settings(max_examples=10)
def test_beta_sum(a, b, mean, sd):
    """Summing a beta and a normal: estimated moments should track exact ones."""
    beta_hist = numeric(BetaDistribution(a=a, b=b))
    norm_hist = numeric(NormalDistribution(mean=mean, sd=sd))
    total = beta_hist + norm_hist
    assert total.est_mean() == approx(total.exact_mean, rel=1e-7, abs=1e-7)
    assert total.est_sd() == approx(total.exact_sd, rel=0.01)
@given(
    shape=st.floats(min_value=0.1, max_value=100),
    scale=st.floats(min_value=0.1, max_value=100),
    bin_sizing=st.sampled_from(["uniform", "ev", "mass", "fat-hybrid"]),
)
def test_gamma_basic(shape, scale, bin_sizing):
    """Gamma(shape, scale): mean = shape * scale, sd = sqrt(shape) * scale."""
    hist = numeric(GammaDistribution(shape=shape, scale=scale), bin_sizing=bin_sizing, warn=False)
    expected_mean = shape * scale
    expected_sd = np.sqrt(shape) * scale
    assert hist.exact_mean == approx(expected_mean)
    assert hist.exact_sd == approx(expected_sd)
    assert hist.est_mean() == approx(hist.exact_mean)
    assert hist.est_sd() == approx(hist.exact_sd, rel=0.1)
@given(
    scale=st.floats(min_value=0.1, max_value=1e6),
)
def test_exponential_dist(scale):
    """Exponential(scale) has mean == sd == scale; estimates should be near-exact."""
    hist = numeric(ExponentialDistribution(scale=scale))
    expected = scale
    assert hist.exact_mean == approx(expected)
    assert hist.exact_sd == approx(expected)
    assert hist.est_mean() == approx(hist.exact_mean)
    assert hist.est_sd() == approx(hist.exact_sd, rel=0.01)
@given(
    p=st.floats(min_value=0.001, max_value=0.999),
)
def test_bernoulli_dist(p):
    """Bernoulli(p): mean p and sd sqrt(p * (1 - p)), estimated near-exactly."""
    hist = numeric(BernoulliDistribution(p=p), warn=False)
    expected_sd = np.sqrt(p * (1 - p))
    assert hist.exact_mean == approx(p)
    assert hist.est_mean() == approx(p, rel=1e-6)
    assert hist.exact_sd == approx(expected_sd)
    assert hist.est_sd() == approx(hist.exact_sd, rel=1e-6)
@given(
    norm_mean=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)),
    norm_sd=st.floats(min_value=0.001, max_value=4),
    bin_num=st.integers(min_value=2, max_value=98),
)
def test_numeric_dist_inv_contribution_to_ev(norm_mean, norm_sd, bin_num):
    """A bin's stored value lies strictly between its two edge values.

    The nth value stored in the PMH represents a value between the nth and
    n+1th edges.
    """
    dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd)
    hist = numeric(dist, bin_sizing="ev", warn=False)
    upper_fraction = bin_num / hist.num_bins
    lower_fraction = upper_fraction - 1 / hist.num_bins
    estimate = hist.inv_contribution_to_ev(upper_fraction)
    assert estimate > dist.inv_contribution_to_ev(lower_fraction)
    assert estimate < dist.inv_contribution_to_ev(upper_fraction)
@given(
    norm_mean=st.floats(min_value=-5, max_value=5),
    norm_sd=st.floats(min_value=0.1, max_value=2),
    percent=st.integers(min_value=1, max_value=100),
)
def test_quantile_log_uniform(norm_mean, norm_sd, percent):
    """Percentiles of a log-uniform-binned lognormal should match scipy.

    Fix: the original had an `if percent == 0` branch, but the strategy above
    generates percent >= 1, so that branch was unreachable dead code (unlike
    test_quantile_uniform, whose strategy starts at 0). It has been removed;
    behavior is unchanged.
    """
    dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd)
    hist = numeric(dist, num_bins=200, bin_sizing="log-uniform", warn=False)
    if percent == 100:
        # The top percentile can't be interpolated against scipy reliably;
        # just check it lands at or above the last bin whose cumulative mass
        # is still increasing.
        cum_mass = np.cumsum(hist.masses) - 0.5 * hist.masses
        nonzero_indexes = [i for (i, d) in enumerate(np.diff(cum_mass)) if d > 0]
        last_valid_index = max(nonzero_indexes)
        assert hist.percentile(percent) >= hist.values[last_valid_index]
    else:
        assert hist.percentile(percent) == approx(
            stats.lognorm.ppf(percent / 100, norm_sd, scale=np.exp(norm_mean)), rel=0.01
        )
@given(
    # Fix: this read min_value=100, which pinned `mean` to exactly 100 and
    # made the property test degenerate. The sibling test_quantile_uniform
    # uses [-100, 100] and the @example below uses mean=0, so the negative
    # lower bound was clearly intended. (test_cdf_mass and
    # test_cdf_inverts_quantile nearby have the same typo.)
    mean=st.floats(min_value=-100, max_value=100),
    sd=st.floats(min_value=0.01, max_value=100),
    percent=st.floats(min_value=1, max_value=99),
)
@example(mean=0, sd=1, percent=1)
def test_quantile_mass(mean, sd, percent):
    """With mass bin sizing, the cdf of the estimated percentile is accurate."""
    dist = NormalDistribution(mean=mean, sd=sd)
    hist = numeric(dist, num_bins=200, bin_sizing="mass", warn=False)

    # It's hard to make guarantees about how close the value will be, but we
    # should know for sure that the cdf of the value is very close to the
    # percent. Naive interpolation should have a maximum absolute error of 1 /
    # num_bins.
    assert 100 * stats.norm.cdf(hist.percentile(percent), mean, sd) == approx(percent, abs=0.1)
def test_bump_indexes():
    # In every example below, _bump_indexes(indexes, limit) returns a strictly
    # increasing list of unique indexes that stays within [0, limit), nudging
    # clustered duplicates up when there is headroom and down when the limit
    # forces it. NOTE(review): behavior inferred from these examples only —
    # confirm against the _bump_indexes implementation.
    assert _bump_indexes([2, 2, 2, 2], 4) == [0, 1, 2, 3]
    assert _bump_indexes([2, 2, 2, 2], 6) == [2, 3, 4, 5]
    assert _bump_indexes([2, 2, 2, 2], 8) == [2, 3, 4, 5]
    assert _bump_indexes([1, 2, 2, 5, 7, 7, 8, 8, 8], 9) == list(range(9))
    assert _bump_indexes([0, 0, 0, 6], 8) == [0, 1, 2, 6]
    assert _bump_indexes([0, 0, 0, 9, 9, 9], 11) == [0, 1, 2, 8, 9, 10]
def test_performance():
    # Intentionally disabled: this is an ad-hoc profiling harness, not an
    # assertion-based test. Remove the `return None` below to run it locally.
    return None
    # Note: I wrote some C++ code to approximate the behavior of distribution
    # multiplication. On my machine, distribution multiplication (with profile
    # = False) runs in 15s, and the equivalent C++ code (with -O3) runs in 11s.
    # The C++ code is not well-optimized, the most glaring issue being it uses
    # std::sort instead of something like argpartition (the trouble is that
    # numpy's argpartition can partition on many values simultaneously, whereas
    # C++'s std::partition can only partition on one value at a time, which is
    # far slower).
    dist1 = LognormalDistribution(norm_mean=0, norm_sd=1)
    dist2 = LognormalDistribution(norm_mean=1, norm_sd=0.5)

    import cProfile
    import pstats
    import io

    pr = cProfile.Profile()
    pr.enable()

    # Profile repeated construction and multiplication of 200-bin histograms.
    for i in range(5000):
        hist1 = numeric(dist1, num_bins=200)
        hist2 = numeric(dist2, num_bins=200)
        hist1 = hist1 * hist2

    pr.disable()
    s = io.StringIO()
    sortby = "cumulative"
    ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
    ps.print_stats()
    print(s.getvalue())