diff --git a/.gitignore b/.gitignore
index 1c4e86d..e41947a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
build/*
dist/*
+doc/build/*
*.egg-info
__pycache__
.ruff_cache
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..be9d2e1
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,24 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+SOURCEDIR = source
+BUILDDIR = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+buildapi:
+ sphinx-apidoc -efM ../squigglepy -o source/reference
+ @echo "Auto-generation of API documentation finished. " \
+ "The generated files are in 'api/'"
diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 0000000..543c6b1
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+
+:end
+popd
diff --git a/doc/source/conf.py b/doc/source/conf.py
new file mode 100644
index 0000000..ebbe282
--- /dev/null
+++ b/doc/source/conf.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file does only contain a selection of the most common options. For a
+# full list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = "Squigglepy"
+copyright = "2023, Peter Wildeford"
+author = "Peter Wildeford"
+
+# The short X.Y version
+version = ""
+# The full version, including alpha/beta/rc tags
+release = ""
+
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ "sphinx.ext.autodoc",
+ "sphinx.ext.imgmath",
+ "sphinx.ext.viewcode",
+ "numpydoc",
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = ".rst"
+
+# The master toctree document.
+master_doc = "index"
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = "en"
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = None
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = "pydata_sphinx_theme"
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself. Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = "Squigglepydoc"
+
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #
+ # 'papersize': 'letterpaper',
+ # The font size ('10pt', '11pt' or '12pt').
+ #
+ # 'pointsize': '10pt',
+ # Additional stuff for the LaTeX preamble.
+ #
+ # 'preamble': '',
+ # Latex figure (float) alignment
+ #
+ # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ (master_doc, "Squigglepy.tex", "Squigglepy Documentation", "Peter Wildeford", "manual"),
+]
+
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, "squigglepy", "Squigglepy Documentation", [author], 1)]
+
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ (
+ master_doc,
+ "Squigglepy",
+ "Squigglepy Documentation",
+ author,
+ "Squigglepy",
+ "One line description of project.",
+ "Miscellaneous",
+ ),
+]
+
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ["search.html"]
+
+
+# -- Extension configuration -------------------------------------------------
+
+numpydoc_class_members_toctree = False
diff --git a/doc/source/index.rst b/doc/source/index.rst
new file mode 100644
index 0000000..6a637ce
--- /dev/null
+++ b/doc/source/index.rst
@@ -0,0 +1,52 @@
+Squigglepy: Implementation of Squiggle in Python
+================================================
+
+`Squiggle <https://www.squiggle-language.com/>`__ is a "simple
+programming language for intuitive probabilistic estimation". It serves
+as its own standalone programming language with its own syntax, but it
+is implemented in JavaScript. I like the features of Squiggle and intend
+to use it frequently, but I also sometimes want to use similar
+functionalities in Python, especially alongside other Python statistical
+programming packages like Numpy, Pandas, and Matplotlib. The
+**squigglepy** package here implements many Squiggle-like
+functionalities in Python.
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents
+
+ Installation
+ Usage
+ Numeric Distributions
+ API Reference
+
+Disclaimers
+-----------
+
+This package is unofficial and supported by Peter Wildeford and Rethink
+Priorities. It is not affiliated with or associated with the Quantified
+Uncertainty Research Institute, which maintains the Squiggle language
+(in JavaScript).
+
+This package is also new and not yet in a stable production version, so
+you may encounter bugs and other errors. Please report those so they can
+be fixed. It’s also possible that future versions of the package may
+introduce breaking changes.
+
+This package is available under an MIT License.
+
+Acknowledgements
+----------------
+
+- The primary author of this package is Peter Wildeford. Agustín
+ Covarrubias and Bernardo Baron contributed several key features and
+ developments.
+- Thanks to Ozzie Gooen and the Quantified Uncertainty Research
+ Institute for creating and maintaining the original Squiggle
+ language.
+- Thanks to Dawn Drescher for helping me implement math between
+ distributions.
+- Thanks to Dawn Drescher for coming up with the idea to use ``~`` as a
+ shorthand for ``sample``, as well as helping me implement it.
+
+.. autosummary::
diff --git a/doc/source/installation.rst b/doc/source/installation.rst
new file mode 100644
index 0000000..fb5e8d9
--- /dev/null
+++ b/doc/source/installation.rst
@@ -0,0 +1,12 @@
+Installation
+============
+
+.. code:: shell
+
+ pip install squigglepy
+
+For plotting support, you can also use the ``plots`` extra:
+
+.. code:: shell
+
+ pip install squigglepy[plots]
diff --git a/doc/source/numeric_distributions.rst b/doc/source/numeric_distributions.rst
new file mode 100644
index 0000000..f2e3f4e
--- /dev/null
+++ b/doc/source/numeric_distributions.rst
@@ -0,0 +1,190 @@
+Numeric Distributions
+=====================
+
+A ``NumericDistribution`` represents a probability distribution as a histogram
+of values along with the probability mass near each value.
+
+A ``NumericDistribution`` is functionally equivalent to a Monte Carlo
+simulation where you generate infinitely many samples and then group the
+samples into finitely many bins, keeping track of the proportion of samples
+in each bin (a.k.a. the probability mass) and the average value for each
+bin.
+
+Compared to a Monte Carlo simulation, ``NumericDistribution`` can represent
+information much more densely by grouping together nearby values (although
+some information is lost in the grouping). The benefit of this is most
+obvious in fat-tailed distributions. In a Monte Carlo simulation, perhaps 1
+in 1000 samples account for 10% of the expected value, but a
+``NumericDistribution`` (with the right bin sizing method, see
+:any:`BinSizing`) can easily track the probability mass of large values.
+
+Accuracy
+--------
+
+The construction of ``NumericDistribution`` ensures that its expected value
+is always close to 100% accurate. The higher moments (standard deviation,
+skewness, etc.) and percentiles are less accurate, but still almost always
+more accurate than Monte Carlo in practice.
+
+We are probably most interested in the accuracy of percentiles. Consider a
+simulation that applies binary operations to combine ``m`` different
+``NumericDistribution``\ s, each of which has ``n`` bins. The relative error of
+estimated percentiles grows with approximately :math:`O(m / n^{1.5})` or better.
+That is, the error is proportional to the number of operations and inversely
+proportional to the number of bins to the power of 1.5. This is only an
+approximation, and the exact error rate depends on the shapes of the underlying
+distributions.
+
+Compare this to the relative error of percentiles for a Monte Carlo (MC)
+simulation over a log-normal distribution. MC relative error grows with
+:math:`O(1 / n)` [1], given the assumption that if our
+``NumericDistribution`` has ``n`` bins, then our MC simulation runs ``n^2``
+samples (because that way both have a runtime of approximately :math:`O(n^2)`).
+So MC scales worse with ``n``, but better with ``m``.
+
+I tested accuracy across a range of percentiles for a variety of values of ``m``
+and ``n``. Although MC scales better with ``m`` than ``NumericDistribution``, MC
+does not achieve lower error rates until ``m = 500`` or so when using 200
+bins/40,000 MC samples (depending on the distribution shape). Few models will
+use 500 distinct variables, so ``NumericDistribution`` should nearly always
+perform better in practice.
+
+We are also interested in the accuracy of the standard deviation. The error of
+``NumericDistribution``'s estimated standard deviation scales with approximately
+:math:`O(\sqrt[4]{m} / n^{1.5})`. The error of Monte Carlo standard deviation
+scales with :math:`O(\sqrt{m} / n)`.
+
+Where possible, ``NumericDistribution`` uses `Richardson extrapolation
+<https://en.wikipedia.org/wiki/Richardson_extrapolation>`_ over the bins to
+improve accuracy. When done correctly, Richardson extrapolation significantly
+reduces the big-O error of an operation. But doing Richardson extrapolation
+correctly requires setting a rate parameter correctly, and the rate parameter
+depends on the shape of the distribution and is not knowable in general.
+``NumericDistribution`` simply uses the same rate parameter for all
+distributions, which somewhat reduces error, but might not reduce the big-O
+error.
+
+[1] Goodman (1983). Accuracy and Efficiency of Monte Carlo Method.
+https://inis.iaea.org/collection/NCLCollectionStore/_Public/19/047/19047359.pdf
+
+Speed
+-----
+I tested ``NumericDistribution`` versus Monte Carlo on two fairly complex
+example models (originally written by Laura Duffy to evaluate hypothetical
+animal welfare and x-risk interventions). On the first model,
+``NumericDistribution`` performed ~300x better at the same level of accuracy.
+The second model used some operations for which I cannot reliably assess the
+accuracy, but my best guess is that ``NumericDistribution`` performs ~10x
+better.
+
+``NumericDistribution`` may be
+slower at the same level of accuracy for a model that uses sufficiently many
+operations that ``NumericDistribution`` handles relatively poorly. The most
+inaccurate operations are ``log`` and ``exp`` because they significantly alter
+the shape of the distribution.
+
+The biggest performance upside to Monte Carlo is that it is trivial to
+parallelize by simply generating samples on separate threads.
+``NumericDistribution`` does not currently support multithreading.
+
+Some more details about the runtime performance of ``NumericDistribution``:
+
+Where ``n`` is the number of bins, constructing a ``NumericDistribution`` or
+performing a unary operation has runtime :math:`O(n)`. A binary operation (such
+as addition or multiplication) has a runtime close to :math:`O(n^2)`. To be
+precise, the runtime is :math:`O(n^2 \log(n))` because the :math:`n^2` results
+of a binary operation must be partitioned into :math:`n` ordered bins. In
+practice, this partitioning operation takes up a fairly small portion of the
+runtime for ``n = 200`` (which is the default bin count), and takes up ~half the
+runtime for ``n = 1000``.
+
+For ``n = 200``, a binary operation takes about twice as long as constructing a
+``NumericDistribution``. Even though construction is :math:`O(n)` and binary
+operations are :math:`O(n^2)`, construction has a much worse constant factor for
+normal and log-normal distributions because construction requires repeatedly
+computing the `error function <https://en.wikipedia.org/wiki/Error_function>`_,
+which is fairly slow.
+
+The accuracy to speed ratio gets worse as ``n`` increases, so you typically
+don't want to use bin counts larger than the default unless you're particularly
+concerned about accuracy.
+
+Implementation Details
+======================
+
+On setting values within bins
+-----------------------------
+Whenever possible, NumericDistribution assigns the value of each bin as the
+average value between the two edges (weighted by mass). You can think of
+this as the result you'd get if you generated infinitely many Monte Carlo
+samples and grouped them into bins, setting the value of each bin as the
+average of the samples. You might call this the expected value (EV) method,
+in contrast to two methods described below.
+
+The EV method guarantees that, whenever the histogram width covers the full
+support of the distribution, the histogram's expected value exactly equals
+the expected value of the true distribution (modulo floating point rounding
+errors).
+
+There are some other methods we could use, which are generally worse:
+
+1. Set the value of each bin to the average of the two edges (the
+"trapezoid rule"). The purpose of using the trapezoid rule is that we don't
+know the probability mass within a bin (perhaps the CDF is too hard to
+evaluate) so we have to estimate it. But whenever we *do* know the CDF, we
+can calculate the probability mass exactly, so we don't need to use the
+trapezoid rule.
+
+2. Set the value of each bin to the center of the probability mass (the
+"mass method"). This is equivalent to generating infinitely many Monte
+Carlo samples and grouping them into bins, setting the value of each bin as
+the **median** of the samples. This approach does not particularly help us
+because we don't care about the median of every bin. We might care about
+the median of the distribution, but we can calculate that near-exactly
+regardless of what value-setting method we use by looking at the value in
+the bin where the probability mass crosses 0.5. And the mass method will
+systematically underestimate (the absolute value of) EV because the
+definition of expected value places larger weight on larger (absolute)
+values, and the mass method does not.
+
+Although the EV method perfectly measures the expected value of a distribution,
+it systematically underestimates the variance. To see this, consider that it is
+possible to define the variance of a random variable X as :math:`E[X^2] -
+E[X]^2`. The EV method correctly estimates :math:`E[X]`, so it also correctly
+estimates :math:`E[X]^2`. However, it systematically underestimates
+:math:`E[X^2]` because :math:`E[X^2]` places more weight on larger values. But
+an alternative method that accurately estimated variance would necessarily
+overestimate :math:`E[X]`. It's possible to force both mean and variance to be
+exactly correct by adjusting the value of each bin according to its z-score, but
+this could make other summary statistics less accurate.
+
+On bin sizing for two-sided distributions
+-----------------------------------------
+:any:`squigglepy.numeric_distribution.BinSizing` describes the various
+bin-sizing methods and their properties. ``BinSizing.ev`` is usually preferred.
+``BinSizing.ev`` allocates bins such that each bin contributes equally to the
+distribution's expected value. This has the nice property that bins are narrower
+in the regions that matter more for the distribution's EV, for example, in
+fat-tailed distributions it places more bins in the tail(s) and fewer bins in the
+center.
+
+The interpretation of the EV bin-sizing method is slightly non-obvious
+for two-sided distributions because we must decide how to interpret bins
+with negative expected value.
+
+The EV method arranges values into bins such that:
+ * The negative side has the correct negative contribution to EV and the
+ positive side has the correct positive contribution to EV.
+ * Every negative bin has equal contribution to EV and every positive bin
+ has equal contribution to EV.
+ * If a side has nonzero probability mass, then it has at least one bin,
+ regardless of how small its mass.
+ * The number of negative and positive bins are chosen such that the
+ absolute contribution to EV for negative bins is as close as possible
+ to the absolute contribution to EV for positive bins given the above
+ constraints.
+
+This binning method means that the distribution EV is exactly preserved
+and there is no bin that contains the value zero. However, the positive
+and negative bins do not necessarily have equal contribution to EV, and
+the magnitude of the error is at most ``1 / num_bins / 2``.
diff --git a/doc/source/reference/modules.rst b/doc/source/reference/modules.rst
new file mode 100644
index 0000000..57192bd
--- /dev/null
+++ b/doc/source/reference/modules.rst
@@ -0,0 +1,7 @@
+squigglepy
+==========
+
+.. toctree::
+ :maxdepth: 4
+
+ squigglepy
diff --git a/doc/source/reference/squigglepy.bayes.rst b/doc/source/reference/squigglepy.bayes.rst
new file mode 100644
index 0000000..146fe05
--- /dev/null
+++ b/doc/source/reference/squigglepy.bayes.rst
@@ -0,0 +1,7 @@
+squigglepy.bayes module
+=======================
+
+.. automodule:: squigglepy.bayes
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/source/reference/squigglepy.correlation.rst b/doc/source/reference/squigglepy.correlation.rst
new file mode 100644
index 0000000..15304d7
--- /dev/null
+++ b/doc/source/reference/squigglepy.correlation.rst
@@ -0,0 +1,7 @@
+squigglepy.correlation module
+=============================
+
+.. automodule:: squigglepy.correlation
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/source/reference/squigglepy.distributions.rst b/doc/source/reference/squigglepy.distributions.rst
new file mode 100644
index 0000000..46cf84b
--- /dev/null
+++ b/doc/source/reference/squigglepy.distributions.rst
@@ -0,0 +1,7 @@
+squigglepy.distributions module
+===============================
+
+.. automodule:: squigglepy.distributions
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/source/reference/squigglepy.numbers.rst b/doc/source/reference/squigglepy.numbers.rst
new file mode 100644
index 0000000..9640c0f
--- /dev/null
+++ b/doc/source/reference/squigglepy.numbers.rst
@@ -0,0 +1,7 @@
+squigglepy.numbers module
+=========================
+
+.. automodule:: squigglepy.numbers
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/source/reference/squigglepy.numeric_distribution.rst b/doc/source/reference/squigglepy.numeric_distribution.rst
new file mode 100644
index 0000000..2016c6e
--- /dev/null
+++ b/doc/source/reference/squigglepy.numeric_distribution.rst
@@ -0,0 +1,7 @@
+squigglepy.numeric\_distribution module
+=======================================
+
+.. automodule:: squigglepy.numeric_distribution
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/source/reference/squigglepy.rng.rst b/doc/source/reference/squigglepy.rng.rst
new file mode 100644
index 0000000..052b5a9
--- /dev/null
+++ b/doc/source/reference/squigglepy.rng.rst
@@ -0,0 +1,7 @@
+squigglepy.rng module
+=====================
+
+.. automodule:: squigglepy.rng
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/source/reference/squigglepy.rst b/doc/source/reference/squigglepy.rst
new file mode 100644
index 0000000..134cc43
--- /dev/null
+++ b/doc/source/reference/squigglepy.rst
@@ -0,0 +1,23 @@
+squigglepy package
+==================
+
+.. automodule:: squigglepy
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+Submodules
+----------
+
+.. toctree::
+
+ squigglepy.bayes
+ squigglepy.correlation
+ squigglepy.distributions
+ squigglepy.numbers
+ squigglepy.numeric_distribution
+ squigglepy.rng
+ squigglepy.samplers
+ squigglepy.utils
+ squigglepy.version
+
diff --git a/doc/source/reference/squigglepy.samplers.rst b/doc/source/reference/squigglepy.samplers.rst
new file mode 100644
index 0000000..cbd3be1
--- /dev/null
+++ b/doc/source/reference/squigglepy.samplers.rst
@@ -0,0 +1,7 @@
+squigglepy.samplers module
+==========================
+
+.. automodule:: squigglepy.samplers
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/source/reference/squigglepy.utils.rst b/doc/source/reference/squigglepy.utils.rst
new file mode 100644
index 0000000..1638b74
--- /dev/null
+++ b/doc/source/reference/squigglepy.utils.rst
@@ -0,0 +1,7 @@
+squigglepy.utils module
+=======================
+
+.. automodule:: squigglepy.utils
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/source/reference/squigglepy.version.rst b/doc/source/reference/squigglepy.version.rst
new file mode 100644
index 0000000..fe983ba
--- /dev/null
+++ b/doc/source/reference/squigglepy.version.rst
@@ -0,0 +1,7 @@
+squigglepy.version module
+=========================
+
+.. automodule:: squigglepy.version
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/doc/source/usage.rst b/doc/source/usage.rst
new file mode 100644
index 0000000..3e03a70
--- /dev/null
+++ b/doc/source/usage.rst
@@ -0,0 +1,476 @@
+Examples
+========
+
+Piano tuners example
+~~~~~~~~~~~~~~~~~~~~
+
+Here’s the Squigglepy implementation of `the example from Squiggle
+Docs <https://www.squiggle-language.com/docs/Overview>`__:
+
+.. code:: python
+
+ import squigglepy as sq
+ import numpy as np
+ import matplotlib.pyplot as plt
+ from squigglepy.numbers import K, M
+ from pprint import pprint
+
+ pop_of_ny_2022 = sq.to(8.1*M, 8.4*M) # This means that you're 90% confident the value is between 8.1 and 8.4 Million.
+ pct_of_pop_w_pianos = sq.to(0.2, 1) * 0.01 # We assume there are almost no people with multiple pianos
+ pianos_per_piano_tuner = sq.to(2*K, 50*K)
+ piano_tuners_per_piano = 1 / pianos_per_piano_tuner
+ total_tuners_in_2022 = pop_of_ny_2022 * pct_of_pop_w_pianos * piano_tuners_per_piano
+ samples = total_tuners_in_2022 @ 1000 # Note: `@ 1000` is shorthand to get 1000 samples
+
+ # Get mean and SD
+ print('Mean: {}, SD: {}'.format(round(np.mean(samples), 2),
+ round(np.std(samples), 2)))
+
+ # Get percentiles
+ pprint(sq.get_percentiles(samples, digits=0))
+
+ # Histogram
+ plt.hist(samples, bins=200)
+ plt.show()
+
+ # Shorter histogram
+ total_tuners_in_2022.plot()
+
+And the version from the Squiggle doc that incorporates time:
+
+.. code:: python
+
+ import squigglepy as sq
+ from squigglepy.numbers import K, M
+
+ pop_of_ny_2022 = sq.to(8.1*M, 8.4*M)
+ pct_of_pop_w_pianos = sq.to(0.2, 1) * 0.01
+ pianos_per_piano_tuner = sq.to(2*K, 50*K)
+ piano_tuners_per_piano = 1 / pianos_per_piano_tuner
+
+ def pop_at_time(t): # t = Time in years after 2022
+ avg_yearly_pct_change = sq.to(-0.01, 0.05) # We're expecting NYC to continuously grow with a mean of roughly between -1% and +5% per year
+ return pop_of_ny_2022 * ((avg_yearly_pct_change + 1) ** t)
+
+ def total_tuners_at_time(t):
+ return pop_at_time(t) * pct_of_pop_w_pianos * piano_tuners_per_piano
+
+ # Get total piano tuners at 2030
+ sq.get_percentiles(total_tuners_at_time(2030-2022) @ 1000)
+
+**WARNING:** Be careful about dividing by ``K``, ``M``, etc. ``1/2*K`` =
+500 in Python. Use ``1/(2*K)`` instead to get the expected outcome.
+
+**WARNING:** Be careful about using ``K`` to get sample counts. Use
+``sq.norm(2, 3) @ (2*K)``\ … ``sq.norm(2, 3) @ 2*K`` will return only
+two samples, multiplied by 1000.
+
+Distributions
+~~~~~~~~~~~~~
+
+.. code:: python
+
+ import squigglepy as sq
+
+ # Normal distribution
+ sq.norm(1, 3) # 90% interval from 1 to 3
+
+ # Distribution can be sampled with mean and sd too
+ sq.norm(mean=0, sd=1)
+
+ # Shorthand to get one sample
+ ~sq.norm(1, 3)
+
+ # Shorthand to get more than one sample
+ sq.norm(1, 3) @ 100
+
+ # Longhand version to get more than one sample
+ sq.sample(sq.norm(1, 3), n=100)
+
+ # Nice progress reporter
+ sq.sample(sq.norm(1, 3), n=1000, verbose=True)
+
+ # Other distributions exist
+ sq.lognorm(1, 10)
+ sq.tdist(1, 10, t=5)
+ sq.triangular(1, 2, 3)
+ sq.pert(1, 2, 3, lam=2)
+ sq.binomial(p=0.5, n=5)
+ sq.beta(a=1, b=2)
+ sq.bernoulli(p=0.5)
+ sq.poisson(10)
+ sq.chisquare(2)
+ sq.gamma(3, 2)
+ sq.pareto(1)
+ sq.exponential(scale=1)
+ sq.geometric(p=0.5)
+
+ # Discrete sampling
+ sq.discrete({'A': 0.1, 'B': 0.9})
+
+ # Can return integers
+ sq.discrete({0: 0.1, 1: 0.3, 2: 0.3, 3: 0.15, 4: 0.15})
+
+ # Alternate format (also can be used to return more complex objects)
+ sq.discrete([[0.1, 0],
+ [0.3, 1],
+ [0.3, 2],
+ [0.15, 3],
+ [0.15, 4]])
+
+ sq.discrete([0, 1, 2]) # No weights assumes equal weights
+
+ # You can mix distributions together
+ sq.mixture([sq.norm(1, 3),
+ sq.norm(4, 10),
+ sq.lognorm(1, 10)], # Distributions to mix
+ [0.3, 0.3, 0.4]) # These are the weights on each distribution
+
+ # This is equivalent to the above, just a different way of doing the notation
+ sq.mixture([[0.3, sq.norm(1,3)],
+ [0.3, sq.norm(4,10)],
+ [0.4, sq.lognorm(1,10)]])
+
+ # Make a zero-inflated distribution
+ # 60% chance of returning 0, 40% chance of sampling from `norm(1, 2)`.
+ sq.zero_inflated(0.6, sq.norm(1, 2))
+
+Additional features
+~~~~~~~~~~~~~~~~~~~
+
+.. code:: python
+
+ import squigglepy as sq
+
+ # You can add and subtract distributions
+ (sq.norm(1,3) + sq.norm(4,5)) @ 100
+ (sq.norm(1,3) - sq.norm(4,5)) @ 100
+ (sq.norm(1,3) * sq.norm(4,5)) @ 100
+ (sq.norm(1,3) / sq.norm(4,5)) @ 100
+
+ # You can also do math with numbers
+ ~((sq.norm(sd=5) + 2) * 2)
+ ~(-sq.lognorm(0.1, 1) * sq.pareto(1) / 10)
+
+ # You can change the CI from 90% (default) to 80%
+ sq.norm(1, 3, credibility=80)
+
+ # You can clip
+ sq.norm(0, 3, lclip=0, rclip=5) # Sample norm with a 90% CI from 0-3, but anything lower than 0 gets clipped to 0 and anything higher than 5 gets clipped to 5.
+
+ # You can also clip with a function, and use pipes
+ sq.norm(0, 3) >> sq.clip(0, 5)
+
+ # You can correlate continuous distributions
+ a, b = sq.uniform(-1, 1), sq.to(0, 3)
+ a, b = sq.correlate((a, b), 0.5) # Correlate a and b with a correlation of 0.5
+ # You can even pass your own correlation matrix!
+ a, b = sq.correlate((a, b), [[1, 0.5], [0.5, 1]])
+
+Example: Rolling a die
+^^^^^^^^^^^^^^^^^^^^^^
+
+An example of how to use distributions to build tools:
+
+.. code:: python
+
+ import squigglepy as sq
+
+ def roll_die(sides, n=1):
+ return sq.discrete(list(range(1, sides + 1))) @ n if sides > 0 else None
+
+ roll_die(sides=6, n=10)
+ # [2, 6, 5, 2, 6, 2, 3, 1, 5, 2]
+
+This is already included standard in the utils of this package. Use
+``sq.roll_die``.
+
+Bayesian inference
+~~~~~~~~~~~~~~~~~~
+
+1% of women at age forty who participate in routine screening have
+breast cancer. 80% of women with breast cancer will get positive
+mammographies. 9.6% of women without breast cancer will also get
+positive mammographies.
+
+A woman in this age group had a positive mammography in a routine
+screening. What is the probability that she actually has breast cancer?
+
+We can approximate the answer with a Bayesian network (uses rejection
+sampling):
+
+.. code:: python
+
+ import squigglepy as sq
+ from squigglepy import bayes
+ from squigglepy.numbers import M
+
+ def mammography(has_cancer):
+ return sq.event(0.8 if has_cancer else 0.096)
+
+ def define_event():
+ cancer = ~sq.bernoulli(0.01)
+ return({'mammography': mammography(cancer),
+ 'cancer': cancer})
+
+ bayes.bayesnet(define_event,
+ find=lambda e: e['cancer'],
+ conditional_on=lambda e: e['mammography'],
+ n=1*M)
+ # 0.07723995880535531
+
+Or if we have the information immediately on hand, we can directly
+calculate it. Though this doesn’t work for very complex stuff.
+
+.. code:: python
+
+ from squigglepy import bayes
+ bayes.simple_bayes(prior=0.01, likelihood_h=0.8, likelihood_not_h=0.096)
+ # 0.07763975155279504
+
+You can also make distributions and update them:
+
+.. code:: python
+
+ import matplotlib.pyplot as plt
+ import squigglepy as sq
+ from squigglepy import bayes
+ from squigglepy.numbers import K
+ import numpy as np
+
+ print('Prior')
+ prior = sq.norm(1,5)
+ prior_samples = prior @ (10*K)
+ plt.hist(prior_samples, bins = 200)
+ plt.show()
+ print(sq.get_percentiles(prior_samples))
+ print('Prior Mean: {} SD: {}'.format(np.mean(prior_samples), np.std(prior_samples)))
+ print('-')
+
+ print('Evidence')
+ evidence = sq.norm(2,3)
+ evidence_samples = evidence @ (10*K)
+ plt.hist(evidence_samples, bins = 200)
+ plt.show()
+ print(sq.get_percentiles(evidence_samples))
+ print('Evidence Mean: {} SD: {}'.format(np.mean(evidence_samples), np.std(evidence_samples)))
+ print('-')
+
+ print('Posterior')
+ posterior = bayes.update(prior, evidence)
+ posterior_samples = posterior @ (10*K)
+ plt.hist(posterior_samples, bins = 200)
+ plt.show()
+ print(sq.get_percentiles(posterior_samples))
+ print('Posterior Mean: {} SD: {}'.format(np.mean(posterior_samples), np.std(posterior_samples)))
+
+ print('Average')
+ average = bayes.average(prior, evidence)
+ average_samples = average @ (10*K)
+ plt.hist(average_samples, bins = 200)
+ plt.show()
+ print(sq.get_percentiles(average_samples))
+ print('Average Mean: {} SD: {}'.format(np.mean(average_samples), np.std(average_samples)))
+
+Example: Alarm net
+^^^^^^^^^^^^^^^^^^
+
+This is the alarm network from `Bayesian Artificial Intelligence -
+Section 2.5.1
+<https://bayesian-intelligence.com/publications/bai/book/BAI_Chapter2.pdf>`__:
+
+ Assume your house has an alarm system against burglary.
+
+   You live in a seismically active area and the alarm system can
+   occasionally be set off by an earthquake.
+
+ You have two neighbors, Mary and John, who do not know each other. If
+ they hear the alarm they call you, but this is not guaranteed.
+
+ The chance of a burglary on a particular day is 0.1%. The chance of
+ an earthquake on a particular day is 0.2%.
+
+ The alarm will go off 95% of the time with both a burglary and an
+ earthquake, 94% of the time with just a burglary, 29% of the time
+ with just an earthquake, and 0.1% of the time with nothing (total
+ false alarm).
+
+ John will call you 90% of the time when the alarm goes off. But on 5%
+ of the days, John will just call to say “hi”. Mary will call you 70%
+ of the time when the alarm goes off. But on 1% of the days, Mary will
+ just call to say “hi”.
+
+.. code:: python
+
+ import squigglepy as sq
+ from squigglepy import bayes
+ from squigglepy.numbers import M
+
+ def p_alarm_goes_off(burglary, earthquake):
+ if burglary and earthquake:
+ return 0.95
+ elif burglary and not earthquake:
+ return 0.94
+ elif not burglary and earthquake:
+ return 0.29
+ elif not burglary and not earthquake:
+ return 0.001
+
+ def p_john_calls(alarm_goes_off):
+ return 0.9 if alarm_goes_off else 0.05
+
+ def p_mary_calls(alarm_goes_off):
+ return 0.7 if alarm_goes_off else 0.01
+
+ def define_event():
+ burglary_happens = sq.event(p=0.001)
+ earthquake_happens = sq.event(p=0.002)
+ alarm_goes_off = sq.event(p_alarm_goes_off(burglary_happens, earthquake_happens))
+ john_calls = sq.event(p_john_calls(alarm_goes_off))
+ mary_calls = sq.event(p_mary_calls(alarm_goes_off))
+ return {'burglary': burglary_happens,
+ 'earthquake': earthquake_happens,
+ 'alarm_goes_off': alarm_goes_off,
+ 'john_calls': john_calls,
+ 'mary_calls': mary_calls}
+
+ # What are the chances that both John and Mary call if an earthquake happens?
+ bayes.bayesnet(define_event,
+ n=1*M,
+ find=lambda e: (e['mary_calls'] and e['john_calls']),
+ conditional_on=lambda e: e['earthquake'])
+ # Result will be ~0.19, though it varies because it is based on a random sample.
+ # This also may take a minute to run.
+
+ # If both John and Mary call, what is the chance there's been a burglary?
+ bayes.bayesnet(define_event,
+ n=1*M,
+ find=lambda e: e['burglary'],
+ conditional_on=lambda e: (e['mary_calls'] and e['john_calls']))
+ # Result will be ~0.27, though it varies because it is based on a random sample.
+ # This will run quickly because there is a built-in cache.
+ # Use `cache=False` to not build a cache and `reload_cache=True` to recalculate the cache.
+
+Note that the amount of Bayesian analysis that squigglepy can do is
+pretty limited. For more complex Bayesian analysis, consider
+`sorobn <https://github.com/MaxHalford/sorobn>`__,
+`pomegranate <https://github.com/jmschrei/pomegranate>`__,
+`bnlearn <https://github.com/erdogant/bnlearn>`__, or
+`pyMC <https://github.com/pymc-devs/pymc>`__.
+
+Example: A demonstration of the Monty Hall Problem
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code:: python
+
+ import squigglepy as sq
+ from squigglepy import bayes
+ from squigglepy.numbers import K, M, B, T
+
+
+ def monte_hall(door_picked, switch=False):
+ doors = ['A', 'B', 'C']
+ car_is_behind_door = ~sq.discrete(doors)
+ reveal_door = ~sq.discrete([d for d in doors if d != door_picked and d != car_is_behind_door])
+
+ if switch:
+ old_door_picked = door_picked
+ door_picked = [d for d in doors if d != old_door_picked and d != reveal_door][0]
+
+ won_car = (car_is_behind_door == door_picked)
+ return won_car
+
+
+ def define_event():
+ door = ~sq.discrete(['A', 'B', 'C'])
+ switch = sq.event(0.5)
+ return {'won': monte_hall(door_picked=door, switch=switch),
+ 'switched': switch}
+
+ RUNS = 10*K
+ r = bayes.bayesnet(define_event,
+ find=lambda e: e['won'],
+ conditional_on=lambda e: e['switched'],
+ verbose=True,
+ n=RUNS)
+ print('Win {}% of the time when switching'.format(int(r * 100)))
+
+ r = bayes.bayesnet(define_event,
+ find=lambda e: e['won'],
+ conditional_on=lambda e: not e['switched'],
+ verbose=True,
+ n=RUNS)
+ print('Win {}% of the time when not switching'.format(int(r * 100)))
+
+ # Win 66% of the time when switching
+ # Win 34% of the time when not switching
+
+Example: More complex coin/dice interactions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+ Imagine that I flip a coin. If heads, I take a random die out of my
+ blue bag. If tails, I take a random die out of my red bag. The blue
+ bag contains only 6-sided dice. The red bag contains a 4-sided die, a
+ 6-sided die, a 10-sided die, and a 20-sided die. I then roll the
+ random die I took. What is the chance that I roll a 6?
+
+.. code:: python
+
+ import squigglepy as sq
+ from squigglepy.numbers import K, M, B, T
+ from squigglepy import bayes
+
+ def define_event():
+ if sq.flip_coin() == 'heads': # Blue bag
+ return sq.roll_die(6)
+ else: # Red bag
+ return sq.discrete([4, 6, 10, 20]) >> sq.roll_die
+
+
+ bayes.bayesnet(define_event,
+ find=lambda e: e == 6,
+ verbose=True,
+ n=100*K)
+ # This run for me returned 0.12306 which is pretty close to the correct answer of 0.12292
+
+Kelly betting
+~~~~~~~~~~~~~
+
+You can take a generated probability and combine it with a bankroll to
+determine bet sizing using the `Kelly
+criterion <https://en.wikipedia.org/wiki/Kelly_criterion>`__.
+
+For example, if you want to Kelly bet and you’ve…
+
+- determined that your price (your probability of the event in question
+ happening / the market in question resolving in your favor) is $0.70
+ (70%)
+- see that the market is pricing at $0.65
+- you have a bankroll of $1000 that you are willing to bet
+
+You should bet as follows:
+
+.. code:: python
+
+ import squigglepy as sq
+ kelly_data = sq.kelly(my_price=0.70, market_price=0.65, bankroll=1000)
+ kelly_data['kelly'] # What fraction of my bankroll should I bet on this?
+ # 0.143
+ kelly_data['target'] # How much money should be invested in this?
+ # 142.86
+ kelly_data['expected_roi'] # What is the expected ROI of this bet?
+ # 0.077
+
+More examples
+~~~~~~~~~~~~~
+
+You can see more examples of squigglepy in action
+`here <https://github.com/rethinkpriorities/squigglepy/blob/main/EXAMPLES.md>`__.
+
+Run tests
+---------
+
+Use ``black .`` for formatting.
+
+Run
+``ruff check . && pytest && pip3 install . && python3 tests/integration.py``
diff --git a/poetry.lock b/poetry.lock
index 86c4e93..42f9236 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,5 +1,30 @@
# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
+[[package]]
+name = "accessible-pygments"
+version = "0.0.4"
+description = "A collection of accessible pygments styles"
+optional = false
+python-versions = "*"
+files = [
+ {file = "accessible-pygments-0.0.4.tar.gz", hash = "sha256:e7b57a9b15958e9601c7e9eb07a440c813283545a20973f2574a5f453d0e953e"},
+ {file = "accessible_pygments-0.0.4-py2.py3-none-any.whl", hash = "sha256:416c6d8c1ea1c5ad8701903a20fcedf953c6e720d64f33dc47bfb2d3f2fa4e8d"},
+]
+
+[package.dependencies]
+pygments = ">=1.5"
+
+[[package]]
+name = "alabaster"
+version = "0.7.13"
+description = "A configurable sidebar-enabled Sphinx theme"
+optional = false
+python-versions = ">=3.6"
+files = [
+ {file = "alabaster-0.7.13-py3-none-any.whl", hash = "sha256:1ee19aca801bbabb5ba3f5f258e4422dfa86f82f3e9cefb0859b283cdd7f62a3"},
+ {file = "alabaster-0.7.13.tar.gz", hash = "sha256:a27a4a084d5e690e16e01e03ad2b2e552c61a65469419b907243193de1a84ae2"},
+]
+
[[package]]
name = "ansi2html"
version = "1.8.0"
@@ -33,6 +58,38 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-
tests = ["attrs[tests-no-zope]", "zope-interface"]
tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
+[[package]]
+name = "babel"
+version = "2.13.1"
+description = "Internationalization utilities"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "Babel-2.13.1-py3-none-any.whl", hash = "sha256:7077a4984b02b6727ac10f1f7294484f737443d7e2e66c5e4380e41a3ae0b4ed"},
+ {file = "Babel-2.13.1.tar.gz", hash = "sha256:33e0952d7dd6374af8dbf6768cc4ddf3ccfefc244f9986d4074704f2fbd18900"},
+]
+
+[package.extras]
+dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"]
+
+[[package]]
+name = "beautifulsoup4"
+version = "4.12.2"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.6.0"
+files = [
+ {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"},
+ {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"},
+]
+
+[package.dependencies]
+soupsieve = ">1.2"
+
+[package.extras]
+html5lib = ["html5lib"]
+lxml = ["lxml"]
+
[[package]]
name = "black"
version = "23.3.0"
@@ -438,6 +495,17 @@ files = [
[package.extras]
graph = ["objgraph (>=1.7.2)"]
+[[package]]
+name = "docutils"
+version = "0.20.1"
+description = "Docutils -- Python Documentation Utilities"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6"},
+ {file = "docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b"},
+]
+
[[package]]
name = "exceptiongroup"
version = "1.1.1"
@@ -602,6 +670,17 @@ files = [
{file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"},
]
+[[package]]
+name = "imagesize"
+version = "1.4.1"
+description = "Getting image size from png/jpeg/jpeg2000/gif file"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
+files = [
+ {file = "imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b"},
+ {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"},
+]
+
[[package]]
name = "importlib-metadata"
version = "6.7.0"
@@ -1369,6 +1448,33 @@ files = [
[package.extras]
test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
+[[package]]
+name = "pydata-sphinx-theme"
+version = "0.14.3"
+description = "Bootstrap-based Sphinx theme from the PyData community"
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "pydata_sphinx_theme-0.14.3-py3-none-any.whl", hash = "sha256:b7e40cd75a20449adfe2d7525be379b9fe92f6d31e5233e449fa34ddcd4398d9"},
+ {file = "pydata_sphinx_theme-0.14.3.tar.gz", hash = "sha256:bd474f347895f3fc5b6ce87390af64330ee54f11ebf9660d5bc3f87d532d4e5c"},
+]
+
+[package.dependencies]
+accessible-pygments = "*"
+Babel = "*"
+beautifulsoup4 = "*"
+docutils = "!=0.17.0"
+packaging = "*"
+pygments = ">=2.7"
+sphinx = ">=5.0"
+typing-extensions = "*"
+
+[package.extras]
+a11y = ["pytest-playwright"]
+dev = ["nox", "pre-commit", "pydata-sphinx-theme[doc,test]", "pyyaml"]
+doc = ["ablog (>=0.11.0rc2)", "colorama", "ipykernel", "ipyleaflet", "jupyter_sphinx", "jupyterlite-sphinx", "linkify-it-py", "matplotlib", "myst-parser", "nbsphinx", "numpy", "numpydoc", "pandas", "plotly", "rich", "sphinx-autoapi (>=3.0.0)", "sphinx-copybutton", "sphinx-design", "sphinx-favicon (>=1.0.1)", "sphinx-sitemap", "sphinx-togglebutton", "sphinxcontrib-youtube (<1.4)", "sphinxext-rediraffe", "xarray"]
+test = ["pytest", "pytest-cov", "pytest-regressions"]
+
[[package]]
name = "pygments"
version = "2.15.1"
@@ -1659,6 +1765,17 @@ files = [
{file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
]
+[[package]]
+name = "snowballstemmer"
+version = "2.2.0"
+description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms."
+optional = false
+python-versions = "*"
+files = [
+ {file = "snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"},
+ {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"},
+]
+
[[package]]
name = "sortedcontainers"
version = "2.4.0"
@@ -1670,6 +1787,156 @@ files = [
{file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"},
]
+[[package]]
+name = "soupsieve"
+version = "2.5"
+description = "A modern CSS selector implementation for Beautiful Soup."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"},
+ {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
+]
+
+[[package]]
+name = "sphinx"
+version = "7.2.6"
+description = "Python documentation generator"
+optional = false
+python-versions = ">=3.9"
+files = [
+ {file = "sphinx-7.2.6-py3-none-any.whl", hash = "sha256:1e09160a40b956dc623c910118fa636da93bd3ca0b9876a7b3df90f07d691560"},
+ {file = "sphinx-7.2.6.tar.gz", hash = "sha256:9a5160e1ea90688d5963ba09a2dcd8bdd526620edbb65c328728f1b2228d5ab5"},
+]
+
+[package.dependencies]
+alabaster = ">=0.7,<0.8"
+babel = ">=2.9"
+colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
+docutils = ">=0.18.1,<0.21"
+imagesize = ">=1.3"
+importlib-metadata = {version = ">=4.8", markers = "python_version < \"3.10\""}
+Jinja2 = ">=3.0"
+packaging = ">=21.0"
+Pygments = ">=2.14"
+requests = ">=2.25.0"
+snowballstemmer = ">=2.0"
+sphinxcontrib-applehelp = "*"
+sphinxcontrib-devhelp = "*"
+sphinxcontrib-htmlhelp = ">=2.0.0"
+sphinxcontrib-jsmath = "*"
+sphinxcontrib-qthelp = "*"
+sphinxcontrib-serializinghtml = ">=1.1.9"
+
+[package.extras]
+docs = ["sphinxcontrib-websupport"]
+lint = ["docutils-stubs", "flake8 (>=3.5.0)", "flake8-simplify", "isort", "mypy (>=0.990)", "ruff", "sphinx-lint", "types-requests"]
+test = ["cython (>=3.0)", "filelock", "html5lib", "pytest (>=4.6)", "setuptools (>=67.0)"]
+
+[[package]]
+name = "sphinxcontrib-applehelp"
+version = "1.0.7"
+description = "sphinxcontrib-applehelp is a Sphinx extension which outputs Apple help books"
+optional = false
+python-versions = ">=3.9"
+files = [
+ {file = "sphinxcontrib_applehelp-1.0.7-py3-none-any.whl", hash = "sha256:094c4d56209d1734e7d252f6e0b3ccc090bd52ee56807a5d9315b19c122ab15d"},
+ {file = "sphinxcontrib_applehelp-1.0.7.tar.gz", hash = "sha256:39fdc8d762d33b01a7d8f026a3b7d71563ea3b72787d5f00ad8465bd9d6dfbfa"},
+]
+
+[package.dependencies]
+Sphinx = ">=5"
+
+[package.extras]
+lint = ["docutils-stubs", "flake8", "mypy"]
+test = ["pytest"]
+
+[[package]]
+name = "sphinxcontrib-devhelp"
+version = "1.0.5"
+description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp documents"
+optional = false
+python-versions = ">=3.9"
+files = [
+ {file = "sphinxcontrib_devhelp-1.0.5-py3-none-any.whl", hash = "sha256:fe8009aed765188f08fcaadbb3ea0d90ce8ae2d76710b7e29ea7d047177dae2f"},
+ {file = "sphinxcontrib_devhelp-1.0.5.tar.gz", hash = "sha256:63b41e0d38207ca40ebbeabcf4d8e51f76c03e78cd61abe118cf4435c73d4212"},
+]
+
+[package.dependencies]
+Sphinx = ">=5"
+
+[package.extras]
+lint = ["docutils-stubs", "flake8", "mypy"]
+test = ["pytest"]
+
+[[package]]
+name = "sphinxcontrib-htmlhelp"
+version = "2.0.4"
+description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files"
+optional = false
+python-versions = ">=3.9"
+files = [
+ {file = "sphinxcontrib_htmlhelp-2.0.4-py3-none-any.whl", hash = "sha256:8001661c077a73c29beaf4a79968d0726103c5605e27db92b9ebed8bab1359e9"},
+ {file = "sphinxcontrib_htmlhelp-2.0.4.tar.gz", hash = "sha256:6c26a118a05b76000738429b724a0568dbde5b72391a688577da08f11891092a"},
+]
+
+[package.dependencies]
+Sphinx = ">=5"
+
+[package.extras]
+lint = ["docutils-stubs", "flake8", "mypy"]
+test = ["html5lib", "pytest"]
+
+[[package]]
+name = "sphinxcontrib-jsmath"
+version = "1.0.1"
+description = "A sphinx extension which renders display math in HTML via JavaScript"
+optional = false
+python-versions = ">=3.5"
+files = [
+ {file = "sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"},
+ {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"},
+]
+
+[package.extras]
+test = ["flake8", "mypy", "pytest"]
+
+[[package]]
+name = "sphinxcontrib-qthelp"
+version = "1.0.6"
+description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp documents"
+optional = false
+python-versions = ">=3.9"
+files = [
+ {file = "sphinxcontrib_qthelp-1.0.6-py3-none-any.whl", hash = "sha256:bf76886ee7470b934e363da7a954ea2825650013d367728588732c7350f49ea4"},
+ {file = "sphinxcontrib_qthelp-1.0.6.tar.gz", hash = "sha256:62b9d1a186ab7f5ee3356d906f648cacb7a6bdb94d201ee7adf26db55092982d"},
+]
+
+[package.dependencies]
+Sphinx = ">=5"
+
+[package.extras]
+lint = ["docutils-stubs", "flake8", "mypy"]
+test = ["pytest"]
+
+[[package]]
+name = "sphinxcontrib-serializinghtml"
+version = "1.1.9"
+description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)"
+optional = false
+python-versions = ">=3.9"
+files = [
+ {file = "sphinxcontrib_serializinghtml-1.1.9-py3-none-any.whl", hash = "sha256:9b36e503703ff04f20e9675771df105e58aa029cfcbc23b8ed716019b7416ae1"},
+ {file = "sphinxcontrib_serializinghtml-1.1.9.tar.gz", hash = "sha256:0c64ff898339e1fac29abd2bf5f11078f3ec413cfe9c046d3120d7ca65530b54"},
+]
+
+[package.dependencies]
+Sphinx = ">=5"
+
+[package.extras]
+lint = ["docutils-stubs", "flake8", "mypy"]
+test = ["pytest"]
+
[[package]]
name = "tenacity"
version = "8.2.2"
@@ -1809,4 +2076,4 @@ plots = ["matplotlib"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.9,<3.12"
-content-hash = "5625f6f3eec6de2121f93d9cb0591c23402a9c232f824ce8353c50b7441e3a9d"
+content-hash = "f9f1beabf7d339e2173f854e8559f6bd9f1fbb28d711802c3d0d831a7b1fa2b5"
diff --git a/pyproject.toml b/pyproject.toml
index d4e93d2..b35de76 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,7 @@ pathos = "^0.3.0"
msgspec = "^0.15.1"
matplotlib = { version = "^3.7.1", optional = true }
pandas = { version = "^2.0.2", optional = true }
+pydata-sphinx-theme = "^0.14.3"
[tool.poetry.group.dev.dependencies]
diff --git a/squigglepy/__init__.py b/squigglepy/__init__.py
index 9ac7350..4a61a20 100644
--- a/squigglepy/__init__.py
+++ b/squigglepy/__init__.py
@@ -1,5 +1,6 @@
from .distributions import * # noqa ignore=F405
from .numbers import * # noqa ignore=F405
+from .numeric_distribution import * # noqa ignore=F405
from .samplers import * # noqa ignore=F405
from .utils import * # noqa ignore=F405
from .rng import * # noqa ignore=F405
diff --git a/squigglepy/bayes.py b/squigglepy/bayes.py
index 415f29a..e3fec17 100644
--- a/squigglepy/bayes.py
+++ b/squigglepy/bayes.py
@@ -1,3 +1,7 @@
+"""
+This modules includes functions for Bayesian inference.
+"""
+
import os
import time
import math
diff --git a/squigglepy/correlation.py b/squigglepy/correlation.py
index abd20b2..0108940 100644
--- a/squigglepy/correlation.py
+++ b/squigglepy/correlation.py
@@ -107,9 +107,10 @@ def correlate(
>>> solar_radiation, temperature = sq.gamma(300, 100), sq.to(22, 28)
>>> solar_radiation, temperature = sq.correlate((solar_radiation, temperature), 0.7)
>>> print(np.corrcoef(solar_radiation @ 1000, temperature @ 1000)[0, 1])
- 0.6975960649767123
+ 0.6975960649767123
Or you could pass a correlation matrix:
+
>>> funding_gap, cost_per_delivery, effect_size = (
sq.to(20_000, 80_000), sq.to(30, 80), sq.beta(2, 5)
)
@@ -118,9 +119,9 @@ def correlate(
[[1, 0.6, -0.5], [0.6, 1, -0.2], [-0.5, -0.2, 1]]
)
>>> print(np.corrcoef(funding_gap @ 1000, cost_per_delivery @ 1000, effect_size @ 1000))
- array([[ 1. , 0.580520 , -0.480149],
- [ 0.580962, 1. , -0.187831],
- [-0.480149, -0.187831 , 1. ]])
+ array([[ 1. , 0.580520 , -0.480149],
+ [ 0.580962, 1. , -0.187831],
+ [-0.480149, -0.187831 , 1. ]])
"""
if not isinstance(variables, tuple):
diff --git a/squigglepy/distributions.py b/squigglepy/distributions.py
index 413e749..67b2082 100644
--- a/squigglepy/distributions.py
+++ b/squigglepy/distributions.py
@@ -1,11 +1,24 @@
-import operator
+"""
+A collection of probability distributions and functions to operate on them.
+"""
+
import math
import numpy as np
+from numpy import exp, log, pi, sqrt
+import operator
import scipy.stats
+from scipy import special
+from scipy.special import erfc, erfinv
+import warnings
from typing import Optional, Union
-from .utils import _process_weights_values, _is_numpy, is_dist, _round
+from .utils import (
+ _process_weights_values,
+ _is_numpy,
+ is_dist,
+ _round,
+)
from .version import __version__
from .correlation import CorrelationGroup
@@ -58,6 +71,64 @@ def __repr__(self):
return self.__str__() + f" (version {self._version})"
+class IntegrableEVDistribution(ABC):
+ """
+ A base class for distributions that can be integrated to find the
+ contribution to expected value.
+ """
+
+ @abstractmethod
+ def contribution_to_ev(self, x: Union[np.ndarray, float], normalized: bool = True):
+ """Find the fraction of this distribution's absolute expected value
+ given by the portion of the distribution that lies to the left of x.
+ For a distribution with support on [a, b], ``contribution_to_ev(a) =
+ 0`` and ``contribution_to_ev(b) = 1``.
+
+ ``contribution_to_ev(x, normalized=False)`` is defined as
+
+ .. math:: \\int_a^x |t| f(t) dt
+
+ where `f(t)` is the PDF of the normal distribution. ``normalized=True``
+ divides this result by ``contribution_to_ev(b, normalized=False)``.
+
+ Note that this is different from the partial expected value, which is
+ defined as
+
+ .. math:: \\int_{x}^\\infty t f_X(t | X > x) dt
+
+ This function does not respect lclip/rclip.
+
+ Parameters
+ ----------
+ x : array-like
+ The value(s) to find the contribution to expected value for.
+ normalized : bool (default=True)
+ If True, normalize the result such that the return value is a
+ fraction (between 0 and 1). If False, return the raw integral
+ value.
+
+ """
+ # TODO: can compute this via numeric integration for any scipy
+ # distribution using something like
+ #
+ # scipy_dist.expect(lambda x: abs(x), ub=x, loc=loc, scale=scale)
+ #
+ # This is equivalent to contribution_to_ev(x, normalized=False).
+ # I tested that this works correctly for normal and lognormal dists.
+ ...
+
+ @abstractmethod
+ def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]):
+ """For a given fraction of expected value, find the number such that
+ that fraction lies to the left of that number. The inverse of
+ :func:`contribution_to_ev`.
+
+ This function is analogous to scipy.stats.ppf except that
+ the integrand is :math:`|x| f(x) dx` instead of :math:`f(x) dx`.
+ """
+ ...
+
+
class OperableDistribution(BaseDistribution):
def __init__(self):
super().__init__()
@@ -652,7 +723,7 @@ def const(x):
return ConstantDistribution(x)
-class UniformDistribution(ContinuousDistribution):
+class UniformDistribution(ContinuousDistribution, IntegrableEVDistribution):
def __init__(self, x, y):
super().__init__()
self.x = x
@@ -662,6 +733,40 @@ def __init__(self, x, y):
def __str__(self):
return " uniform({}, {})".format(self.x, self.y)
+ def contribution_to_ev(self, x: Union[np.ndarray, float], normalized=True):
+ x = np.asarray(x)
+ a = self.x
+ b = self.y
+
+ x = np.where(x < a, a, x)
+ x = np.where(x > b, b, x)
+
+ fraction = np.squeeze((x**2 * np.sign(x) - a**2 * np.sign(a)) / (2 * (b - a)))
+ if not normalized:
+ return fraction
+ normalizer = self.contribution_to_ev(b, normalized=False)
+ return fraction / normalizer
+
+ def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]):
+ if isinstance(fraction, float) or isinstance(fraction, int):
+ fraction = np.array([fraction])
+
+ if any(fraction < 0) or any(fraction > 1):
+ raise ValueError(f"fraction must be >= 0 and <= 1, not {fraction}")
+
+ a = self.x
+ b = self.y
+
+ pos_sol = np.sqrt(
+ np.abs((1 - fraction) * a**2 * np.sign(a) + fraction * b**2 * np.sign(b))
+ )
+ neg_sol = -pos_sol
+
+ if fraction < self.contribution_to_ev(0):
+ return neg_sol
+ else:
+ return pos_sol
+
def uniform(x, y):
"""
@@ -686,8 +791,17 @@ def uniform(x, y):
return UniformDistribution(x=x, y=y)
-class NormalDistribution(ContinuousDistribution):
- def __init__(self, x=None, y=None, mean=None, sd=None, credibility=90, lclip=None, rclip=None):
+class NormalDistribution(ContinuousDistribution, IntegrableEVDistribution):
+ def __init__(
+ self,
+ x=None,
+ y=None,
+ mean=None,
+ sd=None,
+ credibility=90,
+ lclip=None,
+ rclip=None,
+ ):
super().__init__()
self.x = x
self.y = y
@@ -722,6 +836,119 @@ def __str__(self):
out += ")"
return out
+ def contribution_to_ev(self, x: Union[np.ndarray, float], normalized=True):
+ x = np.asarray(x)
+ mu = self.mean
+ sigma = self.sd
+ sigma_scalar = sigma / sqrt(2 * pi)
+
+ # The definite integral from the formula for EV, evaluated from -inf to
+ # x. Evaluating from -inf to +inf would give the EV. This number alone
+ # doesn't tell us the contribution to EV because it is negative for x <
+ # 0. Note: This computes ``erf((x - mu) / (sigma * sqrt(2))) -
+ # erf(-inf)`` as ``erfc(-(x - mu) / (sigma * sqrt(2)))`` to avoid
+ # floating point rounding issues.
+ normal_integral = 0.5 * mu * erfc((mu - x) / (sigma * sqrt(2))) - sigma_scalar * (
+ exp(-((x - mu) ** 2) / sigma**2 / 2)
+ )
+
+ # The absolute value of the integral from -infinity to 0. When
+ # evaluating the formula for normal dist EV, all the values up to zero
+ # contribute negatively to EV, so we flip the sign on these.
+ zero_term = -(
+ 0.5 * mu * erfc(mu / (sigma * sqrt(2)))
+ - sigma_scalar * (exp(-((-mu) ** 2) / sigma**2 / 2))
+ )
+
+ # Fix floating point rounding issue where if x is very small, these
+ # values can end up on the wrong side of zero. This happens when mu is
+ # big and x is far out on the tail, so erf_term(x) is close enough to
+ # 0.5 * mu that the difference cannot be faithfully represented as a
+ # float, and it ends up exaggerating the difference.
+
+ # When x >= 0, add zero_term to get contribution_to_ev(0) up to 0. Then
+ # add zero_term again because that's how much of the contribution to EV
+ # we already integrated. When x < 0, we don't need to adjust
+ # normal_integral for the negative left values, but normal_integral is
+ # negative so we flip the sign.
+ contribution = np.where(x >= 0, 2 * zero_term + normal_integral, -normal_integral)
+
+ # Normalize by the total integral over abs(x) * PDF(x). Note: We cannot
+ # use scipy.stats.foldnorm because its scale parameter is not the same
+ # thing as sigma, and I don't know how to translate.
+ abs_mean = mu + 2 * zero_term
+ return np.squeeze(contribution) / (abs_mean if normalized else 1)
+
+ def _derivative_contribution_to_ev(self, x: np.ndarray):
+ mu = self.mean
+ sigma = self.sd
+ deriv = x * exp(-((mu - abs(x)) ** 2) / (2 * sigma**2)) / (sigma * sqrt(2 * pi))
+ return deriv
+
+ def inv_contribution_to_ev(
+ self, fraction: Union[np.ndarray, float], full_output: bool = False
+ ):
+ if isinstance(fraction, float) or isinstance(fraction, int):
+ fraction = np.array([fraction])
+ mu = self.mean
+ sigma = self.sd
+ tolerance = 1e-8
+
+ if any(fraction < 0) or any(fraction > 1):
+ raise ValueError(f"fraction must be >= 0 and <= 1, not {fraction}")
+
+ # Approximate using Newton's method. Sometimes this has trouble
+ # converging b/c it diverges or gets caught in a cycle, so use binary
+ # search as a fallback.
+ guess = np.full_like(fraction, mu)
+ max_iter = 10
+ newton_iter = 0
+ binary_iter = 0
+ converged = False
+
+ # Catch warnings because Newton's method often causes divisions by
+ # zero. If that does happen, we will just fall back to binary search.
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ for newton_iter in range(max_iter):
+ root = self.contribution_to_ev(guess) - fraction
+ if all(abs(root) < tolerance):
+ converged = True
+ break
+ deriv = self._derivative_contribution_to_ev(guess)
+ if all(deriv == 0):
+ break
+ guess = np.where(abs(deriv) == 0, guess, guess - root / deriv)
+
+ if not converged:
+ # Approximate using binary search (RIP)
+ lower = np.full_like(fraction, scipy.stats.norm.ppf(1e-10, mu, scale=sigma))
+ upper = np.full_like(fraction, scipy.stats.norm.ppf(1 - 1e-10, mu, scale=sigma))
+ guess = scipy.stats.norm.ppf(fraction, mu, scale=sigma)
+ max_iter = 50
+ for binary_iter in range(max_iter):
+ y = self.contribution_to_ev(guess)
+ diff = y - fraction
+ if all(abs(diff) < tolerance):
+ converged = True
+ break
+ lower = np.where(diff < 0, guess, lower)
+ upper = np.where(diff > 0, guess, upper)
+ guess = (lower + upper) / 2
+
+ if full_output:
+ return (
+ np.squeeze(guess),
+ {
+ "success": converged,
+ "newton_iterations": newton_iter,
+ "binary_search_iterations": binary_iter,
+ "used_binary_search": binary_iter > 0,
+ },
+ )
+ else:
+ return np.squeeze(guess)
+
def norm(
x=None, y=None, credibility=90, mean=None, sd=None, lclip=None, rclip=None
@@ -762,11 +989,17 @@ def norm(
norm(mean=1, sd=2)
"""
return NormalDistribution(
- x=x, y=y, credibility=credibility, mean=mean, sd=sd, lclip=lclip, rclip=rclip
+ x=x,
+ y=y,
+ credibility=credibility,
+ mean=mean,
+ sd=sd,
+ lclip=lclip,
+ rclip=rclip,
)
-class LognormalDistribution(ContinuousDistribution):
+class LognormalDistribution(ContinuousDistribution, IntegrableEVDistribution):
def __init__(
self,
x=None,
@@ -817,21 +1050,24 @@ def __init__(
self.lognorm_mean = 1
if self.x is not None:
- self.norm_mean = (np.log(self.x) + np.log(self.y)) / 2
+ self.norm_mean = (log(self.x) + log(self.y)) / 2
cdf_value = 0.5 + 0.5 * (self.credibility / 100)
normed_sigma = scipy.stats.norm.ppf(cdf_value)
- self.norm_sd = (np.log(self.y) - self.norm_mean) / normed_sigma
+ self.norm_sd = (log(self.y) - self.norm_mean) / normed_sigma
if self.lognorm_sd is None:
- self.lognorm_mean = np.exp(self.norm_mean + self.norm_sd**2 / 2)
+ self.lognorm_mean = exp(self.norm_mean + self.norm_sd**2 / 2)
self.lognorm_sd = (
- (np.exp(self.norm_sd**2) - 1) * np.exp(2 * self.norm_mean + self.norm_sd**2)
+ (exp(self.norm_sd**2) - 1) * exp(2 * self.norm_mean + self.norm_sd**2)
) ** 0.5
elif self.norm_sd is None:
- self.norm_mean = np.log(
- (self.lognorm_mean**2 / np.sqrt(self.lognorm_sd**2 + self.lognorm_mean**2))
+ self.norm_mean = log(
+ (self.lognorm_mean**2 / sqrt(self.lognorm_sd**2 + self.lognorm_mean**2))
)
- self.norm_sd = np.sqrt(np.log(1 + self.lognorm_sd**2 / self.lognorm_mean**2))
+ self.norm_sd = sqrt(log(1 + self.lognorm_sd**2 / self.lognorm_mean**2))
+
+ # Cached value for calculating ``contribution_to_ev``
+ self._EV_DENOM = sqrt(2) * self.norm_sd
def __str__(self):
out = " lognorm(lognorm_mean={}, lognorm_sd={}, norm_mean={}, norm_sd={}"
@@ -848,6 +1084,39 @@ def __str__(self):
out += ")"
return out
+    def contribution_to_ev(self, x, normalized=True):
+        """Partial expectation: the contribution to expected value from the
+        part of the distribution lying to the left of ``x`` (the integral of
+        ``t * f(t) dt`` from 0 to ``x``). If ``normalized`` is True the result
+        is expressed as a fraction of the total mean; otherwise it is the raw
+        partial expectation.
+        """
+        x = np.asarray(x)
+        mu = self.norm_mean
+        sigma = self.norm_sd
+
+        # log(0) = -inf is acceptable here (erfc(inf) = 0), so silence the
+        # divide-by-zero warning.
+        with np.errstate(divide="ignore"):
+            # Note: erf can have floating point rounding errors when x is very
+            # small because erf(inf) = 1. erfc is better because erfc(inf) =
+            # 0 so floats can represent the result with more significant digits.
+            inner = -erfc((-log(x) + mu + sigma**2) / self._EV_DENOM)
+
+        # -0.5 * inner == 0.5 * erfc(...), i.e. the normal CDF form of the
+        # lognormal partial expectation; scale by the mean if unnormalized.
+        return -0.5 * np.squeeze(inner) * (1 if normalized else self.lognorm_mean)
+
+    def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]):
+        """For a given fraction of expected value, find the number such that
+        that fraction lies to the left of that number. The inverse of
+        `contribution_to_ev`.
+
+        This function is analogous to `lognorm.ppf` except that
+        the integrand is `x * f(x) dx` instead of `f(x) dx`.
+        """
+        # Accept plain ints as well as floats, matching the Beta/Gamma/Pareto
+        # implementations. Previously a bare int (e.g. ``0``) fell through
+        # this check and made ``any(fraction < 0)`` raise a TypeError because
+        # a scalar int is not iterable.
+        if isinstance(fraction, float) or isinstance(fraction, int):
+            fraction = np.array([fraction])
+        if any(fraction < 0) or any(fraction > 1):
+            raise ValueError(f"fraction must be >= 0 and <= 1, not {fraction}")
+
+        mu = self.norm_mean
+        sigma = self.norm_sd
+        # Invert the closed-form partial expectation using the inverse error
+        # function. At fraction == 1 this evaluates to +inf, which is why 1 is
+        # an allowed input here (unlike the Gamma/Pareto variants).
+        y = fraction * self.lognorm_mean
+        return np.squeeze(
+            exp(mu + sigma**2 - sqrt(2) * sigma * erfinv(1 - 2 * exp(-mu - sigma**2 / 2) * y))
+        )
+
def lognorm(
x=None,
@@ -993,15 +1262,35 @@ def binomial(n, p):
return BinomialDistribution(n=n, p=p)
-class BetaDistribution(ContinuousDistribution):
+class BetaDistribution(ContinuousDistribution, IntegrableEVDistribution):
     def __init__(self, a, b):
         super().__init__()
         self.a = a
         self.b = b
+        # Cached mean of Beta(a, b), used by contribution_to_ev below.
+        self.mean = a / (a + b)
     def __str__(self):
         return " beta(a={}, b={})".format(self.a, self.b)
+    def contribution_to_ev(self, x: Union[np.ndarray, float], normalized=True):
+        """Partial expectation: the contribution to expected value from the
+        part of the distribution to the left of ``x``. If ``normalized``, the
+        result is expressed as a fraction of the total mean."""
+        x = np.asarray(x)
+        a = self.a
+        b = self.b
+
+        # Integral of t*f(t) from 0 to x equals I_x(a+1, b) * B(a+1, b) / B(a, b),
+        # where I is the regularized incomplete beta function (betainc).
+        res = special.betainc(a + 1, b, x) * special.beta(a + 1, b) / special.beta(a, b)
+        return np.squeeze(res) / (self.mean if normalized else 1)
+
+    def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]):
+        """Inverse of :any:`contribution_to_ev`: find the value such that the
+        given fraction of expected value lies to its left."""
+        if isinstance(fraction, float) or isinstance(fraction, int):
+            fraction = np.array([fraction])
+        if any(fraction < 0) or any(fraction > 1):
+            raise ValueError(f"fraction must be >= 0 and <= 1, not {fraction}")
+
+        a = self.a
+        b = self.b
+        # Undo the normalization applied in contribution_to_ev, then invert
+        # the regularized incomplete beta function.
+        y = fraction * self.mean * special.beta(a, b) / special.beta(a + 1, b)
+        return np.squeeze(special.betaincinv(a + 1, b, y))
+
def beta(a, b):
"""
@@ -1476,13 +1765,14 @@ def exponential(scale, lclip=None, rclip=None):
return ExponentialDistribution(scale=scale, lclip=lclip, rclip=rclip)
-class GammaDistribution(ContinuousDistribution):
+class GammaDistribution(ContinuousDistribution, IntegrableEVDistribution):
     def __init__(self, shape, scale=1, lclip=None, rclip=None):
         super().__init__()
         self.shape = shape
         self.scale = scale
         self.lclip = lclip
         self.rclip = rclip
+        # Cached mean of Gamma(shape, scale), used by contribution_to_ev.
+        self.mean = shape * scale
def __str__(self):
out = " gamma(shape={}, scale={}".format(self.shape, self.scale)
@@ -1493,6 +1783,23 @@ def __str__(self):
out += ")"
return out
+    def contribution_to_ev(self, x: Union[np.ndarray, float], normalized: bool = True):
+        """Partial expectation: the contribution to expected value from the
+        part of the distribution to the left of ``x``. If ``normalized``, the
+        result is a fraction of the total mean; otherwise it is absolute."""
+        x = np.asarray(x)
+        k = self.shape
+        scale = self.scale
+        # Integral of t*f(t) from 0 to x, divided by the mean, equals
+        # P(k + 1, x / scale): the regularized lower incomplete gamma
+        # function (gammainc).
+        res = special.gammainc(k + 1, x / scale)
+        return np.squeeze(res) * (1 if normalized else self.mean)
+
+    def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]):
+        """Inverse of :any:`contribution_to_ev`: find the value such that the
+        given fraction of expected value lies to its left. ``fraction`` must
+        be strictly less than 1 because the inverse diverges to infinity
+        at 1."""
+        if isinstance(fraction, float) or isinstance(fraction, int):
+            fraction = np.array([fraction])
+        if any(fraction < 0) or any(fraction >= 1):
+            raise ValueError(f"fraction must be >= 0 and < 1, not {fraction}")
+
+        k = self.shape
+        scale = self.scale
+        # Invert the regularized incomplete gamma function, then rescale.
+        return np.squeeze(special.gammaincinv(k + 1, fraction) * scale)
+
def gamma(shape, scale=1, lclip=None, rclip=None):
"""
@@ -1521,14 +1828,31 @@ def gamma(shape, scale=1, lclip=None, rclip=None):
return GammaDistribution(shape=shape, scale=scale, lclip=lclip, rclip=rclip)
-class ParetoDistribution(ContinuousDistribution):
+class ParetoDistribution(ContinuousDistribution, IntegrableEVDistribution):
     def __init__(self, shape):
         super().__init__()
         self.shape = shape
+        # Mean of a Pareto with minimum value 1; infinite when shape <= 1.
+        self.mean = np.inf if shape <= 1 else shape / (shape - 1)
     def __str__(self):
         return " pareto({})".format(self.shape)
+    def contribution_to_ev(self, x: Union[np.ndarray, float], normalized: bool = True):
+        """Partial expectation: the contribution to expected value from the
+        part of the distribution to the left of ``x``. The support starts at
+        1, so x <= 1 contributes 0. If ``normalized``, the result is a
+        fraction of the total mean (which is 0 when the mean is infinite,
+        i.e. shape <= 1)."""
+        x = np.asarray(x)
+        a = self.shape
+        # Integral of t*f(t) from 1 to x equals a / (a - 1) * (1 - x**(1 - a)).
+        res = np.where(x <= 1, 0, a / (a - 1) * (1 - x ** (1 - a)))
+        return np.squeeze(res) / (self.mean if normalized else 1)
+
+    def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]):
+        """Inverse of :any:`contribution_to_ev`. ``fraction`` must be
+        strictly less than 1 because the inverse diverges to infinity
+        at 1."""
+        if isinstance(fraction, float) or isinstance(fraction, int):
+            fraction = np.array([fraction])
+        if any(fraction < 0) or any(fraction >= 1):
+            raise ValueError(f"fraction must be >= 0 and < 1, not {fraction}")
+
+        a = self.shape
+        # Solve 1 - x**(1 - a) = fraction for x.
+        x = (1 - fraction) ** (1 / (1 - a))
+        return np.squeeze(x)
+
def pareto(shape):
"""
@@ -1547,12 +1871,20 @@ def pareto(shape):
--------
>>> pareto(1)
pareto(1)
+
"""
return ParetoDistribution(shape=shape)
class MixtureDistribution(CompositeDistribution):
- def __init__(self, dists, weights=None, relative_weights=None, lclip=None, rclip=None):
+ def __init__(
+ self,
+ dists,
+ weights=None,
+ relative_weights=None,
+ lclip=None,
+ rclip=None,
+ ):
super().__init__()
weights, dists = _process_weights_values(weights, relative_weights, dists)
self.dists = dists
diff --git a/squigglepy/numeric_distribution.py b/squigglepy/numeric_distribution.py
new file mode 100644
index 0000000..4e2a4d3
--- /dev/null
+++ b/squigglepy/numeric_distribution.py
@@ -0,0 +1,2394 @@
+from abc import ABC, abstractmethod
+from enum import Enum
+from numbers import Real
+import numpy as np
+from scipy import stats
+from scipy.interpolate import CubicSpline, PchipInterpolator
+from typing import Callable, List, Optional, Tuple, Union
+import warnings
+
+from .distributions import (
+ BaseDistribution,
+ BernoulliDistribution,
+ BetaDistribution,
+ ChiSquareDistribution,
+ ComplexDistribution,
+ ConstantDistribution,
+ ExponentialDistribution,
+ GammaDistribution,
+ LognormalDistribution,
+ MixtureDistribution,
+ NormalDistribution,
+ ParetoDistribution,
+ PERTDistribution,
+ UniformDistribution,
+)
+from .version import __version__
+
+
+class BinSizing(str, Enum):
+    """An enum for the different methods of sizing histogram bins. A histogram
+    with finitely many bins can only contain so much information about the
+    shape of a distribution; the choice of bin sizing determines what
+    information to prioritize.
+    """
+
+    uniform = "uniform"
+    """Divides the distribution into bins of equal width. For distributions
+    with infinite support (such as normal distributions), it chooses a total
+    width to roughly minimize total error, considering both intra-bin error and
+    error due to the excluded tails.
+    """
+
+    log_uniform = "log-uniform"
+    """Divides the distribution into bins with exponentially increasing width,
+    so that the logarithms of the bin edges are uniformly spaced.
+
+    Log-uniform bin sizing is to uniform bin sizing as a log-normal
+    distribution is to a normal distribution. That is, if you generated a
+    NumericDistribution from a log-normal distribution with log-uniform bin
+    sizing, and then took the log of each bin, you'd get a normal distribution
+    with uniform bin sizing.
+    """
+
+    ev = "ev"
+    """Divides the distribution into bins such that each bin has equal
+    contribution to expected value (see
+    :any:`IntegrableEVDistribution.contribution_to_ev`).
+    """
+
+    mass = "mass"
+    """Divides the distribution into bins such that each bin has equal
+    probability mass. This method is generally not recommended because it puts
+    too much probability mass near the center of the distribution, where
+    precision is the least useful.
+    """
+
+    fat_hybrid = "fat-hybrid"
+    """A hybrid method designed for fat-tailed distributions. Uses mass bin
+    sizing close to the center and log-uniform bin sizing on the right tail.
+    Empirically, this combination provides a good balance for the accuracy of
+    fat-tailed distributions at the center and at the tails.
+    """
+
+    bin_count = "bin-count"
+    """Shortens a vector of bins by merging every ``len(vec)/num_bins`` bins
+    together. Can only be used for resizing existing NumericDistributions, not
+    for initializing new ones.
+    """
+
+
+def _bump_indexes(indexes, length):
+    """Given an ordered list of indexes, ensure that every index is unique. If
+    any index is not unique, increment or decrement it until it's unique.
+
+    This function does not guarantee that every index gets moved to the closest
+    unique index, but it does guarantee that every index gets moved to the
+    closest unique index in a certain direction.
+
+    Note: ``indexes`` is modified in place and also returned.
+    """
+    # Forward pass: push duplicates upward, clamping at length - 1.
+    for i in range(1, len(indexes)):
+        if indexes[i] <= indexes[i - 1]:
+            indexes[i] = min(length - 1, indexes[i - 1] + 1)
+
+    # Backward pass: push any duplicates remaining after the clamp downward,
+    # clamping at 0.
+    for i in reversed(range(len(indexes) - 1)):
+        if indexes[i] >= indexes[i + 1]:
+            indexes[i] = max(0, indexes[i + 1] - 1)
+
+    return indexes
+
+
+def _support_for_bin_sizing(dist, bin_sizing, num_bins):
+    """Return where to set the bounds for a bin sizing method with fixed
+    bounds, or None if the given dist/bin sizing does not require finite
+    bounds.
+
+    Returns a ``(lower, upper)`` tuple for the handled (dist, bin_sizing)
+    combinations below, or None for every other combination.
+    """
+    # For norm/lognorm, wider domain increases error within each bin, and
+    # narrower domain increases error at the tails. Inter-bin error is
+    # proportional to width^3 / num_bins^2 and tail error is proportional to
+    # something like exp(-width^2). Setting width using the formula below
+    # balances these two sources of error. ``scale`` has an upper bound because
+    # excessively large values result in floating point rounding errors.
+    if isinstance(dist, NormalDistribution) and bin_sizing == BinSizing.uniform:
+        scale = max(6.5, 4.5 + np.log(num_bins) ** 0.5)
+        return (dist.mean - scale * dist.sd, dist.mean + scale * dist.sd)
+    if isinstance(dist, LognormalDistribution) and bin_sizing == BinSizing.log_uniform:
+        scale = max(6.5, 4.5 + np.log(num_bins) ** 0.5)
+        # Bounds are set in log space and exponentiated back.
+        return np.exp(
+            (dist.norm_mean - scale * dist.norm_sd, dist.norm_mean + scale * dist.norm_sd)
+        )
+
+    # Uniform bin sizing is not gonna be very accurate for a lognormal
+    # distribution no matter how you set the bounds.
+    if isinstance(dist, LognormalDistribution) and bin_sizing == BinSizing.uniform:
+        scale = 6
+        return np.exp(
+            (dist.norm_mean - scale * dist.norm_sd, dist.norm_mean + scale * dist.norm_sd)
+        )
+
+    # Compute the upper bound numerically because there is no good closed-form
+    # expression (that I could find) that reliably captures almost all of the
+    # mass without making the bins overly wide.
+    if isinstance(dist, GammaDistribution) and bin_sizing == BinSizing.uniform:
+        upper_bound = stats.gamma.ppf(1 - 1e-9, dist.shape, scale=dist.scale)
+        return (0, upper_bound)
+    if isinstance(dist, GammaDistribution) and bin_sizing == BinSizing.log_uniform:
+        # Log-uniform bins cannot start at 0, so use a tiny positive quantile
+        # for the lower bound.
+        lower_bound = stats.gamma.ppf(1e-10, dist.shape, scale=dist.scale)
+        upper_bound = stats.gamma.ppf(1 - 1e-10, dist.shape, scale=dist.scale)
+        return (lower_bound, upper_bound)
+
+    return None
+
+
+DEFAULT_BIN_SIZING = {
+    BetaDistribution: BinSizing.mass,
+    ChiSquareDistribution: BinSizing.ev,
+    ExponentialDistribution: BinSizing.ev,
+    GammaDistribution: BinSizing.ev,
+    LognormalDistribution: BinSizing.ev,
+    NormalDistribution: BinSizing.ev,
+    ParetoDistribution: BinSizing.ev,
+    PERTDistribution: BinSizing.mass,
+    UniformDistribution: BinSizing.uniform,
+}
+"""
+Default bin sizing method for each distribution type. Chosen based on
+empirical tests of which method best balances the accuracy across summary
+statistics and across operations (addition, multiplication, left/right clip,
+etc.).
+"""
+
+DEFAULT_NUM_BINS = {
+    BetaDistribution: 50,
+    ChiSquareDistribution: 200,
+    ExponentialDistribution: 200,
+    GammaDistribution: 200,
+    LognormalDistribution: 200,
+    MixtureDistribution: 200,
+    NormalDistribution: 200,
+    ParetoDistribution: 200,
+    PERTDistribution: 100,
+    UniformDistribution: 50,
+}
+"""
+Default number of bins for each distribution type. The default is 200 for
+most distributions, which provides a good balance of accuracy and speed. Some
+distributions use a smaller number of bins because they are sufficiently narrow
+and well-behaved that not many bins are needed for high accuracy.
+"""
+
+# Memoization caches keyed by bin count; populated by cached_lognorm_cdfs and
+# cached_lognorm_ppf_zscore below.
+CACHED_LOGNORM_CDFS = {}
+CACHED_LOGNORM_PPFS = {}
+
+
+def cached_lognorm_cdfs(num_bins):
+    """Return the CDF at each log-uniform bin edge of a standard lognormal
+    (norm_mean=0, norm_sd=1), memoized per ``num_bins``. The edge CDFs are
+    the same for any un-clipped lognormal under log-uniform bin sizing, so
+    the cached values can be shared across distributions.
+    """
+    if num_bins in CACHED_LOGNORM_CDFS:
+        return CACHED_LOGNORM_CDFS[num_bins]
+    support = _support_for_bin_sizing(
+        LognormalDistribution(norm_mean=0, norm_sd=1), BinSizing.log_uniform, num_bins
+    )
+    # Edge values are uniformly spaced in log space across the support.
+    values = np.exp(np.linspace(np.log(support[0]), np.log(support[1]), num_bins + 1))
+    cdfs = stats.lognorm.cdf(values, 1)
+    CACHED_LOGNORM_CDFS[num_bins] = cdfs
+    return cdfs
+
+
+def cached_lognorm_ppf_zscore(num_bins):
+    """Return ``(cdfs, ppfs)`` where ``cdfs`` is a uniform grid of quantiles
+    and ``ppfs`` is the log of the standard lognormal PPF at those quantiles
+    (i.e. standard-normal z-scores), memoized per ``num_bins``. Callers can
+    rescale the z-scores by any (norm_mean, norm_sd) pair.
+    """
+    if num_bins in CACHED_LOGNORM_PPFS:
+        return CACHED_LOGNORM_PPFS[num_bins]
+    cdfs = np.linspace(0, 1, num_bins + 1)
+    # _log suppresses the divide-by-zero warning at cdfs[0], where
+    # ppf(0) == 0 and its log is -inf.
+    ppfs = _log(stats.lognorm.ppf(cdfs, 1))
+    CACHED_LOGNORM_PPFS[num_bins] = (cdfs, ppfs)
+    return (cdfs, ppfs)
+
+
+def _narrow_support(
+    support: Tuple[float, float], new_support: Tuple[Optional[float], Optional[float]]
+):
+    """Narrow the support to the intersection of ``support`` and ``new_support``.
+
+    Either element of ``new_support`` may be None, meaning "no constraint on
+    that side".
+    """
+    if new_support[0] is not None:
+        support = (max(support[0], new_support[0]), support[1])
+    if new_support[1] is not None:
+        support = (support[0], min(support[1], new_support[1]))
+    return support
+
+
+def _log(x):
+    # np.log with the divide-by-zero warning suppressed, so that log(0)
+    # silently evaluates to -inf.
+    with np.errstate(divide="ignore"):
+        return np.log(x)
+
+
+class BaseNumericDistribution(ABC):
+    """BaseNumericDistribution
+
+    An abstract base class for numeric distributions. For more documentation,
+    see :class:`NumericDistribution` and :class:`ZeroNumericDistribution`.
+
+    """
+
+    def __repr__(self):
+        # Debug representation; relies on __len__ and bin_sizing provided by
+        # the concrete subclass.
+        return (
+            f"<{type(self).__name__}(mean={self.mean()}, sd={self.sd()}, "
+            f"num_bins={len(self)}, bin_sizing={self.bin_sizing}) at {hex(id(self))}>"
+        )
+
+    def __str__(self):
+        return f"{type(self).__name__}(mean={self.mean()}, sd={self.sd()})"
+
+    def mean(self, axis=None, dtype=None, out=None):
+        """Mean of the distribution. May be calculated using a stored exact
+        value or the histogram data.
+
+        Parameters
+        ----------
+        None of the parameters do anything, they're only there so that
+        ``numpy.mean()`` can be called on a ``BaseNumericDistribution``.
+        """
+        if self.exact_mean is not None:
+            return self.exact_mean
+        # Fall back to an estimate computed from the histogram.
+        return self.est_mean()
+
+    def sd(self):
+        """Standard deviation of the distribution. May be calculated using a
+        stored exact value or the histogram data."""
+        if self.exact_sd is not None:
+            return self.exact_sd
+        return self.est_sd()
+
+    def std(self, axis=None, dtype=None, out=None):
+        """Standard deviation of the distribution. May be calculated using a
+        stored exact value or the histogram data.
+
+        Parameters
+        ----------
+        None of the parameters do anything, they're only there so that
+        ``numpy.std()`` can be called on a ``BaseNumericDistribution``.
+        """
+        return self.sd()
+
+    def quantile(self, q):
+        """Estimate the value of the distribution at quantile ``q`` by
+        interpolating between known values.
+
+        The accuracy at different quantiles depends on the bin sizing method
+        used. :any:`BinSizing.mass` will produce bins that are evenly spaced
+        across quantiles. :any:`BinSizing.ev` for fat-tailed distributions
+        will be very inaccurate at lower quantiles in exchange for greater
+        accuracy on the right tail.
+
+        The quantile value at the median of a bin will exactly equal the value
+        in the bin. Other quantile values are interpolated using scipy's
+        ``PchipInterpolator``, which fits points to a piecewise cubic function
+        with the restriction that values between points are monotonic.
+
+        Parameters
+        ----------
+        q : number or array_like
+            The quantile or quantiles for which to determine the value(s).
+
+        Returns
+        -------
+        quantiles: number or array-like
+            The estimated value at the given quantile(s).
+
+        """
+        return self.ppf(q)
+
+    @abstractmethod
+    def ppf(self, q):
+        """Percent point function/inverse CDF. An alias for :any:`quantile`."""
+        ...
+
+    def percentile(self, p):
+        """Estimate the value of the distribution at percentile ``p``. See
+        :any:`quantile` for notes on this function's accuracy.
+        """
+        return np.squeeze(self.ppf(np.asarray(p) / 100))
+
+    def condition_on_success(
+        self,
+        event: Union["BaseNumericDistribution", float],
+        failure_outcome: Optional[Union["BaseNumericDistribution", float]] = 0,
+    ):
+        """``event`` is a probability distribution over a probability for some
+        binary outcome. If the event succeeds, the result is the random
+        variable defined by ``self``. If the event fails, the result is zero.
+        Or, if ``failure_outcome`` is provided, the result is
+        ``failure_outcome``.
+
+        This function's return value represents the probability
+        distribution over outcomes in this scenario.
+
+        The return value is equivalent to the result of this procedure:
+
+        1. Generate a probability ``p`` according to the distribution defined
+        by ``event``.
+        2. Generate a Bernoulli random variable with probability ``p``.
+        3. If success, generate a random outcome according to the distribution
+        defined by ``self``.
+        4. Otherwise, generate a random outcome according to the distribution
+        defined by ``failure_outcome``.
+
+        """
+        # NOTE(review): a failure_outcome of None also compares unequal to 0
+        # and lands here, raising NotImplementedError -- confirm that's
+        # intended.
+        if failure_outcome != 0:
+            # TODO: you can't just do a sum. I think what you want to do is
+            # scale the masses and then smush the bins together
+            raise NotImplementedError
+        if isinstance(event, Real):
+            p_success = event
+        elif isinstance(event, BaseNumericDistribution):
+            # Only the expected probability matters for the resulting mixture
+            # weight: P(success) = E[p].
+            p_success = event.mean()
+        else:
+            raise TypeError(f"Cannot condition on type {type(event)}")
+        return ZeroNumericDistribution.wrap(self, 1 - p_success)
+
+    def __ne__(x, y):
+        # Negation of ==; __eq__ is expected to be provided by subclasses
+        # (not visible in this class).
+        return not (x == y)
+
+    def __radd__(x, y):
+        # Addition is commutative here, so delegate to __add__.
+        return x + y
+
+    def __sub__(x, y):
+        # Subtraction via negation and addition.
+        return x + (-y)
+
+    def __rsub__(x, y):
+        return -x + y
+
+    def __rmul__(x, y):
+        # Multiplication is commutative here, so delegate to __mul__.
+        return x * y
+
+    def __truediv__(x, y):
+        # Division by a scalar is a rescale; division by a distribution
+        # multiplies by its reciprocal.
+        if isinstance(y, Real):
+            return x.scale_by(1 / y)
+        return x * y.reciprocal()
+
+    def __rtruediv__(x, y):
+        return y * x.reciprocal()
+
+
+class NumericDistribution(BaseNumericDistribution):
+ """NumericDistribution
+
+ A numerical representation of a probability distribution as a histogram of
+ values along with the probability mass near each value.
+
+ A ``NumericDistribution`` is functionally equivalent to a Monte Carlo
+ simulation where you generate infinitely many samples and then group the
+ samples into finitely many bins, keeping track of the proportion of samples
+ in each bin (a.k.a. the probability mass) and the average value for each
+ bin.
+
+ Compared to a Monte Carlo simulation, ``NumericDistribution`` can represent
+ information much more densely by grouping together nearby values (although
+ some information is lost in the grouping). The benefit of this is most
+ obvious in fat-tailed distributions. In a Monte Carlo simulation, perhaps 1
+ in 1000 samples account for 10% of the expected value, but a
+ ``NumericDistribution`` (with the right bin sizing method, see
+ :any:`BinSizing`) can easily track the probability mass of large values.
+
+ For more, see :doc:`/numeric_distributions`.
+
+ """
+
+    def __init__(
+        self,
+        values: np.ndarray,
+        masses: np.ndarray,
+        zero_bin_index: int,
+        neg_ev_contribution: float,
+        pos_ev_contribution: float,
+        exact_mean: Optional[float],
+        exact_sd: Optional[float],
+        bin_sizing: Optional[BinSizing] = None,
+        min_bins_per_side: Optional[int] = 2,
+        richardson_extrapolation_enabled: Optional[bool] = True,
+    ):
+        """Create a probability mass histogram. You should usually not call
+        this constructor directly; instead, use :func:`from_distribution`.
+
+        Parameters
+        ----------
+        values : np.ndarray
+            The values of the distribution.
+        masses : np.ndarray
+            The probability masses of the values.
+        zero_bin_index : int
+            The index of the smallest bin that contains positive values
+            (0 if all bins are positive).
+        neg_ev_contribution : float
+            The (absolute value of) contribution to expected value from the negative
+            portion of the distribution.
+        pos_ev_contribution : float
+            The contribution to expected value from the positive portion of the distribution.
+        exact_mean : Optional[float]
+            The exact mean of the distribution, if known.
+        exact_sd : Optional[float]
+            The exact standard deviation of the distribution, if known.
+        bin_sizing : Optional[BinSizing]
+            The bin sizing method used to construct the distribution, if any.
+        min_bins_per_side : Optional[int] = 2
+            Presumably the minimum number of bins to preserve on each side of
+            zero -- TODO confirm against the methods that read it.
+        richardson_extrapolation_enabled : Optional[bool] = True
+            If True, use Richardson extrapolation over the number of bins to
+            improve the accuracy of unary and binary operations.
+        """
+        assert len(values) == len(masses)
+        # Record the squigglepy version this object was built with.
+        self._version = __version__
+        self.values = values
+        self.masses = masses
+        self.num_bins = len(values)
+        self.zero_bin_index = zero_bin_index
+        self.neg_ev_contribution = neg_ev_contribution
+        self.pos_ev_contribution = pos_ev_contribution
+        self.exact_mean = exact_mean
+        self.exact_sd = exact_sd
+        self.bin_sizing = bin_sizing
+        self.min_bins_per_side = min_bins_per_side
+        self.richardson_extrapolation_enabled = richardson_extrapolation_enabled
+
+        # Interpolators for the CDF/PPF and (inverse) contribution-to-EV;
+        # these are computed lazily.
+        self.interpolate_cdf = None
+        self.interpolate_ppf = None
+        self.interpolate_cev = None
+        self.interpolate_inv_cev = None
+
+    @classmethod
+    def _construct_edge_values(
+        cls,
+        num_bins,
+        support,
+        max_support,
+        dist,
+        cdf,
+        ppf,
+        bin_sizing,
+    ):
+        """Construct a list of bin edge values. Helper function for
+        :func:`from_distribution`; do not call this directly.
+
+        Parameters
+        ----------
+        num_bins : int
+            The number of bins to use.
+        support : Tuple[float, float]
+            The support of the distribution.
+        max_support : Tuple[float, float]
+            The maximum support of the distribution, after clipping but before
+            narrowing due to limitations of certain bin sizing methods. Namely,
+            uniform and log-uniform bin sizing is undefined for infinite bounds,
+            so ``support`` is narrowed to finite bounds, but ``max_support`` is
+            not.
+        dist : BaseDistribution
+            The distribution to convert to a NumericDistribution.
+        cdf : Callable[[np.ndarray], np.ndarray]
+            The CDF of the distribution.
+        ppf : Callable[[np.ndarray], np.ndarray]
+            The inverse CDF of the distribution.
+        bin_sizing : BinSizing
+            The bin sizing method to use.
+
+        Returns
+        -------
+        edge_values : np.ndarray
+            The value of each bin edge.
+        edge_cdfs : Optional[np.ndarray]
+            The CDF at each bin edge. Only provided as a performance
+            optimization if the CDFs are either required to determine bin edge
+            values or if they can be pulled from a cache. Otherwise, the parent
+            caller is responsible for calculating the CDFs.
+
+        """
+        edge_cdfs = None
+        if bin_sizing == BinSizing.uniform:
+            edge_values = np.linspace(support[0], support[1], num_bins + 1)
+
+        elif bin_sizing == BinSizing.log_uniform:
+            log_support = (_log(support[0]), _log(support[1]))
+            log_edge_values = np.linspace(log_support[0], log_support[1], num_bins + 1)
+            edge_values = np.exp(log_edge_values)
+            if (
+                isinstance(dist, LognormalDistribution)
+                and dist.lclip is None
+                and dist.rclip is None
+            ):
+                # Edge CDFs are the same regardless of the mean and SD of the
+                # distribution, so we can cache them
+                edge_cdfs = cached_lognorm_cdfs(num_bins)
+
+        elif bin_sizing == BinSizing.ev:
+            if not hasattr(dist, "inv_contribution_to_ev"):
+                raise ValueError(
+                    f"Bin sizing {bin_sizing} requires an inv_contribution_to_ev "
+                    f"method, but {type(dist)} does not have one."
+                )
+            left_prop = dist.contribution_to_ev(support[0])
+            right_prop = dist.contribution_to_ev(support[1])
+            edge_values = np.concatenate(
+                (
+                    # Don't call inv_contribution_to_ev on the left and right
+                    # edges because it's undefined for 0 and 1
+                    [support[0]],
+                    np.atleast_1d(
+                        dist.inv_contribution_to_ev(
+                            np.linspace(left_prop, right_prop, num_bins + 1)[1:-1]
+                        )
+                    )
+                    if num_bins > 1
+                    else [],
+                    [support[1]],
+                )
+            )
+
+        elif bin_sizing == BinSizing.mass:
+            if (
+                isinstance(dist, LognormalDistribution)
+                and dist.lclip is None
+                and dist.rclip is None
+            ):
+                edge_cdfs, edge_zscores = cached_lognorm_ppf_zscore(num_bins)
+                edge_values = np.exp(dist.norm_mean + dist.norm_sd * edge_zscores)
+            else:
+                edge_cdfs = np.linspace(cdf(support[0]), cdf(support[1]), num_bins + 1)
+                edge_values = ppf(edge_cdfs)
+
+        elif bin_sizing == BinSizing.fat_hybrid:
+            # Use a combination of mass and log-uniform
+            logu_support = _support_for_bin_sizing(dist, BinSizing.log_uniform, num_bins)
+            logu_support = _narrow_support(support, logu_support)
+            logu_edge_values, logu_edge_cdfs = cls._construct_edge_values(
+                num_bins, logu_support, max_support, dist, cdf, ppf, BinSizing.log_uniform
+            )
+            mass_edge_values, mass_edge_cdfs = cls._construct_edge_values(
+                num_bins, support, max_support, dist, cdf, ppf, BinSizing.mass
+            )
+
+            # Always take the elementwise larger edge value. (Previously the
+            # edge values were only combined when both sub-methods returned
+            # cached CDFs; the log-uniform branch only returns CDFs for
+            # un-clipped lognormal distributions, so for any other
+            # distribution ``edge_values`` was left unbound and the return
+            # below raised a NameError.)
+            edge_values = np.where(
+                logu_edge_values > mass_edge_values, logu_edge_values, mass_edge_values
+            )
+            # CDFs can only be combined when both sub-methods supplied them;
+            # otherwise leave edge_cdfs as None so the caller computes CDFs
+            # from edge_values.
+            if logu_edge_cdfs is not None and mass_edge_cdfs is not None:
+                edge_cdfs = np.where(
+                    logu_edge_values > mass_edge_values, logu_edge_cdfs, mass_edge_cdfs
+                )
+
+        else:
+            raise ValueError(f"Unsupported bin sizing method: {bin_sizing}")
+
+        return (edge_values, edge_cdfs)
+
+    @classmethod
+    def _construct_bins(
+        cls,
+        num_bins,
+        support,
+        max_support,
+        dist,
+        cdf,
+        ppf,
+        bin_sizing,
+        warn,
+        is_reversed,
+    ):
+        """Construct a list of bin masses and values. Helper function for
+        :func:`from_distribution`; do not call this directly.
+
+        Parameters
+        ----------
+        num_bins : int
+            The number of bins to use.
+        support : Tuple[float, float]
+            The support of the distribution.
+        max_support : Tuple[float, float]
+            The maximum support of the distribution, after clipping but before
+            narrowing due to limitations of certain bin sizing methods. Namely,
+            uniform and log-uniform bin sizing is undefined for infinite bounds,
+            so ``support`` is narrowed to finite bounds, but ``max_support`` is
+            not.
+        dist : BaseDistribution
+            The distribution to convert to a NumericDistribution.
+        cdf : Callable[[np.ndarray], np.ndarray]
+            The CDF of the distribution.
+        ppf : Callable[[np.ndarray], np.ndarray]
+            The inverse CDF of the distribution.
+        bin_sizing : BinSizing
+            The bin sizing method to use.
+        warn : bool
+            If True, raise warnings about bins with zero mass.
+        is_reversed : bool
+            Presumably True when the values run in decreasing order (e.g. the
+            negative half of a distribution); it flips the sign used in the
+            monotonicity check below -- TODO confirm with callers.
+
+        Returns
+        -------
+        masses : np.ndarray
+            The probability mass of each bin.
+        values : np.ndarray
+            The value of each bin.
+        """
+        if num_bins <= 0:
+            return (np.array([]), np.array([]))
+
+        edge_values, edge_cdfs = cls._construct_edge_values(
+            num_bins, support, max_support, dist, cdf, ppf, bin_sizing
+        )
+
+        # Avoid re-calculating CDFs if we can because it's really slow.
+        if edge_cdfs is None:
+            edge_cdfs = cdf(edge_values)
+
+        # Each bin's mass is the CDF difference across its edges.
+        masses = np.diff(edge_cdfs)
+
+        # Note: Re-calculating this for BinSize.ev appears to add ~zero
+        # performance penalty. Perhaps Python is caching the result somehow?
+        edge_ev_contributions = dist.contribution_to_ev(edge_values, normalized=False)
+        bin_ev_contributions = np.diff(edge_ev_contributions)
+
+        # For sufficiently large edge values, CDF rounds to 1 which makes the
+        # mass 0. Values can also be 0 due to floating point rounding if
+        # support is very small. Remove any 0s.
+        mass_zeros = [i for i in range(len(masses)) if masses[i] == 0]
+        ev_zeros = [i for i in range(len(bin_ev_contributions)) if bin_ev_contributions[i] == 0]
+        non_monotonic = []
+
+        # Only compute values now if no bin has zero mass; otherwise the
+        # division below would produce inf/NaN, and values are recomputed
+        # after filtering out the bad bins.
+        if len(mass_zeros) == 0:
+            # Set the value of each bin to equal the average value within the
+            # bin.
+            values = bin_ev_contributions / masses
+
+            # Values can be non-monotonic if there are rounding errors when
+            # calculating EV contribution. Look at the bottom and top separately
+            # because on the bottom, the lower value will be the incorrect one, and
+            # on the top, the upper value will be the incorrect one.
+            sign = -1 if is_reversed else 1
+            bot_diffs = sign * np.diff(values[: (num_bins // 10)])
+            top_diffs = sign * np.diff(values[-(num_bins // 10) :])
+            non_monotonic = [i for i in range(len(bot_diffs)) if bot_diffs[i] < 0] + [
+                i + 1 + num_bins - len(top_diffs)
+                for i in range(len(top_diffs))
+                if top_diffs[i] < 0
+            ]
+
+        bad_indexes = set(mass_zeros + ev_zeros + non_monotonic)
+
+        if len(bad_indexes) > 0:
+            # Drop the bad bins and recompute values from the survivors.
+            good_indexes = [i for i in range(num_bins) if i not in set(bad_indexes)]
+            bin_ev_contributions = bin_ev_contributions[good_indexes]
+            masses = masses[good_indexes]
+            values = bin_ev_contributions / masses
+
+            messages = []
+            if len(mass_zeros) > 0:
+                messages.append(f"{len(mass_zeros) + 1} neighboring values had equal CDFs")
+            if len(ev_zeros) == 1:
+                messages.append(
+                    "1 bin had zero expected value, most likely because it was too small"
+                )
+            elif len(ev_zeros) > 1:
+                messages.append(
+                    f"{len(ev_zeros)} bins had zero expected value, most likely "
+                    "because they were too small"
+                )
+            if len(non_monotonic) > 0:
+                messages.append(f"{len(non_monotonic) + 1} neighboring values were non-monotonic")
+            joint_message = "; and".join(messages)
+
+            if warn:
+                warnings.warn(
+                    f"When constructing NumericDistribution, {joint_message}.",
+                    RuntimeWarning,
+                )
+
+        return (masses, values)
+
+ @classmethod
+ def from_distribution(
+ cls,
+ dist: Union[BaseDistribution, BaseNumericDistribution, Real],
+ num_bins: Optional[int] = None,
+ bin_sizing: Optional[str] = None,
+ warn: bool = True,
+ ):
+ """Create a probability mass histogram from the given distribution.
+
+ Parameters
+ ----------
+ dist : BaseDistribution | BaseNumericDistribution | Real
+ A distribution from which to generate numeric values. If the
+ provided value is a :any:`BaseNumericDistribution`, simply return
+ it.
+ num_bins : Optional[int] (default = ref:``DEFAULT_NUM_BINS``)
+ The number of bins for the numeric distribution to use. The time to
+ construct a NumericDistribution is linear with ``num_bins``, and
+ the time to run a binary operation on two distributions with the
+ same number of bins is approximately quadratic. 100 bins provides a
+ good balance between accuracy and speed. 1000 bins provides greater
+ accuracy and is fast for small models, but for large models there
+ will be some noticeable slowdown in binary operations.
+ bin_sizing : Optional[str]
+ The bin sizing method to use, which affects the accuracy of the
+ bins. If none is given, a default will be chosen from
+ :any:`DEFAULT_BIN_SIZING` based on the distribution type of
+ ``dist``. It is recommended to use the default bin sizing method
+            most of the time. See
+            :any:`squigglepy.numeric_distribution.BinSizing` for a list of
+            valid options and explanations of their behavior.
+ warn : Optional[bool] (default = True)
+ If True, raise warnings about bins with zero mass.
+
+ Returns
+ -------
+ result : NumericDistribution | ZeroNumericDistribution
+ The generated numeric distribution that represents ``dist``.
+
+ """
+
+ # ----------------------------
+ # Handle special distributions
+ # ----------------------------
+
+ if isinstance(dist, BaseNumericDistribution):
+ return dist
+ if isinstance(dist, ConstantDistribution) or isinstance(dist, Real):
+ x = dist if isinstance(dist, Real) else dist.x
+ return cls(
+ values=np.array([x]),
+ masses=np.array([1]),
+ zero_bin_index=0 if x >= 0 else 1,
+ neg_ev_contribution=0 if x >= 0 else -x,
+ pos_ev_contribution=x if x >= 0 else 0,
+ exact_mean=x,
+ exact_sd=0,
+ bin_sizing=bin_sizing,
+ )
+ if isinstance(dist, BernoulliDistribution):
+ return cls.from_distribution(1, num_bins, bin_sizing, warn).condition_on_success(
+ dist.p
+ )
+ if isinstance(dist, MixtureDistribution):
+ return cls.mixture(
+ dist.dists,
+ dist.weights,
+ lclip=dist.lclip,
+ rclip=dist.rclip,
+ num_bins=num_bins,
+ bin_sizing=bin_sizing,
+ warn=warn,
+ )
+ if isinstance(dist, ComplexDistribution):
+ left = dist.left
+ right = dist.right
+ if isinstance(left, BaseDistribution):
+ left = cls.from_distribution(left, num_bins, bin_sizing, warn)
+ if isinstance(right, BaseDistribution):
+ right = cls.from_distribution(right, num_bins, bin_sizing, warn)
+ if right is None:
+ return dist.fn(left).clip(dist.lclip, dist.rclip)
+ return dist.fn(left, right).clip(dist.lclip, dist.rclip)
+
+ # ------------
+ # Basic checks
+ # ------------
+
+ if type(dist) not in DEFAULT_BIN_SIZING:
+ raise ValueError(f"Unsupported distribution type: {type(dist)}")
+
+ num_bins = num_bins or DEFAULT_NUM_BINS[type(dist)]
+ bin_sizing = BinSizing(bin_sizing or DEFAULT_BIN_SIZING[type(dist)])
+
+ if num_bins % 2 != 0:
+ raise ValueError(f"num_bins must be even, not {num_bins}")
+
+ # ------------------------------------------------------------------
+ # Handle distributions that are special cases of other distributions
+ # ------------------------------------------------------------------
+
+ if isinstance(dist, ChiSquareDistribution):
+ return cls.from_distribution(
+ GammaDistribution(shape=dist.df / 2, scale=2, lclip=dist.lclip, rclip=dist.rclip),
+ num_bins=num_bins,
+ bin_sizing=bin_sizing,
+ warn=warn,
+ )
+
+ if isinstance(dist, ExponentialDistribution):
+ return cls.from_distribution(
+ GammaDistribution(shape=1, scale=dist.scale, lclip=dist.lclip, rclip=dist.rclip),
+ num_bins=num_bins,
+ bin_sizing=bin_sizing,
+ warn=warn,
+ )
+ if isinstance(dist, PERTDistribution):
+ # PERT is a generalization of Beta. We can generate a PERT by
+ # generating a Beta and then scaling and shifting it.
+ if dist.lclip is not None or dist.rclip is not None:
+ raise ValueError("PERT distribution with lclip or rclip is not supported.")
+ r = dist.right - dist.left
+ alpha = 1 + dist.lam * (dist.mode - dist.left) / r
+ beta = 1 + dist.lam * (dist.right - dist.mode) / r
+ beta_dist = cls.from_distribution(
+ BetaDistribution(
+ a=alpha,
+ b=beta,
+ ),
+ num_bins=num_bins,
+ bin_sizing=bin_sizing,
+ warn=warn,
+ )
+ # Note: There are formulas for the exact mean/SD of a PERT, but
+ # scaling/shifting will correctly produce the exact mean/SD so we
+ # don't need to set them manually.
+ return dist.left + r * beta_dist
+
+ # -------------------------------------------------------------------
+ # Set up required parameters based on dist type and bin sizing method
+ # -------------------------------------------------------------------
+
+ max_support = {
+ # These are the widest possible supports, but they maybe narrowed
+ # later by lclip/rclip or by some bin sizing methods.
+ BetaDistribution: (0, 1),
+ GammaDistribution: (0, np.inf),
+ LognormalDistribution: (0, np.inf),
+ NormalDistribution: (-np.inf, np.inf),
+ ParetoDistribution: (1, np.inf),
+ UniformDistribution: (dist.x, dist.y),
+ }[type(dist)]
+ support = max_support
+ ppf = {
+ BetaDistribution: lambda p: stats.beta.ppf(p, dist.a, dist.b),
+ GammaDistribution: lambda p: stats.gamma.ppf(p, dist.shape, scale=dist.scale),
+ LognormalDistribution: lambda p: stats.lognorm.ppf(
+ p, dist.norm_sd, scale=np.exp(dist.norm_mean)
+ ),
+ NormalDistribution: lambda p: stats.norm.ppf(p, loc=dist.mean, scale=dist.sd),
+ ParetoDistribution: lambda p: stats.pareto.ppf(p, dist.shape),
+ UniformDistribution: lambda p: stats.uniform.ppf(p, loc=dist.x, scale=dist.y - dist.x),
+ }[type(dist)]
+ cdf = {
+ BetaDistribution: lambda x: stats.beta.cdf(x, dist.a, dist.b),
+ GammaDistribution: lambda x: stats.gamma.cdf(x, dist.shape, scale=dist.scale),
+ LognormalDistribution: lambda x: stats.lognorm.cdf(
+ x, dist.norm_sd, scale=np.exp(dist.norm_mean)
+ ),
+ NormalDistribution: lambda x: stats.norm.cdf(x, loc=dist.mean, scale=dist.sd),
+ ParetoDistribution: lambda x: stats.pareto.cdf(x, dist.shape),
+ UniformDistribution: lambda x: stats.uniform.cdf(x, loc=dist.x, scale=dist.y - dist.x),
+ }[type(dist)]
+
+ # -----------
+ # Set support
+ # -----------
+
+ dist_bin_sizing_supported = False
+ new_support = _support_for_bin_sizing(dist, bin_sizing, num_bins)
+
+ if new_support is not None:
+ support = _narrow_support(support, new_support)
+ dist_bin_sizing_supported = True
+ elif bin_sizing == BinSizing.uniform:
+ if isinstance(dist, BetaDistribution) or isinstance(dist, UniformDistribution):
+ dist_bin_sizing_supported = True
+ elif bin_sizing == BinSizing.log_uniform:
+ if isinstance(dist, BetaDistribution):
+ dist_bin_sizing_supported = True
+ elif bin_sizing == BinSizing.ev:
+ dist_bin_sizing_supported = True
+ elif bin_sizing == BinSizing.mass:
+ dist_bin_sizing_supported = True
+ elif bin_sizing == BinSizing.fat_hybrid:
+ if isinstance(dist, GammaDistribution) or isinstance(dist, LognormalDistribution):
+ dist_bin_sizing_supported = True
+
+ if not dist_bin_sizing_supported:
+ raise ValueError(f"Unsupported bin sizing method {bin_sizing} for {type(dist)}.")
+
+ # ----------------------------
+ # Adjust support based on clip
+ # ----------------------------
+
+ support = _narrow_support(support, (dist.lclip, dist.rclip))
+
+ # ---------------------------
+ # Set exact_mean and exact_sd
+ # ---------------------------
+
+ if dist.lclip is None and dist.rclip is None:
+ if isinstance(dist, BetaDistribution):
+ exact_mean = stats.beta.mean(dist.a, dist.b)
+ exact_sd = stats.beta.std(dist.a, dist.b)
+ elif isinstance(dist, GammaDistribution):
+ exact_mean = stats.gamma.mean(dist.shape, scale=dist.scale)
+ exact_sd = stats.gamma.std(dist.shape, scale=dist.scale)
+ elif isinstance(dist, LognormalDistribution):
+ exact_mean = dist.lognorm_mean
+ exact_sd = dist.lognorm_sd
+ elif isinstance(dist, NormalDistribution):
+ exact_mean = dist.mean
+ exact_sd = dist.sd
+ elif isinstance(dist, ParetoDistribution):
+ if dist.shape <= 1:
+ raise ValueError(
+ "NumericDistribution does not support Pareto distributions "
+ "with shape <= 1 because they have infinite mean."
+ )
+ # exact_mean = 1 / (dist.shape - 1) # Lomax
+ exact_mean = dist.shape / (dist.shape - 1)
+ if dist.shape <= 2:
+ exact_sd = np.inf
+ else:
+ exact_sd = np.sqrt(dist.shape / ((dist.shape - 1) ** 2 * (dist.shape - 2)))
+ elif isinstance(dist, UniformDistribution):
+ exact_mean = (dist.x + dist.y) / 2
+ exact_sd = np.sqrt(1 / 12) * (dist.y - dist.x)
+ else:
+ if (
+ isinstance(dist, BetaDistribution)
+ or isinstance(dist, GammaDistribution)
+ or isinstance(dist, LognormalDistribution)
+ ):
+ # For one-sided distributions without a known formula for
+ # truncated mean, compute the mean using
+ # ``contribution_to_ev``.
+ contribution_to_ev = dist.contribution_to_ev(
+ support[1], normalized=False
+ ) - dist.contribution_to_ev(support[0], normalized=False)
+ mass = cdf(support[1]) - cdf(support[0])
+ exact_mean = contribution_to_ev / mass
+ exact_sd = None # unknown
+ elif isinstance(dist, NormalDistribution):
+ a = (support[0] - dist.mean) / dist.sd
+ b = (support[1] - dist.mean) / dist.sd
+ exact_mean = stats.truncnorm.mean(a, b, dist.mean, dist.sd)
+ exact_sd = stats.truncnorm.std(a, b, dist.mean, dist.sd)
+ elif isinstance(dist, UniformDistribution):
+ exact_mean = (support[0] + support[1]) / 2
+ exact_sd = np.sqrt(1 / 12) * (support[1] - support[0])
+
+ # -----------------------------------------------------------------
+ # Split dist into negative and positive sides and generate bins for
+ # each side
+ # -----------------------------------------------------------------
+
+ total_ev_contribution = dist.contribution_to_ev(
+ support[1], normalized=False
+ ) - dist.contribution_to_ev(support[0], normalized=False)
+ neg_ev_contribution = max(
+ 0,
+ dist.contribution_to_ev(max(0, support[0]), normalized=False)
+ - dist.contribution_to_ev(support[0], normalized=False),
+ )
+ if dist.lclip is not None or dist.rclip is not None:
+ total_mass = cdf(support[1]) - cdf(support[0])
+ total_ev_contribution /= total_mass
+ neg_ev_contribution /= total_mass
+ pos_ev_contribution = total_ev_contribution - neg_ev_contribution
+
+ if bin_sizing == BinSizing.uniform:
+ if support[0] > 0:
+ neg_prop = 0
+ pos_prop = 1
+ elif support[1] < 0:
+ neg_prop = 1
+ pos_prop = 0
+ else:
+ width = support[1] - support[0]
+ neg_prop = -support[0] / width
+ pos_prop = support[1] / width
+ elif bin_sizing == BinSizing.log_uniform:
+ neg_prop = 0
+ pos_prop = 1
+ elif bin_sizing == BinSizing.ev:
+ neg_prop = neg_ev_contribution / total_ev_contribution
+ pos_prop = pos_ev_contribution / total_ev_contribution
+ elif bin_sizing == BinSizing.mass:
+ neg_mass = max(0, cdf(0) - cdf(support[0]))
+ pos_mass = max(0, cdf(support[1]) - cdf(0))
+ total_mass = neg_mass + pos_mass
+ neg_prop = neg_mass / total_mass
+ pos_prop = pos_mass / total_mass
+ elif bin_sizing == BinSizing.fat_hybrid:
+ neg_prop = 0
+ pos_prop = 1
+ else:
+ raise ValueError(f"Unsupported bin sizing method: {bin_sizing}")
+
+ # Divide up bins such that each bin has as close as possible to equal
+ # contribution. If one side has very small but nonzero contribution,
+ # still give it two bins.
+ num_neg_bins, num_pos_bins = cls._num_bins_per_side(num_bins, neg_prop, pos_prop, 2)
+ neg_masses, neg_values = cls._construct_bins(
+ num_neg_bins,
+ (support[0], min(0, support[1])),
+ (max_support[0], min(0, max_support[1])),
+ dist,
+ cdf,
+ ppf,
+ bin_sizing,
+ warn,
+ is_reversed=True,
+ )
+ neg_values = -neg_values
+ pos_masses, pos_values = cls._construct_bins(
+ num_pos_bins,
+ (max(0, support[0]), support[1]),
+ (max(0, max_support[0]), max_support[1]),
+ dist,
+ cdf,
+ ppf,
+ bin_sizing,
+ warn,
+ is_reversed=False,
+ )
+
+ # Resize in case some bins got removed due to having zero mass/EV
+ if len(neg_values) < num_neg_bins:
+ neg_ev_contribution = abs(np.sum(neg_masses * neg_values))
+ num_neg_bins = len(neg_values)
+ if len(pos_values) < num_pos_bins:
+ pos_ev_contribution = np.sum(pos_masses * pos_values)
+ num_pos_bins = len(pos_values)
+
+ masses = np.concatenate((neg_masses, pos_masses))
+ values = np.concatenate((neg_values, pos_values))
+
+ # Normalize masses to sum to 1 in case the distribution is clipped, but
+ # don't do this until after setting values because values depend on the
+ # mass relative to the full distribution, not the clipped distribution.
+ masses /= np.sum(masses)
+
+ return cls(
+ values=np.array(values),
+ masses=np.array(masses),
+ zero_bin_index=num_neg_bins,
+ neg_ev_contribution=neg_ev_contribution,
+ pos_ev_contribution=pos_ev_contribution,
+ exact_mean=exact_mean,
+ exact_sd=exact_sd,
+ bin_sizing=bin_sizing,
+ )
+
@classmethod
def mixture(
    cls,
    dists: List[Union[BaseDistribution, BaseNumericDistribution]],
    weights: List[float],
    lclip: Optional[float] = None,
    rclip: Optional[float] = None,
    num_bins: Optional[int] = None,
    bin_sizing: Optional[BinSizing] = None,
    warn: bool = True,
):
    """Construct a ``NumericDistribution`` as a mixture of the
    distributions in ``dists``, weighted by ``weights``.

    Parameters
    ----------
    dists : List[BaseDistribution | BaseNumericDistribution]
        The distributions to mix.
    weights : List[float]
        The weights of each distribution. Must sum to 1.
    lclip : Optional[float]
        The left clip of the resulting mixture distribution. Note that a
        clip value on an entry in ``dists`` will only clip that entry,
        while this parameter clips every entry.
    rclip : Optional[float]
        The right clip of the resulting mixture distribution. Note that a
        clip value on an entry in ``dists`` will only clip that entry,
        while this parameter clips every entry.
    num_bins : Optional[int] (default = :any:`DEFAULT_NUM_BINS`)
        The number of bins for each distribution to use. If not given, each
        distribution will use the default number of bins for its type as
        given by :any:`DEFAULT_NUM_BINS`.
    bin_sizing : Optional[BinSizing]
        The bin sizing method to use. If not given, each distribution will
        use the default bin sizing method for its type as given by
        :any:`DEFAULT_BIN_SIZING`.
    warn : Optional[bool] (default = True)
        If True, raise warnings about bins with zero mass.

    Returns
    -------
    mixture : NumericDistribution
        The weighted mixture, clipped to ``lclip``/``rclip``.

    """
    # This function replicates how MixtureDistribution handles lclip/rclip:
    # it clips the sub-distributions based on their own lclip/rclip, then
    # takes the mixture sample, then clips the mixture sample based on the
    # mixture lclip/rclip.

    # Convert any Squigglepy dists into NumericDistributions. Bug fix:
    # ``warn`` is now forwarded; previously it was accepted but ignored,
    # so sub-distribution construction never saw it.
    dists = [
        NumericDistribution.from_distribution(dist, num_bins, bin_sizing, warn)
        for dist in dists
    ]

    value_vectors = [d.values for d in dists]
    weighted_mass_vectors = [d.masses * w for d, w in zip(dists, weights)]
    extended_values = np.concatenate(value_vectors)
    extended_masses = np.concatenate(weighted_mass_vectors)

    # Sort the pooled bins by value (stable sort) and find where the
    # negative side ends.
    sorted_indexes = np.argsort(extended_values, kind="mergesort")
    extended_values = extended_values[sorted_indexes]
    extended_masses = extended_masses[sorted_indexes]
    zero_index = np.searchsorted(extended_values, 0)
    neg_ev_contribution = sum(d.neg_ev_contribution * w for d, w in zip(dists, weights))
    pos_ev_contribution = sum(d.pos_ev_contribution * w for d, w in zip(dists, weights))

    # Bug fix: default the mixture's bin count inline. The original only
    # assigned the fallback variable when ``num_bins is None``, leaving it
    # unbound for other falsy values (e.g. 0).
    mixture = cls._resize_bins(
        extended_neg_values=extended_values[:zero_index],
        extended_neg_masses=extended_masses[:zero_index],
        extended_pos_values=extended_values[zero_index:],
        extended_pos_masses=extended_masses[zero_index:],
        num_bins=num_bins or DEFAULT_NUM_BINS[MixtureDistribution],
        neg_ev_contribution=neg_ev_contribution,
        pos_ev_contribution=pos_ev_contribution,
        bin_sizing=BinSizing.ev,
        is_sorted=True,
    )
    mixture.bin_sizing = bin_sizing
    # Mixture moments are exact when every component's moments are exact:
    # the mean is the weighted mean, and the SD comes from the weighted
    # second moment.
    if all(d.exact_mean is not None for d in dists):
        mixture.exact_mean = sum(d.exact_mean * w for d, w in zip(dists, weights))
    if all(d.exact_sd is not None and d.exact_mean is not None for d in dists):
        second_moment = sum(
            (d.exact_mean**2 + d.exact_sd**2) * w for d, w in zip(dists, weights)
        )
        mixture.exact_sd = np.sqrt(second_moment - mixture.exact_mean**2)

    return mixture.clip(lclip, rclip)
+
def given_value_satisfies(self, condition: Callable[[float], bool]):
    """Return a new distribution conditioned on the value of the random
    variable satisfying ``condition``.

    Parameters
    ----------
    condition : Callable[[float], bool]
        Predicate applied elementwise to the bin values. Bins whose value
        fails the predicate are removed and the remaining masses are
        renormalized to sum to 1.
        (Fix: the annotation was ``Callable[float, bool]``, which is not a
        valid ``typing.Callable`` subscription.)

    Returns
    -------
    NumericDistribution
        The conditional distribution. Conditioning invalidates the
        closed-form moments, so ``exact_mean``/``exact_sd`` are None.

    Raises
    ------
    ValueError
        If ``condition`` leaves no probability mass, in which case the
        conditional distribution is undefined. (Previously this divided by
        zero and produced NaN masses.)
    """
    good_indexes = np.where(np.vectorize(condition)(self.values))
    values = self.values[good_indexes]
    masses = self.masses[good_indexes]
    total_mass = np.sum(masses)
    if len(values) == 0 or total_mass <= 0:
        raise ValueError(
            "Cannot condition on an event with zero probability mass"
        )
    masses = masses / total_mass
    zero_bin_index = np.searchsorted(values, 0, side="left")
    neg_ev_contribution = -np.sum(masses[:zero_bin_index] * values[:zero_bin_index])
    pos_ev_contribution = np.sum(masses[zero_bin_index:] * values[zero_bin_index:])
    return NumericDistribution(
        values=values,
        masses=masses,
        zero_bin_index=zero_bin_index,
        neg_ev_contribution=neg_ev_contribution,
        pos_ev_contribution=pos_ev_contribution,
        exact_mean=None,
        exact_sd=None,
    )
+
def probability_value_satisfies(self, condition: Callable[float, bool]):
    """Total probability mass on bins whose value satisfies ``condition``.

    Parameters
    ----------
    condition : Callable[float, bool]
        Predicate applied elementwise to the bin values.
    """
    satisfied = np.vectorize(condition)(self.values)
    matching_mass = self.masses[np.where(satisfied)]
    return np.sum(matching_mass)
+
def __len__(self):
    # The length of a NumericDistribution is its histogram bin count.
    return self.num_bins
+
def positive_everywhere(self):
    """Return True if the distribution is positive everywhere.

    ``zero_bin_index`` counts the negative-valued bins, so 0 means no bin
    holds a negative value.
    """
    return self.zero_bin_index == 0
+
def negative_everywhere(self):
    """Return True if the distribution is negative everywhere.

    If ``zero_bin_index`` equals the total bin count, every bin holds a
    negative value.
    """
    return self.zero_bin_index == self.num_bins
+
def is_one_sided(self):
    """Return True if the histogram contains only positive or negative values."""
    return self.positive_everywhere() or self.negative_everywhere()
+
def is_two_sided(self):
    """Return True if the histogram contains both positive and negative values."""
    # Defined as the complement of is_one_sided.
    return not self.is_one_sided()
+
def num_neg_bins(self):
    """Return the number of bins containing negative values.

    Bins are sorted ascending, so ``zero_bin_index`` (the index of the
    first non-negative bin) is exactly the count of negative bins.
    """
    return self.zero_bin_index
+
def est_mean(self):
    """Mean of the distribution as estimated from the histogram: the
    mass-weighted sum of the bin values."""
    weighted_values = self.masses * self.values
    return weighted_values.sum()
+
def est_sd(self):
    """Estimated standard deviation of the distribution, calculated using
    the histogram data.

    NOTE(review): this calls ``self.mean()``, which is not defined in this
    chunk — presumably provided by a base class or elsewhere in the file.
    Confirm whether it should instead use ``est_mean()`` so the estimate is
    computed purely from histogram data.
    """
    mean = self.mean()
    # Mass-weighted second central moment, then square root.
    return np.sqrt(np.sum(self.masses * (self.values - mean) ** 2))
+
+ def _init_interpolate_cdf(self):
+ if self.interpolate_cdf is None:
+ # Subtracting 0.5 * masses because eg the first out of 100 values
+ # represents the 0.5th percentile, not the 1st percentile
+ cum_mass = np.cumsum(self.masses) - 0.5 * self.masses
+ self.interpolate_cdf = PchipInterpolator(self.values, cum_mass)
+
+ def _init_interpolate_ppf(self):
+ if self.interpolate_ppf is None:
+ cum_mass = np.cumsum(self.masses) - 0.5 * self.masses
+
+ # Mass diffs can be 0 if a mass is very small and gets rounded off.
+ # The interpolator doesn't like this, so remove these values.
+ nonzero_indexes = [0] + [i + 1 for (i, d) in enumerate(np.diff(cum_mass)) if d > 0]
+ cum_mass = cum_mass[nonzero_indexes]
+ values = self.values[nonzero_indexes]
+ self.interpolate_ppf = PchipInterpolator(cum_mass, values)
+
def cdf(self, x):
    """Estimate the proportion of the distribution that lies below ``x``."""
    # Build the interpolator on first use, then evaluate it.
    self._init_interpolate_cdf()
    return self.interpolate_cdf(x)
+
def ppf(self, q):
    """Estimate the value at quantile ``q`` (the inverse CDF), by
    interpolating between bin values."""
    self._init_interpolate_ppf()
    return self.interpolate_ppf(q)
+
def clip(self, lclip, rclip):
    """Return a new distribution clipped to the given bounds.

    It is strongly recommended that, whenever possible, you construct a
    ``NumericDistribution`` by supplying a ``Distribution`` that has
    lclip/rclip defined on it, rather than calling
    ``NumericDistribution.clip``. ``NumericDistribution.clip`` works by
    deleting bins, which can greatly decrease accuracy.

    Parameters
    ----------
    lclip : Optional[float]
        The new lower bound of the distribution, or None if the lower bound
        should not change.
    rclip : Optional[float]
        The new upper bound of the distribution, or None if the upper bound
        should not change.

    Returns
    -------
    clipped : NumericDistribution
        A new distribution clipped to the given bounds.

    """
    # No clipping requested: return a copy with all statistics intact.
    if lclip is None and rclip is None:
        return NumericDistribution(
            values=self.values,
            masses=self.masses,
            zero_bin_index=self.zero_bin_index,
            neg_ev_contribution=self.neg_ev_contribution,
            pos_ev_contribution=self.pos_ev_contribution,
            exact_mean=self.exact_mean,
            exact_sd=self.exact_sd,
            bin_sizing=self.bin_sizing,
        )

    # Normalize missing bounds to +/-inf so only the two-sided case
    # remains below.
    if lclip is None:
        lclip = -np.inf
    if rclip is None:
        rclip = np.inf

    if lclip >= rclip:
        raise ValueError(f"lclip ({lclip}) must be less than rclip ({rclip})")

    # Splines of cumulative mass / cumulative EV as functions of the
    # (fractional) bin index; used to interpolate how much of a partially
    # clipped bin survives.
    indexes = np.array(range(len(self) + 1))
    cum_mass_at_index = CubicSpline(indexes, np.concatenate(([0], np.cumsum(self.masses))))
    cum_ev_at_index = CubicSpline(
        indexes, np.concatenate(([0], np.cumsum(self.masses * self.values)))
    )

    # If lclip/rclip is outside the bounds of the known values, use linear
    # interpolation because cubic spline is sometimes non-monotonic.
    # Otherwise, use cubic spline because it's more accurate. There are
    # more accurate methods to extrapolate outside known values, but
    # they're more complicated and this case should be rare.
    # NOTE(review): the ternaries below pick the cubic spline (built with
    # extrapolate=False, i.e. NaN outside the data) precisely when the
    # clip point lies OUTSIDE self.values, which looks inverted relative
    # to the comment above — confirm intent. In practice the max/min with
    # NaN still clamps such clips to the full index range.
    index_of_value = CubicSpline(self.values, indexes[1:] - 0.5, extrapolate=False)
    linear_indexes = np.interp([lclip, rclip], self.values, indexes[1:] - 0.5)
    lclip_index = max(
        0, index_of_value(lclip) if lclip < self.values[0] else linear_indexes[0]
    )
    rclip_index = min(
        len(self.values),
        index_of_value(rclip) if rclip > self.values[-1] else linear_indexes[1],
    )
    # Keep only the (possibly fractional) index range inside the clip.
    new_masses = np.array(self.masses[int(lclip_index) : int(np.ceil(rclip_index))])
    new_values = np.array(self.values[int(lclip_index) : int(np.ceil(rclip_index))])

    # If a bin is partially clipped, interpolate how much of the bin
    # remains.
    if lclip_index != int(lclip_index):
        # lclip is between two bins
        left_mass = cum_mass_at_index(int(lclip_index) + 1) - cum_mass_at_index(lclip_index)
        if left_mass > 0:
            # The surviving fragment's value is its EV divided by its mass.
            left_value = (
                cum_ev_at_index(int(lclip_index) + 1) - cum_ev_at_index(lclip_index)
            ) / left_mass
            new_masses[0] = left_mass
            new_values[0] = left_value
    if rclip_index != int(rclip_index):
        # rclip is between two bins
        right_mass = cum_mass_at_index(rclip_index) - cum_mass_at_index(int(rclip_index))
        if right_mass > 0:
            right_value = (
                cum_ev_at_index(rclip_index) - cum_ev_at_index(int(rclip_index))
            ) / right_mass
            new_masses[-1] = right_mass
            new_values[-1] = right_value

    # Renormalize the surviving mass to 1 and recompute the EV split
    # around zero from the new bins.
    clipped_mass = np.sum(new_masses)
    new_masses /= clipped_mass
    zero_bin_index = np.searchsorted(new_values, 0)
    neg_ev_contribution = -np.sum(new_masses[:zero_bin_index] * new_values[:zero_bin_index])
    pos_ev_contribution = np.sum(new_masses[zero_bin_index:] * new_values[zero_bin_index:])

    # Clipping invalidates the closed-form moments, so they are dropped.
    return NumericDistribution(
        values=new_values,
        masses=new_masses,
        zero_bin_index=zero_bin_index,
        neg_ev_contribution=neg_ev_contribution,
        pos_ev_contribution=pos_ev_contribution,
        exact_mean=None,
        exact_sd=None,
        bin_sizing=self.bin_sizing,
    )
+
def sample(self, n=1):
    """Generate ``n`` random samples from the distribution. The samples are
    generated by interpolating between bin values in the same manner as
    :any:`ppf`.

    Parameters
    ----------
    n : int (default = 1)
        Number of samples to draw.
    """
    # Inverse-transform sampling: push uniform(0, 1) draws through the
    # interpolated quantile function.
    return self.ppf(np.random.uniform(size=n))
+
def contribution_to_ev(self, x: Union[np.ndarray, float]):
    """Return the fraction of the distribution's contribution to expected
    value that lies at or below ``x``, where each bin contributes
    ``mass * |value|``.
    """
    if self.interpolate_cev is None:
        # Each bin's fraction is taken at the midpoint of its own EV
        # contribution; a monotone (PCHIP) interpolator maps value -> frac.
        ev_per_bin = self.masses * abs(self.values)
        midpoint_fracs = (np.cumsum(ev_per_bin) - ev_per_bin / 2) / np.sum(ev_per_bin)
        self.interpolate_cev = PchipInterpolator(self.values, midpoint_fracs)
    return self.interpolate_cev(x)
+
def inv_contribution_to_ev(self, fraction: Union[np.ndarray, float]):
    """Return the value such that ``fraction`` of the contribution to
    expected value lies to the left of that value.
    """
    if self.interpolate_inv_cev is None:
        # Inverse of contribution_to_ev: a monotone (PCHIP) interpolator
        # mapping EV-fraction midpoints back to bin values.
        ev_per_bin = self.masses * abs(self.values)
        midpoint_fracs = (np.cumsum(ev_per_bin) - ev_per_bin / 2) / np.sum(ev_per_bin)
        self.interpolate_inv_cev = PchipInterpolator(midpoint_fracs, self.values)
    return self.interpolate_inv_cev(fraction)
+
def plot(self, scale="linear"):
    """Plot the distribution as a bar chart of probability density and
    both save it to ``/tmp/plot.png`` and show it.

    Parameters
    ----------
    scale : str (default = "linear")
        X-axis scale; pass "log" for a logarithmic x-axis.
    """
    from matplotlib import pyplot as plt

    # Bar i's left edge is the previous bin's value and its width runs up
    # to bin i's value; a 0 is prepended so the first bar has a left edge.
    left_edges = np.concatenate(([0], self.values))
    # Bug fix: widths must align one-to-one with masses. The original
    # computed np.diff(left_edges[1:]), which produced one fewer width
    # than there are masses and made the division below fail with a
    # shape mismatch.
    widths = np.diff(left_edges)
    densities = self.masses / widths
    # Drop near-zero-density bars to keep the plot legible.
    # NOTE(review): if every bar is filtered out, zip(*[]) raises —
    # acceptable for a debug plotting helper, but worth confirming.
    values, densities, widths = zip(
        *[
            (v, d, w)
            for v, d, w in zip(list(left_edges), list(densities), list(widths))
            if d > 0.001
        ]
    )
    if scale == "log":
        plt.xscale("log")
    plt.bar(values, densities, width=widths, align="edge")
    plt.savefig("/tmp/plot.png")
    plt.show()
+
def richardson(r: float, correct_ev: bool = True):
    """A decorator that applies Richardson extrapolation to a
    NumericDistribution method to improve its accuracy.

    This decorator uses the following procedure (this procedure assumes a
    binary operation, but it works the same for any number of arguments):

    1. Evaluate ``z = func(x, y)`` where ``x`` and ``y`` are
       ``NumericDistribution`` objects.
    2. Construct a new ``x2`` and ``y2`` which are identical to ``x``
       and ``y`` except that they use half as many bins.
    3. Evaluate ``z2 = func(x2, y2)``.
    4. Apply Richardson extrapolation: ``res = (2^r * z - z2) / (2^r - 1)``
       for some constant exponent ``r``, chosen to maximize accuracy.

    Parameters
    ----------
    r : float
        The (positive) exponent to use in Richardson extrapolation. This
        should equal the rate at which the error shrinks as the number of
        bins increases. A higher ``r`` results in slower extrapolation.
    correct_ev : bool = True
        If True, adjust the negative and positive EV contributions to be
        exactly correct. The caller should set ``correct_ev`` to False if
        the function being decorated does produce exactly correct values
        for ``(neg|pos)_contribution_to_ev``.

    Returns
    -------
    res : Callable
        A decorator function that takes a function ``func`` and returns a
        new function that applies Richardson extrapolation to ``func``.

    """

    def sum_pairs(arr):
        """Sum every pair of values in ``arr``."""
        return arr.reshape(-1, 2).sum(axis=1)

    def decorator(func):
        def inner(*hists):
            # Several guard clauses below bail out to the plain result
            # whenever extrapolation would be unsafe or meaningless.
            if not all(x.richardson_extrapolation_enabled for x in hists):
                return func(*hists)

            # Richardson extrapolation often runs into issues due to wonky
            # bin sizing if the inputs don't have the same number of bins.
            if len(set(x.num_bins for x in hists)) != 1:
                return func(*hists)

            # Empirically, BinSizing.ev and BinSizing.mass error shrinks at
            # a consistent rate r for all bins (except for the outermost
            # bins, which are more unpredictable). BinSizing.log_uniform
            # and BinSizing.uniform error growth rate isn't consistent
            # across bins, so Richardson extrapolation doesn't work well
            # (and often makes the result worse).
            if not all(x.bin_sizing in [BinSizing.ev, BinSizing.mass] for x in hists):
                return func(*hists)

            # Construct half_hists as identical to hists but with half as
            # many bins
            half_hists = []
            for x in hists:
                if len(x.masses) % 2 != 0:
                    # If the number of bins is odd, we can't halve the
                    # number of bins, so just return the original result.
                    return func(*hists)

                # Merge adjacent bin pairs: each merged bin keeps the
                # pair's total mass and EV, so its value is the pair's
                # mass-weighted mean value.
                halfx_masses = sum_pairs(x.masses)
                halfx_evs = sum_pairs(x.values * x.masses)
                halfx_values = halfx_evs / halfx_masses
                zero_bin_index = np.searchsorted(halfx_values, 0)
                halfx = NumericDistribution(
                    values=halfx_values,
                    masses=halfx_masses,
                    zero_bin_index=zero_bin_index,
                    neg_ev_contribution=np.sum(
                        halfx_masses[:zero_bin_index] * -halfx_values[:zero_bin_index]
                    ),
                    pos_ev_contribution=np.sum(
                        halfx_masses[zero_bin_index:] * halfx_values[zero_bin_index:]
                    ),
                    exact_mean=x.exact_mean,
                    exact_sd=x.exact_sd,
                    min_bins_per_side=1,
                )
                half_hists.append(halfx)

            half_res = func(*half_hists)
            full_res = func(*hists)
            if 2 * half_res.zero_bin_index != full_res.zero_bin_index:
                # In some edge cases, full_res has very small negative mass
                # and half_res has no negative mass (ex: norm(mean=0, sd=2)
                # + norm(mean=9, sd=1)), so the bins aren't lined up
                # correctly for Richardson extrapolation. These edge cases
                # are rare, so it's not a big deal to bail out on
                # Richardson extrapolation in those cases.
                return full_res

            # Pair up full-resolution bins so they line up one-to-one with
            # the half-resolution bins.
            paired_full_masses = sum_pairs(full_res.masses)
            paired_full_evs = sum_pairs(full_res.values * full_res.masses)
            paired_full_values = paired_full_evs / paired_full_masses
            k = 2 ** (-r)
            richardson_masses = (k * half_res.masses - paired_full_masses) / (k - 1)
            if any(richardson_masses < 0):
                # TODO: delete me when you're confident that this doesn't
                # happen anymore
                return full_res

            # Scale each full-resolution bin pair by the ratio of its
            # extrapolated mass to its paired mass (NaN-safe: a NaN ratio
            # leaves the bin unchanged).
            mass_adjustment = np.repeat(richardson_masses / paired_full_masses, 2)
            new_masses = full_res.masses * np.where(
                np.isnan(mass_adjustment), 1, mass_adjustment
            )

            # method 1: adjust EV
            # full_evs = full_res.values * full_res.masses
            # half_evs = half_res.values * half_res.masses
            # richardson_evs = (k * half_evs - paired_full_evs) / (k - 1)
            # ev_adjustment = np.repeat(richardson_evs / paired_full_evs, 2)
            # new_evs = full_evs * np.where(np.isnan(ev_adjustment), 1, ev_adjustment)
            # new_values = new_evs / new_masses

            # method 2: adjust values directly...
            richardson_values = (k * half_res.values - paired_full_values) / (k - 1)
            value_adjustment = np.repeat(richardson_values / paired_full_values, 2)
            new_values = full_res.values * np.where(
                np.isnan(value_adjustment), 1, value_adjustment
            )
            # ...then adjust EV to be exactly correct
            if correct_ev:
                # Rescale each side so its EV contribution matches the
                # (exact) contributions carried by full_res.
                new_neg_ev_contribution = np.sum(
                    new_masses[: full_res.zero_bin_index]
                    * -new_values[: full_res.zero_bin_index]
                )
                new_pos_ev_contribution = np.sum(
                    new_masses[full_res.zero_bin_index :]
                    * new_values[full_res.zero_bin_index :]
                )
                new_values[: full_res.zero_bin_index] *= (
                    1
                    if new_neg_ev_contribution == 0
                    else full_res.neg_ev_contribution / new_neg_ev_contribution
                )
                new_values[full_res.zero_bin_index :] *= (
                    1
                    if new_pos_ev_contribution == 0
                    else full_res.pos_ev_contribution / new_pos_ev_contribution
                )

            # Write the extrapolated bins back onto full_res in place.
            full_res.masses = new_masses
            full_res.values = new_values
            full_res.zero_bin_index = np.searchsorted(new_values, 0)
            if len(np.unique([x.bin_sizing for x in hists])) == 1:
                full_res.bin_sizing = hists[0].bin_sizing
            return full_res

        return inner

    return decorator
+
@classmethod
def _num_bins_per_side(
    cls, num_bins, neg_contribution, pos_contribution, min_bins_per_side, allowance=0
):
    """Determine how many bins to allocate to the positive and negative
    sides of the distribution.

    The negative and positive sides will get a number of bins approximately
    proportional to `neg_contribution` and `pos_contribution` respectively.

    A side keeps its bins if its share of the total contribution exceeds
    `min_bins_per_side * allowance / num_bins / 2`. `allowance = 0` means
    both sides get bins as long as they have any contribution at all.

    If one side falls below that cutoff (even with nonzero contribution),
    it is allocated zero bins and its contribution is dropped, which means
    the caller may need to adjust `neg_contribution` and
    `pos_contribution` afterward.

    Parameters
    ----------
    num_bins : int
        Total number of bins across the distribution.
    neg_contribution : float
        The total contribution of value from the negative side, using
        whatever measure of value determines bin sizing.
    pos_contribution : float
        The total contribution of value from the positive side.
    min_bins_per_side : int
        Minimum number of bins a side receives when it clears the cutoff;
        also the granularity to which the proportional split is rounded.
    allowance : float (default = 0)
        Scales the cutoff below which a side is dropped entirely.

    Returns
    -------
    (num_neg_bins, num_pos_bins) : (int, int)
        Number of bins assigned to the negative/positive side of the
        distribution.

    """
    min_prop_cutoff = min_bins_per_side * allowance * 1 / num_bins / 2
    total_contribution = neg_contribution + pos_contribution
    # Proportional split, rounded to a multiple of min_bins_per_side.
    num_neg_bins = min_bins_per_side * int(
        np.round(num_bins * neg_contribution / total_contribution / min_bins_per_side)
    )
    num_pos_bins = num_bins - num_neg_bins

    # Enforce the minimum (or drop the side) for the negative side first,
    # then the positive side; the second adjustment wins on conflict.
    if neg_contribution / total_contribution > min_prop_cutoff:
        num_neg_bins = max(min_bins_per_side, num_neg_bins)
        num_pos_bins = num_bins - num_neg_bins
    else:
        num_neg_bins = 0
        num_pos_bins = num_bins

    if pos_contribution / total_contribution > min_prop_cutoff:
        num_pos_bins = max(min_bins_per_side, num_pos_bins)
        num_neg_bins = num_bins - num_pos_bins
    else:
        num_pos_bins = 0
        num_neg_bins = num_bins

    return (num_neg_bins, num_pos_bins)
+
    @classmethod
    def _resize_pos_bins(
        cls,
        extended_values,
        extended_masses,
        num_bins,
        ev,
        bin_sizing=BinSizing.bin_count,
        is_sorted=False,
    ):
        """Given two arrays of values and masses representing the result of a
        binary operation on two positive-everywhere distributions, compress the
        arrays down to ``num_bins`` bins and return the new values and masses of
        the bins.

        Parameters
        ----------
        extended_values : np.ndarray
            The values of the distribution. The values must all be non-negative.
        extended_masses : np.ndarray
            The probability masses of the values.
        num_bins : int
            The number of bins to compress the distribution into.
        ev : float
            The expected value of the distribution.
            NOTE(review): this parameter is never referenced in the body —
            confirm whether it can be dropped or is kept for interface
            symmetry with callers.
        bin_sizing : BinSizing
            How to choose bin boundaries: ``bin_count`` (equal number of
            source entries per bin), ``ev`` (equal EV contribution per bin),
            or ``log_uniform``. Any other value raises ``ValueError``.
        is_sorted : bool
            If True, assume that ``extended_values`` and ``extended_masses`` are
            already sorted in ascending order. This provides a significant
            performance improvement (~3x).

        Returns
        -------
        values : np.ndarray
            The values of the bins (each bin's mass-weighted average value).
        masses : np.ndarray
            The probability masses of the bins.

        """
        # TODO: This whole method is messy, it could use a refactoring
        if num_bins == 0:
            return (np.array([]), np.array([]))

        # These stay None unless the vectorized fast path below fills them in;
        # the generic per-bin summation at the bottom handles every other case.
        bin_evs = None
        masses = None

        if bin_sizing == BinSizing.bin_count:
            if len(extended_values) == 1 and num_bins == 2:
                # Duplicate a singleton so it can be split across two bins.
                extended_values = np.repeat(extended_values, 2)
                extended_masses = np.repeat(extended_masses, 2) / 2
            elif len(extended_values) < num_bins:
                # Fewer entries than requested bins: nothing to compress.
                return (extended_values, extended_masses)

            boundary_indexes = np.round(np.linspace(0, len(extended_values), num_bins + 1)).astype(
                int
            )

            if not is_sorted:
                # Partition such that the values in one bin are all less than
                # or equal to the values in the next bin. Values within bins
                # don't need to be sorted, and partitioning is ~10% faster than
                # timsort.
                partitioned_indexes = extended_values.argpartition(boundary_indexes[1:-1])
                extended_values = extended_values[partitioned_indexes]
                extended_masses = extended_masses[partitioned_indexes]

            extended_evs = extended_values * extended_masses
            if len(extended_masses) % num_bins == 0:
                # Vectorize when possible for better performance
                bin_evs = extended_evs.reshape((num_bins, -1)).sum(axis=1)
                masses = extended_masses.reshape((num_bins, -1)).sum(axis=1)

        elif bin_sizing == BinSizing.ev:
            if not is_sorted:
                sorted_indexes = extended_values.argsort(kind="mergesort")
                extended_values = extended_values[sorted_indexes]
                extended_masses = extended_masses[sorted_indexes]

            extended_evs = extended_values * extended_masses
            cumulative_evs = np.concatenate(([0], np.cumsum(extended_evs)))

            # Using cumulative_evs[-1] as the upper bound can create rounding
            # errors. For example, if there are 100 bins with equal EV,
            # boundary_evs will be slightly smaller than cumulative_evs until
            # near the end, which will duplicate the first bin and skip a bin
            # near the end. Slightly increasing the upper bound fixes this.
            upper_bound = cumulative_evs[-1] * (1 + 1e-6)

            boundary_evs = np.linspace(0, upper_bound, num_bins + 1)
            boundary_indexes = np.searchsorted(cumulative_evs, boundary_evs, side="right") - 1
            # Fix bin boundaries where boundary[i] == boundary[i+1]
            if any(boundary_indexes[:-1] == boundary_indexes[1:]):
                boundary_indexes = _bump_indexes(boundary_indexes, len(extended_values))

        elif bin_sizing == BinSizing.log_uniform:
            # ``bin_count`` puts too much mass in the bins on the left and
            # right tails, but it's still more accurate than log-uniform
            # sizing, I don't know why.
            assert num_bins % 2 == 0
            assert len(extended_values) == num_bins**2

            use_pyramid_method = False
            if use_pyramid_method:
                # method 1: size bins in a pyramid shape. this preserves
                # log-uniform bin sizing but it makes the bin widths unnecessarily
                # large
                ascending_indexes = 2 * np.array(range(num_bins // 2 + 1)) ** 2
                descending_indexes = np.flip(num_bins**2 - ascending_indexes)
                boundary_indexes = np.concatenate((ascending_indexes, descending_indexes[1:]))

            else:
                # method 2: size bins by going out a fixed number of log-standard
                # deviations in each direction
                log_mean = np.average(np.log(extended_values), weights=extended_masses)
                log_sd = np.sqrt(
                    np.average((np.log(extended_values) - log_mean) ** 2, weights=extended_masses)
                )
                scale = 6.5
                log_left_bound = log_mean - scale * log_sd
                log_right_bound = log_mean + scale * log_sd
                log_boundary_values = np.linspace(log_left_bound, log_right_bound, num_bins + 1)
                boundary_values = np.exp(log_boundary_values)

                if not is_sorted:
                    sorted_indexes = extended_values.argsort(kind="mergesort")
                    extended_values = extended_values[sorted_indexes]
                    extended_masses = extended_masses[sorted_indexes]

                boundary_indexes = np.searchsorted(extended_values, boundary_values)

            # Compute sums one at a time instead of using ``cumsum`` because
            # ``cumsum`` produces non-trivial rounding errors.
            extended_evs = extended_values * extended_masses
        else:
            raise ValueError(f"resize_pos_bins: Unsupported bin sizing method: {bin_sizing}")

        if bin_evs is None:
            bin_evs = np.array(
                [
                    np.sum(extended_evs[i:j])
                    for (i, j) in zip(boundary_indexes[:-1], boundary_indexes[1:])
                ]
            )
        if masses is None:
            masses = np.array(
                [
                    np.sum(extended_masses[i:j])
                    for (i, j) in zip(boundary_indexes[:-1], boundary_indexes[1:])
                ]
            )

        # Each bin's value is its EV divided by its mass, i.e. the
        # mass-weighted mean of the entries that landed in the bin.
        values = bin_evs / masses
        return (values, masses)
+
    @classmethod
    def _resize_bins(
        cls,
        extended_neg_values: np.ndarray,
        extended_neg_masses: np.ndarray,
        extended_pos_values: np.ndarray,
        extended_pos_masses: np.ndarray,
        num_bins: int,
        neg_ev_contribution: float,
        pos_ev_contribution: float,
        bin_sizing: BinSizing = BinSizing.bin_count,
        min_bins_per_side: int = 2,
        is_sorted: bool = False,
    ):
        """Given two arrays of values and masses representing the result of a
        binary operation on two distributions, compress the arrays down to
        ``num_bins`` bins and return the new values and masses of the bins.

        Parameters
        ----------
        extended_neg_values : np.ndarray
            The values of the negative side of the distribution. The values must
            all be negative.
        extended_neg_masses : np.ndarray
            The probability masses of the negative side of the distribution.
        extended_pos_values : np.ndarray
            The values of the positive side of the distribution. The values must
            all be positive.
        extended_pos_masses : np.ndarray
            The probability masses of the positive side of the distribution.
        num_bins : int
            The number of bins to compress the distribution into.
        neg_ev_contribution : float
            The expected value of the negative side of the distribution
            (stored as a non-negative magnitude).
        pos_ev_contribution : float
            The expected value of the positive side of the distribution.
        bin_sizing : BinSizing
            The bin sizing method, forwarded to :any:`_resize_pos_bins`.
        min_bins_per_side : int
            Minimum number of bins to give a side whose EV contribution is
            non-trivial; forwarded to ``_num_bins_per_side``.
        is_sorted : bool
            If True, assume that ``extended_neg_values``,
            ``extended_neg_masses``, ``extended_pos_values``, and
            ``extended_pos_masses`` are already sorted in ascending order. This
            provides a significant performance improvement (~3x).

        Returns
        -------
        values : np.ndarray
            The values of the bins.
        masses : np.ndarray
            The probability masses of the bins.

        """
        num_neg_bins, num_pos_bins = cls._num_bins_per_side(
            num_bins, neg_ev_contribution, pos_ev_contribution, min_bins_per_side
        )

        # If one side got zero bins, fold its EV into the other side so the
        # total EV is preserved.
        total_ev = pos_ev_contribution - neg_ev_contribution
        if num_neg_bins == 0:
            neg_ev_contribution = 0
            pos_ev_contribution = total_ev
        if num_pos_bins == 0:
            neg_ev_contribution = -total_ev
            pos_ev_contribution = 0

        # Collect extended_values and extended_masses into the correct number
        # of bins. Make ``extended_values`` positive because
        # ``_resize_pos_bins`` can only operate on non-negative values. Making
        # them positive means they're now reverse-sorted, so reverse them.
        neg_values, neg_masses = cls._resize_pos_bins(
            extended_values=np.flip(-extended_neg_values),
            extended_masses=np.flip(extended_neg_masses),
            num_bins=num_neg_bins,
            ev=neg_ev_contribution,
            bin_sizing=bin_sizing,
            is_sorted=is_sorted,
        )

        # ``_resize_pos_bins`` returns positive values, so negate and reverse
        # them to restore ascending negative order.
        neg_values = np.flip(-neg_values)
        neg_masses = np.flip(neg_masses)

        # Collect extended_values and extended_masses into the correct number
        # of bins, for the positive values this time.
        pos_values, pos_masses = cls._resize_pos_bins(
            extended_values=extended_pos_values,
            extended_masses=extended_pos_masses,
            num_bins=num_pos_bins,
            ev=pos_ev_contribution,
            bin_sizing=bin_sizing,
            is_sorted=is_sorted,
        )

        # Construct the resulting ``NumericDistribution`` object.
        values = np.concatenate((neg_values, pos_values))
        masses = np.concatenate((neg_masses, pos_masses))
        return NumericDistribution(
            values=values,
            masses=masses,
            zero_bin_index=len(neg_masses),
            neg_ev_contribution=neg_ev_contribution,
            pos_ev_contribution=pos_ev_contribution,
            exact_mean=None,
            exact_sd=None,
        )
+
+ def __eq__(x, y):
+ return x.values == y.values and x.masses == y.masses
+
    def __add__(x, y):
        """Return the distribution of the sum of two independent variables.

        A scalar is handled by :any:`shift_by`; a ``ZeroNumericDistribution``
        delegates to its ``__radd__``; any other type raises ``TypeError``.
        Otherwise, convolve the two histograms and compress the result back
        down to ``max(len(x), len(y))`` bins.
        """
        if isinstance(y, Real):
            return x.shift_by(y)
        elif isinstance(y, ZeroNumericDistribution):
            return y.__radd__(x)
        elif not isinstance(y, NumericDistribution):
            raise TypeError(f"Cannot add types {type(x)} and {type(y)}")

        # ``cls`` is actually an instance here; it is only used to invoke the
        # classmethod ``_resize_bins``.
        cls = x
        # len() of a NumericDistribution is presumably its bin count — defined
        # elsewhere in this class.
        num_bins = max(len(x), len(y))

        # Add every pair of values and find the joint probabilty mass for every
        # sum.
        extended_values = np.add.outer(x.values, y.values).reshape(-1)
        extended_masses = np.outer(x.masses, y.masses).reshape(-1)

        # Sort so we can split the values into positive and negative sides.
        # Use timsort (called 'mergesort' by the numpy API) because
        # ``extended_values`` contains many sorted runs. And then pass
        # `is_sorted` down to `_resize_bins` so it knows not to sort again.
        sorted_indexes = extended_values.argsort(kind="mergesort")
        extended_values = extended_values[sorted_indexes]
        extended_masses = extended_masses[sorted_indexes]
        zero_index = np.searchsorted(extended_values, 0)
        is_sorted = True

        # Find how much of the EV contribution is on the negative side vs. the
        # positive side.
        neg_ev_contribution = -np.sum(extended_values[:zero_index] * extended_masses[:zero_index])
        pos_ev_contribution = np.sum(extended_values[zero_index:] * extended_masses[zero_index:])

        res = cls._resize_bins(
            extended_neg_values=extended_values[:zero_index],
            extended_neg_masses=extended_masses[:zero_index],
            extended_pos_values=extended_values[zero_index:],
            extended_pos_masses=extended_masses[zero_index:],
            num_bins=num_bins,
            neg_ev_contribution=neg_ev_contribution,
            pos_ev_contribution=pos_ev_contribution,
            is_sorted=is_sorted,
            min_bins_per_side=x.min_bins_per_side,
        )

        # Exact moments of a sum of independent variables: means add, and
        # variances add (so SDs combine in quadrature).
        if x.exact_mean is not None and y.exact_mean is not None:
            res.exact_mean = x.exact_mean + y.exact_mean
        if x.exact_sd is not None and y.exact_sd is not None:
            res.exact_sd = np.sqrt(x.exact_sd**2 + y.exact_sd**2)
        return res
+
+ def shift_by(self, scalar):
+ """Shift the distribution over by a constant factor."""
+ values = self.values + scalar
+ zero_bin_index = np.searchsorted(values, 0)
+ return NumericDistribution(
+ values=values,
+ masses=self.masses,
+ zero_bin_index=zero_bin_index,
+ neg_ev_contribution=-np.sum(values[:zero_bin_index] * self.masses[:zero_bin_index]),
+ pos_ev_contribution=np.sum(values[zero_bin_index:] * self.masses[zero_bin_index:]),
+ exact_mean=self.exact_mean + scalar if self.exact_mean is not None else None,
+ exact_sd=self.exact_sd,
+ )
+
+ def __neg__(self):
+ return NumericDistribution(
+ values=np.flip(-self.values),
+ masses=np.flip(self.masses),
+ zero_bin_index=len(self.values) - self.zero_bin_index,
+ neg_ev_contribution=self.pos_ev_contribution,
+ pos_ev_contribution=self.neg_ev_contribution,
+ exact_mean=-self.exact_mean if self.exact_mean is not None else None,
+ exact_sd=self.exact_sd,
+ )
+
+ def __abs__(self):
+ values = abs(self.values)
+ masses = self.masses
+ sorted_indexes = np.argsort(values, kind="mergesort")
+ values = values[sorted_indexes]
+ masses = masses[sorted_indexes]
+ return NumericDistribution(
+ values=values,
+ masses=masses,
+ zero_bin_index=0,
+ neg_ev_contribution=0,
+ pos_ev_contribution=np.sum(values * masses),
+ exact_mean=None,
+ exact_sd=None,
+ )
+
+ def __mul__(x, y):
+ if isinstance(y, Real):
+ return x.scale_by(y)
+ elif isinstance(y, ZeroNumericDistribution):
+ return y.__rmul__(x)
+ elif not isinstance(y, NumericDistribution):
+ raise TypeError(f"Cannot add types {type(x)} and {type(y)}")
+
+ return x._inner_mul(y)
+
    @richardson(r=1.5)
    def _inner_mul(x, y):
        """Multiply two ``NumericDistribution``s assuming independence.

        Decorated with ``@richardson`` — presumably Richardson extrapolation
        over a resolution parameter; TODO confirm the decorator's semantics
        (defined elsewhere in this module).

        Returns a new distribution with ``max(len(x), len(y))`` bins.
        """
        # ``cls`` is actually an instance; only used to call the classmethod
        # ``_resize_bins``.
        cls = x
        num_bins = max(len(x), len(y))

        # If xpos is the positive part of x and xneg is the negative part, then
        # resultpos = (xpos * ypos) + (xneg * yneg) and resultneg = (xpos * yneg) + (xneg *
        # ypos). We perform this calculation by running these steps:
        #
        # 1. Multiply the four pairs of one-sided distributions,
        #    producing n^2 bins.
        # 2. Add the two positive results and the two negative results into
        #    an array of positive values and an array of negative values.
        # 3. Run the binning algorithm on both arrays to compress them into
        #    a total of n bins.
        # 4. Join the two arrays into a new histogram.
        xneg_values = x.values[: x.zero_bin_index]
        xneg_masses = x.masses[: x.zero_bin_index]
        xpos_values = x.values[x.zero_bin_index :]
        xpos_masses = x.masses[x.zero_bin_index :]
        yneg_values = y.values[: y.zero_bin_index]
        yneg_masses = y.masses[: y.zero_bin_index]
        ypos_values = y.values[y.zero_bin_index :]
        ypos_masses = y.masses[y.zero_bin_index :]

        # Calculate the four products.
        extended_neg_values = np.concatenate(
            (
                np.outer(xneg_values, ypos_values).reshape(-1),
                np.outer(xpos_values, yneg_values).reshape(-1),
            )
        )
        extended_neg_masses = np.concatenate(
            (
                np.outer(xneg_masses, ypos_masses).reshape(-1),
                np.outer(xpos_masses, yneg_masses).reshape(-1),
            )
        )
        extended_pos_values = np.concatenate(
            (
                np.outer(xneg_values, yneg_values).reshape(-1),
                np.outer(xpos_values, ypos_values).reshape(-1),
            )
        )
        extended_pos_masses = np.concatenate(
            (
                np.outer(xneg_masses, yneg_masses).reshape(-1),
                np.outer(xpos_masses, ypos_masses).reshape(-1),
            )
        )

        # Set the number of bins per side to be approximately proportional to
        # the EV contribution, but make sure that if a side has non-trivial EV
        # contribution, it gets at least one bin, even if it's less
        # contribution than an average bin.
        neg_ev_contribution = (
            x.neg_ev_contribution * y.pos_ev_contribution
            + x.pos_ev_contribution * y.neg_ev_contribution
        )
        pos_ev_contribution = (
            x.neg_ev_contribution * y.neg_ev_contribution
            + x.pos_ev_contribution * y.pos_ev_contribution
        )

        res = cls._resize_bins(
            extended_neg_values=extended_neg_values,
            extended_neg_masses=extended_neg_masses,
            extended_pos_values=extended_pos_values,
            extended_pos_masses=extended_pos_masses,
            num_bins=num_bins,
            neg_ev_contribution=neg_ev_contribution,
            pos_ev_contribution=pos_ev_contribution,
            min_bins_per_side=x.min_bins_per_side,
            is_sorted=False,
        )
        # Exact moments of a product of independent variables: means multiply;
        # the variance follows Var(XY) = (sx*my)^2 + (sy*mx)^2 + (sx*sy)^2.
        if x.exact_mean is not None and y.exact_mean is not None:
            res.exact_mean = x.exact_mean * y.exact_mean
        if x.exact_sd is not None and y.exact_sd is not None:
            res.exact_sd = np.sqrt(
                (x.exact_sd * y.exact_mean) ** 2
                + (y.exact_sd * x.exact_mean) ** 2
                + (x.exact_sd * y.exact_sd) ** 2
            )
        return res
+
+ def __pow__(x, y):
+ """Raise the distribution to a power.
+
+ Note: x * x does not give the same result as x ** 2 because
+ multiplication assumes that the two distributions are independent.
+ """
+ if isinstance(y, Real) or isinstance(y, NumericDistribution):
+ return (x.log() * y).exp()
+ else:
+ raise TypeError(f"Cannot compute x**y for types {type(x)} and {type(y)}")
+
+ def __rpow__(x, y):
+ # Compute y**x
+ if isinstance(y, Real):
+ return (x * np.log(y)).exp()
+ else:
+ raise TypeError(f"Cannot compute x**y for types {type(x)} and {type(y)}")
+
+ def scale_by(self, scalar):
+ """Scale the distribution by a constant factor."""
+ if scalar < 0:
+ return -self * -scalar
+ return NumericDistribution(
+ values=self.values * scalar,
+ masses=self.masses,
+ zero_bin_index=self.zero_bin_index,
+ neg_ev_contribution=self.neg_ev_contribution * scalar,
+ pos_ev_contribution=self.pos_ev_contribution * scalar,
+ exact_mean=self.exact_mean * scalar if self.exact_mean is not None else None,
+ exact_sd=self.exact_sd * scalar if self.exact_sd is not None else None,
+ )
+
+ def reciprocal(self):
+ """Return the reciprocal of the distribution.
+
+ Warning: The result can be very inaccurate for certain distributions
+ and bin sizing methods. Specifically, if the distribution is fat-tailed
+ and does not use log-uniform bin sizing, the reciprocal will be
+ inaccurate for small values. Most bin sizing methods on fat-tailed
+ distributions maximize information at the tails in exchange for less
+ accuracy on [0, 1], which means the reciprocal will contain very little
+ information about the tails. Log-uniform bin sizing is most accurate
+ because it is invariant with reciprocation.
+ """
+ values = 1 / self.values
+ sorted_indexes = values.argsort()
+ values = values[sorted_indexes]
+ masses = self.masses[sorted_indexes]
+
+ # Re-calculate EV contribution manually.
+ neg_ev_contribution = np.sum(values[: self.zero_bin_index] * masses[: self.zero_bin_index])
+ pos_ev_contribution = np.sum(values[self.zero_bin_index :] * masses[self.zero_bin_index :])
+
+ return NumericDistribution(
+ values=values,
+ masses=masses,
+ zero_bin_index=self.zero_bin_index,
+ neg_ev_contribution=neg_ev_contribution,
+ pos_ev_contribution=pos_ev_contribution,
+ # There is no general formula for the mean and SD of the
+ # reciprocal of a random variable.
+ exact_mean=None,
+ exact_sd=None,
+ )
+
    @richardson(r=0.66, correct_ev=False)
    def exp(self):
        """Return the exponential of the distribution.

        The result is positive-everywhere, so the whole EV sits in
        ``pos_ev_contribution``. Decorated with ``@richardson`` — presumably
        Richardson extrapolation; TODO confirm the decorator's semantics.
        """
        # Note: This code naively sets the average value within each bin to
        # e^x, which is wrong because we want E[e^X], not e^E[X]. An
        # alternative method, which you might expect to be more accurate, is to
        # interpolate the edges of each bin and then set each bin's value to
        # the result of the integral
        #
        # .. math::
        #     \int_{lb}^{ub} \frac{1}{ub - lb} exp(x) dx
        #
        # However, this method turns out to be less accurate overall (although
        # it's more accurate in the tails of the distribution).
        #
        # Where the underlying distribution is normal, the naive e^E[X] method
        # systematically underestimates expected value (by something like 0.1%
        # with num_bins=200), and the integration method overestimates expected
        # value by about 3x as much. Both methods mis-estimate the standard
        # deviation in the same direction as they mis-estimate the mean but by
        # a somewhat larger margin, with the naive method again having the
        # better estimate.
        #
        # Another method would be not to interpolate the edge values, and
        # instead record the true edge values when the numeric distribution is
        # generated and carry them through mathematical operations by
        # re-calculating them. But this method would be much more complicated,
        # and we'd need to lazily compute the edge values to avoid a ~2x
        # performance penalty.
        values = np.exp(self.values)
        return NumericDistribution(
            values=values,
            masses=self.masses,
            zero_bin_index=0,
            neg_ev_contribution=0,
            pos_ev_contribution=np.sum(values * self.masses),
            exact_mean=None,
            exact_sd=None,
        )
+
+ @richardson(r=0.66, correct_ev=False)
+ def log(self):
+ """Return the natural log of the distribution."""
+ # See :any:`exp`` for some discussion of accuracy. For ``log` on a
+ # log-normal distribution, both the naive method and the integration
+ # method tend to overestimate the true mean, but the naive method
+ # overestimates it by less.
+ if self.zero_bin_index != 0:
+ raise ValueError("Cannot take the log of a distribution with non-positive values")
+
+ values = np.log(self.values)
+ return NumericDistribution(
+ values=values,
+ masses=self.masses,
+ zero_bin_index=np.searchsorted(values, 0),
+ neg_ev_contribution=np.sum(
+ values[: self.zero_bin_index] * self.masses[: self.zero_bin_index]
+ ),
+ pos_ev_contribution=np.sum(
+ values[self.zero_bin_index :] * self.masses[self.zero_bin_index :]
+ ),
+ exact_mean=None,
+ exact_sd=None,
+ )
+
+ def __hash__(self):
+ return hash(repr(self.values) + "," + repr(self.masses))
+
+
class ZeroNumericDistribution(BaseNumericDistribution):
    """
    A :any:`NumericDistribution` with a point mass at zero.

    ``dist`` describes the distribution conditional on the value being
    nonzero; ``zero_mass`` is the probability that the value is exactly zero.
    """

    def __init__(self, dist: NumericDistribution, zero_mass: float):
        """Wrap ``dist`` with an additional probability mass at zero.

        Raises
        ------
        TypeError
            If ``dist`` is not a NumericDistribution or ``zero_mass`` is not
            a Real.
        """
        if not isinstance(dist, NumericDistribution):
            raise TypeError(f"dist must be a NumericDistribution, got {type(dist)}")
        if not isinstance(zero_mass, Real):
            raise TypeError(f"zero_mass must be a Real, got {type(zero_mass)}")

        self._version = __version__
        self.dist = dist
        self.zero_mass = zero_mass
        self.nonzero_mass = 1 - zero_mass
        self.exact_mean = None
        self.exact_sd = None
        self.exact_2nd_moment = None

        # Mixture moments: E[X] = P(X != 0) * E[X | X != 0]; the second
        # moment scales the same way, and the SD follows from the two. The SD
        # formula needs the exact mean, so it is only computed when both
        # exact moments of the wrapped dist are available.
        if dist.exact_mean is not None:
            self.exact_mean = dist.exact_mean * self.nonzero_mass
            if dist.exact_sd is not None:
                nonzero_moment2 = dist.exact_mean**2 + dist.exact_sd**2
                moment2 = self.nonzero_mass * nonzero_moment2
                variance = moment2 - self.exact_mean**2
                self.exact_sd = np.sqrt(variance)

        # Total probability mass below zero, used by ``ppf``.
        self._neg_mass = np.sum(dist.masses[: dist.zero_bin_index]) * self.nonzero_mass

        # To be computed lazily
        self.interpolate_ppf = None

    @classmethod
    def wrap(cls, dist: BaseNumericDistribution, zero_mass: float):
        """Like the constructor, but if ``dist`` is itself already a
        ``ZeroNumericDistribution``, flatten it by combining the zero masses."""
        if isinstance(dist, ZeroNumericDistribution):
            return cls(dist.dist, zero_mass + dist.zero_mass * (1 - zero_mass))
        return cls(dist, zero_mass)

    def given_value_satisfies(self, condition):
        """Return the distribution conditioned on ``condition(value)``.

        If zero satisfies the condition, the zero mass is kept and rescaled
        relative to the surviving nonzero mass; otherwise the result is a
        plain conditioned ``NumericDistribution``.
        """
        nonzero_dist = self.dist.given_value_satisfies(condition)
        if condition(0):
            nonzero_mass = np.sum(
                [x for i, x in enumerate(self.dist.masses) if condition(self.dist.values[i])]
            )
            zero_mass = self.zero_mass
            total_mass = nonzero_mass + zero_mass
            scaled_zero_mass = zero_mass / total_mass
            return ZeroNumericDistribution(nonzero_dist, scaled_zero_mass)
        return nonzero_dist

    def probability_value_satisfies(self, condition):
        """Probability that a sampled value satisfies ``condition``, counting
        the point mass at zero iff ``condition(0)`` is truthy."""
        return (
            self.dist.probability_value_satisfies(condition) * self.nonzero_mass
            + condition(0) * self.zero_mass
        )

    def est_mean(self):
        """Estimated mean: the wrapped dist's mean scaled by the nonzero mass
        (the zero point contributes nothing)."""
        return self.dist.est_mean() * self.nonzero_mass

    def est_sd(self):
        """Estimated standard deviation of the mixture."""
        # NOTE(review): ``mean()`` is presumably provided by
        # BaseNumericDistribution — confirm it exists on this class.
        mean = self.mean()
        nonzero_variance = (
            np.sum(self.dist.masses * (self.dist.values - mean) ** 2) * self.nonzero_mass
        )
        zero_variance = self.zero_mass * mean**2
        variance = nonzero_variance + zero_variance
        return np.sqrt(variance)

    def ppf(self, q):
        """Quantile function. Quantiles inside the zero mass band return 0;
        the rest are delegated to the wrapped dist with ``q`` rescaled to
        remove the zero mass. Non-scalar ``q`` is handled elementwise."""
        if not isinstance(q, Real):
            return np.array([self.ppf(x) for x in q])

        if q < 0 or q > 1:
            raise ValueError(f"q must be between 0 and 1, got {q}")

        if q <= self._neg_mass:
            return self.dist.ppf(q / self.nonzero_mass)
        elif q < self._neg_mass + self.zero_mass:
            return 0
        else:
            return self.dist.ppf((q - self.zero_mass) / self.nonzero_mass)

    def __eq__(x, y):
        # Relies on NumericDistribution.__eq__ for the wrapped dists.
        return x.zero_mass == y.zero_mass and x.dist == y.dist

    def __add__(x, y):
        """Add a scalar, a NumericDistribution, or another
        ZeroNumericDistribution (assumed independent)."""
        if isinstance(y, NumericDistribution):
            return x + ZeroNumericDistribution(y, 0)
        elif isinstance(y, Real):
            return x.shift_by(y)
        elif not isinstance(y, ZeroNumericDistribution):
            # Raise TypeError (not ValueError) for consistency with
            # NumericDistribution.__add__.
            raise TypeError(f"Cannot add types {type(x)} and {type(y)}")
        # NOTE(review): ``*`` with a scalar scales *values* (via scale_by),
        # and ``+`` between NumericDistributions convolves them — confirm
        # this is the intended way to combine the three mixture components.
        nonzero_sum = (x.dist + y.dist) * x.nonzero_mass * y.nonzero_mass
        extra_x = x.dist * x.nonzero_mass * y.zero_mass
        extra_y = y.dist * x.zero_mass * y.nonzero_mass
        zero_mass = x.zero_mass * y.zero_mass
        return ZeroNumericDistribution(nonzero_sum + extra_x + extra_y, zero_mass)

    def shift_by(self, scalar):
        """Shift by a constant. The zero point mass moves to ``scalar``, so
        the result is a plain ``NumericDistribution`` with an extra bin at
        ``scalar``, not a ``ZeroNumericDistribution``."""
        old_zero_index = self.dist.zero_bin_index
        shifted_dist = self.dist.shift_by(scalar)
        scaled_masses = shifted_dist.masses * self.nonzero_mass
        values = np.insert(shifted_dist.values, old_zero_index, scalar)
        masses = np.insert(scaled_masses, old_zero_index, self.zero_mass)
        exact_mean = None
        if self.exact_mean is not None:
            exact_mean = self.exact_mean + scalar
        exact_sd = self.exact_sd

        # The former zero mass now sits at ``scalar`` and contributes
        # |scalar| * zero_mass to the side matching the sign of ``scalar``.
        # EV contributions are stored as non-negative magnitudes, so these
        # use max(0, ...); the original used min(0, ...), which added a
        # negative amount to the wrong side.
        return NumericDistribution(
            values=values,
            masses=masses,
            zero_bin_index=shifted_dist.zero_bin_index,
            neg_ev_contribution=shifted_dist.neg_ev_contribution * self.nonzero_mass
            + max(0, -scalar) * self.zero_mass,
            pos_ev_contribution=shifted_dist.pos_ev_contribution * self.nonzero_mass
            + max(0, scalar) * self.zero_mass,
            exact_mean=exact_mean,
            exact_sd=exact_sd,
        )

    def __neg__(self):
        """Negate the nonzero part; the zero mass is unaffected."""
        return ZeroNumericDistribution(-self.dist, self.zero_mass)

    def __abs__(self):
        """Absolute value of the nonzero part; the zero mass is unaffected."""
        return ZeroNumericDistribution(abs(self.dist), self.zero_mass)

    def exp(self):
        # TODO: exponentiate the wrapped dist, then do something like shift_by
        # to insert a 1 into the bins
        # (The original *returned* the NotImplementedError class instead of
        # raising it, which silently handed callers an exception class.)
        raise NotImplementedError

    def log(self):
        """The zero point mass makes log undefined."""
        raise ValueError("Cannot take the logarithm of a distribution with non-positive values")

    def __mul__(x, y):
        """Multiply by a scalar, a NumericDistribution, or another
        ZeroNumericDistribution (assumed independent). The product is zero
        whenever either factor is zero, so the zero masses combine as
        ``1 - (1 - zx)(1 - zy)``."""
        if isinstance(y, NumericDistribution):
            return x * ZeroNumericDistribution(y, 0)
        if isinstance(y, Real):
            return x.scale_by(y)
        if not isinstance(y, ZeroNumericDistribution):
            # Type guard added for consistency with __add__; the original
            # fell through to an AttributeError on unsupported types.
            raise TypeError(f"Cannot multiply types {type(x)} and {type(y)}")
        dist = x.dist * y.dist
        nonzero_mass = x.nonzero_mass * y.nonzero_mass
        return ZeroNumericDistribution(dist, 1 - nonzero_mass)

    def scale_by(self, scalar):
        """Scale the nonzero part by a constant; the zero mass is unaffected
        (0 * scalar is still 0)."""
        return ZeroNumericDistribution(self.dist.scale_by(scalar), self.zero_mass)

    def reciprocal(self):
        """1/0 is undefined, so the reciprocal of this mixture is undefined."""
        raise ValueError(
            "Reciprocal is undefined for probability distributions "
            "with non-infinitesimal mass at zero"
        )

    def __hash__(self):
        return 33 * hash(repr(self.zero_mass)) + hash(self.dist)
+
+
def numeric(
    dist: Union[BaseDistribution, BaseNumericDistribution],
    num_bins: Optional[int] = None,
    bin_sizing: Optional[str] = None,
    warn: bool = True,
):
    """Create a probability mass histogram from the given distribution.

    Parameters
    ----------
    dist : BaseDistribution | BaseNumericDistribution
        A distribution from which to generate numeric values. If the
        provided value is a :any:`BaseNumericDistribution`, simply return
        it.
    num_bins : Optional[int] (default = :any:`DEFAULT_NUM_BINS`)
        The number of bins for the numeric distribution to use. The time to
        construct a NumericDistribution is linear with ``num_bins``, and
        the time to run a binary operation on two distributions with the
        same number of bins is approximately quadratic with ``num_bins``.
        100 bins provides a good balance between accuracy and speed.
    bin_sizing : Optional[str]
        The bin sizing method to use, which affects the accuracy of the
        bins. If none is given, a default will be chosen from
        :any:`DEFAULT_BIN_SIZING` based on the distribution type of
        ``dist``. It is recommended to use the default bin sizing method
        most of the time. See :any:`BinSizing` for a list of valid options
        and explanations of their behavior.
    warn : Optional[bool] (default = True)
        If True, raise warnings about bins with zero mass.

    Returns
    -------
    result : NumericDistribution | ZeroNumericDistribution
        The generated numeric distribution that represents ``dist``.

    """
    return NumericDistribution.from_distribution(dist, num_bins, bin_sizing, warn)
diff --git a/squigglepy/samplers.py b/squigglepy/samplers.py
index e88df0b..983204f 100644
--- a/squigglepy/samplers.py
+++ b/squigglepy/samplers.py
@@ -47,6 +47,7 @@
UniformDistribution,
const,
)
+from .numeric_distribution import NumericDistribution
_squigglepy_internal_sample_caches = {}
@@ -465,13 +466,14 @@ def pareto_sample(shape, samples=1):
Returns
-------
int
- A random number sampled from an pareto distribution.
+ A random number sampled from a pareto distribution.
Examples
--------
>>> set_seed(42)
>>> pareto_sample(1)
10.069666324736094
+
"""
return _simplify(_get_rng().pareto(shape, samples))
@@ -1107,6 +1109,9 @@ def run_dist(dist, pbar=None, tick=1):
if is_dist(samples) or callable(samples):
samples = sample(samples, n=n)
+ elif isinstance(dist, NumericDistribution):
+ samples = dist.sample(n=n)
+
else:
raise ValueError("{} sampler not found".format(type(dist)))
diff --git a/squigglepy/utils.py b/squigglepy/utils.py
index 03e02dc..4124371 100644
--- a/squigglepy/utils.py
+++ b/squigglepy/utils.py
@@ -436,12 +436,28 @@ def get_percentiles(
"""
percentiles = percentiles if isinstance(percentiles, list) else [percentiles]
percentile_labels = list(reversed(percentiles)) if reverse else percentiles
- percentiles = np.percentile(data, percentiles)
- percentiles = [_round(p, digits) for p in percentiles]
+
+ if (
+ type(data).__name__ == "NumericDistribution"
+ or type(data).__name__ == "ZeroNumericDistribution"
+ ):
+ if len(percentiles) == 1:
+ values = [data.percentile(percentiles[0])]
+ else:
+ values = data.percentile(percentiles)
+ else:
+ values = np.percentile(data, percentiles)
+ values = [_round(p, digits) for p in values]
if len(percentile_labels) == 1:
- return percentiles[0]
+ return values[0]
else:
- return dict(list(zip(percentile_labels, percentiles)))
+ return dict(list(zip(percentile_labels, values)))
+
+
def mean(x):
    """Return the mean of ``x``.

    Dispatches on the type name so that squigglepy numeric-distribution
    objects use their own ``mean`` method (without importing the class, which
    would create a circular import); everything else goes to ``np.mean``.
    """
    if type(x).__name__ in ("NumericDistribution", "ZeroNumericDistribution"):
        return x.mean()
    return np.mean(x)
def get_log_percentiles(
@@ -912,25 +928,25 @@ def kelly(my_price, market_price, deference=0, bankroll=1, resolve_date=None, cu
-------
dict
A dict of values specifying:
- * ``my_price``
- * ``market_price``
- * ``deference``
- * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken
- into account.
- * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``.
- * ``adj_delta_price`` : the absolute difference between ``adj_price`` and
- ``market_price``.
- * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll``
- you should bet.
- * ``target`` : the target amount of money you should have invested
- * ``current``
- * ``delta`` : the amount of money you should invest given what you already
- have invested
- * ``max_gain`` : the amount of money you would gain if you win
- * ``modeled_gain`` : the expected value you would win given ``adj_price``
- * ``expected_roi`` : the expected return on investment
- * ``expected_arr`` : the expected ARR given ``resolve_date``
- * ``resolve_date``
+ * ``my_price``
+ * ``market_price``
+ * ``deference``
+ * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken
+ into account.
+ * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``.
+ * ``adj_delta_price`` : the absolute difference between ``adj_price`` and
+ ``market_price``.
+ * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll``
+ you should bet.
+ * ``target`` : the target amount of money you should have invested
+ * ``current``
+ * ``delta`` : the amount of money you should invest given what you already
+ have invested
+ * ``max_gain`` : the amount of money you would gain if you win
+ * ``modeled_gain`` : the expected value you would win given ``adj_price``
+ * ``expected_roi`` : the expected return on investment
+ * ``expected_arr`` : the expected ARR given ``resolve_date``
+ * ``resolve_date``
Examples
--------
@@ -1000,25 +1016,25 @@ def full_kelly(my_price, market_price, bankroll=1, resolve_date=None, current=0)
-------
dict
A dict of values specifying:
- * ``my_price``
- * ``market_price``
- * ``deference``
- * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken
- into account.
- * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``.
- * ``adj_delta_price`` : the absolute difference between ``adj_price`` and
- ``market_price``.
- * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll``
- you should bet.
- * ``target`` : the target amount of money you should have invested
- * ``current``
- * ``delta`` : the amount of money you should invest given what you already
- have invested
- * ``max_gain`` : the amount of money you would gain if you win
- * ``modeled_gain`` : the expected value you would win given ``adj_price``
- * ``expected_roi`` : the expected return on investment
- * ``expected_arr`` : the expected ARR given ``resolve_date``
- * ``resolve_date``
+ * ``my_price``
+ * ``market_price``
+ * ``deference``
+ * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken
+ into account.
+ * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``.
+ * ``adj_delta_price`` : the absolute difference between ``adj_price`` and
+ ``market_price``.
+ * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll``
+ you should bet.
+ * ``target`` : the target amount of money you should have invested
+ * ``current``
+ * ``delta`` : the amount of money you should invest given what you already
+ have invested
+ * ``max_gain`` : the amount of money you would gain if you win
+ * ``modeled_gain`` : the expected value you would win given ``adj_price``
+ * ``expected_roi`` : the expected return on investment
+ * ``expected_arr`` : the expected ARR given ``resolve_date``
+ * ``resolve_date``
Examples
--------
@@ -1062,25 +1078,25 @@ def half_kelly(my_price, market_price, bankroll=1, resolve_date=None, current=0)
-------
dict
A dict of values specifying:
- * ``my_price``
- * ``market_price``
- * ``deference``
- * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken
- into account.
- * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``.
- * ``adj_delta_price`` : the absolute difference between ``adj_price`` and
- ``market_price``.
- * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll``
- you should bet.
- * ``target`` : the target amount of money you should have invested
- * ``current``
- * ``delta`` : the amount of money you should invest given what you already
- have invested
- * ``max_gain`` : the amount of money you would gain if you win
- * ``modeled_gain`` : the expected value you would win given ``adj_price``
- * ``expected_roi`` : the expected return on investment
- * ``expected_arr`` : the expected ARR given ``resolve_date``
- * ``resolve_date``
+ * ``my_price``
+ * ``market_price``
+ * ``deference``
+ * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken
+ into account.
+ * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``.
+ * ``adj_delta_price`` : the absolute difference between ``adj_price`` and
+ ``market_price``.
+ * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll``
+ you should bet.
+ * ``target`` : the target amount of money you should have invested
+ * ``current``
+ * ``delta`` : the amount of money you should invest given what you already
+ have invested
+ * ``max_gain`` : the amount of money you would gain if you win
+ * ``modeled_gain`` : the expected value you would win given ``adj_price``
+ * ``expected_roi`` : the expected return on investment
+ * ``expected_arr`` : the expected ARR given ``resolve_date``
+ * ``resolve_date``
Examples
--------
@@ -1124,25 +1140,25 @@ def quarter_kelly(my_price, market_price, bankroll=1, resolve_date=None, current
-------
dict
A dict of values specifying:
- * ``my_price``
- * ``market_price``
- * ``deference``
- * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken
- into account.
- * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``.
- * ``adj_delta_price`` : the absolute difference between ``adj_price`` and
- ``market_price``.
- * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll``
- you should bet.
- * ``target`` : the target amount of money you should have invested
- * ``current``
- * ``delta`` : the amount of money you should invest given what you already
- have invested
- * ``max_gain`` : the amount of money you would gain if you win
- * ``modeled_gain`` : the expected value you would win given ``adj_price``
- * ``expected_roi`` : the expected return on investment
- * ``expected_arr`` : the expected ARR given ``resolve_date``
- * ``resolve_date``
+ * ``my_price``
+ * ``market_price``
+ * ``deference``
+ * ``adj_price`` : an adjustment to ``my_price`` once ``deference`` is taken
+ into account.
+ * ``delta_price`` : the absolute difference between ``my_price`` and ``market_price``.
+ * ``adj_delta_price`` : the absolute difference between ``adj_price`` and
+ ``market_price``.
+ * ``kelly`` : the kelly criterion indicating the percentage of ``bankroll``
+ you should bet.
+ * ``target`` : the target amount of money you should have invested
+ * ``current``
+ * ``delta`` : the amount of money you should invest given what you already
+ have invested
+ * ``max_gain`` : the amount of money you would gain if you win
+ * ``modeled_gain`` : the expected value you would win given ``adj_price``
+ * ``expected_roi`` : the expected return on investment
+ * ``expected_arr`` : the expected ARR given ``resolve_date``
+ * ``resolve_date``
Examples
--------
@@ -1191,3 +1207,7 @@ def extremize(p, e):
return 1 - ((1 - p) ** e)
else:
return p**e
+
+
+class ConvergenceWarning(RuntimeWarning):
+ ...
diff --git a/tests/test_accuracy.py b/tests/test_accuracy.py
new file mode 100644
index 0000000..8968d43
--- /dev/null
+++ b/tests/test_accuracy.py
@@ -0,0 +1,788 @@
+from functools import reduce
+import numpy as np
+from scipy import optimize, stats
+import warnings
+
+from ..squigglepy.distributions import (
+ GammaDistribution,
+ NormalDistribution,
+ LognormalDistribution,
+ mixture,
+)
+from ..squigglepy.numeric_distribution import numeric
+from ..squigglepy import samplers
+
+
+RUN_PRINT_ONLY_TESTS = False
+"""Some tests print information but don't assert anything. This flag determines
+whether to run those tests."""
+
+RUN_MONTE_CARLO_TESTS = False
+"""Some tests compare the accuracy of NumericDistribution to Monte Carlo. They
+can occasionally fail due to Monte Carlo getting lucky, so you might not want
+to run them.
+"""
+
+
+def relative_error(x, y):
+ if x == 0 and y == 0:
+ return 0
+ if x == 0:
+ return abs(y)
+ if y == 0:
+ return abs(x)
+ return max(x / y, y / x) - 1
+
+
+def print_accuracy_ratio(x, y, extra_message=None):
+ ratio = relative_error(x, y)
+ if extra_message is not None:
+ extra_message += " "
+ else:
+ extra_message = ""
+ direction_off = "small" if x < y else "large"
+ if ratio > 1:
+ print(f"{extra_message}Ratio: {direction_off} by a factor of {ratio + 1:.1f}")
+ else:
+ print(f"{extra_message}Ratio: {direction_off} by {100 * ratio:.4f}%")
+
+
+def fmt(x):
+ return f"{(100*x):.4f}%"
+
+
+def get_mc_accuracy(exact_sd, num_samples, dists, operation):
+ # Run multiple trials because NumericDistribution should usually beat MC,
+ # but sometimes MC wins by luck. Even though NumericDistribution wins a
+ # large percentage of the time, this test suite does a lot of runs, so the
+ # chance of MC winning at least once is fairly high.
+ mc_abs_error = []
+ for i in range(10):
+ mcs = [samplers.sample(dist, num_samples) for dist in dists]
+ mc = reduce(operation, mcs)
+ mc_abs_error.append(abs(np.std(mc) - exact_sd))
+
+ mc_abs_error.sort()
+
+ # Small numbers are good. A smaller index in mc_abs_error has a better
+ # accuracy
+ return mc_abs_error[-5]
+
+
+def test_norm_sd_bin_sizing_accuracy():
+ # Accuracy order is ev > uniform > mass
+ dist = NormalDistribution(mean=0, sd=1)
+ ev_hist = numeric(dist, bin_sizing="ev", warn=False)
+ mass_hist = numeric(dist, bin_sizing="mass", warn=False)
+ uniform_hist = numeric(dist, bin_sizing="uniform", warn=False)
+
+ sd_errors = [
+ relative_error(uniform_hist.est_sd(), dist.sd),
+ relative_error(ev_hist.est_sd(), dist.sd),
+ relative_error(mass_hist.est_sd(), dist.sd),
+ ]
+ assert all(np.diff(sd_errors) >= 0)
+
+
+def test_norm_product_bin_sizing_accuracy():
+ dist = NormalDistribution(mean=2, sd=1)
+ uniform_hist = numeric(dist, bin_sizing="uniform", warn=False)
+ uniform_hist = uniform_hist * uniform_hist
+ ev_hist = numeric(dist, bin_sizing="ev", warn=False)
+ ev_hist = ev_hist * ev_hist
+ mass_hist = numeric(dist, bin_sizing="mass", warn=False)
+ mass_hist = mass_hist * mass_hist
+
+    # uniform should have a small error and the others should be pretty much
+    # perfect
+ mean_errors = np.array(
+ [
+ relative_error(mass_hist.est_mean(), ev_hist.exact_mean),
+ relative_error(ev_hist.est_mean(), ev_hist.exact_mean),
+ relative_error(uniform_hist.est_mean(), ev_hist.exact_mean),
+ ]
+ )
+ assert all(mean_errors <= 1e-6)
+
+ sd_errors = [
+ relative_error(ev_hist.est_sd(), ev_hist.exact_sd),
+ relative_error(mass_hist.est_sd(), ev_hist.exact_sd),
+ relative_error(uniform_hist.est_sd(), ev_hist.exact_sd),
+ ]
+ assert all(np.diff(sd_errors) >= 0)
+
+
+def test_lognorm_product_bin_sizing_accuracy():
+ dist = LognormalDistribution(norm_mean=np.log(1e6), norm_sd=1)
+ uniform_hist = numeric(dist, bin_sizing="uniform", warn=False)
+ uniform_hist = uniform_hist * uniform_hist
+ log_uniform_hist = numeric(dist, bin_sizing="log-uniform", warn=False)
+ log_uniform_hist = log_uniform_hist * log_uniform_hist
+ ev_hist = numeric(dist, bin_sizing="ev", warn=False)
+ ev_hist = ev_hist * ev_hist
+ mass_hist = numeric(dist, bin_sizing="mass", warn=False)
+ mass_hist = mass_hist * mass_hist
+ fat_hybrid_hist = numeric(dist, bin_sizing="fat-hybrid", warn=False)
+ fat_hybrid_hist = fat_hybrid_hist * fat_hybrid_hist
+ dist_prod = LognormalDistribution(
+ norm_mean=2 * dist.norm_mean, norm_sd=np.sqrt(2) * dist.norm_sd
+ )
+
+ mean_errors = np.array(
+ [
+ relative_error(mass_hist.est_mean(), dist_prod.lognorm_mean),
+ relative_error(ev_hist.est_mean(), dist_prod.lognorm_mean),
+ relative_error(fat_hybrid_hist.est_mean(), dist_prod.lognorm_mean),
+ relative_error(uniform_hist.est_mean(), dist_prod.lognorm_mean),
+ relative_error(log_uniform_hist.est_mean(), dist_prod.lognorm_mean),
+ ]
+ )
+ assert all(mean_errors <= 1e-6)
+
+ sd_errors = [
+ relative_error(fat_hybrid_hist.est_sd(), dist_prod.lognorm_sd),
+ relative_error(log_uniform_hist.est_sd(), dist_prod.lognorm_sd),
+ relative_error(ev_hist.est_sd(), dist_prod.lognorm_sd),
+ relative_error(mass_hist.est_sd(), dist_prod.lognorm_sd),
+ relative_error(uniform_hist.est_sd(), dist_prod.lognorm_sd),
+ ]
+ assert all(np.diff(sd_errors) >= 0)
+
+
+def test_lognorm_clip_center_bin_sizing_accuracy():
+ dist1 = LognormalDistribution(norm_mean=-1, norm_sd=0.5, lclip=0, rclip=1)
+ dist2 = LognormalDistribution(norm_mean=0, norm_sd=1, lclip=0, rclip=2 * np.e)
+ true_mean1 = stats.lognorm.expect(
+ lambda x: x,
+ args=(dist1.norm_sd,),
+ scale=np.exp(dist1.norm_mean),
+ lb=dist1.lclip,
+ ub=dist1.rclip,
+ conditional=True,
+ )
+ true_sd1 = np.sqrt(
+ stats.lognorm.expect(
+ lambda x: (x - true_mean1) ** 2,
+ args=(dist1.norm_sd,),
+ scale=np.exp(dist1.norm_mean),
+ lb=dist1.lclip,
+ ub=dist1.rclip,
+ conditional=True,
+ )
+ )
+ true_mean2 = stats.lognorm.expect(
+ lambda x: x,
+ args=(dist2.norm_sd,),
+ scale=np.exp(dist2.norm_mean),
+ lb=dist2.lclip,
+ ub=dist2.rclip,
+ conditional=True,
+ )
+ true_sd2 = np.sqrt(
+ stats.lognorm.expect(
+ lambda x: (x - true_mean2) ** 2,
+ args=(dist2.norm_sd,),
+ scale=np.exp(dist2.norm_mean),
+ lb=dist2.lclip,
+ ub=dist2.rclip,
+ conditional=True,
+ )
+ )
+ true_mean = true_mean1 * true_mean2
+ true_sd = np.sqrt(
+ true_sd1**2 * true_mean2**2
+ + true_mean1**2 * true_sd2**2
+ + true_sd1**2 * true_sd2**2
+ )
+
+ uniform_hist = numeric(dist1, bin_sizing="uniform", warn=False) * numeric(
+ dist2, bin_sizing="uniform", warn=False
+ )
+ log_uniform_hist = numeric(dist1, bin_sizing="log-uniform", warn=False) * numeric(
+ dist2, bin_sizing="log-uniform", warn=False
+ )
+ ev_hist = numeric(dist1, bin_sizing="ev", warn=False) * numeric(
+ dist2, bin_sizing="ev", warn=False
+ )
+ mass_hist = numeric(dist1, bin_sizing="mass", warn=False) * numeric(
+ dist2, bin_sizing="mass", warn=False
+ )
+ fat_hybrid_hist = numeric(dist1, bin_sizing="fat-hybrid", warn=False) * numeric(
+ dist2, bin_sizing="fat-hybrid", warn=False
+ )
+
+ mean_errors = np.array(
+ [
+ relative_error(ev_hist.est_mean(), true_mean),
+ relative_error(mass_hist.est_mean(), true_mean),
+ relative_error(uniform_hist.est_mean(), true_mean),
+ relative_error(fat_hybrid_hist.est_mean(), true_mean),
+ relative_error(log_uniform_hist.est_mean(), true_mean),
+ ]
+ )
+ assert all(mean_errors <= 1e-6)
+
+ # Uniform does poorly in general with fat-tailed dists, but it does well
+ # with a center clip because most of the mass is in the center
+ sd_errors = [
+        # NOTE(review): comparing est_mean()/true_mean inside a list named
+        # sd_errors looks like a copy-paste slip — confirm whether
+        # est_sd()/true_sd was intended here.
+        relative_error(mass_hist.est_mean(), true_mean),
+ relative_error(uniform_hist.est_sd(), true_sd),
+ relative_error(ev_hist.est_sd(), true_sd),
+ relative_error(fat_hybrid_hist.est_sd(), true_sd),
+ relative_error(log_uniform_hist.est_sd(), true_sd),
+ ]
+ assert all(np.diff(sd_errors) >= 0)
+
+
+def test_lognorm_clip_tail_bin_sizing_accuracy():
+ # cut off 99% of mass and 95% of mass, respectively
+ dist1 = LognormalDistribution(norm_mean=0, norm_sd=1, lclip=10)
+ dist2 = LognormalDistribution(norm_mean=0, norm_sd=2, rclip=27)
+ true_mean1 = stats.lognorm.expect(
+ lambda x: x,
+ args=(dist1.norm_sd,),
+ scale=np.exp(dist1.norm_mean),
+ lb=dist1.lclip,
+ ub=dist1.rclip,
+ conditional=True,
+ )
+ true_sd1 = np.sqrt(
+ stats.lognorm.expect(
+ lambda x: (x - true_mean1) ** 2,
+ args=(dist1.norm_sd,),
+ scale=np.exp(dist1.norm_mean),
+ lb=dist1.lclip,
+ ub=dist1.rclip,
+ conditional=True,
+ )
+ )
+ true_mean2 = stats.lognorm.expect(
+ lambda x: x,
+ args=(dist2.norm_sd,),
+ scale=np.exp(dist2.norm_mean),
+ lb=dist2.lclip,
+ ub=dist2.rclip,
+ conditional=True,
+ )
+ true_sd2 = np.sqrt(
+ stats.lognorm.expect(
+ lambda x: (x - true_mean2) ** 2,
+ args=(dist2.norm_sd,),
+ scale=np.exp(dist2.norm_mean),
+ lb=dist2.lclip,
+ ub=dist2.rclip,
+ conditional=True,
+ )
+ )
+ true_mean = true_mean1 * true_mean2
+ true_sd = np.sqrt(
+ true_sd1**2 * true_mean2**2
+ + true_mean1**2 * true_sd2**2
+ + true_sd1**2 * true_sd2**2
+ )
+
+ uniform_hist = numeric(dist1, bin_sizing="uniform", warn=False) * numeric(
+ dist2, bin_sizing="uniform", warn=False
+ )
+ log_uniform_hist = numeric(dist1, bin_sizing="log-uniform", warn=False) * numeric(
+ dist2, bin_sizing="log-uniform", warn=False
+ )
+ ev_hist = numeric(dist1, bin_sizing="ev", warn=False) * numeric(
+ dist2, bin_sizing="ev", warn=False
+ )
+ mass_hist = numeric(dist1, bin_sizing="mass", warn=False) * numeric(
+ dist2, bin_sizing="mass", warn=False
+ )
+ fat_hybrid_hist = numeric(dist1, bin_sizing="fat-hybrid", warn=False) * numeric(
+ dist2, bin_sizing="fat-hybrid", warn=False
+ )
+
+ mean_errors = np.array(
+ [
+ relative_error(mass_hist.est_mean(), true_mean),
+ relative_error(uniform_hist.est_mean(), true_mean),
+ relative_error(ev_hist.est_mean(), true_mean),
+ relative_error(fat_hybrid_hist.est_mean(), true_mean),
+ relative_error(log_uniform_hist.est_mean(), true_mean),
+ ]
+ )
+ assert all(mean_errors <= 1e-5)
+
+ sd_errors = [
+ relative_error(fat_hybrid_hist.est_sd(), true_sd),
+ relative_error(log_uniform_hist.est_sd(), true_sd),
+ relative_error(ev_hist.est_sd(), true_sd),
+ relative_error(uniform_hist.est_sd(), true_sd),
+ relative_error(mass_hist.est_sd(), true_sd),
+ ]
+ assert all(np.diff(sd_errors) >= 0)
+
+
+def test_gamma_bin_sizing_accuracy():
+ dist1 = GammaDistribution(shape=1, scale=5)
+ dist2 = GammaDistribution(shape=10, scale=1)
+
+ uniform_hist = numeric(dist1, bin_sizing="uniform") * numeric(dist2, bin_sizing="uniform")
+ log_uniform_hist = numeric(dist1, bin_sizing="log-uniform") * numeric(
+ dist2, bin_sizing="log-uniform"
+ )
+ ev_hist = numeric(dist1, bin_sizing="ev") * numeric(dist2, bin_sizing="ev")
+ mass_hist = numeric(dist1, bin_sizing="mass") * numeric(dist2, bin_sizing="mass")
+ fat_hybrid_hist = numeric(dist1, bin_sizing="fat-hybrid") * numeric(
+ dist2, bin_sizing="fat-hybrid"
+ )
+
+ true_mean = uniform_hist.exact_mean
+ true_sd = uniform_hist.exact_sd
+
+ mean_errors = np.array(
+ [
+ relative_error(mass_hist.est_mean(), true_mean),
+ relative_error(uniform_hist.est_mean(), true_mean),
+ relative_error(ev_hist.est_mean(), true_mean),
+ relative_error(log_uniform_hist.est_mean(), true_mean),
+ relative_error(fat_hybrid_hist.est_mean(), true_mean),
+ ]
+ )
+ assert all(mean_errors <= 1e-6)
+
+ sd_errors = [
+ relative_error(ev_hist.est_sd(), true_sd),
+ relative_error(uniform_hist.est_sd(), true_sd),
+ relative_error(fat_hybrid_hist.est_sd(), true_sd),
+ relative_error(log_uniform_hist.est_sd(), true_sd),
+ relative_error(mass_hist.est_sd(), true_sd),
+ ]
+ assert all(np.diff(sd_errors) >= 0)
+
+
+def test_norm_product_sd_accuracy_vs_monte_carlo():
+ """Test that PMH SD is more accurate than Monte Carlo SD both for initial
+ distributions and when multiplying up to 8 distributions together.
+ """
+ if not RUN_MONTE_CARLO_TESTS:
+ return None
+ # Time complexity for binary operations is roughly O(n^2) for PMH and O(n)
+ # for MC, so let MC have num_bins^2 samples.
+ num_bins = 100
+ num_samples = 100**2
+ dists = [NormalDistribution(mean=i, sd=0.5 + i / 4) for i in range(9)]
+ hists = [numeric(dist, num_bins=num_bins, warn=False) for dist in dists]
+ hist = reduce(lambda acc, hist: acc * hist, hists)
+ dist_abs_error = abs(hist.est_sd() - hist.exact_sd)
+
+ mc_abs_error = get_mc_accuracy(hist.exact_sd, num_samples, dists, lambda acc, mc: acc * mc)
+ assert dist_abs_error < mc_abs_error
+
+
+def test_lognorm_product_sd_accuracy_vs_monte_carlo():
+ """Test that PMH SD is more accurate than Monte Carlo SD both for initial
+    distributions and when multiplying up to 8 distributions together."""
+ if not RUN_MONTE_CARLO_TESTS:
+ return None
+ num_bins = 100
+ num_samples = 100**2
+ dists = [LognormalDistribution(norm_mean=i, norm_sd=0.5 + i / 4) for i in range(9)]
+ hists = [numeric(dist, num_bins=num_bins, warn=False) for dist in dists]
+ hist = reduce(lambda acc, hist: acc * hist, hists)
+ dist_abs_error = abs(hist.est_sd() - hist.exact_sd)
+
+ mc_abs_error = get_mc_accuracy(hist.exact_sd, num_samples, dists, lambda acc, mc: acc * mc)
+ assert dist_abs_error < mc_abs_error
+
+
+def test_norm_sum_sd_accuracy_vs_monte_carlo():
+ """Test that PMH SD is more accurate than Monte Carlo SD both for initial
+    distributions and when adding up to 8 distributions together.
+
+ Note: With more multiplications, MC has a good chance of being more
+ accurate, and is significantly more accurate at 16 multiplications.
+ """
+ if not RUN_MONTE_CARLO_TESTS:
+ return None
+ num_bins = 1000
+ num_samples = num_bins**2
+ dists = [NormalDistribution(mean=i, sd=0.5 + i / 4) for i in range(9)]
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ hists = [numeric(dist, num_bins=num_bins, bin_sizing="uniform") for dist in dists]
+ hist = reduce(lambda acc, hist: acc + hist, hists)
+ dist_abs_error = abs(hist.est_sd() - hist.exact_sd)
+
+ mc_abs_error = get_mc_accuracy(hist.exact_sd, num_samples, dists, lambda acc, mc: acc + mc)
+ assert dist_abs_error < mc_abs_error
+
+
+def test_lognorm_sum_sd_accuracy_vs_monte_carlo():
+ """Test that PMH SD is more accurate than Monte Carlo SD both for initial
+    distributions and when adding up to 16 distributions together."""
+ if not RUN_MONTE_CARLO_TESTS:
+ return None
+ num_bins = 100
+ num_samples = 100**2
+ dists = [LognormalDistribution(norm_mean=i, norm_sd=0.5 + i / 4) for i in range(17)]
+ hists = [numeric(dist, num_bins=num_bins, warn=False) for dist in dists]
+ hist = reduce(lambda acc, hist: acc + hist, hists)
+ dist_abs_error = abs(hist.est_sd() - hist.exact_sd)
+
+ mc_abs_error = get_mc_accuracy(hist.exact_sd, num_samples, dists, lambda acc, mc: acc + mc)
+ assert dist_abs_error < mc_abs_error
+
+
+def test_quantile_accuracy():
+ if not RUN_PRINT_ONLY_TESTS:
+ return None
+ props = np.array([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999])
+ # props = np.array([0.05, 0.1, 0.25, 0.75, 0.9, 0.95, 0.99, 0.999])
+ dist = LognormalDistribution(norm_mean=0, norm_sd=1)
+ true_quantiles = stats.lognorm.ppf(props, dist.norm_sd, scale=np.exp(dist.norm_mean))
+ # dist = NormalDistribution(mean=0, sd=1)
+ # true_quantiles = stats.norm.ppf(props, dist.mean, dist.sd)
+ num_bins = 100
+ num_mc_samples = num_bins**2
+
+ # Formula from Goodman, "Accuracy and Efficiency of Monte Carlo Method."
+ # https://inis.iaea.org/collection/NCLCollectionStore/_Public/19/047/19047359.pdf
+ # Figure 20 on page 434.
+ mc_error = (
+ np.sqrt(props * (1 - props))
+ * np.sqrt(2 * np.pi)
+ * dist.norm_sd
+ * np.exp(0.5 * (np.log(true_quantiles) - dist.norm_mean) ** 2 / dist.norm_sd**2)
+ / np.sqrt(num_mc_samples)
+ )
+
+ print("\n")
+ print(
+ f"MC error: average {fmt(np.mean(mc_error))}, "
+ f"median {fmt(np.median(mc_error))}, "
+ f"max {fmt(np.max(mc_error))}"
+ )
+
+ for bin_sizing in ["log-uniform", "mass", "ev", "fat-hybrid"]:
+ # for bin_sizing in ["uniform", "mass", "ev"]:
+ hist = numeric(dist, bin_sizing=bin_sizing, warn=False, num_bins=num_bins)
+ linear_quantiles = np.interp(
+ props, np.cumsum(hist.masses) - 0.5 * hist.masses, hist.values
+ )
+ hist_quantiles = hist.quantile(props)
+ linear_error = abs(true_quantiles - linear_quantiles) / abs(true_quantiles)
+ hist_error = abs(true_quantiles - hist_quantiles) / abs(true_quantiles)
+ print(f"\n{bin_sizing}")
+ print(
+ f"\tLinear error: average {fmt(np.mean(linear_error))}, "
+ f"median {fmt(np.median(linear_error))}, "
+ f"max {fmt(np.max(linear_error))}"
+ )
+ print(
+ f"\tHist error : average {fmt(np.mean(hist_error))}, "
+ f"median {fmt(np.median(hist_error))}, "
+ f"max {fmt(np.max(hist_error))}"
+ )
+ print(
+ f"\tHist / MC : average {fmt(np.mean(hist_error / mc_error))}, "
+ f"median {fmt(np.median(hist_error / mc_error))}, "
+ f"max {fmt(np.max(hist_error / mc_error))}"
+ )
+
+
+def test_quantile_product_accuracy():
+ if not RUN_PRINT_ONLY_TESTS:
+ return None
+ props = np.array([0.5, 0.75, 0.9, 0.95, 0.99, 0.999]) # EV
+ # props = np.array([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999]) # lognorm
+ # props = np.array([0.05, 0.1, 0.25, 0.75, 0.9, 0.95, 0.99, 0.999]) # norm
+ num_bins = 200
+ print("\n")
+
+ bin_sizing = "log-uniform"
+ hists = []
+ # for bin_sizing in ["log-uniform", "mass", "ev", "fat-hybrid"]:
+ for num_products in [2, 8, 32, 128, 512]:
+ dist1 = LognormalDistribution(norm_mean=0, norm_sd=1 / np.sqrt(num_products))
+ dist = LognormalDistribution(
+ norm_mean=dist1.norm_mean * num_products, norm_sd=dist1.norm_sd * np.sqrt(num_products)
+ )
+ true_quantiles = stats.lognorm.ppf(props, dist.norm_sd, scale=np.exp(dist.norm_mean))
+ num_mc_samples = num_bins**2
+
+ # I'm not sure how to prove this, but empirically, it looks like the error
+ # for MC(x) * MC(y) is the same as the error for MC(x * y).
+ mc_error = (
+ np.sqrt(props * (1 - props))
+ * np.sqrt(2 * np.pi)
+ * dist.norm_sd
+ * np.exp(0.5 * (np.log(true_quantiles) - dist.norm_mean) ** 2 / dist.norm_sd**2)
+ / np.sqrt(num_mc_samples)
+ )
+
+ hist1 = numeric(dist1, bin_sizing=bin_sizing, warn=False, num_bins=num_bins)
+ hist = reduce(lambda acc, x: acc * x, [hist1] * num_products)
+ oneshot = numeric(dist, bin_sizing=bin_sizing, warn=False, num_bins=num_bins)
+ hist_quantiles = hist.quantile(props)
+ hist_error = abs(true_quantiles - hist_quantiles) / abs(true_quantiles)
+ oneshot_error = abs(true_quantiles - oneshot.quantile(props)) / abs(true_quantiles)
+ hists.append(hist)
+
+ print(f"{num_products}")
+ print(
+ f"\tHist error : average {fmt(np.mean(hist_error))}, "
+ f"median {fmt(np.median(hist_error))}, "
+ "max {fmt(np.max(hist_error))}"
+ )
+ print(
+ f"\tHist / MC : average {fmt(np.mean(hist_error / mc_error))}, "
+ f"median {fmt(np.median(hist_error / mc_error))}, "
+ f"max {fmt(np.max(hist_error / mc_error))}"
+ )
+ print(
+ f"\tHist / 1shot: average {fmt(np.mean(hist_error / oneshot_error))}, "
+ f"median {fmt(np.median(hist_error / oneshot_error))}, "
+ f"max {fmt(np.max(hist_error / oneshot_error))}"
+ )
+
+
+def test_individual_bin_accuracy():
+ if not RUN_PRINT_ONLY_TESTS:
+ return None
+ num_bins = 200
+ bin_sizing = "ev"
+ print("")
+ bin_errs = []
+ num_products = 16
+ bin_sizes = 40 * np.arange(1, 11)
+ for num_bins in bin_sizes:
+ operation = "mul"
+ if operation == "mul":
+ true_dist_type = "lognorm"
+ true_dist = LognormalDistribution(norm_mean=0, norm_sd=1)
+ dist1 = LognormalDistribution(norm_mean=0, norm_sd=1 / np.sqrt(num_products))
+ hist1 = numeric(dist1, bin_sizing=bin_sizing, num_bins=num_bins, warn=False)
+ hist = reduce(lambda acc, x: acc * x, [hist1] * num_products)
+ elif operation == "add":
+ true_dist_type = "norm"
+ true_dist = NormalDistribution(mean=0, sd=1)
+ dist1 = NormalDistribution(mean=0, sd=1 / np.sqrt(num_products))
+ hist1 = numeric(dist1, bin_sizing=bin_sizing, num_bins=num_bins, warn=False)
+ hist = reduce(lambda acc, x: acc + x, [hist1] * num_products)
+ elif operation == "exp":
+ true_dist_type = "lognorm"
+ true_dist = LognormalDistribution(norm_mean=0, norm_sd=1)
+ dist1 = NormalDistribution(mean=0, sd=1)
+ hist1 = numeric(dist1, bin_sizing=bin_sizing, num_bins=num_bins, warn=False)
+ hist = hist1.exp()
+
+ cum_mass = np.cumsum(hist.masses)
+ cum_cev = np.cumsum(hist.masses * abs(hist.values))
+ cum_cev_frac = cum_cev / cum_cev[-1]
+ if true_dist_type == "lognorm":
+ expected_cum_mass = stats.lognorm.cdf(
+ true_dist.inv_contribution_to_ev(cum_cev_frac),
+ true_dist.norm_sd,
+ scale=np.exp(true_dist.norm_mean),
+ )
+ elif true_dist_type == "norm":
+ expected_cum_mass = stats.norm.cdf(
+ true_dist.inv_contribution_to_ev(cum_cev_frac), true_dist.mean, true_dist.sd
+ )
+
+ # Take only every nth value where n = num_bins/40
+ cum_mass = cum_mass[:: num_bins // 40]
+ expected_cum_mass = expected_cum_mass[:: num_bins // 40]
+ bin_errs.append(abs(cum_mass - expected_cum_mass) / expected_cum_mass)
+
+ bin_errs = np.array(bin_errs)
+
+ best_fits = []
+ for i in range(40):
+ try:
+ best_fit = optimize.curve_fit(
+ lambda x, a, r: a * x**r, bin_sizes, bin_errs[:, i], p0=[1, 2]
+ )[0]
+ best_fits.append(best_fit)
+ print(f"{i:2d} {best_fit[0]:9.3f} * x ^ {best_fit[1]:.3f}")
+ except RuntimeError:
+ # optimal parameters not found
+ print(f"{i:2d} ? ?")
+
+ print("")
+ print(f"Average: {np.mean(best_fits, axis=0)}\nMedian: {np.median(best_fits, axis=0)}")
+
+ meta_fit = np.polynomial.polynomial.Polynomial.fit(
+ np.array(range(len(best_fits))) / len(best_fits), np.array(best_fits)[:, 1], 2
+ )
+ print(f"\nMeta fit: {meta_fit}")
+
+
+def test_richardson_product():
+ if not RUN_PRINT_ONLY_TESTS:
+ return None
+ print("")
+ num_bins = 200
+ num_products = 2
+ bin_sizing = "ev"
+ # mixture_ratio = [0.035, 0.965]
+ mixture_ratio = [0, 1]
+ # mixture_ratio = [0.3, 0.7]
+ bin_sizes = 40 * np.arange(1, 11)
+ product_nums = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
+ err_rates = []
+ for num_products in product_nums:
+ # for num_bins in bin_sizes:
+ true_mixture_ratio = reduce(
+ lambda acc, x: (acc[0] * x[1] + acc[1] * x[0], acc[0] * x[0] + acc[1] * x[1]),
+ [(mixture_ratio) for _ in range(num_products)],
+ )
+ one_sided_dist = LognormalDistribution(norm_mean=0, norm_sd=1)
+ true_dist = mixture([-one_sided_dist, one_sided_dist], true_mixture_ratio)
+ true_hist = numeric(true_dist, bin_sizing=bin_sizing, num_bins=num_bins, warn=False)
+ one_sided_dist1 = LognormalDistribution(norm_mean=0, norm_sd=1 / np.sqrt(num_products))
+ dist1 = mixture([-one_sided_dist1, one_sided_dist1], mixture_ratio)
+ hist1 = numeric(dist1, bin_sizing=bin_sizing, num_bins=num_bins, warn=False)
+ hist = reduce(lambda acc, x: acc * x, [hist1] * num_products)
+
+ test_mode = "ppf"
+ if test_mode == "cev":
+ true_answer = (
+ one_sided_dist.contribution_to_ev(
+ stats.lognorm.ppf(
+ 2 * hist.masses[50:100].sum(),
+ one_sided_dist.norm_sd,
+ scale=np.exp(one_sided_dist.norm_mean),
+ ),
+ False,
+ )
+ / 2
+ )
+ est_answer = (hist.masses * abs(hist.values))[50:100].sum()
+ print_accuracy_ratio(est_answer, true_answer, f"CEV({num_products:3d})")
+ elif test_mode == "sd":
+ mcs = [samplers.sample(dist, num_bins**2) for dist in [dist1] * num_products]
+ true_answer = true_hist.exact_sd
+ est_answer = hist.est_sd()
+ print_accuracy_ratio(est_answer, true_answer, f"SD({num_products:3d}, {num_bins:3d})")
+ mc = reduce(lambda acc, x: acc * x, mcs)
+ mc_answer = np.std(mc)
+ print_accuracy_ratio(mc_answer, true_answer, f"MC({num_products:3d}, {num_bins:3d})")
+ err_rates.append(abs(est_answer - true_answer))
+ elif test_mode == "ppf":
+ fracs = [0.5, 0.75, 0.9, 0.97, 0.99]
+ frac_errs = []
+ # mc_errs = []
+ for frac in fracs:
+ true_answer = stats.lognorm.ppf(
+ (frac - true_mixture_ratio[0]) / true_mixture_ratio[1],
+ one_sided_dist.norm_sd,
+ scale=np.exp(one_sided_dist.norm_mean),
+ )
+ est_answer = hist.ppf(frac)
+ frac_errs.append(abs(est_answer - true_answer) / true_answer)
+ median_err = np.median(frac_errs)
+ print(f"ppf ({num_products:3d}, {num_bins:3d}): {median_err * 100:.3f}%")
+ err_rates.append(median_err)
+
+ if num_bins == bin_sizes[-1]:
+ best_fit = optimize.curve_fit(lambda x, a, r: a * x**r, bin_sizes, err_rates, p0=[1, 2])[
+ 0
+ ]
+ print(f"\nBest fit: {best_fit}")
+ else:
+ best_fit = optimize.curve_fit(
+ lambda x, a, r: a * x**r, product_nums, err_rates, p0=[1, 2]
+ )[0]
+ print(f"\nBest fit: {best_fit}")
+
+
+def test_richardson_sum():
+ if not RUN_PRINT_ONLY_TESTS:
+ return None
+ print("")
+ num_bins = 200
+ num_sums = 2
+ bin_sizing = "ev"
+ bin_sizes = 40 * np.arange(1, 11)
+ err_rates = []
+ # for num_sums in [2, 4, 8, 16, 32, 64]:
+ for num_bins in bin_sizes:
+ true_dist = NormalDistribution(mean=0, sd=1)
+ true_hist = numeric(true_dist, bin_sizing=bin_sizing, num_bins=num_bins, warn=False)
+ dist1 = NormalDistribution(mean=0, sd=1 / np.sqrt(num_sums))
+ hist1 = numeric(dist1, bin_sizing=bin_sizing, num_bins=num_bins, warn=False)
+ hist = reduce(lambda acc, x: acc + x, [hist1] * num_sums)
+
+ test_mode = "ppf"
+ if test_mode == "cev":
+ true_answer = (
+ true_dist.contribution_to_ev(
+ stats.lognorm.ppf(
+ 2 * hist.masses[50:100].sum(),
+ true_dist.norm_sd,
+ scale=np.exp(true_dist.norm_mean),
+ ),
+ False,
+ )
+ / 2
+ )
+ est_answer = (hist.masses * abs(hist.values))[50:100].sum()
+ print_accuracy_ratio(est_answer, true_answer, f"CEV({num_sums:3d})")
+ elif test_mode == "sd":
+ true_answer = true_hist.exact_sd
+ est_answer = hist.est_sd()
+ print_accuracy_ratio(est_answer, true_answer, f"SD({num_sums}, {num_bins:3d})")
+ err_rates.append(abs(est_answer - true_answer))
+ elif test_mode == "ppf":
+ fracs = [0.75, 0.9, 0.95, 0.98, 0.99]
+ frac_errs = []
+ for frac in fracs:
+ true_answer = stats.norm.ppf(frac, true_dist.mean, true_dist.sd)
+ est_answer = hist.ppf(frac)
+ frac_errs.append(abs(est_answer - true_answer) / true_answer)
+ median_err = np.median(frac_errs)
+ print(f"ppf ({num_sums:3d}, {num_bins:3d}): {median_err * 100:.3f}%")
+ err_rates.append(median_err)
+
+ if len(err_rates) == len(bin_sizes):
+ best_fit = optimize.curve_fit(lambda x, a, r: a * x**r, bin_sizes, err_rates, p0=[1, 2])[
+ 0
+ ]
+ print(f"\nBest fit: {best_fit}")
+
+
+def test_richardson_exp():
+ if not RUN_PRINT_ONLY_TESTS:
+ return None
+ print("")
+ bin_sizing = "ev"
+ bin_sizes = 200 * np.arange(1, 11)
+ err_rates = []
+ for num_bins in bin_sizes:
+ true_dist = LognormalDistribution(norm_mean=0, norm_sd=1)
+ true_hist = numeric(true_dist, bin_sizing=bin_sizing, num_bins=num_bins, warn=False)
+ dist1 = NormalDistribution(mean=0, sd=1)
+ hist1 = numeric(dist1, bin_sizing=bin_sizing, num_bins=num_bins, warn=False)
+ hist = hist1.exp()
+
+ test_mode = "sd"
+ if test_mode == "sd":
+ true_answer = true_hist.exact_sd
+ est_answer = hist.est_sd()
+ print_accuracy_ratio(est_answer, true_answer, f"SD({num_bins:3d})")
+ err_rates.append(abs(est_answer - true_answer))
+ elif test_mode == "ppf":
+ fracs = [0.5, 0.75, 0.9, 0.97, 0.99]
+ frac_errs = []
+ for frac in fracs:
+ true_answer = stats.lognorm.ppf(
+ frac, true_dist.norm_sd, scale=np.exp(true_dist.norm_mean)
+ )
+ est_answer = hist.ppf(frac)
+ frac_errs.append(abs(est_answer - true_answer) / true_answer)
+ median_err = np.median(frac_errs)
+ print(f"ppf ({num_bins:4d}): {median_err * 100:.5f}%")
+ err_rates.append(median_err)
+
+ if len(err_rates) == len(bin_sizes):
+ best_fit = optimize.curve_fit(lambda x, a, r: a * x**r, bin_sizes, err_rates, p0=[1, 2])[
+ 0
+ ]
+ print(f"\nBest fit: {best_fit}")
diff --git a/tests/test_contribution_to_ev.py b/tests/test_contribution_to_ev.py
new file mode 100644
index 0000000..ec988ff
--- /dev/null
+++ b/tests/test_contribution_to_ev.py
@@ -0,0 +1,225 @@
+import hypothesis.strategies as st
+import numpy as np
+from pytest import approx
+from scipy import stats
+from hypothesis import assume, example, given, settings
+
+from ..squigglepy.distributions import (
+ BetaDistribution,
+ GammaDistribution,
+ LognormalDistribution,
+ NormalDistribution,
+ ParetoDistribution,
+ UniformDistribution,
+)
+
+
+@given(
+    norm_mean=st.floats(min_value=np.log(0.01), max_value=np.log(1e6)),
+    norm_sd=st.floats(min_value=0.1, max_value=2.5),
+    ev_fraction=st.floats(min_value=0.01, max_value=0.99),
+)
+@settings(max_examples=1000)
+def test_lognorm_inv_contribution_to_ev_inverts_contribution_to_ev(
+    norm_mean, norm_sd, ev_fraction
+):  # Round-trip: contribution_to_ev(inv_contribution_to_ev(f)) == f for lognormals.
+    dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd)
+    assert dist.contribution_to_ev(dist.inv_contribution_to_ev(ev_fraction)) == approx(
+        ev_fraction, 2e-5 / ev_fraction
+    )  # positional arg is the rel tolerance; looser for small fractions
+
+
+def test_lognorm_basic():  # Smoke test of the EV round-trip with lognorm_mean/sd parameterization.
+    dist = LognormalDistribution(lognorm_mean=2, lognorm_sd=1.0)
+    ev_fraction = 0.25
+    assert dist.contribution_to_ev(dist.inv_contribution_to_ev(ev_fraction)) == approx(ev_fraction)
+
+
+def test_norm_basic():  # Regression: a tiny negative input must not yield a negative contribution.
+    dist = NormalDistribution(mean=100, sd=11.97)
+    assert dist.contribution_to_ev(-1e-14) >= 0
+
+
+@given(
+    mu=st.floats(min_value=-10, max_value=10),
+    sigma=st.floats(min_value=0.01, max_value=100),
+)
+@example(mu=0, sigma=1)
+def test_norm_contribution_to_ev(mu, sigma):  # contribution_to_ev at tails, midpoint, and monotonicity.
+    dist = NormalDistribution(mean=mu, sd=sigma)
+
+    assert dist.contribution_to_ev(mu + 99 * sigma) == approx(1)  # far right tail captures all EV
+    assert dist.contribution_to_ev(mu - 99 * sigma) == approx(0)  # far left tail captures none
+
+    # midpoint represents less than half the EV if mu > 0 b/c the larger
+    # values are weighted heavier, and vice versa if mu < 0
+    if mu > 1e-6:
+        assert dist.contribution_to_ev(mu) < 0.5
+    elif mu < -1e-6:
+        assert dist.contribution_to_ev(mu) > 0.5
+    elif mu == 0:
+        assert dist.contribution_to_ev(mu) == approx(0.5)  # 0 < |mu| <= 1e-6 deliberately unasserted
+
+    # contribution_to_ev should be monotonic
+    assert dist.contribution_to_ev(mu - 2 * sigma) < dist.contribution_to_ev(mu - 1 * sigma)
+    assert dist.contribution_to_ev(mu - sigma) < dist.contribution_to_ev(mu)
+    assert dist.contribution_to_ev(mu) < dist.contribution_to_ev(mu + sigma)
+    assert dist.contribution_to_ev(mu + sigma) < dist.contribution_to_ev(mu + 2 * sigma)
+
+
+@given(
+    mu=st.floats(min_value=-10, max_value=10),
+    sigma=st.floats(min_value=0.01, max_value=10),
+)
+def test_norm_inv_contribution_to_ev(mu, sigma):  # Mirror of test_norm_contribution_to_ev for the inverse.
+    dist = NormalDistribution(mean=mu, sd=sigma)
+
+    assert dist.inv_contribution_to_ev(1 - 1e-9) > mu + 3 * sigma  # near-total EV lies far right
+    assert dist.inv_contribution_to_ev(1e-9) < mu - 3 * sigma  # near-zero EV lies far left
+
+    # midpoint represents less than half the EV if mu > 0 b/c the larger
+    # values are weighted heavier, and vice versa if mu < 0
+    if mu > 1e-6:
+        assert dist.inv_contribution_to_ev(0.5) > mu
+    elif mu < -1e-6:
+        assert dist.inv_contribution_to_ev(0.5) < mu
+    elif mu == 0:
+        assert dist.inv_contribution_to_ev(0.5) == approx(mu)
+
+    # inv_contribution_to_ev should be monotonic
+    assert dist.inv_contribution_to_ev(0.05) < dist.inv_contribution_to_ev(0.25)
+    assert dist.inv_contribution_to_ev(0.25) < dist.inv_contribution_to_ev(0.5)
+    assert dist.inv_contribution_to_ev(0.5) < dist.inv_contribution_to_ev(0.75)
+    assert dist.inv_contribution_to_ev(0.75) < dist.inv_contribution_to_ev(0.95)
+
+
+@given(
+    mu=st.floats(min_value=-10, max_value=10),
+    sigma=st.floats(min_value=0.01, max_value=10),
+    ev_fraction=st.floats(min_value=0.0001, max_value=0.9999),
+)
+def test_norm_inv_contribution_to_ev_inverts_contribution_to_ev(mu, sigma, ev_fraction):  # Round-trip for normals.
+    dist = NormalDistribution(mean=mu, sd=sigma)
+    assert dist.contribution_to_ev(dist.inv_contribution_to_ev(ev_fraction)) == approx(
+        ev_fraction, abs=1e-8
+    )
+
+
+def test_uniform_contribution_to_ev_basic():  # Endpoints and midpoint of a symmetric-around-0 uniform.
+    dist = UniformDistribution(-1, 1)
+    assert dist.contribution_to_ev(-1) == approx(0)
+    assert dist.contribution_to_ev(1) == approx(1)
+    assert dist.contribution_to_ev(0) == approx(0.5)  # symmetric: half of |EV mass| is below 0
+
+
+@given(prop=st.floats(min_value=0, max_value=1))
+def test_standard_uniform_contribution_to_ev(prop):  # Closed form on U(0,1): integral of x dx = x^2/2.
+    dist = UniformDistribution(0, 1)
+    assert dist.contribution_to_ev(prop, False) == approx(0.5 * prop**2)  # False = unnormalized
+
+
+@given(
+    a=st.floats(min_value=-10, max_value=10),
+    b=st.floats(min_value=-10, max_value=10),
+)
+def test_uniform_contribution_to_ev(a, b):  # Endpoint/out-of-support behavior for arbitrary U(a,b).
+    if a > b:
+        a, b = b, a
+    assume(abs(a - b) > 1e-4)  # degenerate intervals make the math numerically meaningless
+    dist = UniformDistribution(x=a, y=b)
+    assert dist.contribution_to_ev(a) == approx(0)
+    assert dist.contribution_to_ev(b) == approx(1)
+    assert dist.contribution_to_ev(a - 1) == approx(0)  # clamps outside the support
+    assert dist.contribution_to_ev(b + 1) == approx(1)
+
+    assert dist.contribution_to_ev(a, normalized=False) == approx(0)
+    if not (a < 0 and b > 0):
+        assert dist.contribution_to_ev(b, normalized=False) == approx(abs(a + b) / 2)  # |mean|
+    else:
+        total_contribution = (a**2 + b**2) / 2 / (b - a)  # E|X| when support straddles 0
+        assert dist.contribution_to_ev(b, normalized=False) == approx(total_contribution)
+
+
+@given(
+    a=st.floats(min_value=-10, max_value=10),
+    b=st.floats(min_value=-10, max_value=10),
+    prop=st.floats(min_value=0, max_value=1),
+)
+def test_uniform_inv_contribution_to_ev_inverts_contribution_to_ev(a, b, prop):  # Round-trip for uniforms.
+    if a > b:
+        a, b = b, a
+    assume(abs(a - b) > 1e-4)  # skip near-degenerate intervals
+    dist = UniformDistribution(x=a, y=b)
+    assert dist.contribution_to_ev(dist.inv_contribution_to_ev(prop)) == approx(prop)
+
+
+@given(
+    ab=st.floats(min_value=0.01, max_value=10),
+)
+@example(ab=1)
+def test_beta_contribution_to_ev_basic(ab):  # Symmetric Beta(a,a): endpoints, total EV, and midpoint bounds.
+    dist = BetaDistribution(ab, ab)
+    assert dist.contribution_to_ev(0) == approx(0)
+    assert dist.contribution_to_ev(1) == approx(1)
+    assert dist.contribution_to_ev(1, normalized=False) == approx(0.5)  # mean of Beta(a,a) is 1/2
+    assert dist.contribution_to_ev(0.5) > 0
+    assert dist.contribution_to_ev(0.5) <= 0.5  # upper half is weighted at least as heavily
+
+
+@given(
+    a=st.floats(min_value=0.5, max_value=10),
+    b=st.floats(min_value=0.5, max_value=10),
+    fraction=st.floats(min_value=0, max_value=1),
+)
+def test_beta_inv_contribution_ev_inverts_contribution_to_ev(a, b, fraction):  # Round-trip for betas.
+    # Note: The answers do become a bit off for small fractional values of a, b
+    dist = BetaDistribution(a, b)
+    tolerance = 1e-6 if a < 1 or b < 1 else 1e-8  # looser when the pdf blows up at an endpoint
+    assert dist.contribution_to_ev(dist.inv_contribution_to_ev(fraction)) == approx(
+        fraction, rel=tolerance
+    )
+
+
+@given(
+    shape=st.floats(min_value=0.1, max_value=100),
+    scale=st.floats(min_value=0.1, max_value=100),
+    x=st.floats(min_value=0, max_value=100),
+)
+def test_gamma_contribution_to_ev_basic(shape, scale, x):  # NOTE(review): `x` is generated but never used — drop it or exercise contribution_to_ev(x)?
+    dist = GammaDistribution(shape, scale)
+    assert dist.contribution_to_ev(0) == approx(0)
+    assert dist.contribution_to_ev(1) < dist.contribution_to_ev(2) or (  # monotonic, unless both ~0
+        dist.contribution_to_ev(1) == 0 and dist.contribution_to_ev(2) == 0
+    )
+    if shape * scale <= 1:  # mean <= 1: EV-weighting below the mean is lighter than CDF mass
+        assert dist.contribution_to_ev(shape * scale, normalized=False) < stats.gamma.cdf(
+            shape * scale, shape, scale=scale
+        )
+    assert dist.contribution_to_ev(100 * shape * scale, normalized=False) == approx(
+        shape * scale, rel=1e-3
+    )  # far right: unnormalized contribution approaches the mean (shape * scale)
+
+
+@given(
+    shape=st.floats(min_value=0.1, max_value=100),
+    scale=st.floats(min_value=0.1, max_value=100),
+    fraction=st.floats(min_value=0, max_value=1 - 1e-6),
+)
+@example(shape=1, scale=2, fraction=0.5)
+def test_gamma_inv_contribution_ev_inverts_contribution_to_ev(shape, scale, fraction):  # Round-trip for gammas.
+    dist = GammaDistribution(shape, scale)
+    tolerance = 1e-6 if shape < 1 or scale < 1 else 1e-8  # looser for spiky small-parameter gammas
+    assert dist.contribution_to_ev(dist.inv_contribution_to_ev(fraction)) == approx(
+        fraction, rel=tolerance
+    )
+
+
+@given(
+    shape=st.floats(min_value=1.1, max_value=100),
+    fraction=st.floats(min_value=0, max_value=1 - 1e-6),
+)
+def test_pareto_inv_contribution_to_ev_inverts_contribution_to_ev(shape, fraction):  # Round-trip; shape > 1 so the mean exists.
+    dist = ParetoDistribution(shape)
+    assert dist.contribution_to_ev(dist.inv_contribution_to_ev(fraction)) == approx(
+        fraction, rel=1e-8
+    )
diff --git a/tests/test_numeric_distribution.py b/tests/test_numeric_distribution.py
new file mode 100644
index 0000000..973f624
--- /dev/null
+++ b/tests/test_numeric_distribution.py
@@ -0,0 +1,1678 @@
+from functools import reduce
+from hypothesis import assume, example, given, settings
+import hypothesis.strategies as st
+import numpy as np
+import operator
+from pytest import approx
+from scipy import integrate, stats
+import sys
+from unittest.mock import patch, Mock
+import warnings
+
+from ..squigglepy.distributions import (
+ BernoulliDistribution,
+ BetaDistribution,
+ ChiSquareDistribution,
+ ComplexDistribution,
+ ConstantDistribution,
+ ExponentialDistribution,
+ GammaDistribution,
+ LognormalDistribution,
+ MixtureDistribution,
+ NormalDistribution,
+ ParetoDistribution,
+ PERTDistribution,
+ UniformDistribution,
+ mixture,
+)
+from ..squigglepy.numeric_distribution import numeric, NumericDistribution, _bump_indexes
+from ..squigglepy import utils
+
+# There are a lot of functions testing various combinations of behaviors with
+# no obvious way to order them. These functions are ordered basically like this:
+#
+# 1. helper functions
+# 2. tests for constructors for norm and lognorm
+# 3. tests for basic operations on norm and lognorm, in the order
+# product > sum > negation > subtraction
+# 4. tests for other distributions
+# 5. tests for non-operation functions such as cdf/quantile
+# 6. special tests, such as profiling tests
+#
+# Tests with `basic` in the name use hard-coded values to ensure basic
+# functionality. Other tests use values generated by the hypothesis library.
+
+
+def relative_error(x, y):  # Symmetric relative error: max(x/y, y/x) - 1; 0 when x == y.
+    if x == 0 and y == 0:
+        return 0
+    if x == 0:  # fall back to absolute error when one side is exactly 0 (ratio undefined)
+        return abs(y)
+    if y == 0:
+        return abs(x)
+    return max(x / y, y / x) - 1  # NOTE(review): assumes x, y share a sign — negative for mixed signs; confirm callers
+
+
+def fix_ordering(a, b):
+    """
+    Check that a and b are ordered correctly and that they're not tiny enough
+    to mess up floating point calculations.
+    """
+    if a > b:  # normalize so a <= b
+        a, b = b, a
+    assume(a != b)
+    assume(((b - a) / (50 * (abs(a) + abs(b)))) ** 2 > sys.float_info.epsilon)  # gap not lost to rounding
+    assume(a == 0 or abs(a) > sys.float_info.epsilon)  # reject subnormal-ish magnitudes
+    assume(b == 0 or abs(b) > sys.float_info.epsilon)
+    return a, b
+
+
+@given(
+    mean1=st.floats(min_value=-1e5, max_value=1e5),
+    mean2=st.floats(min_value=-1e5, max_value=1e5),
+    sd1=st.floats(min_value=0.1, max_value=100),
+    sd2=st.floats(min_value=0.001, max_value=1000),
+)
+@example(mean1=0, mean2=-8, sd1=1, sd2=1)
+def test_sum_exact_summary_stats(mean1, mean2, sd1, sd2):
+    """Test that the formulas for exact moments are implemented correctly."""
+    dist1 = NormalDistribution(mean=mean1, sd=sd1)
+    dist2 = NormalDistribution(mean=mean2, sd=sd2)
+    hist1 = numeric(dist1, warn=False)
+    hist2 = numeric(dist2, warn=False)
+    hist_prod = hist1 + hist2  # NOTE(review): name says "prod" but this is a sum
+    assert hist_prod.exact_mean == approx(
+        stats.norm.mean(mean1 + mean2, np.sqrt(sd1**2 + sd2**2))  # sum of independent normals
+    )
+    assert hist_prod.exact_sd == approx(
+        stats.norm.std(
+            mean1 + mean2,
+            np.sqrt(sd1**2 + sd2**2),
+        )
+    )
+
+
+@given(
+    norm_mean1=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)),
+    norm_mean2=st.floats(min_value=-np.log(1e5), max_value=np.log(1e5)),
+    norm_sd1=st.floats(min_value=0.1, max_value=3),
+    norm_sd2=st.floats(min_value=0.001, max_value=3),
+)
+def test_lognorm_product_exact_summary_stats(norm_mean1, norm_mean2, norm_sd1, norm_sd2):
+    """Test that the formulas for exact moments are implemented correctly."""
+    dist1 = LognormalDistribution(norm_mean=norm_mean1, norm_sd=norm_sd1)
+    dist2 = LognormalDistribution(norm_mean=norm_mean2, norm_sd=norm_sd2)
+    with warnings.catch_warnings():  # wide parameter ranges can trigger accuracy warnings
+        warnings.simplefilter("ignore")
+        hist1 = numeric(dist1, warn=False)
+        hist2 = numeric(dist2, warn=False)
+        hist_prod = hist1 * hist2
+    assert hist_prod.exact_mean == approx(
+        stats.lognorm.mean(  # product of lognormals: params add in log space
+            np.sqrt(norm_sd1**2 + norm_sd2**2), scale=np.exp(norm_mean1 + norm_mean2)
+        )
+    )
+    assert hist_prod.exact_sd == approx(
+        stats.lognorm.std(
+            np.sqrt(norm_sd1**2 + norm_sd2**2), scale=np.exp(norm_mean1 + norm_mean2)
+        )
+    )
+
+
+@given(
+    mean=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)),
+    sd=st.floats(min_value=0.001, max_value=100),
+)
+def test_norm_basic(mean, sd):  # Estimated moments of a uniform-binned normal match the parameters.
+    dist = NormalDistribution(mean=mean, sd=sd)
+    hist = numeric(dist, bin_sizing="uniform", warn=False)
+    assert hist.est_mean() == approx(mean)
+    assert hist.est_sd() == approx(sd, rel=0.01)
+
+
+@given(
+    norm_mean=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)),
+    norm_sd=st.floats(min_value=0.001, max_value=3),
+    bin_sizing=st.sampled_from(["uniform", "log-uniform", "ev", "mass"]),
+)
+@example(norm_mean=0, norm_sd=1, bin_sizing="log-uniform")
+def test_lognorm_mean(norm_mean, norm_sd, bin_sizing):  # est_mean accuracy across all bin-sizing methods.
+    dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd)
+    hist = numeric(dist, bin_sizing=bin_sizing, warn=False)
+    if bin_sizing == "ev":  # ev sizing preserves the mean nearly exactly by construction
+        tolerance = 1e-6
+    elif bin_sizing == "log-uniform":
+        tolerance = 1e-2
+    else:
+        tolerance = 0.01 if dist.norm_sd < 3 else 0.1  # NOTE(review): strategy caps norm_sd at 3, so the 0.1 branch only fires at exactly 3
+    assert hist.est_mean() == approx(
+        stats.lognorm.mean(dist.norm_sd, scale=np.exp(dist.norm_mean)),
+        rel=tolerance,
+    )
+
+
+@given(
+    norm_mean=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)),
+    norm_sd=st.floats(min_value=0.01, max_value=3),
+)
+def test_lognorm_sd(norm_mean, norm_sd):  # est_sd accuracy with log-uniform bins; tolerance widens with norm_sd.
+    dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd)
+    hist = numeric(dist, bin_sizing="log-uniform", warn=False)
+
+    def true_variance(left, right):  # NOTE(review): unused debugging helper — dead code, or keep for manual investigation?
+        return integrate.quad(
+            lambda x: (x - dist.lognorm_mean) ** 2
+            * stats.lognorm.pdf(x, dist.norm_sd, scale=np.exp(dist.norm_mean)),
+            left,
+            right,
+        )[0]
+
+    def observed_variance(left, right):  # NOTE(review): also unused
+        return np.sum(hist.masses[left:right] * (hist.values[left:right] - hist.est_mean()) ** 2)
+
+    assert hist.est_sd() == approx(dist.lognorm_sd, rel=0.01 + 0.1 * norm_sd)
+
+
+@given(
+    mean=st.floats(min_value=-10, max_value=10),
+    sd=st.floats(min_value=0.01, max_value=10),
+    clip_zscore=st.floats(min_value=-4, max_value=4),
+)
+@example(mean=0.25, sd=6, clip_zscore=0)
+def test_norm_one_sided_clip(mean, sd, clip_zscore):  # lclip/rclip alone should match scipy's truncnorm moments.
+    # TODO: changing allowance from 0 to 0.25 made this start failing. The
+    # problem is that deleting a bit of mass on one side is shifting the EV by
+    # more than the tolerance.
+    tolerance = 1e-3 if abs(clip_zscore) > 3 else 1e-5  # clipping deep in a tail leaves little mass
+    clip = mean + clip_zscore * sd
+    dist = NormalDistribution(mean=mean, sd=sd, lclip=clip)
+    hist = numeric(dist, warn=False)
+    # The exact mean can still be a bit off because uniform bin_sizing doesn't
+    # cover the full domain
+    assert hist.exact_mean == approx(
+        stats.truncnorm.mean(clip_zscore, np.inf, loc=mean, scale=sd), rel=1e-5, abs=1e-9
+    )
+
+    assert hist.est_mean() == approx(
+        stats.truncnorm.mean(clip_zscore, np.inf, loc=mean, scale=sd), rel=tolerance, abs=tolerance
+    )
+
+    dist = NormalDistribution(mean=mean, sd=sd, rclip=clip)  # now clip the other side
+    hist = numeric(dist, warn=False)
+    assert hist.est_mean() == approx(
+        stats.truncnorm.mean(-np.inf, clip_zscore, loc=mean, scale=sd),
+        rel=tolerance,
+        abs=tolerance,
+    )
+    assert hist.exact_mean == approx(
+        stats.truncnorm.mean(-np.inf, clip_zscore, loc=mean, scale=sd), rel=1e-5, abs=1e-6
+    )
+
+
+@given(
+    mean=st.floats(min_value=-1, max_value=1),
+    sd=st.floats(min_value=0.01, max_value=10),
+    lclip_zscore=st.floats(min_value=-4, max_value=4),
+    rclip_zscore=st.floats(min_value=-4, max_value=4),
+)
+def test_norm_clip(mean, sd, lclip_zscore, rclip_zscore):  # Two-sided clip should match scipy's truncnorm.
+    tolerance = 1e-3 if max(abs(lclip_zscore), abs(rclip_zscore)) > 3 else 1e-5
+    if lclip_zscore > rclip_zscore:  # ensure a valid (lclip, rclip) ordering
+        lclip_zscore, rclip_zscore = rclip_zscore, lclip_zscore
+    assume(abs(rclip_zscore - lclip_zscore) > 0.01)  # skip near-empty truncation windows
+    lclip = mean + lclip_zscore * sd
+    rclip = mean + rclip_zscore * sd
+    dist = NormalDistribution(mean=mean, sd=sd, lclip=lclip, rclip=rclip)
+    hist = numeric(dist, warn=False)
+
+    assert hist.est_mean() == approx(
+        stats.truncnorm.mean(lclip_zscore, rclip_zscore, loc=mean, scale=sd), rel=tolerance
+    )
+    assert hist.est_mean() == approx(hist.exact_mean, rel=tolerance)
+    assert hist.exact_mean == approx(
+        stats.truncnorm.mean(lclip_zscore, rclip_zscore, loc=mean, scale=sd), rel=1e-6, abs=1e-10
+    )
+    assert hist.exact_sd == approx(
+        stats.truncnorm.std(lclip_zscore, rclip_zscore, loc=mean, scale=sd), rel=1e-6, abs=1e-10
+    )
+
+
+@given(
+    a=st.floats(-100, -1),
+    b=st.floats(1, 100),
+    lclip=st.floats(-100, -1),
+    rclip=st.floats(1, 100),
+)
+def test_uniform_clip(a, b, lclip, rclip):  # Clipped U(a,b) should equal the narrower unclipped uniform.
+    dist = UniformDistribution(a, b)
+    dist.lclip = lclip  # NOTE(review): set post-construction — presumably the ctor lacks clip kwargs; confirm
+    dist.rclip = rclip
+    narrow_dist = UniformDistribution(max(a, lclip), min(b, rclip))  # reference: intersection of support and clips
+    hist = numeric(dist)
+    narrow_hist = numeric(narrow_dist)
+
+    assert hist.est_mean() == approx(narrow_hist.exact_mean)
+    assert hist.est_mean() == approx(narrow_hist.est_mean())
+    assert hist.values[0] == approx(narrow_hist.values[0])  # supports line up at both ends
+    assert hist.values[-1] == approx(narrow_hist.values[-1])
+
+
+@given(
+    norm_mean=st.floats(min_value=0.1, max_value=10),
+    norm_sd=st.floats(min_value=0.5, max_value=3),
+    clip_zscore=st.floats(min_value=-2, max_value=2),
+)
+def test_lognorm_clip_and_sum(norm_mean, norm_sd, clip_zscore):  # Left + right halves recombine to the full mean.
+    clip = np.exp(norm_mean + norm_sd * clip_zscore)  # split point in lognormal space
+    left_dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd, rclip=clip)
+    right_dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd, lclip=clip)
+    left_hist = numeric(left_dist, warn=False)
+    right_hist = numeric(right_dist, warn=False)
+    left_mass = stats.lognorm.cdf(clip, norm_sd, scale=np.exp(norm_mean))
+    right_mass = 1 - left_mass
+    true_mean = stats.lognorm.mean(norm_sd, scale=np.exp(norm_mean))
+    sum_exact_mean = left_mass * left_hist.exact_mean + right_mass * right_hist.exact_mean  # law of total expectation
+    sum_hist_mean = left_mass * left_hist.est_mean() + right_mass * right_hist.est_mean()
+
+    # TODO: the error margin is surprisingly large
+    assert sum_exact_mean == approx(true_mean, rel=1e-3, abs=1e-6)
+    assert sum_hist_mean == approx(true_mean, rel=1e-3, abs=1e-6)
+
+
+@given(
+    mean1=st.floats(min_value=-1000, max_value=0.01),
+    mean2=st.floats(min_value=0.01, max_value=1000),
+    mean3=st.floats(min_value=0.01, max_value=1000),
+    sd1=st.floats(min_value=0.1, max_value=10),
+    sd2=st.floats(min_value=0.1, max_value=10),
+    sd3=st.floats(min_value=0.1, max_value=10),
+    bin_sizing=st.sampled_from(["ev", "mass", "uniform"]),
+)
+@example(mean1=5, mean2=5, mean3=4, sd1=1, sd2=1, sd3=1, bin_sizing="ev")
+@example(mean1=9, mean2=9, mean3=9, sd1=1, sd2=1, sd3=1, bin_sizing="ev")
+def test_norm_product(mean1, mean2, mean3, sd1, sd2, sd3, bin_sizing):  # 2- and 3-way products vs. closed-form moments.
+    dist1 = NormalDistribution(mean=mean1, sd=sd1)
+    dist2 = NormalDistribution(mean=mean2, sd=sd2)
+    dist3 = NormalDistribution(mean=mean3, sd=sd3)
+    mean_tolerance = 1e-5
+    sd_tolerance = 0.2 if bin_sizing == "uniform" else 1  # SD estimates are much noisier than means
+
+    hist1 = numeric(dist1, num_bins=40, bin_sizing=bin_sizing, warn=False)
+    hist2 = numeric(dist2, num_bins=40, bin_sizing=bin_sizing, warn=False)
+    hist3 = numeric(dist3, num_bins=40, bin_sizing=bin_sizing, warn=False)
+    hist_prod = hist1 * hist2
+    assert hist_prod.est_mean() == approx(dist1.mean * dist2.mean, rel=mean_tolerance, abs=1e-8)
+    assert hist_prod.est_sd() == approx(
+        np.sqrt(  # Var(XY) = (sd1^2 + m1^2)(sd2^2 + m2^2) - m1^2 m2^2 for independent X, Y
+            (dist1.sd**2 + dist1.mean**2) * (dist2.sd**2 + dist2.mean**2)
+            - dist1.mean**2 * dist2.mean**2
+        ),
+        rel=sd_tolerance,
+    )
+    hist3_prod = hist_prod * hist3
+    assert hist3_prod.est_mean() == approx(
+        dist1.mean * dist2.mean * dist3.mean, rel=mean_tolerance, abs=1e-8
+    )
+
+
+@given(
+    mean=st.floats(min_value=-10, max_value=10),
+    sd=st.floats(min_value=0.001, max_value=100),
+    num_bins=st.sampled_from([40, 100]),
+    bin_sizing=st.sampled_from(["ev", "mass", "uniform"]),
+)
+@settings(max_examples=100)
+def test_norm_mean_error_propagation(mean, sd, num_bins, bin_sizing):
+    """Test how quickly the error in the mean grows as distributions are
+    multiplied."""
+    dist = NormalDistribution(mean=mean, sd=sd)
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        hist = numeric(dist, num_bins=num_bins, bin_sizing=bin_sizing)
+        hist_base = numeric(dist, num_bins=num_bins, bin_sizing=bin_sizing)
+    tolerance = 1e-10 if bin_sizing == "ev" else 1e-5
+
+    for i in range(1, 17):  # repeatedly square up to X**16
+        true_mean = mean**i
+        true_sd = np.sqrt((dist.sd**2 + dist.mean**2) ** i - dist.mean ** (2 * i))
+        if true_sd > 1e15:  # stop before values overflow the histogram's usable range
+            break
+        assert hist.est_mean() == approx(
+            true_mean, abs=tolerance ** (1 / i), rel=tolerance ** (1 / i)  # allow error to compound per multiply
+        ), f"On iteration {i}"
+        hist = hist * hist_base
+
+
+@given(
+    mean1=st.floats(min_value=-100, max_value=100),
+    mean2=st.floats(min_value=-np.log(1e5), max_value=np.log(1e5)),
+    mean3=st.floats(min_value=-100, max_value=100),
+    sd1=st.floats(min_value=0.001, max_value=100),
+    sd2=st.floats(min_value=0.001, max_value=3),
+    sd3=st.floats(min_value=0.001, max_value=100),
+    # num_bins1=st.sampled_from([40, 100]),
+    # num_bins2=st.sampled_from([40, 100]),
+    num_bins1=st.sampled_from([100]),
+    num_bins2=st.sampled_from([100]),
+)
+@example(mean1=99, mean2=0, mean3=-1e-16, sd1=1.5, sd2=3, sd3=0.5, num_bins1=100, num_bins2=100)
+def test_norm_lognorm_product_sum(mean1, mean2, mean3, sd1, sd2, sd3, num_bins1, num_bins2):  # Mixed-type product then sum.
+    dist1 = NormalDistribution(mean=mean1, sd=sd1)
+    dist2 = LognormalDistribution(norm_mean=mean2, norm_sd=sd2)
+    dist3 = NormalDistribution(mean=mean3, sd=sd3)
+    hist1 = numeric(dist1, num_bins=num_bins1, warn=False)
+    hist2 = numeric(dist2, num_bins=num_bins2, warn=False)
+    hist3 = numeric(dist3, num_bins=num_bins1, warn=False)
+    hist_prod = hist1 * hist2
+    assert all(np.diff(hist_prod.values) >= 0)  # bin values must stay sorted
+    assert hist_prod.est_mean() == approx(hist_prod.exact_mean, abs=1e-5, rel=1e-5)
+
+    # SD is pretty inaccurate
+    sd_tolerance = 1 if num_bins1 == 100 and num_bins2 == 100 else 2
+    assert hist_prod.est_sd() == approx(hist_prod.exact_sd, rel=sd_tolerance)
+
+    hist_sum = hist_prod + hist3
+    assert hist_sum.est_mean() == approx(hist_sum.exact_mean, abs=1e-5, rel=1e-5)
+
+
+@given(
+    norm_mean=st.floats(min_value=np.log(1e-6), max_value=np.log(1e6)),
+    norm_sd=st.floats(min_value=0.001, max_value=2),
+    num_bins=st.sampled_from([40, 100]),
+    bin_sizing=st.sampled_from(["ev", "log-uniform", "fat-hybrid"]),
+)
+@settings(max_examples=10)
+def test_lognorm_mean_error_propagation(norm_mean, norm_sd, num_bins, bin_sizing):  # Mean error growth under repeated multiplication.
+    dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd)
+    hist = numeric(dist, num_bins=num_bins, bin_sizing=bin_sizing, warn=False)
+    hist_base = numeric(dist, num_bins=num_bins, bin_sizing=bin_sizing, warn=False)
+    inv_tolerance = 1 - 1e-12 if bin_sizing == "ev" else 0.98  # per-multiply accuracy floor
+
+    for i in range(1, 13):
+        true_mean = stats.lognorm.mean(np.sqrt(i) * norm_sd, scale=np.exp(i * norm_mean))  # i-fold product is lognormal
+        if bin_sizing == "ev":
+            # log-uniform can have out-of-order values due to the masses at the
+            # end being very small
+            assert all(np.diff(hist.values) >= 0), f"On iteration {i}: {hist.values}"
+        assert hist.est_mean() == approx(
+            true_mean, rel=1 - inv_tolerance**i  # tolerance compounds with each multiplication
+        ), f"On iteration {i}"
+        hist = hist * hist_base
+
+
+@given(bin_sizing=st.sampled_from(["ev", "log-uniform"]))
+def test_lognorm_sd_error_propagation(bin_sizing):  # SD error growth under repeated squaring, vs. recorded baselines.
+    verbose = False  # set True to print per-iteration error diagnostics
+    dist = LognormalDistribution(norm_mean=0, norm_sd=0.1)
+    num_bins = 100
+    hist = numeric(dist, num_bins=num_bins, bin_sizing=bin_sizing, warn=False)
+    abs_error = []  # NOTE(review): collected but never asserted on
+    rel_error = []
+
+    if verbose:
+        print("")
+    for i in [1, 2, 4, 8, 16, 32]:  # hist is squared each pass, so i doubles
+        oneshot = numeric(
+            LognormalDistribution(norm_mean=0, norm_sd=0.1 * np.sqrt(i)),  # same product dist built in one shot
+            num_bins=num_bins,
+            bin_sizing=bin_sizing,
+            warn=False,
+        )
+        true_sd = hist.exact_sd
+        abs_error.append(abs(hist.est_sd() - true_sd))
+        rel_error.append(relative_error(hist.est_sd(), true_sd))
+        if verbose:
+            print(f"i={i:2d}: Hist error : {rel_error[-1] * 100:.4f}%")
+            print(
+                "i={:2d}: Hist / 1shot: {:.0f}%".format(
+                    i, ((rel_error[-1] / relative_error(oneshot.est_sd(), true_sd)) * 100)
+                )
+            )
+        hist = hist * hist
+
+    expected_error_pcts = (  # empirical regression baselines, in percent, per iteration
+        [0.9, 2.8, 9.9, 40.7, 211, 2678]
+        if bin_sizing == "ev"
+        else [12, 26.3, 99.8, 733, 32000, 1e9]
+    )
+
+    for i in range(len(expected_error_pcts)):
+        assert rel_error[i] < expected_error_pcts[i] / 100
+
+
+@given(
+    norm_mean1=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)),
+    norm_mean2=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)),
+    norm_sd1=st.floats(min_value=0.1, max_value=3),
+    norm_sd2=st.floats(min_value=0.1, max_value=3),
+    bin_sizing=st.sampled_from(["ev", "log-uniform", "fat-hybrid"]),
+)
+@example(norm_mean1=0, norm_mean2=0, norm_sd1=1, norm_sd2=1, bin_sizing="fat-hybrid")
+def test_lognorm_product(norm_mean1, norm_sd1, norm_mean2, norm_sd2, bin_sizing):  # @given binds by keyword, so the parameter order mismatch is harmless.
+    dists = [
+        LognormalDistribution(norm_mean=norm_mean2, norm_sd=norm_sd2),
+        LognormalDistribution(norm_mean=norm_mean1, norm_sd=norm_sd1),
+    ]
+    dist_prod = LognormalDistribution(  # closed-form product: params add in log space
+        norm_mean=norm_mean1 + norm_mean2, norm_sd=np.sqrt(norm_sd1**2 + norm_sd2**2)
+    )
+    hists = [numeric(dist, bin_sizing=bin_sizing, warn=False) for dist in dists]
+    hist_prod = reduce(lambda acc, hist: acc * hist, hists)
+
+    # Lognorm width grows with e**norm_sd**2, so error tolerance grows the same way
+    sd_tolerance = 1.05 ** (1 + (norm_sd1 + norm_sd2) ** 2) - 1
+    mean_tolerance = 1e-3 if bin_sizing == "log-uniform" else 1e-6
+    assert hist_prod.est_mean() == approx(dist_prod.lognorm_mean, rel=mean_tolerance)
+    assert hist_prod.est_sd() == approx(dist_prod.lognorm_sd, rel=sd_tolerance)
+
+
+@given(
+    mean1=st.floats(-1e5, 1e5),
+    mean2=st.floats(min_value=-1e5, max_value=1e5),
+    sd1=st.floats(min_value=0.001, max_value=1e5),
+    sd2=st.floats(min_value=0.001, max_value=1e5),
+    num_bins=st.sampled_from([40, 100]),
+    bin_sizing=st.sampled_from(["ev", "uniform"]),
+)
+@example(mean1=0, mean2=0, sd1=1, sd2=16, num_bins=40, bin_sizing="ev")
+@example(mean1=0, mean2=0, sd1=7, sd2=1, num_bins=40, bin_sizing="ev")
+def test_norm_sum(mean1, mean2, sd1, sd2, num_bins, bin_sizing):  # Sum of normals: ordering invariant + moment accuracy.
+    dist1 = NormalDistribution(mean=mean1, sd=sd1)
+    dist2 = NormalDistribution(mean=mean2, sd=sd2)
+    hist1 = numeric(dist1, num_bins=num_bins, bin_sizing=bin_sizing, warn=False)
+    hist2 = numeric(dist2, num_bins=num_bins, bin_sizing=bin_sizing, warn=False)
+    hist_sum = hist1 + hist2
+
+    # The further apart the means are, the less accurate the SD estimate is
+    distance_apart = abs(mean1 - mean2) / hist_sum.exact_sd  # separation in units of the sum's SD
+    sd_tolerance = 2 + 0.5 * distance_apart
+    mean_tolerance = 1e-10 + 1e-10 * distance_apart
+
+    assert all(np.diff(hist_sum.values) >= 0)  # bin values must stay sorted
+    assert hist_sum.est_mean() == approx(hist_sum.exact_mean, abs=mean_tolerance, rel=1e-5)
+    assert hist_sum.est_sd() == approx(hist_sum.exact_sd, rel=sd_tolerance)
+
+
+@given(
+    norm_mean1=st.floats(min_value=-np.log(1e6), max_value=np.log(1e6)),
+    norm_mean2=st.floats(min_value=-np.log(1e6), max_value=np.log(1e6)),
+    norm_sd1=st.floats(min_value=0.1, max_value=3),
+    norm_sd2=st.floats(min_value=0.01, max_value=3),
+    bin_sizing=st.sampled_from(["ev", "log-uniform"]),
+)
+def test_lognorm_sum(norm_mean1, norm_mean2, norm_sd1, norm_sd2, bin_sizing):  # Sum of lognormals (no closed form).
+    dist1 = LognormalDistribution(norm_mean=norm_mean1, norm_sd=norm_sd1)
+    dist2 = LognormalDistribution(norm_mean=norm_mean2, norm_sd=norm_sd2)
+    hist1 = numeric(dist1, bin_sizing=bin_sizing, warn=False)
+    hist2 = numeric(dist2, bin_sizing=bin_sizing, warn=False)
+    hist_sum = hist1 + hist2
+    assert all(np.diff(hist_sum.values) >= 0), hist_sum.values  # bin values must stay sorted
+    mean_tolerance = 1e-3 if bin_sizing == "log-uniform" else 1e-6
+    assert hist_sum.est_mean() == approx(hist_sum.exact_mean, rel=mean_tolerance)
+
+    # SD is very inaccurate because adding lognormals produces some large but
+    # very low-probability values on the right tail and the only approach is to
+    # either downweight them or make the histogram much wider.
+    assert hist_sum.est_sd() > min(hist1.est_sd(), hist2.est_sd())  # weak sanity bound
+    assert hist_sum.est_sd() == approx(hist_sum.exact_sd, rel=2)
+
+
+@given(
+    mean1=st.floats(min_value=-100, max_value=100),
+    mean2=st.floats(min_value=-np.log(1e5), max_value=np.log(1e5)),
+    sd1=st.floats(min_value=0.001, max_value=100),
+    sd2=st.floats(min_value=0.001, max_value=3),
+    lognorm_bin_sizing=st.sampled_from(["ev", "log-uniform", "fat-hybrid"]),
+)
+@example(mean1=-68, mean2=0, sd1=1, sd2=2.9, lognorm_bin_sizing="log-uniform")
+def test_norm_lognorm_sum(mean1, mean2, sd1, sd2, lognorm_bin_sizing):  # Mixed normal + lognormal sum.
+    dist1 = NormalDistribution(mean=mean1, sd=sd1)
+    dist2 = LognormalDistribution(norm_mean=mean2, norm_sd=sd2)
+    hist1 = numeric(dist1, warn=False)  # normal uses default bin sizing; only the lognormal's varies
+    hist2 = numeric(dist2, bin_sizing=lognorm_bin_sizing, warn=False)
+    hist_sum = hist1 + hist2
+    mean_tolerance = 0.005 if lognorm_bin_sizing == "log-uniform" else 1e-6
+    sd_tolerance = 0.5
+    assert all(np.diff(hist_sum.values) >= 0), hist_sum.values  # bin values must stay sorted
+    assert hist_sum.est_mean() == approx(
+        hist_sum.exact_mean, abs=mean_tolerance, rel=mean_tolerance
+    )
+    assert hist_sum.est_sd() == approx(hist_sum.exact_sd, rel=sd_tolerance)
+
+
+@given(
+    norm_mean=st.floats(min_value=-10, max_value=10),
+    norm_sd=st.floats(min_value=0.1, max_value=2),
+)
+def test_lognorm_to_const_power(norm_mean, norm_sd):  # X**p of a lognormal is lognormal with scaled params.
+    # If you make the power bigger, mean stays pretty accurate but SD gets
+    # pretty far off (>100%) for high-variance dists
+    power = 1.5
+    dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd)
+    hist = numeric(dist, bin_sizing="log-uniform", warn=False)
+
+    hist_pow = hist**power
+    true_dist_pow = LognormalDistribution(norm_mean=power * norm_mean, norm_sd=power * norm_sd)  # exact law of X**p
+    assert hist_pow.est_mean() == approx(true_dist_pow.lognorm_mean, rel=0.005)
+    assert hist_pow.est_sd() == approx(true_dist_pow.lognorm_sd, rel=0.5)
+
+
+@given(
+ norm_mean=st.floats(min_value=-1e6, max_value=1e6),
+ norm_sd=st.floats(min_value=0.001, max_value=3),
+ num_bins=st.sampled_from([40, 100]),
+ bin_sizing=st.sampled_from(["ev", "uniform"]),
+)
+def test_norm_negate(norm_mean, norm_sd, num_bins, bin_sizing):
+ dist = NormalDistribution(mean=0, sd=1)
+ hist = numeric(dist, warn=False)
+ neg_hist = -hist
+ assert neg_hist.est_mean() == approx(-hist.est_mean())
+ assert neg_hist.est_sd() == approx(hist.est_sd())
+
+
+@given(
+ norm_mean=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)),
+ norm_sd=st.floats(min_value=0.001, max_value=3),
+ num_bins=st.sampled_from([40, 100]),
+ bin_sizing=st.sampled_from(["ev", "log-uniform"]),
+)
+def test_lognorm_negate(norm_mean, norm_sd, num_bins, bin_sizing):
+ dist = LognormalDistribution(norm_mean=0, norm_sd=1)
+ hist = numeric(dist, warn=False)
+ neg_hist = -hist
+ assert neg_hist.est_mean() == approx(-hist.est_mean())
+ assert neg_hist.est_sd() == approx(hist.est_sd())
+
+
+@given(
+    type_and_size=st.sampled_from(["norm-ev", "norm-uniform", "lognorm-ev"]),
+    mean1=st.floats(min_value=-1e6, max_value=1e6),
+    mean2=st.floats(min_value=-100, max_value=100),
+    sd1=st.floats(min_value=0.001, max_value=1000),
+    sd2=st.floats(min_value=0.1, max_value=5),
+    num_bins=st.sampled_from([30, 100]),
+)
+def test_sub(type_and_size, mean1, mean2, sd1, sd2, num_bins):  # a - b == -(b - a) == a + (-b); mixed dist types.
+    dist1 = NormalDistribution(mean=mean1, sd=sd1)
+    dist2_type, bin_sizing = type_and_size.split("-")  # strategy encodes (dist type, bin sizing) in one token
+
+    if dist2_type == "norm":
+        dist2 = NormalDistribution(mean=mean2, sd=sd2)
+        neg_dist = NormalDistribution(mean=-mean2, sd=sd2)
+    elif dist2_type == "lognorm":
+        dist2 = LognormalDistribution(norm_mean=mean2, norm_sd=sd2)
+        # We can't negate a lognormal distribution by changing the params
+        neg_dist = None
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        hist1 = numeric(dist1, num_bins=num_bins, bin_sizing=bin_sizing)
+        hist2 = numeric(dist2, num_bins=num_bins, bin_sizing=bin_sizing)
+        hist_diff = hist1 - hist2
+        backward_diff = hist2 - hist1
+    assert not any(np.isnan(hist_diff.values))
+    assert all(np.diff(hist_diff.values) >= 0)  # bin values must stay sorted
+    assert hist_diff.est_mean() == approx(-backward_diff.est_mean(), rel=0.01)  # antisymmetry of subtraction
+    assert hist_diff.est_sd() == approx(backward_diff.est_sd(), rel=0.05)
+
+    if neg_dist:  # normal case only: a - b should match a + (-b)
+        neg_hist = numeric(neg_dist, num_bins=num_bins, bin_sizing=bin_sizing)
+        hist_sum = hist1 + neg_hist
+        assert hist_diff.est_mean() == approx(hist_sum.est_mean(), rel=0.01)
+        assert hist_diff.est_sd() == approx(hist_sum.est_sd(), rel=0.05)
+
+
+def test_lognorm_sub():  # Difference of scaled copies of one lognormal histogram.
+    dist = LognormalDistribution(norm_mean=0, norm_sd=1)
+    hist = numeric(dist, warn=False)
+    hist_diff = 0.97 * hist - 0.03 * hist  # treated as independent copies, so the mean scales by 0.94
+    assert not any(np.isnan(hist_diff.values))
+    assert all(np.diff(hist_diff.values) >= 0)  # bin values must stay sorted
+    assert hist_diff.est_mean() == approx(0.94 * dist.lognorm_mean, rel=0.001)
+    assert hist_diff.est_sd() == approx(hist_diff.exact_sd, rel=0.05)
+
+
@given(
    mean=st.floats(min_value=-100, max_value=100),
    sd=st.floats(min_value=0.001, max_value=1000),
    scalar=st.floats(min_value=-100, max_value=100),
)
def test_scale(mean, sd, scalar):
    """Scalar multiplication scales the mean by k and the sd by |k|."""
    assume(scalar != 0)
    base = numeric(NormalDistribution(mean=mean, sd=sd), warn=False)
    scaled = scalar * base
    assert scaled.est_mean() == approx(scalar * base.est_mean(), abs=1e-6, rel=1e-6)
    assert scaled.est_sd() == approx(abs(scalar) * base.est_sd(), abs=1e-6, rel=1e-6)
    assert scaled.exact_mean == approx(scalar * base.exact_mean)
    assert scaled.exact_sd == approx(abs(scalar) * base.exact_sd)
+
+
@given(
    mean=st.floats(min_value=-100, max_value=100),
    sd=st.floats(min_value=0.001, max_value=1000),
    scalar=st.floats(min_value=-100, max_value=100),
)
def test_shift_by(mean, sd, scalar):
    """Adding a constant shifts the mean, leaves the sd, and keeps the
    zero-bin bookkeeping consistent (negative values strictly below the
    zero bin index, non-negative values at or above it)."""
    base = numeric(NormalDistribution(mean=mean, sd=sd), warn=False)
    shifted = base + scalar
    assert shifted.est_mean() == approx(base.est_mean() + scalar, abs=1e-6, rel=1e-6)
    assert shifted.est_sd() == approx(base.est_sd(), abs=1e-6, rel=1e-6)
    assert shifted.exact_mean == approx(base.exact_mean + scalar)
    assert shifted.exact_sd == approx(base.exact_sd)
    # Positive minus negative EV contributions must reproduce the mean
    assert shifted.pos_ev_contribution - shifted.neg_ev_contribution == approx(
        shifted.exact_mean
    )
    if shifted.zero_bin_index < len(shifted.values):
        assert shifted.values[shifted.zero_bin_index] > 0
    if shifted.zero_bin_index > 0:
        assert shifted.values[shifted.zero_bin_index - 1] < 0
+
+
@given(
    norm_mean=st.floats(min_value=-10, max_value=10),
    norm_sd=st.floats(min_value=0.01, max_value=2.5),
)
def test_lognorm_reciprocal(norm_mean, norm_sd):
    """1/X for lognormal X matches a lognormal with negated norm_mean."""
    dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd)
    reciprocal_dist = LognormalDistribution(norm_mean=-norm_mean, norm_sd=norm_sd)
    inverted = 1 / numeric(dist, bin_sizing="log-uniform", warn=False)
    reference = numeric(reciprocal_dist, bin_sizing="log-uniform", warn=False)

    # Taking the reciprocal does lose a good bit of accuracy because bin values
    # are set as the expected value of the bin, and the EV of 1/X is pretty
    # different. Could improve accuracy by writing
    # reciprocal_contribution_to_ev functions for every distribution type, but
    # that's probably not worth it.
    assert inverted.est_mean() == approx(reciprocal_dist.lognorm_mean, rel=0.05)
    assert inverted.est_sd() == approx(reciprocal_dist.lognorm_sd, rel=0.2)
    assert inverted.neg_ev_contribution == 0
    assert inverted.pos_ev_contribution == approx(
        reference.pos_ev_contribution, rel=0.05
    )
+
+
@given(
    norm_mean1=st.floats(min_value=-10, max_value=10),
    norm_mean2=st.floats(min_value=-10, max_value=10),
    norm_sd1=st.floats(min_value=0.01, max_value=2),
    norm_sd2=st.floats(min_value=0.01, max_value=2),
    bin_sizing1=st.sampled_from(["ev", "log-uniform"]),
)
def test_lognorm_quotient(norm_mean1, norm_mean2, norm_sd1, norm_sd2, bin_sizing1):
    """X/Y of lognormals matches the closed-form lognormal quotient
    (norm means subtract, norm variances add)."""
    numerator = numeric(
        LognormalDistribution(norm_mean=norm_mean1, norm_sd=norm_sd1),
        bin_sizing=bin_sizing1,
        warn=False,
    )
    denominator = numeric(
        LognormalDistribution(norm_mean=norm_mean2, norm_sd=norm_sd2),
        bin_sizing="log-uniform",
        warn=False,
    )
    quotient = numerator / denominator
    expected_dist = LognormalDistribution(
        norm_mean=norm_mean1 - norm_mean2, norm_sd=np.sqrt(norm_sd1**2 + norm_sd2**2)
    )
    expected = numeric(expected_dist, bin_sizing="log-uniform", warn=False)

    assert quotient.est_mean() == approx(expected.est_mean(), rel=0.05)
    assert quotient.est_sd() == approx(expected.est_sd(), rel=0.2)
    assert quotient.neg_ev_contribution == approx(expected.neg_ev_contribution, rel=0.01)
    assert quotient.pos_ev_contribution == approx(expected.pos_ev_contribution, rel=0.01)
+
+
@given(
    mean=st.floats(min_value=-20, max_value=20),
    sd=st.floats(min_value=0.1, max_value=1),
)
@example(mean=0, sd=2)
def test_norm_exp(mean, sd):
    """exp of a numeric normal approximates the corresponding lognormal."""
    exponentiated = numeric(NormalDistribution(mean=mean, sd=sd)).exp()
    expected = LognormalDistribution(norm_mean=mean, norm_sd=sd)

    # TODO: previously with richardson, mean was accurate to 0.005, and sd to
    # 0.1, but now it's worse b/c it was using uniform before and now it's
    # using ev
    # assert exponentiated.est_mean() == approx(expected.lognorm_mean, rel=0.005)
    # assert exponentiated.est_sd() == approx(expected.lognorm_sd, rel=0.1)
    assert exponentiated.est_mean() == approx(expected.lognorm_mean, rel=0.02)
    assert exponentiated.est_sd() == approx(expected.lognorm_sd, rel=0.5)
+
+
@given(
    loga=st.floats(min_value=-5, max_value=5),
    logb=st.floats(min_value=0, max_value=10),
)
@example(loga=0, logb=0.001)
@example(loga=0, logb=2)
@example(loga=-5, logb=10)
@settings(max_examples=1)
def test_uniform_exp(loga, logb):
    """exp of a uniform matches the log-uniform's closed-form mean/sd."""
    loga, logb = fix_ordering(loga, logb)
    exponentiated = numeric(UniformDistribution(loga, logb)).exp()
    a, b = np.exp(loga), np.exp(logb)
    log_ratio = np.log(b / a)
    expected_mean = (b - a) / log_ratio
    expected_sd = np.sqrt((b**2 - a**2) / (2 * log_ratio) - ((b - a) / log_ratio) ** 2)
    assert exponentiated.est_mean() == approx(expected_mean, rel=0.02)
    if not np.isnan(expected_sd):
        # variance can be slightly negative due to rounding errors
        assert exponentiated.est_sd() == approx(expected_sd, rel=0.2, abs=1e-5)
+
+
@given(
    mean=st.floats(min_value=-20, max_value=20),
    sd=st.floats(min_value=0.1, max_value=2),
)
def test_lognorm_log(mean, sd):
    """log of a numeric lognormal approximates the underlying normal."""
    logged = numeric(
        LognormalDistribution(norm_mean=mean, norm_sd=sd), warn=False
    ).log()
    reference = numeric(NormalDistribution(mean=mean, sd=sd), warn=False)
    assert logged.est_mean() == approx(reference.exact_mean, rel=0.01, abs=1)
    assert logged.est_sd() == approx(reference.exact_sd, rel=0.3)
+
+
@given(
    mean=st.floats(min_value=-20, max_value=20),
    sd=st.floats(min_value=0.1, max_value=10),
)
@settings(max_examples=10)
def test_norm_abs(mean, sd):
    """abs of a numeric normal matches scipy's folded normal."""
    folded = abs(numeric(NormalDistribution(mean=mean, sd=sd)))
    # foldnorm's shape parameter is |mu|/sigma; scale is sigma
    shape = abs(mean) / sd
    expected_mean = stats.foldnorm.mean(shape, loc=0, scale=sd)
    expected_sd = stats.foldnorm.std(shape, loc=0, scale=sd)
    assert folded.est_mean() == approx(expected_mean, rel=0.001)
    assert folded.est_sd() == approx(expected_sd, rel=0.01)
+
+
@given(
    mean=st.floats(min_value=-20, max_value=20),
    sd=st.floats(min_value=0.1, max_value=10),
    left_zscore=st.floats(min_value=-2, max_value=2),
    zscore_width=st.floats(min_value=2, max_value=5),
)
@settings(max_examples=10)
@example(mean=-2, sd=1, left_zscore=2, zscore_width=2)
def test_given_value_satisfies(mean, sd, left_zscore, zscore_width):
    """Conditioning on an interval matches scipy's truncated normal."""
    right_zscore = left_zscore + zscore_width
    left = mean + left_zscore * sd
    right = mean + right_zscore * sd
    conditioned = numeric(NormalDistribution(mean=mean, sd=sd)).given_value_satisfies(
        lambda x: x >= left and x <= right
    )
    expected_mean = stats.truncnorm.mean(left_zscore, right_zscore, loc=mean, scale=sd)
    expected_sd = stats.truncnorm.std(left_zscore, right_zscore, loc=mean, scale=sd)
    assert conditioned.est_mean() == approx(expected_mean, rel=0.1, abs=0.05)
    assert conditioned.est_sd() == approx(expected_sd, rel=0.1, abs=0.1)
+
+
def test_probability_value_satisfies():
    """Probabilities of predicates on a standard normal match its CDF."""
    hist = numeric(NormalDistribution(mean=0, sd=1))
    assert hist.probability_value_satisfies(lambda x: x >= 0) == approx(0.5, rel=0.01)
    assert hist.probability_value_satisfies(lambda x: x < 1) == approx(
        stats.norm.cdf(1), rel=0.01
    )
+
+
@given(
    a=st.floats(min_value=1e-6, max_value=1),
    b=st.floats(min_value=1e-6, max_value=1),
)
@example(a=1, b=1e-5)
def test_mixture(a, b):
    """A three-way normal mixture's mean is the weighted component mean."""
    total = a + b
    if total > 1:
        # Renormalize so the weights form a valid probability vector
        a /= total
        b /= total
    c = max(0, 1 - a - b)  # do max to fix floating point rounding
    components = [
        NormalDistribution(mean=0, sd=5),
        NormalDistribution(mean=5, sd=3),
        NormalDistribution(mean=-1, sd=1),
    ]
    hist = numeric(MixtureDistribution(components, [a, b, c]), bin_sizing="uniform")
    expected = a * components[0].mean + b * components[1].mean + c * components[2].mean
    assert hist.est_mean() == approx(expected, rel=1e-4)
    assert hist.values[0] < 0
+
+
def test_disjoint_mixture():
    """Mixing a lognormal with its negation keeps both signs represented."""
    dist = LognormalDistribution(norm_mean=0, norm_sd=1)
    positive = numeric(dist)
    negative = -numeric(dist)
    mixed = NumericDistribution.mixture([positive, negative], [0.97, 0.03], warn=False)
    assert mixed.est_mean() == approx(0.94 * dist.lognorm_mean, rel=0.001)
    # Smallest values come from the negated component, largest from the original
    assert mixed.values[0] < 0
    assert mixed.values[1] < 0
    assert mixed.values[-1] > 0
    # 3% of the EV contribution should sit below zero
    assert mixed.contribution_to_ev(0) == approx(0.03, rel=0.1)
+
+
def test_mixture_distributivity():
    """Multiplication distributes over mixtures: (mix of A,B) * C equals
    mix of (A*C, B*C)."""
    base = LognormalDistribution(norm_mean=0, norm_sd=1)
    weights = [0.03, 0.97]
    lhs = numeric(mixture([-base, base], weights)) * numeric(base)
    rhs = numeric(mixture([-base * base, base * base], weights))

    assert lhs.exact_mean == approx(rhs.exact_mean, rel=1e-5)
    assert lhs.exact_sd == approx(rhs.exact_sd, rel=1e-5)
    assert lhs.est_mean() == approx(rhs.est_mean(), rel=1e-5)
    assert lhs.est_sd() == approx(rhs.est_sd(), rel=1e-2)
    assert lhs.ppf(0.5) == approx(rhs.ppf(0.5), rel=1e-3, abs=2e-3)
+
+
@given(lclip=st.integers(-4, 4), width=st.integers(1, 4))
@example(lclip=-1, width=2)
@example(lclip=4, width=1)
def test_numeric_clip(lclip, width):
    """clip() matches scipy's truncnorm, both alone and after a sum."""
    rclip = lclip + width
    full = numeric(
        NormalDistribution(mean=0, sd=1), num_bins=200, bin_sizing="uniform", warn=False
    )
    clipped = full.clip(lclip, rclip)
    assert clipped.est_mean() == approx(stats.truncnorm.mean(lclip, rclip), rel=0.001)
    # Adding the unclipped hist should simply add the unclipped mean
    combined = clipped + full
    assert combined.est_mean() == approx(
        stats.truncnorm.mean(lclip, rclip) + stats.norm.mean(), rel=0.001
    )
+
+
@given(
    a=st.sampled_from([0.2, 0.3, 0.5, 0.7, 0.8]),
    lclip=st.sampled_from([-1, 1, None]),
    clip_width=st.sampled_from([2, 3, None]),
    bin_sizing=st.sampled_from(["uniform", "ev"]),
    # Only clip inner or outer dist b/c clipping both makes it hard to
    # calculate what the mean should be
    clip_inner=st.booleans(),
)
@example(a=0.7, lclip=1, clip_width=3, bin_sizing="ev", clip_inner=False)
def test_sum2_clipped(a, lclip, clip_width, bin_sizing, clip_inner):
    """Weighted sum of two normals with clipping applied either to one
    summand (inner) or to the sum (outer) matches a truncnorm-based mean."""
    # Clipped NumericDist accuracy really benefits from more bins. It's not
    # very accurate with 100 bins because a clipped histogram might end up with
    # only 10 bins or so.
    num_bins = 500 if not clip_inner and bin_sizing == "uniform" else 200
    clip_outer = not clip_inner
    b = max(0, 1 - a)  # do max to fix floating point rounding
    # No clip bounds drawn -> clip at +/- infinity (i.e. effectively unclipped)
    rclip = lclip + clip_width if lclip is not None and clip_width is not None else np.inf
    if lclip is None:
        lclip = -np.inf
    dist1 = NormalDistribution(
        mean=0,
        sd=1,
        lclip=lclip if clip_inner else None,
        rclip=rclip if clip_inner else None,
    )
    dist2 = NormalDistribution(mean=1, sd=2)
    hist = a * numeric(dist1, num_bins, bin_sizing, warn=False) + b * numeric(
        dist2, num_bins, bin_sizing, warn=False
    )
    if clip_outer:
        hist = hist.clip(lclip, rclip)
    if clip_inner:
        # Truncating then adding is more accurate than adding then truncating,
        # which is good because truncate-then-add is the more typical use case
        true_mean = a * stats.truncnorm.mean(lclip, rclip, 0, 1) + b * dist2.mean
        tolerance = 0.01
    else:
        # Outer clip: the sum of independent normals is normal, so clip the
        # combined normal with z-scores computed from the mixed mean/sd
        mixed_mean = a * dist1.mean + b * dist2.mean
        mixed_sd = np.sqrt(a**2 * dist1.sd**2 + b**2 * dist2.sd**2)
        lclip_zscore = (lclip - mixed_mean) / mixed_sd
        rclip_zscore = (rclip - mixed_mean) / mixed_sd

        true_mean = stats.truncnorm.mean(
            lclip_zscore,
            rclip_zscore,
            mixed_mean,
            mixed_sd,
        )
        tolerance = 0.01

    assert hist.est_mean() == approx(true_mean, rel=tolerance, abs=tolerance / 10)
+
+
@given(
    a=st.floats(min_value=1e-3, max_value=1),
    b=st.floats(min_value=1e-3, max_value=1),
    lclip=st.sampled_from([-1, 1, None]),
    clip_width=st.sampled_from([1, 3, None]),
    bin_sizing=st.sampled_from(["uniform", "ev"]),
    # Only clip inner or outer dist b/c clipping both makes it hard to
    # calculate what the mean should be
    clip_inner=st.booleans(),
)
@example(a=0.0625, b=0.015625, lclip=1, clip_width=1, bin_sizing="ev", clip_inner=False)
@example(a=0.0625, b=0.015625, lclip=-2, clip_width=1, bin_sizing="ev", clip_inner=False)
def test_sum3_clipped(a, b, lclip, clip_width, bin_sizing, clip_inner):
    """Weighted sum of three normals with clipping on one summand (inner)
    or on the whole symbolic sum (outer) matches a truncnorm-based mean."""
    # Clipped sum accuracy really benefits from more bins.
    num_bins = 500 if not clip_inner else 100
    clip_outer = not clip_inner
    if a + b > 1:
        # Renormalize so the weights form a valid probability vector
        scale = a + b
        a /= scale
        b /= scale
    c = max(0, 1 - a - b)  # do max to fix floating point rounding
    # No clip bounds drawn -> clip at +/- infinity (i.e. effectively unclipped)
    rclip = lclip + clip_width if lclip is not None and clip_width is not None else np.inf
    if lclip is None:
        lclip = -np.inf
    dist1 = NormalDistribution(
        mean=0,
        sd=1,
        lclip=lclip if clip_inner else None,
        rclip=rclip if clip_inner else None,
    )
    dist2 = NormalDistribution(mean=1, sd=2)
    dist3 = NormalDistribution(mean=-1, sd=0.75)
    dist_sum = a * dist1 + b * dist2 + c * dist3
    if clip_outer:
        # Clip the symbolic sum itself rather than any one component
        dist_sum.lclip = lclip
        dist_sum.rclip = rclip

    hist = numeric(dist_sum, num_bins=num_bins, bin_sizing=bin_sizing, warn=False)
    if clip_inner:
        true_mean = a * stats.truncnorm.mean(lclip, rclip, 0, 1) + b * dist2.mean + c * dist3.mean
        tolerance = 0.01
    else:
        # Outer clip: the weighted sum of independent normals is normal, so
        # clip the combined normal with z-scores from the mixed mean/sd
        mixed_mean = a * dist1.mean + b * dist2.mean + c * dist3.mean
        mixed_sd = np.sqrt(
            a**2 * dist1.sd**2 + b**2 * dist2.sd**2 + c**2 * dist3.sd**2
        )
        lclip_zscore = (lclip - mixed_mean) / mixed_sd
        rclip_zscore = (rclip - mixed_mean) / mixed_sd
        true_mean = stats.truncnorm.mean(
            lclip_zscore,
            rclip_zscore,
            mixed_mean,
            mixed_sd,
        )
        tolerance = 0.05
    assert hist.est_mean() == approx(true_mean, rel=tolerance, abs=tolerance / 10)
+
+
def test_sum_with_zeros():
    """condition_on_success(0.75) scales the mean by 0.75, and sums with a
    zero-inflated histogram still produce the right mean."""
    plain = numeric(NormalDistribution(mean=3, sd=1))
    inflated = numeric(NormalDistribution(mean=2, sd=1)).condition_on_success(0.75)
    assert inflated.exact_mean == approx(1.5)
    assert inflated.est_mean() == approx(1.5, rel=1e-5)
    assert inflated.est_sd() == approx(inflated.exact_sd, rel=1e-3)
    combined = plain + inflated
    assert combined.exact_mean == approx(4.5)
    assert combined.est_mean() == approx(4.5, rel=1e-5)
+
+
def test_product_with_zeros():
    """Products of zero-inflated lognormals multiply both the lognormal
    parts and the success probabilities (2/3 * 1/2 = 1/3)."""
    dist1 = LognormalDistribution(norm_mean=1, norm_sd=1)
    dist2 = LognormalDistribution(norm_mean=2, norm_sd=1)
    hist1 = numeric(dist1).condition_on_success(2 / 3)
    hist2 = numeric(dist2).condition_on_success(0.5)
    assert hist2.exact_mean == approx(dist2.lognorm_mean / 2)
    assert hist2.est_mean() == approx(dist2.lognorm_mean / 2, rel=1e-5)
    product = hist1 * hist2
    expected = LognormalDistribution(norm_mean=3, norm_sd=np.sqrt(2))
    assert product.exact_mean == approx(expected.lognorm_mean / 3)
    assert product.est_mean() == approx(expected.lognorm_mean / 3, rel=1e-5)
+
+
def test_shift_with_zeros():
    """Shifting a zero-inflated histogram moves the zero mass to the shift
    value and adjusts the mean accordingly."""
    base = numeric(NormalDistribution(mean=1, sd=1), warn=False)
    inflated = base.condition_on_success(0.5)
    shifted = inflated + 2
    assert shifted.exact_mean == approx(2.5)
    assert shifted.est_mean() == approx(2.5, rel=1e-5)
    # The former zero mass (probability 0.5) now sits at exactly 2
    assert shifted.masses[np.searchsorted(shifted.values, 2)] == approx(0.5)
    assert shifted.est_sd() == approx(inflated.est_sd(), rel=1e-3)
    assert shifted.est_sd() == approx(shifted.exact_sd, rel=1e-3)
+
+
def test_abs_with_zeros():
    """abs of a zero-inflated normal halves the folded-normal mean."""
    base = numeric(NormalDistribution(mean=1, sd=2), warn=False)
    folded = abs(base.condition_on_success(0.5))
    folded_mean = stats.foldnorm.mean(1 / 2, loc=0, scale=2)
    assert folded.est_mean() == approx(0.5 * folded_mean, rel=0.001)
+
+
def test_condition_on_success():
    """Conditioning on a distribution-valued event multiplies the mean by
    the event's mean."""
    value = numeric(NormalDistribution(mean=4, sd=2))
    event_dist = LognormalDistribution(norm_mean=-1, norm_sd=1)
    outcome = value.condition_on_success(numeric(event_dist))
    assert outcome.exact_mean == approx(value.exact_mean * event_dist.lognorm_mean)
+
+
def test_probability_value_satisfies_with_zeros():
    """With half the mass at 0, predicates split the remaining symmetric
    normal mass 25/25 around zero."""
    hist = numeric(NormalDistribution(mean=0, sd=1)).condition_on_success(0.5)
    assert hist.probability_value_satisfies(lambda x: x == 0) == approx(0.5)
    assert hist.probability_value_satisfies(lambda x: x != 0) == approx(0.5)
    assert hist.probability_value_satisfies(lambda x: x > 0) == approx(0.25)
    assert hist.probability_value_satisfies(lambda x: x >= 0) == approx(0.75)
+
+
def test_quantile_with_zeros():
    """Quantiles of a zero-inflated normal: the surviving 25% of mass keeps
    its shape (so quantile q maps to the original quantile 4q), while the
    middle ~75% of quantiles all return exactly 0."""
    mean = 1
    sd = 1
    dist = NormalDistribution(mean=mean, sd=sd)
    hist = numeric(dist, bin_sizing="uniform", warn=False).condition_on_success(0.25)

    tolerance = 0.01

    # When we scale down by 4x, the quantile that used to be at q is now at 4q
    assert hist.quantile(0.025) == approx(stats.norm.ppf(0.1, mean, sd), rel=tolerance)
    assert hist.quantile(stats.norm.cdf(-0.01, mean, sd) / 4) == approx(
        -0.01, rel=tolerance, abs=1e-3
    )

    # The values in the ~middle 75% equal 0
    assert hist.quantile(stats.norm.cdf(0.01, mean, sd) / 4) == 0
    assert hist.quantile(0.4 + stats.norm.cdf(0.01, mean, sd) / 4) == 0

    # The values above 0 work like the values below 0
    assert hist.quantile(0.75 + stats.norm.cdf(0.01, mean, sd) / 4) == approx(
        0.01, rel=tolerance, abs=1e-3
    )
    # quantile() also accepts a list and returns a list
    assert hist.quantile([0.99]) == approx([stats.norm.ppf(0.96, mean, sd)], rel=tolerance)
+
+
@given(
    a=st.floats(min_value=-100, max_value=100),
    b=st.floats(min_value=-100, max_value=100),
)
@settings(max_examples=10)
def test_uniform_basic(a, b):
    """A numeric uniform reproduces the closed-form uniform mean and sd."""
    a, b = fix_ordering(a, b)
    dist = UniformDistribution(x=a, y=b)
    with warnings.catch_warnings():
        # hypothesis generates some extremely tiny input params, which
        # generates warnings about EV contributions being 0.
        warnings.simplefilter("ignore")
        hist = numeric(dist)
    assert hist.est_mean() == approx((a + b) / 2, 1e-6)
    assert hist.est_sd() == approx(np.sqrt(1 / 12 * (b - a) ** 2), rel=1e-3)
+
+
def test_uniform_sum_basic():
    """Repeated sums of U(0, 1) follow the Irwin-Hall distribution:
    https://en.wikipedia.org/wiki/Irwin%E2%80%93Hall_distribution"""
    addend = numeric(UniformDistribution(0, 1))
    running = numeric(UniformDistribution(0, 1))

    # Each extra addend contributes 0.5 to the mean and 1/12 to the variance
    running += addend
    assert running.exact_mean == approx(1)
    assert running.exact_sd == approx(np.sqrt(2 / 12))
    assert running.est_mean() == approx(1)
    assert running.est_sd() == approx(np.sqrt(2 / 12), rel=0.005)
    running += addend
    assert running.est_mean() == approx(1.5)
    assert running.est_sd() == approx(np.sqrt(3 / 12), rel=0.005)
    running += addend
    assert running.est_mean() == approx(2)
    assert running.est_sd() == approx(np.sqrt(4 / 12), rel=0.005)
+
+
@given(
    # I originally had both dists on [-1000, 1000] but then hypothesis would
    # generate ~90% of cases with extremely tiny values that are too small for
    # floating point operations to handle, so I forced most of the values to be
    # at least a little away from 0.
    a1=st.floats(min_value=-1000, max_value=0.001),
    b1=st.floats(min_value=0.001, max_value=1000),
    a2=st.floats(min_value=0, max_value=1000),
    b2=st.floats(min_value=1, max_value=10000),
    flip2=st.booleans(),
)
def test_uniform_sum(a1, b1, a2, b2, flip2):
    """Sums of two numeric uniforms match their exact mean and sd."""
    if flip2:
        a2, b2 = -b2, -a2
    a1, b1 = fix_ordering(a1, b1)
    a2, b2 = fix_ordering(a2, b2)
    with warnings.catch_warnings():
        # hypothesis generates some extremely tiny input params, which
        # generates warnings about EV contributions being 0.
        warnings.simplefilter("ignore")
        hist1 = numeric(UniformDistribution(x=a1, y=b1))
        hist2 = numeric(UniformDistribution(x=a2, y=b2))

    total = hist1 + hist2
    assert total.est_mean() == approx(total.exact_mean)
    assert total.est_sd() == approx(total.exact_sd, rel=0.01)
+
+
@given(
    a1=st.floats(min_value=-1000, max_value=0.001),
    b1=st.floats(min_value=0.001, max_value=1000),
    a2=st.floats(min_value=0, max_value=1000),
    b2=st.floats(min_value=1, max_value=10000),
    flip2=st.booleans(),
)
@settings(max_examples=10)
def test_uniform_prod(a1, b1, a2, b2, flip2):
    """Products of two numeric uniforms match their exact mean and sd."""
    if flip2:
        a2, b2 = -b2, -a2
    a1, b1 = fix_ordering(a1, b1)
    a2, b2 = fix_ordering(a2, b2)
    with warnings.catch_warnings():
        # Suppress warnings from extreme hypothesis-generated params
        warnings.simplefilter("ignore")
        hist1 = numeric(UniformDistribution(x=a1, y=b1))
        hist2 = numeric(UniformDistribution(x=a2, y=b2))
        product = hist1 * hist2
    assert product.est_mean() == approx(product.exact_mean, abs=1e-6, rel=1e-6)
    assert product.est_sd() == approx(product.exact_sd, rel=0.01)
+
+
@given(
    a=st.floats(min_value=-1000, max_value=0.001),
    b=st.floats(min_value=0.001, max_value=1000),
    norm_mean=st.floats(np.log(0.001), np.log(1e6)),
    norm_sd=st.floats(0.1, 2),
)
@settings(max_examples=10)
def test_uniform_lognorm_prod(a, b, norm_mean, norm_sd):
    """A uniform times a lognormal matches the product's exact moments."""
    a, b = fix_ordering(a, b)
    uniform_hist = numeric(UniformDistribution(x=a, y=b))
    lognorm_hist = numeric(
        LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd), warn=False
    )
    product = uniform_hist * lognorm_hist
    assert product.est_mean() == approx(product.exact_mean, rel=1e-7, abs=1e-7)
    assert product.est_sd() == approx(product.exact_sd, rel=0.1)
+
+
@given(
    a=st.floats(min_value=0.5, max_value=100),
    b=st.floats(min_value=0.5, max_value=100),
)
@settings(max_examples=10)
def test_beta_basic(a, b):
    """A numeric beta reproduces the closed-form beta mean and sd."""
    hist = numeric(BetaDistribution(a, b))
    expected_mean = a / (a + b)
    expected_sd = np.sqrt(a * b / ((a + b) ** 2 * (a + b + 1)))
    assert hist.exact_mean == approx(expected_mean)
    assert hist.exact_sd == approx(expected_sd)
    assert hist.est_mean() == approx(hist.exact_mean)
    assert hist.est_sd() == approx(hist.exact_sd, rel=0.02)
+
+
@given(
    a=st.floats(min_value=1, max_value=100),
    b=st.floats(min_value=1, max_value=100),
    mean=st.floats(-10, 10),
    sd=st.floats(0.1, 10),
)
@settings(max_examples=10)
def test_beta_sum(a, b, mean, sd):
    """A beta plus a normal matches the sum's exact moments."""
    beta_hist = numeric(BetaDistribution(a=a, b=b))
    norm_hist = numeric(NormalDistribution(mean=mean, sd=sd))
    total = beta_hist + norm_hist
    assert total.est_mean() == approx(total.exact_mean, rel=1e-7, abs=1e-7)
    assert total.est_sd() == approx(total.exact_sd, rel=0.01)
+
+
@given(
    a=st.floats(min_value=1, max_value=100),
    b=st.floats(min_value=1, max_value=100),
    norm_mean=st.floats(-10, 10),
    norm_sd=st.floats(0.1, 1),
)
@settings(max_examples=10)
def test_beta_prod(a, b, norm_mean, norm_sd):
    """A beta times a lognormal matches the product's exact moments."""
    beta_hist = numeric(BetaDistribution(a=a, b=b))
    lognorm_hist = numeric(LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd))
    product = beta_hist * lognorm_hist
    assert product.est_mean() == approx(product.exact_mean, rel=1e-7, abs=1e-7)
    assert product.est_sd() == approx(product.exact_sd, rel=0.02)
+
+
@given(
    left=st.floats(min_value=-100, max_value=100),
    dist_to_mode=st.floats(min_value=0.001, max_value=100),
    dist_to_right=st.floats(min_value=0.001, max_value=100),
)
@settings(max_examples=10)
def test_pert_basic(left, dist_to_mode, dist_to_right):
    """A numeric PERT reproduces the closed-form PERT mean and sd."""
    mode = left + dist_to_mode
    right = mode + dist_to_right
    hist = numeric(PERTDistribution(left=left, right=right, mode=mode))
    # Standard PERT (lambda=4): mean = (left + 4*mode + right) / 6
    assert hist.exact_mean == approx((left + 4 * mode + right) / 6)
    assert hist.est_mean() == approx(hist.exact_mean)
    assert hist.exact_sd == approx(
        (np.sqrt((hist.exact_mean - left) * (right - hist.exact_mean) / 7))
    )
    assert hist.est_sd() == approx(hist.exact_sd, rel=0.001)
+
+
@given(
    left=st.floats(min_value=-100, max_value=100),
    dist_to_mode=st.floats(min_value=0.001, max_value=100),
    dist_to_right=st.floats(min_value=0.001, max_value=100),
    lam=st.floats(min_value=1, max_value=10),
)
@settings(max_examples=10)
def test_pert_with_lambda(left, dist_to_mode, dist_to_right, lam):
    """PERT with a custom lambda: mean follows the closed form, and the sd
    shrinks as lambda grows (compared against the lambda=4 sd)."""
    mode = left + dist_to_mode
    right = mode + dist_to_right
    hist = numeric(PERTDistribution(left=left, right=right, mode=mode, lam=lam))
    expected_mean = (left + lam * mode + right) / (lam + 2)
    sd_at_lam4 = np.sqrt((hist.exact_mean - left) * (right - hist.exact_mean) / 7)
    assert hist.exact_mean == approx(expected_mean)
    assert hist.est_mean() == approx(expected_mean)
    # Larger lambda concentrates mass around the mode, reducing the sd
    if lam < 3.9:
        assert hist.est_sd() > sd_at_lam4
    elif lam > 4.1:
        assert hist.est_sd() < sd_at_lam4
+
+
@given(
    mode=st.floats(min_value=0.01, max_value=0.99),
)
@settings(max_examples=10)
def test_pert_equals_beta(mode):
    """PERT on [0, 1] equals Beta(1 + 4*mode, 1 + 4*(1 - mode))."""
    beta_hist = numeric(BetaDistribution(1 + 4 * mode, 1 + 4 * (1 - mode)))
    pert_hist = numeric(PERTDistribution(left=0, right=1, mode=mode))
    assert beta_hist.exact_mean == approx(pert_hist.exact_mean)
    assert beta_hist.exact_sd == approx(pert_hist.exact_sd)
    assert beta_hist.est_mean() == approx(pert_hist.est_mean())
    assert beta_hist.est_sd() == approx(pert_hist.est_sd(), rel=0.01)
+
+
@given(
    shape=st.floats(min_value=0.1, max_value=100),
    scale=st.floats(min_value=0.1, max_value=100),
    bin_sizing=st.sampled_from(["uniform", "ev", "mass", "fat-hybrid"]),
)
def test_gamma_basic(shape, scale, bin_sizing):
    """A numeric gamma reproduces mean = k*theta and sd = sqrt(k)*theta
    under every bin-sizing method."""
    hist = numeric(GammaDistribution(shape=shape, scale=scale), bin_sizing=bin_sizing, warn=False)
    assert hist.exact_mean == approx(shape * scale)
    assert hist.exact_sd == approx(np.sqrt(shape) * scale)
    assert hist.est_mean() == approx(hist.exact_mean)
    assert hist.est_sd() == approx(hist.exact_sd, rel=0.1)
+
+
@given(
    shape=st.floats(min_value=0.1, max_value=100),
    scale=st.floats(min_value=0.1, max_value=100),
    mean=st.floats(-10, 10),
    sd=st.floats(0.1, 10),
)
def test_gamma_sum(shape, scale, mean, sd):
    """A gamma plus a normal matches the sum's exact moments."""
    gamma_hist = numeric(GammaDistribution(shape=shape, scale=scale))
    norm_hist = numeric(NormalDistribution(mean=mean, sd=sd))
    total = gamma_hist + norm_hist
    assert total.est_mean() == approx(total.exact_mean, rel=1e-7, abs=1e-7)
    assert total.est_sd() == approx(total.exact_sd, rel=0.02, abs=0.02)
+
+
@given(
    shape=st.floats(min_value=0.1, max_value=100),
    scale=st.floats(min_value=0.1, max_value=100),
    mean=st.floats(-10, 10),
    sd=st.floats(0.1, 10),
)
def test_gamma_product(shape, scale, mean, sd):
    """A gamma times a normal matches the product's exact moments."""
    gamma_hist = numeric(GammaDistribution(shape=shape, scale=scale))
    norm_hist = numeric(NormalDistribution(mean=mean, sd=sd))
    product = gamma_hist * norm_hist
    assert product.est_mean() == approx(product.exact_mean, rel=1e-7, abs=1e-7)
    assert product.est_sd() == approx(product.exact_sd, rel=0.02)
+
+
@given(
    df=st.floats(min_value=0.1, max_value=100),
)
def test_chi_square(df):
    """A numeric chi-square has mean df and sd sqrt(2*df)."""
    hist = numeric(ChiSquareDistribution(df=df))
    assert hist.exact_mean == approx(df)
    assert hist.exact_sd == approx(np.sqrt(2 * df))
    assert hist.est_mean() == approx(hist.exact_mean)
    assert hist.est_sd() == approx(hist.exact_sd, rel=0.01)
+
+
@given(
    scale=st.floats(min_value=0.1, max_value=1e6),
)
def test_exponential_dist(scale):
    """An exponential's mean and sd both equal its scale."""
    hist = numeric(ExponentialDistribution(scale=scale))
    assert hist.exact_mean == approx(scale)
    assert hist.exact_sd == approx(scale)
    assert hist.est_mean() == approx(hist.exact_mean)
    assert hist.est_sd() == approx(hist.exact_sd, rel=0.01)
+
+
@given(
    shape=st.floats(min_value=1.1, max_value=100),
)
def test_pareto_dist(shape):
    """A numeric Pareto matches the closed-form mean; the sd is infinite
    for shape <= 2 and finite (to a shape-dependent tolerance) above."""
    hist = numeric(ParetoDistribution(shape), warn=shape >= 2)
    assert hist.exact_mean == approx(shape / (shape - 1))
    # Accuracy degrades as the tail gets fatter (shape approaches 1)
    assert hist.est_mean() == approx(hist.exact_mean, rel=0.01 / (shape - 1))
    if shape <= 2:
        assert hist.exact_sd == approx(np.inf)
    else:
        assert hist.est_sd() == approx(hist.exact_sd, rel=max(0.01, 0.1 / (shape - 2)))
+
+
@given(
    x=st.floats(min_value=-100, max_value=100),
    wrap_in_dist=st.booleans(),
)
@example(x=1, wrap_in_dist=False)
def test_constant_dist(x, wrap_in_dist):
    """Adding a constant (bare or wrapped in ConstantDistribution) shifts
    the mean and leaves the sd unchanged."""
    norm_hist = numeric(NormalDistribution(mean=1, sd=1), warn=False)
    constant = ConstantDistribution(x=x) if wrap_in_dist else x
    const_hist = numeric(constant, warn=False)
    total = norm_hist + const_hist
    assert total.exact_mean == approx(1 + x)
    assert total.est_mean() == approx(1 + x, rel=1e-6)
    assert total.exact_sd == approx(1)
    assert total.est_sd() == approx(norm_hist.est_sd(), rel=1e-3)
+
+
@given(
    p=st.floats(min_value=0.001, max_value=0.999),
)
def test_bernoulli_dist(p):
    """A Bernoulli(p) has mean p and sd sqrt(p * (1 - p))."""
    hist = numeric(BernoulliDistribution(p=p), warn=False)
    assert hist.exact_mean == approx(p)
    assert hist.est_mean() == approx(p, rel=1e-6)
    assert hist.exact_sd == approx(np.sqrt(p * (1 - p)))
    assert hist.est_sd() == approx(hist.exact_sd, rel=1e-6)
+
+
def test_complex_dist():
    """A ComplexDistribution built with operator.add sums its operands."""
    combined = ComplexDistribution(
        NormalDistribution(mean=1, sd=1),
        NormalDistribution(mean=0, sd=1),
        operator.add,
    )
    hist = numeric(combined, warn=False)
    assert hist.exact_mean == approx(1)
    assert hist.est_mean() == approx(1, rel=1e-6)
+
+
def test_complex_dist_with_float():
    """A ComplexDistribution accepts a bare float operand (here, times 2)."""
    combined = ComplexDistribution(NormalDistribution(mean=1, sd=1), 2, operator.mul)
    hist = numeric(combined, warn=False)
    assert hist.exact_mean == approx(2)
    assert hist.est_mean() == approx(2, rel=1e-6)
+
+
@given(
    mean=st.floats(min_value=-10, max_value=10),
    sd=st.floats(min_value=0.01, max_value=10),
    bin_num=st.integers(min_value=5, max_value=95),
)
@example(mean=1, sd=0.7822265625, bin_num=5)
def test_numeric_dist_contribution_to_ev(mean, sd, bin_num):
    """contribution_to_ev inverts inv_contribution_to_ev (looser tolerance
    near the tails where bins are sparse)."""
    fraction = bin_num / 100
    tolerance = 0.02 if bin_num < 10 or bin_num > 90 else 0.01
    dist = NormalDistribution(mean=mean, sd=sd)
    hist = numeric(dist, bin_sizing="uniform", num_bins=100, warn=False)
    round_trip = hist.contribution_to_ev(dist.inv_contribution_to_ev(fraction))
    assert round_trip == approx(fraction, rel=tolerance)
+
+
@given(
    norm_mean=st.floats(min_value=-np.log(1e9), max_value=np.log(1e9)),
    norm_sd=st.floats(min_value=0.001, max_value=4),
    bin_num=st.integers(min_value=2, max_value=98),
)
def test_numeric_dist_inv_contribution_to_ev(norm_mean, norm_sd, bin_num):
    """The nth stored value sits strictly between the nth and n+1th EV
    quantile edges of the exact distribution."""
    dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd)
    hist = numeric(dist, bin_sizing="ev", warn=False)
    upper = bin_num / hist.num_bins
    lower = upper - 1 / hist.num_bins
    bin_value = hist.inv_contribution_to_ev(upper)
    assert bin_value > dist.inv_contribution_to_ev(lower)
    assert bin_value < dist.inv_contribution_to_ev(upper)
+
+
@given(
    mean=st.floats(min_value=-100, max_value=100),
    sd=st.floats(min_value=0.01, max_value=100),
    percent=st.integers(min_value=0, max_value=100),
)
@example(mean=0, sd=1, percent=100)
def test_quantile_uniform(mean, sd, percent):
    """Percentiles of a uniform-binned normal match scipy's norm ppf; the
    endpoints are only checked as bounds since interpolation is undefined
    there."""
    # Quantile interpolation can sometimes give incorrect results at the
    # 0th percentile because if the first two bin edges are extremely close to
    # 0, the values can be out of order due to floating point rounding.
    assume(percent != 0 or abs(mean) / sd < 3)
    hist = numeric(
        NormalDistribution(mean=mean, sd=sd), num_bins=200, bin_sizing="uniform", warn=False
    )
    if percent == 0:
        assert hist.percentile(percent) <= hist.values[0]
    elif percent == 100:
        # Find the last index where the midpoint cumulative mass still
        # strictly increases; the 100th percentile must be at least there
        cum_mass = np.cumsum(hist.masses) - 0.5 * hist.masses
        increasing = [i for (i, step) in enumerate(np.diff(cum_mass)) if step > 0]
        assert hist.percentile(percent) >= hist.values[max(increasing)]
    else:
        expected = stats.norm.ppf(percent / 100, loc=mean, scale=sd)
        assert hist.percentile(percent) == approx(expected, rel=0.02, abs=0.02)
+
+
+# percentile() on a log-uniform-binned histogram should match the exact
+# lognormal quantile function away from the extremes.
+@given(
+ norm_mean=st.floats(min_value=-5, max_value=5),
+ norm_sd=st.floats(min_value=0.1, max_value=2),
+ percent=st.integers(min_value=1, max_value=100),
+)
+def test_quantile_log_uniform(norm_mean, norm_sd, percent):
+ dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd)
+ hist = numeric(dist, num_bins=200, bin_sizing="log-uniform", warn=False)
+ # NOTE(review): the percent strategy starts at 1, so this branch is
+ # unreachable here; it mirrors test_quantile_uniform. Either lower
+ # min_value to 0 or drop the branch — confirm intent.
+ if percent == 0:
+ assert hist.percentile(percent) <= hist.values[0]
+ elif percent == 100:
+ # 100th percentile: bound by the last value at which cumulative mass
+ # still increases (same technique as test_quantile_uniform).
+ cum_mass = np.cumsum(hist.masses) - 0.5 * hist.masses
+ nonzero_indexes = [i for (i, d) in enumerate(np.diff(cum_mass)) if d > 0]
+ last_valid_index = max(nonzero_indexes)
+ assert hist.percentile(percent) >= hist.values[last_valid_index]
+ else:
+ assert hist.percentile(percent) == approx(
+ stats.lognorm.ppf(percent / 100, norm_sd, scale=np.exp(norm_mean)), rel=0.01
+ )
+
+
+# percentile() on an EV-binned histogram should match the exact lognormal
+# quantile function for mid-to-high percentiles.
+@given(
+ norm_mean=st.floats(min_value=-5, max_value=5),
+ norm_sd=st.floats(min_value=0.1, max_value=2),
+ # Don't try smaller percentiles because the smaller bins have a lot of
+ # probability mass
+ percent=st.floats(min_value=20, max_value=99.9),
+)
+@example(norm_mean=0, norm_sd=0.5, percent=99.9)
+def test_quantile_ev(norm_mean, norm_sd, percent):
+ dist = LognormalDistribution(norm_mean=norm_mean, norm_sd=norm_sd)
+ hist = numeric(dist, num_bins=200, bin_sizing="ev", warn=False)
+ # EV bin sizing is least accurate at the low end and in the far right
+ # tail, so allow 10x the tolerance outside 70..99.
+ tolerance = 0.1 if percent < 70 or percent > 99 else 0.01
+ assert hist.percentile(percent) == approx(
+ stats.lognorm.ppf(percent / 100, norm_sd, scale=np.exp(norm_mean)), rel=tolerance
+ )
+
+
+# percentile() on a mass-binned histogram should invert the true CDF:
+# cdf(percentile(p)) ~= p.
+@given(
+ # Fix: this strategy previously had min_value=100, max_value=100, which
+ # degenerates to the single value 100 and contradicts the
+ # @example(mean=0, ...) below. Use the -100..100 range that the sibling
+ # quantile tests use so hypothesis actually varies the mean.
+ mean=st.floats(min_value=-100, max_value=100),
+ sd=st.floats(min_value=0.01, max_value=100),
+ percent=st.floats(min_value=1, max_value=99),
+)
+@example(mean=0, sd=1, percent=1)
+def test_quantile_mass(mean, sd, percent):
+ dist = NormalDistribution(mean=mean, sd=sd)
+ hist = numeric(dist, num_bins=200, bin_sizing="mass", warn=False)
+
+ # It's hard to make guarantees about how close the value will be, but we
+ # should know for sure that the cdf of the value is very close to the
+ # percent. Naive interpolation should have a maximum absolute error of 1 /
+ # num_bins.
+ assert 100 * stats.norm.cdf(hist.percentile(percent), mean, sd) == approx(percent, abs=0.1)
+
+
+# cdf() on a mass-binned histogram should closely match the true normal CDF
+# at the mean and at -1/+2 standard deviations.
+@given(
+ # Fix: this strategy previously had min_value=100, max_value=100 (a
+ # single point). Widen to -100..100, matching the sibling tests, so
+ # hypothesis varies the mean instead of only the sd.
+ mean=st.floats(min_value=-100, max_value=100),
+ sd=st.floats(min_value=0.01, max_value=100),
+)
+@example(mean=100, sd=11.97)
+def test_cdf_mass(mean, sd):
+ dist = NormalDistribution(mean=mean, sd=sd)
+ hist = numeric(dist, num_bins=200, bin_sizing="mass", warn=False)
+
+ # should definitely be accurate to within 1 / num_bins but a smart interpolator
+ # can do better
+ tolerance = 0.001
+ assert hist.cdf(mean) == approx(0.5, abs=tolerance)
+ assert hist.cdf(mean - sd) == approx(stats.norm.cdf(-1), abs=tolerance)
+ assert hist.cdf(mean + 2 * sd) == approx(stats.norm.cdf(2), abs=tolerance)
+
+
+# cdf() and percentile() should be approximate inverses on a mass-binned
+# histogram: cdf(percentile(p)) ~= p.
+@given(
+ # Fix: this strategy previously had min_value=100, max_value=100 (a
+ # single point). Widen to -100..100, matching the sibling tests, so
+ # hypothesis varies the mean instead of only sd and percent.
+ mean=st.floats(min_value=-100, max_value=100),
+ sd=st.floats(min_value=0.01, max_value=100),
+ percent=st.integers(min_value=1, max_value=99),
+)
+def test_cdf_inverts_quantile(mean, sd, percent):
+ dist = NormalDistribution(mean=mean, sd=sd)
+ hist = numeric(dist, num_bins=200, bin_sizing="mass", warn=False)
+ assert 100 * hist.cdf(hist.percentile(percent)) == approx(percent, abs=0.1)
+
+
+# Summing two mass-binned normal histograms should produce quantiles matching
+# the exact normal sum N(mean1+mean2, sqrt(sd1^2+sd2^2)).
+@given(
+ # NOTE(review): both mean strategies are pinned to the single value 100
+ # (min_value == max_value). This may be deliberate: the relative
+ # tolerance below is 0.01 * (mean1 + mean2), which degenerates to <= 0
+ # if the means could sum to zero or below — confirm before widening.
+ mean1=st.floats(min_value=100, max_value=100),
+ mean2=st.floats(min_value=100, max_value=100),
+ sd1=st.floats(min_value=0.01, max_value=100),
+ sd2=st.floats(min_value=0.01, max_value=100),
+ percent=st.integers(min_value=1, max_value=99),
+)
+@example(mean1=100, mean2=100, sd1=1, sd2=99, percent=2)
+def test_quantile_mass_after_sum(mean1, mean2, sd1, sd2, percent):
+ dist1 = NormalDistribution(mean=mean1, sd=sd1)
+ dist2 = NormalDistribution(mean=mean2, sd=sd2)
+ hist1 = numeric(dist1, num_bins=200, bin_sizing="mass", warn=False)
+ hist2 = numeric(dist2, num_bins=200, bin_sizing="mass", warn=False)
+ hist_sum = hist1 + hist2
+ # Quantile of the summed histogram vs. the analytic quantile of the sum.
+ assert hist_sum.percentile(percent) == approx(
+ stats.norm.ppf(percent / 100, mean1 + mean2, np.sqrt(sd1**2 + sd2**2)),
+ rel=0.01 * (mean1 + mean2),
+ )
+ # Cross-check via the CDF using the histogram's own exact moments.
+ assert 100 * stats.norm.cdf(
+ hist_sum.percentile(percent), hist_sum.exact_mean, hist_sum.exact_sd
+ ) == approx(percent, abs=0.25)
+
+
+# With np.random.uniform mocked to always return 0.5, sample() is
+# deterministic and should land near the distribution mean (the median of a
+# normal equals its mean).
+@patch.object(np.random, "uniform", Mock(return_value=0.5))
+@given(
+ mean=st.floats(min_value=-10, max_value=10),
+ bin_sizing=st.sampled_from(["uniform", "ev", "mass"]),
+)
+def test_sample(mean, bin_sizing):
+ dist = NormalDistribution(mean=mean, sd=1)
+ hist = numeric(dist, bin_sizing=bin_sizing)
+ # "uniform" bin sizing is the most accurate near the median, so it gets
+ # the tighter tolerance.
+ tol = 0.001 if bin_sizing == "uniform" else 0.01
+ assert hist.sample() == approx(mean, rel=tol)
+
+
+def test_utils_get_percentiles_basic():
+ # utils.get_percentiles on a histogram should agree with the histogram's
+ # own percentile() for scalar, list, and ndarray percentile arguments.
+ dist = NormalDistribution(mean=0, sd=1)
+ hist = numeric(dist, warn=False)
+ assert utils.get_percentiles(hist, 1) == hist.percentile(1)
+ assert utils.get_percentiles(hist, [5]) == hist.percentile([5])
+ assert all(utils.get_percentiles(hist, np.array([10, 20])) == hist.percentile([10, 20]))
+
+
+def test_bump_indexes():
+ # From these examples, _bump_indexes(indexes, n) appears to turn a sorted
+ # list with duplicate indexes into distinct indexes in [0, n), shifting
+ # duplicates to neighboring slots — TODO confirm against the
+ # implementation's docstring.
+ assert _bump_indexes([2, 2, 2, 2], 4) == [0, 1, 2, 3]
+ assert _bump_indexes([2, 2, 2, 2], 6) == [2, 3, 4, 5]
+ assert _bump_indexes([2, 2, 2, 2], 8) == [2, 3, 4, 5]
+ assert _bump_indexes([1, 2, 2, 5, 7, 7, 8, 8, 8], 9) == list(range(9))
+ assert _bump_indexes([0, 0, 0, 6], 8) == [0, 1, 2, 6]
+ assert _bump_indexes([0, 0, 0, 9, 9, 9], 11) == [0, 1, 2, 8, 9, 10]
+
+
+def test_plot():
+ # Disabled: the bare return below skips the body, which is a manual
+ # smoke test that renders a plot and has no automatic assertion.
+ # NOTE(review): consider a pytest skip marker instead of a bare return so
+ # the disabled state shows up in test reports.
+ return None
+ hist = numeric(LognormalDistribution(norm_mean=0, norm_sd=1)) * numeric(
+ NormalDistribution(mean=0, sd=5)
+ )
+ # hist = numeric(LognormalDistribution(norm_mean=0, norm_sd=2))
+ hist.plot(scale="linear")
+
+
+def test_performance():
+ # Disabled: the bare return below skips the body, which is a manual
+ # benchmark (cProfile over 5000 histogram multiplications), not a unit
+ # test. NOTE(review): consider a pytest skip marker instead of a bare
+ # return so the disabled state shows up in test reports.
+ return None
+ # Note: I wrote some C++ code to approximate the behavior of distribution
+ # multiplication. On my machine, distribution multiplication (with profile
+ # = False) runs in 15s, and the equivalent C++ code (with -O3) runs in 11s.
+ # The C++ code is not well-optimized, the most glaring issue being it uses
+ # std::sort instead of something like argpartition (the trouble is that
+ # numpy's argpartition can partition on many values simultaneously, whereas
+ # C++'s std::partition can only partition on one value at a time, which is
+ # far slower).
+ dist1 = LognormalDistribution(norm_mean=0, norm_sd=1)
+ dist2 = LognormalDistribution(norm_mean=1, norm_sd=0.5)
+
+ import cProfile
+ import pstats
+ import io
+
+ pr = cProfile.Profile()
+ pr.enable()
+
+ # Repeatedly build and multiply two 200-bin histograms under the profiler.
+ for i in range(5000):
+ hist1 = numeric(dist1, num_bins=200)
+ hist2 = numeric(dist2, num_bins=200)
+ hist1 = hist1 * hist2
+
+ pr.disable()
+ s = io.StringIO()
+ sortby = "cumulative"
+ ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
+ ps.print_stats()
+ print(s.getvalue())