From 37720963ed4d7829482ec63b590c0ed782088de0 Mon Sep 17 00:00:00 2001
From: Fritz Obermeyer <fritz.obermeyer@gmail.com>
Date: Thu, 24 Aug 2017 20:39:08 -0700
Subject: [PATCH 1/3] Sketch Table class for heterogeneous data

---
 treecat/table.py | 100 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 treecat/table.py

diff --git a/treecat/table.py b/treecat/table.py
new file mode 100644
index 0000000..2b06485
--- /dev/null
+++ b/treecat/table.py
@@ -0,0 +1,100 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from treecat.util import jit
+
+
+@jit
+def jit_add_categorical_data(ragged_index, data, n, v, value):
+    """Update multinomial sufficient statistics."""
+    data[n, ragged_index[v] + value] += 1
+
+
+@jit
+def jit_add_real_data(data, n, v, value):
+    """Update real sufficient statistics."""
+    # Adapted from https://www.johndcook.com/blog/standard_deviation
+    ss = data[n, v, 1:4]
+    if ss[0] == 0:
+        ss[0] = 1.0
+        ss[1] = value
+        ss[2] = 0.0
+    else:
+        ss[0] += 1.0
+        old_diff = value - ss[1]
+        ss[1] += old_diff / ss[0]
+        new_diff = value - ss[1]
+        ss[2] += old_diff * new_diff
+
+
+class Table(object):
+    """Table of row-oriented heterogeneous repeated data."""
+
+    def __init__(self, schema, rows):
+        self._schema = dict(schema)
+        N = len(rows)
+
+        # Collect categorical features and values.
+        categorical_features = sorted(k for k, v in schema.items()
+                                      if v == 'categorical')
+        categorical_values = [set() for _ in categorical_features]
+        for row in rows:
+            for name, values in zip(categorical_features, categorical_values):
+                try:
+                    # TODO Handle repeated values and ordinals.
+                    value = row[name]
+                except KeyError:
+                    continue
+                values.add(value)
+        categorical_values = [sorted(v) for v in categorical_values]
+        categorical_index = [{v: i
+                              for i, v in enumerate(vs)}
+                             for vs in categorical_values]
+        self._categorical_features = categorical_features
+        self._categorical_values = categorical_values
+        self._categorical_index = categorical_index
+
+        # Create a ragged array of categorical sufficient statistics.
+        ragged_index = np.zeros(len(categorical_features) + 1, dtype=np.int32)
+        for v, name in enumerate(categorical_features):
+            dim = len(categorical_values[v])
+            assert dim < 128
+            ragged_index[1 + v] = ragged_index[v] + dim
+        data = np.zeros((N, ragged_index[-1]), dtype=np.int8)
+        for n, row in enumerate(rows):
+            for v, name in enumerate(categorical_features):
+                try:
+                    value = row[name]
+                except KeyError:
+                    continue
+                int_value = categorical_index[v][value]
+                jit_add_categorical_data(ragged_index, data, n, v, int_value)
+        self._categorical_ragged_index = ragged_index
+        self._categorical_data = data
+
+        # Collect real-valued features and create data.
+        real_features = sorted(k for k, v in schema.items() if v == 'real')
+        data = np.zeros((N, len(real_features), 3), dtype=np.float32)
+        for n, row in enumerate(rows):
+            for v, name in enumerate(real_features):
+                try:
+                    value = row[name]
+                except KeyError:
+                    continue
+                jit_add_real_data(data, n, v, value)
+        self._real_data = data
+
+    @property
+    def categorical_ragged_index(self):
+        return self._categorical_ragged_index
+
+    @property
+    def categorical_data(self):
+        return self._categorical_data
+
+    @property
+    def real_data(self):
+        return self._real_data

From 3b6fc86fd80324575b269518c508d5711a609d23 Mon Sep 17 00:00:00 2001
From: Fritz Obermeyer <fritz.obermeyer@gmail.com>
Date: Sat, 2 Sep 2017 09:49:21 -0700
Subject: [PATCH 2/3] Sketch Schema class for heterogeneous data

---
 treecat/table.py | 88 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 68 insertions(+), 20 deletions(-)

diff --git a/treecat/table.py b/treecat/table.py
index 2b06485..e1ac8a1 100644
--- a/treecat/table.py
+++ b/treecat/table.py
@@ -7,34 +7,39 @@
 from treecat.util import jit
 
 
-@jit
-def jit_add_categorical_data(ragged_index, data, n, v, value):
-    """Update multinomial sufficient statistics."""
-    data[n, ragged_index[v] + value] += 1
+class Schema(object):
+    """Schema describing data types of multivariate heterogeneous data."""
 
+    dtypes = ('multinomial', 'normal')
+    dtype_index = {dtypes.index(dtype) for dtype in dtypes}
 
-@jit
-def jit_add_real_data(data, n, v, value):
-    """Update real sufficient statistics."""
-    # Adapted from https://www.johndcook.com/blog/standard_deviation
-    ss = data[n, v, 1:4]
-    if ss[0] == 0:
-        ss[0] = 1.0
-        ss[1] = value
-        ss[2] = 0.0
-    else:
-        ss[0] += 1.0
-        old_diff = value - ss[1]
-        ss[1] += old_diff / ss[0]
-        new_diff = value - ss[1]
-        ss[2] += old_diff * new_diff
+    def __init__(self):
+        self._multinomial_features = []  # list(string).
+        self._multinomial_values = {}  # dict(string, list(string)).
+        self._multinomial_value_index = {}  # dict(string, dict(string, int)).
+        self._normal_features = []  # list(string).
+
+    def add_multinomial_feature(self, name):
+        assert name not in self._multinomial_features
+        self._multinomial_features.append(name)
+
+    def add_multinomial_value(self, name, value):
+        values = self._multinomial_values[name]
+        value_index = self._multinomial_value_index[name]
+        if value not in value_index:
+            value_index[value] = len(values)
+            values.append(value)
+
+    def add_normal_feature(self, name):
+        assert name not in self._normal_features
+        self._normal_features.append(name)
 
 
 class Table(object):
     """Table of row-oriented heterogeneous repeated data."""
 
     def __init__(self, schema, rows):
-        self._schema = dict(schema)
+        self._schema = dict(schema)  # TODO Use a Schema object.
         N = len(rows)
 
         # Collect categorical features and values.
@@ -87,6 +92,10 @@ def __init__(self, schema, rows):
                 jit_add_real_data(data, n, v, value)
         self._real_data = data
 
+    @property
+    def schema(self):
+        return self._schema
+
     @property
     def categorical_ragged_index(self):
         return self._categorical_ragged_index
@@ -98,3 +107,42 @@ def categorical_data(self):
     @property
     def real_data(self):
         return self._real_data
+
+    def __add__(self, other):
+        """Combine observations of two row-aligned datasets.
+
+        This adds data in the sense of repeated observation of each row.
+
+        See also `Table.cat()` for combining different rows
+        and `Table.join` for combining different columns.
+
+        Args:
+            other (Table): Another table with the same schema and aligned rows.
+
+        Returns:
+            Table: A table with the same schema and same number of rows.
+        """
+        raise NotImplementedError
+
+
+@jit
+def jit_add_categorical_data(ragged_index, data, n, v, value):
+    """Update multinomial sufficient statistics."""
+    data[n, ragged_index[v] + value] += 1
+
+
+@jit
+def jit_add_real_data(data, n, v, value):
+    """Update real sufficient statistics."""
+    # Adapted from https://www.johndcook.com/blog/standard_deviation
+    ss = data[n, v, 0:3]
+    if ss[0] == 0:
+        ss[0] = 1.0
+        ss[1] = value
+        ss[2] = 0.0
+    else:
+        ss[0] += 1.0
+        old_diff = value - ss[1]
+        ss[1] += old_diff / ss[0]
+        new_diff = value - ss[1]
+        ss[2] += old_diff * new_diff

From 73fb674dfc0f596d167e02cab45273f90c024c2b Mon Sep 17 00:00:00 2001
From: Fritz Obermeyer <fritz.obermeyer@gmail.com>
Date: Sat, 2 Sep 2017 15:51:24 -0700
Subject: [PATCH 3/3] Sketch ValueTable, JoinStats, and EdgeStats classes

---
 treecat/table.py  | 148 ------------------------
 treecat/tables.py | 281 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 281 insertions(+), 148 deletions(-)
 delete mode 100644 treecat/table.py
 create mode 100644 treecat/tables.py

diff --git a/treecat/table.py b/treecat/table.py
deleted file mode 100644
index e1ac8a1..0000000
--- a/treecat/table.py
+++ /dev/null
@@ -1,148 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from treecat.util import jit
-
-
-class Schema(object):
-    """Schema describing data types of multivariate heterogeneous data."""
-
-    dtypes = ('multinomial', 'normal')
-    dtype_index = {dtypes.index(dtype) for dtype in dtypes}
-
-    def __init__(self):
-        self._multinomial_features = []  # list(string).
-        self._multinomial_values = {}  # dict(string, list(string)).
-        self._multinomial_value_index = {}  # dict(string, dict(string, int)).
-        self._normal_features = []  # list(string).
-
-    def add_multinomial_feature(self, name):
-        assert name not in self._multinomial_features
-        self._multinomial_features.append(name)
-
-    def add_multinomial_value(self, name, value):
-        values = self._multinomial_values[name]
-        value_index = self._multinomial_value_index[name]
-        if value not in value_index:
-            value_index[value] = len(values)
-            values.append(value)
-
-    def add_normal_feature(self, name):
-        assert name not in self._normal_features
-        self._normal_features.append(name)
-
-
-class Table(object):
-    """Table of row-oriented heterogeneous repeated data."""
-
-    def __init__(self, schema, rows):
-        self._schema = dict(schema)  # TODO Use a Schema object.
-        N = len(rows)
-
-        # Collect categorical features and values.
-        categorical_features = sorted(k for k, v in schema.items()
-                                      if v == 'categorical')
-        categorical_values = [set() for _ in categorical_features]
-        for row in rows:
-            for name, values in zip(categorical_features, categorical_values):
-                try:
-                    # TODO Handle repeated values and ordinals.
-                    value = row[name]
-                except KeyError:
-                    continue
-                values.add(value)
-        categorical_values = [sorted(v) for v in categorical_values]
-        categorical_index = [{v: i
-                              for i, v in enumerate(vs)}
-                             for vs in categorical_values]
-        self._categorical_features = categorical_features
-        self._categorical_values = categorical_values
-        self._categorical_index = categorical_index
-
-        # Create a ragged array of categorical sufficient statistics.
-        ragged_index = np.zeros(len(categorical_features) + 1, dtype=np.int32)
-        for v, name in enumerate(categorical_features):
-            dim = len(categorical_values[v])
-            assert dim < 128
-            ragged_index[1 + v] = ragged_index[v] + dim
-        data = np.zeros((N, ragged_index[-1]), dtype=np.int8)
-        for n, row in enumerate(rows):
-            for v, name in enumerate(categorical_features):
-                try:
-                    value = row[name]
-                except KeyError:
-                    continue
-                int_value = categorical_index[v][value]
-                jit_add_categorical_data(ragged_index, data, n, v, int_value)
-        self._categorical_ragged_index = ragged_index
-        self._categorical_data = data
-
-        # Collect real-valued features and create data.
-        real_features = sorted(k for k, v in schema.items() if v == 'real')
-        data = np.zeros((N, len(real_features), 3), dtype=np.float32)
-        for n, row in enumerate(rows):
-            for v, name in enumerate(real_features):
-                try:
-                    value = row[name]
-                except KeyError:
-                    continue
-                jit_add_real_data(data, n, v, value)
-        self._real_data = data
-
-    @property
-    def schema(self):
-        return self._schema
-
-    @property
-    def categorical_ragged_index(self):
-        return self._categorical_ragged_index
-
-    @property
-    def categorical_data(self):
-        return self._categorical_data
-
-    @property
-    def real_data(self):
-        return self._real_data
-
-    def __add__(self, other):
-        """Combine observations of two row-aligned datasets.
-
-        This adds data in the sense of repeated observation of each row.
-
-        See also `Table.cat()` for combining different rows
-        and `Table.join` for combining different columns.
-
-        Args:
-            other (Table): Another table with the same schema and aligned rows.
-
-        Returns:
-            Table: A table with the same schema and same number of rows.
-        """
-        raise NotImplementedError
-
-
-@jit
-def jit_add_categorical_data(ragged_index, data, n, v, value):
-    """Update multinomial sufficient statistics."""
-    data[n, ragged_index[v] + value] += 1
-
-
-@jit
-def jit_add_real_data(data, n, v, value):
-    """Update real sufficient statistics."""
-    # Adapted from https://www.johndcook.com/blog/standard_deviation
-    ss = data[n, v, 0:3]
-    if ss[0] == 0:
-        ss[0] = 1.0
-        ss[1] = value
-        ss[2] = 0.0
-    else:
-        ss[0] += 1.0
-        old_diff = value - ss[1]
-        ss[1] += old_diff / ss[0]
-        new_diff = value - ss[1]
-        ss[2] += old_diff * new_diff
diff --git a/treecat/tables.py b/treecat/tables.py
new file mode 100644
index 0000000..7e33058
--- /dev/null
+++ b/treecat/tables.py
@@ -0,0 +1,281 @@
+"""Internal representations of heterogeous data and statistics."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from scipy.special import gammaln
+
+from treecat.util import jit
+
+
+class Schema(object):
+    """Schema describing a data types of multivariate heterogeneous data."""
+
+    DTYPES = ('categorical', 'normal')
+    DTYPE_INDEX = {DTYPES.index(dtype) for dtype in DTYPES}
+
+    def __init__(self, feature_types):
+        assert isinstance(feature_types, dict)
+
+        self._feature_types = dict(feature_types)
+        self._normal_features = tuple(
+            sorted([
+                name for name, dtype in feature_types.items
+                if dtype == 'normal'
+            ]))
+        self._normal_index = {
+            i: name
+            for name, i in enumerate(self._normal_features)
+        }
+        self._cat_values = {}  # dict(string, list(string)).
+        self._cat_value_index = {}  # dict(string, dict(string, int)).
+
+    def register_data(self, rows):
+        for row in rows:
+            for name, value in row.items():
+                if self._feature_types[name] == 'categorical':
+                    self.register_cat_value(name, value)
+
+    def register_cat_value(self, name, value):
+        values = self._cat_values[name]
+        value_index = self._cat_value_index[name]
+        if value not in value_index:
+            value_index[value] = len(values)
+            values.append(value)
+
+
+class ValueTable(object):
+    """Table of row-oriented heterogeneous single-observation data."""
+
+    def __init__(self, schema, num_rows):
+        assert isinstance(schema, Schema)
+        self._schema = schema
+        N = num_rows
+        Vr = len(schema._real_features)
+        Vc = len(schema._cat_features)
+
+        self._cat_data = np.zeros([N, Vc], dtype=np.int8)
+        self._real_data = np.zeros([N, Vr], dtype=np.float32)
+
+
+def gaussian_logprob_data(count, mean, count_times_variance, prior):
+    """
+    References:
+    - Kevin Murphy (2007)
+      "Conjugate Bayesian analysis of the Gaussian distribution"
+      http://www-devel.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf
+    """
+    raise NotImplementedError
+
+
+class JointStats(object):
+    """Sufficient statistics for a joint distribution of categoricals + reals.
+
+    This represents sufficient statistics for a mixture-of-Gaussians model of
+    heterogeneous categorical and real data, where the categoricals comprise a
+    mixture id and the reals are jointly modeled as a MVN conditional on the
+    categorical data. This assumes that all data is observed simultaneously.
+
+    The sufficient statistics are as follows:
+    - _ss_0 = number of observations of each mixture component.
+    - _ss_1 = means of each mixture component.
+    - _ss_2 = count * variance of each mixture component.
+    """
+
+    def __init__(self, cat_dims, real_dim):
+        cat_dim = np.product(cat_dims, np.int32)
+        self._cat_dims = cat_dims
+        self._flatten = np.cumprod(cat_dims) / cat_dims
+        self._ss_0 = np.zeros([cat_dim], np.int32)
+        self._ss_1 = np.zeros([cat_dim, real_dim], np.float32)
+        self._ss_2 = np.zeros([cat_dim, real_dim, real_dim], np.float32)
+
+    def reset(self):
+        self._ss_0[...] = 0
+        self._ss_1[...] = 0
+        self._ss_2[...] = 0
+
+    def add_row(self, cat_row, real_row):
+        assert isinstance(cat_row, np.ndarray)
+        assert isinstance(real_row, np.ndarray)
+        assert cat_row.shape == (len(self.cat_dims), )
+        assert real_row.shape == (self.real_dim, )
+        m = np.dot(self._flatten, cat_row)
+        self._ss_0[m] += 1
+        n = np.float32(self._ss_0[m])
+        one_over_n = 1.0 / n
+        diff = real_row - self._ss_1[m]
+        self._ss_1[m] += diff * one_over_n
+        self._ss_2 += np.outer(diff, diff) * ((1.0 + n) * one_over_n)
+
+    def remove_row(self, cat_row, real_row):
+        assert isinstance(cat_row, np.ndarray)
+        assert isinstance(real_row, np.ndarray)
+        raise NotImplementedError
+
+    def set_from_table(self, cat_table, real_table):
+        assert isinstance(cat_table, np.ndarray)
+        assert isinstance(real_table, np.ndarray)
+        assert cat_table.shape[0] == real_table.shape[0]
+        self.reset()
+        for n in range(cat_table.shape[0]):
+            self.add_row(cat_table[n, :], real_table[n, :])
+
+    def logprob(self, cat_prior, real_prior):
+        result = 0.0
+        for m in range(self._ss_0.shape[0]):
+            result += gammaln(self._ss_0[m] + cat_prior)
+            result += gaussian_logprob_data(self._ss_0[m], self._ss_1[m],
+                                            self._ss_2[m], real_prior)
+        return result
+
+
+class EdgesStats(object):
+    """Sufficient statistics for a joint distribution of a pair of variables.
+
+    This should be distributionally eqivalent to JointStats, but with variables
+    separated into two sets, say head and tail.
+    """
+
+    def __init__(self, head_cat_dim, head_real_dim, tail_cat_dim,
+                 tail_real_dim):
+        assert head_cat_dim >= 1
+        assert tail_cat_dim >= 1
+        raise NotImplementedError
+
+    def filter_forward(self, head_message):
+        raise NotImplementedError
+
+    def filter_backward(self, head_message):
+        raise NotImplementedError
+
+    def sample_forward(self, tail_sample):
+        raise NotImplementedError
+
+    def sample_backward(self, tail_sample):
+        raise NotImplementedError
+
+    def logprob(self, head_cat_prior, head_real_prior, tail_cat_prior,
+                tail_real_prior):
+        raise NotImplementedError
+
+
+class StatsTable(object):
+    """Table of row-oriented heterogeneous missing/repeated data."""
+
+    def __init__(self, schema, rows):
+        assert isinstance(schema, Schema)
+        self._schema = schema
+        N = len(rows)
+
+        # Collect categorical features and values.
+        cat_features = sorted(k for k, v in schema.items()
+                              if v == 'categorical')
+        cat_values = [set() for _ in cat_features]
+        for row in rows:
+            for name, values in zip(cat_features, cat_values):
+                try:
+                    # TODO Handle repeated values and ordinals.
+                    value = row[name]
+                except KeyError:
+                    continue
+                values.add(value)
+        cat_values = [sorted(v) for v in cat_values]
+        cat_index = [{v: i for i, v in enumerate(vs)} for vs in cat_values]
+        self._cat_features = cat_features
+        self._cat_values = cat_values
+        self._cat_index = cat_index
+
+        # Create a ragged array of categorical sufficient statistics.
+        ragged_index = np.zeros(len(cat_features) + 1, dtype=np.int32)
+        for v, name in enumerate(cat_features):
+            dim = len(cat_values[v])
+            assert dim < 128
+            ragged_index[1 + v] = ragged_index[v] + dim
+        data = np.zeros((N, ragged_index[-1]), dtype=np.int8)
+        for n, row in enumerate(rows):
+            for v, name in enumerate(cat_features):
+                try:
+                    value = row[name]
+                except KeyError:
+                    continue
+                int_value = cat_index[v][value]
+                jit_add_cat_data(ragged_index, data, n, v, int_value)
+        self._cat_ragged_index = ragged_index
+        self._cat_data = data
+
+        # Collect real-valued features and create data.
+        real_features = sorted(k for k, v in schema.items() if v == 'real')
+        data = np.zeros((N, len(real_features), 3), dtype=np.float32)
+        for n, row in enumerate(rows):
+            for v, name in enumerate(real_features):
+                try:
+                    value = row[name]
+                except KeyError:
+                    continue
+                jit_add_real_data(data, n, v, value)
+        self._real_data = data
+
+    @property
+    def schema(self):
+        return self._schema
+
+    @property
+    def num_rows(self):
+        return self._real_data.shape[0]
+
+    @property
+    def cat_ragged_index(self):
+        return self._cat_ragged_index
+
+    @property
+    def cat_data(self):
+        return self._cat_data
+
+    @property
+    def real_data(self):
+        return self._real_data
+
+    def __add__(self, other):
+        """Combine observations of two row-aligned datasets.
+
+        This adds data in the sense of repeated observation of each row.
+
+        See also `StatsTable.cat()` for combining different rows
+        and `StatsTable.join` for combining different columns.
+
+        Args:
+            other (StatsTable): Another StatsTable with the same schema and
+                aligned rows.
+
+        Returns:
+            StatsTable: A table with the same schema and same number of rows.
+        """
+        assert isinstance(other, StatsTable)
+        assert other.num_rows == self.num_rows
+        raise NotImplementedError
+
+
+@jit
+def jit_add_cat_data(ragged_index, data, n, v, value):
+    """Update multinomial sufficient statistics."""
+    data[n, ragged_index[v] + value] += 1
+
+
+@jit
+def jit_add_real_data(data, n, v, value):
+    """Update real sufficient statistics."""
+    # Adapted from https://www.johndcook.com/blog/standard_deviation
+    ss = data[n, v, 0:3]
+    if ss[0] == 0:
+        ss[0] = 1.0
+        ss[1] = value
+        ss[2] = 0.0
+    else:
+        ss[0] += 1.0
+        old_diff = value - ss[1]
+        ss[1] += old_diff / ss[0]
+        new_diff = value - ss[1]
+        ss[2] += old_diff * new_diff