From 37720963ed4d7829482ec63b590c0ed782088de0 Mon Sep 17 00:00:00 2001 From: Fritz Obermeyer Date: Thu, 24 Aug 2017 20:39:08 -0700 Subject: [PATCH 1/3] Sketch Table class for heterogeneous data --- treecat/table.py | 100 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 treecat/table.py diff --git a/treecat/table.py b/treecat/table.py new file mode 100644 index 0000000..2b06485 --- /dev/null +++ b/treecat/table.py @@ -0,0 +1,100 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from treecat.util import jit + + +@jit +def jit_add_categorical_data(ragged_index, data, n, v, value): + """Update multinomial sufficient statistics.""" + data[n, ragged_index[v] + value] += 1 + + +@jit +def jit_add_real_data(data, n, v, value): + """Update real sufficient statistics.""" + # Adapted from https://www.johndcook.com/blog/standard_deviation + ss = data[n, v, 1:4] + if ss[0] == 0: + ss[0] = 1.0 + ss[1] = value + ss[2] = 0.0 + else: + ss[0] += 1.0 + old_diff = value - ss[1] + ss[1] += old_diff / ss[0] + new_diff = value - ss[1] + ss[2] += old_diff * new_diff + + +class Table(object): + """Table of row-oriented heterogeneous repeated data.""" + + def __init__(self, schema, rows): + self._schema = dict(schema) + N = len(rows) + + # Collect categorical features and values. + categorical_features = sorted(k for k, v in schema.items() + if v == 'categorical') + categorical_values = [set() for _ in categorical_features] + for row in rows: + for name, values in zip(categorical_features, categorical_values): + try: + # TODO Handle repeated values and ordinals. + value = row[name] + except KeyError: + continue + values.add(value) + categorical_values = [sorted(v) for v in categorical_values] + categorical_index = [{v: i + for i, v in enumerate(vs)} + for vs in categorical_values] + self._categorical_features = categorical_features + self._categorical_values = categorical_values + self._categorical_index = categorical_index + + # Create a ragged array of categorical sufficient statistics. + ragged_index = np.zeros(len(categorical_features) + 1, dtype=np.int32) + for v, name in enumerate(categorical_features): + dim = len(categorical_values[v]) + assert dim < 128 + ragged_index[1 + v] = ragged_index[v] + dim + data = np.zeros((N, ragged_index[-1]), dtype=np.int8) + for n, row in enumerate(rows): + for v, name in enumerate(categorical_features): + try: + value = row[name] + except KeyError: + continue + int_value = categorical_index[v][value] + jit_add_categorical_data(ragged_index, data, n, v, int_value) + self._categorical_ragged_index = ragged_index + self._categorical_data = data + + # Collect real-valued features and create data. + real_features = sorted(k for k, v in schema.items() if v == 'real') + data = np.zeros((N, len(real_features), 3), dtype=np.float32) + for n, row in enumerate(rows): + for v, name in enumerate(real_features): + try: + value = row[name] + except KeyError: + continue + jit_add_real_data(data, n, v, value) + self._real_data = data + + @property + def categorical_ragged_index(self): + return self._categorical_ragged_index + + @property + def categorical_data(self): + return self._categorical_data + + @property + def real_data(self): + return self._real_data From 3b6fc86fd80324575b269518c508d5711a609d23 Mon Sep 17 00:00:00 2001 From: Fritz Obermeyer Date: Sat, 2 Sep 2017 09:49:21 -0700 Subject: [PATCH 2/3] Sketch Schema class for heterogeneous data --- treecat/table.py | 88 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 20 deletions(-) diff --git a/treecat/table.py b/treecat/table.py index 2b06485..e1ac8a1 100644 --- a/treecat/table.py +++ b/treecat/table.py @@ -7,34 +7,39 @@ from treecat.util import jit -@jit -def jit_add_categorical_data(ragged_index, data, n, v, value): - """Update multinomial sufficient statistics.""" - data[n, ragged_index[v] + value] += 1 +class Schema(object): + """Schema describing data types of multivariate heterogeneous data.""" + dtypes = ('multinomial', 'normal') + dtype_index = {dtypes.index(dtype) for dtype in dtypes} -@jit -def jit_add_real_data(data, n, v, value): - """Update real sufficient statistics.""" - # Adapted from https://www.johndcook.com/blog/standard_deviation - ss = data[n, v, 1:4] - if ss[0] == 0: - ss[0] = 1.0 - ss[1] = value - ss[2] = 0.0 - else: - ss[0] += 1.0 - old_diff = value - ss[1] - ss[1] += old_diff / ss[0] - new_diff = value - ss[1] - ss[2] += old_diff * new_diff + def __init__(self): + self._multinomial_features = [] # list(string). + self._multinomial_values = {} # dict(string, list(string)). + self._multinomial_value_index = {} # dict(string, dict(string, int)). + self._normal_features = [] # list(string). + + def add_multinomial_feature(self, name): + assert name not in self._multinomial_features + self._multinomial_features.append(name) + + def add_multinomial_value(self, name, value): + values = self._multinomial_values[name] + value_index = self._multinomial_value_index[name] + if value not in value_index: + value_index[value] = len(values) + values.append(value) + + def add_normal_feature(self, name): + assert name not in self._normal_features + self._normal_features.append(name) class Table(object): """Table of row-oriented heterogeneous repeated data.""" def __init__(self, schema, rows): - self._schema = dict(schema) + self._schema = dict(schema) # TODO Use a Schema object. N = len(rows) # Collect categorical features and values. @@ -87,6 +92,10 @@ def __init__(self, schema, rows): jit_add_real_data(data, n, v, value) self._real_data = data + @property + def schema(self): + return self._schema + @property def categorical_ragged_index(self): return self._categorical_ragged_index @@ -98,3 +107,42 @@ def categorical_data(self): @property def real_data(self): return self._real_data + + def __add__(self, other): + """Combine observations of two row-aligned datasets. + + This adds data in the sense of repeated observation of each row. + + See also `Table.cat()` for combining different rows + and `Table.join` for combining different columns. + + Args: + other (Table): Another table with the same schema and aligned rows. + + Returns: + Table: A table with the same schema and same number of rows. + """ + raise NotImplementedError + + +@jit +def jit_add_categorical_data(ragged_index, data, n, v, value): + """Update multinomial sufficient statistics.""" + data[n, ragged_index[v] + value] += 1 + + +@jit +def jit_add_real_data(data, n, v, value): + """Update real sufficient statistics.""" + # Adapted from https://www.johndcook.com/blog/standard_deviation + ss = data[n, v, 0:3] + if ss[0] == 0: + ss[0] = 1.0 + ss[1] = value + ss[2] = 0.0 + else: + ss[0] += 1.0 + old_diff = value - ss[1] + ss[1] += old_diff / ss[0] + new_diff = value - ss[1] + ss[2] += old_diff * new_diff From 73fb674dfc0f596d167e02cab45273f90c024c2b Mon Sep 17 00:00:00 2001 From: Fritz Obermeyer Date: Sat, 2 Sep 2017 15:51:24 -0700 Subject: [PATCH 3/3] Sketch ValueTable, JoinStats, and EdgeStats classes --- treecat/table.py | 148 ------------------------ treecat/tables.py | 281 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 281 insertions(+), 148 deletions(-) delete mode 100644 treecat/table.py create mode 100644 treecat/tables.py diff --git a/treecat/table.py b/treecat/table.py deleted file mode 100644 index e1ac8a1..0000000 --- a/treecat/table.py +++ /dev/null @@ -1,148 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from treecat.util import jit - - -class Schema(object): - """Schema describing data types of multivariate heterogeneous data.""" - - dtypes = ('multinomial', 'normal') - dtype_index = {dtypes.index(dtype) for dtype in dtypes} - - def __init__(self): - self._multinomial_features = [] # list(string). - self._multinomial_values = {} # dict(string, list(string)). - self._multinomial_value_index = {} # dict(string, dict(string, int)). - self._normal_features = [] # list(string). - - def add_multinomial_feature(self, name): - assert name not in self._multinomial_features - self._multinomial_features.append(name) - - def add_multinomial_value(self, name, value): - values = self._multinomial_values[name] - value_index = self._multinomial_value_index[name] - if value not in value_index: - value_index[value] = len(values) - values.append(value) - - def add_normal_feature(self, name): - assert name not in self._normal_features - self._normal_features.append(name) - - -class Table(object): - """Table of row-oriented heterogeneous repeated data.""" - - def __init__(self, schema, rows): - self._schema = dict(schema) # TODO Use a Schema object. - N = len(rows) - - # Collect categorical features and values. - categorical_features = sorted(k for k, v in schema.items() - if v == 'categorical') - categorical_values = [set() for _ in categorical_features] - for row in rows: - for name, values in zip(categorical_features, categorical_values): - try: - # TODO Handle repeated values and ordinals. - value = row[name] - except KeyError: - continue - values.add(value) - categorical_values = [sorted(v) for v in categorical_values] - categorical_index = [{v: i - for i, v in enumerate(vs)} - for vs in categorical_values] - self._categorical_features = categorical_features - self._categorical_values = categorical_values - self._categorical_index = categorical_index - - # Create a ragged array of categorical sufficient statistics. - ragged_index = np.zeros(len(categorical_features) + 1, dtype=np.int32) - for v, name in enumerate(categorical_features): - dim = len(categorical_values[v]) - assert dim < 128 - ragged_index[1 + v] = ragged_index[v] + dim - data = np.zeros((N, ragged_index[-1]), dtype=np.int8) - for n, row in enumerate(rows): - for v, name in enumerate(categorical_features): - try: - value = row[name] - except KeyError: - continue - int_value = categorical_index[v][value] - jit_add_categorical_data(ragged_index, data, n, v, int_value) - self._categorical_ragged_index = ragged_index - self._categorical_data = data - - # Collect real-valued features and create data. - real_features = sorted(k for k, v in schema.items() if v == 'real') - data = np.zeros((N, len(real_features), 3), dtype=np.float32) - for n, row in enumerate(rows): - for v, name in enumerate(real_features): - try: - value = row[name] - except KeyError: - continue - jit_add_real_data(data, n, v, value) - self._real_data = data - - @property - def schema(self): - return self._schema - - @property - def categorical_ragged_index(self): - return self._categorical_ragged_index - - @property - def categorical_data(self): - return self._categorical_data - - @property - def real_data(self): - return self._real_data - - def __add__(self, other): - """Combine observations of two row-aligned datasets. - - This adds data in the sense of repeated observation of each row. - - See also `Table.cat()` for combining different rows - and `Table.join` for combining different columns. - - Args: - other (Table): Another table with the same schema and aligned rows. - - Returns: - Table: A table with the same schema and same number of rows. - """ - raise NotImplementedError - - -@jit -def jit_add_categorical_data(ragged_index, data, n, v, value): - """Update multinomial sufficient statistics.""" - data[n, ragged_index[v] + value] += 1 - - -@jit -def jit_add_real_data(data, n, v, value): - """Update real sufficient statistics.""" - # Adapted from https://www.johndcook.com/blog/standard_deviation - ss = data[n, v, 0:3] - if ss[0] == 0: - ss[0] = 1.0 - ss[1] = value - ss[2] = 0.0 - else: - ss[0] += 1.0 - old_diff = value - ss[1] - ss[1] += old_diff / ss[0] - new_diff = value - ss[1] - ss[2] += old_diff * new_diff diff --git a/treecat/tables.py b/treecat/tables.py new file mode 100644 index 0000000..7e33058 --- /dev/null +++ b/treecat/tables.py @@ -0,0 +1,281 @@ +"""Internal representations of heterogeous data and statistics.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from scipy.special import gammaln + +from treecat.util import jit + + +class Schema(object): + """Schema describing a data types of multivariate heterogeneous data.""" + + DTYPES = ('categorical', 'normal') + DTYPE_INDEX = {DTYPES.index(dtype) for dtype in DTYPES} + + def __init__(self, feature_types): + assert isinstance(feature_types, dict) + + self._feature_types = dict(feature_types) + self._normal_features = tuple( + sorted([ + name for name, dtype in feature_types.items + if dtype == 'normal' + ])) + self._normal_index = { + i: name + for name, i in enumerate(self._normal_features) + } + self._cat_values = {} # dict(string, list(string)). + self._cat_value_index = {} # dict(string, dict(string, int)). + + def register_data(self, rows): + for row in rows: + for name, value in row.items(): + if self._feature_types[name] == 'categorical': + self.register_cat_value(name, value) + + def register_cat_value(self, name, value): + values = self._cat_values[name] + value_index = self._cat_value_index[name] + if value not in value_index: + value_index[value] = len(values) + values.append(value) + + +class ValueTable(object): + """Table of row-oriented heterogeneous single-observation data.""" + + def __init__(self, schema, num_rows): + assert isinstance(schema, Schema) + self._schema = schema + N = num_rows + Vr = len(schema._real_features) + Vc = len(schema._cat_features) + + self._cat_data = np.zeros([N, Vc], dtype=np.int8) + self._real_data = np.zeros([N, Vr], dtype=np.float32) + + +def gaussian_logprob_data(count, mean, count_times_variance, prior): + """ + References: + - Kevin Murphy (2007) + "Conjugate Bayesian analysis of the Gaussian distribution" + http://www-devel.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf + """ + raise NotImplementedError + + +class JointStats(object): + """Sufficient statistics for a joint distribution of categoricals + reals. + + This represents sufficient statistics for a mixture-of-Gaussians model of + heterogeneous categorical and real data, where the categoricals comprise a + mixture id and the reals are jointly modeled as a MVN conditional on the + categorical data. This assumes that all data is observed simultaneously. + + The sufficient statistics are as follows: + - _ss_0 = number of observations of each mixture component. + - _ss_1 = means of each mixture component. + - _ss_2 = count * variance of each mixture component. + """ + + def __init__(self, cat_dims, real_dim): + cat_dim = np.product(cat_dims, np.int32) + self._cat_dims = cat_dims + self._flatten = np.cumprod(cat_dims) / cat_dims + self._ss_0 = np.zeros([cat_dim], np.int32) + self._ss_1 = np.zeros([cat_dim, real_dim], np.float32) + self._ss_2 = np.zeros([cat_dim, real_dim, real_dim], np.float32) + + def reset(self): + self._ss_0[...] = 0 + self._ss_1[...] = 0 + self._ss_2[...] = 0 + + def add_row(self, cat_row, real_row): + assert isinstance(cat_row, np.ndarray) + assert isinstance(real_row, np.ndarray) + assert cat_row.shape == (len(self.cat_dims), ) + assert real_row.shape == (self.real_dim, ) + m = np.dot(self._flatten, cat_row) + self._ss_0[m] += 1 + n = np.float32(self._ss_0[m]) + one_over_n = 1.0 / n + diff = real_row - self._ss_1[m] + self._ss_1[m] += diff * one_over_n + self._ss_2 += np.outer(diff, diff) * ((1.0 + n) * one_over_n) + + def remove_row(self, cat_row, real_row): + assert isinstance(cat_row, np.ndarray) + assert isinstance(real_row, np.ndarray) + raise NotImplementedError + + def set_from_table(self, cat_table, real_table): + assert isinstance(cat_table, np.ndarray) + assert isinstance(real_table, np.ndarray) + assert cat_table.shape[0] == real_table.shape[0] + self.reset() + for n in range(cat_table.shape[0]): + self.add_row(cat_table[n, :], real_table[n, :]) + + def logprob(self, cat_prior, real_prior): + result = 0.0 + for m in range(self._ss_0.shape[0]): + result += gammaln(self._ss_0[m] + cat_prior) + result += gaussian_logprob_data(self._ss_0[m], self._ss_1[m], + self._ss_2[m], real_prior) + return result + + +class EdgesStats(object): + """Sufficient statistics for a joint distribution of a pair of variables. + + This should be distributionally eqivalent to JointStats, but with variables + separated into two sets, say head and tail. + """ + + def __init__(self, head_cat_dim, head_real_dim, tail_cat_dim, + tail_real_dim): + assert head_cat_dim >= 1 + assert tail_cat_dim >= 1 + raise NotImplementedError + + def filter_forward(self, head_message): + raise NotImplementedError + + def filter_backward(self, head_message): + raise NotImplementedError + + def sample_forward(self, tail_sample): + raise NotImplementedError + + def sample_backward(self, tail_sample): + raise NotImplementedError + + def logprob(self, head_cat_prior, head_real_prior, tail_cat_prior, + tail_real_prior): + raise NotImplementedError + + +class StatsTable(object): + """Table of row-oriented heterogeneous missing/repeated data.""" + + def __init__(self, schema, rows): + assert isinstance(schema, Schema) + self._schema = schema + N = len(rows) + + # Collect categorical features and values. + cat_features = sorted(k for k, v in schema.items() + if v == 'categorical') + cat_values = [set() for _ in cat_features] + for row in rows: + for name, values in zip(cat_features, cat_values): + try: + # TODO Handle repeated values and ordinals. + value = row[name] + except KeyError: + continue + values.add(value) + cat_values = [sorted(v) for v in cat_values] + cat_index = [{v: i for i, v in enumerate(vs)} for vs in cat_values] + self._cat_features = cat_features + self._cat_values = cat_values + self._cat_index = cat_index + + # Create a ragged array of categorical sufficient statistics. + ragged_index = np.zeros(len(cat_features) + 1, dtype=np.int32) + for v, name in enumerate(cat_features): + dim = len(cat_values[v]) + assert dim < 128 + ragged_index[1 + v] = ragged_index[v] + dim + data = np.zeros((N, ragged_index[-1]), dtype=np.int8) + for n, row in enumerate(rows): + for v, name in enumerate(cat_features): + try: + value = row[name] + except KeyError: + continue + int_value = cat_index[v][value] + jit_add_cat_data(ragged_index, data, n, v, int_value) + self._cat_ragged_index = ragged_index + self._cat_data = data + + # Collect real-valued features and create data. + real_features = sorted(k for k, v in schema.items() if v == 'real') + data = np.zeros((N, len(real_features), 3), dtype=np.float32) + for n, row in enumerate(rows): + for v, name in enumerate(real_features): + try: + value = row[name] + except KeyError: + continue + jit_add_real_data(data, n, v, value) + self._real_data = data + + @property + def schema(self): + return self._schema + + @property + def num_rows(self): + return self._real_data.shape[0] + + @property + def cat_ragged_index(self): + return self._cat_ragged_index + + @property + def cat_data(self): + return self._cat_data + + @property + def real_data(self): + return self._real_data + + def __add__(self, other): + """Combine observations of two row-aligned datasets. + + This adds data in the sense of repeated observation of each row. + + See also `StatsTable.cat()` for combining different rows + and `StatsTable.join` for combining different columns. + + Args: + other (StatsTable): Another StatsTable with the same schema and + aligned rows. + + Returns: + StatsTable: A table with the same schema and same number of rows. + """ + assert isinstance(other, StatsTable) + assert other.num_rows == self.num_rows + raise NotImplementedError + + +@jit +def jit_add_cat_data(ragged_index, data, n, v, value): + """Update multinomial sufficient statistics.""" + data[n, ragged_index[v] + value] += 1 + + +@jit +def jit_add_real_data(data, n, v, value): + """Update real sufficient statistics.""" + # Adapted from https://www.johndcook.com/blog/standard_deviation + ss = data[n, v, 0:3] + if ss[0] == 0: + ss[0] = 1.0 + ss[1] = value + ss[2] = 0.0 + else: + ss[0] += 1.0 + old_diff = value - ss[1] + ss[1] += old_diff / ss[0] + new_diff = value - ss[1] + ss[2] += old_diff * new_diff