py-simulated-assemblage/simulate_assemblages.py at master · acabaniss/py-simulated-assemblage · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""Generate simulated assemblages and compare simulation vs. observed artifacts.

Author: Andrew Cabaniss
"""

import numpy as np
import scipy
import matplotlib.pyplot as plt
import random as rd

def simulate_assemblages_collection(context_counts, N = 10000, p_dist = 2, center = 'observed'):
    """Calculate a theoretical object-assemblage matrix with similar marginal probabilities for an entire site.

    Parameters
    ----------
    context_counts :pandas.DataFrame
        Rows need to be the contexts, columns need to be integer counts of objects
    N : int
        number of theoretical assemblages to generate
    p_dist : int
        Dimensions for the Minkowski distance: 1 for Manhattan/taxi-cab, 2 for Euclidian
    center : {'observed', 'calculated'}
        If observed: the mean of all simulated distributions is used
        If calculated: the expected mean is used instead.

    Returns
    -------
    results : dict
        The results dict stores different data from the simulation:
        1. raw_simulation - the numpy.array of the simulated assemblages
        2. raw_observed - the formatted observed data as a numpy.array
        3. distance_simulation - the distances from the center of the simulated distribtion
        4. distance_observed - the distance of our actual assemblage from the center of the simulated distribution
        5. p-value - the proportion of simulated assemblages further from the center than the observed assemblages
        6. N - number of assemblages generated
        7. p_dist - dimensionality of the distance used
        8. center - the center chosen for distances to be calculated from
    """
    # Value checks
    if center not in ['observed', 'calculated']:
        raise ValueError("center neither 'observed' nor 'calculated")

    #First, the observed data are formatted and used to construct the parameters for a weighted random distribution
    observed = np.asanyarray(context_counts) #convert from DataFrame to ndarray
    num_objects = observed.sum().sum() #total number of objects across all contexts/types
    num_contexts = observed.shape[0] # number of contexts
    num_types = observed.shape[1] #number of different types of objects
    marginal_types = observed.sum(axis = 0)*1./num_objects #marginal distribution of the types of objects
    marginal_contexts = observed.sum(axis = 1)*1./num_objects #marginal distribution of the contexts

    #generate simulated assemblages
    collector = np.zeros((N,num_contexts,num_types))
    for i in range(N):
        for o in range(num_objects):
            x = rd.choices(range(num_contexts),weights = marginal_contexts)
            y = rd.choices(range(num_types),weights = marginal_types)
            collector[i,x,y] += 1

    #Determine the center to be used for measuring all distances
    if center == 'calculated':
        center_calc = np.outer(marginal_contexts, marginal_types)*num_objects #calculate the theoretical center of the distribution
    elif center == 'observed':
        center_calc = collector.mean(axis = 0) #calculate the actual center of teh distribution

    #prepare the distribution, observed data, and center of teh distribution for distance calculation
    center_calc_flat = center_calc.reshape(1,-1) #reshape to be flat for distances
    collector_flat = np.reshape(collector,(N, -1)) #reshape to be flat for distances
    data_flat = np.asanyarray(observed).reshape(1, -1) #reshape to be flat for distances

    #calculate distances within the simulated distribution
    dists_boot = scipy.spatial.distance_matrix(center_calc_flat, collector_flat, p = p_dist).flatten()
    dist_real = scipy.spatial.distance_matrix(center_calc_flat, data_flat , p = p_dist).flatten()[0]

    #return results
    result = {'raw_simulation' : collector,
              'raw_observed' : observed,

             'distance_simulation' : dists_boot,
             'distance_observed' : dist_real,
             'p-value' : sum(dists_boot>=dist_real)*1./len(dists_boot),
             'N' : N,
             'p_dist' : p_dist,
             'center' : center}
    return result

def simulate_assemblage_single(context_counts, context_index, N = 10000, p_dist = 2, center = 'observed'):
    """Calculate a theoretical object-assemblage matrix with similar marginal probabilities for an entire site.

    Parameters
    ----------
    context_counts :pandas.DataFrame
        Rows need to be the contexts, columns need to be integer counts of objects
    context_index : int
        which row needs to be used
    N : int
        number of theoretical assemblages to generate
    p_dist : int
        Dimensions for the Minkowski distance: 1 for Manhattan/taxi-cab, 2 for Euclidian
    center : {'observed', 'calculated'}
        If observed: the mean of all simulated distributions is used
        If calculated: the expected mean is used instead.

    Returns
    -------
    results : dict
        The results dict stores different data from the simulation:
        1. raw_simulation - the numpy.array of the simulated assemblages
        2. raw_observed - the formatted observed data as a numpy.array
        3. distance_simulation - the distances from the center of the simulated distribtion
        4. distance_observed - the distance of our actual assemblage from the center of the simulated distribution
        5. p-value - the proportion of simulated assemblages further from the center than the observed assemblages
        6. N - number of assemblages generated
        7. p_dist - dimensionality of the distance used
        8. center - which center definition used
    """
    if center not in ['observed', 'calculated']:
        raise ValueError("center neither 'observed' nor 'calculated")

    #First, the observed data are formatted and used to construct the parameters for a weighted random distribution
    observed_sites = np.asanyarray(context_counts) #convert from DataFrame or other structure to ndarray
    observed = observed_sites[context_index]
    num_objects = observed.sum() #objects in this context
    num_types = observed_sites.shape[1] #number of different types of objects
    marginal_types = observed_sites.sum(axis = 0)*1./observed_sites.sum().sum() #marginal distribution of the types of objects

    #generate simulated assemblages
    collector = np.zeros((N,num_types))
    for i in range(N):
        for o in range(num_objects):
            y = rd.choices(range(num_types),weights = marginal_types)
            collector[i,y] += 1

    #Determine the center to be used for measuring all distances
    if center == 'calculated':
        center_calc = marginal_types*num_objects #calculate the theoretical center of the distribution
    elif center == 'observed':
        center_calc = collector.mean(axis = 0) #calculate the actual center of teh distribution

    #calculate distances within the simulated distribution
    dists_boot = scipy.spatial.distance_matrix(center_calc.reshape(1,-1), collector, p = p_dist).flatten()
    dist_real = scipy.spatial.distance_matrix(center_calc.reshape(1,-1), observed.reshape(1,-1) , p = p_dist).flatten()[0]

    #return results
    result = {'raw_simulation' : collector,
              'raw_observed' : observed,
             'distance_simulation' : dists_boot,
             'distance_observed' : dist_real,
             'p-value' : sum(dists_boot>=dist_real)*1./len(dists_boot),
             'N' : N,
             'p_dist' : p_dist,
             'center' : center}
    return result