From 9166543692a5bec50a5e6e43a489b5099de56717 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Mon, 3 Apr 2023 12:15:30 -0400 Subject: [PATCH 01/14] MAINT: toy_data and benchmark #508 --- efficient-stats-calc/benchmark.py | 60 +++++++++++++++++++++++++++++++ efficient-stats-calc/data_gen.py | 34 ++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 efficient-stats-calc/benchmark.py create mode 100644 efficient-stats-calc/data_gen.py diff --git a/efficient-stats-calc/benchmark.py b/efficient-stats-calc/benchmark.py new file mode 100644 index 00000000..6d681ae9 --- /dev/null +++ b/efficient-stats-calc/benchmark.py @@ -0,0 +1,60 @@ +import time +import numpy as np +from data_gen import * + + +# Example method (replace with your own methods) +def brute_force(x, y): + return np.mean(f(x,y)) + +# Benchmarking tool +def benchmark(methods, x, y, f, n_runs=10): + results = {} + + # Calculate the ground truth s(c) + c_true = f(x, y) + s_c_true = np.mean(c_true) + + for name, method in methods.items(): + # Initialize variables to store execution time and errors + exec_times = [] + errors = [] + + for _ in range(n_runs): + # Measure execution time + start_time = time.time() + s_c_approx = method(x, y) + exec_time = time.time() - start_time + exec_times.append(exec_time) + + # Calculate the error + error = np.abs(s_c_approx - s_c_true) + errors.append(error) + + # Calculate average execution time and error + avg_exec_time = np.mean(exec_times) + avg_error = np.mean(errors) + + results[name] = { + 'average_execution_time': avg_exec_time, + 'average_error': avg_error + } + + return results + +# Define methods to benchmark +methods = { + 'brute_force': brute_force +} + +# Generate toy data +n_samples = 1000 +x, y = generate_data(n_samples) + +# Run the benchmark +results = benchmark(methods, x, y, f) +print("Benchmark results:") +for method, result in results.items(): + print(f"{method}:") + print(f" Average execution time: {result['average_execution_time']:.6f} 
seconds") + print(f" Average error: {result['average_error']:.6f}") diff --git a/efficient-stats-calc/data_gen.py b/efficient-stats-calc/data_gen.py new file mode 100644 index 00000000..b15fbbfb --- /dev/null +++ b/efficient-stats-calc/data_gen.py @@ -0,0 +1,34 @@ +import numpy as np + +# Non-linear n-dimensional function f(x, y) +def f(x, y): + return np.sin(x) * np.cos(y) + +# Generate random x and y data +def generate_data(n_samples): + x = np.random.uniform(-np.pi, np.pi, n_samples) + y = np.random.uniform(-np.pi, np.pi, n_samples) + return x, y + +# Calculate statistics s(x), s(y) and s(c) +def calculate_statistics(x, y, func): + s_x = np.mean(x) + s_y = np.mean(y) + c = func(x, y) + s_c = np.mean(c) + return s_x, s_y, s_c + +# Generate toy data and calculate statistics +def create_toy_data(n_samples, func): + x, y = generate_data(n_samples) + s_x, s_y, s_c = calculate_statistics(x, y, func) + return x, y, s_x, s_y, s_c + +""" +Example Usage +""" +n_samples = 1000 +x, y, s_x, s_y, s_c = create_toy_data(n_samples, f) +print("s(x):", s_x) +print("s(y):", s_y) +print("s(c):", s_c) From 40a54212bbd955f8c5fa9dea5ed3388864be5124 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Mon, 3 Apr 2023 13:59:12 -0400 Subject: [PATCH 02/14] MAINT: MLP Toy Implementation #508 --- .gitignore | 3 ++ efficient-stats-calc/benchmark.py | 14 ++++++-- efficient-stats-calc/deep-learn/cnn.py | 35 ++++++++++++++++++ efficient-stats-calc/deep-learn/mlp.py | 50 ++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 efficient-stats-calc/deep-learn/cnn.py create mode 100644 efficient-stats-calc/deep-learn/mlp.py diff --git a/.gitignore b/.gitignore index d9d6232d..2ab57bd8 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,6 @@ doc/source/example-links.inc doc/source/changelog.md .pytest_cache/ build/ + +# machine learning +.h5 \ No newline at end of file diff --git a/efficient-stats-calc/benchmark.py b/efficient-stats-calc/benchmark.py index 
6d681ae9..121a336e 100644 --- a/efficient-stats-calc/benchmark.py +++ b/efficient-stats-calc/benchmark.py @@ -1,12 +1,20 @@ import time import numpy as np from data_gen import * +from tensorflow.keras.models import load_model +MLP_MODEL = "models/mlp.h5" -# Example method (replace with your own methods) +# Brute Force -- computing s(f(x,y)) def brute_force(x, y): return np.mean(f(x,y)) +# Multi-layer Perceptron +def mlp(x, y): + model = load_model(MLP_MODEL) + return model.predict(np.array([np.hstack([x,y])]))[0][0] + + # Benchmarking tool def benchmark(methods, x, y, f, n_runs=10): results = {} @@ -44,13 +52,15 @@ def benchmark(methods, x, y, f, n_runs=10): # Define methods to benchmark methods = { - 'brute_force': brute_force + 'brute_force': brute_force, + 'multilayer_perceptron': mlp } # Generate toy data n_samples = 1000 x, y = generate_data(n_samples) + # Run the benchmark results = benchmark(methods, x, y, f) print("Benchmark results:") diff --git a/efficient-stats-calc/deep-learn/cnn.py b/efficient-stats-calc/deep-learn/cnn.py new file mode 100644 index 00000000..a93aaeae --- /dev/null +++ b/efficient-stats-calc/deep-learn/cnn.py @@ -0,0 +1,35 @@ +import numpy as np +import tensorflow as tf +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Conv2D, Flatten, Dense +from tensorflow.keras.optimizers import Adam + +# Load your data here +# X_train, y_train, X_val, y_val + +# Ensure your input data is in the correct format: (samples, height, width, channels) +# For grayscale grid data, channels = 1 +# For RGB grid data, channels = 3 + +# Set the dimensions of input and output +input_shape = X_train.shape[1:] +output_dim = y_train.shape[1] + +# Define the CNN model +model = Sequential([ + Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape), + Conv2D(64, kernel_size=(3, 3), activation='relu'), + Flatten(), + Dense(64, activation='relu'), + Dense(output_dim, activation='linear') +]) + +# Compile the model 
+model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mean_absolute_error']) + +# Train the model +history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) + +# Make predictions with the trained model +# X_test: your test data +predictions = model.predict(X_test) diff --git a/efficient-stats-calc/deep-learn/mlp.py b/efficient-stats-calc/deep-learn/mlp.py new file mode 100644 index 00000000..a7086e20 --- /dev/null +++ b/efficient-stats-calc/deep-learn/mlp.py @@ -0,0 +1,50 @@ +import numpy as np +import tensorflow as tf +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense +from tensorflow.keras.optimizers import Adam + +import sys +import os +sys.path.append(os.path.abspath('.')) +import data_gen + +# Load your data here +# X_train, y_train, X_val, y_val + +def generate_data(N, data_gen): + X = [] + y = [] + + for _ in range(N): + data = np.hstack(data_gen.generate_data(1000)) + X.append(data) + y.append(np.mean(data_gen.f(data[:1000], data[1000:]))) + + return np.array(X), np.array(y) + +# Create training and validation data +N_train = 10000 # Number of training datapoints +N_val = 20 # Number of validation datapoints + +X_train, y_train = generate_data(N_train, data_gen) +X_val, y_val = generate_data(N_val, data_gen) + +# Set the dimensions of input and output +input_dim = X_train.shape[1] +output_dim = y_train.shape[0] + +# Define the MLP model +model = Sequential([ + Dense(64, activation='relu', input_shape=(input_dim,)), + Dense(32, activation='relu'), + Dense(output_dim, activation='linear') +]) + +# Compile the model +model.compile(optimizer=Adam(learning_rate=1e-5), loss='mean_squared_error', metrics=['mean_absolute_error']) + +# Train the model +history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) + +model.save("models/mlp.h5") From 44b87feed52c49698fe100f28897ee7749ca853e Mon Sep 17 00:00:00 2001 From: 
Clay Foye Date: Mon, 3 Apr 2023 14:24:54 -0400 Subject: [PATCH 03/14] MAINT: CNN Toy Implement #508 --- efficient-stats-calc/benchmark.py | 27 ++++++++++++++++++-- efficient-stats-calc/deep-learn/cnn.py | 34 +++++++++++++++++++++++--- efficient-stats-calc/deep-learn/mlp.py | 6 ++--- 3 files changed, 58 insertions(+), 9 deletions(-) diff --git a/efficient-stats-calc/benchmark.py b/efficient-stats-calc/benchmark.py index 121a336e..f07d1fc0 100644 --- a/efficient-stats-calc/benchmark.py +++ b/efficient-stats-calc/benchmark.py @@ -5,6 +5,11 @@ MLP_MODEL = "models/mlp.h5" +# Constants +CNN_MODEL = 'models/cnn.h5' +WIDTH = 32 +HEIGHT = 32 + # Brute Force -- computing s(f(x,y)) def brute_force(x, y): return np.mean(f(x,y)) @@ -14,6 +19,23 @@ def mlp(x, y): model = load_model(MLP_MODEL) return model.predict(np.array([np.hstack([x,y])]))[0][0] + +# Convolutional Neural Network +def cnn(x, y): + # Load the trained CNN model + model = load_model(CNN_MODEL) + + # Reshape x and y into the format expected by the CNN + x_data = x.reshape(HEIGHT, WIDTH) + y_data = y.reshape(HEIGHT, WIDTH) + data = np.stack([x_data, y_data], axis=-1) # shape: (height, width, 2) + input_data = np.expand_dims(data, axis=0) # shape: (1, height, width, 2) + + # Perform prediction using the model + prediction = model.predict(input_data)[0][0] + + return prediction + # Benchmarking tool def benchmark(methods, x, y, f, n_runs=10): @@ -53,11 +75,12 @@ def benchmark(methods, x, y, f, n_runs=10): # Define methods to benchmark methods = { 'brute_force': brute_force, - 'multilayer_perceptron': mlp + 'multilayer_perceptron': mlp, + 'convolutional': cnn } # Generate toy data -n_samples = 1000 +n_samples = WIDTH*HEIGHT x, y = generate_data(n_samples) diff --git a/efficient-stats-calc/deep-learn/cnn.py b/efficient-stats-calc/deep-learn/cnn.py index a93aaeae..819efd3f 100644 --- a/efficient-stats-calc/deep-learn/cnn.py +++ b/efficient-stats-calc/deep-learn/cnn.py @@ -4,6 +4,12 @@ from tensorflow.keras.layers import 
Conv2D, Flatten, Dense from tensorflow.keras.optimizers import Adam + +import sys +import os +sys.path.append(os.path.abspath('.')) +import data_gen + # Load your data here # X_train, y_train, X_val, y_val @@ -11,9 +17,31 @@ # For grayscale grid data, channels = 1 # For RGB grid data, channels = 3 +def generate_data_cnn(N, data_gen, width, height): + X = [] + y = [] + + for _ in range(N): + x_data, y_data = data_gen.generate_data(width * height) + x_data = x_data.reshape(height, width) + y_data = y_data.reshape(height, width) + data = np.stack([x_data, y_data], axis=-1) # shape: (height, width, 2) + X.append(data) + y.append(np.mean(data_gen.f(x_data, y_data))) + + return np.array(X), np.array(y) + +# Create training and validation data +N_train = 100 # Number of training datapoints +N_val = 20 # Number of validation datapoints +width, height = 32, 32 # Dimensions of the square images + +X_train, y_train = generate_data_cnn(N_train, data_gen, width, height) +X_val, y_val = generate_data_cnn(N_val, data_gen, width, height) + # Set the dimensions of input and output input_shape = X_train.shape[1:] -output_dim = y_train.shape[1] +output_dim = y_train.shape[0] # Define the CNN model model = Sequential([ @@ -30,6 +58,4 @@ # Train the model history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) -# Make predictions with the trained model -# X_test: your test data -predictions = model.predict(X_test) +model.save("models/cnn.h5") \ No newline at end of file diff --git a/efficient-stats-calc/deep-learn/mlp.py b/efficient-stats-calc/deep-learn/mlp.py index a7086e20..db3d9c78 100644 --- a/efficient-stats-calc/deep-learn/mlp.py +++ b/efficient-stats-calc/deep-learn/mlp.py @@ -17,14 +17,14 @@ def generate_data(N, data_gen): y = [] for _ in range(N): - data = np.hstack(data_gen.generate_data(1000)) + data = np.hstack(data_gen.generate_data(1024)) X.append(data) - y.append(np.mean(data_gen.f(data[:1000], data[1000:]))) + 
y.append(np.mean(data_gen.f(data[:1024], data[1024:]))) return np.array(X), np.array(y) # Create training and validation data -N_train = 10000 # Number of training datapoints +N_train = 1000 # Number of training datapoints N_val = 20 # Number of validation datapoints X_train, y_train = generate_data(N_train, data_gen) From 212f74b2712bc030af40864fcd10b0744ea7e571 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Mon, 3 Apr 2023 16:47:25 -0400 Subject: [PATCH 04/14] FIX: MLP learns PDFs --- efficient-stats-calc/deep-learn/mlp.py | 60 +++++++++++++++++++++----- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/efficient-stats-calc/deep-learn/mlp.py b/efficient-stats-calc/deep-learn/mlp.py index db3d9c78..fce234d1 100644 --- a/efficient-stats-calc/deep-learn/mlp.py +++ b/efficient-stats-calc/deep-learn/mlp.py @@ -9,30 +9,70 @@ sys.path.append(os.path.abspath('.')) import data_gen +from scipy.stats import gaussian_kde + + # Load your data here # X_train, y_train, X_val, y_val -def generate_data(N, data_gen): +# def generate_data(N, data_gen): +# X = [] +# y = [] + +# for _ in range(N): +# data = np.hstack(data_gen.generate_data(1024)) +# X.append(data) +# y.append(np.mean(data_gen.f(data[:1024], data[1024:]))) + +# return np.array(X), np.array(y) + + +# # Create training and validation data +# N_train = 1000 # Number of training datapoints +# N_val = 20 # Number of validation datapoints + +# X_train, y_train = generate_data(N_train, data_gen) +# X_val, y_val = generate_data(N_val, data_gen) + + +def generate_pdf_data(N, data_gen, num_points=1024): X = [] y = [] for _ in range(N): - data = np.hstack(data_gen.generate_data(1024)) - X.append(data) - y.append(np.mean(data_gen.f(data[:1024], data[1024:]))) + data = np.hstack(data_gen.generate_data(num_points)) + x_data = data[:num_points] + y_data = data[num_points:] + + # Estimate the PDFs of x and y using KDE + kde_x = gaussian_kde(x_data) + kde_y = gaussian_kde(y_data) + + # Compute f(x, y) and estimate its PDF 
using KDE + f_data = data_gen.f(x_data, y_data) + kde_f = gaussian_kde(f_data) + + # Stack the PDFs of x, y, and f(x, y) as input features and target + x_pdf = kde_x(x_data) + y_pdf = kde_y(y_data) + f_pdf = kde_f(f_data) + + X.append(np.hstack([x_pdf, y_pdf])) + y.append([f_pdf]) + return np.array(X), np.array(y) -# Create training and validation data -N_train = 1000 # Number of training datapoints -N_val = 20 # Number of validation datapoints +# Example usage: +N = 5000 # Number of training datapoints +val = 100 +X_train, y_train = generate_pdf_data(N, data_gen) +X_val, y_val = generate_pdf_data(val, data_gen) -X_train, y_train = generate_data(N_train, data_gen) -X_val, y_val = generate_data(N_val, data_gen) # Set the dimensions of input and output input_dim = X_train.shape[1] -output_dim = y_train.shape[0] +output_dim = y_train.shape[1] # Define the MLP model model = Sequential([ From 9e8b50ce196b5e0406d57437f2ab66c0b896f51b Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Mon, 3 Apr 2023 16:54:15 -0400 Subject: [PATCH 05/14] MAINT: Learn CDF's --- efficient-stats-calc/deep-learn/mlp.py | 65 +++++++++++++++----------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/efficient-stats-calc/deep-learn/mlp.py b/efficient-stats-calc/deep-learn/mlp.py index fce234d1..687fb4d5 100644 --- a/efficient-stats-calc/deep-learn/mlp.py +++ b/efficient-stats-calc/deep-learn/mlp.py @@ -10,30 +10,9 @@ import data_gen from scipy.stats import gaussian_kde +from scipy.integrate import cumtrapz -# Load your data here -# X_train, y_train, X_val, y_val - -# def generate_data(N, data_gen): -# X = [] -# y = [] - -# for _ in range(N): -# data = np.hstack(data_gen.generate_data(1024)) -# X.append(data) -# y.append(np.mean(data_gen.f(data[:1024], data[1024:]))) - -# return np.array(X), np.array(y) - - -# # Create training and validation data -# N_train = 1000 # Number of training datapoints -# N_val = 20 # Number of validation datapoints - -# X_train, y_train = 
generate_data(N_train, data_gen) -# X_val, y_val = generate_data(N_val, data_gen) - def generate_pdf_data(N, data_gen, num_points=1024): X = [] @@ -63,11 +42,43 @@ def generate_pdf_data(N, data_gen, num_points=1024): return np.array(X), np.array(y) +def generate_cdf_data(N, data_gen, num_points=1024): + X = [] + y = [] + + for _ in range(N): + data = np.hstack(data_gen.generate_data(num_points)) + x_data = data[:num_points] + y_data = data[num_points:] + + # Estimate the PDFs of x and y using KDE + kde_x = gaussian_kde(x_data) + kde_y = gaussian_kde(y_data) + + # Compute f(x, y) and estimate its PDF using KDE + f_data = data_gen.f(x_data, y_data) + kde_f = gaussian_kde(f_data) + + # Compute the CDFs of x, y, and f(x, y) using the PDFs + x_cdf = cumtrapz(kde_x(x_data), x_data, initial=0) + y_cdf = cumtrapz(kde_y(y_data), y_data, initial=0) + f_cdf = cumtrapz(kde_f(f_data), f_data, initial=0) + + # Normalize the CDFs + x_cdf /= x_cdf[-1] + y_cdf /= y_cdf[-1] + f_cdf /= f_cdf[-1] + + X.append(np.hstack([x_cdf, y_cdf])) + y.append([f_cdf]) + + return np.array(X), np.array(y) + # Example usage: -N = 5000 # Number of training datapoints -val = 100 -X_train, y_train = generate_pdf_data(N, data_gen) -X_val, y_val = generate_pdf_data(val, data_gen) +N = 1000 # Number of training datapoints +val = 20 +X_train, y_train = generate_cdf_data(N, data_gen) +X_val, y_val = generate_cdf_data(val, data_gen) # Set the dimensions of input and output @@ -85,6 +96,6 @@ def generate_pdf_data(N, data_gen, num_points=1024): model.compile(optimizer=Adam(learning_rate=1e-5), loss='mean_squared_error', metrics=['mean_absolute_error']) # Train the model -history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) +history = model.fit(X_train, y_train, batch_size=8, epochs=100, validation_data=(X_val, y_val)) model.save("models/mlp.h5") From 20417e96a871412c86e4ac9b903ffdafae40cc52 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Tue, 4 Apr 2023 13:54:31 -0400 
Subject: [PATCH 06/14] FIX: Models now learn CDFs #508 --- .../benchmarks/benchmark_cdf.py | 110 ++++++++++++ .../benchmark_mean.py} | 0 efficient-stats-calc/data_gen.py | 156 +++++++++++++++++- efficient-stats-calc/deep-learn/cnn.py | 56 +++---- efficient-stats-calc/deep-learn/mlp.py | 88 ++-------- .../sparse-grid/sparse_grid_approx.py | 71 ++++++++ 6 files changed, 367 insertions(+), 114 deletions(-) create mode 100644 efficient-stats-calc/benchmarks/benchmark_cdf.py rename efficient-stats-calc/{benchmark.py => benchmarks/benchmark_mean.py} (100%) create mode 100644 efficient-stats-calc/sparse-grid/sparse_grid_approx.py diff --git a/efficient-stats-calc/benchmarks/benchmark_cdf.py b/efficient-stats-calc/benchmarks/benchmark_cdf.py new file mode 100644 index 00000000..5eeaab93 --- /dev/null +++ b/efficient-stats-calc/benchmarks/benchmark_cdf.py @@ -0,0 +1,110 @@ +import time +import numpy as np + +import sys +import os +sys.path.append(os.path.abspath('.')) +from data_gen import * +import data_gen +from tensorflow.keras.models import load_model +from scipy.stats import gaussian_kde +from scipy.integrate import cumtrapz + +import time + +MLP_MODEL = "models/mlp.h5" + +# Constants +CNN_MODEL = 'models/cnn_1d.h5' +N_SAMPLES = 1024 + +# Brute Force -- computing s(f(x,y)) +def brute_force(x_cdf, y_cdf): + return NotImplementedError + + +# Multi-layer Perceptron +def mlp(x_cdf, y_cdf): + model = load_model(MLP_MODEL) + print(x_cdf) + print(model.predict(np.array([np.hstack([x_cdf,y_cdf])]))[0]) + return model.predict(np.array([np.hstack([x_cdf,y_cdf])]))[0] + + +# Convolutional Neural Network +def cnn(x_cdf, y_cdf): + # Load the trained CNN model + model = load_model(CNN_MODEL) + + cdf = np.array([np.hstack([x_cdf,y_cdf])]) + cdf = cdf.reshape(-1, cdf.shape[1], 1) + + + # Perform prediction using the model + prediction = model.predict(cdf)[0] + + print(prediction) + + return prediction + +def benchmark(methods, data_gen, f, n_runs=1, n_sets=10): + results = {} + + def 
calculate_cdf(data): + sorted_data = np.sort(data) + probabilities = np.linspace(0, 1, len(sorted_data)) + return sorted_data, probabilities + + for name, method in methods.items(): + # Initialize variables to store execution time and errors + exec_times = [] + errors = [] + + for _ in range(n_runs): + set_errors = [] + for i in range(n_sets): + x, y = data_gen.generate_data(N_SAMPLES) + c_true = f(x, y) + + # Calculate the CDFs of x, y, and f(x, y) + x_sorted, x_cdf = calculate_cdf(x) + y_sorted, y_cdf = calculate_cdf(y) + c_sorted, c_cdf_true = calculate_cdf(c_true) + + # Measure execution time + start_time = time.time() + c_cdf_approx = method(x_cdf, y_cdf) + exec_time = time.time() - start_time + + # Calculate the error + error = np.mean(np.abs(c_cdf_approx - c_cdf_true)) + set_errors.append(error) + + exec_times.append(exec_time) + errors.append(np.mean(set_errors)) + + # Calculate average execution time and error + avg_exec_time = np.mean(exec_times) + avg_error = np.mean(errors) + + results[name] = { + 'average_execution_time': avg_exec_time, + 'average_error': avg_error + } + + return results + + +# Define methods to benchmark +methods = { + 'mlp': mlp, + 'cnn': cnn +} + +# Run the benchmark +results = benchmark(methods, data_gen, f, n_runs=1, n_sets=1) +print("Benchmark results:") +for method, result in results.items(): + print(f"{method}:") + print(f" Average execution time: {result['average_execution_time']:.6f} seconds") + print(f" Average error: {result['average_error']:.6f}") diff --git a/efficient-stats-calc/benchmark.py b/efficient-stats-calc/benchmarks/benchmark_mean.py similarity index 100% rename from efficient-stats-calc/benchmark.py rename to efficient-stats-calc/benchmarks/benchmark_mean.py diff --git a/efficient-stats-calc/data_gen.py b/efficient-stats-calc/data_gen.py index b15fbbfb..9ff51713 100644 --- a/efficient-stats-calc/data_gen.py +++ b/efficient-stats-calc/data_gen.py @@ -1,15 +1,20 @@ import numpy as np +import concurrent.futures 
+from scipy.stats import gaussian_kde +from scipy.integrate import cumtrapz # Non-linear n-dimensional function f(x, y) def f(x, y): return np.sin(x) * np.cos(y) -# Generate random x and y data -def generate_data(n_samples): +def generate_data(n_samples, seed=None): + + if seed is not None: + np.random.seed(seed) + x = np.random.uniform(-np.pi, np.pi, n_samples) y = np.random.uniform(-np.pi, np.pi, n_samples) return x, y - # Calculate statistics s(x), s(y) and s(c) def calculate_statistics(x, y, func): s_x = np.mean(x) @@ -27,8 +32,143 @@ def create_toy_data(n_samples, func): """ Example Usage """ -n_samples = 1000 -x, y, s_x, s_y, s_c = create_toy_data(n_samples, f) -print("s(x):", s_x) -print("s(y):", s_y) -print("s(c):", s_c) +# n_samples = 1000 +# x, y, s_x, s_y, s_c = create_toy_data(n_samples, f) +# print("s(x):", s_x) +# print("s(y):", s_y) +# print("s(c):", s_c) + + + + +def generate_data_cnn(N, data_gen, width, height): + X = [] + y = [] + + for _ in range(N): + x_data, y_data = data_gen.generate_data(width * height) + x_data = x_data.reshape(height, width) + y_data = y_data.reshape(height, width) + data = np.stack([x_data, y_data], axis=-1) # shape: (height, width, 2) + X.append(data) + y.append(np.mean(data_gen.f(x_data, y_data))) + + return np.array(X), np.array(y) + +def generate_pdf_data(N, data_gen, num_points=1024): + X = [] + y = [] + + for _ in range(N): + data = np.hstack(data_gen.generate_data(num_points)) + x_data = data[:num_points] + y_data = data[num_points:] + + # Estimate the PDFs of x and y using KDE + kde_x = gaussian_kde(x_data) + kde_y = gaussian_kde(y_data) + + # Compute f(x, y) and estimate its PDF using KDE + f_data = data_gen.f(x_data, y_data) + kde_f = gaussian_kde(f_data) + + # Stack the PDFs of x, y, and f(x, y) as input features and target + x_pdf = kde_x(x_data) + y_pdf = kde_y(y_data) + f_pdf = kde_f(f_data) + + X.append(np.hstack([x_pdf, y_pdf])) + y.append([f_pdf]) + + + return np.array(X), np.array(y) + +def 
generate_cdf_data(N, data_gen, num_points=1024, grid_size=100): + X = [] + y = [] + + for _ in range(N): + data = np.hstack(data_gen.generate_data(num_points)) + x_data = data[:num_points] + y_data = data[num_points:] + + # Create a lower resolution grid + x_grid = np.linspace(x_data.min(), x_data.max(), grid_size) + y_grid = np.linspace(y_data.min(), y_data.max(), grid_size) + + # Estimate the PDFs of x and y using KDE + # Example: Use Silverman's rule of thumb + kde_x = gaussian_kde(x_data, bw_method='silverman') + kde_y = gaussian_kde(y_data, bw_method='silverman') + + # Compute f(x, y) and estimate its PDF using KDE + f_data = data_gen.f(x_data, y_data) + kde_f = gaussian_kde(f_data, bw_method='silverman') + + # Compute the CDFs of x, y, and f(x, y) using the PDFs on the grid + x_cdf = cumtrapz(kde_x(x_grid), x_grid, initial=0) + y_cdf = cumtrapz(kde_y(y_grid), y_grid, initial=0) + f_cdf = cumtrapz(kde_f(f_data), f_data, initial=0) + + # Normalize the CDFs + x_cdf /= x_cdf[-1] + y_cdf /= y_cdf[-1] + f_cdf /= f_cdf[-1] + + X.append(np.hstack([x_cdf, y_cdf])) + y.append([f_cdf]) + + return np.array(X), np.array(y) + +def generate_cdf_data_parallel(N, data_gen, num_workers=8, num_points=1024): + # Define a helper function to process each data point + def process_datapoint(data_gen): + data = np.hstack(data_gen.generate_data(num_points)) + x_data = data[:num_points] + y_data = data[num_points:] + + # Calculate the CDFs of x, y, and f(x, y) directly from the data points + x_sorted, x_cdf = calculate_cdf(x_data) + y_sorted, y_cdf = calculate_cdf(y_data) + f_data = data_gen.f(x_data, y_data) + f_sorted, f_cdf = calculate_cdf(f_data) + + return np.hstack([x_cdf, y_cdf]), f_cdf + + def calculate_cdf(data): + sorted_data = np.sort(data) + probabilities = np.linspace(0, 1, len(sorted_data)) + return sorted_data, probabilities + + X = [] + y = [] + + with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + # Parallelize the generation of data 
points + futures = [executor.submit(process_datapoint, data_gen) for _ in range(N)] + + for future in futures: + x_cdf_y_cdf, f_cdf = future.result() + X.append(x_cdf_y_cdf) + y.append([f_cdf]) + + return np.array(X), np.array(y) + +def save_cdf_data(X, y, X_filename, y_filename): + np.save(X_filename, X) + np.save(y_filename, y) + +def load_cdf_data(X_filename, y_filename): + X = np.load(X_filename) + y = np.load(y_filename) + return X, y + +N = 10000 +import data_gen + +X_train, y_train = generate_cdf_data_parallel(N, data_gen, num_workers=8) +save_cdf_data(X_train, y_train, "data/X_train.npy", "data/y_train.npy") + + +X_val, y_val = generate_cdf_data_parallel(200, data_gen, num_workers=8) +save_cdf_data(X_train, y_train, "data/X_val.npy", "data/y_val.npy") \ No newline at end of file diff --git a/efficient-stats-calc/deep-learn/cnn.py b/efficient-stats-calc/deep-learn/cnn.py index 819efd3f..6baf57f4 100644 --- a/efficient-stats-calc/deep-learn/cnn.py +++ b/efficient-stats-calc/deep-learn/cnn.py @@ -1,52 +1,38 @@ import numpy as np import tensorflow as tf from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Conv2D, Flatten, Dense +from tensorflow.keras.layers import Conv1D, Conv2D, Flatten, Dense from tensorflow.keras.optimizers import Adam +import concurrent.futures + import sys import os sys.path.append(os.path.abspath('.')) -import data_gen - -# Load your data here -# X_train, y_train, X_val, y_val - -# Ensure your input data is in the correct format: (samples, height, width, channels) -# For grayscale grid data, channels = 1 -# For RGB grid data, channels = 3 - -def generate_data_cnn(N, data_gen, width, height): - X = [] - y = [] +sys.path.append(os.path.abspath('./deep-learn')) +from data_gen import load_cdf_data - for _ in range(N): - x_data, y_data = data_gen.generate_data(width * height) - x_data = x_data.reshape(height, width) - y_data = y_data.reshape(height, width) - data = np.stack([x_data, y_data], axis=-1) # shape: 
(height, width, 2) - X.append(data) - y.append(np.mean(data_gen.f(x_data, y_data))) - return np.array(X), np.array(y) +X_train, y_train = load_cdf_data("data/X_train.npy", "data/y_train.npy") +X_val, y_val = load_cdf_data("data/X_val.npy", "data/y_val.npy") -# Create training and validation data -N_train = 100 # Number of training datapoints -N_val = 20 # Number of validation datapoints -width, height = 32, 32 # Dimensions of the square images +input_dim = X_train.shape[1] +output_dim=y_train.shape[2] -X_train, y_train = generate_data_cnn(N_train, data_gen, width, height) -X_val, y_val = generate_data_cnn(N_val, data_gen, width, height) +print(input_dim) +print(output_dim) +# Update input_shape for the 1D CNN model +input_shape = (input_dim, 1) -# Set the dimensions of input and output -input_shape = X_train.shape[1:] -output_dim = y_train.shape[0] +# Reshape the input data +X_train_cnn = X_train.reshape(-1, input_dim, 1) +X_val_cnn = X_val.reshape(-1, input_dim, 1) -# Define the CNN model +# Define the 1D CNN model model = Sequential([ - Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape), - Conv2D(64, kernel_size=(3, 3), activation='relu'), + Conv1D(32, kernel_size=3, activation='relu', input_shape=input_shape), + Conv1D(64, kernel_size=3, activation='relu'), Flatten(), Dense(64, activation='relu'), Dense(output_dim, activation='linear') @@ -56,6 +42,6 @@ def generate_data_cnn(N, data_gen, width, height): model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mean_absolute_error']) # Train the model -history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) +history = model.fit(X_train_cnn, y_train, batch_size=32, epochs=10, validation_data=(X_val_cnn, y_val)) -model.save("models/cnn.h5") \ No newline at end of file +model.save("models/cnn_1d.h5") \ No newline at end of file diff --git a/efficient-stats-calc/deep-learn/mlp.py b/efficient-stats-calc/deep-learn/mlp.py 
index 687fb4d5..8559181e 100644 --- a/efficient-stats-calc/deep-learn/mlp.py +++ b/efficient-stats-calc/deep-learn/mlp.py @@ -8,82 +8,25 @@ import os sys.path.append(os.path.abspath('.')) import data_gen +from data_gen import load_cdf_data -from scipy.stats import gaussian_kde -from scipy.integrate import cumtrapz - - - -def generate_pdf_data(N, data_gen, num_points=1024): - X = [] - y = [] - - for _ in range(N): - data = np.hstack(data_gen.generate_data(num_points)) - x_data = data[:num_points] - y_data = data[num_points:] - - # Estimate the PDFs of x and y using KDE - kde_x = gaussian_kde(x_data) - kde_y = gaussian_kde(y_data) - - # Compute f(x, y) and estimate its PDF using KDE - f_data = data_gen.f(x_data, y_data) - kde_f = gaussian_kde(f_data) - - # Stack the PDFs of x, y, and f(x, y) as input features and target - x_pdf = kde_x(x_data) - y_pdf = kde_y(y_data) - f_pdf = kde_f(f_data) - - X.append(np.hstack([x_pdf, y_pdf])) - y.append([f_pdf]) - - - return np.array(X), np.array(y) - -def generate_cdf_data(N, data_gen, num_points=1024): - X = [] - y = [] - - for _ in range(N): - data = np.hstack(data_gen.generate_data(num_points)) - x_data = data[:num_points] - y_data = data[num_points:] - - # Estimate the PDFs of x and y using KDE - kde_x = gaussian_kde(x_data) - kde_y = gaussian_kde(y_data) - - # Compute f(x, y) and estimate its PDF using KDE - f_data = data_gen.f(x_data, y_data) - kde_f = gaussian_kde(f_data) - - # Compute the CDFs of x, y, and f(x, y) using the PDFs - x_cdf = cumtrapz(kde_x(x_data), x_data, initial=0) - y_cdf = cumtrapz(kde_y(y_data), y_data, initial=0) - f_cdf = cumtrapz(kde_f(f_data), f_data, initial=0) - - # Normalize the CDFs - x_cdf /= x_cdf[-1] - y_cdf /= y_cdf[-1] - f_cdf /= f_cdf[-1] - - X.append(np.hstack([x_cdf, y_cdf])) - y.append([f_cdf]) - - return np.array(X), np.array(y) # Example usage: -N = 1000 # Number of training datapoints -val = 20 -X_train, y_train = generate_cdf_data(N, data_gen) -X_val, y_val = generate_cdf_data(val, 
data_gen) +X_train, y_train = load_cdf_data("data/X_train.npy", "data/y_train.npy") +X_val, y_val = load_cdf_data("data/X_val.npy", "data/y_val.npy") + +print("X_train shape:", X_train.shape) +print("y_train shape:", y_train.shape) +print("X_val shape:", X_val.shape) +print("y_val shape:", y_val.shape) # Set the dimensions of input and output input_dim = X_train.shape[1] -output_dim = y_train.shape[1] +output_dim = y_train.shape[2] + +print("Input Dim", input_dim) +print("Output Dim:", output_dim ) # Define the MLP model model = Sequential([ @@ -93,9 +36,12 @@ def generate_cdf_data(N, data_gen, num_points=1024): ]) # Compile the model -model.compile(optimizer=Adam(learning_rate=1e-5), loss='mean_squared_error', metrics=['mean_absolute_error']) +model.compile(optimizer=Adam(learning_rate=1e-5), loss=tf.keras.losses.Huber(delta=1.0), metrics=['mean_absolute_error']) # Train the model -history = model.fit(X_train, y_train, batch_size=8, epochs=100, validation_data=(X_val, y_val)) +history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) + +print(model.predict(np.expand_dims(X_train[0], axis=0))) +print(y_train[0]) model.save("models/mlp.h5") diff --git a/efficient-stats-calc/sparse-grid/sparse_grid_approx.py b/efficient-stats-calc/sparse-grid/sparse_grid_approx.py new file mode 100644 index 00000000..296b5c68 --- /dev/null +++ b/efficient-stats-calc/sparse-grid/sparse_grid_approx.py @@ -0,0 +1,71 @@ +import numpy as np +from scipy import interpolate + +# Step 1: Construct a sparse grid of sample points +# Define the number of dimensions +n_dim = 2 + +# Define the sparse grid +grid = ??? # Choose a sparse grid method + +# Define the number of sample points +n_points = ??? # Choose the number of sample points + +# Generate the sample points +sample_points = grid.generate_points(n_points) + +# Step 2: Interpolate the joint probability distribution +# Evaluate the PDFs at the sample points +pdf_x = ??? 
# Evaluate the PDF P(x) at the sample points +pdf_y = ??? # Evaluate the PDF P(y) at the sample points + +# Construct a meshgrid of the sample points +X, Y = np.meshgrid(sample_points[:, 0], sample_points[:, 1]) + +# Interpolate the joint probability distribution +joint_pdf = interpolate.interp2d(X, Y, pdf_x * pdf_y) + +# Step 3: Compute the PDF of the function output +# Define the function F(P(x), P(y)) +def F(p_x, p_y): + return ??? # Define the function F + +# Evaluate the function F at the sample points +f_values = F(pdf_x, pdf_y) + +# Compute the PDF of the function output +output_pdf = interpolate.interp1d(f_values, joint_pdf(X, Y)) + +# Step 4: Find the value c of the PDF of the function output that we want to approximate +c = ??? # Choose a value of the PDF of the function output that we want to approximate + +# Step 5: Find the set of input values (P(x), P(y)) that satisfy the equation F(P(x), P(y)) = PDF(c) +# Define the equation F(P(x), P(y)) = PDF(c) in terms of P(y) +def equation_p_y(p_x, p_y): + return output_pdf(F(p_x, p_y)) - c + +# Use a numerical solver to find the set of input values that satisfy the equation +input_values = ??? # Choose a numerical solver to find the input values + +# Step 6: Construct a sparse grid of sample points in the input space of the function F(P(x), P(y)) +# Define the sparse grid +grid_f = ??? # Choose a sparse grid method + +# Define the number of sample points +n_points_f = ??? 
# Choose the number of sample points + +# Generate the sample points +sample_points_f = grid_f.generate_points(n_points_f) + +# Step 7: Interpolate the function values at unsampled points +# Evaluate the function F at the sample points +f_values_f = F(sample_points_f[:, 0], sample_points_f[:, 1]) + +# Interpolate the function values +interpolated_f = interpolate.interp1d(f_values_f, sample_points_f) + +# Step 8: Compute the map F such that F(P(x), P(y)) = PDF(c) +# Use the interpolated function to compute the map +def map_F(p_x, p_y): + return interpolated_f(output_pdf(F(p_x, p_y))) + From c81fe7ee6450eb41342d506fcdecb166096a2752 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Wed, 5 Apr 2023 09:38:08 -0400 Subject: [PATCH 07/14] MAINT: fix imports --- efficient-stats-calc/benchmarks/benchmark_cdf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/efficient-stats-calc/benchmarks/benchmark_cdf.py b/efficient-stats-calc/benchmarks/benchmark_cdf.py index 5eeaab93..c288974d 100644 --- a/efficient-stats-calc/benchmarks/benchmark_cdf.py +++ b/efficient-stats-calc/benchmarks/benchmark_cdf.py @@ -7,8 +7,7 @@ from data_gen import * import data_gen from tensorflow.keras.models import load_model -from scipy.stats import gaussian_kde -from scipy.integrate import cumtrapz + import time @@ -108,3 +107,5 @@ def calculate_cdf(data): print(f"{method}:") print(f" Average execution time: {result['average_execution_time']:.6f} seconds") print(f" Average error: {result['average_error']:.6f}") + + From 074e1c2231f7446848793144750a0b033d834183 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Wed, 5 Apr 2023 09:39:04 -0400 Subject: [PATCH 08/14] DOC: Document Changes --- efficient-stats-calc/datalib/real_data.md | 30 +++++++++++++++++++++++ efficient-stats-calc/deep-learn/mlp.py | 1 - 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 efficient-stats-calc/datalib/real_data.md diff --git a/efficient-stats-calc/datalib/real_data.md 
b/efficient-stats-calc/datalib/real_data.md new file mode 100644 index 00000000..fbe5fb76 --- /dev/null +++ b/efficient-stats-calc/datalib/real_data.md @@ -0,0 +1,30 @@ +# Real Data + +Need: +- Soil Density (geowatch) +LAYERS=algorithm.standard_mobility.soil_density.SoilDensityComposite +- Soil Type (gw) +- Soil Texture (gw)? +LAYERS=datalib.soil.texture.USCSTextureComposite +- Soil Temperature (gw) +LAYERS=datalib.weather.erdc_lis72.SoilTemperature0to10 +- Soil Moisture (soilmap) + + +## Example + +``` +c=podpac.Coordinates([podpac.clinspace([43.700, 43.704, 128, 'lat'], podpac.clinspace([-72.305, -72.301, 128, 'lon'], '2023-01-01', ['lat', 'lon', 'time']) + +sm = soilmap.datalib.geowatch.SoilMoisture() +sm.eval(c) + +podpac.settings["username-PW@geowatch_server"] = {"username": "creare", "password": " Date: Thu, 6 Apr 2023 08:27:11 -0400 Subject: [PATCH 09/14] Maint: PDFs of SM --- efficient-stats-calc/datalib/real_data.md | 19 ++++ efficient-stats-calc/datalib/soil_mositure.py | 92 +++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 efficient-stats-calc/datalib/soil_mositure.py diff --git a/efficient-stats-calc/datalib/real_data.md b/efficient-stats-calc/datalib/real_data.md index fbe5fb76..d5e38604 100644 --- a/efficient-stats-calc/datalib/real_data.md +++ b/efficient-stats-calc/datalib/real_data.md @@ -28,3 +28,22 @@ podpac.settings.save() 2. Fix Caching 3. Test on small region + + +1. Pull multiple year soilmoisture coarse resolution for 3x3 pixels +only do work in the middle pixel +SoilMoisture.solmst0_to_10 + +c=podpac.Coordinates([podpac.clinspace(43.09, 42.91, 3, 'lat'), podpac.clinspace(-73.155, -72.91, 3, 'lon'), '2023-01-01'], ['lat', 'lon', 'time']) +- correct resolution? +2. Cache it + +3. Feed + + +## TODO PT 2 +1. Get weatherscale resolution data for every day of a month on a single pixel +2. Get construct a CDF as expected output +3. Get fine resolution data for that pixel at the fine resolution for every day of a month +4. 
import numpy as np
from scipy import stats  # moved to top: was imported mid-file, after its users

import soilmap
import podpac
import datetime


def calculate_cdf(data):
    """Return empirical-CDF probabilities (0..1), one per sample of `data`.

    Only the probability axis is returned; pair with ``np.sort(data)``
    for the corresponding value axis.
    """
    # Evenly spaced probabilities; only len(data) matters, so sorting
    # (which the old version did and then discarded) is unnecessary.
    return np.linspace(0, 1, len(data))


def calculate_pdf(data, days):
    """Estimate the PDF of `data` via Gaussian KDE.

    Parameters
    ----------
    data : array-like
        1-D sample values (must contain at least two distinct points
        for ``gaussian_kde``).
    days : int
        Number of evenly spaced evaluation points spanning the data range.

    Returns
    -------
    np.ndarray of shape (days,) with the KDE evaluated on the grid.
    """
    kde = stats.gaussian_kde(data)
    x_values = np.linspace(min(data), max(data), days)
    return kde.evaluate(x_values)


def get_soil_moisture_pdfs(years, month, lat_clinspace, lon_clinspace, days):
    """Build one soil-moisture PDF per year from daily center-pixel data.

    For each year, daily soil moisture for the given month is pulled one
    day at a time, NaNs are dropped, and a KDE-based PDF is estimated.
    Years with no finite data are skipped, so the first axis of the
    result may be shorter than ``len(years)``.
    """
    sm = soilmap.datalib.geowatch.SoilMoisture(cache_ctrl=['ram']).solmst_0_10
    pdfs = []

    for year in years:
        # Dates for the month up to and including day `days`.
        # BUGFIX: was range(1, days - 1), which stopped two days short of
        # the documented "up to and including the 28th day".
        dates = [f"{year}-{month:02d}-{day:02d}" for day in range(1, days + 1)]

        soil_moisture_data = []

        # Request the data one day at a time.
        for date in dates:
            current_c = podpac.Coordinates(
                [lat_clinspace, lon_clinspace, date], ['lat', 'lon', 'time'])
            o = sm.eval(current_c)
            # Keep only the center pixel of the 3x3 request
            # (assumes a 3x3 lat/lon grid with time last -- TODO confirm).
            soil_moisture_data.append(o.values[1][1][0])

        # Sanitize for NaNs before estimating the PDF.
        soil_moisture_data = np.array(soil_moisture_data)
        soil_moisture_data = soil_moisture_data[~np.isnan(soil_moisture_data)]

        if len(soil_moisture_data) > 0:
            pdfs.append(calculate_pdf(soil_moisture_data, days))

    return np.array(pdfs)


def get_soil_moisture_levels(years, month, lat_clinspace, lon_clinspace, days):
    """Return raw daily center-pixel soil-moisture values, one list per year.

    NaN values are NOT filtered here (that is the caller's job); only
    evaluations that return nothing at all are skipped.
    """
    sm = soilmap.datalib.geowatch.SoilMoisture().solmst_0_10
    soil_moisture_data_all_years = []

    for year in years:
        # Same off-by-one fix as in get_soil_moisture_pdfs.
        dates = [f"{year}-{month:02d}-{day:02d}" for day in range(1, days + 1)]

        soil_moisture_data = []

        for date in dates:
            current_c = podpac.Coordinates(
                [lat_clinspace, lon_clinspace, date], ['lat', 'lon', 'time'])
            o = sm.eval(current_c)
            # BUGFIX: the old guard `if o is np.nan:` compared an xarray
            # result to the float NaN singleton by identity and was
            # therefore always False (dead code). Skip missing results.
            if o is None:
                continue
            soil_moisture_data.append(o.values[1][1][0])

        soil_moisture_data_all_years.append(soil_moisture_data)

    return np.array(soil_moisture_data_all_years)


if __name__ == '__main__':
    # Guarded so that importing this module no longer fires network
    # requests and overwrites the .npy cache as a side effect.
    month = 6
    years = range(2012, 2023)
    lat_clinspace = podpac.clinspace(43.09, 42.91, 3, 'lat')
    lon_clinspace = podpac.clinspace(-73.155, -72.91, 3, 'lon')
    soil_moisture_data = get_soil_moisture_pdfs(
        years, month, lat_clinspace, lon_clinspace, 28)
    print(soil_moisture_data)
    podpac.utils.clear_cache(mode="all")
    np.save("datalib/data/june_soil_moisture_pdfs.npy", soil_moisture_data)
import podpac
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter
import random
from scipy.stats import gaussian_kde
import xarray as xr

'''
Toy Data Creation
'''

def generate_sm_data(N, random_seed=None, scale=1, sigma=3):
    """Generate an N x N smoothed random field used as 'mean soil moisture'.

    Parameters
    ----------
    N : int
        Grid side length.
    random_seed : int or None
        Seeds NumPy's global RNG when given, for reproducibility.
    scale : float
        Multiplier applied to the uniform noise before smoothing.
    sigma : float
        Gaussian-filter smoothing width.
    """
    if random_seed is not None:
        np.random.seed(random_seed)

    elevation_data = np.random.rand(N, N) * scale
    elevation_data = gaussian_filter(elevation_data, sigma=sigma)

    return elevation_data

def year_long_sm(mean_sm_data, monthly_coefficients, coefficient_std_dev=0.05):
    """Return 12 monthly grids: the mean field scaled by noisy coefficients.

    Each month's coefficient is drawn per-pixel from a normal distribution
    centered on that month's entry of `monthly_coefficients`.
    """
    year_long_sm_data = []

    for month_coefficient in monthly_coefficients:
        month_coefficient_array = np.random.normal(
            loc=month_coefficient, scale=coefficient_std_dev,
            size=mean_sm_data.shape)
        month_sm_data = mean_sm_data * month_coefficient_array
        year_long_sm_data.append(month_sm_data)

    return year_long_sm_data

def decade_long_sm(mean_sm_data, monthly_coefficients, years=10, coefficient_std_dev=0.05):
    """Return `years` * 12 monthly grids by repeating year_long_sm."""
    decade_long_sm_data = []

    for _ in range(years):
        year_long_data = year_long_sm(
            mean_sm_data, monthly_coefficients, coefficient_std_dev)
        decade_long_sm_data.extend(year_long_data)

    return decade_long_sm_data


def create_coordinates(N, YEARS):
    """Build podpac (time, lat, lon) coordinates for the fine toy grid."""
    fine_lat = np.linspace(-10, 10, N)   # Fine Data
    fine_lon = np.linspace(-10, 10, N)   # Coarse Data

    # One time step per month over YEARS years.
    time = np.linspace(0, (12 * YEARS) - 1, 12 * YEARS)
    coords = podpac.Coordinates(
        [time, fine_lat, fine_lon], ['time', 'lat', 'lon'])  # Fine coords

    return coords


"""
Data Modification/Utilities
"""

def g(data):
    """Sigmoid: the nonlinear 'expensive' function applied to the data."""
    return 1 / (1 + np.exp(-data))

def create_pdf(all_data, N_bins):
    """Per-pixel histograms (density-normalized) over the time axis.

    Parameters
    ----------
    all_data : np.ndarray, shape (time, rows, cols)
    N_bins : int

    Returns
    -------
    (all_pdfs, all_edges, all_centers) with bin axis FIRST:
    pdfs (N_bins, rows, cols), edges (N_bins+1, rows, cols),
    centers (N_bins, rows, cols). All pixels share one common set of
    edges computed from the pooled finite data.
    """
    n_rows, n_cols = all_data.shape[1:3]

    all_pdfs = np.zeros((N_bins, n_rows, n_cols))
    all_edges = np.zeros((N_bins + 1, n_rows, n_cols))

    # Common edges from the pooled, finite data so every pixel's
    # histogram is directly comparable.
    flat_data = all_data.ravel()
    flat_data = flat_data[np.isfinite(flat_data)]
    common_edges = np.histogram_bin_edges(flat_data, bins=N_bins)

    for i in range(n_rows):
        for j in range(n_cols):
            tmp = all_data[:, i, j].ravel()
            tmp = tmp[np.isfinite(tmp)]
            all_pdfs[:, i, j], _ = np.histogram(
                tmp, density=True, bins=common_edges)
            all_edges[:, i, j] = common_edges

    # Bin centers, along the (leading) bin axis.
    all_centers = (all_edges[:-1] + all_edges[1:]) / 2

    return all_pdfs, all_edges, all_centers

def create_stats_f_data(stats_coords_f, g_x):
    """Evaluate node `g_x` one time slice at a time and concat over time."""
    stats_f_data = []
    shape = stats_coords_f.shape
    shape = (1, shape[1], shape[2])  # set maximum time shape to 1
    for coords in stats_coords_f.iterchunks(shape):
        # Should be fast -- all local cached data
        # (except vegetation, for some reason?)
        stats_f_data.append(g_x.eval(coords))
    stats_f_data = xr.concat(stats_f_data, 'time')

    return stats_f_data


"""
Data Visualization
"""

def plot_year_long_sm(year_long_data):
    """Show one subplot per month of a year of soil-moisture grids."""
    n_months = len(year_long_data)
    n_cols = 3
    n_rows = (n_months + n_cols - 1) // n_cols

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 4))
    axes = axes.flatten()

    for month, sm_data in enumerate(year_long_data, start=1):
        ax = axes[month - 1]
        img = ax.imshow(sm_data, cmap='viridis', origin='lower')
        ax.set_title(f'Month {month}')
        ax.axis('off')

    # Remove unused subplots
    for ax in axes[n_months:]:
        ax.remove()

    # Shared colorbar on the right edge.
    fig.subplots_adjust(right=0.8)
    cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
    fig.colorbar(img, cax=cbar_ax, label='Soil Moisture')

    plt.show()

def create_histogram(m_sm_matrices, N_bins):
    """Histogram of the spatial-average soil moisture of each matrix."""
    avg_soil_moisture = [np.mean(matrix) for matrix in m_sm_matrices]

    plt.hist(avg_soil_moisture, bins=N_bins)
    plt.xlabel('Average Soil Moisture')
    plt.ylabel('Frequency')
    plt.title('Histogram of Soil Moisture Matrices')
    plt.show()


def plot_pdf_for_single_grid_square(all_pdfs, all_edges, row, col):
    """Plot the estimated PDF for one (row, col) grid square."""
    pdf = all_pdfs[:, row, col]
    edges = all_edges[:, row, col]

    centers = (edges[:-1] + edges[1:]) / 2

    plt.plot(centers, pdf)
    plt.xlabel("Value")
    plt.ylabel("Probability Density")
    plt.title(f"PDF for Grid Square ({row}, {col})")
    plt.show()

"""
Main
"""

def approx_sm(N, YEARS, N_BINS, DO_PLOT, monthly_coefficients):
    """Toy end-to-end demo: compare the true mean of g(data) against the
    cheap histogram-based approximation sum(g(centers) * pdf * widths).

    Prints the mean absolute error; plots are produced only when
    DO_PLOT is True.
    """

    """
    Data Gen
    """
    average_sm = generate_sm_data(N, random_seed=0)
    decade_long_data = decade_long_sm(average_sm, monthly_coefficients, YEARS)

    if DO_PLOT:
        year_long_data = year_long_sm(average_sm, monthly_coefficients)
        plot_year_long_sm(year_long_data)

    print("Data Generated")

    """
    Coordinate Gen
    """
    coords = create_coordinates(N, YEARS)
    print(coords)

    """
    Create PDF's for each grid square
    """
    data = np.array(decade_long_data)
    all_pdfs, all_edges, all_centers = create_pdf(data, N_BINS)

    """
    Calculate Means
    """
    true_mean = g(data).mean(0)
    # Expected value of g under the per-pixel histogram:
    # sum over bins of g(center) * density * bin width.
    cheap_mean = (g(all_centers) * all_pdfs * (np.diff(all_edges, axis=0))).sum(axis=0)
    print("Mean Absolute Error", np.abs(true_mean - cheap_mean).mean())

    # BUGFIX: was `if DO_PLOT or True:` -- a leftover debug override that
    # forced plotting even when DO_PLOT was False.
    if DO_PLOT:
        plt.figure()
        ax1 = plt.subplot(131)
        plt.imshow(true_mean)
        plt.title("Truth")
        plt.colorbar()
        plt.subplot(132, sharex=ax1, sharey=ax1)
        plt.imshow(cheap_mean,)
        plt.title(f"Cheap,Approx, nbins={N_BINS}")
        plt.colorbar()
        plt.subplot(133, sharex=ax1, sharey=ax1)
        plt.title("Cheap - Truth")
        plt.imshow(cheap_mean - true_mean, cmap='BrBG')  # , vmin=-0.02, vmax=0.02)
        plt.colorbar()
        plt.show()


if __name__ == '__main__':
    # Constants
    N = 50        # size of the fine coords
    YEARS = 10    # Total number of years to create years for
    N_BINS = 32   # Number of bins for the histogram
    DO_PLOT = False  # plots or not
    monthly_coefficients = [1.2, 1.1, 1.0, 0.8, 0.6, 0.5,
                            0.4, 0.6, 0.8, 1.0, 1.1, 1.2]
    approx_sm(N, YEARS, N_BINS, DO_PLOT, monthly_coefficients)
import podpac
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter
import random
from scipy.stats import gaussian_kde
import xarray as xr
from tqdm import tqdm
from matplotlib.colors import LogNorm


import soilmap.datalib.geowatch as geowatch


def get_coordinates():
    """Build fine (1000x1000, ~30 m) and coarse (strided) coordinate grids.

    Both grids cover a 30x30 km box centered on Creare, with a single
    placeholder time value that later gets replaced/expanded.
    Returns (c_coords, f_coords).
    """
    # Make a set of fine and coarse coordinates
    # Center coordinates about Creare
    center_lat, center_lon = 43.682102, -72.233455
    # get deltas using 110 km / deg and a 30 x 30 km box
    f_box_lat_lon = 30 / 2 / 110
    n_fine = int(30 / (30 / 1000)) # Number of 30 m boxes in a 30km square (hint, it's 1000)
    n_coarse = 3
    f_coords = podpac.Coordinates([
        podpac.clinspace(center_lat + f_box_lat_lon, center_lat - f_box_lat_lon, n_fine),
        podpac.clinspace(center_lon - f_box_lat_lon, center_lon + f_box_lat_lon, n_fine),
        ["2022-06-01T12"]], ["lat", "lon", "time"]
    )
    # Coarse grid is a strided subsample of the fine grid
    # (presumably chosen to land 3 coarse pixels in the box -- TODO confirm).
    c_coords = f_coords[::333, 100::400]

    return c_coords, f_coords

def fill_time_dimensions(c_coords, f_coords):
    """Expand the single time value to daily steps over a month, repeated
    yearly over the past 4 years, for both coarse and fine grids.

    Returns (all_c_coords, all_f_coords) with the full time dimension.
    """
    # make the coordinates for the brute-force technique
    e1 = podpac.algorithm.ExpandCoordinates(time=["0,D", "30,D", '1,D'])
    day_time_coords = podpac.Coordinates([e1.get_modified_coordinates1d(f_coords, 'time')])
    e2 = podpac.algorithm.ExpandCoordinates(source=e1, time=["-4,Y", "0,Y", '1,Y'])
    all_time_coords = podpac.Coordinates([e2.get_modified_coordinates1d(day_time_coords, 'time')])

    all_f_coords = podpac.coordinates.merge_dims([f_coords.drop('time'), all_time_coords])
    all_c_coords = podpac.coordinates.merge_dims([c_coords.drop('time'), all_time_coords])


    return all_c_coords, all_f_coords

def get_veg_node(all_f_coords):
    """Build a time-invariant vegetation node from a one-month time average.

    Evaluates the vegetation layer one time slice at a time, averages
    over time, and wraps the result in a podpac Array node on the fine
    spatial grid.
    """
    # Also, use the average of the vegetation, for initial testing
    sm = geowatch.SoilMoisture()
    o_veg = []
    vegetation = sm.vegetation
    # NOTE(review): assumes time is the third axis of all_f_coords; the
    # second [:, :, :32*5:5] slice re-applies the same stride -- confirm
    # this double-slicing is intentional.
    one_month_coords = all_f_coords[:, :, :32*5:5]
    shape = one_month_coords[:, :, :32*5:5].shape
    shape = (shape[0], shape[1], 1) # set maximum time shape to 1
    for coords in one_month_coords.iterchunks(shape):
        o_veg.append(vegetation.eval(coords))
    o_veg = xr.concat(o_veg, 'time')
    print(o_veg.mean('time').data)
    veg_node = podpac.data.Array(source=o_veg.mean('time').data, coordinates=all_f_coords.drop('time'))

    return veg_node

def get_constant_veg_node(all_f_coords, constant_veg_value):
    """Return an Array node holding a constant vegetation value on the
    spatial (no-time) grid of all_f_coords."""
    # Get the spatial coordinates from all_f_coords
    spatial_coords = all_f_coords.drop('time')

    # Create an array with the same spatial shape as all_f_coords but filled with the constant vegetation value
    constant_veg_data = np.full(spatial_coords.shape, constant_veg_value)

    # Create a PODPAC Array node with the constant vegetation data and the spatial coordinates
    constant_veg_node = podpac.data.Array(source=constant_veg_data, coordinates=spatial_coords)

    return constant_veg_node

def get_constant_land_use(all_f_coords, constant_land_value):
    """Return an Array node holding a constant land-use value on the
    spatial (no-time) grid of all_f_coords."""
    # Get the spatial coordinates from all_f_coords
    spatial_coords = all_f_coords.drop('time')

    # Create an array with the same spatial shape as all_f_coords but filled with the constant vegetation value
    constant_veg_data = np.full(spatial_coords.shape, constant_land_value)

    # Create a PODPAC Array node with the constant vegetation data and the spatial coordinates
    constant_land_node = podpac.data.Array(source=constant_veg_data, coordinates=spatial_coords)

    return constant_land_node

def get_calibrated_sm(all_c_coords, veg_node, land_use_node):
    """Evaluate absolute and relative coarse soil moisture over all time
    slices, then build a SoilMoisture node backed by bilinear Array nodes.

    Returns (all_ca_data, all_cr_data, sm_node) where the data are
    xarray objects concatenated along 'time'.
    """
    # Quick test to make sure everything makes sense
    sm = geowatch.SoilMoisture(vegetation=veg_node, vegetation_mean=veg_node, land_use=land_use_node)
    sm_ca = sm.solmst_0_10
    sm_cr = sm.relsolmst_0_10 # you can compute this from the abs soil moisture above... but I'm lazy so I'll just get this data for now

    all_ca_data = [] #coarse absolute
    all_cr_data = [] #coarse relative
    shape = all_c_coords.shape
    shape = (shape[0], shape[1], 1)
    for coords in all_c_coords.iterchunks(shape):
        all_ca_data.append(sm_ca.eval(coords))
        all_cr_data.append(sm_cr.eval(coords))

    # Merge the data in all_ca_data along the 'time' dimension
    all_ca_data = xr.concat(all_ca_data, 'time')

    # Merge the data in all_cr_data along the 'time' dimension
    all_cr_data = xr.concat(all_cr_data, 'time')

    # All Coarse Absolute Data
    all_ca_arr = podpac.data.Array(
        source=all_ca_data,
        coordinates=all_c_coords,
        interpolation="bilinear"
    )
    # All Coarse Relative Data
    all_cr_arr = podpac.data.Array(
        source=all_cr_data,
        coordinates=all_c_coords,
        interpolation="bilinear"
    )

    # Create a new SoilMoisture node that adjusts the soil moisture data based on vegetation data
    return all_ca_data, all_cr_data, geowatch.SoilMoisture(solmst_0_10=all_ca_arr, relsolmst_0_10=all_cr_arr, vegetation=veg_node, vegetation_mean=veg_node, land_use=land_use_node)

def get_stats_data(all_c_coords, all_f_coords, all_ca_edges, all_cr_edges, constant_veg_value, constant_land_value, n_bins):
    """Evaluate the calibrated SM model at the histogram-bin CENTERS.

    The bin centers are loaded into Array nodes whose time coordinates
    are mocked (only spatial interpolation is meaningful), then a
    SoilMoisture node is evaluated over them slice by slice.
    """
    # Get the centers of the bins
    all_ca_centers = (all_ca_edges[..., 1:] + all_ca_edges[..., :-1]) * 0.5
    all_cr_centers = (all_cr_edges[..., 1:] + all_cr_edges[..., :-1]) * 0.5

    # Get stats coords
    stats_coords_f = all_f_coords[:, :, :n_bins * 2:2]
    stats_coords = all_c_coords[:, :, :n_bins * 2:2]

    # Create abs soil moisture coords at the centers of the bins
    abs_ws_stats = podpac.data.Array(
        source=all_ca_centers, # Here's the weatherscale data at the centers of the bins
        coordinates=stats_coords, # Mock the coordinates
        interpolation="bilinear"
    )
    # Create relative soil moisture coords at the centers of the bins
    rel_ws_stats = podpac.data.Array(
        source=all_cr_centers,
        coordinates=stats_coords, # Mock the coordinates
        interpolation="bilinear"
    )
    # Get the spatial coordinates from stats_coords
    spatial_coords = stats_coords.drop('time')

    constant_veg_data = np.full(spatial_coords.shape, constant_veg_value)
    veg_node = podpac.data.Array(source=constant_veg_data, coordinates=spatial_coords)

    constant_land_data = np.full(spatial_coords.shape, constant_land_value)
    land_use_node = podpac.data.Array(source=constant_land_data, coordinates=spatial_coords)

    # Make the g(x) function
    sm_stats = geowatch.SoilMoisture(solmst_0_10=abs_ws_stats, relsolmst_0_10=rel_ws_stats, vegetation=veg_node, vegetation_mean=veg_node, land_use=land_use_node)

    # Evaluate g(x)
    stats_f_data = []
    shape = stats_coords_f.shape
    shape = (shape[0], shape[1], 1) # set maximum time shape to 1
    for coords in stats_coords_f.iterchunks(shape):
        stats_f_data.append(sm_stats.eval(coords)) # should be fast -- all local cached data (Except vegetation for some reason? )
    stats_f_data = xr.concat(stats_f_data, 'time')

    return stats_f_data

def get_fine_pdfs_edges(all_ca_pdfs, all_ca_edges, n_bins, all_c_coords, all_f_coords):
    """Bilinearly interpolate coarse per-pixel histogram edges and pdfs
    onto the fine grid. Returns (ca_edges_f, ca_pdfs_f)."""
    ca_edges = podpac.data.Array(
        source=all_ca_edges,
        coordinates=all_c_coords[:, :, :n_bins + 1], # Again, the time coordinate here is fake -- I just want to interpolate space
        interpolation='bilinear', )
    ca_edges_f = ca_edges.eval(all_f_coords[:, :, :n_bins + 1])
    # pdfs
    ca_pdfs = podpac.data.Array(
        source=all_ca_pdfs,
        coordinates=all_c_coords[:, :, :n_bins], # Again, the time coordinate here is fake -- I just want to interpolate space
        interpolation='bilinear',
    )
    ca_pdfs_f = ca_pdfs.eval(all_f_coords[:, :, :n_bins])

    return ca_edges_f, ca_pdfs_f

def get_fine_data(all_f_coords, sm_centered):
    """Brute-force reference: evaluate the calibrated SM node over every
    fine time slice and concatenate along 'time'."""
    # get the data for the cheap approach
    all_f_data = []
    shape = all_f_coords.shape
    shape = (shape[0], shape[1], 1)
    for coords in all_f_coords.iterchunks(shape):
        all_f_data.append(sm_centered.eval(coords))
    all_f_data = xr.concat(all_f_data, 'time')

    return all_f_data


def create_pdfs(n_bins, all_ca_data, all_cr_data):
    """Per-pixel density histograms over time for absolute and relative
    coarse soil moisture.

    Returns (all_ca_pdfs, all_ca_edges, all_cr_pdfs, all_cr_edges), with
    the bin axis LAST (unlike toy_example's create_pdf, which puts it
    first); each pixel gets its own edges here.
    """
    # Create new arrays with a third dimension of size n_bins.
    new_shape = all_ca_data.shape[:2] + (n_bins,)
    new_shape_edges = all_ca_data.shape[:2] + (n_bins + 1,)
    all_ca_pdfs = np.zeros((new_shape))
    all_ca_edges = np.zeros((new_shape_edges))
    all_cr_pdfs = np.zeros((new_shape))
    all_cr_edges = np.zeros((new_shape_edges))

    # Loop over each element in the 2D array, computing the histograms for the corresponding 1D data.
    for i in range(new_shape[0]):
        for j in range(new_shape[1]):
            # Compute the histogram for the i,j-th element of all_ca_data.
            tmp = all_ca_data.data[i, j].ravel()
            tmp = tmp[np.isfinite(tmp)]
            all_ca_pdfs[i, j, :], all_ca_edges[i, j, :] = np.histogram(tmp, density=True, bins=n_bins)

            # Compute the histogram for the i,j-th element of all_cr_data.
            tmp = all_cr_data.data[i, j].ravel()
            tmp = tmp[np.isfinite(tmp)]
            all_cr_pdfs[i, j, :], all_cr_edges[i, j, :] = np.histogram(tmp, density=True, bins=n_bins)

    return all_ca_pdfs, all_ca_edges, all_cr_pdfs, all_cr_edges

def validate_pdfs(all_ca_pdfs, all_ca_edges):
    """Sanity-check one pixel's histogram: density must integrate to 1;
    then plot it. Debug helper, not called in the main pipeline."""
    # Check on one of these
    assert np.abs(1 - (all_ca_pdfs[1, 1] * (all_ca_edges[1, 1][1:] - all_ca_edges[1, 1][:-1])).sum()) < 1e-14
    plt.stairs(all_ca_pdfs[1, 1], all_ca_edges[1, 1], fill=True)
    plt.stairs(all_ca_pdfs[1, 1], all_ca_edges[1, 1], color='k', fill=False)
    plt.show()

def compute_mse(cheap_mean, truth_mean):
    """Mean squared error between the two fields, ignoring pixels where
    either field is non-finite."""
    squared_diffs = np.nan_to_num((cheap_mean - truth_mean)**2)
    valid_mask = np.isfinite(cheap_mean) & np.isfinite(truth_mean) # mask where neither array is NaN

    # Compute the mean of the squared differences, ignoring any NaN values
    mse = np.nanmean(squared_diffs[valid_mask])
    return mse

def plot_difference(arr1, arr2, n_bins, title=""):
    """Three-panel plot: truth, cheap approximation, and their difference."""
    # Plot results
    plt.figure()
    kwargs = dict()
    ax1 = plt.subplot(131)
    plt.imshow(arr1, **kwargs)
    plt.title("Truth "+title)
    plt.colorbar()
    plt.subplot(132, sharex=ax1, sharey=ax1)
    plt.imshow(arr2, **kwargs)
    plt.title(f"Cheap,Approx, nbins={n_bins}, " + title)
    plt.colorbar()
    plt.subplot(133, sharex=ax1, sharey=ax1)
    plt.title("Cheap - Truth "+ title)
    plt.imshow(arr2 - arr1, cmap='BrBG')
    plt.colorbar()
    plt.show()

def eff_stats_calc(n_bins, veg_value, land_use_value, DO_LOG=False, DO_PLOT=False):
    """Run the full cheap-vs-brute-force comparison for one bin count.

    Pipeline: coordinates -> time expansion -> constant veg/land nodes ->
    coarse SM evaluation -> per-pixel PDFs -> model evaluated at bin
    centers -> PDFs/edges interpolated to fine grid -> cheap mean/var vs
    brute-force time statistics.

    Returns (cheap_mean, truth_mean, mse).
    """
    # Get lat and lon coordinates for coarse data with empty time
    c_coords, f_coords = get_coordinates()

    # Fill in time dimension
    all_c_coords, all_f_coords = fill_time_dimensions(c_coords, f_coords)

    if DO_LOG:
        print("Got Coords...")

    # Get a vegetation node for constant vegetation:
    # veg_node = get_veg_node(all_f_coords)
    veg_node = get_constant_veg_node(all_f_coords, veg_value)
    land_use_node = get_constant_land_use(all_f_coords, land_use_value)

    # Get a soil moisture node with centered on coarse coords
    all_ca_data, all_cr_data, sm_centered = get_calibrated_sm(all_c_coords, veg_node, land_use_node)

    if DO_LOG:
        print("Calibrated SM...")

    # Get Fine SM Data:
    all_f_data = get_fine_data(all_f_coords, sm_centered)

    if DO_LOG:
        print("Got Fine Data...")

    # Create a PDF from the data
    all_ca_pdfs, all_ca_edges, all_cr_pdfs, all_cr_edges = create_pdfs(n_bins, all_ca_data, all_cr_data)
    #TODO: Figure out how we can use coarse relative pdfs?
    # validate_pdfs(all_ca_pdfs, all_ca_edges)

    if DO_LOG:
        print("Created PDFs...")

    # Create a SM node with the centers
    stats_f_data = get_stats_data(all_c_coords, all_f_coords, all_ca_edges, all_cr_edges, veg_value, land_use_value ,n_bins)

    if DO_LOG:
        print("Got Stats Data...")

    # Get the fine pdfs by interpolating coarse-scale data edges
    # TODO: figure out how to use coarse relative pdfs?
    ca_edges_f, ca_pdfs_f = get_fine_pdfs_edges(all_ca_pdfs, all_ca_edges, n_bins, all_c_coords, all_f_coords)

    if DO_LOG:
        print("Got Fine PDFs...")

    # Calculate cheap mean and variance:
    # E[g] ~= sum over bins of g(center) * density * bin width
    cheap_mean = (stats_f_data.data * ca_pdfs_f.data * (ca_edges_f[..., 1:].data - ca_edges_f[..., :-1].data)).sum(axis=-1)
    deviations = stats_f_data.data - cheap_mean[..., np.newaxis]
    cheap_var = (deviations**2 * ca_pdfs_f.data * (ca_edges_f[..., 1:].data - ca_edges_f[..., :-1].data)).sum(axis=-1)


    if DO_LOG:
        print("Calculated Cheap Mean...")

    # Calculate correct mean (brute force over the time axis)
    truth_mean = all_f_data.mean('time').data
    truth_var = all_f_data.var('time').data

    if DO_LOG:
        print("Calculated Truth Mean...")

    if DO_PLOT:
        plot_difference(truth_mean, cheap_mean, n_bins, title="Mean")
        plot_difference((truth_var), (cheap_var), n_bins, title="Variance")


    # NOTE(review): std devs are computed but not returned or plotted.
    cheap_std_dev = np.sqrt(cheap_var)
    truth_std_dev = np.sqrt(truth_var)

    if DO_LOG:
        print("Calculated StdDev...")

    # Compute the squared differences, ignoring any NaN values
    mse = compute_mse(cheap_mean, truth_mean)

    return cheap_mean, truth_mean, mse


if __name__ == '__main__':
    """
    1. Generate PDFs out of coarse data
    2. Calculate centers for that data
    3. Create SoilMoisutre node using those centers
    4. Evaluate SM node at those centers
    5. Multiply eval by pdf and bin widths
    """
    with podpac.settings:
        # Constants
        VEG_VALUE = 50
        LAND_USE_VALUE = 0
        DO_LOG = True
        DO_PLOT = False

        # Cache
        podpac.settings["DEFAULT_CACHE"] = ["ram", "disk"]
        podpac.settings["MULTITHREADING"] = False
        # podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
        # podpac.utils.clear_cache(mode='ram')
        # podpac.utils.clear_cache(mode='disk')

        # Initialize lists to store the bin counts and the corresponding MSE values
        bin_counts = []
        mse_values = []

        # Initialize the progress bar
        pbar = tqdm(total=7, desc='Processing bins', ncols=155)

        # Run eff_stats_calc for 1, 2, 4, 8, 16, 32, 64 bins
        # [1, 2, 4, 8, 16, 32, 64]:
        for n_bins in [1, 2, 4, 8, 16, 32, 64]:
            # Plots are forced on for the final (64-bin) run only.
            if n_bins == 64:
                _, _, mse = eff_stats_calc(n_bins, VEG_VALUE, LAND_USE_VALUE, DO_LOG, True)
            else:
                _, _, mse = eff_stats_calc(n_bins, VEG_VALUE, LAND_USE_VALUE, DO_LOG, DO_PLOT)
            bin_counts.append(n_bins)
            mse_values.append(mse)

            #Update the progress bar
            pbar.update(1)

        pbar.close()

        # Plot the results
        plt.figure(figsize=(10,6))
        plt.loglog(bin_counts, mse_values, marker='o')
        plt.title('Mean Squared Error vs Number of Bins')
        plt.xlabel('Number of Bins')
        plt.ylabel('Mean Squared Error')
        plt.grid(True)
        plt.show()
efficient-stats-calc/deep-learn/cnn.py | 47 ----- efficient-stats-calc/deep-learn/mlp.py | 46 ----- .../{pdfs => }/real_example.py | 0 .../sparse-grid/sparse_grid_approx.py | 71 ------- .../{pdfs/toy_examples.py => toy_example.py} | 0 10 files changed, 683 deletions(-) delete mode 100644 efficient-stats-calc/benchmarks/benchmark_cdf.py delete mode 100644 efficient-stats-calc/benchmarks/benchmark_mean.py delete mode 100644 efficient-stats-calc/data_gen.py delete mode 100644 efficient-stats-calc/datalib/real_data.md delete mode 100644 efficient-stats-calc/datalib/soil_mositure.py delete mode 100644 efficient-stats-calc/deep-learn/cnn.py delete mode 100644 efficient-stats-calc/deep-learn/mlp.py rename efficient-stats-calc/{pdfs => }/real_example.py (100%) delete mode 100644 efficient-stats-calc/sparse-grid/sparse_grid_approx.py rename efficient-stats-calc/{pdfs/toy_examples.py => toy_example.py} (100%) diff --git a/efficient-stats-calc/benchmarks/benchmark_cdf.py b/efficient-stats-calc/benchmarks/benchmark_cdf.py deleted file mode 100644 index c288974d..00000000 --- a/efficient-stats-calc/benchmarks/benchmark_cdf.py +++ /dev/null @@ -1,111 +0,0 @@ -import time -import numpy as np - -import sys -import os -sys.path.append(os.path.abspath('.')) -from data_gen import * -import data_gen -from tensorflow.keras.models import load_model - - -import time - -MLP_MODEL = "models/mlp.h5" - -# Constants -CNN_MODEL = 'models/cnn_1d.h5' -N_SAMPLES = 1024 - -# Brute Force -- computing s(f(x,y)) -def brute_force(x_cdf, y_cdf): - return NotImplementedError - - -# Multi-layer Perceptron -def mlp(x_cdf, y_cdf): - model = load_model(MLP_MODEL) - print(x_cdf) - print(model.predict(np.array([np.hstack([x_cdf,y_cdf])]))[0]) - return model.predict(np.array([np.hstack([x_cdf,y_cdf])]))[0] - - -# Convolutional Neural Network -def cnn(x_cdf, y_cdf): - # Load the trained CNN model - model = load_model(CNN_MODEL) - - cdf = np.array([np.hstack([x_cdf,y_cdf])]) - cdf = cdf.reshape(-1, cdf.shape[1], 
1) - - - # Perform prediction using the model - prediction = model.predict(cdf)[0] - - print(prediction) - - return prediction - -def benchmark(methods, data_gen, f, n_runs=1, n_sets=10): - results = {} - - def calculate_cdf(data): - sorted_data = np.sort(data) - probabilities = np.linspace(0, 1, len(sorted_data)) - return sorted_data, probabilities - - for name, method in methods.items(): - # Initialize variables to store execution time and errors - exec_times = [] - errors = [] - - for _ in range(n_runs): - set_errors = [] - for i in range(n_sets): - x, y = data_gen.generate_data(N_SAMPLES) - c_true = f(x, y) - - # Calculate the CDFs of x, y, and f(x, y) - x_sorted, x_cdf = calculate_cdf(x) - y_sorted, y_cdf = calculate_cdf(y) - c_sorted, c_cdf_true = calculate_cdf(c_true) - - # Measure execution time - start_time = time.time() - c_cdf_approx = method(x_cdf, y_cdf) - exec_time = time.time() - start_time - - # Calculate the error - error = np.mean(np.abs(c_cdf_approx - c_cdf_true)) - set_errors.append(error) - - exec_times.append(exec_time) - errors.append(np.mean(set_errors)) - - # Calculate average execution time and error - avg_exec_time = np.mean(exec_times) - avg_error = np.mean(errors) - - results[name] = { - 'average_execution_time': avg_exec_time, - 'average_error': avg_error - } - - return results - - -# Define methods to benchmark -methods = { - 'mlp': mlp, - 'cnn': cnn -} - -# Run the benchmark -results = benchmark(methods, data_gen, f, n_runs=1, n_sets=1) -print("Benchmark results:") -for method, result in results.items(): - print(f"{method}:") - print(f" Average execution time: {result['average_execution_time']:.6f} seconds") - print(f" Average error: {result['average_error']:.6f}") - - diff --git a/efficient-stats-calc/benchmarks/benchmark_mean.py b/efficient-stats-calc/benchmarks/benchmark_mean.py deleted file mode 100644 index f07d1fc0..00000000 --- a/efficient-stats-calc/benchmarks/benchmark_mean.py +++ /dev/null @@ -1,93 +0,0 @@ -import time 
-import numpy as np -from data_gen import * -from tensorflow.keras.models import load_model - -MLP_MODEL = "models/mlp.h5" - -# Constants -CNN_MODEL = 'models/cnn.h5' -WIDTH = 32 -HEIGHT = 32 - -# Brute Force -- computing s(f(x,y)) -def brute_force(x, y): - return np.mean(f(x,y)) - -# Multi-layer Perceptron -def mlp(x, y): - model = load_model(MLP_MODEL) - return model.predict(np.array([np.hstack([x,y])]))[0][0] - - -# Convolutional Neural Network -def cnn(x, y): - # Load the trained CNN model - model = load_model(CNN_MODEL) - - # Reshape x and y into the format expected by the CNN - x_data = x.reshape(HEIGHT, WIDTH) - y_data = y.reshape(HEIGHT, WIDTH) - data = np.stack([x_data, y_data], axis=-1) # shape: (height, width, 2) - input_data = np.expand_dims(data, axis=0) # shape: (1, height, width, 2) - - # Perform prediction using the model - prediction = model.predict(input_data)[0][0] - - return prediction - - -# Benchmarking tool -def benchmark(methods, x, y, f, n_runs=10): - results = {} - - # Calculate the ground truth s(c) - c_true = f(x, y) - s_c_true = np.mean(c_true) - - for name, method in methods.items(): - # Initialize variables to store execution time and errors - exec_times = [] - errors = [] - - for _ in range(n_runs): - # Measure execution time - start_time = time.time() - s_c_approx = method(x, y) - exec_time = time.time() - start_time - exec_times.append(exec_time) - - # Calculate the error - error = np.abs(s_c_approx - s_c_true) - errors.append(error) - - # Calculate average execution time and error - avg_exec_time = np.mean(exec_times) - avg_error = np.mean(errors) - - results[name] = { - 'average_execution_time': avg_exec_time, - 'average_error': avg_error - } - - return results - -# Define methods to benchmark -methods = { - 'brute_force': brute_force, - 'multilayer_perceptron': mlp, - 'convolutional': cnn -} - -# Generate toy data -n_samples = WIDTH*HEIGHT -x, y = generate_data(n_samples) - - -# Run the benchmark -results = benchmark(methods, x, 
y, f) -print("Benchmark results:") -for method, result in results.items(): - print(f"{method}:") - print(f" Average execution time: {result['average_execution_time']:.6f} seconds") - print(f" Average error: {result['average_error']:.6f}") diff --git a/efficient-stats-calc/data_gen.py b/efficient-stats-calc/data_gen.py deleted file mode 100644 index 9ff51713..00000000 --- a/efficient-stats-calc/data_gen.py +++ /dev/null @@ -1,174 +0,0 @@ -import numpy as np -import concurrent.futures -from scipy.stats import gaussian_kde -from scipy.integrate import cumtrapz - -# Non-linear n-dimensional function f(x, y) -def f(x, y): - return np.sin(x) * np.cos(y) - -def generate_data(n_samples, seed=None): - - if seed is not None: - np.random.seed(seed) - - x = np.random.uniform(-np.pi, np.pi, n_samples) - y = np.random.uniform(-np.pi, np.pi, n_samples) - return x, y -# Calculate statistics s(x), s(y) and s(c) -def calculate_statistics(x, y, func): - s_x = np.mean(x) - s_y = np.mean(y) - c = func(x, y) - s_c = np.mean(c) - return s_x, s_y, s_c - -# Generate toy data and calculate statistics -def create_toy_data(n_samples, func): - x, y = generate_data(n_samples) - s_x, s_y, s_c = calculate_statistics(x, y, func) - return x, y, s_x, s_y, s_c - -""" -Example Usage -""" -# n_samples = 1000 -# x, y, s_x, s_y, s_c = create_toy_data(n_samples, f) -# print("s(x):", s_x) -# print("s(y):", s_y) -# print("s(c):", s_c) - - - - -def generate_data_cnn(N, data_gen, width, height): - X = [] - y = [] - - for _ in range(N): - x_data, y_data = data_gen.generate_data(width * height) - x_data = x_data.reshape(height, width) - y_data = y_data.reshape(height, width) - data = np.stack([x_data, y_data], axis=-1) # shape: (height, width, 2) - X.append(data) - y.append(np.mean(data_gen.f(x_data, y_data))) - - return np.array(X), np.array(y) - -def generate_pdf_data(N, data_gen, num_points=1024): - X = [] - y = [] - - for _ in range(N): - data = np.hstack(data_gen.generate_data(num_points)) - x_data = 
data[:num_points] - y_data = data[num_points:] - - # Estimate the PDFs of x and y using KDE - kde_x = gaussian_kde(x_data) - kde_y = gaussian_kde(y_data) - - # Compute f(x, y) and estimate its PDF using KDE - f_data = data_gen.f(x_data, y_data) - kde_f = gaussian_kde(f_data) - - # Stack the PDFs of x, y, and f(x, y) as input features and target - x_pdf = kde_x(x_data) - y_pdf = kde_y(y_data) - f_pdf = kde_f(f_data) - - X.append(np.hstack([x_pdf, y_pdf])) - y.append([f_pdf]) - - - return np.array(X), np.array(y) - -def generate_cdf_data(N, data_gen, num_points=1024, grid_size=100): - X = [] - y = [] - - for _ in range(N): - data = np.hstack(data_gen.generate_data(num_points)) - x_data = data[:num_points] - y_data = data[num_points:] - - # Create a lower resolution grid - x_grid = np.linspace(x_data.min(), x_data.max(), grid_size) - y_grid = np.linspace(y_data.min(), y_data.max(), grid_size) - - # Estimate the PDFs of x and y using KDE - # Example: Use Silverman's rule of thumb - kde_x = gaussian_kde(x_data, bw_method='silverman') - kde_y = gaussian_kde(y_data, bw_method='silverman') - - # Compute f(x, y) and estimate its PDF using KDE - f_data = data_gen.f(x_data, y_data) - kde_f = gaussian_kde(f_data, bw_method='silverman') - - # Compute the CDFs of x, y, and f(x, y) using the PDFs on the grid - x_cdf = cumtrapz(kde_x(x_grid), x_grid, initial=0) - y_cdf = cumtrapz(kde_y(y_grid), y_grid, initial=0) - f_cdf = cumtrapz(kde_f(f_data), f_data, initial=0) - - # Normalize the CDFs - x_cdf /= x_cdf[-1] - y_cdf /= y_cdf[-1] - f_cdf /= f_cdf[-1] - - X.append(np.hstack([x_cdf, y_cdf])) - y.append([f_cdf]) - - return np.array(X), np.array(y) - -def generate_cdf_data_parallel(N, data_gen, num_workers=8, num_points=1024): - # Define a helper function to process each data point - def process_datapoint(data_gen): - data = np.hstack(data_gen.generate_data(num_points)) - x_data = data[:num_points] - y_data = data[num_points:] - - # Calculate the CDFs of x, y, and f(x, y) directly 
from the data points - x_sorted, x_cdf = calculate_cdf(x_data) - y_sorted, y_cdf = calculate_cdf(y_data) - f_data = data_gen.f(x_data, y_data) - f_sorted, f_cdf = calculate_cdf(f_data) - - return np.hstack([x_cdf, y_cdf]), f_cdf - - def calculate_cdf(data): - sorted_data = np.sort(data) - probabilities = np.linspace(0, 1, len(sorted_data)) - return sorted_data, probabilities - - X = [] - y = [] - - with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: - # Parallelize the generation of data points - futures = [executor.submit(process_datapoint, data_gen) for _ in range(N)] - - for future in futures: - x_cdf_y_cdf, f_cdf = future.result() - X.append(x_cdf_y_cdf) - y.append([f_cdf]) - - return np.array(X), np.array(y) - -def save_cdf_data(X, y, X_filename, y_filename): - np.save(X_filename, X) - np.save(y_filename, y) - -def load_cdf_data(X_filename, y_filename): - X = np.load(X_filename) - y = np.load(y_filename) - return X, y - -N = 10000 -import data_gen - -X_train, y_train = generate_cdf_data_parallel(N, data_gen, num_workers=8) -save_cdf_data(X_train, y_train, "data/X_train.npy", "data/y_train.npy") - - -X_val, y_val = generate_cdf_data_parallel(200, data_gen, num_workers=8) -save_cdf_data(X_train, y_train, "data/X_val.npy", "data/y_val.npy") \ No newline at end of file diff --git a/efficient-stats-calc/datalib/real_data.md b/efficient-stats-calc/datalib/real_data.md deleted file mode 100644 index d5e38604..00000000 --- a/efficient-stats-calc/datalib/real_data.md +++ /dev/null @@ -1,49 +0,0 @@ -# Real Data - -Need: -- Soil Density (geowatch) -LAYERS=algorithm.standard_mobility.soil_density.SoilDensityComposite -- Soil Type (gw) -- Soil Texture (gw)? 
-LAYERS=datalib.soil.texture.USCSTextureComposite -- Soil Temperature (gw) -LAYERS=datalib.weather.erdc_lis72.SoilTemperature0to10 -- Soil Moisture (soilmap) - - -## Example - -``` -c=podpac.Coordinates([podpac.clinspace([43.700, 43.704, 128, 'lat'], podpac.clinspace([-72.305, -72.301, 128, 'lon'], '2023-01-01', ['lat', 'lon', 'time']) - -sm = soilmap.datalib.geowatch.SoilMoisture() -sm.eval(c) - -podpac.settings["username-PW@geowatch_server"] = {"username": "creare", "password": " 0: - pdfs.append(calculate_pdf(soil_moisture_data, days)) - print(year) - - return np.array(pdfs) - -def get_soil_moisture_levels(years, month, lat_clinspace, lon_clinspace, days): - sm = soilmap.datalib.geowatch.SoilMoisture().solmst_0_10 - soil_moisture_data_all_years = [] - - for year in years: - # Create a list of dates for the specified month up to and including the 28th day - dates = [f"{year}-{month:02d}-{day:02d}" for day in range(1, days-1)] - - soil_moisture_data = [] - - # request the data one day at a time: - for date in dates: - # Get the coordinates for the current date - current_c = podpac.Coordinates([lat_clinspace, lon_clinspace, date], ['lat', 'lon', 'time']) - print(current_c) - # Get soil moisture data for the given coordinates - o = sm.eval(current_c) - - if o is np.nan: - continue - print(o) - soil_moisture_data.append(o.values[1][1][0]) - - soil_moisture_data_all_years.append(soil_moisture_data) - - return np.array(soil_moisture_data_all_years) - -from scipy import stats - - - -month = 6 -years = range(2012, 2023) -lat_clinspace = podpac.clinspace(43.09, 42.91, 3, 'lat') -lon_clinspace = podpac.clinspace(-73.155, -72.91, 3, 'lon') -soil_moisture_data = get_soil_moisture_pdfs(years, month, lat_clinspace, lon_clinspace, 28) -print(soil_moisture_data) -podpac.utils.clear_cache(mode="all") -np.save("datalib/data/june_soil_moisture_pdfs.npy", soil_moisture_data) \ No newline at end of file diff --git a/efficient-stats-calc/deep-learn/cnn.py 
b/efficient-stats-calc/deep-learn/cnn.py deleted file mode 100644 index 6baf57f4..00000000 --- a/efficient-stats-calc/deep-learn/cnn.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np -import tensorflow as tf -from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Conv1D, Conv2D, Flatten, Dense -from tensorflow.keras.optimizers import Adam -import concurrent.futures - - - -import sys -import os -sys.path.append(os.path.abspath('.')) -sys.path.append(os.path.abspath('./deep-learn')) -from data_gen import load_cdf_data - - -X_train, y_train = load_cdf_data("data/X_train.npy", "data/y_train.npy") -X_val, y_val = load_cdf_data("data/X_val.npy", "data/y_val.npy") - -input_dim = X_train.shape[1] -output_dim=y_train.shape[2] - -print(input_dim) -print(output_dim) -# Update input_shape for the 1D CNN model -input_shape = (input_dim, 1) - -# Reshape the input data -X_train_cnn = X_train.reshape(-1, input_dim, 1) -X_val_cnn = X_val.reshape(-1, input_dim, 1) - -# Define the 1D CNN model -model = Sequential([ - Conv1D(32, kernel_size=3, activation='relu', input_shape=input_shape), - Conv1D(64, kernel_size=3, activation='relu'), - Flatten(), - Dense(64, activation='relu'), - Dense(output_dim, activation='linear') -]) - -# Compile the model -model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mean_absolute_error']) - -# Train the model -history = model.fit(X_train_cnn, y_train, batch_size=32, epochs=10, validation_data=(X_val_cnn, y_val)) - -model.save("models/cnn_1d.h5") \ No newline at end of file diff --git a/efficient-stats-calc/deep-learn/mlp.py b/efficient-stats-calc/deep-learn/mlp.py deleted file mode 100644 index 7cdbf8a7..00000000 --- a/efficient-stats-calc/deep-learn/mlp.py +++ /dev/null @@ -1,46 +0,0 @@ -import numpy as np -import tensorflow as tf -from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Dense -from tensorflow.keras.optimizers import Adam - -import sys 
-import os -sys.path.append(os.path.abspath('.')) -from data_gen import load_cdf_data - - -# Example usage: -X_train, y_train = load_cdf_data("data/X_train.npy", "data/y_train.npy") -X_val, y_val = load_cdf_data("data/X_val.npy", "data/y_val.npy") - -print("X_train shape:", X_train.shape) -print("y_train shape:", y_train.shape) -print("X_val shape:", X_val.shape) -print("y_val shape:", y_val.shape) - - -# Set the dimensions of input and output -input_dim = X_train.shape[1] -output_dim = y_train.shape[2] - -print("Input Dim", input_dim) -print("Output Dim:", output_dim ) - -# Define the MLP model -model = Sequential([ - Dense(64, activation='relu', input_shape=(input_dim,)), - Dense(32, activation='relu'), - Dense(output_dim, activation='linear') -]) - -# Compile the model -model.compile(optimizer=Adam(learning_rate=1e-5), loss=tf.keras.losses.Huber(delta=1.0), metrics=['mean_absolute_error']) - -# Train the model -history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) - -print(model.predict(np.expand_dims(X_train[0], axis=0))) -print(y_train[0]) - -model.save("models/mlp.h5") diff --git a/efficient-stats-calc/pdfs/real_example.py b/efficient-stats-calc/real_example.py similarity index 100% rename from efficient-stats-calc/pdfs/real_example.py rename to efficient-stats-calc/real_example.py diff --git a/efficient-stats-calc/sparse-grid/sparse_grid_approx.py b/efficient-stats-calc/sparse-grid/sparse_grid_approx.py deleted file mode 100644 index 296b5c68..00000000 --- a/efficient-stats-calc/sparse-grid/sparse_grid_approx.py +++ /dev/null @@ -1,71 +0,0 @@ -import numpy as np -from scipy import interpolate - -# Step 1: Construct a sparse grid of sample points -# Define the number of dimensions -n_dim = 2 - -# Define the sparse grid -grid = ??? # Choose a sparse grid method - -# Define the number of sample points -n_points = ??? 
# Choose the number of sample points - -# Generate the sample points -sample_points = grid.generate_points(n_points) - -# Step 2: Interpolate the joint probability distribution -# Evaluate the PDFs at the sample points -pdf_x = ??? # Evaluate the PDF P(x) at the sample points -pdf_y = ??? # Evaluate the PDF P(y) at the sample points - -# Construct a meshgrid of the sample points -X, Y = np.meshgrid(sample_points[:, 0], sample_points[:, 1]) - -# Interpolate the joint probability distribution -joint_pdf = interpolate.interp2d(X, Y, pdf_x * pdf_y) - -# Step 3: Compute the PDF of the function output -# Define the function F(P(x), P(y)) -def F(p_x, p_y): - return ??? # Define the function F - -# Evaluate the function F at the sample points -f_values = F(pdf_x, pdf_y) - -# Compute the PDF of the function output -output_pdf = interpolate.interp1d(f_values, joint_pdf(X, Y)) - -# Step 4: Find the value c of the PDF of the function output that we want to approximate -c = ??? # Choose a value of the PDF of the function output that we want to approximate - -# Step 5: Find the set of input values (P(x), P(y)) that satisfy the equation F(P(x), P(y)) = PDF(c) -# Define the equation F(P(x), P(y)) = PDF(c) in terms of P(y) -def equation_p_y(p_x, p_y): - return output_pdf(F(p_x, p_y)) - c - -# Use a numerical solver to find the set of input values that satisfy the equation -input_values = ??? # Choose a numerical solver to find the input values - -# Step 6: Construct a sparse grid of sample points in the input space of the function F(P(x), P(y)) -# Define the sparse grid -grid_f = ??? # Choose a sparse grid method - -# Define the number of sample points -n_points_f = ??? 
# Choose the number of sample points - -# Generate the sample points -sample_points_f = grid_f.generate_points(n_points_f) - -# Step 7: Interpolate the function values at unsampled points -# Evaluate the function F at the sample points -f_values_f = F(sample_points_f[:, 0], sample_points_f[:, 1]) - -# Interpolate the function values -interpolated_f = interpolate.interp1d(f_values_f, sample_points_f) - -# Step 8: Compute the map F such that F(P(x), P(y)) = PDF(c) -# Use the interpolated function to compute the map -def map_F(p_x, p_y): - return interpolated_f(output_pdf(F(p_x, p_y))) - diff --git a/efficient-stats-calc/pdfs/toy_examples.py b/efficient-stats-calc/toy_example.py similarity index 100% rename from efficient-stats-calc/pdfs/toy_examples.py rename to efficient-stats-calc/toy_example.py From a2c883c35ffe5bd9ebdcf140ba505589c7588e46 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Mon, 22 May 2023 14:29:09 -0400 Subject: [PATCH 13/14] DIR: move to "soil_moisture" --- efficient-stats-calc/{ => soil_moisture}/real_example.py | 0 efficient-stats-calc/{ => soil_moisture}/toy_example.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename efficient-stats-calc/{ => soil_moisture}/real_example.py (100%) rename efficient-stats-calc/{ => soil_moisture}/toy_example.py (100%) diff --git a/efficient-stats-calc/real_example.py b/efficient-stats-calc/soil_moisture/real_example.py similarity index 100% rename from efficient-stats-calc/real_example.py rename to efficient-stats-calc/soil_moisture/real_example.py diff --git a/efficient-stats-calc/toy_example.py b/efficient-stats-calc/soil_moisture/toy_example.py similarity index 100% rename from efficient-stats-calc/toy_example.py rename to efficient-stats-calc/soil_moisture/toy_example.py From 0a90f515f3ef1532f2b6101714298d905b8cc378 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Tue, 23 May 2023 10:45:25 -0400 Subject: [PATCH 14/14] CODE: Changed x axis of MSE plot to linear --- 
efficient-stats-calc/soil_moisture/real_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/efficient-stats-calc/soil_moisture/real_example.py b/efficient-stats-calc/soil_moisture/real_example.py index 264839bc..efcbe194 100644 --- a/efficient-stats-calc/soil_moisture/real_example.py +++ b/efficient-stats-calc/soil_moisture/real_example.py @@ -373,7 +373,7 @@ def eff_stats_calc(n_bins, veg_value, land_use_value, DO_LOG=False, DO_PLOT=Fals # Plot the results plt.figure(figsize=(10,6)) - plt.loglog(bin_counts, mse_values, marker='o') + plt.semilogy(bin_counts, mse_values, marker='o') plt.title('Mean Squared Error vs Number of Bins') plt.xlabel('Number of Bins') plt.ylabel('Mean Squared Error')