From 9166543692a5bec50a5e6e43a489b5099de56717 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Mon, 3 Apr 2023 12:15:30 -0400 Subject: [PATCH 01/14] MAINT: toy_data and benchmark #508 --- efficient-stats-calc/benchmark.py | 60 +++++++++++++++++++++++++++++++ efficient-stats-calc/data_gen.py | 34 ++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 efficient-stats-calc/benchmark.py create mode 100644 efficient-stats-calc/data_gen.py diff --git a/efficient-stats-calc/benchmark.py b/efficient-stats-calc/benchmark.py new file mode 100644 index 00000000..6d681ae9 --- /dev/null +++ b/efficient-stats-calc/benchmark.py @@ -0,0 +1,60 @@ +import time +import numpy as np +from data_gen import * + + +# Example method (replace with your own methods) +def brute_force(x, y): + return np.mean(f(x,y)) + +# Benchmarking tool +def benchmark(methods, x, y, f, n_runs=10): + results = {} + + # Calculate the ground truth s(c) + c_true = f(x, y) + s_c_true = np.mean(c_true) + + for name, method in methods.items(): + # Initialize variables to store execution time and errors + exec_times = [] + errors = [] + + for _ in range(n_runs): + # Measure execution time + start_time = time.time() + s_c_approx = method(x, y) + exec_time = time.time() - start_time + exec_times.append(exec_time) + + # Calculate the error + error = np.abs(s_c_approx - s_c_true) + errors.append(error) + + # Calculate average execution time and error + avg_exec_time = np.mean(exec_times) + avg_error = np.mean(errors) + + results[name] = { + 'average_execution_time': avg_exec_time, + 'average_error': avg_error + } + + return results + +# Define methods to benchmark +methods = { + 'brute_force': brute_force +} + +# Generate toy data +n_samples = 1000 +x, y = generate_data(n_samples) + +# Run the benchmark +results = benchmark(methods, x, y, f) +print("Benchmark results:") +for method, result in results.items(): + print(f"{method}:") + print(f" Average execution time: {result['average_execution_time']:.6f} 
seconds") + print(f" Average error: {result['average_error']:.6f}") diff --git a/efficient-stats-calc/data_gen.py b/efficient-stats-calc/data_gen.py new file mode 100644 index 00000000..b15fbbfb --- /dev/null +++ b/efficient-stats-calc/data_gen.py @@ -0,0 +1,34 @@ +import numpy as np + +# Non-linear n-dimensional function f(x, y) +def f(x, y): + return np.sin(x) * np.cos(y) + +# Generate random x and y data +def generate_data(n_samples): + x = np.random.uniform(-np.pi, np.pi, n_samples) + y = np.random.uniform(-np.pi, np.pi, n_samples) + return x, y + +# Calculate statistics s(x), s(y) and s(c) +def calculate_statistics(x, y, func): + s_x = np.mean(x) + s_y = np.mean(y) + c = func(x, y) + s_c = np.mean(c) + return s_x, s_y, s_c + +# Generate toy data and calculate statistics +def create_toy_data(n_samples, func): + x, y = generate_data(n_samples) + s_x, s_y, s_c = calculate_statistics(x, y, func) + return x, y, s_x, s_y, s_c + +""" +Example Usage +""" +n_samples = 1000 +x, y, s_x, s_y, s_c = create_toy_data(n_samples, f) +print("s(x):", s_x) +print("s(y):", s_y) +print("s(c):", s_c) From 40a54212bbd955f8c5fa9dea5ed3388864be5124 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Mon, 3 Apr 2023 13:59:12 -0400 Subject: [PATCH 02/14] MAINT: MLP Toy Implementation #508 --- .gitignore | 3 ++ efficient-stats-calc/benchmark.py | 14 ++++++-- efficient-stats-calc/deep-learn/cnn.py | 35 ++++++++++++++++++ efficient-stats-calc/deep-learn/mlp.py | 50 ++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 efficient-stats-calc/deep-learn/cnn.py create mode 100644 efficient-stats-calc/deep-learn/mlp.py diff --git a/.gitignore b/.gitignore index d9d6232d..2ab57bd8 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,6 @@ doc/source/example-links.inc doc/source/changelog.md .pytest_cache/ build/ + +# machine learning +.h5 \ No newline at end of file diff --git a/efficient-stats-calc/benchmark.py b/efficient-stats-calc/benchmark.py index 
6d681ae9..121a336e 100644 --- a/efficient-stats-calc/benchmark.py +++ b/efficient-stats-calc/benchmark.py @@ -1,12 +1,20 @@ import time import numpy as np from data_gen import * +from tensorflow.keras.models import load_model +MLP_MODEL = "models/mlp.h5" -# Example method (replace with your own methods) +# Brute Force -- computing s(f(x,y)) def brute_force(x, y): return np.mean(f(x,y)) +# Multi-layer Perceptron +def mlp(x, y): + model = load_model(MLP_MODEL) + return model.predict(np.array([np.hstack([x,y])]))[0][0] + + # Benchmarking tool def benchmark(methods, x, y, f, n_runs=10): results = {} @@ -44,13 +52,15 @@ def benchmark(methods, x, y, f, n_runs=10): # Define methods to benchmark methods = { - 'brute_force': brute_force + 'brute_force': brute_force, + 'multilayer_perceptron': mlp } # Generate toy data n_samples = 1000 x, y = generate_data(n_samples) + # Run the benchmark results = benchmark(methods, x, y, f) print("Benchmark results:") diff --git a/efficient-stats-calc/deep-learn/cnn.py b/efficient-stats-calc/deep-learn/cnn.py new file mode 100644 index 00000000..a93aaeae --- /dev/null +++ b/efficient-stats-calc/deep-learn/cnn.py @@ -0,0 +1,35 @@ +import numpy as np +import tensorflow as tf +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Conv2D, Flatten, Dense +from tensorflow.keras.optimizers import Adam + +# Load your data here +# X_train, y_train, X_val, y_val + +# Ensure your input data is in the correct format: (samples, height, width, channels) +# For grayscale grid data, channels = 1 +# For RGB grid data, channels = 3 + +# Set the dimensions of input and output +input_shape = X_train.shape[1:] +output_dim = y_train.shape[1] + +# Define the CNN model +model = Sequential([ + Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape), + Conv2D(64, kernel_size=(3, 3), activation='relu'), + Flatten(), + Dense(64, activation='relu'), + Dense(output_dim, activation='linear') +]) + +# Compile the model 
+model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mean_absolute_error']) + +# Train the model +history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) + +# Make predictions with the trained model +# X_test: your test data +predictions = model.predict(X_test) diff --git a/efficient-stats-calc/deep-learn/mlp.py b/efficient-stats-calc/deep-learn/mlp.py new file mode 100644 index 00000000..a7086e20 --- /dev/null +++ b/efficient-stats-calc/deep-learn/mlp.py @@ -0,0 +1,50 @@ +import numpy as np +import tensorflow as tf +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense +from tensorflow.keras.optimizers import Adam + +import sys +import os +sys.path.append(os.path.abspath('.')) +import data_gen + +# Load your data here +# X_train, y_train, X_val, y_val + +def generate_data(N, data_gen): + X = [] + y = [] + + for _ in range(N): + data = np.hstack(data_gen.generate_data(1000)) + X.append(data) + y.append(np.mean(data_gen.f(data[:1000], data[1000:]))) + + return np.array(X), np.array(y) + +# Create training and validation data +N_train = 10000 # Number of training datapoints +N_val = 20 # Number of validation datapoints + +X_train, y_train = generate_data(N_train, data_gen) +X_val, y_val = generate_data(N_val, data_gen) + +# Set the dimensions of input and output +input_dim = X_train.shape[1] +output_dim = y_train.shape[0] + +# Define the MLP model +model = Sequential([ + Dense(64, activation='relu', input_shape=(input_dim,)), + Dense(32, activation='relu'), + Dense(output_dim, activation='linear') +]) + +# Compile the model +model.compile(optimizer=Adam(learning_rate=1e-5), loss='mean_squared_error', metrics=['mean_absolute_error']) + +# Train the model +history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) + +model.save("models/mlp.h5") From 44b87feed52c49698fe100f28897ee7749ca853e Mon Sep 17 00:00:00 2001 From: 
Clay Foye Date: Mon, 3 Apr 2023 14:24:54 -0400 Subject: [PATCH 03/14] MAINT: CNN Toy Implement #508 --- efficient-stats-calc/benchmark.py | 27 ++++++++++++++++++-- efficient-stats-calc/deep-learn/cnn.py | 34 +++++++++++++++++++++++--- efficient-stats-calc/deep-learn/mlp.py | 6 ++--- 3 files changed, 58 insertions(+), 9 deletions(-) diff --git a/efficient-stats-calc/benchmark.py b/efficient-stats-calc/benchmark.py index 121a336e..f07d1fc0 100644 --- a/efficient-stats-calc/benchmark.py +++ b/efficient-stats-calc/benchmark.py @@ -5,6 +5,11 @@ MLP_MODEL = "models/mlp.h5" +# Constants +CNN_MODEL = 'models/cnn.h5' +WIDTH = 32 +HEIGHT = 32 + # Brute Force -- computing s(f(x,y)) def brute_force(x, y): return np.mean(f(x,y)) @@ -14,6 +19,23 @@ def mlp(x, y): model = load_model(MLP_MODEL) return model.predict(np.array([np.hstack([x,y])]))[0][0] + +# Convolutional Neural Network +def cnn(x, y): + # Load the trained CNN model + model = load_model(CNN_MODEL) + + # Reshape x and y into the format expected by the CNN + x_data = x.reshape(HEIGHT, WIDTH) + y_data = y.reshape(HEIGHT, WIDTH) + data = np.stack([x_data, y_data], axis=-1) # shape: (height, width, 2) + input_data = np.expand_dims(data, axis=0) # shape: (1, height, width, 2) + + # Perform prediction using the model + prediction = model.predict(input_data)[0][0] + + return prediction + # Benchmarking tool def benchmark(methods, x, y, f, n_runs=10): @@ -53,11 +75,12 @@ def benchmark(methods, x, y, f, n_runs=10): # Define methods to benchmark methods = { 'brute_force': brute_force, - 'multilayer_perceptron': mlp + 'multilayer_perceptron': mlp, + 'convolutional': cnn } # Generate toy data -n_samples = 1000 +n_samples = WIDTH*HEIGHT x, y = generate_data(n_samples) diff --git a/efficient-stats-calc/deep-learn/cnn.py b/efficient-stats-calc/deep-learn/cnn.py index a93aaeae..819efd3f 100644 --- a/efficient-stats-calc/deep-learn/cnn.py +++ b/efficient-stats-calc/deep-learn/cnn.py @@ -4,6 +4,12 @@ from tensorflow.keras.layers import 
Conv2D, Flatten, Dense from tensorflow.keras.optimizers import Adam + +import sys +import os +sys.path.append(os.path.abspath('.')) +import data_gen + # Load your data here # X_train, y_train, X_val, y_val @@ -11,9 +17,31 @@ # For grayscale grid data, channels = 1 # For RGB grid data, channels = 3 +def generate_data_cnn(N, data_gen, width, height): + X = [] + y = [] + + for _ in range(N): + x_data, y_data = data_gen.generate_data(width * height) + x_data = x_data.reshape(height, width) + y_data = y_data.reshape(height, width) + data = np.stack([x_data, y_data], axis=-1) # shape: (height, width, 2) + X.append(data) + y.append(np.mean(data_gen.f(x_data, y_data))) + + return np.array(X), np.array(y) + +# Create training and validation data +N_train = 100 # Number of training datapoints +N_val = 20 # Number of validation datapoints +width, height = 32, 32 # Dimensions of the square images + +X_train, y_train = generate_data_cnn(N_train, data_gen, width, height) +X_val, y_val = generate_data_cnn(N_val, data_gen, width, height) + # Set the dimensions of input and output input_shape = X_train.shape[1:] -output_dim = y_train.shape[1] +output_dim = y_train.shape[0] # Define the CNN model model = Sequential([ @@ -30,6 +58,4 @@ # Train the model history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) -# Make predictions with the trained model -# X_test: your test data -predictions = model.predict(X_test) +model.save("models/cnn.h5") \ No newline at end of file diff --git a/efficient-stats-calc/deep-learn/mlp.py b/efficient-stats-calc/deep-learn/mlp.py index a7086e20..db3d9c78 100644 --- a/efficient-stats-calc/deep-learn/mlp.py +++ b/efficient-stats-calc/deep-learn/mlp.py @@ -17,14 +17,14 @@ def generate_data(N, data_gen): y = [] for _ in range(N): - data = np.hstack(data_gen.generate_data(1000)) + data = np.hstack(data_gen.generate_data(1024)) X.append(data) - y.append(np.mean(data_gen.f(data[:1000], data[1000:]))) + 
y.append(np.mean(data_gen.f(data[:1024], data[1024:]))) return np.array(X), np.array(y) # Create training and validation data -N_train = 10000 # Number of training datapoints +N_train = 1000 # Number of training datapoints N_val = 20 # Number of validation datapoints X_train, y_train = generate_data(N_train, data_gen) From 212f74b2712bc030af40864fcd10b0744ea7e571 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Mon, 3 Apr 2023 16:47:25 -0400 Subject: [PATCH 04/14] FIX: MLP learns PDFs --- efficient-stats-calc/deep-learn/mlp.py | 60 +++++++++++++++++++++----- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/efficient-stats-calc/deep-learn/mlp.py b/efficient-stats-calc/deep-learn/mlp.py index db3d9c78..fce234d1 100644 --- a/efficient-stats-calc/deep-learn/mlp.py +++ b/efficient-stats-calc/deep-learn/mlp.py @@ -9,30 +9,70 @@ sys.path.append(os.path.abspath('.')) import data_gen +from scipy.stats import gaussian_kde + + # Load your data here # X_train, y_train, X_val, y_val -def generate_data(N, data_gen): +# def generate_data(N, data_gen): +# X = [] +# y = [] + +# for _ in range(N): +# data = np.hstack(data_gen.generate_data(1024)) +# X.append(data) +# y.append(np.mean(data_gen.f(data[:1024], data[1024:]))) + +# return np.array(X), np.array(y) + + +# # Create training and validation data +# N_train = 1000 # Number of training datapoints +# N_val = 20 # Number of validation datapoints + +# X_train, y_train = generate_data(N_train, data_gen) +# X_val, y_val = generate_data(N_val, data_gen) + + +def generate_pdf_data(N, data_gen, num_points=1024): X = [] y = [] for _ in range(N): - data = np.hstack(data_gen.generate_data(1024)) - X.append(data) - y.append(np.mean(data_gen.f(data[:1024], data[1024:]))) + data = np.hstack(data_gen.generate_data(num_points)) + x_data = data[:num_points] + y_data = data[num_points:] + + # Estimate the PDFs of x and y using KDE + kde_x = gaussian_kde(x_data) + kde_y = gaussian_kde(y_data) + + # Compute f(x, y) and estimate its PDF 
using KDE + f_data = data_gen.f(x_data, y_data) + kde_f = gaussian_kde(f_data) + + # Stack the PDFs of x, y, and f(x, y) as input features and target + x_pdf = kde_x(x_data) + y_pdf = kde_y(y_data) + f_pdf = kde_f(f_data) + + X.append(np.hstack([x_pdf, y_pdf])) + y.append([f_pdf]) + return np.array(X), np.array(y) -# Create training and validation data -N_train = 1000 # Number of training datapoints -N_val = 20 # Number of validation datapoints +# Example usage: +N = 5000 # Number of training datapoints +val = 100 +X_train, y_train = generate_pdf_data(N, data_gen) +X_val, y_val = generate_pdf_data(val, data_gen) -X_train, y_train = generate_data(N_train, data_gen) -X_val, y_val = generate_data(N_val, data_gen) # Set the dimensions of input and output input_dim = X_train.shape[1] -output_dim = y_train.shape[0] +output_dim = y_train.shape[1] # Define the MLP model model = Sequential([ From 9e8b50ce196b5e0406d57437f2ab66c0b896f51b Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Mon, 3 Apr 2023 16:54:15 -0400 Subject: [PATCH 05/14] MAINT: Learn CDF's --- efficient-stats-calc/deep-learn/mlp.py | 65 +++++++++++++++----------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/efficient-stats-calc/deep-learn/mlp.py b/efficient-stats-calc/deep-learn/mlp.py index fce234d1..687fb4d5 100644 --- a/efficient-stats-calc/deep-learn/mlp.py +++ b/efficient-stats-calc/deep-learn/mlp.py @@ -10,30 +10,9 @@ import data_gen from scipy.stats import gaussian_kde +from scipy.integrate import cumtrapz -# Load your data here -# X_train, y_train, X_val, y_val - -# def generate_data(N, data_gen): -# X = [] -# y = [] - -# for _ in range(N): -# data = np.hstack(data_gen.generate_data(1024)) -# X.append(data) -# y.append(np.mean(data_gen.f(data[:1024], data[1024:]))) - -# return np.array(X), np.array(y) - - -# # Create training and validation data -# N_train = 1000 # Number of training datapoints -# N_val = 20 # Number of validation datapoints - -# X_train, y_train = 
generate_data(N_train, data_gen) -# X_val, y_val = generate_data(N_val, data_gen) - def generate_pdf_data(N, data_gen, num_points=1024): X = [] @@ -63,11 +42,43 @@ def generate_pdf_data(N, data_gen, num_points=1024): return np.array(X), np.array(y) +def generate_cdf_data(N, data_gen, num_points=1024): + X = [] + y = [] + + for _ in range(N): + data = np.hstack(data_gen.generate_data(num_points)) + x_data = data[:num_points] + y_data = data[num_points:] + + # Estimate the PDFs of x and y using KDE + kde_x = gaussian_kde(x_data) + kde_y = gaussian_kde(y_data) + + # Compute f(x, y) and estimate its PDF using KDE + f_data = data_gen.f(x_data, y_data) + kde_f = gaussian_kde(f_data) + + # Compute the CDFs of x, y, and f(x, y) using the PDFs + x_cdf = cumtrapz(kde_x(x_data), x_data, initial=0) + y_cdf = cumtrapz(kde_y(y_data), y_data, initial=0) + f_cdf = cumtrapz(kde_f(f_data), f_data, initial=0) + + # Normalize the CDFs + x_cdf /= x_cdf[-1] + y_cdf /= y_cdf[-1] + f_cdf /= f_cdf[-1] + + X.append(np.hstack([x_cdf, y_cdf])) + y.append([f_cdf]) + + return np.array(X), np.array(y) + # Example usage: -N = 5000 # Number of training datapoints -val = 100 -X_train, y_train = generate_pdf_data(N, data_gen) -X_val, y_val = generate_pdf_data(val, data_gen) +N = 1000 # Number of training datapoints +val = 20 +X_train, y_train = generate_cdf_data(N, data_gen) +X_val, y_val = generate_cdf_data(val, data_gen) # Set the dimensions of input and output @@ -85,6 +96,6 @@ def generate_pdf_data(N, data_gen, num_points=1024): model.compile(optimizer=Adam(learning_rate=1e-5), loss='mean_squared_error', metrics=['mean_absolute_error']) # Train the model -history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) +history = model.fit(X_train, y_train, batch_size=8, epochs=100, validation_data=(X_val, y_val)) model.save("models/mlp.h5") From 20417e96a871412c86e4ac9b903ffdafae40cc52 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Tue, 4 Apr 2023 13:54:31 -0400 
Subject: [PATCH 06/14] FIX: Models now learn CDFs #508 --- .../benchmarks/benchmark_cdf.py | 110 ++++++++++++ .../benchmark_mean.py} | 0 efficient-stats-calc/data_gen.py | 156 +++++++++++++++++- efficient-stats-calc/deep-learn/cnn.py | 56 +++---- efficient-stats-calc/deep-learn/mlp.py | 88 ++-------- .../sparse-grid/sparse_grid_approx.py | 71 ++++++++ 6 files changed, 367 insertions(+), 114 deletions(-) create mode 100644 efficient-stats-calc/benchmarks/benchmark_cdf.py rename efficient-stats-calc/{benchmark.py => benchmarks/benchmark_mean.py} (100%) create mode 100644 efficient-stats-calc/sparse-grid/sparse_grid_approx.py diff --git a/efficient-stats-calc/benchmarks/benchmark_cdf.py b/efficient-stats-calc/benchmarks/benchmark_cdf.py new file mode 100644 index 00000000..5eeaab93 --- /dev/null +++ b/efficient-stats-calc/benchmarks/benchmark_cdf.py @@ -0,0 +1,110 @@ +import time +import numpy as np + +import sys +import os +sys.path.append(os.path.abspath('.')) +from data_gen import * +import data_gen +from tensorflow.keras.models import load_model +from scipy.stats import gaussian_kde +from scipy.integrate import cumtrapz + +import time + +MLP_MODEL = "models/mlp.h5" + +# Constants +CNN_MODEL = 'models/cnn_1d.h5' +N_SAMPLES = 1024 + +# Brute Force -- computing s(f(x,y)) +def brute_force(x_cdf, y_cdf): + return NotImplementedError + + +# Multi-layer Perceptron +def mlp(x_cdf, y_cdf): + model = load_model(MLP_MODEL) + print(x_cdf) + print(model.predict(np.array([np.hstack([x_cdf,y_cdf])]))[0]) + return model.predict(np.array([np.hstack([x_cdf,y_cdf])]))[0] + + +# Convolutional Neural Network +def cnn(x_cdf, y_cdf): + # Load the trained CNN model + model = load_model(CNN_MODEL) + + cdf = np.array([np.hstack([x_cdf,y_cdf])]) + cdf = cdf.reshape(-1, cdf.shape[1], 1) + + + # Perform prediction using the model + prediction = model.predict(cdf)[0] + + print(prediction) + + return prediction + +def benchmark(methods, data_gen, f, n_runs=1, n_sets=10): + results = {} + + def 
calculate_cdf(data): + sorted_data = np.sort(data) + probabilities = np.linspace(0, 1, len(sorted_data)) + return sorted_data, probabilities + + for name, method in methods.items(): + # Initialize variables to store execution time and errors + exec_times = [] + errors = [] + + for _ in range(n_runs): + set_errors = [] + for i in range(n_sets): + x, y = data_gen.generate_data(N_SAMPLES) + c_true = f(x, y) + + # Calculate the CDFs of x, y, and f(x, y) + x_sorted, x_cdf = calculate_cdf(x) + y_sorted, y_cdf = calculate_cdf(y) + c_sorted, c_cdf_true = calculate_cdf(c_true) + + # Measure execution time + start_time = time.time() + c_cdf_approx = method(x_cdf, y_cdf) + exec_time = time.time() - start_time + + # Calculate the error + error = np.mean(np.abs(c_cdf_approx - c_cdf_true)) + set_errors.append(error) + + exec_times.append(exec_time) + errors.append(np.mean(set_errors)) + + # Calculate average execution time and error + avg_exec_time = np.mean(exec_times) + avg_error = np.mean(errors) + + results[name] = { + 'average_execution_time': avg_exec_time, + 'average_error': avg_error + } + + return results + + +# Define methods to benchmark +methods = { + 'mlp': mlp, + 'cnn': cnn +} + +# Run the benchmark +results = benchmark(methods, data_gen, f, n_runs=1, n_sets=1) +print("Benchmark results:") +for method, result in results.items(): + print(f"{method}:") + print(f" Average execution time: {result['average_execution_time']:.6f} seconds") + print(f" Average error: {result['average_error']:.6f}") diff --git a/efficient-stats-calc/benchmark.py b/efficient-stats-calc/benchmarks/benchmark_mean.py similarity index 100% rename from efficient-stats-calc/benchmark.py rename to efficient-stats-calc/benchmarks/benchmark_mean.py diff --git a/efficient-stats-calc/data_gen.py b/efficient-stats-calc/data_gen.py index b15fbbfb..9ff51713 100644 --- a/efficient-stats-calc/data_gen.py +++ b/efficient-stats-calc/data_gen.py @@ -1,15 +1,20 @@ import numpy as np +import concurrent.futures 
+from scipy.stats import gaussian_kde +from scipy.integrate import cumtrapz # Non-linear n-dimensional function f(x, y) def f(x, y): return np.sin(x) * np.cos(y) -# Generate random x and y data -def generate_data(n_samples): +def generate_data(n_samples, seed=None): + + if seed is not None: + np.random.seed(seed) + x = np.random.uniform(-np.pi, np.pi, n_samples) y = np.random.uniform(-np.pi, np.pi, n_samples) return x, y - # Calculate statistics s(x), s(y) and s(c) def calculate_statistics(x, y, func): s_x = np.mean(x) @@ -27,8 +32,143 @@ def create_toy_data(n_samples, func): """ Example Usage """ -n_samples = 1000 -x, y, s_x, s_y, s_c = create_toy_data(n_samples, f) -print("s(x):", s_x) -print("s(y):", s_y) -print("s(c):", s_c) +# n_samples = 1000 +# x, y, s_x, s_y, s_c = create_toy_data(n_samples, f) +# print("s(x):", s_x) +# print("s(y):", s_y) +# print("s(c):", s_c) + + + + +def generate_data_cnn(N, data_gen, width, height): + X = [] + y = [] + + for _ in range(N): + x_data, y_data = data_gen.generate_data(width * height) + x_data = x_data.reshape(height, width) + y_data = y_data.reshape(height, width) + data = np.stack([x_data, y_data], axis=-1) # shape: (height, width, 2) + X.append(data) + y.append(np.mean(data_gen.f(x_data, y_data))) + + return np.array(X), np.array(y) + +def generate_pdf_data(N, data_gen, num_points=1024): + X = [] + y = [] + + for _ in range(N): + data = np.hstack(data_gen.generate_data(num_points)) + x_data = data[:num_points] + y_data = data[num_points:] + + # Estimate the PDFs of x and y using KDE + kde_x = gaussian_kde(x_data) + kde_y = gaussian_kde(y_data) + + # Compute f(x, y) and estimate its PDF using KDE + f_data = data_gen.f(x_data, y_data) + kde_f = gaussian_kde(f_data) + + # Stack the PDFs of x, y, and f(x, y) as input features and target + x_pdf = kde_x(x_data) + y_pdf = kde_y(y_data) + f_pdf = kde_f(f_data) + + X.append(np.hstack([x_pdf, y_pdf])) + y.append([f_pdf]) + + + return np.array(X), np.array(y) + +def 
generate_cdf_data(N, data_gen, num_points=1024, grid_size=100): + X = [] + y = [] + + for _ in range(N): + data = np.hstack(data_gen.generate_data(num_points)) + x_data = data[:num_points] + y_data = data[num_points:] + + # Create a lower resolution grid + x_grid = np.linspace(x_data.min(), x_data.max(), grid_size) + y_grid = np.linspace(y_data.min(), y_data.max(), grid_size) + + # Estimate the PDFs of x and y using KDE + # Example: Use Silverman's rule of thumb + kde_x = gaussian_kde(x_data, bw_method='silverman') + kde_y = gaussian_kde(y_data, bw_method='silverman') + + # Compute f(x, y) and estimate its PDF using KDE + f_data = data_gen.f(x_data, y_data) + kde_f = gaussian_kde(f_data, bw_method='silverman') + + # Compute the CDFs of x, y, and f(x, y) using the PDFs on the grid + x_cdf = cumtrapz(kde_x(x_grid), x_grid, initial=0) + y_cdf = cumtrapz(kde_y(y_grid), y_grid, initial=0) + f_cdf = cumtrapz(kde_f(f_data), f_data, initial=0) + + # Normalize the CDFs + x_cdf /= x_cdf[-1] + y_cdf /= y_cdf[-1] + f_cdf /= f_cdf[-1] + + X.append(np.hstack([x_cdf, y_cdf])) + y.append([f_cdf]) + + return np.array(X), np.array(y) + +def generate_cdf_data_parallel(N, data_gen, num_workers=8, num_points=1024): + # Define a helper function to process each data point + def process_datapoint(data_gen): + data = np.hstack(data_gen.generate_data(num_points)) + x_data = data[:num_points] + y_data = data[num_points:] + + # Calculate the CDFs of x, y, and f(x, y) directly from the data points + x_sorted, x_cdf = calculate_cdf(x_data) + y_sorted, y_cdf = calculate_cdf(y_data) + f_data = data_gen.f(x_data, y_data) + f_sorted, f_cdf = calculate_cdf(f_data) + + return np.hstack([x_cdf, y_cdf]), f_cdf + + def calculate_cdf(data): + sorted_data = np.sort(data) + probabilities = np.linspace(0, 1, len(sorted_data)) + return sorted_data, probabilities + + X = [] + y = [] + + with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: + # Parallelize the generation of data 
points + futures = [executor.submit(process_datapoint, data_gen) for _ in range(N)] + + for future in futures: + x_cdf_y_cdf, f_cdf = future.result() + X.append(x_cdf_y_cdf) + y.append([f_cdf]) + + return np.array(X), np.array(y) + +def save_cdf_data(X, y, X_filename, y_filename): + np.save(X_filename, X) + np.save(y_filename, y) + +def load_cdf_data(X_filename, y_filename): + X = np.load(X_filename) + y = np.load(y_filename) + return X, y + +N = 10000 +import data_gen + +X_train, y_train = generate_cdf_data_parallel(N, data_gen, num_workers=8) +save_cdf_data(X_train, y_train, "data/X_train.npy", "data/y_train.npy") + + +X_val, y_val = generate_cdf_data_parallel(200, data_gen, num_workers=8) +save_cdf_data(X_train, y_train, "data/X_val.npy", "data/y_val.npy") \ No newline at end of file diff --git a/efficient-stats-calc/deep-learn/cnn.py b/efficient-stats-calc/deep-learn/cnn.py index 819efd3f..6baf57f4 100644 --- a/efficient-stats-calc/deep-learn/cnn.py +++ b/efficient-stats-calc/deep-learn/cnn.py @@ -1,52 +1,38 @@ import numpy as np import tensorflow as tf from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Conv2D, Flatten, Dense +from tensorflow.keras.layers import Conv1D, Conv2D, Flatten, Dense from tensorflow.keras.optimizers import Adam +import concurrent.futures + import sys import os sys.path.append(os.path.abspath('.')) -import data_gen - -# Load your data here -# X_train, y_train, X_val, y_val - -# Ensure your input data is in the correct format: (samples, height, width, channels) -# For grayscale grid data, channels = 1 -# For RGB grid data, channels = 3 - -def generate_data_cnn(N, data_gen, width, height): - X = [] - y = [] +sys.path.append(os.path.abspath('./deep-learn')) +from data_gen import load_cdf_data - for _ in range(N): - x_data, y_data = data_gen.generate_data(width * height) - x_data = x_data.reshape(height, width) - y_data = y_data.reshape(height, width) - data = np.stack([x_data, y_data], axis=-1) # shape: 
(height, width, 2) - X.append(data) - y.append(np.mean(data_gen.f(x_data, y_data))) - return np.array(X), np.array(y) +X_train, y_train = load_cdf_data("data/X_train.npy", "data/y_train.npy") +X_val, y_val = load_cdf_data("data/X_val.npy", "data/y_val.npy") -# Create training and validation data -N_train = 100 # Number of training datapoints -N_val = 20 # Number of validation datapoints -width, height = 32, 32 # Dimensions of the square images +input_dim = X_train.shape[1] +output_dim=y_train.shape[2] -X_train, y_train = generate_data_cnn(N_train, data_gen, width, height) -X_val, y_val = generate_data_cnn(N_val, data_gen, width, height) +print(input_dim) +print(output_dim) +# Update input_shape for the 1D CNN model +input_shape = (input_dim, 1) -# Set the dimensions of input and output -input_shape = X_train.shape[1:] -output_dim = y_train.shape[0] +# Reshape the input data +X_train_cnn = X_train.reshape(-1, input_dim, 1) +X_val_cnn = X_val.reshape(-1, input_dim, 1) -# Define the CNN model +# Define the 1D CNN model model = Sequential([ - Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape), - Conv2D(64, kernel_size=(3, 3), activation='relu'), + Conv1D(32, kernel_size=3, activation='relu', input_shape=input_shape), + Conv1D(64, kernel_size=3, activation='relu'), Flatten(), Dense(64, activation='relu'), Dense(output_dim, activation='linear') @@ -56,6 +42,6 @@ def generate_data_cnn(N, data_gen, width, height): model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mean_absolute_error']) # Train the model -history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) +history = model.fit(X_train_cnn, y_train, batch_size=32, epochs=10, validation_data=(X_val_cnn, y_val)) -model.save("models/cnn.h5") \ No newline at end of file +model.save("models/cnn_1d.h5") \ No newline at end of file diff --git a/efficient-stats-calc/deep-learn/mlp.py b/efficient-stats-calc/deep-learn/mlp.py 
index 687fb4d5..8559181e 100644 --- a/efficient-stats-calc/deep-learn/mlp.py +++ b/efficient-stats-calc/deep-learn/mlp.py @@ -8,82 +8,25 @@ import os sys.path.append(os.path.abspath('.')) import data_gen +from data_gen import load_cdf_data -from scipy.stats import gaussian_kde -from scipy.integrate import cumtrapz - - - -def generate_pdf_data(N, data_gen, num_points=1024): - X = [] - y = [] - - for _ in range(N): - data = np.hstack(data_gen.generate_data(num_points)) - x_data = data[:num_points] - y_data = data[num_points:] - - # Estimate the PDFs of x and y using KDE - kde_x = gaussian_kde(x_data) - kde_y = gaussian_kde(y_data) - - # Compute f(x, y) and estimate its PDF using KDE - f_data = data_gen.f(x_data, y_data) - kde_f = gaussian_kde(f_data) - - # Stack the PDFs of x, y, and f(x, y) as input features and target - x_pdf = kde_x(x_data) - y_pdf = kde_y(y_data) - f_pdf = kde_f(f_data) - - X.append(np.hstack([x_pdf, y_pdf])) - y.append([f_pdf]) - - - return np.array(X), np.array(y) - -def generate_cdf_data(N, data_gen, num_points=1024): - X = [] - y = [] - - for _ in range(N): - data = np.hstack(data_gen.generate_data(num_points)) - x_data = data[:num_points] - y_data = data[num_points:] - - # Estimate the PDFs of x and y using KDE - kde_x = gaussian_kde(x_data) - kde_y = gaussian_kde(y_data) - - # Compute f(x, y) and estimate its PDF using KDE - f_data = data_gen.f(x_data, y_data) - kde_f = gaussian_kde(f_data) - - # Compute the CDFs of x, y, and f(x, y) using the PDFs - x_cdf = cumtrapz(kde_x(x_data), x_data, initial=0) - y_cdf = cumtrapz(kde_y(y_data), y_data, initial=0) - f_cdf = cumtrapz(kde_f(f_data), f_data, initial=0) - - # Normalize the CDFs - x_cdf /= x_cdf[-1] - y_cdf /= y_cdf[-1] - f_cdf /= f_cdf[-1] - - X.append(np.hstack([x_cdf, y_cdf])) - y.append([f_cdf]) - - return np.array(X), np.array(y) # Example usage: -N = 1000 # Number of training datapoints -val = 20 -X_train, y_train = generate_cdf_data(N, data_gen) -X_val, y_val = generate_cdf_data(val, 
data_gen) +X_train, y_train = load_cdf_data("data/X_train.npy", "data/y_train.npy") +X_val, y_val = load_cdf_data("data/X_val.npy", "data/y_val.npy") + +print("X_train shape:", X_train.shape) +print("y_train shape:", y_train.shape) +print("X_val shape:", X_val.shape) +print("y_val shape:", y_val.shape) # Set the dimensions of input and output input_dim = X_train.shape[1] -output_dim = y_train.shape[1] +output_dim = y_train.shape[2] + +print("Input Dim", input_dim) +print("Output Dim:", output_dim ) # Define the MLP model model = Sequential([ @@ -93,9 +36,12 @@ def generate_cdf_data(N, data_gen, num_points=1024): ]) # Compile the model -model.compile(optimizer=Adam(learning_rate=1e-5), loss='mean_squared_error', metrics=['mean_absolute_error']) +model.compile(optimizer=Adam(learning_rate=1e-5), loss=tf.keras.losses.Huber(delta=1.0), metrics=['mean_absolute_error']) # Train the model -history = model.fit(X_train, y_train, batch_size=8, epochs=100, validation_data=(X_val, y_val)) +history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) + +print(model.predict(np.expand_dims(X_train[0], axis=0))) +print(y_train[0]) model.save("models/mlp.h5") diff --git a/efficient-stats-calc/sparse-grid/sparse_grid_approx.py b/efficient-stats-calc/sparse-grid/sparse_grid_approx.py new file mode 100644 index 00000000..296b5c68 --- /dev/null +++ b/efficient-stats-calc/sparse-grid/sparse_grid_approx.py @@ -0,0 +1,71 @@ +import numpy as np +from scipy import interpolate + +# Step 1: Construct a sparse grid of sample points +# Define the number of dimensions +n_dim = 2 + +# Define the sparse grid +grid = ??? # Choose a sparse grid method + +# Define the number of sample points +n_points = ??? # Choose the number of sample points + +# Generate the sample points +sample_points = grid.generate_points(n_points) + +# Step 2: Interpolate the joint probability distribution +# Evaluate the PDFs at the sample points +pdf_x = ??? 
# Evaluate the PDF P(x) at the sample points +pdf_y = ??? # Evaluate the PDF P(y) at the sample points + +# Construct a meshgrid of the sample points +X, Y = np.meshgrid(sample_points[:, 0], sample_points[:, 1]) + +# Interpolate the joint probability distribution +joint_pdf = interpolate.interp2d(X, Y, pdf_x * pdf_y) + +# Step 3: Compute the PDF of the function output +# Define the function F(P(x), P(y)) +def F(p_x, p_y): + return ??? # Define the function F + +# Evaluate the function F at the sample points +f_values = F(pdf_x, pdf_y) + +# Compute the PDF of the function output +output_pdf = interpolate.interp1d(f_values, joint_pdf(X, Y)) + +# Step 4: Find the value c of the PDF of the function output that we want to approximate +c = ??? # Choose a value of the PDF of the function output that we want to approximate + +# Step 5: Find the set of input values (P(x), P(y)) that satisfy the equation F(P(x), P(y)) = PDF(c) +# Define the equation F(P(x), P(y)) = PDF(c) in terms of P(y) +def equation_p_y(p_x, p_y): + return output_pdf(F(p_x, p_y)) - c + +# Use a numerical solver to find the set of input values that satisfy the equation +input_values = ??? # Choose a numerical solver to find the input values + +# Step 6: Construct a sparse grid of sample points in the input space of the function F(P(x), P(y)) +# Define the sparse grid +grid_f = ??? # Choose a sparse grid method + +# Define the number of sample points +n_points_f = ??? 
# Choose the number of sample points + +# Generate the sample points +sample_points_f = grid_f.generate_points(n_points_f) + +# Step 7: Interpolate the function values at unsampled points +# Evaluate the function F at the sample points +f_values_f = F(sample_points_f[:, 0], sample_points_f[:, 1]) + +# Interpolate the function values +interpolated_f = interpolate.interp1d(f_values_f, sample_points_f) + +# Step 8: Compute the map F such that F(P(x), P(y)) = PDF(c) +# Use the interpolated function to compute the map +def map_F(p_x, p_y): + return interpolated_f(output_pdf(F(p_x, p_y))) + From c81fe7ee6450eb41342d506fcdecb166096a2752 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Wed, 5 Apr 2023 09:38:08 -0400 Subject: [PATCH 07/14] MAINT: fix imports --- efficient-stats-calc/benchmarks/benchmark_cdf.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/efficient-stats-calc/benchmarks/benchmark_cdf.py b/efficient-stats-calc/benchmarks/benchmark_cdf.py index 5eeaab93..c288974d 100644 --- a/efficient-stats-calc/benchmarks/benchmark_cdf.py +++ b/efficient-stats-calc/benchmarks/benchmark_cdf.py @@ -7,8 +7,7 @@ from data_gen import * import data_gen from tensorflow.keras.models import load_model -from scipy.stats import gaussian_kde -from scipy.integrate import cumtrapz + import time @@ -108,3 +107,5 @@ def calculate_cdf(data): print(f"{method}:") print(f" Average execution time: {result['average_execution_time']:.6f} seconds") print(f" Average error: {result['average_error']:.6f}") + + From 074e1c2231f7446848793144750a0b033d834183 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Wed, 5 Apr 2023 09:39:04 -0400 Subject: [PATCH 08/14] DOC: Document Changes --- efficient-stats-calc/datalib/real_data.md | 30 +++++++++++++++++++++++ efficient-stats-calc/deep-learn/mlp.py | 1 - 2 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 efficient-stats-calc/datalib/real_data.md diff --git a/efficient-stats-calc/datalib/real_data.md 
b/efficient-stats-calc/datalib/real_data.md new file mode 100644 index 00000000..fbe5fb76 --- /dev/null +++ b/efficient-stats-calc/datalib/real_data.md @@ -0,0 +1,30 @@ +# Real Data + +Need: +- Soil Density (geowatch) +LAYERS=algorithm.standard_mobility.soil_density.SoilDensityComposite +- Soil Type (gw) +- Soil Texture (gw)? +LAYERS=datalib.soil.texture.USCSTextureComposite +- Soil Temperature (gw) +LAYERS=datalib.weather.erdc_lis72.SoilTemperature0to10 +- Soil Moisture (soilmap) + + +## Example + +``` +c=podpac.Coordinates([podpac.clinspace([43.700, 43.704, 128, 'lat'], podpac.clinspace([-72.305, -72.301, 128, 'lon'], '2023-01-01', ['lat', 'lon', 'time']) + +sm = soilmap.datalib.geowatch.SoilMoisture() +sm.eval(c) + +podpac.settings["username-PW@geowatch_server"] = {"username": "creare", "password": " Date: Thu, 6 Apr 2023 08:27:11 -0400 Subject: [PATCH 09/14] Maint: PDFs of SM --- efficient-stats-calc/datalib/real_data.md | 19 ++++ efficient-stats-calc/datalib/soil_mositure.py | 92 +++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 efficient-stats-calc/datalib/soil_mositure.py diff --git a/efficient-stats-calc/datalib/real_data.md b/efficient-stats-calc/datalib/real_data.md index fbe5fb76..d5e38604 100644 --- a/efficient-stats-calc/datalib/real_data.md +++ b/efficient-stats-calc/datalib/real_data.md @@ -28,3 +28,22 @@ podpac.settings.save() 2. Fix Caching 3. Test on small region + + +1. Pull multiple year soilmoisture coarse resolution for 3x3 pixels +only do work in the middle pixel +SoilMoisture.solmst0_to_10 + +c=podpac.Coordinates([podpac.clinspace(43.09, 42.91, 3, 'lat'), podpac.clinspace(-73.155, -72.91, 3, 'lon'), '2023-01-01'], ['lat', 'lon', 'time']) +- correct resolution? +2. Cache it + +3. Feed + + +## TODO PT 2 +1. Get weatherscale resolution data for every day of a month on a single pixel +2. Get construct a CDF as expected output +3. Get fine resolution data for that pixel at the fine resolution for every day of a month +4. 
import numpy as np
from scipy import stats  # moved to top: was imported mid-file, after its users

import soilmap
import podpac
import datetime


def calculate_cdf(data):
    """Return empirical-CDF probabilities (0..1), one per sample of `data`.

    Only the probability axis is returned; pair with ``np.sort(data)``
    for the corresponding value axis.
    """
    # Evenly spaced probabilities; only len(data) matters, so sorting
    # (which the old version did and then discarded) is unnecessary.
    return np.linspace(0, 1, len(data))


def calculate_pdf(data, days):
    """Estimate the PDF of `data` via Gaussian KDE.

    Parameters
    ----------
    data : array-like
        1-D sample values (must contain at least two distinct points
        for ``gaussian_kde``).
    days : int
        Number of evenly spaced evaluation points spanning the data range.

    Returns
    -------
    np.ndarray of shape (days,) with the KDE evaluated on the grid.
    """
    kde = stats.gaussian_kde(data)
    x_values = np.linspace(min(data), max(data), days)
    return kde.evaluate(x_values)


def get_soil_moisture_pdfs(years, month, lat_clinspace, lon_clinspace, days):
    """Build one soil-moisture PDF per year from daily center-pixel data.

    For each year, daily soil moisture for the given month is pulled one
    day at a time, NaNs are dropped, and a KDE-based PDF is estimated.
    Years with no finite data are skipped, so the first axis of the
    result may be shorter than ``len(years)``.
    """
    sm = soilmap.datalib.geowatch.SoilMoisture(cache_ctrl=['ram']).solmst_0_10
    pdfs = []

    for year in years:
        # Dates for the month up to and including day `days`.
        # BUGFIX: was range(1, days - 1), which stopped two days short of
        # the documented "up to and including the 28th day".
        dates = [f"{year}-{month:02d}-{day:02d}" for day in range(1, days + 1)]

        soil_moisture_data = []

        # Request the data one day at a time.
        for date in dates:
            current_c = podpac.Coordinates(
                [lat_clinspace, lon_clinspace, date], ['lat', 'lon', 'time'])
            o = sm.eval(current_c)
            # Keep only the center pixel of the 3x3 request
            # (assumes a 3x3 lat/lon grid with time last -- TODO confirm).
            soil_moisture_data.append(o.values[1][1][0])

        # Sanitize for NaNs before estimating the PDF.
        soil_moisture_data = np.array(soil_moisture_data)
        soil_moisture_data = soil_moisture_data[~np.isnan(soil_moisture_data)]

        if len(soil_moisture_data) > 0:
            pdfs.append(calculate_pdf(soil_moisture_data, days))

    return np.array(pdfs)


def get_soil_moisture_levels(years, month, lat_clinspace, lon_clinspace, days):
    """Return raw daily center-pixel soil-moisture values, one list per year.

    NaN values are NOT filtered here (that is the caller's job); only
    evaluations that return nothing at all are skipped.
    """
    sm = soilmap.datalib.geowatch.SoilMoisture().solmst_0_10
    soil_moisture_data_all_years = []

    for year in years:
        # Same off-by-one fix as in get_soil_moisture_pdfs.
        dates = [f"{year}-{month:02d}-{day:02d}" for day in range(1, days + 1)]

        soil_moisture_data = []

        for date in dates:
            current_c = podpac.Coordinates(
                [lat_clinspace, lon_clinspace, date], ['lat', 'lon', 'time'])
            o = sm.eval(current_c)
            # BUGFIX: the old guard `if o is np.nan:` compared an xarray
            # result to the float NaN singleton by identity and was
            # therefore always False (dead code). Skip missing results.
            if o is None:
                continue
            soil_moisture_data.append(o.values[1][1][0])

        soil_moisture_data_all_years.append(soil_moisture_data)

    return np.array(soil_moisture_data_all_years)


if __name__ == '__main__':
    # Guarded so that importing this module no longer fires network
    # requests and overwrites the .npy cache as a side effect.
    month = 6
    years = range(2012, 2023)
    lat_clinspace = podpac.clinspace(43.09, 42.91, 3, 'lat')
    lon_clinspace = podpac.clinspace(-73.155, -72.91, 3, 'lon')
    soil_moisture_data = get_soil_moisture_pdfs(
        years, month, lat_clinspace, lon_clinspace, 28)
    print(soil_moisture_data)
    podpac.utils.clear_cache(mode="all")
    np.save("datalib/data/june_soil_moisture_pdfs.npy", soil_moisture_data)
import podpac
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter
import random
from scipy.stats import gaussian_kde
import xarray as xr

'''
Toy Data Creation
'''

def generate_sm_data(N, random_seed=None, scale=1, sigma=3):
    """Generate an N x N smoothed random field used as 'mean soil moisture'.

    Parameters
    ----------
    N : int
        Grid side length.
    random_seed : int or None
        Seeds NumPy's global RNG when given, for reproducibility.
    scale : float
        Multiplier applied to the uniform noise before smoothing.
    sigma : float
        Gaussian-filter smoothing width.
    """
    if random_seed is not None:
        np.random.seed(random_seed)

    elevation_data = np.random.rand(N, N) * scale
    elevation_data = gaussian_filter(elevation_data, sigma=sigma)

    return elevation_data

def year_long_sm(mean_sm_data, monthly_coefficients, coefficient_std_dev=0.05):
    """Return 12 monthly grids: the mean field scaled by noisy coefficients.

    Each month's coefficient is drawn per-pixel from a normal distribution
    centered on that month's entry of `monthly_coefficients`.
    """
    year_long_sm_data = []

    for month_coefficient in monthly_coefficients:
        month_coefficient_array = np.random.normal(
            loc=month_coefficient, scale=coefficient_std_dev,
            size=mean_sm_data.shape)
        month_sm_data = mean_sm_data * month_coefficient_array
        year_long_sm_data.append(month_sm_data)

    return year_long_sm_data

def decade_long_sm(mean_sm_data, monthly_coefficients, years=10, coefficient_std_dev=0.05):
    """Return `years` * 12 monthly grids by repeating year_long_sm."""
    decade_long_sm_data = []

    for _ in range(years):
        year_long_data = year_long_sm(
            mean_sm_data, monthly_coefficients, coefficient_std_dev)
        decade_long_sm_data.extend(year_long_data)

    return decade_long_sm_data


def create_coordinates(N, YEARS):
    """Build podpac (time, lat, lon) coordinates for the fine toy grid."""
    fine_lat = np.linspace(-10, 10, N)   # Fine Data
    fine_lon = np.linspace(-10, 10, N)   # Coarse Data

    # One time step per month over YEARS years.
    time = np.linspace(0, (12 * YEARS) - 1, 12 * YEARS)
    coords = podpac.Coordinates(
        [time, fine_lat, fine_lon], ['time', 'lat', 'lon'])  # Fine coords

    return coords


"""
Data Modification/Utilities
"""

def g(data):
    """Sigmoid: the nonlinear 'expensive' function applied to the data."""
    return 1 / (1 + np.exp(-data))

def create_pdf(all_data, N_bins):
    """Per-pixel histograms (density-normalized) over the time axis.

    Parameters
    ----------
    all_data : np.ndarray, shape (time, rows, cols)
    N_bins : int

    Returns
    -------
    (all_pdfs, all_edges, all_centers) with bin axis FIRST:
    pdfs (N_bins, rows, cols), edges (N_bins+1, rows, cols),
    centers (N_bins, rows, cols). All pixels share one common set of
    edges computed from the pooled finite data.
    """
    n_rows, n_cols = all_data.shape[1:3]

    all_pdfs = np.zeros((N_bins, n_rows, n_cols))
    all_edges = np.zeros((N_bins + 1, n_rows, n_cols))

    # Common edges from the pooled, finite data so every pixel's
    # histogram is directly comparable.
    flat_data = all_data.ravel()
    flat_data = flat_data[np.isfinite(flat_data)]
    common_edges = np.histogram_bin_edges(flat_data, bins=N_bins)

    for i in range(n_rows):
        for j in range(n_cols):
            tmp = all_data[:, i, j].ravel()
            tmp = tmp[np.isfinite(tmp)]
            all_pdfs[:, i, j], _ = np.histogram(
                tmp, density=True, bins=common_edges)
            all_edges[:, i, j] = common_edges

    # Bin centers, along the (leading) bin axis.
    all_centers = (all_edges[:-1] + all_edges[1:]) / 2

    return all_pdfs, all_edges, all_centers

def create_stats_f_data(stats_coords_f, g_x):
    """Evaluate node `g_x` one time slice at a time and concat over time."""
    stats_f_data = []
    shape = stats_coords_f.shape
    shape = (1, shape[1], shape[2])  # set maximum time shape to 1
    for coords in stats_coords_f.iterchunks(shape):
        # Should be fast -- all local cached data
        # (except vegetation, for some reason?)
        stats_f_data.append(g_x.eval(coords))
    stats_f_data = xr.concat(stats_f_data, 'time')

    return stats_f_data


"""
Data Visualization
"""

def plot_year_long_sm(year_long_data):
    """Show one subplot per month of a year of soil-moisture grids."""
    n_months = len(year_long_data)
    n_cols = 3
    n_rows = (n_months + n_cols - 1) // n_cols

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 4))
    axes = axes.flatten()

    for month, sm_data in enumerate(year_long_data, start=1):
        ax = axes[month - 1]
        img = ax.imshow(sm_data, cmap='viridis', origin='lower')
        ax.set_title(f'Month {month}')
        ax.axis('off')

    # Remove unused subplots
    for ax in axes[n_months:]:
        ax.remove()

    # Shared colorbar on the right edge.
    fig.subplots_adjust(right=0.8)
    cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
    fig.colorbar(img, cax=cbar_ax, label='Soil Moisture')

    plt.show()

def create_histogram(m_sm_matrices, N_bins):
    """Histogram of the spatial-average soil moisture of each matrix."""
    avg_soil_moisture = [np.mean(matrix) for matrix in m_sm_matrices]

    plt.hist(avg_soil_moisture, bins=N_bins)
    plt.xlabel('Average Soil Moisture')
    plt.ylabel('Frequency')
    plt.title('Histogram of Soil Moisture Matrices')
    plt.show()


def plot_pdf_for_single_grid_square(all_pdfs, all_edges, row, col):
    """Plot the estimated PDF for one (row, col) grid square."""
    pdf = all_pdfs[:, row, col]
    edges = all_edges[:, row, col]

    centers = (edges[:-1] + edges[1:]) / 2

    plt.plot(centers, pdf)
    plt.xlabel("Value")
    plt.ylabel("Probability Density")
    plt.title(f"PDF for Grid Square ({row}, {col})")
    plt.show()

"""
Main
"""

def approx_sm(N, YEARS, N_BINS, DO_PLOT, monthly_coefficients):
    """Toy end-to-end demo: compare the true mean of g(data) against the
    cheap histogram-based approximation sum(g(centers) * pdf * widths).

    Prints the mean absolute error; plots are produced only when
    DO_PLOT is True.
    """

    """
    Data Gen
    """
    average_sm = generate_sm_data(N, random_seed=0)
    decade_long_data = decade_long_sm(average_sm, monthly_coefficients, YEARS)

    if DO_PLOT:
        year_long_data = year_long_sm(average_sm, monthly_coefficients)
        plot_year_long_sm(year_long_data)

    print("Data Generated")

    """
    Coordinate Gen
    """
    coords = create_coordinates(N, YEARS)
    print(coords)

    """
    Create PDF's for each grid square
    """
    data = np.array(decade_long_data)
    all_pdfs, all_edges, all_centers = create_pdf(data, N_BINS)

    """
    Calculate Means
    """
    true_mean = g(data).mean(0)
    # Expected value of g under the per-pixel histogram:
    # sum over bins of g(center) * density * bin width.
    cheap_mean = (g(all_centers) * all_pdfs * (np.diff(all_edges, axis=0))).sum(axis=0)
    print("Mean Absolute Error", np.abs(true_mean - cheap_mean).mean())

    # BUGFIX: was `if DO_PLOT or True:` -- a leftover debug override that
    # forced plotting even when DO_PLOT was False.
    if DO_PLOT:
        plt.figure()
        ax1 = plt.subplot(131)
        plt.imshow(true_mean)
        plt.title("Truth")
        plt.colorbar()
        plt.subplot(132, sharex=ax1, sharey=ax1)
        plt.imshow(cheap_mean,)
        plt.title(f"Cheap,Approx, nbins={N_BINS}")
        plt.colorbar()
        plt.subplot(133, sharex=ax1, sharey=ax1)
        plt.title("Cheap - Truth")
        plt.imshow(cheap_mean - true_mean, cmap='BrBG')  # , vmin=-0.02, vmax=0.02)
        plt.colorbar()
        plt.show()


if __name__ == '__main__':
    # Constants
    N = 50        # size of the fine coords
    YEARS = 10    # Total number of years to create years for
    N_BINS = 32   # Number of bins for the histogram
    DO_PLOT = False  # plots or not
    monthly_coefficients = [1.2, 1.1, 1.0, 0.8, 0.6, 0.5,
                            0.4, 0.6, 0.8, 1.0, 1.1, 1.2]
    approx_sm(N, YEARS, N_BINS, DO_PLOT, monthly_coefficients)
import podpac
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter
import random
from scipy.stats import gaussian_kde
import xarray as xr
from tqdm import tqdm
from matplotlib.colors import LogNorm


import soilmap.datalib.geowatch as geowatch


def get_coordinates():
    """Build fine (1000x1000, ~30 m) and coarse (strided) coordinate grids.

    Both grids cover a 30x30 km box centered on Creare, with a single
    placeholder time value that later gets replaced/expanded.
    Returns (c_coords, f_coords).
    """
    # Make a set of fine and coarse coordinates
    # Center coordinates about Creare
    center_lat, center_lon = 43.682102, -72.233455
    # get deltas using 110 km / deg and a 30 x 30 km box
    f_box_lat_lon = 30 / 2 / 110
    n_fine = int(30 / (30 / 1000)) # Number of 30 m boxes in a 30km square (hint, it's 1000)
    n_coarse = 3
    f_coords = podpac.Coordinates([
        podpac.clinspace(center_lat + f_box_lat_lon, center_lat - f_box_lat_lon, n_fine),
        podpac.clinspace(center_lon - f_box_lat_lon, center_lon + f_box_lat_lon, n_fine),
        ["2022-06-01T12"]], ["lat", "lon", "time"]
    )
    # Coarse grid is a strided subsample of the fine grid
    # (presumably chosen to land 3 coarse pixels in the box -- TODO confirm).
    c_coords = f_coords[::333, 100::400]

    return c_coords, f_coords

def fill_time_dimensions(c_coords, f_coords):
    """Expand the single time value to daily steps over a month, repeated
    yearly over the past 4 years, for both coarse and fine grids.

    Returns (all_c_coords, all_f_coords) with the full time dimension.
    """
    # make the coordinates for the brute-force technique
    e1 = podpac.algorithm.ExpandCoordinates(time=["0,D", "30,D", '1,D'])
    day_time_coords = podpac.Coordinates([e1.get_modified_coordinates1d(f_coords, 'time')])
    e2 = podpac.algorithm.ExpandCoordinates(source=e1, time=["-4,Y", "0,Y", '1,Y'])
    all_time_coords = podpac.Coordinates([e2.get_modified_coordinates1d(day_time_coords, 'time')])

    all_f_coords = podpac.coordinates.merge_dims([f_coords.drop('time'), all_time_coords])
    all_c_coords = podpac.coordinates.merge_dims([c_coords.drop('time'), all_time_coords])


    return all_c_coords, all_f_coords

def get_veg_node(all_f_coords):
    """Build a time-invariant vegetation node from a one-month time average.

    Evaluates the vegetation layer one time slice at a time, averages
    over time, and wraps the result in a podpac Array node on the fine
    spatial grid.
    """
    # Also, use the average of the vegetation, for initial testing
    sm = geowatch.SoilMoisture()
    o_veg = []
    vegetation = sm.vegetation
    # NOTE(review): assumes time is the third axis of all_f_coords; the
    # second [:, :, :32*5:5] slice re-applies the same stride -- confirm
    # this double-slicing is intentional.
    one_month_coords = all_f_coords[:, :, :32*5:5]
    shape = one_month_coords[:, :, :32*5:5].shape
    shape = (shape[0], shape[1], 1) # set maximum time shape to 1
    for coords in one_month_coords.iterchunks(shape):
        o_veg.append(vegetation.eval(coords))
    o_veg = xr.concat(o_veg, 'time')
    print(o_veg.mean('time').data)
    veg_node = podpac.data.Array(source=o_veg.mean('time').data, coordinates=all_f_coords.drop('time'))

    return veg_node

def get_constant_veg_node(all_f_coords, constant_veg_value):
    """Return an Array node holding a constant vegetation value on the
    spatial (no-time) grid of all_f_coords."""
    # Get the spatial coordinates from all_f_coords
    spatial_coords = all_f_coords.drop('time')

    # Create an array with the same spatial shape as all_f_coords but filled with the constant vegetation value
    constant_veg_data = np.full(spatial_coords.shape, constant_veg_value)

    # Create a PODPAC Array node with the constant vegetation data and the spatial coordinates
    constant_veg_node = podpac.data.Array(source=constant_veg_data, coordinates=spatial_coords)

    return constant_veg_node

def get_constant_land_use(all_f_coords, constant_land_value):
    """Return an Array node holding a constant land-use value on the
    spatial (no-time) grid of all_f_coords."""
    # Get the spatial coordinates from all_f_coords
    spatial_coords = all_f_coords.drop('time')

    # Create an array with the same spatial shape as all_f_coords but filled with the constant vegetation value
    constant_veg_data = np.full(spatial_coords.shape, constant_land_value)

    # Create a PODPAC Array node with the constant vegetation data and the spatial coordinates
    constant_land_node = podpac.data.Array(source=constant_veg_data, coordinates=spatial_coords)

    return constant_land_node

def get_calibrated_sm(all_c_coords, veg_node, land_use_node):
    """Evaluate absolute and relative coarse soil moisture over all time
    slices, then build a SoilMoisture node backed by bilinear Array nodes.

    Returns (all_ca_data, all_cr_data, sm_node) where the data are
    xarray objects concatenated along 'time'.
    """
    # Quick test to make sure everything makes sense
    sm = geowatch.SoilMoisture(vegetation=veg_node, vegetation_mean=veg_node, land_use=land_use_node)
    sm_ca = sm.solmst_0_10
    sm_cr = sm.relsolmst_0_10 # you can compute this from the abs soil moisture above... but I'm lazy so I'll just get this data for now

    all_ca_data = [] #coarse absolute
    all_cr_data = [] #coarse relative
    shape = all_c_coords.shape
    shape = (shape[0], shape[1], 1)
    for coords in all_c_coords.iterchunks(shape):
        all_ca_data.append(sm_ca.eval(coords))
        all_cr_data.append(sm_cr.eval(coords))

    # Merge the data in all_ca_data along the 'time' dimension
    all_ca_data = xr.concat(all_ca_data, 'time')

    # Merge the data in all_cr_data along the 'time' dimension
    all_cr_data = xr.concat(all_cr_data, 'time')

    # All Coarse Absolute Data
    all_ca_arr = podpac.data.Array(
        source=all_ca_data,
        coordinates=all_c_coords,
        interpolation="bilinear"
    )
    # All Coarse Relative Data
    all_cr_arr = podpac.data.Array(
        source=all_cr_data,
        coordinates=all_c_coords,
        interpolation="bilinear"
    )

    # Create a new SoilMoisture node that adjusts the soil moisture data based on vegetation data
    return all_ca_data, all_cr_data, geowatch.SoilMoisture(solmst_0_10=all_ca_arr, relsolmst_0_10=all_cr_arr, vegetation=veg_node, vegetation_mean=veg_node, land_use=land_use_node)

def get_stats_data(all_c_coords, all_f_coords, all_ca_edges, all_cr_edges, constant_veg_value, constant_land_value, n_bins):
    """Evaluate the calibrated SM model at the histogram-bin CENTERS.

    The bin centers are loaded into Array nodes whose time coordinates
    are mocked (only spatial interpolation is meaningful), then a
    SoilMoisture node is evaluated over them slice by slice.
    """
    # Get the centers of the bins
    all_ca_centers = (all_ca_edges[..., 1:] + all_ca_edges[..., :-1]) * 0.5
    all_cr_centers = (all_cr_edges[..., 1:] + all_cr_edges[..., :-1]) * 0.5

    # Get stats coords
    stats_coords_f = all_f_coords[:, :, :n_bins * 2:2]
    stats_coords = all_c_coords[:, :, :n_bins * 2:2]

    # Create abs soil moisture coords at the centers of the bins
    abs_ws_stats = podpac.data.Array(
        source=all_ca_centers, # Here's the weatherscale data at the centers of the bins
        coordinates=stats_coords, # Mock the coordinates
        interpolation="bilinear"
    )
    # Create relative soil moisture coords at the centers of the bins
    rel_ws_stats = podpac.data.Array(
        source=all_cr_centers,
        coordinates=stats_coords, # Mock the coordinates
        interpolation="bilinear"
    )
    # Get the spatial coordinates from stats_coords
    spatial_coords = stats_coords.drop('time')

    constant_veg_data = np.full(spatial_coords.shape, constant_veg_value)
    veg_node = podpac.data.Array(source=constant_veg_data, coordinates=spatial_coords)

    constant_land_data = np.full(spatial_coords.shape, constant_land_value)
    land_use_node = podpac.data.Array(source=constant_land_data, coordinates=spatial_coords)

    # Make the g(x) function
    sm_stats = geowatch.SoilMoisture(solmst_0_10=abs_ws_stats, relsolmst_0_10=rel_ws_stats, vegetation=veg_node, vegetation_mean=veg_node, land_use=land_use_node)

    # Evaluate g(x)
    stats_f_data = []
    shape = stats_coords_f.shape
    shape = (shape[0], shape[1], 1) # set maximum time shape to 1
    for coords in stats_coords_f.iterchunks(shape):
        stats_f_data.append(sm_stats.eval(coords)) # should be fast -- all local cached data (Except vegetation for some reason? )
    stats_f_data = xr.concat(stats_f_data, 'time')

    return stats_f_data

def get_fine_pdfs_edges(all_ca_pdfs, all_ca_edges, n_bins, all_c_coords, all_f_coords):
    """Bilinearly interpolate coarse per-pixel histogram edges and pdfs
    onto the fine grid. Returns (ca_edges_f, ca_pdfs_f)."""
    ca_edges = podpac.data.Array(
        source=all_ca_edges,
        coordinates=all_c_coords[:, :, :n_bins + 1], # Again, the time coordinate here is fake -- I just want to interpolate space
        interpolation='bilinear', )
    ca_edges_f = ca_edges.eval(all_f_coords[:, :, :n_bins + 1])
    # pdfs
    ca_pdfs = podpac.data.Array(
        source=all_ca_pdfs,
        coordinates=all_c_coords[:, :, :n_bins], # Again, the time coordinate here is fake -- I just want to interpolate space
        interpolation='bilinear',
    )
    ca_pdfs_f = ca_pdfs.eval(all_f_coords[:, :, :n_bins])

    return ca_edges_f, ca_pdfs_f

def get_fine_data(all_f_coords, sm_centered):
    """Brute-force reference: evaluate the calibrated SM node over every
    fine time slice and concatenate along 'time'."""
    # get the data for the cheap approach
    all_f_data = []
    shape = all_f_coords.shape
    shape = (shape[0], shape[1], 1)
    for coords in all_f_coords.iterchunks(shape):
        all_f_data.append(sm_centered.eval(coords))
    all_f_data = xr.concat(all_f_data, 'time')

    return all_f_data


def create_pdfs(n_bins, all_ca_data, all_cr_data):
    """Per-pixel density histograms over time for absolute and relative
    coarse soil moisture.

    Returns (all_ca_pdfs, all_ca_edges, all_cr_pdfs, all_cr_edges), with
    the bin axis LAST (unlike toy_example's create_pdf, which puts it
    first); each pixel gets its own edges here.
    """
    # Create new arrays with a third dimension of size n_bins.
    new_shape = all_ca_data.shape[:2] + (n_bins,)
    new_shape_edges = all_ca_data.shape[:2] + (n_bins + 1,)
    all_ca_pdfs = np.zeros((new_shape))
    all_ca_edges = np.zeros((new_shape_edges))
    all_cr_pdfs = np.zeros((new_shape))
    all_cr_edges = np.zeros((new_shape_edges))

    # Loop over each element in the 2D array, computing the histograms for the corresponding 1D data.
    for i in range(new_shape[0]):
        for j in range(new_shape[1]):
            # Compute the histogram for the i,j-th element of all_ca_data.
            tmp = all_ca_data.data[i, j].ravel()
            tmp = tmp[np.isfinite(tmp)]
            all_ca_pdfs[i, j, :], all_ca_edges[i, j, :] = np.histogram(tmp, density=True, bins=n_bins)

            # Compute the histogram for the i,j-th element of all_cr_data.
            tmp = all_cr_data.data[i, j].ravel()
            tmp = tmp[np.isfinite(tmp)]
            all_cr_pdfs[i, j, :], all_cr_edges[i, j, :] = np.histogram(tmp, density=True, bins=n_bins)

    return all_ca_pdfs, all_ca_edges, all_cr_pdfs, all_cr_edges

def validate_pdfs(all_ca_pdfs, all_ca_edges):
    """Sanity-check one pixel's histogram: density must integrate to 1;
    then plot it. Debug helper, not called in the main pipeline."""
    # Check on one of these
    assert np.abs(1 - (all_ca_pdfs[1, 1] * (all_ca_edges[1, 1][1:] - all_ca_edges[1, 1][:-1])).sum()) < 1e-14
    plt.stairs(all_ca_pdfs[1, 1], all_ca_edges[1, 1], fill=True)
    plt.stairs(all_ca_pdfs[1, 1], all_ca_edges[1, 1], color='k', fill=False)
    plt.show()

def compute_mse(cheap_mean, truth_mean):
    """Mean squared error between the two fields, ignoring pixels where
    either field is non-finite."""
    squared_diffs = np.nan_to_num((cheap_mean - truth_mean)**2)
    valid_mask = np.isfinite(cheap_mean) & np.isfinite(truth_mean) # mask where neither array is NaN

    # Compute the mean of the squared differences, ignoring any NaN values
    mse = np.nanmean(squared_diffs[valid_mask])
    return mse

def plot_difference(arr1, arr2, n_bins, title=""):
    """Three-panel plot: truth, cheap approximation, and their difference."""
    # Plot results
    plt.figure()
    kwargs = dict()
    ax1 = plt.subplot(131)
    plt.imshow(arr1, **kwargs)
    plt.title("Truth "+title)
    plt.colorbar()
    plt.subplot(132, sharex=ax1, sharey=ax1)
    plt.imshow(arr2, **kwargs)
    plt.title(f"Cheap,Approx, nbins={n_bins}, " + title)
    plt.colorbar()
    plt.subplot(133, sharex=ax1, sharey=ax1)
    plt.title("Cheap - Truth "+ title)
    plt.imshow(arr2 - arr1, cmap='BrBG')
    plt.colorbar()
    plt.show()

def eff_stats_calc(n_bins, veg_value, land_use_value, DO_LOG=False, DO_PLOT=False):
    """Run the full cheap-vs-brute-force comparison for one bin count.

    Pipeline: coordinates -> time expansion -> constant veg/land nodes ->
    coarse SM evaluation -> per-pixel PDFs -> model evaluated at bin
    centers -> PDFs/edges interpolated to fine grid -> cheap mean/var vs
    brute-force time statistics.

    Returns (cheap_mean, truth_mean, mse).
    """
    # Get lat and lon coordinates for coarse data with empty time
    c_coords, f_coords = get_coordinates()

    # Fill in time dimension
    all_c_coords, all_f_coords = fill_time_dimensions(c_coords, f_coords)

    if DO_LOG:
        print("Got Coords...")

    # Get a vegetation node for constant vegetation:
    # veg_node = get_veg_node(all_f_coords)
    veg_node = get_constant_veg_node(all_f_coords, veg_value)
    land_use_node = get_constant_land_use(all_f_coords, land_use_value)

    # Get a soil moisture node with centered on coarse coords
    all_ca_data, all_cr_data, sm_centered = get_calibrated_sm(all_c_coords, veg_node, land_use_node)

    if DO_LOG:
        print("Calibrated SM...")

    # Get Fine SM Data:
    all_f_data = get_fine_data(all_f_coords, sm_centered)

    if DO_LOG:
        print("Got Fine Data...")

    # Create a PDF from the data
    all_ca_pdfs, all_ca_edges, all_cr_pdfs, all_cr_edges = create_pdfs(n_bins, all_ca_data, all_cr_data)
    #TODO: Figure out how we can use coarse relative pdfs?
    # validate_pdfs(all_ca_pdfs, all_ca_edges)

    if DO_LOG:
        print("Created PDFs...")

    # Create a SM node with the centers
    stats_f_data = get_stats_data(all_c_coords, all_f_coords, all_ca_edges, all_cr_edges, veg_value, land_use_value ,n_bins)

    if DO_LOG:
        print("Got Stats Data...")

    # Get the fine pdfs by interpolating coarse-scale data edges
    # TODO: figure out how to use coarse relative pdfs?
    ca_edges_f, ca_pdfs_f = get_fine_pdfs_edges(all_ca_pdfs, all_ca_edges, n_bins, all_c_coords, all_f_coords)

    if DO_LOG:
        print("Got Fine PDFs...")

    # Calculate cheap mean and variance:
    # E[g] ~= sum over bins of g(center) * density * bin width
    cheap_mean = (stats_f_data.data * ca_pdfs_f.data * (ca_edges_f[..., 1:].data - ca_edges_f[..., :-1].data)).sum(axis=-1)
    deviations = stats_f_data.data - cheap_mean[..., np.newaxis]
    cheap_var = (deviations**2 * ca_pdfs_f.data * (ca_edges_f[..., 1:].data - ca_edges_f[..., :-1].data)).sum(axis=-1)


    if DO_LOG:
        print("Calculated Cheap Mean...")

    # Calculate correct mean (brute force over the time axis)
    truth_mean = all_f_data.mean('time').data
    truth_var = all_f_data.var('time').data

    if DO_LOG:
        print("Calculated Truth Mean...")

    if DO_PLOT:
        plot_difference(truth_mean, cheap_mean, n_bins, title="Mean")
        plot_difference((truth_var), (cheap_var), n_bins, title="Variance")


    # NOTE(review): std devs are computed but not returned or plotted.
    cheap_std_dev = np.sqrt(cheap_var)
    truth_std_dev = np.sqrt(truth_var)

    if DO_LOG:
        print("Calculated StdDev...")

    # Compute the squared differences, ignoring any NaN values
    mse = compute_mse(cheap_mean, truth_mean)

    return cheap_mean, truth_mean, mse


if __name__ == '__main__':
    """
    1. Generate PDFs out of coarse data
    2. Calculate centers for that data
    3. Create SoilMoisutre node using those centers
    4. Evaluate SM node at those centers
    5. Multiply eval by pdf and bin widths
    """
    with podpac.settings:
        # Constants
        VEG_VALUE = 50
        LAND_USE_VALUE = 0
        DO_LOG = True
        DO_PLOT = False

        # Cache
        podpac.settings["DEFAULT_CACHE"] = ["ram", "disk"]
        podpac.settings["MULTITHREADING"] = False
        # podpac.settings["CACHE_OUTPUT_DEFAULT"] = False
        # podpac.utils.clear_cache(mode='ram')
        # podpac.utils.clear_cache(mode='disk')

        # Initialize lists to store the bin counts and the corresponding MSE values
        bin_counts = []
        mse_values = []

        # Initialize the progress bar
        pbar = tqdm(total=7, desc='Processing bins', ncols=155)

        # Run eff_stats_calc for 1, 2, 4, 8, 16, 32, 64 bins
        # [1, 2, 4, 8, 16, 32, 64]:
        for n_bins in [1, 2, 4, 8, 16, 32, 64]:
            # Plots are forced on for the final (64-bin) run only.
            if n_bins == 64:
                _, _, mse = eff_stats_calc(n_bins, VEG_VALUE, LAND_USE_VALUE, DO_LOG, True)
            else:
                _, _, mse = eff_stats_calc(n_bins, VEG_VALUE, LAND_USE_VALUE, DO_LOG, DO_PLOT)
            bin_counts.append(n_bins)
            mse_values.append(mse)

            #Update the progress bar
            pbar.update(1)

        pbar.close()

        # Plot the results
        plt.figure(figsize=(10,6))
        plt.loglog(bin_counts, mse_values, marker='o')
        plt.title('Mean Squared Error vs Number of Bins')
        plt.xlabel('Number of Bins')
        plt.ylabel('Mean Squared Error')
        plt.grid(True)
        plt.show()
efficient-stats-calc/deep-learn/cnn.py | 47 ----- efficient-stats-calc/deep-learn/mlp.py | 46 ----- .../{pdfs => }/real_example.py | 0 .../sparse-grid/sparse_grid_approx.py | 71 ------- .../{pdfs/toy_examples.py => toy_example.py} | 0 10 files changed, 683 deletions(-) delete mode 100644 efficient-stats-calc/benchmarks/benchmark_cdf.py delete mode 100644 efficient-stats-calc/benchmarks/benchmark_mean.py delete mode 100644 efficient-stats-calc/data_gen.py delete mode 100644 efficient-stats-calc/datalib/real_data.md delete mode 100644 efficient-stats-calc/datalib/soil_mositure.py delete mode 100644 efficient-stats-calc/deep-learn/cnn.py delete mode 100644 efficient-stats-calc/deep-learn/mlp.py rename efficient-stats-calc/{pdfs => }/real_example.py (100%) delete mode 100644 efficient-stats-calc/sparse-grid/sparse_grid_approx.py rename efficient-stats-calc/{pdfs/toy_examples.py => toy_example.py} (100%) diff --git a/efficient-stats-calc/benchmarks/benchmark_cdf.py b/efficient-stats-calc/benchmarks/benchmark_cdf.py deleted file mode 100644 index c288974d..00000000 --- a/efficient-stats-calc/benchmarks/benchmark_cdf.py +++ /dev/null @@ -1,111 +0,0 @@ -import time -import numpy as np - -import sys -import os -sys.path.append(os.path.abspath('.')) -from data_gen import * -import data_gen -from tensorflow.keras.models import load_model - - -import time - -MLP_MODEL = "models/mlp.h5" - -# Constants -CNN_MODEL = 'models/cnn_1d.h5' -N_SAMPLES = 1024 - -# Brute Force -- computing s(f(x,y)) -def brute_force(x_cdf, y_cdf): - return NotImplementedError - - -# Multi-layer Perceptron -def mlp(x_cdf, y_cdf): - model = load_model(MLP_MODEL) - print(x_cdf) - print(model.predict(np.array([np.hstack([x_cdf,y_cdf])]))[0]) - return model.predict(np.array([np.hstack([x_cdf,y_cdf])]))[0] - - -# Convolutional Neural Network -def cnn(x_cdf, y_cdf): - # Load the trained CNN model - model = load_model(CNN_MODEL) - - cdf = np.array([np.hstack([x_cdf,y_cdf])]) - cdf = cdf.reshape(-1, cdf.shape[1], 
1) - - - # Perform prediction using the model - prediction = model.predict(cdf)[0] - - print(prediction) - - return prediction - -def benchmark(methods, data_gen, f, n_runs=1, n_sets=10): - results = {} - - def calculate_cdf(data): - sorted_data = np.sort(data) - probabilities = np.linspace(0, 1, len(sorted_data)) - return sorted_data, probabilities - - for name, method in methods.items(): - # Initialize variables to store execution time and errors - exec_times = [] - errors = [] - - for _ in range(n_runs): - set_errors = [] - for i in range(n_sets): - x, y = data_gen.generate_data(N_SAMPLES) - c_true = f(x, y) - - # Calculate the CDFs of x, y, and f(x, y) - x_sorted, x_cdf = calculate_cdf(x) - y_sorted, y_cdf = calculate_cdf(y) - c_sorted, c_cdf_true = calculate_cdf(c_true) - - # Measure execution time - start_time = time.time() - c_cdf_approx = method(x_cdf, y_cdf) - exec_time = time.time() - start_time - - # Calculate the error - error = np.mean(np.abs(c_cdf_approx - c_cdf_true)) - set_errors.append(error) - - exec_times.append(exec_time) - errors.append(np.mean(set_errors)) - - # Calculate average execution time and error - avg_exec_time = np.mean(exec_times) - avg_error = np.mean(errors) - - results[name] = { - 'average_execution_time': avg_exec_time, - 'average_error': avg_error - } - - return results - - -# Define methods to benchmark -methods = { - 'mlp': mlp, - 'cnn': cnn -} - -# Run the benchmark -results = benchmark(methods, data_gen, f, n_runs=1, n_sets=1) -print("Benchmark results:") -for method, result in results.items(): - print(f"{method}:") - print(f" Average execution time: {result['average_execution_time']:.6f} seconds") - print(f" Average error: {result['average_error']:.6f}") - - diff --git a/efficient-stats-calc/benchmarks/benchmark_mean.py b/efficient-stats-calc/benchmarks/benchmark_mean.py deleted file mode 100644 index f07d1fc0..00000000 --- a/efficient-stats-calc/benchmarks/benchmark_mean.py +++ /dev/null @@ -1,93 +0,0 @@ -import time 
-import numpy as np -from data_gen import * -from tensorflow.keras.models import load_model - -MLP_MODEL = "models/mlp.h5" - -# Constants -CNN_MODEL = 'models/cnn.h5' -WIDTH = 32 -HEIGHT = 32 - -# Brute Force -- computing s(f(x,y)) -def brute_force(x, y): - return np.mean(f(x,y)) - -# Multi-layer Perceptron -def mlp(x, y): - model = load_model(MLP_MODEL) - return model.predict(np.array([np.hstack([x,y])]))[0][0] - - -# Convolutional Neural Network -def cnn(x, y): - # Load the trained CNN model - model = load_model(CNN_MODEL) - - # Reshape x and y into the format expected by the CNN - x_data = x.reshape(HEIGHT, WIDTH) - y_data = y.reshape(HEIGHT, WIDTH) - data = np.stack([x_data, y_data], axis=-1) # shape: (height, width, 2) - input_data = np.expand_dims(data, axis=0) # shape: (1, height, width, 2) - - # Perform prediction using the model - prediction = model.predict(input_data)[0][0] - - return prediction - - -# Benchmarking tool -def benchmark(methods, x, y, f, n_runs=10): - results = {} - - # Calculate the ground truth s(c) - c_true = f(x, y) - s_c_true = np.mean(c_true) - - for name, method in methods.items(): - # Initialize variables to store execution time and errors - exec_times = [] - errors = [] - - for _ in range(n_runs): - # Measure execution time - start_time = time.time() - s_c_approx = method(x, y) - exec_time = time.time() - start_time - exec_times.append(exec_time) - - # Calculate the error - error = np.abs(s_c_approx - s_c_true) - errors.append(error) - - # Calculate average execution time and error - avg_exec_time = np.mean(exec_times) - avg_error = np.mean(errors) - - results[name] = { - 'average_execution_time': avg_exec_time, - 'average_error': avg_error - } - - return results - -# Define methods to benchmark -methods = { - 'brute_force': brute_force, - 'multilayer_perceptron': mlp, - 'convolutional': cnn -} - -# Generate toy data -n_samples = WIDTH*HEIGHT -x, y = generate_data(n_samples) - - -# Run the benchmark -results = benchmark(methods, x, 
y, f) -print("Benchmark results:") -for method, result in results.items(): - print(f"{method}:") - print(f" Average execution time: {result['average_execution_time']:.6f} seconds") - print(f" Average error: {result['average_error']:.6f}") diff --git a/efficient-stats-calc/data_gen.py b/efficient-stats-calc/data_gen.py deleted file mode 100644 index 9ff51713..00000000 --- a/efficient-stats-calc/data_gen.py +++ /dev/null @@ -1,174 +0,0 @@ -import numpy as np -import concurrent.futures -from scipy.stats import gaussian_kde -from scipy.integrate import cumtrapz - -# Non-linear n-dimensional function f(x, y) -def f(x, y): - return np.sin(x) * np.cos(y) - -def generate_data(n_samples, seed=None): - - if seed is not None: - np.random.seed(seed) - - x = np.random.uniform(-np.pi, np.pi, n_samples) - y = np.random.uniform(-np.pi, np.pi, n_samples) - return x, y -# Calculate statistics s(x), s(y) and s(c) -def calculate_statistics(x, y, func): - s_x = np.mean(x) - s_y = np.mean(y) - c = func(x, y) - s_c = np.mean(c) - return s_x, s_y, s_c - -# Generate toy data and calculate statistics -def create_toy_data(n_samples, func): - x, y = generate_data(n_samples) - s_x, s_y, s_c = calculate_statistics(x, y, func) - return x, y, s_x, s_y, s_c - -""" -Example Usage -""" -# n_samples = 1000 -# x, y, s_x, s_y, s_c = create_toy_data(n_samples, f) -# print("s(x):", s_x) -# print("s(y):", s_y) -# print("s(c):", s_c) - - - - -def generate_data_cnn(N, data_gen, width, height): - X = [] - y = [] - - for _ in range(N): - x_data, y_data = data_gen.generate_data(width * height) - x_data = x_data.reshape(height, width) - y_data = y_data.reshape(height, width) - data = np.stack([x_data, y_data], axis=-1) # shape: (height, width, 2) - X.append(data) - y.append(np.mean(data_gen.f(x_data, y_data))) - - return np.array(X), np.array(y) - -def generate_pdf_data(N, data_gen, num_points=1024): - X = [] - y = [] - - for _ in range(N): - data = np.hstack(data_gen.generate_data(num_points)) - x_data = 
data[:num_points] - y_data = data[num_points:] - - # Estimate the PDFs of x and y using KDE - kde_x = gaussian_kde(x_data) - kde_y = gaussian_kde(y_data) - - # Compute f(x, y) and estimate its PDF using KDE - f_data = data_gen.f(x_data, y_data) - kde_f = gaussian_kde(f_data) - - # Stack the PDFs of x, y, and f(x, y) as input features and target - x_pdf = kde_x(x_data) - y_pdf = kde_y(y_data) - f_pdf = kde_f(f_data) - - X.append(np.hstack([x_pdf, y_pdf])) - y.append([f_pdf]) - - - return np.array(X), np.array(y) - -def generate_cdf_data(N, data_gen, num_points=1024, grid_size=100): - X = [] - y = [] - - for _ in range(N): - data = np.hstack(data_gen.generate_data(num_points)) - x_data = data[:num_points] - y_data = data[num_points:] - - # Create a lower resolution grid - x_grid = np.linspace(x_data.min(), x_data.max(), grid_size) - y_grid = np.linspace(y_data.min(), y_data.max(), grid_size) - - # Estimate the PDFs of x and y using KDE - # Example: Use Silverman's rule of thumb - kde_x = gaussian_kde(x_data, bw_method='silverman') - kde_y = gaussian_kde(y_data, bw_method='silverman') - - # Compute f(x, y) and estimate its PDF using KDE - f_data = data_gen.f(x_data, y_data) - kde_f = gaussian_kde(f_data, bw_method='silverman') - - # Compute the CDFs of x, y, and f(x, y) using the PDFs on the grid - x_cdf = cumtrapz(kde_x(x_grid), x_grid, initial=0) - y_cdf = cumtrapz(kde_y(y_grid), y_grid, initial=0) - f_cdf = cumtrapz(kde_f(f_data), f_data, initial=0) - - # Normalize the CDFs - x_cdf /= x_cdf[-1] - y_cdf /= y_cdf[-1] - f_cdf /= f_cdf[-1] - - X.append(np.hstack([x_cdf, y_cdf])) - y.append([f_cdf]) - - return np.array(X), np.array(y) - -def generate_cdf_data_parallel(N, data_gen, num_workers=8, num_points=1024): - # Define a helper function to process each data point - def process_datapoint(data_gen): - data = np.hstack(data_gen.generate_data(num_points)) - x_data = data[:num_points] - y_data = data[num_points:] - - # Calculate the CDFs of x, y, and f(x, y) directly 
from the data points - x_sorted, x_cdf = calculate_cdf(x_data) - y_sorted, y_cdf = calculate_cdf(y_data) - f_data = data_gen.f(x_data, y_data) - f_sorted, f_cdf = calculate_cdf(f_data) - - return np.hstack([x_cdf, y_cdf]), f_cdf - - def calculate_cdf(data): - sorted_data = np.sort(data) - probabilities = np.linspace(0, 1, len(sorted_data)) - return sorted_data, probabilities - - X = [] - y = [] - - with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor: - # Parallelize the generation of data points - futures = [executor.submit(process_datapoint, data_gen) for _ in range(N)] - - for future in futures: - x_cdf_y_cdf, f_cdf = future.result() - X.append(x_cdf_y_cdf) - y.append([f_cdf]) - - return np.array(X), np.array(y) - -def save_cdf_data(X, y, X_filename, y_filename): - np.save(X_filename, X) - np.save(y_filename, y) - -def load_cdf_data(X_filename, y_filename): - X = np.load(X_filename) - y = np.load(y_filename) - return X, y - -N = 10000 -import data_gen - -X_train, y_train = generate_cdf_data_parallel(N, data_gen, num_workers=8) -save_cdf_data(X_train, y_train, "data/X_train.npy", "data/y_train.npy") - - -X_val, y_val = generate_cdf_data_parallel(200, data_gen, num_workers=8) -save_cdf_data(X_train, y_train, "data/X_val.npy", "data/y_val.npy") \ No newline at end of file diff --git a/efficient-stats-calc/datalib/real_data.md b/efficient-stats-calc/datalib/real_data.md deleted file mode 100644 index d5e38604..00000000 --- a/efficient-stats-calc/datalib/real_data.md +++ /dev/null @@ -1,49 +0,0 @@ -# Real Data - -Need: -- Soil Density (geowatch) -LAYERS=algorithm.standard_mobility.soil_density.SoilDensityComposite -- Soil Type (gw) -- Soil Texture (gw)? 
-LAYERS=datalib.soil.texture.USCSTextureComposite -- Soil Temperature (gw) -LAYERS=datalib.weather.erdc_lis72.SoilTemperature0to10 -- Soil Moisture (soilmap) - - -## Example - -``` -c=podpac.Coordinates([podpac.clinspace([43.700, 43.704, 128, 'lat'], podpac.clinspace([-72.305, -72.301, 128, 'lon'], '2023-01-01', ['lat', 'lon', 'time']) - -sm = soilmap.datalib.geowatch.SoilMoisture() -sm.eval(c) - -podpac.settings["username-PW@geowatch_server"] = {"username": "creare", "password": " 0: - pdfs.append(calculate_pdf(soil_moisture_data, days)) - print(year) - - return np.array(pdfs) - -def get_soil_moisture_levels(years, month, lat_clinspace, lon_clinspace, days): - sm = soilmap.datalib.geowatch.SoilMoisture().solmst_0_10 - soil_moisture_data_all_years = [] - - for year in years: - # Create a list of dates for the specified month up to and including the 28th day - dates = [f"{year}-{month:02d}-{day:02d}" for day in range(1, days-1)] - - soil_moisture_data = [] - - # request the data one day at a time: - for date in dates: - # Get the coordinates for the current date - current_c = podpac.Coordinates([lat_clinspace, lon_clinspace, date], ['lat', 'lon', 'time']) - print(current_c) - # Get soil moisture data for the given coordinates - o = sm.eval(current_c) - - if o is np.nan: - continue - print(o) - soil_moisture_data.append(o.values[1][1][0]) - - soil_moisture_data_all_years.append(soil_moisture_data) - - return np.array(soil_moisture_data_all_years) - -from scipy import stats - - - -month = 6 -years = range(2012, 2023) -lat_clinspace = podpac.clinspace(43.09, 42.91, 3, 'lat') -lon_clinspace = podpac.clinspace(-73.155, -72.91, 3, 'lon') -soil_moisture_data = get_soil_moisture_pdfs(years, month, lat_clinspace, lon_clinspace, 28) -print(soil_moisture_data) -podpac.utils.clear_cache(mode="all") -np.save("datalib/data/june_soil_moisture_pdfs.npy", soil_moisture_data) \ No newline at end of file diff --git a/efficient-stats-calc/deep-learn/cnn.py 
b/efficient-stats-calc/deep-learn/cnn.py deleted file mode 100644 index 6baf57f4..00000000 --- a/efficient-stats-calc/deep-learn/cnn.py +++ /dev/null @@ -1,47 +0,0 @@ -import numpy as np -import tensorflow as tf -from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Conv1D, Conv2D, Flatten, Dense -from tensorflow.keras.optimizers import Adam -import concurrent.futures - - - -import sys -import os -sys.path.append(os.path.abspath('.')) -sys.path.append(os.path.abspath('./deep-learn')) -from data_gen import load_cdf_data - - -X_train, y_train = load_cdf_data("data/X_train.npy", "data/y_train.npy") -X_val, y_val = load_cdf_data("data/X_val.npy", "data/y_val.npy") - -input_dim = X_train.shape[1] -output_dim=y_train.shape[2] - -print(input_dim) -print(output_dim) -# Update input_shape for the 1D CNN model -input_shape = (input_dim, 1) - -# Reshape the input data -X_train_cnn = X_train.reshape(-1, input_dim, 1) -X_val_cnn = X_val.reshape(-1, input_dim, 1) - -# Define the 1D CNN model -model = Sequential([ - Conv1D(32, kernel_size=3, activation='relu', input_shape=input_shape), - Conv1D(64, kernel_size=3, activation='relu'), - Flatten(), - Dense(64, activation='relu'), - Dense(output_dim, activation='linear') -]) - -# Compile the model -model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error', metrics=['mean_absolute_error']) - -# Train the model -history = model.fit(X_train_cnn, y_train, batch_size=32, epochs=10, validation_data=(X_val_cnn, y_val)) - -model.save("models/cnn_1d.h5") \ No newline at end of file diff --git a/efficient-stats-calc/deep-learn/mlp.py b/efficient-stats-calc/deep-learn/mlp.py deleted file mode 100644 index 7cdbf8a7..00000000 --- a/efficient-stats-calc/deep-learn/mlp.py +++ /dev/null @@ -1,46 +0,0 @@ -import numpy as np -import tensorflow as tf -from tensorflow.keras.models import Sequential -from tensorflow.keras.layers import Dense -from tensorflow.keras.optimizers import Adam - -import sys 
-import os -sys.path.append(os.path.abspath('.')) -from data_gen import load_cdf_data - - -# Example usage: -X_train, y_train = load_cdf_data("data/X_train.npy", "data/y_train.npy") -X_val, y_val = load_cdf_data("data/X_val.npy", "data/y_val.npy") - -print("X_train shape:", X_train.shape) -print("y_train shape:", y_train.shape) -print("X_val shape:", X_val.shape) -print("y_val shape:", y_val.shape) - - -# Set the dimensions of input and output -input_dim = X_train.shape[1] -output_dim = y_train.shape[2] - -print("Input Dim", input_dim) -print("Output Dim:", output_dim ) - -# Define the MLP model -model = Sequential([ - Dense(64, activation='relu', input_shape=(input_dim,)), - Dense(32, activation='relu'), - Dense(output_dim, activation='linear') -]) - -# Compile the model -model.compile(optimizer=Adam(learning_rate=1e-5), loss=tf.keras.losses.Huber(delta=1.0), metrics=['mean_absolute_error']) - -# Train the model -history = model.fit(X_train, y_train, batch_size=32, epochs=100, validation_data=(X_val, y_val)) - -print(model.predict(np.expand_dims(X_train[0], axis=0))) -print(y_train[0]) - -model.save("models/mlp.h5") diff --git a/efficient-stats-calc/pdfs/real_example.py b/efficient-stats-calc/real_example.py similarity index 100% rename from efficient-stats-calc/pdfs/real_example.py rename to efficient-stats-calc/real_example.py diff --git a/efficient-stats-calc/sparse-grid/sparse_grid_approx.py b/efficient-stats-calc/sparse-grid/sparse_grid_approx.py deleted file mode 100644 index 296b5c68..00000000 --- a/efficient-stats-calc/sparse-grid/sparse_grid_approx.py +++ /dev/null @@ -1,71 +0,0 @@ -import numpy as np -from scipy import interpolate - -# Step 1: Construct a sparse grid of sample points -# Define the number of dimensions -n_dim = 2 - -# Define the sparse grid -grid = ??? # Choose a sparse grid method - -# Define the number of sample points -n_points = ??? 
# Choose the number of sample points - -# Generate the sample points -sample_points = grid.generate_points(n_points) - -# Step 2: Interpolate the joint probability distribution -# Evaluate the PDFs at the sample points -pdf_x = ??? # Evaluate the PDF P(x) at the sample points -pdf_y = ??? # Evaluate the PDF P(y) at the sample points - -# Construct a meshgrid of the sample points -X, Y = np.meshgrid(sample_points[:, 0], sample_points[:, 1]) - -# Interpolate the joint probability distribution -joint_pdf = interpolate.interp2d(X, Y, pdf_x * pdf_y) - -# Step 3: Compute the PDF of the function output -# Define the function F(P(x), P(y)) -def F(p_x, p_y): - return ??? # Define the function F - -# Evaluate the function F at the sample points -f_values = F(pdf_x, pdf_y) - -# Compute the PDF of the function output -output_pdf = interpolate.interp1d(f_values, joint_pdf(X, Y)) - -# Step 4: Find the value c of the PDF of the function output that we want to approximate -c = ??? # Choose a value of the PDF of the function output that we want to approximate - -# Step 5: Find the set of input values (P(x), P(y)) that satisfy the equation F(P(x), P(y)) = PDF(c) -# Define the equation F(P(x), P(y)) = PDF(c) in terms of P(y) -def equation_p_y(p_x, p_y): - return output_pdf(F(p_x, p_y)) - c - -# Use a numerical solver to find the set of input values that satisfy the equation -input_values = ??? # Choose a numerical solver to find the input values - -# Step 6: Construct a sparse grid of sample points in the input space of the function F(P(x), P(y)) -# Define the sparse grid -grid_f = ??? # Choose a sparse grid method - -# Define the number of sample points -n_points_f = ??? 
# Choose the number of sample points - -# Generate the sample points -sample_points_f = grid_f.generate_points(n_points_f) - -# Step 7: Interpolate the function values at unsampled points -# Evaluate the function F at the sample points -f_values_f = F(sample_points_f[:, 0], sample_points_f[:, 1]) - -# Interpolate the function values -interpolated_f = interpolate.interp1d(f_values_f, sample_points_f) - -# Step 8: Compute the map F such that F(P(x), P(y)) = PDF(c) -# Use the interpolated function to compute the map -def map_F(p_x, p_y): - return interpolated_f(output_pdf(F(p_x, p_y))) - diff --git a/efficient-stats-calc/pdfs/toy_examples.py b/efficient-stats-calc/toy_example.py similarity index 100% rename from efficient-stats-calc/pdfs/toy_examples.py rename to efficient-stats-calc/toy_example.py From a2c883c35ffe5bd9ebdcf140ba505589c7588e46 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Mon, 22 May 2023 14:29:09 -0400 Subject: [PATCH 13/14] DIR: move to "soil_moisture" --- efficient-stats-calc/{ => soil_moisture}/real_example.py | 0 efficient-stats-calc/{ => soil_moisture}/toy_example.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename efficient-stats-calc/{ => soil_moisture}/real_example.py (100%) rename efficient-stats-calc/{ => soil_moisture}/toy_example.py (100%) diff --git a/efficient-stats-calc/real_example.py b/efficient-stats-calc/soil_moisture/real_example.py similarity index 100% rename from efficient-stats-calc/real_example.py rename to efficient-stats-calc/soil_moisture/real_example.py diff --git a/efficient-stats-calc/toy_example.py b/efficient-stats-calc/soil_moisture/toy_example.py similarity index 100% rename from efficient-stats-calc/toy_example.py rename to efficient-stats-calc/soil_moisture/toy_example.py From 0a90f515f3ef1532f2b6101714298d905b8cc378 Mon Sep 17 00:00:00 2001 From: Clay Foye Date: Tue, 23 May 2023 10:45:25 -0400 Subject: [PATCH 14/14] CODE: Changed x axis of MSE plot to linear --- 
efficient-stats-calc/soil_moisture/real_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/efficient-stats-calc/soil_moisture/real_example.py b/efficient-stats-calc/soil_moisture/real_example.py index 264839bc..efcbe194 100644 --- a/efficient-stats-calc/soil_moisture/real_example.py +++ b/efficient-stats-calc/soil_moisture/real_example.py @@ -373,7 +373,7 @@ def eff_stats_calc(n_bins, veg_value, land_use_value, DO_LOG=False, DO_PLOT=Fals # Plot the results plt.figure(figsize=(10,6)) - plt.loglog(bin_counts, mse_values, marker='o') + plt.semilogy(bin_counts, mse_values, marker='o') plt.title('Mean Squared Error vs Number of Bins') plt.xlabel('Number of Bins') plt.ylabel('Mean Squared Error')