diff --git a/.gitignore b/.gitignore index d9d6232d..2ab57bd8 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,6 @@ doc/source/example-links.inc doc/source/changelog.md .pytest_cache/ build/ + +# machine learning +.h5 \ No newline at end of file diff --git a/efficient-stats-calc/soil_moisture/real_example.py b/efficient-stats-calc/soil_moisture/real_example.py new file mode 100644 index 00000000..efcbe194 --- /dev/null +++ b/efficient-stats-calc/soil_moisture/real_example.py @@ -0,0 +1,384 @@ +import podpac +import numpy as np +import matplotlib.pyplot as plt +from scipy.ndimage import gaussian_filter +import random +from scipy.stats import gaussian_kde +import xarray as xr +from tqdm import tqdm +from matplotlib.colors import LogNorm + + +import soilmap.datalib.geowatch as geowatch + + +def get_coordinates(): + # Make a set of fine and coarse coordinates + # Center coordinates about Creare + center_lat, center_lon = 43.682102, -72.233455 + # get deltas using 110 km / deg and a 30 x 30 km box + f_box_lat_lon = 30 / 2 / 110 + n_fine = int(30 / (30 / 1000)) # Number of 30 m boxes in a 30km square (hint, it's 1000) + n_coarse = 3 + f_coords = podpac.Coordinates([ + podpac.clinspace(center_lat + f_box_lat_lon, center_lat - f_box_lat_lon, n_fine), + podpac.clinspace(center_lon - f_box_lat_lon, center_lon + f_box_lat_lon, n_fine), + ["2022-06-01T12"]], ["lat", "lon", "time"] + ) + c_coords = f_coords[::333, 100::400] + + return c_coords, f_coords + +def fill_time_dimensions(c_coords, f_coords): + # make the coordinates for the brute-force technique + e1 = podpac.algorithm.ExpandCoordinates(time=["0,D", "30,D", '1,D']) + day_time_coords = podpac.Coordinates([e1.get_modified_coordinates1d(f_coords, 'time')]) + e2 = podpac.algorithm.ExpandCoordinates(source=e1, time=["-4,Y", "0,Y", '1,Y']) + all_time_coords = podpac.Coordinates([e2.get_modified_coordinates1d(day_time_coords, 'time')]) + + all_f_coords = podpac.coordinates.merge_dims([f_coords.drop('time'), all_time_coords]) + all_c_coords = podpac.coordinates.merge_dims([c_coords.drop('time'), all_time_coords]) + + + return all_c_coords, all_f_coords + +def get_veg_node(all_f_coords): + # Also, use the average of the vegetation, for initial testing + sm = geowatch.SoilMoisture() + o_veg = [] + vegetation = sm.vegetation + one_month_coords = all_f_coords[:, :, :32*5:5] + shape = one_month_coords[:, :, :32*5:5].shape + shape = (shape[0], shape[1], 1) # set maximum time shape to 1 + for coords in one_month_coords.iterchunks(shape): + o_veg.append(vegetation.eval(coords)) + o_veg = xr.concat(o_veg, 'time') + print(o_veg.mean('time').data) + veg_node = podpac.data.Array(source=o_veg.mean('time').data, coordinates=all_f_coords.drop('time')) + + return veg_node + +def get_constant_veg_node(all_f_coords, constant_veg_value): + # Get the spatial coordinates from all_f_coords + spatial_coords = all_f_coords.drop('time') + + # Create an array with the same spatial shape as all_f_coords but filled with the constant vegetation value + constant_veg_data = np.full(spatial_coords.shape, constant_veg_value) + + # Create a PODPAC Array node with the constant vegetation data and the spatial coordinates + constant_veg_node = podpac.data.Array(source=constant_veg_data, coordinates=spatial_coords) + + return constant_veg_node + +def get_constant_land_use(all_f_coords, constant_land_value): + # Get the spatial coordinates from all_f_coords + spatial_coords = all_f_coords.drop('time') + + # Create an array with the same spatial shape as all_f_coords but filled with the constant vegetation value + constant_veg_data = np.full(spatial_coords.shape, constant_land_value) + + # Create a PODPAC Array node with the constant vegetation data and the spatial coordinates + constant_land_node = podpac.data.Array(source=constant_veg_data, coordinates=spatial_coords) + + return constant_land_node + +def get_calibrated_sm(all_c_coords, veg_node, land_use_node): + # Quick test to make sure everything makes sense + sm = geowatch.SoilMoisture(vegetation=veg_node, vegetation_mean=veg_node, land_use=land_use_node) + sm_ca = sm.solmst_0_10 + sm_cr = sm.relsolmst_0_10 # you can compute this from the abs soil moisture above... but I'm lazy so I'll just get this data for now + + all_ca_data = [] #coarse absolute + all_cr_data = [] #coarse relative + shape = all_c_coords.shape + shape = (shape[0], shape[1], 1) + for coords in all_c_coords.iterchunks(shape): + all_ca_data.append(sm_ca.eval(coords)) + all_cr_data.append(sm_cr.eval(coords)) + + # Merge the data in all_ca_data along the 'time' dimension + all_ca_data = xr.concat(all_ca_data, 'time') + + # Merge the data in all_cr_data along the 'time' dimension + all_cr_data = xr.concat(all_cr_data, 'time') + + # All Coarse Absolute Data + all_ca_arr = podpac.data.Array( + source=all_ca_data, + coordinates=all_c_coords, + interpolation="bilinear" + ) + # All Coarse Relative Data + all_cr_arr = podpac.data.Array( + source=all_cr_data, + coordinates=all_c_coords, + interpolation="bilinear" + ) + + # Create a new SoilMoisture node that adjusts the soil moisture data based on vegetation data + return all_ca_data, all_cr_data, geowatch.SoilMoisture(solmst_0_10=all_ca_arr, relsolmst_0_10=all_cr_arr, vegetation=veg_node, vegetation_mean=veg_node, land_use=land_use_node) + +def get_stats_data(all_c_coords, all_f_coords, all_ca_edges, all_cr_edges, constant_veg_value, constant_land_value, n_bins): + # Get the centers of the bins + all_ca_centers = (all_ca_edges[..., 1:] + all_ca_edges[..., :-1]) * 0.5 + all_cr_centers = (all_cr_edges[..., 1:] + all_cr_edges[..., :-1]) * 0.5 + + # Get stats coords + stats_coords_f = all_f_coords[:, :, :n_bins * 2:2] + stats_coords = all_c_coords[:, :, :n_bins * 2:2] + + # Create abs soil moisture coords at the centers of the bins + abs_ws_stats = podpac.data.Array( + source=all_ca_centers, # Here's the weatherscale data at the centers of the bins + coordinates=stats_coords, # Mock the coordinates + interpolation="bilinear" + ) + # Create relative soil moisture coords at the centers of the bins + rel_ws_stats = podpac.data.Array( + source=all_cr_centers, + coordinates=stats_coords, # Mock the coordinates + interpolation="bilinear" + ) + # Get the spatial coordinates from stats_coords + spatial_coords = stats_coords.drop('time') + + constant_veg_data = np.full(spatial_coords.shape, constant_veg_value) + veg_node = podpac.data.Array(source=constant_veg_data, coordinates=spatial_coords) + + constant_land_data = np.full(spatial_coords.shape, constant_land_value) + land_use_node = podpac.data.Array(source=constant_land_data, coordinates=spatial_coords) + + # Make the g(x) function + sm_stats = geowatch.SoilMoisture(solmst_0_10=abs_ws_stats, relsolmst_0_10=rel_ws_stats, vegetation=veg_node, vegetation_mean=veg_node, land_use=land_use_node) + + # Evaluate g(x) + stats_f_data = [] + shape = stats_coords_f.shape + shape = (shape[0], shape[1], 1) # set maximum time shape to 1 + for coords in stats_coords_f.iterchunks(shape): + stats_f_data.append(sm_stats.eval(coords)) # should be fast -- all local cached data (Except vegetation for some reason? ) + stats_f_data = xr.concat(stats_f_data, 'time') + + return stats_f_data + +def get_fine_pdfs_edges(all_ca_pdfs, all_ca_edges, n_bins, all_c_coords, all_f_coords): + ca_edges = podpac.data.Array( + source=all_ca_edges, + coordinates=all_c_coords[:, :, :n_bins + 1], # Again, the time coordinate here is fake -- I just want to interpolate space + interpolation='bilinear', ) + ca_edges_f = ca_edges.eval(all_f_coords[:, :, :n_bins + 1]) + # pdfs + ca_pdfs = podpac.data.Array( + source=all_ca_pdfs, + coordinates=all_c_coords[:, :, :n_bins], # Again, the time coordinate here is fake -- I just want to interpolate space + interpolation='bilinear', + ) + ca_pdfs_f = ca_pdfs.eval(all_f_coords[:, :, :n_bins]) + + return ca_edges_f, ca_pdfs_f + +def get_fine_data(all_f_coords, sm_centered): + # get the data for the cheap approach + all_f_data = [] + shape = all_f_coords.shape + shape = (shape[0], shape[1], 1) + for coords in all_f_coords.iterchunks(shape): + all_f_data.append(sm_centered.eval(coords)) + all_f_data = xr.concat(all_f_data, 'time') + + return all_f_data + + +def create_pdfs(n_bins, all_ca_data, all_cr_data): + # Create new arrays with a third dimension of size n_bins. + new_shape = all_ca_data.shape[:2] + (n_bins,) + new_shape_edges = all_ca_data.shape[:2] + (n_bins + 1,) + all_ca_pdfs = np.zeros((new_shape)) + all_ca_edges = np.zeros((new_shape_edges)) + all_cr_pdfs = np.zeros((new_shape)) + all_cr_edges = np.zeros((new_shape_edges)) + + # Loop over each element in the 2D array, computing the histograms for the corresponding 1D data. + for i in range(new_shape[0]): + for j in range(new_shape[1]): + # Compute the histogram for the i,j-th element of all_ca_data. + tmp = all_ca_data.data[i, j].ravel() + tmp = tmp[np.isfinite(tmp)] + all_ca_pdfs[i, j, :], all_ca_edges[i, j, :] = np.histogram(tmp, density=True, bins=n_bins) + + # Compute the histogram for the i,j-th element of all_cr_data. + tmp = all_cr_data.data[i, j].ravel() + tmp = tmp[np.isfinite(tmp)] + all_cr_pdfs[i, j, :], all_cr_edges[i, j, :] = np.histogram(tmp, density=True, bins=n_bins) + + return all_ca_pdfs, all_ca_edges, all_cr_pdfs, all_cr_edges + +def validate_pdfs(all_ca_pdfs, all_ca_edges): + # Check on one of these + assert np.abs(1 - (all_ca_pdfs[1, 1] * (all_ca_edges[1, 1][1:] - all_ca_edges[1, 1][:-1])).sum()) < 1e-14 + plt.stairs(all_ca_pdfs[1, 1], all_ca_edges[1, 1], fill=True) + plt.stairs(all_ca_pdfs[1, 1], all_ca_edges[1, 1], color='k', fill=False) + plt.show() + +def compute_mse(cheap_mean, truth_mean): + squared_diffs = np.nan_to_num((cheap_mean - truth_mean)**2) + valid_mask = np.isfinite(cheap_mean) & np.isfinite(truth_mean) # mask where neither array is NaN + + # Compute the mean of the squared differences, ignoring any NaN values + mse = np.nanmean(squared_diffs[valid_mask]) + return mse + +def plot_difference(arr1, arr2, n_bins, title=""): + # Plot results + plt.figure() + kwargs = dict() + ax1 = plt.subplot(131) + plt.imshow(arr1, **kwargs) + plt.title("Truth "+title) + plt.colorbar() + plt.subplot(132, sharex=ax1, sharey=ax1) + plt.imshow(arr2, **kwargs) + plt.title(f"Cheap,Approx, nbins={n_bins}, " + title) + plt.colorbar() + plt.subplot(133, sharex=ax1, sharey=ax1) + plt.title("Cheap - Truth "+ title) + plt.imshow(arr2 - arr1, cmap='BrBG') + plt.colorbar() + plt.show() + +def eff_stats_calc(n_bins, veg_value, land_use_value, DO_LOG=False, DO_PLOT=False): + # Get lat and lon coordinates for coarse data with empty time + c_coords, f_coords = get_coordinates() + + # Fill in time dimension + all_c_coords, all_f_coords = fill_time_dimensions(c_coords, f_coords) + + if DO_LOG: + print("Got Coords...") + + # Get a vegetation node for constant vegetation: + # veg_node = get_veg_node(all_f_coords) + veg_node = get_constant_veg_node(all_f_coords, veg_value) + land_use_node = get_constant_land_use(all_f_coords, land_use_value) + + # Get a soil moisture node with centered on coarse coords + all_ca_data, all_cr_data, sm_centered = get_calibrated_sm(all_c_coords, veg_node, land_use_node) + + if DO_LOG: + print("Calibrated SM...") + + # Get Fine SM Data: + all_f_data = get_fine_data(all_f_coords, sm_centered) + + if DO_LOG: + print("Got Fine Data...") + + # Create a PDF from the data + all_ca_pdfs, all_ca_edges, all_cr_pdfs, all_cr_edges = create_pdfs(n_bins, all_ca_data, all_cr_data) + #TODO: Figure out how we can use coarse relative pdfs? + # validate_pdfs(all_ca_pdfs, all_ca_edges) + + if DO_LOG: + print("Created PDFs...") + + # Create a SM node with the centers + stats_f_data = get_stats_data(all_c_coords, all_f_coords, all_ca_edges, all_cr_edges, veg_value, land_use_value ,n_bins) + + if DO_LOG: + print("Got Stats Data...") + + # Get the fine pdfs by interpolating coarse-scale data edges + # TODO: figure out ohow to use coarse relative pdfs? + ca_edges_f, ca_pdfs_f = get_fine_pdfs_edges(all_ca_pdfs, all_ca_edges, n_bins, all_c_coords, all_f_coords) + + if DO_LOG: + print("Got Fine PDFs...") + + # Calculate cheap mean and variance + cheap_mean = (stats_f_data.data * ca_pdfs_f.data * (ca_edges_f[..., 1:].data - ca_edges_f[..., :-1].data)).sum(axis=-1) + deviations = stats_f_data.data - cheap_mean[..., np.newaxis] + cheap_var = (deviations**2 * ca_pdfs_f.data * (ca_edges_f[..., 1:].data - ca_edges_f[..., :-1].data)).sum(axis=-1) + + + if DO_LOG: + print("Calculated Cheap Mean...") + + # Calculate correct mean + truth_mean = all_f_data.mean('time').data + truth_var = all_f_data.var('time').data + + if DO_LOG: + print("Calculated Truth Mean...") + + if DO_PLOT: + plot_difference(truth_mean, cheap_mean, n_bins, title="Mean") + plot_difference((truth_var), (cheap_var), n_bins, title="Variance") + + + cheap_std_dev = np.sqrt(cheap_var) + truth_std_dev = np.sqrt(truth_var) + + if DO_LOG: + print("Calculated StdDev...") + + # Compute the squared differences, ignoring any NaN values + mse = compute_mse(cheap_mean, truth_mean) + + return cheap_mean, truth_mean, mse + + +if __name__ == '__main__': + """ + 1. Generate PDFs out of coarse data + 2. Calculate centers for that data + 3. Create SoilMoisutre node using those centers + 4. Evaluate SM node at those centers + 5. Multiply eval by pdf and bin widths + """ + with podpac.settings: + # Constants + VEG_VALUE = 50 + LAND_USE_VALUE = 0 + DO_LOG = True + DO_PLOT = False + + # Cache + podpac.settings["DEFAULT_CACHE"] = ["ram", "disk"] + podpac.settings["MULTITHREADING"] = False + # podpac.settings["CACHE_OUTPUT_DEFAULT"] = False + # podpac.utils.clear_cache(mode='ram') + # podpac.utils.clear_cache(mode='disk') + + # Initialize lists to store the bin counts and the corresponding MSE values + bin_counts = [] + mse_values = [] + + # Initialize the progress bar + pbar = tqdm(total=7, desc='Processing bins', ncols=155) + + # Run eff_stats_calc for 1, 2, 4, 8, 16, 32, 64 bins + # [1, 2, 4, 8, 16, 32, 64]: + for n_bins in [1, 2, 4, 8, 16, 32, 64]: + if n_bins == 64: + _, _, mse = eff_stats_calc(n_bins, VEG_VALUE, LAND_USE_VALUE, DO_LOG, True) + else: + _, _, mse = eff_stats_calc(n_bins, VEG_VALUE, LAND_USE_VALUE, DO_LOG, DO_PLOT) + bin_counts.append(n_bins) + mse_values.append(mse) + + #Update the progress bar + pbar.update(1) + + pbar.close() + + # Plot the results + plt.figure(figsize=(10,6)) + plt.semilogy(bin_counts, mse_values, marker='o') + plt.title('Mean Squared Error vs Number of Bins') + plt.xlabel('Number of Bins') + plt.ylabel('Mean Squared Error') + plt.grid(True) + plt.show() + + + diff --git a/efficient-stats-calc/soil_moisture/toy_example.py b/efficient-stats-calc/soil_moisture/toy_example.py new file mode 100644 index 00000000..0f102442 --- /dev/null +++ b/efficient-stats-calc/soil_moisture/toy_example.py @@ -0,0 +1,225 @@ +import podpac +import numpy as np +import matplotlib.pyplot as plt +from scipy.ndimage import gaussian_filter +import random +from scipy.stats import gaussian_kde +import xarray as xr +''' +Toy Data Creation +''' + +def generate_sm_data(N, random_seed=None, scale=1, sigma=3): + if random_seed is not None: + np.random.seed(random_seed) + + elevation_data = np.random.rand(N, N) * scale + elevation_data = gaussian_filter(elevation_data, sigma=sigma) + + return elevation_data + +def year_long_sm(mean_sm_data, monthly_coefficients, coefficient_std_dev=0.05): + year_long_sm_data = [] + + for month_coefficient in monthly_coefficients: + month_coefficient_array = np.random.normal(loc=month_coefficient, scale=coefficient_std_dev, size=mean_sm_data.shape) + month_sm_data = mean_sm_data * month_coefficient_array + year_long_sm_data.append(month_sm_data) + + return year_long_sm_data + +def decade_long_sm(mean_sm_data, monthly_coefficients, years=10, coefficient_std_dev=0.05): + decade_long_sm_data = [] + + for _ in range(years): + year_long_data = year_long_sm(mean_sm_data, monthly_coefficients, coefficient_std_dev) + decade_long_sm_data.extend(year_long_data) + + return decade_long_sm_data + + +def create_coordinates(N, YEARS): + # create coordinates for data + + fine_lat = np.linspace(-10, 10, N) # Fine Data + fine_lon = np.linspace(-10, 10, N) # Coarse Data + + + + time = np.linspace(0, (12*YEARS)-1, 12*YEARS) + coords = podpac.Coordinates([time, fine_lat, fine_lon], ['time','lat', 'lon']) # Fine coords + + return coords + + +""" +Data Modification/Utilities +""" + +def g(data): + return 1 / (1 + np.exp(-data)) + +def create_pdf(all_data, N_bins): + n_rows, n_cols = all_data.shape[1:3] + + # Initialize the PDFs, edges, and centers arrays + pdfs_shape = (N_bins, n_rows, n_cols) + edges_shape = (N_bins + 1, n_rows, n_cols) + all_pdfs = np.zeros(pdfs_shape) + all_edges = np.zeros(edges_shape) + + # Calculate the common edges using the flattened data + flat_data = all_data.ravel() + flat_data = flat_data[np.isfinite(flat_data)] + common_edges = np.histogram_bin_edges(flat_data, bins=N_bins) + + # Calculate the histograms and centers for each element of the 2D grid + for i in range(n_rows): + for j in range(n_cols): + tmp = all_data[:, i, j].ravel() + tmp = tmp[np.isfinite(tmp)] + all_pdfs[:, i, j], _ = np.histogram(tmp, density=True, bins=common_edges) + all_edges[:, i, j] = common_edges + + # Calculate centers + all_centers = (all_edges[:-1] + all_edges[1:]) / 2 + + return all_pdfs, all_edges, all_centers + +def create_stats_f_data(stats_coords_f, g_x): + stats_f_data = [] + shape = stats_coords_f.shape + shape = (1, shape[1], shape[2]) # set maximum time shape to 1 + for coords in stats_coords_f.iterchunks(shape): + stats_f_data.append(g_x.eval(coords)) # should be fast -- all local cached data (Except vegetation for some reason?) + stats_f_data = xr.concat(stats_f_data, 'time') + + return stats_f_data + + + +""" +Data Visualization +""" +def plot_year_long_sm(year_long_data): + n_months = len(year_long_data) + n_cols = 3 + n_rows = (n_months + n_cols - 1) // n_cols + + fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 4)) + axes = axes.flatten() + + for month, sm_data in enumerate(year_long_data, start=1): + ax = axes[month - 1] + img = ax.imshow(sm_data, cmap='viridis', origin='lower') + ax.set_title(f'Month {month}') + ax.axis('off') + + # Remove unused subplots + for ax in axes[n_months:]: + ax.remove() + + # Add a colorbar + fig.subplots_adjust(right=0.8) + cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7]) + fig.colorbar(img, cax=cbar_ax, label='Soil Moisture') + + plt.show() + +def create_histogram(m_sm_matrices, N_bins): + # Calculate the average soil moisture value for each matrix + avg_soil_moisture = [np.mean(matrix) for matrix in m_sm_matrices] + + # Create the histogram + plt.hist(avg_soil_moisture, bins=N_bins) + plt.xlabel('Average Soil Moisture') + plt.ylabel('Frequency') + plt.title('Histogram of Soil Moisture Matrices') + plt.show() + + +def plot_pdf_for_single_grid_square(all_pdfs, all_edges, row, col): + # Select the PDF and edges for the specific grid square + pdf = all_pdfs[:, row, col] + edges = all_edges[:, row, col] + + # Calculate the centers of the bins + centers = (edges[:-1] + edges[1:]) / 2 + + # Plot the PDF + plt.plot(centers, pdf) + plt.xlabel("Value") + plt.ylabel("Probability Density") + plt.title(f"PDF for Grid Square ({row}, {col})") + plt.show() + +""" +Main +""" + +def approx_sm(N, YEARS, N_BINS, DO_PLOT, monthly_coefficients): + + + """ + Data Gen + """ + + # Data Gen + average_sm = generate_sm_data(N, random_seed=0) + decade_long_data = decade_long_sm(average_sm, monthly_coefficients, YEARS) + + + if DO_PLOT: + year_long_data = year_long_sm(average_sm, monthly_coefficients) + plot_year_long_sm(year_long_data) + + print("Data Generated") + + """ + Coordinate Gen + """ + + coords = create_coordinates(N, YEARS) + print(coords) + + + """ + Create PDF's for each grid square + """ + + data = np.array(decade_long_data) + all_pdfs, all_edges, all_centers = create_pdf(data, N_BINS) + + """ + Calculate Means + """ + true_mean = g(data).mean(0) + cheap_mean = (g(all_centers) * all_pdfs * (np.diff(all_edges, axis=0))).sum(axis=0) + print("Mean Absolute Error", np.abs(true_mean - cheap_mean).mean()) + + + if DO_PLOT or True: + plt.figure() + ax1 = plt.subplot(131) + plt.imshow(true_mean) + plt.title("Truth") + plt.colorbar() + plt.subplot(132, sharex=ax1, sharey=ax1) + plt.imshow(cheap_mean,) + plt.title(f"Cheap,Approx, nbins={N_BINS}") + plt.colorbar() + plt.subplot(133, sharex=ax1, sharey=ax1) + plt.title("Cheap - Truth") + plt.imshow(cheap_mean - true_mean, cmap='BrBG')#, vmin=-0.02, vmax=0.02) + plt.colorbar() + plt.show() + + +if __name__ == '__main__': + # Constants + N = 50 # size of the fine coords + YEARS = 10 # Total number of years to create years for + N_BINS = 32 # Number of bins for the histogram + DO_PLOT = False # plots or not + monthly_coefficients = [1.2, 1.1, 1.0, 0.8, 0.6, 0.5, 0.4, 0.6, 0.8, 1.0, 1.1, 1.2] + approx_sm(N, YEARS, N_BINS, DO_PLOT, monthly_coefficients) \ No newline at end of file