diff --git a/lstm_function_fulldata_test.py b/lstm_function_fulldata_test.py
new file mode 100644
index 0000000..51fd71b
--- /dev/null
+++ b/lstm_function_fulldata_test.py
@@ -0,0 +1,207 @@
+"""
+This code works with total data
+
+"""
+import os
+import time
+
+# Set before TensorFlow is imported so the flag is seen when TF initialises.
+os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
+
+import numpy as np
+import xarray as xr
+import tensorflow as tf
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+from tensorflow import keras
+from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, LSTM
+
+#---------------------------------------------------------------------
+# Function to read data and process
+#---------------------------------------------------------------------
+def get_data(test_size=0.2, random_state=0):
+    """Load the SST values and build sliding-window samples for the LSTM.
+
+    Args:
+        test_size: Fraction of samples held out for validation.
+        random_state: Seed forwarded to ``train_test_split``.
+
+    Returns:
+        ``(X_train, X_val, Y_train, Y_val)``. X arrays have shape
+        ``(samples, sequence_length, 1)``; Y arrays have shape ``(samples, 1)``.
+    """
+    file_path = 'Data_noaa_copernicus/noaa_avhrr/noaa_icesmi_combinefile.nc'
+
+    # Read the values inside a context manager so the file gets closed.
+    with xr.open_dataset(file_path) as ds:
+        sst_data = ds['sst'].values
+
+    # Preprocess data, handling NaN values by avoiding them. Boolean masking
+    # keeps the non-NaN values in the same order as np.where-based indexing.
+    sst_data = sst_data[~np.isnan(sst_data)]
+
+    # Reshape the data to match the LSTM input shape (one feature column)
+    sst_data = sst_data.reshape(-1, 1)
+
+    # Normalize the data
+    scaler = MinMaxScaler()
+    sst_data_normalized = scaler.fit_transform(sst_data)
+
+    # Create sequences for time series prediction: every window of
+    # sequence_length values is paired with the value that follows it.
+    sequence_length = 10
+    windows = np.lib.stride_tricks.sliding_window_view(
+        sst_data_normalized, sequence_length, axis=0
+    )
+    # (windows, features, steps) -> (windows, steps, features); the last
+    # window has no following value, so it is dropped.
+    X_data = windows[:-1].transpose(0, 2, 1)
+    Y_data = sst_data_normalized[sequence_length:]
+
+    # Split the data into training and validation sets
+    X_train, X_val, Y_train, Y_val = train_test_split(
+        X_data, Y_data, test_size=test_size, random_state=random_state
+    )
+
+    # Return Feature and Target variables for training and validation
+    return X_train, X_val, Y_train, Y_val
+
+#---------------------------------------------------------------------
+# Function to create the default configuration for the model. This will be overridden as
+# required during experimentation REGRESSION & LINEAR
+#---------------------------------------------------------------------
+def base_model_config():
+    """Return a new dict holding the default model hyper-parameters."""
+    model_config = {
+        "HIDDEN_NODES": [8, 16, 32],
+        "HIDDEN_ACTIVATION": 'relu',
+        "OUTPUT_NODES": 1,
+        "OUTPUT_ACTIVATION": "linear",
+        "WEIGHTS_INITIALIZER": "glorot_normal",
+        "BIAS_INITIALIZER": "zeros",
+        "NORMALIZATION": "batch",
+        "OPTIMIZER": "adam",
+        "LEARNING_RATE": 0.001,
+        "REGULARIZER": None,
+        "DROPOUT_RATE": 0.1,
+        "EPOCHS": 10,
+        "BATCH_SIZE": 32,
+        "VALIDATION_SPLIT": 0.2,
+        "VERBOSE": 0,
+        "LOSS_FUNCTION": "mean_squared_error",
+        "METRICS": ["mean_squared_error", "mean_absolute_error"]
+    }
+    return model_config
+
+#---------------------------------------------------------------------
+# Function to create a model and fit the model
+#---------------------------------------------------------------------
+def create_and_run_model(model_config, X, Y, model_name):
+    """Build a stacked LSTM described by model_config and fit it on X, Y.
+
+    Args:
+        model_config: Dict of settings as returned by base_model_config().
+        X: Input array indexed as (samples, timesteps, features).
+        Y: Target array aligned with X.
+        model_name: Name given to the Keras model.
+
+    Returns:
+        The History object returned by model.fit.
+    """
+    model = keras.Sequential(name=model_name)
+
+    # Declare the input explicitly; Keras 3 warns when input_shape is passed
+    # to the first layer of a Sequential model.
+    model.add(keras.Input(shape=(X.shape[1], X.shape[2])))
+
+    for layer, nodes in enumerate(model_config["HIDDEN_NODES"]):
+        if layer == 0:
+            model.add(
+                LSTM(
+                    nodes,
+                    return_sequences=True,
+                    name="LSTM-Layer-" + str(layer),
+                    kernel_initializer=model_config["WEIGHTS_INITIALIZER"],
+                    bias_initializer=model_config["BIAS_INITIALIZER"],
+                    activation=model_config["HIDDEN_ACTIVATION"]
+                )
+            )
+        else:
+            if model_config["NORMALIZATION"] == "batch":
+                model.add(BatchNormalization())
+
+            if model_config["DROPOUT_RATE"] > 0.0:
+                model.add(Dropout(model_config["DROPOUT_RATE"]))
+
+            model.add(
+                LSTM(
+                    nodes,
+                    activation=model_config["HIDDEN_ACTIVATION"],
+                    return_sequences=True
+                )
+            )
+
+    model.add(
+        LSTM(
+            model_config["OUTPUT_NODES"],
+            name="Output-Layer",
+            activation=model_config["OUTPUT_ACTIVATION"]
+        )
+    )
+
+    # NOTE: Adam is always used; model_config["OPTIMIZER"] is not read here.
+    optimizer = keras.optimizers.Adam(learning_rate=model_config["LEARNING_RATE"])
+
+    model.compile(
+        loss=model_config["LOSS_FUNCTION"],
+        optimizer=optimizer,
+        metrics=model_config["METRICS"]
+    )
+
+    print("\n******************************************************")
+    model.summary()
+
+    history = model.fit(
+        X,
+        Y,
+        batch_size=model_config["BATCH_SIZE"],
+        epochs=model_config["EPOCHS"],
+        verbose=model_config["VERBOSE"],
+        validation_split=model_config["VALIDATION_SPLIT"]
+    )
+
+    return history
+
+# Function to plot a graph based on the results derived
+def plot_graph(history, title):
+    """Plot the training and validation loss curves stored in history."""
+    import matplotlib.pyplot as plt
+
+    plt.figure(figsize=(15, 8))
+    plt.plot(history.history['loss'], label='Training Loss', linewidth=3)
+    plt.plot(history.history['val_loss'], label='Validation Loss', linewidth=3)
+
+    plt.title(title)
+    plt.xlabel("Epochs")
+    plt.ylabel("Loss")
+    plt.legend()
+    plt.show()
+
+# Initialize the measures
+accuracy_measures = {}
+
+# The data does not depend on the batch size, so load it once instead of
+# re-reading and re-windowing the whole file for every experiment.
+X_train, X_val, Y_train, Y_val = get_data()
+
+for batch_size in range(32, 128, 16):
+    # Load default configuration
+    model_config = base_model_config()
+
+    model_config["EPOCHS"] = 10
+    # Set batch size to experiment value
+    model_config["BATCH_SIZE"] = batch_size
+    model_name = "Batch-Size-" + str(batch_size)
+    history = create_and_run_model(model_config, X_train, Y_train, model_name)
+
+    accuracy_measures[model_name] = history.history["mean_squared_error"]
diff --git a/lstm_function_halfdata_test.py b/lstm_function_halfdata_test.py
new file mode 100644
index 0000000..847e404
--- /dev/null
+++ b/lstm_function_halfdata_test.py
@@ -0,0 +1,215 @@
+"""
+This code works with 50% of the data (1993-2023, see the filter in get_data)
+For testing as total data is taking too much time
+
+"""
+import os
+import time
+
+# Set before TensorFlow is imported so the flag is seen when TF initialises.
+os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
+
+import pandas as pd
+import numpy as np
+import xarray as xr
+import tensorflow as tf
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+from tensorflow import keras
+from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, LSTM
+
+#---------------------------------------------------------------------
+# Function to read data and process
+#---------------------------------------------------------------------
+def get_data(test_size=0.2, random_state=0):
+    """Load SST values for 1993-2023 and build sliding-window samples.
+
+    Args:
+        test_size: Fraction of samples held out for validation.
+        random_state: Seed forwarded to ``train_test_split``.
+
+    Returns:
+        ``(X_train, X_val, Y_train, Y_val)``. X arrays have shape
+        ``(samples, sequence_length, 1)``; Y arrays have shape ``(samples, 1)``.
+    """
+    file_path = '../data/noaa_icesmi_combinefile.nc'
+
+    # Filter data for time range (from 1/1/1993 to 31/12/2023)
+    start_date = pd.to_datetime('1993-01-01')
+    end_date = pd.to_datetime('2023-12-31')
+
+    # Read the values inside a context manager so the file gets closed.
+    with xr.open_dataset(file_path) as ds:
+        ds_filtered = ds.sel(time=slice(start_date, end_date))
+        sst_data = ds_filtered['sst'].values
+
+    # Preprocess data, handling NaN values by avoiding them. Boolean masking
+    # keeps the non-NaN values in the same order as np.where-based indexing.
+    sst_data = sst_data[~np.isnan(sst_data)]
+
+    # Reshape the data to match the LSTM input shape (one feature column)
+    sst_data = sst_data.reshape(-1, 1)
+
+    # Normalize the data
+    scaler = MinMaxScaler()
+    sst_data_normalized = scaler.fit_transform(sst_data)
+
+    # Create sequences for time series prediction: every window of
+    # sequence_length values is paired with the value that follows it.
+    sequence_length = 10
+    windows = np.lib.stride_tricks.sliding_window_view(
+        sst_data_normalized, sequence_length, axis=0
+    )
+    # (windows, features, steps) -> (windows, steps, features); the last
+    # window has no following value, so it is dropped.
+    X_data = windows[:-1].transpose(0, 2, 1)
+    Y_data = sst_data_normalized[sequence_length:]
+
+    # Split the data into training and validation sets
+    X_train, X_val, Y_train, Y_val = train_test_split(
+        X_data, Y_data, test_size=test_size, random_state=random_state
+    )
+
+    # Return Feature and Target variables for training and validation
+    return X_train, X_val, Y_train, Y_val
+
+#---------------------------------------------------------------------
+# Function to create the default configuration for the model. This will be overridden as
+# required during experimentation REGRESSION & LINEAR
+#---------------------------------------------------------------------
+def base_model_config():
+    """Return a new dict holding the default model hyper-parameters."""
+    model_config = {
+        "HIDDEN_NODES": [8, 16, 32],
+        "HIDDEN_ACTIVATION": 'relu',
+        "OUTPUT_NODES": 1,
+        "OUTPUT_ACTIVATION": "linear",
+        "WEIGHTS_INITIALIZER": "glorot_normal",
+        "BIAS_INITIALIZER": "zeros",
+        "NORMALIZATION": "batch",
+        "OPTIMIZER": "adam",
+        "LEARNING_RATE": 0.001,
+        "REGULARIZER": None,
+        "DROPOUT_RATE": 0.1,
+        "EPOCHS": 10,
+        "BATCH_SIZE": 32,
+        "VALIDATION_SPLIT": 0.2,
+        "VERBOSE": 0,
+        "LOSS_FUNCTION": "mean_squared_error",
+        "METRICS": ["mean_squared_error", "mean_absolute_error"]
+    }
+    return model_config
+
+#---------------------------------------------------------------------
+# Function to create a model and fit the model
+#---------------------------------------------------------------------
+def create_and_run_model(model_config, X, Y, model_name):
+    """Build a stacked LSTM described by model_config and fit it on X, Y.
+
+    Args:
+        model_config: Dict of settings as returned by base_model_config().
+        X: Input array indexed as (samples, timesteps, features).
+        Y: Target array aligned with X.
+        model_name: Name given to the Keras model.
+
+    Returns:
+        The History object returned by model.fit.
+    """
+    model = keras.Sequential(name=model_name)
+
+    # Declare the input explicitly; Keras 3 warns when input_shape is passed
+    # to the first layer of a Sequential model.
+    model.add(keras.Input(shape=(X.shape[1], X.shape[2])))
+
+    for layer, nodes in enumerate(model_config["HIDDEN_NODES"]):
+        if layer == 0:
+            model.add(
+                LSTM(
+                    nodes,
+                    return_sequences=True,
+                    name="LSTM-Layer-" + str(layer),
+                    kernel_initializer=model_config["WEIGHTS_INITIALIZER"],
+                    bias_initializer=model_config["BIAS_INITIALIZER"],
+                    activation=model_config["HIDDEN_ACTIVATION"]
+                )
+            )
+        else:
+            if model_config["NORMALIZATION"] == "batch":
+                model.add(BatchNormalization())
+
+            if model_config["DROPOUT_RATE"] > 0.0:
+                model.add(Dropout(model_config["DROPOUT_RATE"]))
+
+            model.add(
+                LSTM(
+                    nodes,
+                    activation=model_config["HIDDEN_ACTIVATION"],
+                    return_sequences=True
+                )
+            )
+
+    model.add(
+        LSTM(
+            model_config["OUTPUT_NODES"],
+            name="Output-Layer",
+            activation=model_config["OUTPUT_ACTIVATION"]
+        )
+    )
+
+    # NOTE: Adam is always used; model_config["OPTIMIZER"] is not read here.
+    optimizer = keras.optimizers.Adam(learning_rate=model_config["LEARNING_RATE"])
+
+    model.compile(
+        loss=model_config["LOSS_FUNCTION"],
+        optimizer=optimizer,
+        metrics=model_config["METRICS"]
+    )
+
+    print("\n******************************************************")
+    model.summary()
+
+    history = model.fit(
+        X,
+        Y,
+        batch_size=model_config["BATCH_SIZE"],
+        epochs=model_config["EPOCHS"],
+        verbose=model_config["VERBOSE"],
+        validation_split=model_config["VALIDATION_SPLIT"]
+    )
+
+    return history
+
+# Function to plot a graph based on the results derived
+def plot_graph(history, title):
+    """Plot the training and validation loss curves stored in history."""
+    import matplotlib.pyplot as plt
+
+    plt.figure(figsize=(15, 8))
+    plt.plot(history.history['loss'], label='Training Loss', linewidth=3)
+    plt.plot(history.history['val_loss'], label='Validation Loss', linewidth=3)
+
+    plt.title(title)
+    plt.xlabel("Epochs")
+    plt.ylabel("Loss")
+    plt.legend()
+    plt.show()
+
+# Initialize the measures
+accuracy_measures = {}
+
+# Acquire and process input data once: it does not depend on the batch
+# size, so there is no need to re-read the file for every experiment.
+X_train, X_val, Y_train, Y_val = get_data()
+
+for batch_size in range(32, 128, 16):
+    # Load default configuration
+    model_config = base_model_config()
+
+    # Set the number of epochs
+    model_config["EPOCHS"] = 10
+    # Set batch size to experiment value
+    model_config["BATCH_SIZE"] = batch_size
+    model_name = "Batch-Size-" + str(batch_size)
+    history = create_and_run_model(model_config, X_train, Y_train, model_name)
+
+    accuracy_measures[model_name] = history.history["mean_squared_error"]
diff --git a/lstm_model_f.py b/lstm_model_f.py
new file mode 100644
index 0000000..3321957
--- /dev/null
+++ b/lstm_model_f.py
@@ -0,0 +1,104 @@
+import os
+
+# Set before TensorFlow is imported so the flag is seen when TF initialises.
+os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
+
+import numpy as np
+import xarray as xr
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.model_selection import train_test_split
+import tensorflow as tf
+from tensorflow.keras import optimizers, Sequential, Model
+import tensorflow.keras.layers as L
+from tensorflow.keras.callbacks import EarlyStopping
+import matplotlib.pyplot as plt
+from sklearn.metrics import mean_squared_error
+
+
+file_path = 'Data_noaa_copernicus/noaa_avhrr/noaa_icesmi_combinefile.nc'
+
+# Read everything needed inside a context manager so the file gets closed.
+with xr.open_dataset(file_path) as ds:
+    time = ds['time'].values
+    lat = ds['lat'].values
+    lon = ds['lon'].values
+    sst_data = ds['sst'].values
+
+# Preprocess data, handling NaN values by avoiding them. Boolean masking
+# keeps the non-NaN values in the same order as np.where-based indexing.
+sst_data = sst_data[~np.isnan(sst_data)]
+
+# Reshape the data to match the LSTM input shape (one feature column)
+sst_data = sst_data.reshape(-1, 1)
+
+# Normalize the data
+scaler = MinMaxScaler()
+sst_data_normalized = scaler.fit_transform(sst_data)
+
+# Create sequences for time series prediction: every window of
+# sequence_length values is paired with the value that follows it.
+sequence_length = 10
+windows = np.lib.stride_tricks.sliding_window_view(
+    sst_data_normalized, sequence_length, axis=0
+)
+# (windows, features, steps) -> (windows, steps, features); the last
+# window has no following value, so it is dropped.
+X = windows[:-1].transpose(0, 2, 1)
+Y = sst_data_normalized[sequence_length:]
+
+X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
+
+# Build the LSTM model
+lstm_model = Sequential()
+
+lstm_model.add(L.InputLayer(shape=(X_train.shape[1], X_train.shape[2])))
+lstm_model.add(L.LSTM(10, return_sequences=True))
+lstm_model.add(L.LSTM(6, activation='relu',return_sequences=True))
+lstm_model.add(L.LSTM(1, activation='relu'))
+lstm_model.add(L.Dense(10, activation='relu'))
+lstm_model.add(L.Dense(10, activation='relu'))
+lstm_model.add(L.Dense(1, activation='linear'))
+lstm_model.summary()
+
+adam = optimizers.Adam(learning_rate=0.001)
+lstm_model.compile(loss='mse', optimizer=adam)
+
+early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
+
+lstm_history = lstm_model.fit(X_train, Y_train,
+                              epochs=20, batch_size=32, validation_split=0.2,
+                              verbose=2, callbacks=[early_stopping])
+
+plt.figure(figsize=(12, 6))
+plt.plot(lstm_history.history['loss'], label='Training Loss')
+plt.plot(lstm_history.history['val_loss'], label='Validation Loss')
+plt.title('Training and Validation Loss')
+plt.xlabel('Epoch')
+plt.ylabel('Loss')
+plt.legend()
+plt.show()
+
+y_test_pred = lstm_model.predict(X_test)
+mse = mean_squared_error(Y_test, y_test_pred)
+print(f'Test MSE: {mse}')
+
+# Inverse transform the predictions to the original scale. The test
+# predictions from above are reused instead of calling predict() again.
+predictions = scaler.inverse_transform(y_test_pred)
+
+"""
+# Plot true values and predicted values
+plt.figure(figsize=(16, 8))
+
+# Plot true values
+plt.plot(time[sequence_length:], sst_data[sequence_length:], label='True Values', color='blue')
+
+# Plot predicted values
+plt.plot(time[-len(predictions):], predictions, label='Predicted Values', color='red')
+
+plt.title('True vs Predicted Sea Surface Temperature')
+plt.xlabel('Time')
+plt.ylabel('Sea Surface Temperature')
+plt.legend()
+plt.show()
+"""
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..60dab8d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,79 @@
+absl-py==2.1.0
+asttokens==2.4.1
+astunparse==1.6.3
+certifi==2024.2.2
+cftime==1.6.3
+charset-normalizer==3.3.2
+comm==0.2.2
+contourpy==1.2.0
+cycler==0.12.1
+debugpy==1.8.1
+decorator==5.1.1
+dm-tree==0.1.8
+executing==2.0.1
+flatbuffers==24.3.7
+fonttools==4.49.0
+gast==0.5.4
+google-pasta==0.2.0
+grpcio==1.62.1
+h5netcdf==1.3.0
+h5py==3.10.0
+idna==3.6
+ipykernel==6.29.3
+ipython==8.22.2
+jedi==0.19.1
+joblib==1.3.2
+jupyter_client==8.6.1
+jupyter_core==5.7.2
+keras==3.0.5
+kiwisolver==1.4.5
+libclang==16.0.6
+Markdown==3.5.2
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.8.3
+matplotlib-inline==0.1.6
+mdurl==0.1.2
+ml-dtypes==0.3.2
+namex==0.0.7
+nest-asyncio==1.6.0
+netCDF4==1.6.5
+numpy==1.26.4
+opt-einsum==3.3.0
+packaging==24.0
+pandas==2.2.1
+parso==0.8.3
+pexpect==4.9.0
+pillow==10.2.0
+platformdirs==4.2.0
+prompt-toolkit==3.0.43
+protobuf==4.25.3
+psutil==5.9.8
+ptyprocess==0.7.0
+pure-eval==0.2.2
+Pygments==2.17.2
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+pytz==2024.1
+pyzmq==25.1.2
+requests==2.31.0
+rich==13.7.1
+scikit-learn==1.4.1.post1
+scipy==1.12.0
+six==1.16.0
+stack-data==0.6.3
+tensorboard==2.16.2
+tensorboard-data-server==0.7.2
+tensorflow==2.16.1
+tensorflow-io-gcs-filesystem==0.36.0
+termcolor==2.4.0
+threadpoolctl==3.3.0
+tornado==6.4
+traitlets==5.14.2
+typing_extensions==4.10.0
+tzdata==2024.1
+urllib3==2.2.1
+wcwidth==0.2.13
+Werkzeug==3.0.1
+wrapt==1.16.0
+xarray==2024.2.0