shubhvjain · AnneHartebrodt · Dec 20, 2023 · Dec 21, 2023
diff --git a/Dockerfile b/Dockerfile
@@ -1,7 +1,7 @@
 # Dockerfile
 
 # Use an official Python runtime as a base image
-FROM python:3.8
+FROM python:3.11
 
 # Set the working directory in the container
 WORKDIR /app

diff --git a/models/BE_v1.h5 b/models/BE_v1.h5
diff --git a/models/BG_v1.h5 b/models/BG_v1.h5
diff --git a/models/CH_StandardScaler_model_96.h5 b/models/CH_StandardScaler_model_96.h5
diff --git a/models/CZ_MinMaxScaler_model_24.h5 b/models/CZ_MinMaxScaler_model_24.h5
diff --git a/models/DE_StandardScaler_model_24.h5 b/models/DE_StandardScaler_model_24.h5
diff --git a/models/EE_v1.h5 b/models/EE_v1.h5
diff --git a/models/ES_v1.h5 b/models/ES_v1.h5
diff --git a/models/FR_StandardScaler_model_48.h5 b/models/FR_StandardScaler_model_48.h5
diff --git a/models/NL_StandardScaler_model_96.h5 b/models/NL_StandardScaler_model_96.h5
diff --git a/models/PL_MinMaxScaler_model_120.h5 b/models/PL_MinMaxScaler_model_120.h5
diff --git a/models/SE_v1.h5 b/models/SE_v1.h5
diff --git a/models/metadata.json b/models/metadata.json
@@ -1,35 +1 @@
-{
-  "models":[
-    {
-      "name":"BE_v1.h5",
-      "country":"BE",
-      "input_sequence":60,
-      "description":"Trained on historical data (2020-01-01 - 2023-05-01)"
-    },
-    {
-      "name":"BG_v1.h5",
-      "country":"BG",
-      "input_sequence":24,
-      "description":"Trained on historical data (2020-01-01 - 2023-05-01)"
-    },
-    {
-      "name":"EE_v1.h5",
-      "country":"EE",
-      "input_sequence":60,
-      "description":"Trained on historical data (2020-01-01 - 2023-05-01)"
-    },
-    {
-      "name":"ES_v1.h5",
-      "country":"ES",
-      "input_sequence":24,
-      "description":"Trained on historical data (2020-01-01 - 2023-05-01)"
-    },
-    {
-      "name":"SE_v1.h5",
-      "country":"SE",
-      "input_sequence":24,
-      "description":"Trained on historical data (2020-01-01 - 2023-05-01)"
-    }
-
-  ]
-}
+{"models": [{"name": "DE_StandardScaler_model_24.h5", "version": 1, "country": "DE", "input_sequence": 24, "scaler": {"name": "StandardScaler", "scale": [15.767449748908355], "mean": [39.62043795620438]}, "description": "Trained on historical data (2020-01-01 - 2023-05-01)"}, {"name": "FR_StandardScaler_model_48.h5", "version": 1, "country": "FR", "input_sequence": 48, "scaler": {"name": "StandardScaler", "scale": [6.415490823485095], "mean": [23.130246350364963]}, "description": "Trained on historical data (2020-01-01 - 2023-05-01)"}, {"name": "PL_MinMaxScaler_model_120.h5", "version": 1, "country": "PL", "input_sequence": 120, "scaler": {"name": "MinMaxScaler", "data_min": [1.0], "data_max": [57.0], "scale": [0.017857142857142856], "min": [-0.017857142857142856]}, "description": "Trained on historical data (2020-01-01 - 2023-05-01)"}, {"name": "CH_StandardScaler_model_96.h5", "version": 1, "country": "CH", "input_sequence": 96, "scaler": {"name": "StandardScaler", "scale": [19.803016600075686], "mean": [45.33485401459854]}, "description": "Trained on historical data (2020-01-01 - 2023-05-01)"}, {"name": "NL_StandardScaler_model_96.h5", "version": 1, "country": "NL", "input_sequence": 96, "scaler": {"name": "StandardScaler", "scale": [10.839645485125265], "mean": [16.836070559610704]}, "description": "Trained on historical data (2020-01-01 - 2023-05-01)"}, {"name": "CZ_MinMaxScaler_model_24.h5", "version": 1, "country": "CZ", "input_sequence": 24, "scaler": {"name": "MinMaxScaler", "data_min": [4.0], "data_max": [34.0], "scale": [0.03333333333333333], "min": [-0.13333333333333333]}, "description": "Trained on historical data (2020-01-01 - 2023-05-01)"}]}
diff --git a/predictionModel.py b/predictionModel.py
@@ -14,12 +14,15 @@
 import numpy as np
 from sklearn.preprocessing import StandardScaler
 from tensorflow.keras.models import load_model
+import re
 
 import entsoeAPI as en
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
 
 def get_model_metadata(model):
     """Returns metadata for the selected model from the metadata.json file in the model folder"""
     with open("./models/metadata.json", "r") as file:
+
         data = json.load(file)
         obj = [o for o in data["models"] if o["name"] == model]
         if len(obj) == 1:
@@ -28,19 +31,30 @@ def get_model_metadata(model):
             raise Exception("Invalid model name")
 
 
+# def get_available_country_list():
+#     """Returns a list of country codes for which prediction models are available.
+#     All models are stored in the 'model' folder. There can be multiple models for one country.
+#     This method returns the unique names of all countries for which models exist.
+#     """
+#     country_names = set()
+#     folder_path = "./models"
+#     for filename in os.listdir(folder_path):
+#         if os.path.isfile(os.path.join(folder_path, filename)) and filename.endswith(".h5"):
+#             country_name = filename.split('_')[0]
+#             country_names.add(country_name)
+#     return list(country_names)
+
 def get_available_country_list():
     """Returns a list of country codes for which prediction models are available.
     All models are stored in the 'model' folder. There can be multiple models for one country.
     This method returns the unique names of all countries for which models exist.
     """
     country_names = set()
-    folder_path = "./models"
-    for filename in os.listdir(folder_path):
-        if os.path.isfile(os.path.join(folder_path, filename)) and filename.endswith(".h5"):
-            country_name = filename.split('_')[0]
-            country_names.add(country_name)
-    return list(country_names)
-
+    with open("./models/metadata.json", "r") as file:
+        data = json.load(file)
+    # Collect into the set: metadata.json may list several models per country
+    country_names.update(o["country"] for o in data['models'])
+    return sorted(country_names)
 
 def get_latest_model_name_for(country):
     """Returns the latest prediction model version number for a country.
@@ -57,8 +71,25 @@ def get_latest_model_name_for(country):
                 highestNumberFile = fileName
     return highestNumberFile
 
+def get_latest_model_name_for(country):
+    """Finds the newest prediction model registered for a country.
+
+    Scans the "models" list in ./models/metadata.json and returns a tuple
+    (file name, version) for the entry of `country` with the highest
+    "version" value, or (None, -1) when the country has no entry.
+    """
+    with open("./models/metadata.json", "r") as file:
+        entries = json.load(file)['models']
+    best_name, best_version = None, -1
+    for entry in entries:
+        # Strict '>' means the first entry wins when versions are equal
+        if entry['country'] == country and entry['version'] > best_version:
+            best_name, best_version = entry['name'], entry['version']
+    return best_name, best_version
+
+
 
-def get_date_range():
+def get_date_range(input_sequence):
     """Returns a dictionary comprising two keys: 'start' and 'end'. 
     These values are used as the start and end dates to retrieve actual generation data from the ENTSOE API.
     The 'start' date is established as 3 days before the current date, ensuring a comprehensive historical range. 
@@ -70,7 +101,8 @@ def get_date_range():
     For instance, if the current time is 14:34, the end date will be 13:00 of the current day, encompassing data up to the preceding hour.
     """
     today_utc = datetime.now()
-    start_date = (today_utc - timedelta(days=3)).replace(hour=0,
+    days = input_sequence/24
+    start_date = (today_utc - timedelta(days=days)).replace(hour=0,
                                                          minute=0, second=0, microsecond=0)
     end_date = (today_utc - timedelta(hours=1)
                 ).replace(minute=0, second=0, microsecond=0)
@@ -86,63 +118,95 @@ def get_percent_actual_generation(country, input_sequence):
     The value of n is determined by the input_sequence provided.
     The output from this method serves as input for running the model.
     '''
-    input = get_date_range()
+    input = get_date_range(input_sequence)
     data = en.get_actual_percent_renewable(
         country, input["start"], input["end"], True)
     # data.to_csv("./data/test-"+country+".csv")
     last_n_rows = data.tail(input_sequence)
     return last_n_rows
 
 
-def run_model(model_name, input) -> pd.DataFrame:
-    """Generates prediction values for the next 48 hours by running the provided model, using the input data. 
-    :param model_name : The file name of a model (without any extension) located within the 'model' folder. E.g "FR_v5"
-    :param input : pd.DataFrame containing the actual percentage of renewable values up to a certain time period in the recent past
-    Predictions are generated for the upcoming 48 hours, starting from the last hour in the input data
+def predict(model_name, last_values, scaler, seq_len):
     """
-    seq_length = len(input)
-    date = input[['startTimeUTC']].copy()
-    # Convert 'startTimeUTC' column to datetime
-    date['startTimeUTC'] = pd.to_datetime(date['startTimeUTC'])
-    # Get the last date value
-    last_date = date.iloc[-1]['startTimeUTC']
-    # Calculate the next hour
-    next_hour = last_date + timedelta(hours=1)
-    # Create a range of 48 hours starting from the next hour
-    next_48_hours = pd.date_range(next_hour, periods=48, freq='H')
-    # Create a DataFrame with the next 48 hours
-    next_48_hours_df = pd.DataFrame(
-        {'startTimeUTC': next_48_hours.strftime('%Y%m%d%H%M')})
-    # print(next_48_hours_df)
-    # Construct the model filename by appending '.h5' to the model name
+    Predicts the next 48 hours of percent renewable energy based on a pre-trained model.
+
+    Args:
+        model_name (str): File name of the pre-trained model inside ./models/.
+        last_values (pd.DataFrame): Recent rows with percentRenewable and startTimeUTC
+            (startTimeUTC formatted as %Y%m%d%H%M).
+        scaler: Fitted scaler exposing transform() and inverse_transform().
+        seq_len (int): Sequence length; the model input is the last seq_len - 1 values.
+
+    Returns:
+        pd.DataFrame: Hourly startTimeUTC timestamps and percentRenewableForecast values.
+    """
+    # Parse the timestamps on a copy so the caller's DataFrame is left untouched
+    last_values_subset = last_values[['percentRenewable', 'startTimeUTC']].copy()
+    last_values_subset['startTimeUTC'] = pd.to_datetime(last_values_subset['startTimeUTC'], format='%Y%m%d%H%M')
+
+    # The forecast starts one hour after the last timestamp in the input data
+    last_timestamp = last_values_subset['startTimeUTC'].iloc[-1]
     model_filename = "./models/"+model_name
     # Load the specified model
-    lstm = load_model(model_filename, compile=False)
-    scaler = StandardScaler()
-    percent_renewable = input['percentRenewable']
-    forecast_values_total = []
-    prev_values_total = percent_renewable.values.flatten()
+    # compile=False: the model is only used for inference here
+    model = load_model(model_filename, compile=False)
+
+    # Keep only the most recent (seq_len - 1) values: that is the model's input window
+    last_values = last_values['percentRenewable'].tail(seq_len - 1).values.flatten()
+
+    # NOTE(review): forecasts are returned unrounded and unclamped, with datetime
+    # timestamps; confirm that callers expect this output format.
+    # List to store the forecast values
+    forecast_values = []
+
+    # Generate forecasts for the next 48 hours
     for _ in range(48):
-        scaled_prev_values_total = scaler.fit_transform(
-            prev_values_total.reshape(-1, 1))
-        x_pred_total = scaled_prev_values_total[-(
-            seq_length-1):].reshape(1, (seq_length-1), 1)
-        # Make the prediction using the loaded model
-        predicted_value_total = lstm.predict(x_pred_total, verbose=0)
+        # Scale the last values
+        scaled_last_values = scaler.transform(last_values.reshape(-1, 1))
+
+        # Prepare the input for prediction
+        x_pred = scaled_last_values[-(seq_len - 1):].reshape(1, (seq_len - 1), 1)
+
+        # Predict the next value; verbose=0 suppresses the per-call progress output
+        predicted_value = model.predict(x_pred, verbose=0)
+
         # Inverse transform the predicted value
-        predicted_value_total = scaler.inverse_transform(predicted_value_total)
-        forecast_values_total.append(predicted_value_total[0][0])
-        prev_values_total = np.append(prev_values_total, predicted_value_total)
-        prev_values_total = prev_values_total[1:]
-    # Create a DataFrame
-    forecast_df = pd.DataFrame(
-        {'startTimeUTC': next_48_hours_df['startTimeUTC'], 'percentRenewableForecast': forecast_values_total})
-    forecast_df["percentRenewableForecast"] = forecast_df["percentRenewableForecast"].round(
-    ).astype(int)
-    forecast_df['percentRenewableForecast'] = forecast_df['percentRenewableForecast'].apply(
-        lambda x: 0 if x <= 0 else x)
+        predicted_value = scaler.inverse_transform(predicted_value)
+
+        # Append the predicted value to the forecast_values
+        forecast_values.append(predicted_value[0][0])
+
+        # Slide the window: drop the oldest value and append the prediction
+        last_values = np.append(last_values[1:], predicted_value)
+
+    # Generate the next 48 hourly timestamps (the first element is last_timestamp itself, so drop it)
+    forecast_timestamps = pd.date_range(start=last_timestamp, periods=49, freq='H')[1:]
+
+    # Create a DataFrame with forecast values and timestamps
+    forecast_df = pd.DataFrame({'startTimeUTC': forecast_timestamps, 'percentRenewableForecast': forecast_values})
     return forecast_df
 
+def get_scaler(model_meta):
+    """Rebuild the fitted scaler described by model_meta['scaler'].
+
+    Returns a StandardScaler or MinMaxScaler; raises ValueError for other names.
+    """
+    params = model_meta['scaler']
+    if params['name'] == 'StandardScaler':
+        # Keep the defaults with_mean=True / with_std=True: with both disabled,
+        # transform() and inverse_transform() would leave the data unscaled.
+        new_scaler = StandardScaler()
+        new_scaler.mean_ = np.asarray(params['mean'], dtype=float)
+        new_scaler.scale_ = np.asarray(params['scale'], dtype=float)
+    elif params['name'] == 'MinMaxScaler':
+        new_scaler = MinMaxScaler(feature_range=(0, 1))
+        new_scaler.data_min_ = np.asarray(params['data_min'], dtype=float)
+        new_scaler.data_max_ = np.asarray(params['data_max'], dtype=float)
+        new_scaler.scale_ = np.asarray(params['scale'], dtype=float)
+        new_scaler.min_ = np.asarray(params['min'], dtype=float)
+    else:
+        raise ValueError('Invalid Scaler name')
+    return new_scaler
 
 def run_latest_model(country) -> dict:
     """ Returns  predictions by running the latest version of model available for the input country
@@ -151,18 +215,24 @@ def run_latest_model(country) -> dict:
     :return Dictionary { "input": { "country":"", "model":"", "start":"", "end":"",  "percentRenewable":[],  } , "output": <pandas dataframe> }
     """
     # get the name of the latest model  and its metadata
-    model_name = get_latest_model_name_for(country)
+    model_name, version = get_latest_model_name_for(country)
     model_meta = get_model_metadata(model_name)
+
     input_sequence = model_meta["input_sequence"]
     country = model_meta["country"]
     # get input for the model : last n values of percent renewable
     input_data = get_percent_actual_generation(country, input_sequence)
-    #print(input_data)
+    print(input_data.shape)
     input_percentage = input_data["percentRenewable"].tolist()
     input_start = input_data.iloc[0]["startTimeUTC"]
     input_end = input_data.iloc[-1]["startTimeUTC"]
+
+    # get the scaler
+    scaler = get_scaler(model_meta)
+
     # run the model
-    output = run_model(model_name, input_data)
+    output = predict(model_name, input_data, scaler, input_sequence)
+
     return {
         "input": {
             "country": country,