diff --git a/Chapter14/dataGeneration/README.md b/Chapter14/dataGeneration/README.md index e69de29..69d9998 100644 --- a/Chapter14/dataGeneration/README.md +++ b/Chapter14/dataGeneration/README.md @@ -0,0 +1,19 @@ +What it does : + + 1. Combine two CSV files, where each row belongs to a unique timestamp that is common to both tables. + 2. Predict the solar power generation at any point using the combined CSV file. + +Dependencies : + + 1. The pandas and matplotlib modules must be installed on the local machine to run this program. + 2. The sklearn (scikit-learn) module must be installed on the local machine to run this program. + 3. The gdown module must be installed on the local machine. + 4. The zipfile module ships with the Python standard library, so no separate installation is needed. + +Things to check before running : + + 1. Check that you have given the correct location of your dataset file. + 2. Make sure you have access to the file on Google Drive. + + + \ No newline at end of file diff --git a/Chapter14/dataGeneration/data_generator.ipynb b/Chapter14/dataGeneration/data_generator.ipynb new file mode 100644 index 0000000..48b8cc7 --- /dev/null +++ b/Chapter14/dataGeneration/data_generator.ipynb @@ -0,0 +1,276 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "data_generator.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "hFv0gCCZFUGv" + }, + "source": [ + "# **Problem: Predict Solar power generation.**\n", + "\n", + "1. Combine two CSV files, where each row belongs to a unique timestamp that is common to both tables.\n", + "2. 
Predict the solar power generation at any point using the combined CSV file.\n", + "\n", + "**Examples:**\n", + "\n", + "Set the variable 'url' to the Google Drive URL of the zip file that you want to download.\n", + "\n", + "E.g.: url = 'https://drive.google.com/file/d/1dVBMQb-eKRq92WMKfJDbTBt-j-W_5s5u/view?usp=sharing'\n", + "\n", + "Run all the cells. After executing the last cell, you will see the predicted solar power plotted against the actual solar power generated.\n", + "\n", + "**Notes:**\n", + "\n", + "Check the following before running the program.\n", + "1. The pandas and matplotlib modules must be installed on the local machine to run this program.\n", + "2. The sklearn (scikit-learn) module must be installed on the local machine to run this program.\n", + "3. The gdown module must be installed on the local machine.\n", + "4. The zipfile module ships with the Python standard library, so no separate installation is needed.\n", + "5. Check that you have given the correct location of your dataset file.\n", + "6. 
You should have access to the file in the Google Drive.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "PgDcnNPEGETe"
+      },
+      "source": [
+        "# **Import Modules**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "lnbm7tctGHeT"
+      },
+      "source": [
+        "# Import pandas\n",
+        "import pandas as pd\n",
+        "\n",
+        "# Import pyplot module to plot the results\n",
+        "import matplotlib.pyplot as plt\n",
+        "\n",
+        "# Import train_test_split module to split the data into train and test\n",
+        "from sklearn.model_selection import train_test_split\n",
+        "\n",
+        "# Import LinearRegression module to use in model training\n",
+        "from sklearn.linear_model import LinearRegression\n",
+        "\n",
+        "# Import gdown module to download files from Google Drive\n",
+        "import gdown\n",
+        "\n",
+        "# Import zip file module to open the zip file\n",
+        "from zipfile import ZipFile"
+      ],
+      "execution_count": 1,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "rpDGPXjKGK20"
+      },
+      "source": [
+        "# **Get the file location from google drive and download**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 102
+        },
+        "id": "22ygR_COGOi2",
+        "outputId": "bff0a77c-1a85-4e11-9418-0fb8bc7bce95"
+      },
+      "source": [
+        "# Please change the URL as needed (make sure you have access to the file)\n",
+        "\n",
+        "url = 'https://drive.google.com/file/d/1dVBMQb-eKRq92WMKfJDbTBt-j-W_5s5u/view?usp=sharing'\n",
+        "\n",
+        "# Derive the file id from the URL: it is the second-to-last '/'-separated\n",
+        "# segment of a '.../file/d/<id>/view?...' share link\n",
+        "file_id = url.split('/')[-2]\n",
+        "\n",
+        "# Derive the download url of the file\n",
+        "download_url = 'https://drive.google.com/uc?id=' + file_id\n",
+        "\n",
+        "# Give the location where you want to save it on your local machine\n",
+        "file_location = 'solar.zip'\n",
+        "\n",
+        "# Download the file from drive to your local machine\n",
+        "gdown.download(download_url, file_location, quiet=False)"
+      ],
+      "execution_count": 6,
+      
"outputs": [
+        {
+          "output_type": "stream",
+          "name": "stderr",
+          "text": [
+            "Downloading...\n",
+            "From: https://drive.google.com/uc?id=1dVBMQb-eKRq92WMKfJDbTBt-j-W_5s5u\n",
+            "To: /content/solar.zip\n",
+            "100%|██████████| 1.01M/1.01M [00:00<00:00, 105MB/s]\n"
+          ]
+        },
+        {
+          "output_type": "execute_result",
+          "data": {
+            "application/vnd.google.colaboratory.intrinsic+json": {
+              "type": "string"
+            },
+            "text/plain": [
+              "'solar.zip'"
+            ]
+          },
+          "metadata": {},
+          "execution_count": 6
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "yQFUYnj3GQ8i"
+      },
+      "source": [
+        "# **Unzip the zip dataset**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "XFjoCiZ0GUEM"
+      },
+      "source": [
+        "# Extract with the standard-library ZipFile instead of a shell `unzip` on an\n",
+        "# absolute Colab path: this works wherever the notebook runs and can be re-run\n",
+        "with ZipFile(file_location, 'r') as zip_file:\n",
+        "    zip_file.extractall('unzipped_folder')"
+      ],
+      "execution_count": 7,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "waqSASm3GXqC"
+      },
+      "source": [
+        "# **Read and combine the CSVs**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Ql0wj_T4Ga4r"
+      },
+      "source": [
+        "# Read 1st csv file (power generation readings)\n",
+        "plant = pd.read_csv('unzipped_folder/solar/Plant_2_Generation_Data.csv', sep = ',', engine = 'python', header = 0)\n",
+        "\n",
+        "# Read 2nd csv file (weather sensor readings)\n",
+        "weather = pd.read_csv('unzipped_folder/solar/Plant_2_Weather_Sensor_Data.csv', sep = ',', engine = 'python', header = 0)\n",
+        "\n",
+        "# Combine the two csv files on the DATE_TIME and PLANT_ID columns\n",
+        "combined_file = plant.merge(weather, on=[\"DATE_TIME\", \"PLANT_ID\"], suffixes=(\"_GENERATION\", \"_WEATHER\"))\n",
+        "\n",
+        "# Save 
the combined as a csv\n",
+        "combined_file.to_csv('output.csv', sep = ',')"
+      ],
+      "execution_count": 9,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "uzJNGQdOGdxV"
+      },
+      "source": [
+        "# **Start the training and prediction**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "GMD2mGPWGgtn"
+      },
+      "source": [
+        "# Get feature columns\n",
+        "X2 = combined_file[['AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']]\n",
+        "\n",
+        "# Get target column\n",
+        "y2 = combined_file['AC_POWER']\n",
+        "\n",
+        "# Split the data into train (70%) and test (30%); the fixed random_state\n",
+        "# makes the split, and therefore the results, reproducible on every run\n",
+        "X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)\n",
+        "\n",
+        "# Initialize LinearRegression class\n",
+        "lm2 = LinearRegression()\n",
+        "\n",
+        "# Fit the training data\n",
+        "lm2.fit(X2_train, y2_train)\n",
+        "\n",
+        "# Get the predictions\n",
+        "predictions = lm2.predict(X2_test)"
+      ],
+      "execution_count": 10,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "2sHTClUUGkop"
+      },
+      "source": [
+        "# **Plot the results**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ojs96P6ZGn5U"
+      },
+      "source": [
+        "# Actual values go on the x-axis and model predictions on the y-axis,\n",
+        "# so the axis labels must follow that order\n",
+        "plt.scatter(y2_test, predictions)\n",
+        "plt.title('Actual Solar Output Values vs Predicted Values for Plant 2')\n",
+        "plt.xlabel('Actual Output')\n",
+        "plt.ylabel('Predicted Output')\n",
+        "\n",
+        "plt.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
diff --git a/Chapter14/dataGeneration/data_generator.py b/Chapter14/dataGeneration/data_generator.py
index 18286bd..e4a42ff 100644
--- a/Chapter14/dataGeneration/data_generator.py
+++ b/Chapter14/dataGeneration/data_generator.py
@@ -1,6 +1,101 @@
-# TODO: Write a script to generate data from this dataset: https://www.kaggle.com/anikannal/solar-power-generation-data
-# It has two files (a) Plant_1_Generation_Data.csv and (b) Plant_1_Weather_Sensor_Data.csv
-# The script should do the following
-# (a) For a given 
id, combine the above two tables, where each row belong to a unique timestamp, which is common across the two tables -# (b) Another script that can use the above to predict the solar power generation at any point. -# TODO: Code should be well commented. \ No newline at end of file +'''Copyright (c) 2021 AIClub + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the Software without restriction, including without +limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial +portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT +LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO
+EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
+OR OTHER DEALINGS IN THE SOFTWARE.'''
+
+# Import pandas
+import pandas as pd
+
+# Import pyplot module to plot the results
+import matplotlib.pyplot as plt
+
+# Import train_test_split module to split the data into train and test
+from sklearn.model_selection import train_test_split
+
+# Import LinearRegression module to use in model training
+from sklearn.linear_model import LinearRegression
+
+# Import gdown module to download files from Google Drive
+import gdown
+
+# Import zip file module to open the zip file
+from zipfile import ZipFile
+
+#--------------------------------------------- Get the file location from google drive ---------------------------------------------------
+
+# Please change the URL as needed (make sure you have access to the file)
+
+url = 'https://drive.google.com/file/d/1dVBMQb-eKRq92WMKfJDbTBt-j-W_5s5u/view?usp=sharing'
+
+# Derive the file id from the URL: the second-to-last '/'-separated segment of a '.../file/d/<id>/view?...' share link
+file_id = url.split('/')[-2]
+
+# Derive the download url of the file
+download_url = 'https://drive.google.com/uc?id=' + file_id
+
+# Give the location where you want to save it on your local machine
+file_location = 'solar.zip'
+
+#--------------------------------------------- Download and extract the zip file -----------------------------------------------------------
+
+# Download the file from drive to your local machine
+gdown.download(download_url, file_location)
+
+# Open the downloaded zip file and extract its contents into the current working directory
+with ZipFile(file_location, "r") as zip_file:
+    zip_file.extractall()
+
+# ------------------------------------------------- Read and combine the CSVs --------------------------------------------------------------
+
+# Read 1st csv file (power generation readings)
+plant = pd.read_csv('solar/Plant_2_Generation_Data.csv', sep = ',', engine = 'python', header = 0)
+
+# Read 2nd csv file (weather sensor readings)
+weather = pd.read_csv('solar/Plant_2_Weather_Sensor_Data.csv', sep = ',', engine = 'python', header = 0)
+
+# Combine the two csv files on the DATE_TIME and PLANT_ID columns
+combined_file = plant.merge(weather, on=["DATE_TIME", "PLANT_ID"], suffixes=("_GENERATION", "_WEATHER"))
+
+# Save the combined table as a csv
+combined_file.to_csv('output.csv', sep = ',')
+
+#------------------------------------------------- Start the training and prediction --------------------------------------------------------
+
+# Get feature columns
+X2 = combined_file[['AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']]
+
+# Get target column
+y2 = combined_file['AC_POWER']
+
+# Split the data into train (70%) and test (30%); the fixed random_state makes the split reproducible on every run
+X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)
+
+# Initialize LinearRegression class
+lm2 = LinearRegression()
+
+# Fit the training data
+lm2.fit(X2_train, y2_train)
+
+# Get the predictions
+predictions = lm2.predict(X2_test)
+
+# ----------------------------------------------- Plot the results --------------------------------------------------------------------------
+
+plt.scatter(y2_test, predictions)  # x-axis: actual values, y-axis: model predictions
+plt.title('Actual Solar Output Values vs Predicted Values for Plant 2')
+plt.xlabel('Actual Output')
+plt.ylabel('Predicted Output')
+
+plt.show()
\ No newline at end of file