diff --git a/challenge1/analysis/shino/.ipynb_checkpoints/Challenge1-checkpoint.ipynb b/challenge1/analysis/shino/.ipynb_checkpoints/Challenge1-checkpoint.ipynb new file mode 100644 index 000000000..32fed455e --- /dev/null +++ b/challenge1/analysis/shino/.ipynb_checkpoints/Challenge1-checkpoint.ipynb @@ -0,0 +1,1524 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.ensemble import RandomForestClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "train_df = pd.read_csv('../../data/training_dataset_500.csv')\n", + "test_df = pd.read_csv('../../data/test_dataset_500.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | ID | \n", + "Label | \n", + "House | \n", + "Year | \n", + "Month | \n", + "Temperature | \n", + "Daylight | \n", + "EnergyProduction | \n", + "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "2011 | \n", + "7 | \n", + "26.2 | \n", + "178.9 | \n", + "740 | \n", + "
| 1 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "2011 | \n", + "8 | \n", + "25.8 | \n", + "169.7 | \n", + "731 | \n", + "
| 2 | \n", + "2 | \n", + "2 | \n", + "1 | \n", + "2011 | \n", + "9 | \n", + "22.8 | \n", + "170.2 | \n", + "694 | \n", + "
| 3 | \n", + "3 | \n", + "3 | \n", + "1 | \n", + "2011 | \n", + "10 | \n", + "16.4 | \n", + "169.1 | \n", + "688 | \n", + "
| 4 | \n", + "4 | \n", + "4 | \n", + "1 | \n", + "2011 | \n", + "11 | \n", + "11.4 | \n", + "169.1 | \n", + "650 | \n", + "
| 5 | \n", + "5 | \n", + "5 | \n", + "1 | \n", + "2011 | \n", + "12 | \n", + "4.2 | \n", + "199.5 | \n", + "763 | \n", + "
| 6 | \n", + "6 | \n", + "6 | \n", + "1 | \n", + "2012 | \n", + "1 | \n", + "1.8 | \n", + "203.1 | \n", + "765 | \n", + "
| 7 | \n", + "7 | \n", + "7 | \n", + "1 | \n", + "2012 | \n", + "2 | \n", + "2.8 | \n", + "178.2 | \n", + "706 | \n", + "
| 8 | \n", + "8 | \n", + "8 | \n", + "1 | \n", + "2012 | \n", + "3 | \n", + "6.7 | \n", + "172.7 | \n", + "788 | \n", + "
| 9 | \n", + "9 | \n", + "9 | \n", + "1 | \n", + "2012 | \n", + "4 | \n", + "12.6 | \n", + "182.2 | \n", + "831 | \n", + "
| 10 | \n", + "10 | \n", + "10 | \n", + "1 | \n", + "2012 | \n", + "5 | \n", + "17.6 | \n", + "214.2 | \n", + "955 | \n", + "
| 11 | \n", + "11 | \n", + "11 | \n", + "1 | \n", + "2012 | \n", + "6 | \n", + "20.8 | \n", + "143.0 | \n", + "837 | \n", + "
| \n", + " | ID | \n", + "Label | \n", + "House | \n", + "Year | \n", + "Month | \n", + "Temperature | \n", + "Daylight | \n", + "EnergyProduction | \n", + "
|---|---|---|---|---|---|---|---|---|
| 11488 | \n", + "11987 | \n", + "11 | \n", + "500 | \n", + "2012 | \n", + "6 | \n", + "21.8 | \n", + "152.1 | \n", + "645 | \n", + "
| 11489 | \n", + "11988 | \n", + "12 | \n", + "500 | \n", + "2012 | \n", + "7 | \n", + "26.2 | \n", + "169.8 | \n", + "661 | \n", + "
| 11490 | \n", + "11989 | \n", + "13 | \n", + "500 | \n", + "2012 | \n", + "8 | \n", + "27.8 | \n", + "257.9 | \n", + "822 | \n", + "
| 11491 | \n", + "11990 | \n", + "14 | \n", + "500 | \n", + "2012 | \n", + "9 | \n", + "24.7 | \n", + "183.2 | \n", + "665 | \n", + "
| 11492 | \n", + "11991 | \n", + "15 | \n", + "500 | \n", + "2012 | \n", + "10 | \n", + "17.4 | \n", + "201.2 | \n", + "655 | \n", + "
| 11493 | \n", + "11992 | \n", + "16 | \n", + "500 | \n", + "2012 | \n", + "11 | \n", + "9.7 | \n", + "203.5 | \n", + "582 | \n", + "
| 11494 | \n", + "11993 | \n", + "17 | \n", + "500 | \n", + "2012 | \n", + "12 | \n", + "3.8 | \n", + "194.2 | \n", + "534 | \n", + "
| 11495 | \n", + "11994 | \n", + "18 | \n", + "500 | \n", + "2013 | \n", + "1 | \n", + "2.0 | \n", + "234.6 | \n", + "640 | \n", + "
| 11496 | \n", + "11995 | \n", + "19 | \n", + "500 | \n", + "2013 | \n", + "2 | \n", + "4.2 | \n", + "201.8 | \n", + "638 | \n", + "
| 11497 | \n", + "11996 | \n", + "20 | \n", + "500 | \n", + "2013 | \n", + "3 | \n", + "11.2 | \n", + "234.0 | \n", + "778 | \n", + "
| 11498 | \n", + "11997 | \n", + "21 | \n", + "500 | \n", + "2013 | \n", + "4 | \n", + "13.6 | \n", + "237.1 | \n", + "758 | \n", + "
| 11499 | \n", + "11998 | \n", + "22 | \n", + "500 | \n", + "2013 | \n", + "5 | \n", + "19.2 | \n", + "258.4 | \n", + "838 | \n", + "
| \n", + " | ID | \n", + "House | \n", + "Daylight | \n", + "EnergyProduction | \n", + "
|---|---|---|---|---|
| 11495 | \n", + "11994 | \n", + "500 | \n", + "234.6 | \n", + "640 | \n", + "
| 11496 | \n", + "11995 | \n", + "500 | \n", + "201.8 | \n", + "638 | \n", + "
| 11497 | \n", + "11996 | \n", + "500 | \n", + "234.0 | \n", + "778 | \n", + "
| 11498 | \n", + "11997 | \n", + "500 | \n", + "237.1 | \n", + "758 | \n", + "
| 11499 | \n", + "11998 | \n", + "500 | \n", + "258.4 | \n", + "838 | \n", + "
| \n", + " | House | \n", + "Daylight | \n", + "EnergyProduction | \n", + "
|---|---|---|---|
| House | \n", + "1.000000 | \n", + "0.001583 | \n", + "-0.008303 | \n", + "
| Daylight | \n", + "0.001583 | \n", + "1.000000 | \n", + "0.531577 | \n", + "
| EnergyProduction | \n", + "-0.008303 | \n", + "0.531577 | \n", + "1.000000 | \n", + "
| \n", + " | ID | \n", + "Label | \n", + "House | \n", + "Year | \n", + "Month | \n", + "Temperature | \n", + "Daylight | \n", + "EnergyProduction | \n", + "
|---|---|---|---|---|---|---|---|---|
| 495 | \n", + "11903 | \n", + "23 | \n", + "496 | \n", + "2013 | \n", + "6 | \n", + "19.3 | \n", + "125.9 | \n", + "483 | \n", + "
| 496 | \n", + "11927 | \n", + "23 | \n", + "497 | \n", + "2013 | \n", + "6 | \n", + "21.9 | \n", + "122.4 | \n", + "628 | \n", + "
| 497 | \n", + "11951 | \n", + "23 | \n", + "498 | \n", + "2013 | \n", + "6 | \n", + "22.8 | \n", + "127.2 | \n", + "673 | \n", + "
| 498 | \n", + "11975 | \n", + "23 | \n", + "499 | \n", + "2013 | \n", + "6 | \n", + "21.9 | \n", + "126.8 | \n", + "735 | \n", + "
| 499 | \n", + "11999 | \n", + "23 | \n", + "500 | \n", + "2013 | \n", + "6 | \n", + "22.7 | \n", + "122.9 | \n", + "586 | \n", + "
import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.linear_model import LinearRegression
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+train_df = pd.read_csv('../../data/training_dataset_500.csv')
+test_df = pd.read_csv('../../data/test_dataset_500.csv')
+train_df.head(12)
+train_df.tail(12)
+train_df.shape, test_df.shape
+train_df_hde = train_df.drop(['Label','Year','Month','Temperature'], axis=1)
+train_df_hde.tail()
+train_df_hde[['House','Daylight','EnergyProduction']].corr()
+lr = LinearRegression()
+X = train_df_hde[['Daylight']].values
+Y = train_df_hde[['EnergyProduction']].values
+lr.fit(X,Y)
+print('coefficient = ', lr.coef_[0]) # 説明変数の係数を出力
+print('intercept = ', lr.intercept_) # 切片を出力
+plt.scatter(X, Y, color = 'blue') # 説明変数と目的変数のデータ点の散布図をプロット
+plt.plot(X, lr.predict(X), color = 'red') # 回帰直線をプロット
+
+plt.title('Regression Line') # 図のタイトル
+plt.xlabel('Daylight') # x軸のラベル
+plt.ylabel('EnergyProduction') # y軸のラベル
+plt.grid() # グリッド線を表示
+
+plt.show() # 図の表示
+# DaylightとEnergyProductionに相関はあるが、ばらつきが大きい。
+# Houseごとに、発電能力に差があるためでは?
+MeanEnergy = np.mean(train_df_hde['EnergyProduction'])
+print(MeanEnergy)
+pivot_House = pd.pivot_table(train_df_hde, index='House')
+# 各Houseごとの発電量の平均を求める
+cases_House = train_df.shape[0]/pivot_House.shape[0]
+print(cases_House)
+eachHouse_perMeanEnergy = pivot_House/MeanEnergy
+# 各Houseごとの平均を、全体の平均から求める係数を算出する
+k_Energy_House_Mean = eachHouse_perMeanEnergy[['EnergyProduction']]
+k_Energy_House_Mean_array = k_Energy_House_Mean.values
+k_Energy_House_Mean_array.shape
+test_X = test_df[['Daylight']].values
+test_Y = test_df[['EnergyProduction']].values
+plt.scatter(test_X, test_Y, color='blue')
+plt.scatter(test_X, lr.predict(test_X), color='black')
+ sim_Y = k_Energy_House_Mean_array * lr.predict(test_X)
+plt.scatter(test_X, lr.predict(test_X), color = 'black')
+plt.scatter(test_X, sim_Y, color = 'red')
+plt.scatter(test_X, test_Y, color='blue')
+# ばらつき具合は、Houseごとの係数を考慮することで表現できている。
+# 全体的に、発電量が小さい。北半球6月なら、太陽の角度が大きいことを考慮できていないため?
+test_df.tail()
+# RandomForestを使い、Month, Daylight, House, TemperatureからEnergyProductionを推測してみる。
+from sklearn.ensemble import RandomForestClassifier
+model = RandomForestClassifier(max_depth=1000)
+X = train_df.drop(['ID', 'Label', 'Year'], axis=1)
+test_X = test_df.drop(['ID', 'Label', 'Year'], axis=1)
+model.fit(X,Y)
+predicted = model.predict(test_X)
+model.score(test_X, test_Y)
+model.score(test_X, predicted)
+plt.scatter(test_X[['House']].values, predicted, color='red')
+plt.scatter(test_X[['House']].values, test_Y, color='blue')
+plt.scatter(test_X[['Daylight']].values, predicted, color='red')
+plt.scatter(test_X[['Daylight']].values, test_Y, color='blue')
+# RandomForestを使い、Month, Daylight, HouseからEnergyProductionを推測してみる。(Temperatureを除いた場合)
+X_hdm = train_df.drop(['ID', 'Label', 'Year', 'Temperature'], axis=1)
+test_X_hdm = test_df.drop(['ID', 'Label', 'Year', 'Temperature'], axis=1)
+model_hdm = RandomForestClassifier(max_depth=1000)
+model_hdm.fit(X_hdm,Y)
+predicted_hdm = model_hdm.predict(test_X_hdm)
+plt.scatter(test_X_hdm[['House']].values, predicted, color='red')
+plt.scatter(test_X_hdm[['House']].values, test_Y, color='blue')
+plt.scatter(test_X_hdm[['Daylight']].values, predicted, color='red')
+plt.scatter(test_X_hdm[['Daylight']].values, test_Y, color='blue')
+# Temperatureの影響があるかは、はっきりしない。
+model_hdm.score(test_X_hdm, test_Y)
+# SVCも試してみる。
+from sklearn.svm import SVC
+model_svc = SVC(gamma='scale')
+model_svc.fit(X, Y)
+test_Y_svc = model_svc.predict(test_X)
+plt.scatter(test_X[['House']].values, test_Y_svc, color='red')
+plt.scatter(test_X[['House']].values, test_Y, color='blue')
+model_svc.score(test_X, test_Y)
+model_svc_hdm = SVC(gamma='scale')
+model_svc_hdm.fit(X_hdm, Y)
+# RandomForestを使い、Month, Daylight, HouseからEnergyProductionを推測したものが、この中では一番よかった。
+plt.scatter(test_X[['House']].values, predicted, color='red')
+plt.scatter(test_X[['House']].values, test_Y, color='blue')
+solution = pd.DataFrame({'House': test_df['House'], 'EnergyProduction': predicted})
+solution.info
+solution.to_csv("predicted_energy_production.csv")
+
+| \n", + " | ID | \n", + "Label | \n", + "House | \n", + "Year | \n", + "Month | \n", + "Temperature | \n", + "Daylight | \n", + "EnergyProduction | \n", + "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "2011 | \n", + "7 | \n", + "26.2 | \n", + "178.9 | \n", + "740 | \n", + "
| 1 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "2011 | \n", + "8 | \n", + "25.8 | \n", + "169.7 | \n", + "731 | \n", + "
| 2 | \n", + "2 | \n", + "2 | \n", + "1 | \n", + "2011 | \n", + "9 | \n", + "22.8 | \n", + "170.2 | \n", + "694 | \n", + "
| 3 | \n", + "3 | \n", + "3 | \n", + "1 | \n", + "2011 | \n", + "10 | \n", + "16.4 | \n", + "169.1 | \n", + "688 | \n", + "
| 4 | \n", + "4 | \n", + "4 | \n", + "1 | \n", + "2011 | \n", + "11 | \n", + "11.4 | \n", + "169.1 | \n", + "650 | \n", + "
| 5 | \n", + "5 | \n", + "5 | \n", + "1 | \n", + "2011 | \n", + "12 | \n", + "4.2 | \n", + "199.5 | \n", + "763 | \n", + "
| 6 | \n", + "6 | \n", + "6 | \n", + "1 | \n", + "2012 | \n", + "1 | \n", + "1.8 | \n", + "203.1 | \n", + "765 | \n", + "
| 7 | \n", + "7 | \n", + "7 | \n", + "1 | \n", + "2012 | \n", + "2 | \n", + "2.8 | \n", + "178.2 | \n", + "706 | \n", + "
| 8 | \n", + "8 | \n", + "8 | \n", + "1 | \n", + "2012 | \n", + "3 | \n", + "6.7 | \n", + "172.7 | \n", + "788 | \n", + "
| 9 | \n", + "9 | \n", + "9 | \n", + "1 | \n", + "2012 | \n", + "4 | \n", + "12.6 | \n", + "182.2 | \n", + "831 | \n", + "
| 10 | \n", + "10 | \n", + "10 | \n", + "1 | \n", + "2012 | \n", + "5 | \n", + "17.6 | \n", + "214.2 | \n", + "955 | \n", + "
| 11 | \n", + "11 | \n", + "11 | \n", + "1 | \n", + "2012 | \n", + "6 | \n", + "20.8 | \n", + "143.0 | \n", + "837 | \n", + "
| \n", + " | ID | \n", + "Label | \n", + "House | \n", + "Year | \n", + "Month | \n", + "Temperature | \n", + "Daylight | \n", + "EnergyProduction | \n", + "
|---|---|---|---|---|---|---|---|---|
| 11488 | \n", + "11987 | \n", + "11 | \n", + "500 | \n", + "2012 | \n", + "6 | \n", + "21.8 | \n", + "152.1 | \n", + "645 | \n", + "
| 11489 | \n", + "11988 | \n", + "12 | \n", + "500 | \n", + "2012 | \n", + "7 | \n", + "26.2 | \n", + "169.8 | \n", + "661 | \n", + "
| 11490 | \n", + "11989 | \n", + "13 | \n", + "500 | \n", + "2012 | \n", + "8 | \n", + "27.8 | \n", + "257.9 | \n", + "822 | \n", + "
| 11491 | \n", + "11990 | \n", + "14 | \n", + "500 | \n", + "2012 | \n", + "9 | \n", + "24.7 | \n", + "183.2 | \n", + "665 | \n", + "
| 11492 | \n", + "11991 | \n", + "15 | \n", + "500 | \n", + "2012 | \n", + "10 | \n", + "17.4 | \n", + "201.2 | \n", + "655 | \n", + "
| 11493 | \n", + "11992 | \n", + "16 | \n", + "500 | \n", + "2012 | \n", + "11 | \n", + "9.7 | \n", + "203.5 | \n", + "582 | \n", + "
| 11494 | \n", + "11993 | \n", + "17 | \n", + "500 | \n", + "2012 | \n", + "12 | \n", + "3.8 | \n", + "194.2 | \n", + "534 | \n", + "
| 11495 | \n", + "11994 | \n", + "18 | \n", + "500 | \n", + "2013 | \n", + "1 | \n", + "2.0 | \n", + "234.6 | \n", + "640 | \n", + "
| 11496 | \n", + "11995 | \n", + "19 | \n", + "500 | \n", + "2013 | \n", + "2 | \n", + "4.2 | \n", + "201.8 | \n", + "638 | \n", + "
| 11497 | \n", + "11996 | \n", + "20 | \n", + "500 | \n", + "2013 | \n", + "3 | \n", + "11.2 | \n", + "234.0 | \n", + "778 | \n", + "
| 11498 | \n", + "11997 | \n", + "21 | \n", + "500 | \n", + "2013 | \n", + "4 | \n", + "13.6 | \n", + "237.1 | \n", + "758 | \n", + "
| 11499 | \n", + "11998 | \n", + "22 | \n", + "500 | \n", + "2013 | \n", + "5 | \n", + "19.2 | \n", + "258.4 | \n", + "838 | \n", + "
| \n", + " | ID | \n", + "House | \n", + "Daylight | \n", + "EnergyProduction | \n", + "
|---|---|---|---|---|
| 11495 | \n", + "11994 | \n", + "500 | \n", + "234.6 | \n", + "640 | \n", + "
| 11496 | \n", + "11995 | \n", + "500 | \n", + "201.8 | \n", + "638 | \n", + "
| 11497 | \n", + "11996 | \n", + "500 | \n", + "234.0 | \n", + "778 | \n", + "
| 11498 | \n", + "11997 | \n", + "500 | \n", + "237.1 | \n", + "758 | \n", + "
| 11499 | \n", + "11998 | \n", + "500 | \n", + "258.4 | \n", + "838 | \n", + "
| \n", + " | House | \n", + "Daylight | \n", + "EnergyProduction | \n", + "
|---|---|---|---|
| House | \n", + "1.000000 | \n", + "0.001583 | \n", + "-0.008303 | \n", + "
| Daylight | \n", + "0.001583 | \n", + "1.000000 | \n", + "0.531577 | \n", + "
| EnergyProduction | \n", + "-0.008303 | \n", + "0.531577 | \n", + "1.000000 | \n", + "
| \n", + " | ID | \n", + "Label | \n", + "House | \n", + "Year | \n", + "Month | \n", + "Temperature | \n", + "Daylight | \n", + "EnergyProduction | \n", + "
|---|---|---|---|---|---|---|---|---|
| 495 | \n", + "11903 | \n", + "23 | \n", + "496 | \n", + "2013 | \n", + "6 | \n", + "19.3 | \n", + "125.9 | \n", + "483 | \n", + "
| 496 | \n", + "11927 | \n", + "23 | \n", + "497 | \n", + "2013 | \n", + "6 | \n", + "21.9 | \n", + "122.4 | \n", + "628 | \n", + "
| 497 | \n", + "11951 | \n", + "23 | \n", + "498 | \n", + "2013 | \n", + "6 | \n", + "22.8 | \n", + "127.2 | \n", + "673 | \n", + "
| 498 | \n", + "11975 | \n", + "23 | \n", + "499 | \n", + "2013 | \n", + "6 | \n", + "21.9 | \n", + "126.8 | \n", + "735 | \n", + "
| 499 | \n", + "11999 | \n", + "23 | \n", + "500 | \n", + "2013 | \n", + "6 | \n", + "22.7 | \n", + "122.9 | \n", + "586 | \n", + "