diff --git a/your-code/main.ipynb b/your-code/main.ipynb index bad6d94..ef95e3d 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -9,10 +9,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from scipy import stats" + ] }, { "cell_type": "markdown", @@ -23,10 +27,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "(40325, 14)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users = pd.read_csv('users_table.csv')\n", + "users.shape" + ] }, { "cell_type": "markdown", @@ -37,10 +55,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId int64\n", + "Reputation int64\n", + "CreationDate object\n", + "DisplayName object\n", + "LastAccessDate object\n", + "WebsiteUrl object\n", + "Location object\n", + "AboutMe object\n", + "Views int64\n", + "UpVotes int64\n", + "DownVotes int64\n", + "AccountId int64\n", + "Age float64\n", + "ProfileImageUrl object\n", + "dtype: object" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users.rename(columns={'Id': 'userId'}, inplace= True)\n", + "users.dtypes" + ] }, { "cell_type": "markdown", @@ -51,10 +97,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "Id int64\n", + "PostTypeId int64\n", + "AcceptedAnswerId float64\n", + "CreaionDate object\n", + "Score int64\n", + "ViewCount float64\n", + "Body object\n", + "OwnerUserId float64\n", + "LasActivityDate object\n", 
+ "Title object\n", + "Tags object\n", + "AnswerCount float64\n", + "CommentCount int64\n", + "FavoriteCount float64\n", + "LastEditorUserId float64\n", + "LastEditDate object\n", + "CommunityOwnedDate object\n", + "ParentId float64\n", + "ClosedDate object\n", + "OwnerDisplayName object\n", + "LastEditorDisplayName object\n", + "dtype: object" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts = pd.read_csv('posts_table.csv')\n", + "posts.dtypes" + ] }, { "cell_type": "markdown", @@ -65,10 +146,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "posts.rename(columns= {'Id': 'postId', 'OwnerUserId': 'userId'}, inplace= True)" + ] }, { "cell_type": "markdown", @@ -81,10 +164,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "users = users[['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']]\n", + "posts = posts[['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']]\n" + ] }, { "cell_type": "markdown", @@ -96,10 +182,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId int64\n", + "Reputation int64\n", + "Views int64\n", + "UpVotes int64\n", + "DownVotes int64\n", + "dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(38962, 9)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_posts = users.merge(posts, on='userId')\n", + "users_posts.shape" + ] }, { "cell_type": "markdown", @@ -110,10 
+235,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "userId 0\n", + "Reputation 0\n", + "Views 0\n", + "UpVotes 0\n", + "DownVotes 0\n", + "postId 0\n", + "Score 0\n", + "ViewCount 23572\n", + "CommentCount 0\n", + "dtype: int64\n", + "Index(['postId', 'Score', 'userId', 'ViewCount', 'CommentCount'], dtype='object')\n" + ] + } + ], + "source": [ + "print(users_posts.isnull().sum())\n", + "print(posts.columns) # In the isnull() summary printed first, ViewCount's non-zero count is its number of missing values" + ] }, { "cell_type": "markdown", @@ -125,10 +271,93 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "98.0 43\n", + "150.0 43\n", + "122.0 42\n", + "156.0 41\n", + "108.0 41\n", + " ..\n", + "2313.0 1\n", + "6113.0 1\n", + "3463.0 1\n", + "3531.0 1\n", + "3357.0 1\n", + "Name: ViewCount, Length: 3402, dtype: int64\n", + "3783 98.0\n", + "4962 98.0\n", + "9146 98.0\n", + "10242 98.0\n", + "11610 98.0\n", + "12399 98.0\n", + "18635 98.0\n", + "19165 98.0\n", + "19188 98.0\n", + "20384 98.0\n", + "20553 98.0\n", + "21331 98.0\n", + "21787 98.0\n", + "22577 98.0\n", + "22861 98.0\n", + "23103 98.0\n", + "23280 98.0\n", + "23678 98.0\n", + "23711 98.0\n", + "23872 98.0\n", + "25400 98.0\n", + "26363 98.0\n", + "27079 98.0\n", + "27203 98.0\n", + "27752 98.0\n", + "27846 98.0\n", + "30138 98.0\n", + "30508 98.0\n", + "30529 98.0\n", + "30991 98.0\n", + "32295 98.0\n", + "32781 98.0\n", + "32816 98.0\n", + "33329 98.0\n", + "34078 98.0\n", + "34574 98.0\n", + "35873 98.0\n", + "36613 98.0\n", + "36631 98.0\n", + "36688 98.0\n", + "37074 98.0\n", + "37786 98.0\n", + "38072 98.0\n", + "Name: ViewCount, dtype: float64\n", + "Series([], Name: ViewCount, dtype: float64)\n", + "0.0 23572\n", + "98.0 
43\n", + "150.0 43\n", + "122.0 42\n", + "108.0 41\n", + " ... \n", + "1664.0 1\n", + "1337.0 1\n", + "64481.0 1\n", + "2313.0 1\n", + "3357.0 1\n", + "Name: ViewCount, Length: 3403, dtype: int64\n" + ] + } + ], + "source": [ + "print(users_posts['ViewCount'].value_counts())\n", + "print(users_posts['ViewCount'][users_posts['ViewCount'] == 98])\n", + "print(users_posts['ViewCount'][users_posts['ViewCount'] == 0])\n", + "\n", + "users_posts['ViewCount'].fillna(0, inplace=True)\n", + "print(users_posts['ViewCount'].value_counts())" + ] }, { "cell_type": "markdown", @@ -137,6 +366,36 @@ "#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? " ] }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "userId int64\n", + "Reputation int64\n", + "Views int64\n", + "UpVotes int64\n", + "DownVotes int64\n", + "postId int64\n", + "Score int64\n", + "ViewCount int64\n", + "CommentCount int64\n", + "dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_posts = users_posts.astype({'ViewCount': 'int64'})\n", + "users_posts.dtypes" + ] + }, { "cell_type": "code", "execution_count": null, @@ -147,7 +406,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -161,7 +420,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.7" } }, "nbformat": 4, diff --git a/weather-raw.csv b/your-code/weather-raw.csv similarity index 100% rename from weather-raw.csv rename to your-code/weather-raw.csv diff --git a/your-code/weather.ipynb b/your-code/weather.ipynb index 4fc40ab..5c8dc9e 100644 --- a/your-code/weather.ipynb +++ b/your-code/weather.ipynb @@ -11,17 +11,216 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 
1, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd\n", + "# NOTE: weather-raw.csv was moved from the repo root into your-code/ (next to this notebook) so the relative path passed to read_csv resolves" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idyearmonthelementd1d2d3d4d5d6...d22d23d24d25d26d27d28d29d30d31
0MX1700420101tmaxNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN27.8NaN
1MX1700420101tminNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN14.5NaN
2MX1700420102tmaxNaN27.324.1NaNNaNNaN...NaN29.9NaNNaNNaNNaNNaNNaNNaNNaN
3MX1700420102tminNaN14.414.4NaNNaNNaN...NaN10.7NaNNaNNaNNaNNaNNaNNaNNaN
4MX1700420103tmaxNaNNaNNaNNaN32.1NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 35 columns

\n", + "
" + ], + "text/plain": [ + " id year month element d1 d2 d3 d4 d5 d6 ... d22 d23 \\\n", + "0 MX17004 2010 1 tmax NaN NaN NaN NaN NaN NaN ... NaN NaN \n", + "1 MX17004 2010 1 tmin NaN NaN NaN NaN NaN NaN ... NaN NaN \n", + "2 MX17004 2010 2 tmax NaN 27.3 24.1 NaN NaN NaN ... NaN 29.9 \n", + "3 MX17004 2010 2 tmin NaN 14.4 14.4 NaN NaN NaN ... NaN 10.7 \n", + "4 MX17004 2010 3 tmax NaN NaN NaN NaN 32.1 NaN ... NaN NaN \n", + "\n", + " d24 d25 d26 d27 d28 d29 d30 d31 \n", + "0 NaN NaN NaN NaN NaN NaN 27.8 NaN \n", + "1 NaN NaN NaN NaN NaN NaN 14.5 NaN \n", + "2 NaN NaN NaN NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[5 rows x 35 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "weather = pd.read_csv('weather-raw.csv')\n", + "weather.head()" + ] }, { "cell_type": "code", @@ -47,7 +246,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -61,7 +260,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.9.7" } }, "nbformat": 4,