diff --git a/your-code/main.ipynb b/your-code/main.ipynb index bad6d94..ef95e3d 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -9,10 +9,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from scipy import stats" + ] }, { "cell_type": "markdown", @@ -23,10 +27,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "(40325, 14)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users = pd.read_csv('users_table.csv')\n", + "users.shape" + ] }, { "cell_type": "markdown", @@ -37,10 +55,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId int64\n", + "Reputation int64\n", + "CreationDate object\n", + "DisplayName object\n", + "LastAccessDate object\n", + "WebsiteUrl object\n", + "Location object\n", + "AboutMe object\n", + "Views int64\n", + "UpVotes int64\n", + "DownVotes int64\n", + "AccountId int64\n", + "Age float64\n", + "ProfileImageUrl object\n", + "dtype: object" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users.rename(columns={'Id': 'userId'}, inplace= True)\n", + "users.dtypes" + ] }, { "cell_type": "markdown", @@ -51,10 +97,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "Id int64\n", + "PostTypeId int64\n", + "AcceptedAnswerId float64\n", + "CreaionDate object\n", + "Score int64\n", + "ViewCount float64\n", + "Body object\n", + "OwnerUserId float64\n", + "LasActivityDate object\n", + "Title object\n", + "Tags object\n", + "AnswerCount float64\n", + "CommentCount int64\n", + "FavoriteCount float64\n", + "LastEditorUserId float64\n", + "LastEditDate object\n", + "CommunityOwnedDate object\n", + "ParentId float64\n", + "ClosedDate object\n", + "OwnerDisplayName object\n", + "LastEditorDisplayName object\n", + "dtype: object" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts = pd.read_csv('posts_table.csv')\n", + "posts.dtypes" + ] }, { "cell_type": "markdown", @@ -65,10 +146,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "posts.rename(columns= {'Id': 'postId', 'OwnerUserId': 'userId'}, inplace= True)" + ] }, { "cell_type": "markdown", @@ -81,10 +164,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "users = users[['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']]\n", + "posts = posts[['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']]\n" + ] }, { "cell_type": "markdown", @@ -96,10 +182,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId int64\n", + "Reputation int64\n", + "Views int64\n", + "UpVotes int64\n", + "DownVotes int64\n", + "dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(38962, 9)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_posts = users.merge(posts, on='userId')\n", + "users_posts.shape" + ] }, { "cell_type": "markdown", @@ -110,10 +235,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "userId 0\n", + "Reputation 0\n", + "Views 0\n", + "UpVotes 0\n", + "DownVotes 0\n", + "postId 0\n", + "Score 0\n", + "ViewCount 23572\n", + "CommentCount 0\n", + "dtype: int64\n", + "Index(['postId', 'Score', 'userId', 'ViewCount', 'CommentCount'], dtype='object')\n" + ] + } + ], + "source": [ + "print(users_posts.isnull().sum())\n", + "print(posts.columns) # The large number indicates the missing values " + ] }, { "cell_type": "markdown", @@ -125,10 +271,93 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "98.0 43\n", + "150.0 43\n", + "122.0 42\n", + "156.0 41\n", + "108.0 41\n", + " ..\n", + "2313.0 1\n", + "6113.0 1\n", + "3463.0 1\n", + "3531.0 1\n", + "3357.0 1\n", + "Name: ViewCount, Length: 3402, dtype: int64\n", + "3783 98.0\n", + "4962 98.0\n", + "9146 98.0\n", + "10242 98.0\n", + "11610 98.0\n", + "12399 98.0\n", + "18635 98.0\n", + "19165 98.0\n", + "19188 98.0\n", + "20384 98.0\n", + "20553 98.0\n", + "21331 98.0\n", + "21787 98.0\n", + "22577 98.0\n", + "22861 98.0\n", + "23103 98.0\n", + "23280 98.0\n", + "23678 98.0\n", + "23711 98.0\n", + "23872 98.0\n", + "25400 98.0\n", + "26363 98.0\n", + "27079 98.0\n", + "27203 98.0\n", + "27752 98.0\n", + "27846 98.0\n", + "30138 98.0\n", + "30508 98.0\n", + "30529 98.0\n", + "30991 98.0\n", + "32295 98.0\n", + "32781 98.0\n", + "32816 98.0\n", + "33329 98.0\n", + "34078 98.0\n", + "34574 98.0\n", + "35873 98.0\n", + "36613 98.0\n", + "36631 98.0\n", + "36688 98.0\n", + "37074 98.0\n", + "37786 98.0\n", + "38072 98.0\n", + "Name: ViewCount, dtype: float64\n", + "Series([], Name: ViewCount, dtype: float64)\n", + "0.0 23572\n", + "98.0 43\n", + "150.0 43\n", + "122.0 42\n", + "108.0 41\n", + " ... \n", + "1664.0 1\n", + "1337.0 1\n", + "64481.0 1\n", + "2313.0 1\n", + "3357.0 1\n", + "Name: ViewCount, Length: 3403, dtype: int64\n" + ] + } + ], + "source": [ + "print(users_posts['ViewCount'].value_counts())\n", + "print(users_posts['ViewCount'][users_posts['ViewCount'] == 98])\n", + "print(users_posts['ViewCount'][users_posts['ViewCount'] == 0])\n", + "\n", + "users_posts['ViewCount'].fillna(0, inplace=True)\n", + "print(users_posts['ViewCount'].value_counts())" + ] }, { "cell_type": "markdown", @@ -137,6 +366,36 @@ "#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? " ] }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "userId int64\n", + "Reputation int64\n", + "Views int64\n", + "UpVotes int64\n", + "DownVotes int64\n", + "postId int64\n", + "Score int64\n", + "ViewCount int64\n", + "CommentCount int64\n", + "dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_posts = users_posts.astype({'ViewCount': 'int64'})\n", + "users_posts.dtypes" + ] + }, { "cell_type": "code", "execution_count": null, @@ -147,7 +406,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -161,7 +420,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.7" } }, "nbformat": 4, diff --git a/weather-raw.csv b/your-code/weather-raw.csv similarity index 100% rename from weather-raw.csv rename to your-code/weather-raw.csv diff --git a/your-code/weather.ipynb b/your-code/weather.ipynb index 4fc40ab..5c8dc9e 100644 --- a/your-code/weather.ipynb +++ b/your-code/weather.ipynb @@ -11,17 +11,216 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd\n", + "#for the code to work I moved the weather file to the main folder" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | id | \n", + "year | \n", + "month | \n", + "element | \n", + "d1 | \n", + "d2 | \n", + "d3 | \n", + "d4 | \n", + "d5 | \n", + "d6 | \n", + "... | \n", + "d22 | \n", + "d23 | \n", + "d24 | \n", + "d25 | \n", + "d26 | \n", + "d27 | \n", + "d28 | \n", + "d29 | \n", + "d30 | \n", + "d31 | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "MX17004 | \n", + "2010 | \n", + "1 | \n", + "tmax | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "27.8 | \n", + "NaN | \n", + "
| 1 | \n", + "MX17004 | \n", + "2010 | \n", + "1 | \n", + "tmin | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "14.5 | \n", + "NaN | \n", + "
| 2 | \n", + "MX17004 | \n", + "2010 | \n", + "2 | \n", + "tmax | \n", + "NaN | \n", + "27.3 | \n", + "24.1 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "... | \n", + "NaN | \n", + "29.9 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 3 | \n", + "MX17004 | \n", + "2010 | \n", + "2 | \n", + "tmin | \n", + "NaN | \n", + "14.4 | \n", + "14.4 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "... | \n", + "NaN | \n", + "10.7 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 4 | \n", + "MX17004 | \n", + "2010 | \n", + "3 | \n", + "tmax | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "32.1 | \n", + "NaN | \n", + "... | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
5 rows × 35 columns
\n", + "