diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..b56a444 Binary files /dev/null and b/.DS_Store differ diff --git a/your-code/.DS_Store b/your-code/.DS_Store new file mode 100644 index 0000000..40fe872 Binary files /dev/null and b/your-code/.DS_Store differ diff --git a/your-code/.ipynb_checkpoints/main-checkpoint.ipynb b/your-code/.ipynb_checkpoints/main-checkpoint.ipynb new file mode 100644 index 0000000..4a983fc --- /dev/null +++ b/your-code/.ipynb_checkpoints/main-checkpoint.ipynb @@ -0,0 +1,1324 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1. Import pandas library" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Import users table:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | Id | \n", + "Reputation | \n", + "CreationDate | \n", + "DisplayName | \n", + "LastAccessDate | \n", + "WebsiteUrl | \n", + "Location | \n", + "AboutMe | \n", + "Views | \n", + "UpVotes | \n", + "DownVotes | \n", + "AccountId | \n", + "Age | \n", + "ProfileImageUrl | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "-1 | \n", + "1 | \n", + "2010-07-19 06:55:26 | \n", + "Community | \n", + "2010-07-19 06:55:26 | \n", + "http://meta.stackexchange.com/ | \n", + "on the server farm | \n", + "<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>... | \n", + "0 | \n", + "5007 | \n", + "1920 | \n", + "-1 | \n", + "NaN | \n", + "NaN | \n", + "
| 1 | \n", + "2 | \n", + "101 | \n", + "2010-07-19 14:01:36 | \n", + "Geoff Dalgas | \n", + "2013-11-12 22:07:23 | \n", + "http://stackoverflow.com | \n", + "Corvallis, OR | \n", + "<p>Developer on the StackOverflow team. Find ... | \n", + "25 | \n", + "3 | \n", + "0 | \n", + "2 | \n", + "37.0 | \n", + "NaN | \n", + "
| 2 | \n", + "3 | \n", + "101 | \n", + "2010-07-19 15:34:50 | \n", + "Jarrod Dixon | \n", + "2014-08-08 06:42:58 | \n", + "http://stackoverflow.com | \n", + "New York, NY | \n", + "<p><a href=\"http://blog.stackoverflow.com/2009... | \n", + "22 | \n", + "19 | \n", + "0 | \n", + "3 | \n", + "35.0 | \n", + "NaN | \n", + "
| 3 | \n", + "4 | \n", + "101 | \n", + "2010-07-19 19:03:27 | \n", + "Emmett | \n", + "2014-01-02 09:31:02 | \n", + "http://minesweeperonline.com | \n", + "San Francisco, CA | \n", + "<p>currently at a startup in SF</p>\\r\\n\\r\\n<p>... | \n", + "11 | \n", + "0 | \n", + "0 | \n", + "1998 | \n", + "28.0 | \n", + "http://i.stack.imgur.com/d1oHX.jpg | \n", + "
| 4 | \n", + "5 | \n", + "6792 | \n", + "2010-07-19 19:03:57 | \n", + "Shane | \n", + "2014-08-13 00:23:47 | \n", + "http://www.statalgo.com | \n", + "New York, NY | \n", + "<p>Quantitative researcher focusing on statist... | \n", + "1145 | \n", + "662 | \n", + "5 | \n", + "54503 | \n", + "35.0 | \n", + "NaN | \n", + "
Hi, I'm not really a person.
\\r\\n\\r\\n... 0 5007 \n", + "1
Developer on the StackOverflow team. Find ... 25 3 \n", + "2
\\r\\n\\r\\n... 11 0 \n", + "4
Quantitative researcher focusing on statist... 1145 662 \n", + "\n", + " DownVotes AccountId Age ProfileImageUrl \n", + "0 1920 -1 NaN NaN \n", + "1 0 2 37.0 NaN \n", + "2 0 3 35.0 NaN \n", + "3 0 1998 28.0 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 5 54503 35.0 NaN " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_table=pd.read_csv(\"users_table.csv\")\n", + "users_table.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. Rename Id column to userId" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | userId | \n", + "Reputation | \n", + "CreationDate | \n", + "DisplayName | \n", + "LastAccessDate | \n", + "WebsiteUrl | \n", + "Location | \n", + "AboutMe | \n", + "Views | \n", + "UpVotes | \n", + "DownVotes | \n", + "AccountId | \n", + "Age | \n", + "ProfileImageUrl | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "-1 | \n", + "1 | \n", + "2010-07-19 06:55:26 | \n", + "Community | \n", + "2010-07-19 06:55:26 | \n", + "http://meta.stackexchange.com/ | \n", + "on the server farm | \n", + "<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>... | \n", + "0 | \n", + "5007 | \n", + "1920 | \n", + "-1 | \n", + "NaN | \n", + "NaN | \n", + "
| 1 | \n", + "2 | \n", + "101 | \n", + "2010-07-19 14:01:36 | \n", + "Geoff Dalgas | \n", + "2013-11-12 22:07:23 | \n", + "http://stackoverflow.com | \n", + "Corvallis, OR | \n", + "<p>Developer on the StackOverflow team. Find ... | \n", + "25 | \n", + "3 | \n", + "0 | \n", + "2 | \n", + "37.0 | \n", + "NaN | \n", + "
| 2 | \n", + "3 | \n", + "101 | \n", + "2010-07-19 15:34:50 | \n", + "Jarrod Dixon | \n", + "2014-08-08 06:42:58 | \n", + "http://stackoverflow.com | \n", + "New York, NY | \n", + "<p><a href=\"http://blog.stackoverflow.com/2009... | \n", + "22 | \n", + "19 | \n", + "0 | \n", + "3 | \n", + "35.0 | \n", + "NaN | \n", + "
| 3 | \n", + "4 | \n", + "101 | \n", + "2010-07-19 19:03:27 | \n", + "Emmett | \n", + "2014-01-02 09:31:02 | \n", + "http://minesweeperonline.com | \n", + "San Francisco, CA | \n", + "<p>currently at a startup in SF</p>\\r\\n\\r\\n<p>... | \n", + "11 | \n", + "0 | \n", + "0 | \n", + "1998 | \n", + "28.0 | \n", + "http://i.stack.imgur.com/d1oHX.jpg | \n", + "
| 4 | \n", + "5 | \n", + "6792 | \n", + "2010-07-19 19:03:57 | \n", + "Shane | \n", + "2014-08-13 00:23:47 | \n", + "http://www.statalgo.com | \n", + "New York, NY | \n", + "<p>Quantitative researcher focusing on statist... | \n", + "1145 | \n", + "662 | \n", + "5 | \n", + "54503 | \n", + "35.0 | \n", + "NaN | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 40320 | \n", + "55743 | \n", + "1 | \n", + "2014-09-13 21:03:50 | \n", + "AussieMeg | \n", + "2014-09-13 21:18:52 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "5026902 | \n", + "NaN | \n", + "http://graph.facebook.com/665821703/picture?ty... | \n", + "
| 40321 | \n", + "55744 | \n", + "6 | \n", + "2014-09-13 21:39:30 | \n", + "Mia Maria | \n", + "2014-09-13 21:39:30 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "5026998 | \n", + "NaN | \n", + "NaN | \n", + "
| 40322 | \n", + "55745 | \n", + "101 | \n", + "2014-09-13 23:45:27 | \n", + "tronbabylove | \n", + "2014-09-13 23:45:27 | \n", + "NaN | \n", + "United States | \n", + "NaN | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "481766 | \n", + "NaN | \n", + "https://www.gravatar.com/avatar/faa7a3fdbd8308... | \n", + "
| 40323 | \n", + "55746 | \n", + "106 | \n", + "2014-09-14 00:29:41 | \n", + "GPP | \n", + "2014-09-14 02:05:17 | \n", + "NaN | \n", + "NaN | \n", + "<p>Stats noobie, product, marketing & medi... | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "976289 | \n", + "NaN | \n", + "https://www.gravatar.com/avatar/6d9e9fa6b783a3... | \n", + "
| 40324 | \n", + "55747 | \n", + "1 | \n", + "2014-09-14 01:01:44 | \n", + "Shivam Agrawal | \n", + "2014-09-14 01:19:04 | \n", + "NaN | \n", + "India | \n", + "<p>Maths Enthusiast </p>\\r\\n | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "5027354 | \n", + "NaN | \n", + "https://lh4.googleusercontent.com/-ZsXhwVaFmiY... | \n", + "
40325 rows × 14 columns
\n", + "Hi, I'm not really a person.
\\r\\n\\r\\n... \n", + "1 Corvallis, OR
Developer on the StackOverflow team. Find ... \n", + "2 New York, NY
\\r\\n\\r\\n... \n", + "4 New York, NY
Quantitative researcher focusing on statist... \n", + "... ... ... \n", + "40320 NaN NaN \n", + "40321 NaN NaN \n", + "40322 United States NaN \n", + "40323 NaN
Stats noobie, product, marketing & medi... \n", + "40324 India
Maths Enthusiast
\\r\\n \n", + "\n", + " Views UpVotes DownVotes AccountId Age \\\n", + "0 0 5007 1920 -1 NaN \n", + "1 25 3 0 2 37.0 \n", + "2 22 19 0 3 35.0 \n", + "3 11 0 0 1998 28.0 \n", + "4 1145 662 5 54503 35.0 \n", + "... ... ... ... ... ... \n", + "40320 0 0 0 5026902 NaN \n", + "40321 1 0 0 5026998 NaN \n", + "40322 0 0 0 481766 NaN \n", + "40323 1 0 0 976289 NaN \n", + "40324 0 0 0 5027354 NaN \n", + "\n", + " ProfileImageUrl \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 NaN \n", + "... ... \n", + "40320 http://graph.facebook.com/665821703/picture?ty... \n", + "40321 NaN \n", + "40322 https://www.gravatar.com/avatar/faa7a3fdbd8308... \n", + "40323 https://www.gravatar.com/avatar/6d9e9fa6b783a3... \n", + "40324 https://lh4.googleusercontent.com/-ZsXhwVaFmiY... \n", + "\n", + "[40325 rows x 14 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_table=users_table.rename(columns={\"Id\":\"userId\"})\n", + "users_table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. Import posts table:" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "| \n", + " | Id | \n", + "PostTypeId | \n", + "AcceptedAnswerId | \n", + "CreaionDate | \n", + "Score | \n", + "ViewCount | \n", + "Body | \n", + "OwnerUserId | \n", + "LasActivityDate | \n", + "Title | \n", + "... | \n", + "AnswerCount | \n", + "CommentCount | \n", + "FavoriteCount | \n", + "LastEditorUserId | \n", + "LastEditDate | \n", + "CommunityOwnedDate | \n", + "ParentId | \n", + "ClosedDate | \n", + "OwnerDisplayName | \n", + "LastEditorDisplayName | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "1 | \n", + "1 | \n", + "15.0 | \n", + "2010-07-19 19:12:12 | \n", + "23 | \n", + "1278.0 | \n", + "<p>How should I elicit prior distributions fro... | \n", + "8.0 | \n", + "2010-09-15 21:08:26 | \n", + "Eliciting priors from experts | \n", + "... | \n", + "5.0 | \n", + "1 | \n", + "14.0 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
1 rows × 21 columns
\n", + "How should I elicit prior distributions fro... 8.0 \n", + "\n", + " LasActivityDate Title ... AnswerCount \\\n", + "0 2010-09-15 21:08:26 Eliciting priors from experts ... 5.0 \n", + "\n", + " CommentCount FavoriteCount LastEditorUserId LastEditDate \\\n", + "0 1 14.0 NaN NaN \n", + "\n", + " CommunityOwnedDate ParentId ClosedDate OwnerDisplayName \\\n", + "0 NaN NaN NaN NaN \n", + "\n", + " LastEditorDisplayName \n", + "0 NaN \n", + "\n", + "[1 rows x 21 columns]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts_table=pd.read_csv(\"posts_table.csv\")\n", + "posts_table.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5. Rename Id column to postId and OwnerUserId to userId" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | postId | \n", + "PostTypeId | \n", + "AcceptedAnswerId | \n", + "CreaionDate | \n", + "Score | \n", + "ViewCount | \n", + "Body | \n", + "userId | \n", + "LasActivityDate | \n", + "Title | \n", + "... | \n", + "AnswerCount | \n", + "CommentCount | \n", + "FavoriteCount | \n", + "LastEditorUserId | \n", + "LastEditDate | \n", + "CommunityOwnedDate | \n", + "ParentId | \n", + "ClosedDate | \n", + "OwnerDisplayName | \n", + "LastEditorDisplayName | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "1 | \n", + "1 | \n", + "15.0 | \n", + "2010-07-19 19:12:12 | \n", + "23 | \n", + "1278.0 | \n", + "<p>How should I elicit prior distributions fro... | \n", + "8.0 | \n", + "2010-09-15 21:08:26 | \n", + "Eliciting priors from experts | \n", + "... | \n", + "5.0 | \n", + "1 | \n", + "14.0 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 1 | \n", + "2 | \n", + "1 | \n", + "59.0 | \n", + "2010-07-19 19:12:57 | \n", + "22 | \n", + "8198.0 | \n", + "<p>In many different statistical methods there... | \n", + "24.0 | \n", + "2012-11-12 09:21:54 | \n", + "What is normality? | \n", + "... | \n", + "7.0 | \n", + "1 | \n", + "8.0 | \n", + "88.0 | \n", + "2010-08-07 17:56:44 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 2 | \n", + "3 | \n", + "1 | \n", + "5.0 | \n", + "2010-07-19 19:13:28 | \n", + "54 | \n", + "3613.0 | \n", + "<p>What are some valuable Statistical Analysis... | \n", + "18.0 | \n", + "2013-05-27 14:48:36 | \n", + "What are some valuable Statistical Analysis op... | \n", + "... | \n", + "19.0 | \n", + "4 | \n", + "36.0 | \n", + "183.0 | \n", + "2011-02-12 05:50:03 | \n", + "2010-07-19 19:13:28 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 3 | \n", + "4 | \n", + "1 | \n", + "135.0 | \n", + "2010-07-19 19:13:31 | \n", + "13 | \n", + "5224.0 | \n", + "<p>I have two groups of data. Each with a dif... | \n", + "23.0 | \n", + "2010-09-08 03:00:19 | \n", + "Assessing the significance of differences in d... | \n", + "... | \n", + "5.0 | \n", + "2 | \n", + "2.0 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 4 | \n", + "5 | \n", + "2 | \n", + "NaN | \n", + "2010-07-19 19:14:43 | \n", + "81 | \n", + "NaN | \n", + "<p>The R-project</p>\\n\\n<p><a href=\"http://www... | \n", + "23.0 | \n", + "2010-07-19 19:21:15 | \n", + "NaN | \n", + "... | \n", + "NaN | \n", + "3 | \n", + "NaN | \n", + "23.0 | \n", + "2010-07-19 19:21:15 | \n", + "2010-07-19 19:14:43 | \n", + "3.0 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
5 rows × 21 columns
\n", + "How should I elicit prior distributions fro... 8.0 \n", + "1 8198.0
In many different statistical methods there... 24.0 \n", + "2 3613.0
What are some valuable Statistical Analysis... 18.0 \n", + "3 5224.0
I have two groups of data. Each with a dif... 23.0 \n", + "4 NaN
The R-project
\\n\\n| \n", + " | userId | \n", + "Reputation | \n", + "Views | \n", + "UpVotes | \n", + "DownVotes | \n", + "postId | \n", + "Score | \n", + "ViewCount | \n", + "CommentCount | \n", + "
|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "-1.0 | \n", + "1.0 | \n", + "0.0 | \n", + "5007.0 | \n", + "1920.0 | \n", + "2175.0 | \n", + "0.0 | \n", + "NaN | \n", + "0.0 | \n", + "
| 1 | \n", + "-1.0 | \n", + "1.0 | \n", + "0.0 | \n", + "5007.0 | \n", + "1920.0 | \n", + "8576.0 | \n", + "0.0 | \n", + "NaN | \n", + "0.0 | \n", + "
| 2 | \n", + "-1.0 | \n", + "1.0 | \n", + "0.0 | \n", + "5007.0 | \n", + "1920.0 | \n", + "8578.0 | \n", + "0.0 | \n", + "NaN | \n", + "0.0 | \n", + "
| 3 | \n", + "-1.0 | \n", + "1.0 | \n", + "0.0 | \n", + "5007.0 | \n", + "1920.0 | \n", + "8981.0 | \n", + "0.0 | \n", + "NaN | \n", + "0.0 | \n", + "
| 4 | \n", + "-1.0 | \n", + "1.0 | \n", + "0.0 | \n", + "5007.0 | \n", + "1920.0 | \n", + "8982.0 | \n", + "0.0 | \n", + "NaN | \n", + "0.0 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 72182 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "46117.0 | \n", + "0.0 | \n", + "355.0 | \n", + "0.0 | \n", + "
| 72183 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "46260.0 | \n", + "5.0 | \n", + "145.0 | \n", + "5.0 | \n", + "
| 72184 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "46836.0 | \n", + "3.0 | \n", + "406.0 | \n", + "0.0 | \n", + "
| 72185 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "46892.0 | \n", + "1.0 | \n", + "NaN | \n", + "0.0 | \n", + "
| 72186 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "47308.0 | \n", + "2.0 | \n", + "129.0 | \n", + "0.0 | \n", + "
72187 rows × 9 columns
\n", + "" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes postId Score \\\n", + "0 -1.0 1.0 0.0 5007.0 1920.0 2175.0 0.0 \n", + "1 -1.0 1.0 0.0 5007.0 1920.0 8576.0 0.0 \n", + "2 -1.0 1.0 0.0 5007.0 1920.0 8578.0 0.0 \n", + "3 -1.0 1.0 0.0 5007.0 1920.0 8981.0 0.0 \n", + "4 -1.0 1.0 0.0 5007.0 1920.0 8982.0 0.0 \n", + "... ... ... ... ... ... ... ... \n", + "72182 NaN NaN NaN NaN NaN 46117.0 0.0 \n", + "72183 NaN NaN NaN NaN NaN 46260.0 5.0 \n", + "72184 NaN NaN NaN NaN NaN 46836.0 3.0 \n", + "72185 NaN NaN NaN NaN NaN 46892.0 1.0 \n", + "72186 NaN NaN NaN NaN NaN 47308.0 2.0 \n", + "\n", + " ViewCount CommentCount \n", + "0 NaN 0.0 \n", + "1 NaN 0.0 \n", + "2 NaN 0.0 \n", + "3 NaN 0.0 \n", + "4 NaN 0.0 \n", + "... ... ... \n", + "72182 355.0 0.0 \n", + "72183 145.0 5.0 \n", + "72184 406.0 0.0 \n", + "72185 NaN 0.0 \n", + "72186 129.0 0.0 \n", + "\n", + "[72187 rows x 9 columns]" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a=users_new.shape\n", + "a1=users_new[\"userId\"].nunique()\n", + "b=posts_new.shape\n", + "b1=posts_new[\"userId\"].nunique() # get the number of unique value almost double numbers of rows, which is weird\n", + "print(a,a1)\n", + "print(b,b1)\n", + "data_merge=users_new.merge(posts_new,how=\"outer\",on=\"userId\") \n", + "\n", + "print(data_merge.shape)\n", + "data_merge" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 8. How many missing values do you have in your merged dataframe? On which columns?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "userId 1038\n", + "Reputation 1038\n", + "Views 1038\n", + "UpVotes 1038\n", + "DownVotes 1038\n", + "postId 32187\n", + "Score 32187\n", + "ViewCount 56292\n", + "CommentCount 32187\n", + "dtype: int64" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_merge.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 9. You will need to make something with missing values. Will you clean or filling them? Explain. \n", + "**Remember** to check the results of your code before passing to the next step" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(71149, 9)\n", + " userId Reputation Views UpVotes DownVotes postId Score ViewCount \\\n", + "0 -1.0 1.0 0.0 5007.0 1920.0 2175.0 0.0 NaN \n", + "1 -1.0 1.0 0.0 5007.0 1920.0 8576.0 0.0 NaN \n", + "2 -1.0 1.0 0.0 5007.0 1920.0 8578.0 0.0 NaN \n", + "3 -1.0 1.0 0.0 5007.0 1920.0 8981.0 0.0 NaN \n", + "4 -1.0 1.0 0.0 5007.0 1920.0 8982.0 0.0 NaN \n", + "\n", + " CommentCount \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "(58584, 7)\n", + " userId Reputation Views UpVotes DownVotes Score CommentCount\n", + "0 -1.0 1.0 0.0 5007.0 1920.0 0.0 0.0\n", + "123 2.0 101.0 25.0 3.0 0.0 NaN NaN\n", + "124 3.0 101.0 22.0 19.0 0.0 NaN NaN\n", + "125 4.0 101.0 11.0 0.0 0.0 NaN NaN\n", + "126 5.0 6792.0 1145.0 662.0 5.0 152.0 5.0\n" + ] + } + ], + "source": [ + "#Drop rows with missing userID, without userId the data is not valid\n", + "data_validId =data_merge.dropna(axis=0, subset=['userId'])\n", + "print(data_validId.shape)\n", + "print(data_validId.head())\n", + "#PostId is not accurate with one user has many different post ID, and half of the table missing postID 
-> remove postID\n", + "#Remove ViewCount because most of info is missing ( 56292 out of 70K row)\n", + "data_validId2=data_validId.drop([\"postId\",\"ViewCount\"],axis=1)\n", + "data_validId2=data_validId2.drop_duplicates()\n", + "print(data_validId2.shape)\n", + "print(data_validId2.head())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? " + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " userId Reputation Views UpVotes DownVotes Score CommentCount\n", + "0 -1.0 1.0 0.0 5007.0 1920.0 0.0 0.0\n", + "123 2.0 101.0 25.0 3.0 0.0 NaN NaN\n", + "124 3.0 101.0 22.0 19.0 0.0 NaN NaN\n", + "125 4.0 101.0 11.0 0.0 0.0 NaN NaN\n", + "126 5.0 6792.0 1145.0 662.0 5.0 152.0 5.0\n" + ] + }, + { + "data": { + "text/plain": [ + "userId object\n", + "Reputation object\n", + "Views float64\n", + "UpVotes float64\n", + "DownVotes float64\n", + "Score float64\n", + "CommentCount float64\n", + "dtype: object" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#userID, reputation should be string\n", + "convert_str = {'userId': str,'Reputation': str}\n", + "data_validId2 = data_validId2.astype(convert_str)\n", + "print(data_validId2.head())\n", + "data_validId2.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + 
"nbformat_minor": 2 +} diff --git a/your-code/.posts_table.csv.icloud b/your-code/.posts_table.csv.icloud new file mode 100644 index 0000000..06e51c5 Binary files /dev/null and b/your-code/.posts_table.csv.icloud differ diff --git a/your-code/.users_table.csv.icloud b/your-code/.users_table.csv.icloud new file mode 100644 index 0000000..a7132c5 Binary files /dev/null and b/your-code/.users_table.csv.icloud differ diff --git a/your-code/datasets.rar b/your-code/datasets.rar deleted file mode 100644 index db8661c..0000000 Binary files a/your-code/datasets.rar and /dev/null differ diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 7900997..4a983fc 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -1,169 +1,1324 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 1. Import pandas library" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2. Import users table:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 3. Rename Id column to userId" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 4. Import posts table:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 5. Rename Id column to postId and OwnerUserId to userId" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 6. 
Define new dataframes for users and posts with the following selected columns:\n", - " **users columns**: userId, Reputation,Views,UpVotes,DownVotes\n", - " **posts columns**: postId, Score,userId,ViewCount,CommentCount" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 7. Merge both dataframes, users and posts. \n", - "You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 8. How many missing values do you have in your merged dataframe? On which columns?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 9. You will need to make something with missing values. Will you clean or filling them? Explain. \n", - "**Remember** to check the results of your code before passing to the next step" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1. Import pandas library" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Import users table:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "| \n", + " | Id | \n", + "Reputation | \n", + "CreationDate | \n", + "DisplayName | \n", + "LastAccessDate | \n", + "WebsiteUrl | \n", + "Location | \n", + "AboutMe | \n", + "Views | \n", + "UpVotes | \n", + "DownVotes | \n", + "AccountId | \n", + "Age | \n", + "ProfileImageUrl | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "-1 | \n", + "1 | \n", + "2010-07-19 06:55:26 | \n", + "Community | \n", + "2010-07-19 06:55:26 | \n", + "http://meta.stackexchange.com/ | \n", + "on the server farm | \n", + "<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>... | \n", + "0 | \n", + "5007 | \n", + "1920 | \n", + "-1 | \n", + "NaN | \n", + "NaN | \n", + "
| 1 | \n", + "2 | \n", + "101 | \n", + "2010-07-19 14:01:36 | \n", + "Geoff Dalgas | \n", + "2013-11-12 22:07:23 | \n", + "http://stackoverflow.com | \n", + "Corvallis, OR | \n", + "<p>Developer on the StackOverflow team. Find ... | \n", + "25 | \n", + "3 | \n", + "0 | \n", + "2 | \n", + "37.0 | \n", + "NaN | \n", + "
| 2 | \n", + "3 | \n", + "101 | \n", + "2010-07-19 15:34:50 | \n", + "Jarrod Dixon | \n", + "2014-08-08 06:42:58 | \n", + "http://stackoverflow.com | \n", + "New York, NY | \n", + "<p><a href=\"http://blog.stackoverflow.com/2009... | \n", + "22 | \n", + "19 | \n", + "0 | \n", + "3 | \n", + "35.0 | \n", + "NaN | \n", + "
| 3 | \n", + "4 | \n", + "101 | \n", + "2010-07-19 19:03:27 | \n", + "Emmett | \n", + "2014-01-02 09:31:02 | \n", + "http://minesweeperonline.com | \n", + "San Francisco, CA | \n", + "<p>currently at a startup in SF</p>\\r\\n\\r\\n<p>... | \n", + "11 | \n", + "0 | \n", + "0 | \n", + "1998 | \n", + "28.0 | \n", + "http://i.stack.imgur.com/d1oHX.jpg | \n", + "
| 4 | \n", + "5 | \n", + "6792 | \n", + "2010-07-19 19:03:57 | \n", + "Shane | \n", + "2014-08-13 00:23:47 | \n", + "http://www.statalgo.com | \n", + "New York, NY | \n", + "<p>Quantitative researcher focusing on statist... | \n", + "1145 | \n", + "662 | \n", + "5 | \n", + "54503 | \n", + "35.0 | \n", + "NaN | \n", + "
Hi, I'm not really a person.
\\r\\n\\r\\n... 0 5007 \n", + "1
Developer on the StackOverflow team. Find ... 25 3 \n", + "2
\\r\\n\\r\\n... 11 0 \n", + "4
Quantitative researcher focusing on statist... 1145 662 \n", + "\n", + " DownVotes AccountId Age ProfileImageUrl \n", + "0 1920 -1 NaN NaN \n", + "1 0 2 37.0 NaN \n", + "2 0 3 35.0 NaN \n", + "3 0 1998 28.0 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 5 54503 35.0 NaN " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_table=pd.read_csv(\"users_table.csv\")\n", + "users_table.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. Rename Id column to userId" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | userId | \n", + "Reputation | \n", + "CreationDate | \n", + "DisplayName | \n", + "LastAccessDate | \n", + "WebsiteUrl | \n", + "Location | \n", + "AboutMe | \n", + "Views | \n", + "UpVotes | \n", + "DownVotes | \n", + "AccountId | \n", + "Age | \n", + "ProfileImageUrl | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "-1 | \n", + "1 | \n", + "2010-07-19 06:55:26 | \n", + "Community | \n", + "2010-07-19 06:55:26 | \n", + "http://meta.stackexchange.com/ | \n", + "on the server farm | \n", + "<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>... | \n", + "0 | \n", + "5007 | \n", + "1920 | \n", + "-1 | \n", + "NaN | \n", + "NaN | \n", + "
| 1 | \n", + "2 | \n", + "101 | \n", + "2010-07-19 14:01:36 | \n", + "Geoff Dalgas | \n", + "2013-11-12 22:07:23 | \n", + "http://stackoverflow.com | \n", + "Corvallis, OR | \n", + "<p>Developer on the StackOverflow team. Find ... | \n", + "25 | \n", + "3 | \n", + "0 | \n", + "2 | \n", + "37.0 | \n", + "NaN | \n", + "
| 2 | \n", + "3 | \n", + "101 | \n", + "2010-07-19 15:34:50 | \n", + "Jarrod Dixon | \n", + "2014-08-08 06:42:58 | \n", + "http://stackoverflow.com | \n", + "New York, NY | \n", + "<p><a href=\"http://blog.stackoverflow.com/2009... | \n", + "22 | \n", + "19 | \n", + "0 | \n", + "3 | \n", + "35.0 | \n", + "NaN | \n", + "
| 3 | \n", + "4 | \n", + "101 | \n", + "2010-07-19 19:03:27 | \n", + "Emmett | \n", + "2014-01-02 09:31:02 | \n", + "http://minesweeperonline.com | \n", + "San Francisco, CA | \n", + "<p>currently at a startup in SF</p>\\r\\n\\r\\n<p>... | \n", + "11 | \n", + "0 | \n", + "0 | \n", + "1998 | \n", + "28.0 | \n", + "http://i.stack.imgur.com/d1oHX.jpg | \n", + "
| 4 | \n", + "5 | \n", + "6792 | \n", + "2010-07-19 19:03:57 | \n", + "Shane | \n", + "2014-08-13 00:23:47 | \n", + "http://www.statalgo.com | \n", + "New York, NY | \n", + "<p>Quantitative researcher focusing on statist... | \n", + "1145 | \n", + "662 | \n", + "5 | \n", + "54503 | \n", + "35.0 | \n", + "NaN | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 40320 | \n", + "55743 | \n", + "1 | \n", + "2014-09-13 21:03:50 | \n", + "AussieMeg | \n", + "2014-09-13 21:18:52 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "5026902 | \n", + "NaN | \n", + "http://graph.facebook.com/665821703/picture?ty... | \n", + "
| 40321 | \n", + "55744 | \n", + "6 | \n", + "2014-09-13 21:39:30 | \n", + "Mia Maria | \n", + "2014-09-13 21:39:30 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "5026998 | \n", + "NaN | \n", + "NaN | \n", + "
| 40322 | \n", + "55745 | \n", + "101 | \n", + "2014-09-13 23:45:27 | \n", + "tronbabylove | \n", + "2014-09-13 23:45:27 | \n", + "NaN | \n", + "United States | \n", + "NaN | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "481766 | \n", + "NaN | \n", + "https://www.gravatar.com/avatar/faa7a3fdbd8308... | \n", + "
| 40323 | \n", + "55746 | \n", + "106 | \n", + "2014-09-14 00:29:41 | \n", + "GPP | \n", + "2014-09-14 02:05:17 | \n", + "NaN | \n", + "NaN | \n", + "<p>Stats noobie, product, marketing & medi... | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "976289 | \n", + "NaN | \n", + "https://www.gravatar.com/avatar/6d9e9fa6b783a3... | \n", + "
| 40324 | \n", + "55747 | \n", + "1 | \n", + "2014-09-14 01:01:44 | \n", + "Shivam Agrawal | \n", + "2014-09-14 01:19:04 | \n", + "NaN | \n", + "India | \n", + "<p>Maths Enthusiast </p>\\r\\n | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "5027354 | \n", + "NaN | \n", + "https://lh4.googleusercontent.com/-ZsXhwVaFmiY... | \n", + "
40325 rows × 14 columns
\n", + "Hi, I'm not really a person.
\\r\\n\\r\\n... \n", + "1 Corvallis, OR
Developer on the StackOverflow team. Find ... \n", + "2 New York, NY
\\r\\n\\r\\n... \n", + "4 New York, NY
Quantitative researcher focusing on statist... \n", + "... ... ... \n", + "40320 NaN NaN \n", + "40321 NaN NaN \n", + "40322 United States NaN \n", + "40323 NaN
Stats noobie, product, marketing & medi... \n", + "40324 India
Maths Enthusiast
\\r\\n \n", + "\n", + " Views UpVotes DownVotes AccountId Age \\\n", + "0 0 5007 1920 -1 NaN \n", + "1 25 3 0 2 37.0 \n", + "2 22 19 0 3 35.0 \n", + "3 11 0 0 1998 28.0 \n", + "4 1145 662 5 54503 35.0 \n", + "... ... ... ... ... ... \n", + "40320 0 0 0 5026902 NaN \n", + "40321 1 0 0 5026998 NaN \n", + "40322 0 0 0 481766 NaN \n", + "40323 1 0 0 976289 NaN \n", + "40324 0 0 0 5027354 NaN \n", + "\n", + " ProfileImageUrl \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 NaN \n", + "... ... \n", + "40320 http://graph.facebook.com/665821703/picture?ty... \n", + "40321 NaN \n", + "40322 https://www.gravatar.com/avatar/faa7a3fdbd8308... \n", + "40323 https://www.gravatar.com/avatar/6d9e9fa6b783a3... \n", + "40324 https://lh4.googleusercontent.com/-ZsXhwVaFmiY... \n", + "\n", + "[40325 rows x 14 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_table=users_table.rename(columns={\"Id\":\"userId\"})\n", + "users_table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. Import posts table:" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "| \n", + " | Id | \n", + "PostTypeId | \n", + "AcceptedAnswerId | \n", + "CreaionDate | \n", + "Score | \n", + "ViewCount | \n", + "Body | \n", + "OwnerUserId | \n", + "LasActivityDate | \n", + "Title | \n", + "... | \n", + "AnswerCount | \n", + "CommentCount | \n", + "FavoriteCount | \n", + "LastEditorUserId | \n", + "LastEditDate | \n", + "CommunityOwnedDate | \n", + "ParentId | \n", + "ClosedDate | \n", + "OwnerDisplayName | \n", + "LastEditorDisplayName | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "1 | \n", + "1 | \n", + "15.0 | \n", + "2010-07-19 19:12:12 | \n", + "23 | \n", + "1278.0 | \n", + "<p>How should I elicit prior distributions fro... | \n", + "8.0 | \n", + "2010-09-15 21:08:26 | \n", + "Eliciting priors from experts | \n", + "... | \n", + "5.0 | \n", + "1 | \n", + "14.0 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
1 rows × 21 columns
\n", + "How should I elicit prior distributions fro... 8.0 \n", + "\n", + " LasActivityDate Title ... AnswerCount \\\n", + "0 2010-09-15 21:08:26 Eliciting priors from experts ... 5.0 \n", + "\n", + " CommentCount FavoriteCount LastEditorUserId LastEditDate \\\n", + "0 1 14.0 NaN NaN \n", + "\n", + " CommunityOwnedDate ParentId ClosedDate OwnerDisplayName \\\n", + "0 NaN NaN NaN NaN \n", + "\n", + " LastEditorDisplayName \n", + "0 NaN \n", + "\n", + "[1 rows x 21 columns]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts_table=pd.read_csv(\"posts_table.csv\")\n", + "posts_table.head(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5. Rename Id column to postId and OwnerUserId to userId" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | postId | \n", + "PostTypeId | \n", + "AcceptedAnswerId | \n", + "CreaionDate | \n", + "Score | \n", + "ViewCount | \n", + "Body | \n", + "userId | \n", + "LasActivityDate | \n", + "Title | \n", + "... | \n", + "AnswerCount | \n", + "CommentCount | \n", + "FavoriteCount | \n", + "LastEditorUserId | \n", + "LastEditDate | \n", + "CommunityOwnedDate | \n", + "ParentId | \n", + "ClosedDate | \n", + "OwnerDisplayName | \n", + "LastEditorDisplayName | \n", + "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "1 | \n", + "1 | \n", + "15.0 | \n", + "2010-07-19 19:12:12 | \n", + "23 | \n", + "1278.0 | \n", + "<p>How should I elicit prior distributions fro... | \n", + "8.0 | \n", + "2010-09-15 21:08:26 | \n", + "Eliciting priors from experts | \n", + "... | \n", + "5.0 | \n", + "1 | \n", + "14.0 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 1 | \n", + "2 | \n", + "1 | \n", + "59.0 | \n", + "2010-07-19 19:12:57 | \n", + "22 | \n", + "8198.0 | \n", + "<p>In many different statistical methods there... | \n", + "24.0 | \n", + "2012-11-12 09:21:54 | \n", + "What is normality? | \n", + "... | \n", + "7.0 | \n", + "1 | \n", + "8.0 | \n", + "88.0 | \n", + "2010-08-07 17:56:44 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 2 | \n", + "3 | \n", + "1 | \n", + "5.0 | \n", + "2010-07-19 19:13:28 | \n", + "54 | \n", + "3613.0 | \n", + "<p>What are some valuable Statistical Analysis... | \n", + "18.0 | \n", + "2013-05-27 14:48:36 | \n", + "What are some valuable Statistical Analysis op... | \n", + "... | \n", + "19.0 | \n", + "4 | \n", + "36.0 | \n", + "183.0 | \n", + "2011-02-12 05:50:03 | \n", + "2010-07-19 19:13:28 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 3 | \n", + "4 | \n", + "1 | \n", + "135.0 | \n", + "2010-07-19 19:13:31 | \n", + "13 | \n", + "5224.0 | \n", + "<p>I have two groups of data. Each with a dif... | \n", + "23.0 | \n", + "2010-09-08 03:00:19 | \n", + "Assessing the significance of differences in d... | \n", + "... | \n", + "5.0 | \n", + "2 | \n", + "2.0 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
| 4 | \n", + "5 | \n", + "2 | \n", + "NaN | \n", + "2010-07-19 19:14:43 | \n", + "81 | \n", + "NaN | \n", + "<p>The R-project</p>\\n\\n<p><a href=\"http://www... | \n", + "23.0 | \n", + "2010-07-19 19:21:15 | \n", + "NaN | \n", + "... | \n", + "NaN | \n", + "3 | \n", + "NaN | \n", + "23.0 | \n", + "2010-07-19 19:21:15 | \n", + "2010-07-19 19:14:43 | \n", + "3.0 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "
5 rows × 21 columns
\n", + "How should I elicit prior distributions fro... 8.0 \n", + "1 8198.0
In many different statistical methods there... 24.0 \n", + "2 3613.0
What are some valuable Statistical Analysis... 18.0 \n", + "3 5224.0
I have two groups of data. Each with a dif... 23.0 \n", + "4 NaN
The R-project
\\n\\n| \n", + " | userId | \n", + "Reputation | \n", + "Views | \n", + "UpVotes | \n", + "DownVotes | \n", + "postId | \n", + "Score | \n", + "ViewCount | \n", + "CommentCount | \n", + "
|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "-1.0 | \n", + "1.0 | \n", + "0.0 | \n", + "5007.0 | \n", + "1920.0 | \n", + "2175.0 | \n", + "0.0 | \n", + "NaN | \n", + "0.0 | \n", + "
| 1 | \n", + "-1.0 | \n", + "1.0 | \n", + "0.0 | \n", + "5007.0 | \n", + "1920.0 | \n", + "8576.0 | \n", + "0.0 | \n", + "NaN | \n", + "0.0 | \n", + "
| 2 | \n", + "-1.0 | \n", + "1.0 | \n", + "0.0 | \n", + "5007.0 | \n", + "1920.0 | \n", + "8578.0 | \n", + "0.0 | \n", + "NaN | \n", + "0.0 | \n", + "
| 3 | \n", + "-1.0 | \n", + "1.0 | \n", + "0.0 | \n", + "5007.0 | \n", + "1920.0 | \n", + "8981.0 | \n", + "0.0 | \n", + "NaN | \n", + "0.0 | \n", + "
| 4 | \n", + "-1.0 | \n", + "1.0 | \n", + "0.0 | \n", + "5007.0 | \n", + "1920.0 | \n", + "8982.0 | \n", + "0.0 | \n", + "NaN | \n", + "0.0 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 72182 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "46117.0 | \n", + "0.0 | \n", + "355.0 | \n", + "0.0 | \n", + "
| 72183 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "46260.0 | \n", + "5.0 | \n", + "145.0 | \n", + "5.0 | \n", + "
| 72184 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "46836.0 | \n", + "3.0 | \n", + "406.0 | \n", + "0.0 | \n", + "
| 72185 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "46892.0 | \n", + "1.0 | \n", + "NaN | \n", + "0.0 | \n", + "
| 72186 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "47308.0 | \n", + "2.0 | \n", + "129.0 | \n", + "0.0 | \n", + "
72187 rows × 9 columns
\n", + "" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes postId Score \\\n", + "0 -1.0 1.0 0.0 5007.0 1920.0 2175.0 0.0 \n", + "1 -1.0 1.0 0.0 5007.0 1920.0 8576.0 0.0 \n", + "2 -1.0 1.0 0.0 5007.0 1920.0 8578.0 0.0 \n", + "3 -1.0 1.0 0.0 5007.0 1920.0 8981.0 0.0 \n", + "4 -1.0 1.0 0.0 5007.0 1920.0 8982.0 0.0 \n", + "... ... ... ... ... ... ... ... \n", + "72182 NaN NaN NaN NaN NaN 46117.0 0.0 \n", + "72183 NaN NaN NaN NaN NaN 46260.0 5.0 \n", + "72184 NaN NaN NaN NaN NaN 46836.0 3.0 \n", + "72185 NaN NaN NaN NaN NaN 46892.0 1.0 \n", + "72186 NaN NaN NaN NaN NaN 47308.0 2.0 \n", + "\n", + " ViewCount CommentCount \n", + "0 NaN 0.0 \n", + "1 NaN 0.0 \n", + "2 NaN 0.0 \n", + "3 NaN 0.0 \n", + "4 NaN 0.0 \n", + "... ... ... \n", + "72182 355.0 0.0 \n", + "72183 145.0 5.0 \n", + "72184 406.0 0.0 \n", + "72185 NaN 0.0 \n", + "72186 129.0 0.0 \n", + "\n", + "[72187 rows x 9 columns]" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a=users_new.shape\n", + "a1=users_new[\"userId\"].nunique()\n", + "b=posts_new.shape\n", + "b1=posts_new[\"userId\"].nunique() # get the number of unique value almost double numbers of rows, which is weird\n", + "print(a,a1)\n", + "print(b,b1)\n", + "data_merge=users_new.merge(posts_new,how=\"outer\",on=\"userId\") \n", + "\n", + "print(data_merge.shape)\n", + "data_merge" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 8. How many missing values do you have in your merged dataframe? On which columns?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "userId 1038\n", + "Reputation 1038\n", + "Views 1038\n", + "UpVotes 1038\n", + "DownVotes 1038\n", + "postId 32187\n", + "Score 32187\n", + "ViewCount 56292\n", + "CommentCount 32187\n", + "dtype: int64" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_merge.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 9. You will need to do something about the missing values. Will you clean or fill them? Explain. \n", + "**Remember** to check the results of your code before moving on to the next step" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(71149, 9)\n", + " userId Reputation Views UpVotes DownVotes postId Score ViewCount \\\n", + "0 -1.0 1.0 0.0 5007.0 1920.0 2175.0 0.0 NaN \n", + "1 -1.0 1.0 0.0 5007.0 1920.0 8576.0 0.0 NaN \n", + "2 -1.0 1.0 0.0 5007.0 1920.0 8578.0 0.0 NaN \n", + "3 -1.0 1.0 0.0 5007.0 1920.0 8981.0 0.0 NaN \n", + "4 -1.0 1.0 0.0 5007.0 1920.0 8982.0 0.0 NaN \n", + "\n", + " CommentCount \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 \n", + "4 0.0 \n", + "(58584, 7)\n", + " userId Reputation Views UpVotes DownVotes Score CommentCount\n", + "0 -1.0 1.0 0.0 5007.0 1920.0 0.0 0.0\n", + "123 2.0 101.0 25.0 3.0 0.0 NaN NaN\n", + "124 3.0 101.0 22.0 19.0 0.0 NaN NaN\n", + "125 4.0 101.0 11.0 0.0 0.0 NaN NaN\n", + "126 5.0 6792.0 1145.0 662.0 5.0 152.0 5.0\n" + ] + } + ], + "source": [ + "#Drop rows with missing userID, without userId the data is not valid\n", + "data_validId =data_merge.dropna(axis=0, subset=['userId'])\n", + "print(data_validId.shape)\n", + "print(data_validId.head())\n", + "#PostId is not accurate with one user has many different post ID, and half of the table missing postID 
-> remove postID\n", + "#Remove ViewCount because most of info is missing ( 56292 out of 70K row)\n", + "data_validId2=data_validId.drop([\"postId\",\"ViewCount\"],axis=1)\n", + "data_validId2=data_validId2.drop_duplicates()\n", + "print(data_validId2.shape)\n", + "print(data_validId2.head())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? " + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " userId Reputation Views UpVotes DownVotes Score CommentCount\n", + "0 -1.0 1.0 0.0 5007.0 1920.0 0.0 0.0\n", + "123 2.0 101.0 25.0 3.0 0.0 NaN NaN\n", + "124 3.0 101.0 22.0 19.0 0.0 NaN NaN\n", + "125 4.0 101.0 11.0 0.0 0.0 NaN NaN\n", + "126 5.0 6792.0 1145.0 662.0 5.0 152.0 5.0\n" + ] + }, + { + "data": { + "text/plain": [ + "userId object\n", + "Reputation object\n", + "Views float64\n", + "UpVotes float64\n", + "DownVotes float64\n", + "Score float64\n", + "CommentCount float64\n", + "dtype: object" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#userID, reputation should be string\n", + "convert_str = {'userId': str,'Reputation': str}\n", + "data_validId2 = data_validId2.astype(convert_str)\n", + "print(data_validId2.head())\n", + "data_validId2.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + 
"nbformat_minor": 2 +}