diff --git a/your-code/main.ipynb b/your-code/main.ipynb index bad6d94..3170286 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -7,6 +7,15 @@ "#### 1. Import pandas library" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, { "cell_type": "code", "execution_count": null, @@ -21,6 +30,320 @@ "#### 2. Import users table:" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>...050071920-1NaNNaN
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0NaN
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0NaN
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\r\\n\\r\\n<p>...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0NaN
.............................................
403205574312014-09-13 21:03:50AussieMeg2014-09-13 21:18:52NaNNaNNaN0005026902NaNhttp://graph.facebook.com/665821703/picture?ty...
403215574462014-09-13 21:39:30Mia Maria2014-09-13 21:39:30NaNNaNNaN1005026998NaNNaN
40322557451012014-09-13 23:45:27tronbabylove2014-09-13 23:45:27NaNUnited StatesNaN000481766NaNhttps://www.gravatar.com/avatar/faa7a3fdbd8308...
40323557461062014-09-14 00:29:41GPP2014-09-14 02:05:17NaNNaN<p>Stats noobie, product, marketing &amp; medi...100976289NaNhttps://www.gravatar.com/avatar/6d9e9fa6b783a3...
403245574712014-09-14 01:01:44Shivam Agrawal2014-09-14 01:19:04NaNIndia<p>Maths Enthusiast </p>\\r\\n0005027354NaNhttps://lh4.googleusercontent.com/-ZsXhwVaFmiY...
\n", + "

40325 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " Id Reputation CreationDate DisplayName \\\n", + "0 -1 1 2010-07-19 06:55:26 Community \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon \n", + "3 4 101 2010-07-19 19:03:27 Emmett \n", + "4 5 6792 2010-07-19 19:03:57 Shane \n", + "... ... ... ... ... \n", + "40320 55743 1 2014-09-13 21:03:50 AussieMeg \n", + "40321 55744 6 2014-09-13 21:39:30 Mia Maria \n", + "40322 55745 101 2014-09-13 23:45:27 tronbabylove \n", + "40323 55746 106 2014-09-14 00:29:41 GPP \n", + "40324 55747 1 2014-09-14 01:01:44 Shivam Agrawal \n", + "\n", + " LastAccessDate WebsiteUrl \\\n", + "0 2010-07-19 06:55:26 http://meta.stackexchange.com/ \n", + "1 2013-11-12 22:07:23 http://stackoverflow.com \n", + "2 2014-08-08 06:42:58 http://stackoverflow.com \n", + "3 2014-01-02 09:31:02 http://minesweeperonline.com \n", + "4 2014-08-13 00:23:47 http://www.statalgo.com \n", + "... ... ... \n", + "40320 2014-09-13 21:18:52 NaN \n", + "40321 2014-09-13 21:39:30 NaN \n", + "40322 2014-09-13 23:45:27 NaN \n", + "40323 2014-09-14 02:05:17 NaN \n", + "40324 2014-09-14 01:19:04 NaN \n", + "\n", + " Location AboutMe \\\n", + "0 on the server farm

Hi, I'm not really a person.

\\r\\n\\r\\n

... \n", + "1 Corvallis, OR

Developer on the StackOverflow team. Find ... \n", + "2 New York, NY

currently at a startup in SF

\\r\\n\\r\\n

... \n", + "4 New York, NY

Quantitative researcher focusing on statist... \n", + "... ... ... \n", + "40320 NaN NaN \n", + "40321 NaN NaN \n", + "40322 United States NaN \n", + "40323 NaN

Stats noobie, product, marketing & medi... \n", + "40324 India

Maths Enthusiast

\\r\\n \n", + "\n", + " Views UpVotes DownVotes AccountId Age \\\n", + "0 0 5007 1920 -1 NaN \n", + "1 25 3 0 2 37.0 \n", + "2 22 19 0 3 35.0 \n", + "3 11 0 0 1998 28.0 \n", + "4 1145 662 5 54503 35.0 \n", + "... ... ... ... ... ... \n", + "40320 0 0 0 5026902 NaN \n", + "40321 1 0 0 5026998 NaN \n", + "40322 0 0 0 481766 NaN \n", + "40323 1 0 0 976289 NaN \n", + "40324 0 0 0 5027354 NaN \n", + "\n", + " ProfileImageUrl \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 NaN \n", + "... ... \n", + "40320 http://graph.facebook.com/665821703/picture?ty... \n", + "40321 NaN \n", + "40322 https://www.gravatar.com/avatar/faa7a3fdbd8308... \n", + "40323 https://www.gravatar.com/avatar/6d9e9fa6b783a3... \n", + "40324 https://lh4.googleusercontent.com/-ZsXhwVaFmiY... \n", + "\n", + "[40325 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data=pd.read_csv(\"users_table.csv\")\n", + "data" + ] + }, { "cell_type": "code", "execution_count": null, @@ -35,6 +358,186 @@ "#### 3. Rename Id column to userId" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "data_df = data.rename(columns={'Id': 'userId'})" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>...050071920-1NaNNaN
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0NaN
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0NaN
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\r\\n\\r\\n<p>...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0NaN
\n", + "
" + ], + "text/plain": [ + " userId Reputation CreationDate DisplayName LastAccessDate \\\n", + "0 -1 1 2010-07-19 06:55:26 Community 2010-07-19 06:55:26 \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas 2013-11-12 22:07:23 \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon 2014-08-08 06:42:58 \n", + "3 4 101 2010-07-19 19:03:27 Emmett 2014-01-02 09:31:02 \n", + "4 5 6792 2010-07-19 19:03:57 Shane 2014-08-13 00:23:47 \n", + "\n", + " WebsiteUrl Location \\\n", + "0 http://meta.stackexchange.com/ on the server farm \n", + "1 http://stackoverflow.com Corvallis, OR \n", + "2 http://stackoverflow.com New York, NY \n", + "3 http://minesweeperonline.com San Francisco, CA \n", + "4 http://www.statalgo.com New York, NY \n", + "\n", + " AboutMe Views UpVotes \\\n", + "0

Hi, I'm not really a person.

\\r\\n\\r\\n

... 0 5007 \n", + "1

Developer on the StackOverflow team. Find ... 25 3 \n", + "2

currently at a startup in SF

\\r\\n\\r\\n

... 11 0 \n", + "4

Quantitative researcher focusing on statist... 1145 662 \n", + "\n", + " DownVotes AccountId Age ProfileImageUrl \n", + "0 1920 -1 NaN NaN \n", + "1 0 2 37.0 NaN \n", + "2 0 3 35.0 NaN \n", + "3 0 1998 28.0 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 5 54503 35.0 NaN " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_df.head()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -49,6 +552,430 @@ "#### 4. Import posts table:" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyOwnerUserIdLasActivityDateTitle...AnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts...5.0114.0NaNNaNNaNNaNNaNNaNNaN
12159.02010-07-19 19:12:57228198.0<p>In many different statistical methods there...24.02012-11-12 09:21:54What is normality?...7.018.088.02010-08-07 17:56:44NaNNaNNaNNaNNaN
2315.02010-07-19 19:13:28543613.0<p>What are some valuable Statistical Analysis...18.02013-05-27 14:48:36What are some valuable Statistical Analysis op......19.0436.0183.02011-02-12 05:50:032010-07-19 19:13:28NaNNaNNaNNaN
341135.02010-07-19 19:13:31135224.0<p>I have two groups of data. Each with a dif...23.02010-09-08 03:00:19Assessing the significance of differences in d......5.022.0NaNNaNNaNNaNNaNNaNNaN
452NaN2010-07-19 19:14:4381NaN<p>The R-project</p>\\n\\n<p><a href=\"http://www...23.02010-07-19 19:21:15NaN...NaN3NaN23.02010-07-19 19:21:152010-07-19 19:14:433.0NaNNaNNaN
..................................................................
39995483212NaN2013-01-23 09:00:010NaN<p>you can use the matlab codes for svm and co...19966.02013-01-23 09:00:01NaN...NaN0NaNNaNNaNNaN45118.0NaNNaNNaN
39996483222NaN2013-01-23 09:09:343NaN<p>I use <a href=\"http://www.gnu.org/software/...892.02013-01-23 13:13:30NaN...NaN2NaN892.02013-01-23 13:13:30NaN48311.0NaNNaNNaN
39997483232NaN2013-01-23 09:16:441NaN<p>If I understand your question correctly, yo...2020.02013-01-23 09:16:44NaN...NaN0NaNNaNNaNNaN48247.0NaNNaNNaN
39998483242NaN2013-01-23 09:36:073NaN<p>Doesn't really help you with your question,...19914.02013-01-23 09:36:07NaN...NaN0NaNNaNNaNNaN48297.0NaNNaNNaN
39999483251NaN2013-01-23 09:44:07-1116.0<p>I have 10 vectors each having 100,000 point...19968.02013-02-22 11:23:54are data sets obtained from a Normal distribut......2.04NaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

40000 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " Id PostTypeId AcceptedAnswerId CreaionDate Score \\\n", + "0 1 1 15.0 2010-07-19 19:12:12 23 \n", + "1 2 1 59.0 2010-07-19 19:12:57 22 \n", + "2 3 1 5.0 2010-07-19 19:13:28 54 \n", + "3 4 1 135.0 2010-07-19 19:13:31 13 \n", + "4 5 2 NaN 2010-07-19 19:14:43 81 \n", + "... ... ... ... ... ... \n", + "39995 48321 2 NaN 2013-01-23 09:00:01 0 \n", + "39996 48322 2 NaN 2013-01-23 09:09:34 3 \n", + "39997 48323 2 NaN 2013-01-23 09:16:44 1 \n", + "39998 48324 2 NaN 2013-01-23 09:36:07 3 \n", + "39999 48325 1 NaN 2013-01-23 09:44:07 -1 \n", + "\n", + " ViewCount Body \\\n", + "0 1278.0

How should I elicit prior distributions fro... \n", + "1 8198.0

In many different statistical methods there... \n", + "2 3613.0

What are some valuable Statistical Analysis... \n", + "3 5224.0

I have two groups of data. Each with a dif... \n", + "4 NaN

The R-project

\\n\\n

you can use the matlab codes for svm and co... \n", + "39996 NaN

I use If I understand your question correctly, yo... \n", + "39998 NaN

Doesn't really help you with your question,... \n", + "39999 116.0

I have 10 vectors each having 100,000 point... \n", + "\n", + " OwnerUserId LasActivityDate \\\n", + "0 8.0 2010-09-15 21:08:26 \n", + "1 24.0 2012-11-12 09:21:54 \n", + "2 18.0 2013-05-27 14:48:36 \n", + "3 23.0 2010-09-08 03:00:19 \n", + "4 23.0 2010-07-19 19:21:15 \n", + "... ... ... \n", + "39995 19966.0 2013-01-23 09:00:01 \n", + "39996 892.0 2013-01-23 13:13:30 \n", + "39997 2020.0 2013-01-23 09:16:44 \n", + "39998 19914.0 2013-01-23 09:36:07 \n", + "39999 19968.0 2013-02-22 11:23:54 \n", + "\n", + " Title ... AnswerCount \\\n", + "0 Eliciting priors from experts ... 5.0 \n", + "1 What is normality? ... 7.0 \n", + "2 What are some valuable Statistical Analysis op... ... 19.0 \n", + "3 Assessing the significance of differences in d... ... 5.0 \n", + "4 NaN ... NaN \n", + "... ... ... ... \n", + "39995 NaN ... NaN \n", + "39996 NaN ... NaN \n", + "39997 NaN ... NaN \n", + "39998 NaN ... NaN \n", + "39999 are data sets obtained from a Normal distribut... ... 2.0 \n", + "\n", + " CommentCount FavoriteCount LastEditorUserId LastEditDate \\\n", + "0 1 14.0 NaN NaN \n", + "1 1 8.0 88.0 2010-08-07 17:56:44 \n", + "2 4 36.0 183.0 2011-02-12 05:50:03 \n", + "3 2 2.0 NaN NaN \n", + "4 3 NaN 23.0 2010-07-19 19:21:15 \n", + "... ... ... ... ... \n", + "39995 0 NaN NaN NaN \n", + "39996 2 NaN 892.0 2013-01-23 13:13:30 \n", + "39997 0 NaN NaN NaN \n", + "39998 0 NaN NaN NaN \n", + "39999 4 NaN NaN NaN \n", + "\n", + " CommunityOwnedDate ParentId ClosedDate OwnerDisplayName \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 2010-07-19 19:13:28 NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 2010-07-19 19:14:43 3.0 NaN NaN \n", + "... ... ... ... ... \n", + "39995 NaN 45118.0 NaN NaN \n", + "39996 NaN 48311.0 NaN NaN \n", + "39997 NaN 48247.0 NaN NaN \n", + "39998 NaN 48297.0 NaN NaN \n", + "39999 NaN NaN NaN NaN \n", + "\n", + " LastEditorDisplayName \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "... ... 
\n", + "39995 NaN \n", + "39996 NaN \n", + "39997 NaN \n", + "39998 NaN \n", + "39999 NaN \n", + "\n", + "[40000 rows x 21 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts=pd.read_csv(\"posts_table.csv\")\n", + "posts" + ] + }, { "cell_type": "code", "execution_count": null, @@ -63,6 +990,326 @@ "#### 5. Rename Id column to postId and OwnerUserId to userId" ] }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "posts_df = posts.rename(columns={'Id': 'postId', 'OwnerUserId': 'userId'})" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>...050071920-1NaNNaN
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0NaN
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0NaN
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\r\\n\\r\\n<p>...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0NaN
.............................................
403205574312014-09-13 21:03:50AussieMeg2014-09-13 21:18:52NaNNaNNaN0005026902NaNhttp://graph.facebook.com/665821703/picture?ty...
403215574462014-09-13 21:39:30Mia Maria2014-09-13 21:39:30NaNNaNNaN1005026998NaNNaN
40322557451012014-09-13 23:45:27tronbabylove2014-09-13 23:45:27NaNUnited StatesNaN000481766NaNhttps://www.gravatar.com/avatar/faa7a3fdbd8308...
40323557461062014-09-14 00:29:41GPP2014-09-14 02:05:17NaNNaN<p>Stats noobie, product, marketing &amp; medi...100976289NaNhttps://www.gravatar.com/avatar/6d9e9fa6b783a3...
403245574712014-09-14 01:01:44Shivam Agrawal2014-09-14 01:19:04NaNIndia<p>Maths Enthusiast </p>\\r\\n0005027354NaNhttps://lh4.googleusercontent.com/-ZsXhwVaFmiY...
\n", + "

40325 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " Id Reputation CreationDate DisplayName \\\n", + "0 -1 1 2010-07-19 06:55:26 Community \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon \n", + "3 4 101 2010-07-19 19:03:27 Emmett \n", + "4 5 6792 2010-07-19 19:03:57 Shane \n", + "... ... ... ... ... \n", + "40320 55743 1 2014-09-13 21:03:50 AussieMeg \n", + "40321 55744 6 2014-09-13 21:39:30 Mia Maria \n", + "40322 55745 101 2014-09-13 23:45:27 tronbabylove \n", + "40323 55746 106 2014-09-14 00:29:41 GPP \n", + "40324 55747 1 2014-09-14 01:01:44 Shivam Agrawal \n", + "\n", + " LastAccessDate WebsiteUrl \\\n", + "0 2010-07-19 06:55:26 http://meta.stackexchange.com/ \n", + "1 2013-11-12 22:07:23 http://stackoverflow.com \n", + "2 2014-08-08 06:42:58 http://stackoverflow.com \n", + "3 2014-01-02 09:31:02 http://minesweeperonline.com \n", + "4 2014-08-13 00:23:47 http://www.statalgo.com \n", + "... ... ... \n", + "40320 2014-09-13 21:18:52 NaN \n", + "40321 2014-09-13 21:39:30 NaN \n", + "40322 2014-09-13 23:45:27 NaN \n", + "40323 2014-09-14 02:05:17 NaN \n", + "40324 2014-09-14 01:19:04 NaN \n", + "\n", + " Location AboutMe \\\n", + "0 on the server farm

Hi, I'm not really a person.

\\r\\n\\r\\n

... \n", + "1 Corvallis, OR

Developer on the StackOverflow team. Find ... \n", + "2 New York, NY

currently at a startup in SF

\\r\\n\\r\\n

... \n", + "4 New York, NY

Quantitative researcher focusing on statist... \n", + "... ... ... \n", + "40320 NaN NaN \n", + "40321 NaN NaN \n", + "40322 United States NaN \n", + "40323 NaN

Stats noobie, product, marketing & medi... \n", + "40324 India

Maths Enthusiast

\\r\\n \n", + "\n", + " Views UpVotes DownVotes AccountId Age \\\n", + "0 0 5007 1920 -1 NaN \n", + "1 25 3 0 2 37.0 \n", + "2 22 19 0 3 35.0 \n", + "3 11 0 0 1998 28.0 \n", + "4 1145 662 5 54503 35.0 \n", + "... ... ... ... ... ... \n", + "40320 0 0 0 5026902 NaN \n", + "40321 1 0 0 5026998 NaN \n", + "40322 0 0 0 481766 NaN \n", + "40323 1 0 0 976289 NaN \n", + "40324 0 0 0 5027354 NaN \n", + "\n", + " ProfileImageUrl \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 NaN \n", + "... ... \n", + "40320 http://graph.facebook.com/665821703/picture?ty... \n", + "40321 NaN \n", + "40322 https://www.gravatar.com/avatar/faa7a3fdbd8308... \n", + "40323 https://www.gravatar.com/avatar/6d9e9fa6b783a3... \n", + "40324 https://lh4.googleusercontent.com/-ZsXhwVaFmiY... \n", + "\n", + "[40325 rows x 14 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -84,7 +1331,14 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "\n", + "users_df = data_df\n", + "\n", + "users_df_selected = users_df.loc[:, ['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']]\n", + "\n", + "posts_df_selected = posts_df.loc[:, ['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']]" + ] }, { "cell_type": "markdown", @@ -99,7 +1353,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "merged_df = pd.merge(users_df_selected, posts_df_selected, on='userId', how='inner')" + ] }, { "cell_type": "markdown", @@ -113,7 +1369,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "missing_values = merged_df.isna().sum()" + ] }, { "cell_type": "markdown", @@ -128,7 +1386,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "merged_df_filled = merged_df.fillna(merged_df.mean())" + ] }, { "cell_type": 
"markdown", @@ -142,12 +1402,24 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "merged_df_typed = merged_df_filled.astype({\n", + "    'userId': int,\n", + "    'postId': int,\n", + "    'Reputation': int,\n", + "    'Views': int,\n", + "    'UpVotes': int,\n", + "    'DownVotes': int,\n", + "    'Score': int,\n", + "    'ViewCount': float,\n", + "    'CommentCount': int})\n", + "merged_df_typed.dtypes" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -161,7 +1433,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.13" } }, "nbformat": 4,