From 71a661ff6565368d69d839d42f2cb1391ceed1c8 Mon Sep 17 00:00:00 2001 From: Bruno Gama Date: Tue, 10 May 2022 19:06:25 +0100 Subject: [PATCH] lab done --- your-code/main.ipynb | 2038 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 2012 insertions(+), 26 deletions(-) diff --git a/your-code/main.ipynb b/your-code/main.ipynb index bad6d94..9175939 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -9,10 +9,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 106, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] }, { "cell_type": "markdown", @@ -23,10 +26,317 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 107, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\n\\n<p>I'm ...050071920-1NaNNaN
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0NaN
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0NaN
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\n\\n<p>form...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0NaN
.............................................
403205574312014-09-13 21:03:50AussieMeg2014-09-13 21:18:52NaNNaNNaN0005026902NaNhttp://graph.facebook.com/665821703/picture?ty...
403215574462014-09-13 21:39:30Mia Maria2014-09-13 21:39:30NaNNaNNaN1005026998NaNNaN
40322557451012014-09-13 23:45:27tronbabylove2014-09-13 23:45:27NaNUnited StatesNaN000481766NaNhttps://www.gravatar.com/avatar/faa7a3fdbd8308...
40323557461062014-09-14 00:29:41GPP2014-09-14 02:05:17NaNNaN<p>Stats noobie, product, marketing &amp; medi...100976289NaNhttps://www.gravatar.com/avatar/6d9e9fa6b783a3...
403245574712014-09-14 01:01:44Shivam Agrawal2014-09-14 01:19:04NaNIndia<p>Maths Enthusiast </p>\\n0005027354NaNhttps://lh4.googleusercontent.com/-ZsXhwVaFmiY...
\n", + "

40325 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " Id Reputation CreationDate DisplayName \\\n", + "0 -1 1 2010-07-19 06:55:26 Community \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon \n", + "3 4 101 2010-07-19 19:03:27 Emmett \n", + "4 5 6792 2010-07-19 19:03:57 Shane \n", + "... ... ... ... ... \n", + "40320 55743 1 2014-09-13 21:03:50 AussieMeg \n", + "40321 55744 6 2014-09-13 21:39:30 Mia Maria \n", + "40322 55745 101 2014-09-13 23:45:27 tronbabylove \n", + "40323 55746 106 2014-09-14 00:29:41 GPP \n", + "40324 55747 1 2014-09-14 01:01:44 Shivam Agrawal \n", + "\n", + " LastAccessDate WebsiteUrl \\\n", + "0 2010-07-19 06:55:26 http://meta.stackexchange.com/ \n", + "1 2013-11-12 22:07:23 http://stackoverflow.com \n", + "2 2014-08-08 06:42:58 http://stackoverflow.com \n", + "3 2014-01-02 09:31:02 http://minesweeperonline.com \n", + "4 2014-08-13 00:23:47 http://www.statalgo.com \n", + "... ... ... \n", + "40320 2014-09-13 21:18:52 NaN \n", + "40321 2014-09-13 21:39:30 NaN \n", + "40322 2014-09-13 23:45:27 NaN \n", + "40323 2014-09-14 02:05:17 NaN \n", + "40324 2014-09-14 01:19:04 NaN \n", + "\n", + " Location AboutMe \\\n", + "0 on the server farm

Hi, I'm not really a person.

\\n\\n

I'm ... \n", + "1 Corvallis, OR

Developer on the StackOverflow team. Find ... \n", + "2 New York, NY

currently at a startup in SF

\\n\\n

form... \n", + "4 New York, NY

Quantitative researcher focusing on statist... \n", + "... ... ... \n", + "40320 NaN NaN \n", + "40321 NaN NaN \n", + "40322 United States NaN \n", + "40323 NaN

Stats noobie, product, marketing & medi... \n", + "40324 India

Maths Enthusiast

\\n \n", + "\n", + " Views UpVotes DownVotes AccountId Age \\\n", + "0 0 5007 1920 -1 NaN \n", + "1 25 3 0 2 37.0 \n", + "2 22 19 0 3 35.0 \n", + "3 11 0 0 1998 28.0 \n", + "4 1145 662 5 54503 35.0 \n", + "... ... ... ... ... ... \n", + "40320 0 0 0 5026902 NaN \n", + "40321 1 0 0 5026998 NaN \n", + "40322 0 0 0 481766 NaN \n", + "40323 1 0 0 976289 NaN \n", + "40324 0 0 0 5027354 NaN \n", + "\n", + " ProfileImageUrl \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 NaN \n", + "... ... \n", + "40320 http://graph.facebook.com/665821703/picture?ty... \n", + "40321 NaN \n", + "40322 https://www.gravatar.com/avatar/faa7a3fdbd8308... \n", + "40323 https://www.gravatar.com/avatar/6d9e9fa6b783a3... \n", + "40324 https://lh4.googleusercontent.com/-ZsXhwVaFmiY... \n", + "\n", + "[40325 rows x 14 columns]" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users = pd.read_csv('users_table.csv')\n", + "users" + ] }, { "cell_type": "markdown", @@ -37,10 +347,318 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 108, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\n\\n<p>I'm ...050071920-1NaNNaN
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0NaN
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0NaN
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\n\\n<p>form...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0NaN
.............................................
403205574312014-09-13 21:03:50AussieMeg2014-09-13 21:18:52NaNNaNNaN0005026902NaNhttp://graph.facebook.com/665821703/picture?ty...
403215574462014-09-13 21:39:30Mia Maria2014-09-13 21:39:30NaNNaNNaN1005026998NaNNaN
40322557451012014-09-13 23:45:27tronbabylove2014-09-13 23:45:27NaNUnited StatesNaN000481766NaNhttps://www.gravatar.com/avatar/faa7a3fdbd8308...
40323557461062014-09-14 00:29:41GPP2014-09-14 02:05:17NaNNaN<p>Stats noobie, product, marketing &amp; medi...100976289NaNhttps://www.gravatar.com/avatar/6d9e9fa6b783a3...
403245574712014-09-14 01:01:44Shivam Agrawal2014-09-14 01:19:04NaNIndia<p>Maths Enthusiast </p>\\n0005027354NaNhttps://lh4.googleusercontent.com/-ZsXhwVaFmiY...
\n", + "

40325 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " userId Reputation CreationDate DisplayName \\\n", + "0 -1 1 2010-07-19 06:55:26 Community \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon \n", + "3 4 101 2010-07-19 19:03:27 Emmett \n", + "4 5 6792 2010-07-19 19:03:57 Shane \n", + "... ... ... ... ... \n", + "40320 55743 1 2014-09-13 21:03:50 AussieMeg \n", + "40321 55744 6 2014-09-13 21:39:30 Mia Maria \n", + "40322 55745 101 2014-09-13 23:45:27 tronbabylove \n", + "40323 55746 106 2014-09-14 00:29:41 GPP \n", + "40324 55747 1 2014-09-14 01:01:44 Shivam Agrawal \n", + "\n", + " LastAccessDate WebsiteUrl \\\n", + "0 2010-07-19 06:55:26 http://meta.stackexchange.com/ \n", + "1 2013-11-12 22:07:23 http://stackoverflow.com \n", + "2 2014-08-08 06:42:58 http://stackoverflow.com \n", + "3 2014-01-02 09:31:02 http://minesweeperonline.com \n", + "4 2014-08-13 00:23:47 http://www.statalgo.com \n", + "... ... ... \n", + "40320 2014-09-13 21:18:52 NaN \n", + "40321 2014-09-13 21:39:30 NaN \n", + "40322 2014-09-13 23:45:27 NaN \n", + "40323 2014-09-14 02:05:17 NaN \n", + "40324 2014-09-14 01:19:04 NaN \n", + "\n", + " Location AboutMe \\\n", + "0 on the server farm

Hi, I'm not really a person.

\\n\\n

I'm ... \n", + "1 Corvallis, OR

Developer on the StackOverflow team. Find ... \n", + "2 New York, NY

currently at a startup in SF

\\n\\n

form... \n", + "4 New York, NY

Quantitative researcher focusing on statist... \n", + "... ... ... \n", + "40320 NaN NaN \n", + "40321 NaN NaN \n", + "40322 United States NaN \n", + "40323 NaN

Stats noobie, product, marketing & medi... \n", + "40324 India

Maths Enthusiast

\\n \n", + "\n", + " Views UpVotes DownVotes AccountId Age \\\n", + "0 0 5007 1920 -1 NaN \n", + "1 25 3 0 2 37.0 \n", + "2 22 19 0 3 35.0 \n", + "3 11 0 0 1998 28.0 \n", + "4 1145 662 5 54503 35.0 \n", + "... ... ... ... ... ... \n", + "40320 0 0 0 5026902 NaN \n", + "40321 1 0 0 5026998 NaN \n", + "40322 0 0 0 481766 NaN \n", + "40323 1 0 0 976289 NaN \n", + "40324 0 0 0 5027354 NaN \n", + "\n", + " ProfileImageUrl \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 NaN \n", + "... ... \n", + "40320 http://graph.facebook.com/665821703/picture?ty... \n", + "40321 NaN \n", + "40322 https://www.gravatar.com/avatar/faa7a3fdbd8308... \n", + "40323 https://www.gravatar.com/avatar/6d9e9fa6b783a3... \n", + "40324 https://lh4.googleusercontent.com/-ZsXhwVaFmiY... \n", + "\n", + "[40325 rows x 14 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "users1 = users.rename(columns = {'Id':'userId'}, inplace=False)\n", + "\n", + "display(users1)\n", + "\n" + ] }, { "cell_type": "markdown", @@ -51,10 +669,234 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 136, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyOwnerUserIdLasActivityDateTitle...AnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts...5.0114.0NaNNaNNaNNaNNaNNaNNaN
12159.02010-07-19 19:12:57228198.0<p>In many different statistical methods there...24.02012-11-12 09:21:54What is normality?...7.018.088.02010-08-07 17:56:44NaNNaNNaNNaNNaN
2315.02010-07-19 19:13:28543613.0<p>What are some valuable Statistical Analysis...18.02013-05-27 14:48:36What are some valuable Statistical Analysis op......19.0436.0183.02011-02-12 05:50:032010-07-19 19:13:28NaNNaNNaNNaN
341135.02010-07-19 19:13:31135224.0<p>I have two groups of data. Each with a dif...23.02010-09-08 03:00:19Assessing the significance of differences in d......5.022.0NaNNaNNaNNaNNaNNaNNaN
452NaN2010-07-19 19:14:4381NaN<p>The R-project</p>\\n\\n<p><a href=\"http://www...23.02010-07-19 19:21:15NaN...NaN3NaN23.02010-07-19 19:21:152010-07-19 19:14:433.0NaNNaNNaN
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " Id PostTypeId AcceptedAnswerId CreaionDate Score ViewCount \\\n", + "0 1 1 15.0 2010-07-19 19:12:12 23 1278.0 \n", + "1 2 1 59.0 2010-07-19 19:12:57 22 8198.0 \n", + "2 3 1 5.0 2010-07-19 19:13:28 54 3613.0 \n", + "3 4 1 135.0 2010-07-19 19:13:31 13 5224.0 \n", + "4 5 2 NaN 2010-07-19 19:14:43 81 NaN \n", + "\n", + " Body OwnerUserId \\\n", + "0

How should I elicit prior distributions fro... 8.0 \n", + "1

In many different statistical methods there... 24.0 \n", + "2

What are some valuable Statistical Analysis... 18.0 \n", + "3

I have two groups of data. Each with a dif... 23.0 \n", + "4

The R-project

\\n\\n

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postIdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyuserIdLasActivityDateTitle...AnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts...5.0114.0NaNNaNNaNNaNNaNNaNNaN
12159.02010-07-19 19:12:57228198.0<p>In many different statistical methods there...24.02012-11-12 09:21:54What is normality?...7.018.088.02010-08-07 17:56:44NaNNaNNaNNaNNaN
2315.02010-07-19 19:13:28543613.0<p>What are some valuable Statistical Analysis...18.02013-05-27 14:48:36What are some valuable Statistical Analysis op......19.0436.0183.02011-02-12 05:50:032010-07-19 19:13:28NaNNaNNaNNaN
341135.02010-07-19 19:13:31135224.0<p>I have two groups of data. Each with a dif...23.02010-09-08 03:00:19Assessing the significance of differences in d......5.022.0NaNNaNNaNNaNNaNNaNNaN
452NaN2010-07-19 19:14:4381NaN<p>The R-project</p>\\n\\n<p><a href=\"http://www...23.02010-07-19 19:21:15NaN...NaN3NaN23.02010-07-19 19:21:152010-07-19 19:14:433.0NaNNaNNaN
..................................................................
39995483212NaN2013-01-23 09:00:010NaN<p>you can use the matlab codes for svm and co...19966.02013-01-23 09:00:01NaN...NaN0NaNNaNNaNNaN45118.0NaNNaNNaN
39996483222NaN2013-01-23 09:09:343NaN<p>I use <a href=\"http://www.gnu.org/software/...892.02013-01-23 13:13:30NaN...NaN2NaN892.02013-01-23 13:13:30NaN48311.0NaNNaNNaN
39997483232NaN2013-01-23 09:16:441NaN<p>If I understand your question correctly, yo...2020.02013-01-23 09:16:44NaN...NaN0NaNNaNNaNNaN48247.0NaNNaNNaN
39998483242NaN2013-01-23 09:36:073NaN<p>Doesn't really help you with your question,...19914.02013-01-23 09:36:07NaN...NaN0NaNNaNNaNNaN48297.0NaNNaNNaN
39999483251NaN2013-01-23 09:44:07-1116.0<p>I have 10 vectors each having 100,000 point...19968.02013-02-22 11:23:54are data sets obtained from a Normal distribut......2.04NaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

40000 rows × 21 columns

\n", + "" + ], + "text/plain": [ + " postId PostTypeId AcceptedAnswerId CreaionDate Score \\\n", + "0 1 1 15.0 2010-07-19 19:12:12 23 \n", + "1 2 1 59.0 2010-07-19 19:12:57 22 \n", + "2 3 1 5.0 2010-07-19 19:13:28 54 \n", + "3 4 1 135.0 2010-07-19 19:13:31 13 \n", + "4 5 2 NaN 2010-07-19 19:14:43 81 \n", + "... ... ... ... ... ... \n", + "39995 48321 2 NaN 2013-01-23 09:00:01 0 \n", + "39996 48322 2 NaN 2013-01-23 09:09:34 3 \n", + "39997 48323 2 NaN 2013-01-23 09:16:44 1 \n", + "39998 48324 2 NaN 2013-01-23 09:36:07 3 \n", + "39999 48325 1 NaN 2013-01-23 09:44:07 -1 \n", + "\n", + " ViewCount Body userId \\\n", + "0 1278.0

How should I elicit prior distributions fro... 8.0 \n", + "1 8198.0

In many different statistical methods there... 24.0 \n", + "2 3613.0

What are some valuable Statistical Analysis... 18.0 \n", + "3 5224.0

I have two groups of data. Each with a dif... 23.0 \n", + "4 NaN

The R-project

\\n\\n

you can use the matlab codes for svm and co... 19966.0 \n", + "39996 NaN

I use If I understand your question correctly, yo... 2020.0 \n", + "39998 NaN

Doesn't really help you with your question,... 19914.0 \n", + "39999 116.0

I have 10 vectors each having 100,000 point... 19968.0 \n", + "\n", + " LasActivityDate Title \\\n", + "0 2010-09-15 21:08:26 Eliciting priors from experts \n", + "1 2012-11-12 09:21:54 What is normality? \n", + "2 2013-05-27 14:48:36 What are some valuable Statistical Analysis op... \n", + "3 2010-09-08 03:00:19 Assessing the significance of differences in d... \n", + "4 2010-07-19 19:21:15 NaN \n", + "... ... ... \n", + "39995 2013-01-23 09:00:01 NaN \n", + "39996 2013-01-23 13:13:30 NaN \n", + "39997 2013-01-23 09:16:44 NaN \n", + "39998 2013-01-23 09:36:07 NaN \n", + "39999 2013-02-22 11:23:54 are data sets obtained from a Normal distribut... \n", + "\n", + " ... AnswerCount CommentCount FavoriteCount LastEditorUserId \\\n", + "0 ... 5.0 1 14.0 NaN \n", + "1 ... 7.0 1 8.0 88.0 \n", + "2 ... 19.0 4 36.0 183.0 \n", + "3 ... 5.0 2 2.0 NaN \n", + "4 ... NaN 3 NaN 23.0 \n", + "... ... ... ... ... ... \n", + "39995 ... NaN 0 NaN NaN \n", + "39996 ... NaN 2 NaN 892.0 \n", + "39997 ... NaN 0 NaN NaN \n", + "39998 ... NaN 0 NaN NaN \n", + "39999 ... 2.0 4 NaN NaN \n", + "\n", + " LastEditDate CommunityOwnedDate ParentId ClosedDate \\\n", + "0 NaN NaN NaN NaN \n", + "1 2010-08-07 17:56:44 NaN NaN NaN \n", + "2 2011-02-12 05:50:03 2010-07-19 19:13:28 NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 2010-07-19 19:21:15 2010-07-19 19:14:43 3.0 NaN \n", + "... ... ... ... ... \n", + "39995 NaN NaN 45118.0 NaN \n", + "39996 2013-01-23 13:13:30 NaN 48311.0 NaN \n", + "39997 NaN NaN 48247.0 NaN \n", + "39998 NaN NaN 48297.0 NaN \n", + "39999 NaN NaN NaN NaN \n", + "\n", + " OwnerDisplayName LastEditorDisplayName \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... 
\n", + "39995 NaN NaN \n", + "39996 NaN NaN \n", + "39997 NaN NaN \n", + "39998 NaN NaN \n", + "39999 NaN NaN \n", + "\n", + "[40000 rows x 21 columns]" + ] + }, + "execution_count": 137, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts = posts.rename(columns={'Id':'postId', 'OwnerUserId':'userId'})\n", + "posts" + ] }, { "cell_type": "markdown", @@ -81,10 +1327,311 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 138, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotes
0-11050071920
121012530
2310122190
341011100
45679211456625
..................
40320557431000
40321557446100
4032255745101000
4032355746106100
40324557471000
\n", + "

40325 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes\n", + "0 -1 1 0 5007 1920\n", + "1 2 101 25 3 0\n", + "2 3 101 22 19 0\n", + "3 4 101 11 0 0\n", + "4 5 6792 1145 662 5\n", + "... ... ... ... ... ...\n", + "40320 55743 1 0 0 0\n", + "40321 55744 6 1 0 0\n", + "40322 55745 101 0 0 0\n", + "40323 55746 106 1 0 0\n", + "40324 55747 1 0 0 0\n", + "\n", + "[40325 rows x 5 columns]" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_users = users1[['userId','Reputation', 'Views', 'UpVotes', 'DownVotes']]\n", + "new_users" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postIdScoreuserIdViewCountCommentCount
01238.01278.01
122224.08198.01
235418.03613.04
341323.05224.02
458123.0NaN3
..................
3999548321019966.0NaN0
39996483223892.0NaN2
399974832312020.0NaN0
3999848324319914.0NaN0
3999948325-119968.0116.04
\n", + "

40000 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " postId Score userId ViewCount CommentCount\n", + "0 1 23 8.0 1278.0 1\n", + "1 2 22 24.0 8198.0 1\n", + "2 3 54 18.0 3613.0 4\n", + "3 4 13 23.0 5224.0 2\n", + "4 5 81 23.0 NaN 3\n", + "... ... ... ... ... ...\n", + "39995 48321 0 19966.0 NaN 0\n", + "39996 48322 3 892.0 NaN 2\n", + "39997 48323 1 2020.0 NaN 0\n", + "39998 48324 3 19914.0 NaN 0\n", + "39999 48325 -1 19968.0 116.0 4\n", + "\n", + "[40000 rows x 5 columns]" + ] + }, + "execution_count": 139, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_posts = posts[['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']]\n", + "new_posts" + ] }, { "cell_type": "markdown", @@ -96,10 +1643,218 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 140, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotespostIdScoreViewCountCommentCount
0-1105007192021750NaN0
1-1105007192085760NaN0
2-1105007192085780NaN0
3-1105007192089810NaN0
4-1105007192089820NaN0
..............................
389574593411100340031115.02
389584619236100406675326.02
389594652223513271174613166.00
389605237122120027237243357.05
3896155226119230161761NaN0
\n", + "

38962 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes postId Score \\\n", + "0 -1 1 0 5007 1920 2175 0 \n", + "1 -1 1 0 5007 1920 8576 0 \n", + "2 -1 1 0 5007 1920 8578 0 \n", + "3 -1 1 0 5007 1920 8981 0 \n", + "4 -1 1 0 5007 1920 8982 0 \n", + "... ... ... ... ... ... ... ... \n", + "38957 45934 11 1 0 0 34003 1 \n", + "38958 46192 36 1 0 0 40667 5 \n", + "38959 46522 235 13 27 1 17461 3 \n", + "38960 52371 221 2 0 0 27237 24 \n", + "38961 55226 119 2 3 0 16176 1 \n", + "\n", + " ViewCount CommentCount \n", + "0 NaN 0 \n", + "1 NaN 0 \n", + "2 NaN 0 \n", + "3 NaN 0 \n", + "4 NaN 0 \n", + "... ... ... \n", + "38957 115.0 2 \n", + "38958 326.0 2 \n", + "38959 166.0 0 \n", + "38960 3357.0 5 \n", + "38961 NaN 0 \n", + "\n", + "[38962 rows x 9 columns]" + ] + }, + "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts_users = new_users.merge(new_posts, on='userId')\n", + "posts_users" + ] }, { "cell_type": "markdown", @@ -110,10 +1865,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 146, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId 0\n", + "Reputation 0\n", + "Views 0\n", + "UpVotes 0\n", + "DownVotes 0\n", + "postId 0\n", + "Score 0\n", + "ViewCount 23572\n", + "CommentCount 0\n", + "dtype: int64" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts_users.isnull().sum()\n", + "# there are 23572 missing values in the column 'ViewCount'" + ] }, { "cell_type": "markdown", @@ -125,10 +1903,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 149, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# Depending on what kind of information I will need in the future, I can drop the column\n", + "# using .drop(columns=...) if it is not relevant for what I need, or I can replace the missing values with zeros using the\n", + "# numpy zeros 
method to gain other crucial info that I might need " + ] }, { "cell_type": "markdown", @@ -137,6 +1919,210 @@ "#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? " ] }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotespostIdScoreCommentCount
0-11050071920217500
1-11050071920857600
2-11050071920857800
3-11050071920898100
4-11050071920898200
...........................
3895745934111003400312
3895846192361004066752
3895946522235132711746130
389605237122120027237245
38961552261192301617610
\n", + "

38962 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes postId Score \\\n", + "0 -1 1 0 5007 1920 2175 0 \n", + "1 -1 1 0 5007 1920 8576 0 \n", + "2 -1 1 0 5007 1920 8578 0 \n", + "3 -1 1 0 5007 1920 8981 0 \n", + "4 -1 1 0 5007 1920 8982 0 \n", + "... ... ... ... ... ... ... ... \n", + "38957 45934 11 1 0 0 34003 1 \n", + "38958 46192 36 1 0 0 40667 5 \n", + "38959 46522 235 13 27 1 17461 3 \n", + "38960 52371 221 2 0 0 27237 24 \n", + "38961 55226 119 2 3 0 16176 1 \n", + "\n", + " CommentCount \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "... ... \n", + "38957 2 \n", + "38958 2 \n", + "38959 0 \n", + "38960 5 \n", + "38961 0 \n", + "\n", + "[38962 rows x 8 columns]" + ] + }, + "execution_count": 165, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_posts_users = posts_users.drop(columns=['ViewCount'])\n", + "new_posts_users\n", + "# the column ['ViewCount'] would be the column I would drop due the the amount of zeros it has\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -161,7 +2147,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.7" } }, "nbformat": 4,