From 0ae0969b1d95ec5353ddfc05aab17d7b813d16d1 Mon Sep 17 00:00:00 2001 From: Paola Date: Fri, 28 Jan 2022 16:51:00 -0600 Subject: [PATCH] Laboratorio terminado --- .../.ipynb_checkpoints/main-checkpoint.ipynb | 1881 ++++++++++++++++- your-code/main.ipynb | 1881 ++++++++++++++++- 2 files changed, 3702 insertions(+), 60 deletions(-) diff --git a/your-code/.ipynb_checkpoints/main-checkpoint.ipynb b/your-code/.ipynb_checkpoints/main-checkpoint.ipynb index 31724c5..d4d08bd 100644 --- a/your-code/.ipynb_checkpoints/main-checkpoint.ipynb +++ b/your-code/.ipynb_checkpoints/main-checkpoint.ipynb @@ -9,10 +9,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd" + ] }, { "cell_type": "markdown", @@ -23,10 +25,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pymysql\n", + "from sqlalchemy import create_engine" + ] }, { "cell_type": "markdown", @@ -37,10 +42,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "engine = create_engine('mysql+pymysql://guest:relational@relational.fit.cvut.cz')" + ] }, { "cell_type": "markdown", @@ -51,10 +58,327 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "#user_tab = pd.read_sql_query('SEELECT * FROM stats.users', engine)\n", + "user_tab = pd.read_sql_query('SELECT * FROM stats.users', \n", + " engine)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\n\\n<p>I'm ...050071920-1NaNNone
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0None
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0None
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\n\\n<p>form...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0None
.............................................
403205574312014-09-13 21:03:50AussieMeg2014-09-13 21:18:52NoneNoneNone0005026902NaNhttp://graph.facebook.com/665821703/picture?ty...
403215574462014-09-13 21:39:30Mia Maria2014-09-13 21:39:30NoneNoneNone1005026998NaNNone
40322557451012014-09-13 23:45:27tronbabylove2014-09-13 23:45:27NoneUnited StatesNone000481766NaNhttps://www.gravatar.com/avatar/faa7a3fdbd8308...
40323557461062014-09-14 00:29:41GPP2014-09-14 02:05:17NoneNone<p>Stats noobie, product, marketing &amp; medi...100976289NaNhttps://www.gravatar.com/avatar/6d9e9fa6b783a3...
403245574712014-09-14 01:01:44Shivam Agrawal2014-09-14 01:19:04NoneIndia<p>Maths Enthusiast </p>\\n0005027354NaNhttps://lh4.googleusercontent.com/-ZsXhwVaFmiY...
\n", + "

40325 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " Id Reputation CreationDate DisplayName \\\n", + "0 -1 1 2010-07-19 06:55:26 Community \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon \n", + "3 4 101 2010-07-19 19:03:27 Emmett \n", + "4 5 6792 2010-07-19 19:03:57 Shane \n", + "... ... ... ... ... \n", + "40320 55743 1 2014-09-13 21:03:50 AussieMeg \n", + "40321 55744 6 2014-09-13 21:39:30 Mia Maria \n", + "40322 55745 101 2014-09-13 23:45:27 tronbabylove \n", + "40323 55746 106 2014-09-14 00:29:41 GPP \n", + "40324 55747 1 2014-09-14 01:01:44 Shivam Agrawal \n", + "\n", + " LastAccessDate WebsiteUrl Location \\\n", + "0 2010-07-19 06:55:26 http://meta.stackexchange.com/ on the server farm \n", + "1 2013-11-12 22:07:23 http://stackoverflow.com Corvallis, OR \n", + "2 2014-08-08 06:42:58 http://stackoverflow.com New York, NY \n", + "3 2014-01-02 09:31:02 http://minesweeperonline.com San Francisco, CA \n", + "4 2014-08-13 00:23:47 http://www.statalgo.com New York, NY \n", + "... ... ... ... \n", + "40320 2014-09-13 21:18:52 None None \n", + "40321 2014-09-13 21:39:30 None None \n", + "40322 2014-09-13 23:45:27 None United States \n", + "40323 2014-09-14 02:05:17 None None \n", + "40324 2014-09-14 01:19:04 None India \n", + "\n", + " AboutMe Views UpVotes \\\n", + "0

Hi, I'm not really a person.

\\n\\n

I'm ... 0 5007 \n", + "1

Developer on the StackOverflow team. Find ... 25 3 \n", + "2

currently at a startup in SF

\\n\\n

form... 11 0 \n", + "4

Quantitative researcher focusing on statist... 1145 662 \n", + "... ... ... ... \n", + "40320 None 0 0 \n", + "40321 None 1 0 \n", + "40322 None 0 0 \n", + "40323

Stats noobie, product, marketing & medi... 1 0 \n", + "40324

Maths Enthusiast

\\n 0 0 \n", + "\n", + " DownVotes AccountId Age \\\n", + "0 1920 -1 NaN \n", + "1 0 2 37.0 \n", + "2 0 3 35.0 \n", + "3 0 1998 28.0 \n", + "4 5 54503 35.0 \n", + "... ... ... ... \n", + "40320 0 5026902 NaN \n", + "40321 0 5026998 NaN \n", + "40322 0 481766 NaN \n", + "40323 0 976289 NaN \n", + "40324 0 5027354 NaN \n", + "\n", + " ProfileImageUrl \n", + "0 None \n", + "1 None \n", + "2 None \n", + "3 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 None \n", + "... ... \n", + "40320 http://graph.facebook.com/665821703/picture?ty... \n", + "40321 None \n", + "40322 https://www.gravatar.com/avatar/faa7a3fdbd8308... \n", + "40323 https://www.gravatar.com/avatar/6d9e9fa6b783a3... \n", + "40324 https://lh4.googleusercontent.com/-ZsXhwVaFmiY... \n", + "\n", + "[40325 rows x 14 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_tab" + ] }, { "cell_type": "markdown", @@ -65,10 +389,317 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\n\\n<p>I'm ...050071920-1NaNNone
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0None
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0None
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\n\\n<p>form...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0None
.............................................
403205574312014-09-13 21:03:50AussieMeg2014-09-13 21:18:52NoneNoneNone0005026902NaNhttp://graph.facebook.com/665821703/picture?ty...
403215574462014-09-13 21:39:30Mia Maria2014-09-13 21:39:30NoneNoneNone1005026998NaNNone
40322557451012014-09-13 23:45:27tronbabylove2014-09-13 23:45:27NoneUnited StatesNone000481766NaNhttps://www.gravatar.com/avatar/faa7a3fdbd8308...
40323557461062014-09-14 00:29:41GPP2014-09-14 02:05:17NoneNone<p>Stats noobie, product, marketing &amp; medi...100976289NaNhttps://www.gravatar.com/avatar/6d9e9fa6b783a3...
403245574712014-09-14 01:01:44Shivam Agrawal2014-09-14 01:19:04NoneIndia<p>Maths Enthusiast </p>\\n0005027354NaNhttps://lh4.googleusercontent.com/-ZsXhwVaFmiY...
\n", + "

40325 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " userId Reputation CreationDate DisplayName \\\n", + "0 -1 1 2010-07-19 06:55:26 Community \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon \n", + "3 4 101 2010-07-19 19:03:27 Emmett \n", + "4 5 6792 2010-07-19 19:03:57 Shane \n", + "... ... ... ... ... \n", + "40320 55743 1 2014-09-13 21:03:50 AussieMeg \n", + "40321 55744 6 2014-09-13 21:39:30 Mia Maria \n", + "40322 55745 101 2014-09-13 23:45:27 tronbabylove \n", + "40323 55746 106 2014-09-14 00:29:41 GPP \n", + "40324 55747 1 2014-09-14 01:01:44 Shivam Agrawal \n", + "\n", + " LastAccessDate WebsiteUrl Location \\\n", + "0 2010-07-19 06:55:26 http://meta.stackexchange.com/ on the server farm \n", + "1 2013-11-12 22:07:23 http://stackoverflow.com Corvallis, OR \n", + "2 2014-08-08 06:42:58 http://stackoverflow.com New York, NY \n", + "3 2014-01-02 09:31:02 http://minesweeperonline.com San Francisco, CA \n", + "4 2014-08-13 00:23:47 http://www.statalgo.com New York, NY \n", + "... ... ... ... \n", + "40320 2014-09-13 21:18:52 None None \n", + "40321 2014-09-13 21:39:30 None None \n", + "40322 2014-09-13 23:45:27 None United States \n", + "40323 2014-09-14 02:05:17 None None \n", + "40324 2014-09-14 01:19:04 None India \n", + "\n", + " AboutMe Views UpVotes \\\n", + "0

Hi, I'm not really a person.

\\n\\n

I'm ... 0 5007 \n", + "1

Developer on the StackOverflow team. Find ... 25 3 \n", + "2

currently at a startup in SF

\\n\\n

form... 11 0 \n", + "4

Quantitative researcher focusing on statist... 1145 662 \n", + "... ... ... ... \n", + "40320 None 0 0 \n", + "40321 None 1 0 \n", + "40322 None 0 0 \n", + "40323

Stats noobie, product, marketing & medi... 1 0 \n", + "40324

Maths Enthusiast

\\n 0 0 \n", + "\n", + " DownVotes AccountId Age \\\n", + "0 1920 -1 NaN \n", + "1 0 2 37.0 \n", + "2 0 3 35.0 \n", + "3 0 1998 28.0 \n", + "4 5 54503 35.0 \n", + "... ... ... ... \n", + "40320 0 5026902 NaN \n", + "40321 0 5026998 NaN \n", + "40322 0 481766 NaN \n", + "40323 0 976289 NaN \n", + "40324 0 5027354 NaN \n", + "\n", + " ProfileImageUrl \n", + "0 None \n", + "1 None \n", + "2 None \n", + "3 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 None \n", + "... ... \n", + "40320 http://graph.facebook.com/665821703/picture?ty... \n", + "40321 None \n", + "40322 https://www.gravatar.com/avatar/faa7a3fdbd8308... \n", + "40323 https://www.gravatar.com/avatar/6d9e9fa6b783a3... \n", + "40324 https://lh4.googleusercontent.com/-ZsXhwVaFmiY... \n", + "\n", + "[40325 rows x 14 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_tab = user_tab.rename(columns={'Id':'userId'})\n", + "user_tab" + ] }, { "cell_type": "markdown", @@ -79,10 +710,428 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyOwnerUserIdLasActivityDateTitle...AnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts...5.0114.0NaNNaTNaTNaNNaTNoneNone
12159.02010-07-19 19:12:57228198.0<p>In many different statistical methods there...24.02012-11-12 09:21:54What is normality?...7.018.088.02010-08-07 17:56:44NaTNaNNaTNoneNone
2315.02010-07-19 19:13:28543613.0<p>What are some valuable Statistical Analysis...18.02013-05-27 14:48:36What are some valuable Statistical Analysis op......19.0436.0183.02011-02-12 05:50:032010-07-19 19:13:28NaNNaTNoneNone
341135.02010-07-19 19:13:31135224.0<p>I have two groups of data. Each with a dif...23.02010-09-08 03:00:19Assessing the significance of differences in d......5.022.0NaNNaTNaTNaNNaTNoneNone
452NaN2010-07-19 19:14:4381NaN<p>The R-project</p>\\n\\n<p><a href=\"http://www...23.02010-07-19 19:21:15None...NaN3NaN23.02010-07-19 19:21:152010-07-19 19:14:433.0NaTNoneNone
..................................................................
919711153742NaN2014-09-13 23:45:392NaN<p>This grew too long for a comment, but I thi...805.02014-09-14 02:05:41None...NaN2NaN805.02014-09-14 02:05:41NaT115367.0NaTNoneNone
919721153751NaN2014-09-13 23:46:0509.0<p>Assume a classification problem where there...49365.02014-09-14 02:09:23Detecting a consistent pattern in a dataset vi......1.00NaNNaNNaTNaTNaNNaTNoneNone
919731153761NaN2014-09-14 01:27:5415.0<p>My goal is to create a formula that can giv...55746.02014-09-14 01:40:55How to project video viewcount based on histor......0.02NaN7290.02014-09-14 01:40:55NaTNaNNaTNoneNone
919741153772NaN2014-09-14 02:03:280NaN<p>As a practical answer to the real questions...805.02014-09-14 02:54:13None...NaN0NaN805.02014-09-14 02:54:13NaT115358.0NaTNoneNone
919751153782NaN2014-09-14 02:09:230NaN<p>Decision trees are notoriously <strong>unst...7250.02014-09-14 02:09:23None...NaN0NaNNaNNaTNaT115375.0NaTNoneNone
\n", + "

91976 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " Id PostTypeId AcceptedAnswerId CreaionDate Score \\\n", + "0 1 1 15.0 2010-07-19 19:12:12 23 \n", + "1 2 1 59.0 2010-07-19 19:12:57 22 \n", + "2 3 1 5.0 2010-07-19 19:13:28 54 \n", + "3 4 1 135.0 2010-07-19 19:13:31 13 \n", + "4 5 2 NaN 2010-07-19 19:14:43 81 \n", + "... ... ... ... ... ... \n", + "91971 115374 2 NaN 2014-09-13 23:45:39 2 \n", + "91972 115375 1 NaN 2014-09-13 23:46:05 0 \n", + "91973 115376 1 NaN 2014-09-14 01:27:54 1 \n", + "91974 115377 2 NaN 2014-09-14 02:03:28 0 \n", + "91975 115378 2 NaN 2014-09-14 02:09:23 0 \n", + "\n", + " ViewCount Body \\\n", + "0 1278.0

How should I elicit prior distributions fro... \n", + "1 8198.0

In many different statistical methods there... \n", + "2 3613.0

What are some valuable Statistical Analysis... \n", + "3 5224.0

I have two groups of data. Each with a dif... \n", + "4 NaN

The R-project

\\n\\n

This grew too long for a comment, but I thi... \n", + "91972 9.0

Assume a classification problem where there... \n", + "91973 5.0

My goal is to create a formula that can giv... \n", + "91974 NaN

As a practical answer to the real questions... \n", + "91975 NaN

Decision trees are notoriously unst... \n", + "\n", + " OwnerUserId LasActivityDate \\\n", + "0 8.0 2010-09-15 21:08:26 \n", + "1 24.0 2012-11-12 09:21:54 \n", + "2 18.0 2013-05-27 14:48:36 \n", + "3 23.0 2010-09-08 03:00:19 \n", + "4 23.0 2010-07-19 19:21:15 \n", + "... ... ... \n", + "91971 805.0 2014-09-14 02:05:41 \n", + "91972 49365.0 2014-09-14 02:09:23 \n", + "91973 55746.0 2014-09-14 01:40:55 \n", + "91974 805.0 2014-09-14 02:54:13 \n", + "91975 7250.0 2014-09-14 02:09:23 \n", + "\n", + " Title ... AnswerCount \\\n", + "0 Eliciting priors from experts ... 5.0 \n", + "1 What is normality? ... 7.0 \n", + "2 What are some valuable Statistical Analysis op... ... 19.0 \n", + "3 Assessing the significance of differences in d... ... 5.0 \n", + "4 None ... NaN \n", + "... ... ... ... \n", + "91971 None ... NaN \n", + "91972 Detecting a consistent pattern in a dataset vi... ... 1.0 \n", + "91973 How to project video viewcount based on histor... ... 0.0 \n", + "91974 None ... NaN \n", + "91975 None ... NaN \n", + "\n", + " CommentCount FavoriteCount LastEditorUserId LastEditDate \\\n", + "0 1 14.0 NaN NaT \n", + "1 1 8.0 88.0 2010-08-07 17:56:44 \n", + "2 4 36.0 183.0 2011-02-12 05:50:03 \n", + "3 2 2.0 NaN NaT \n", + "4 3 NaN 23.0 2010-07-19 19:21:15 \n", + "... ... ... ... ... \n", + "91971 2 NaN 805.0 2014-09-14 02:05:41 \n", + "91972 0 NaN NaN NaT \n", + "91973 2 NaN 7290.0 2014-09-14 01:40:55 \n", + "91974 0 NaN 805.0 2014-09-14 02:54:13 \n", + "91975 0 NaN NaN NaT \n", + "\n", + " CommunityOwnedDate ParentId ClosedDate OwnerDisplayName \\\n", + "0 NaT NaN NaT None \n", + "1 NaT NaN NaT None \n", + "2 2010-07-19 19:13:28 NaN NaT None \n", + "3 NaT NaN NaT None \n", + "4 2010-07-19 19:14:43 3.0 NaT None \n", + "... ... ... ... ... \n", + "91971 NaT 115367.0 NaT None \n", + "91972 NaT NaN NaT None \n", + "91973 NaT NaN NaT None \n", + "91974 NaT 115358.0 NaT None \n", + "91975 NaT 115375.0 NaT None \n", + "\n", + " LastEditorDisplayName \n", + "0 None \n", + "1 None \n", + "2 None \n", + "3 None \n", + "4 None \n", + "... ... \n", + "91971 None \n", + "91972 None \n", + "91973 None \n", + "91974 None \n", + "91975 None \n", + "\n", + "[91976 rows x 21 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts_tab = pd.read_sql_query('SELECT * FROM stats.posts', \n", + " engine)\n", + "posts_tab" + ] }, { "cell_type": "markdown", @@ -93,10 +1142,414 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postIdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyuserIdLasActivityDateTitle...AnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts...5.0114.0NaNNaTNaTNaNNaTNoneNone
12159.02010-07-19 19:12:57228198.0<p>In many different statistical methods there...24.02012-11-12 09:21:54What is normality?...7.018.088.02010-08-07 17:56:44NaTNaNNaTNoneNone
2315.02010-07-19 19:13:28543613.0<p>What are some valuable Statistical Analysis...18.02013-05-27 14:48:36What are some valuable Statistical Analysis op......19.0436.0183.02011-02-12 05:50:032010-07-19 19:13:28NaNNaTNoneNone
341135.02010-07-19 19:13:31135224.0<p>I have two groups of data. Each with a dif...23.02010-09-08 03:00:19Assessing the significance of differences in d......5.022.0NaNNaTNaTNaNNaTNoneNone
452NaN2010-07-19 19:14:4381NaN<p>The R-project</p>\\n\\n<p><a href=\"http://www...23.02010-07-19 19:21:15None...NaN3NaN23.02010-07-19 19:21:152010-07-19 19:14:433.0NaTNoneNone
..................................................................
919711153742NaN2014-09-13 23:45:392NaN<p>This grew too long for a comment, but I thi...805.02014-09-14 02:05:41None...NaN2NaN805.02014-09-14 02:05:41NaT115367.0NaTNoneNone
919721153751NaN2014-09-13 23:46:0509.0<p>Assume a classification problem where there...49365.02014-09-14 02:09:23Detecting a consistent pattern in a dataset vi......1.00NaNNaNNaTNaTNaNNaTNoneNone
919731153761NaN2014-09-14 01:27:5415.0<p>My goal is to create a formula that can giv...55746.02014-09-14 01:40:55How to project video viewcount based on histor......0.02NaN7290.02014-09-14 01:40:55NaTNaNNaTNoneNone
919741153772NaN2014-09-14 02:03:280NaN<p>As a practical answer to the real questions...805.02014-09-14 02:54:13None...NaN0NaN805.02014-09-14 02:54:13NaT115358.0NaTNoneNone
919751153782NaN2014-09-14 02:09:230NaN<p>Decision trees are notoriously <strong>unst...7250.02014-09-14 02:09:23None...NaN0NaNNaNNaTNaT115375.0NaTNoneNone
\n", + "

91976 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " postId PostTypeId AcceptedAnswerId CreaionDate Score \\\n", + "0 1 1 15.0 2010-07-19 19:12:12 23 \n", + "1 2 1 59.0 2010-07-19 19:12:57 22 \n", + "2 3 1 5.0 2010-07-19 19:13:28 54 \n", + "3 4 1 135.0 2010-07-19 19:13:31 13 \n", + "4 5 2 NaN 2010-07-19 19:14:43 81 \n", + "... ... ... ... ... ... \n", + "91971 115374 2 NaN 2014-09-13 23:45:39 2 \n", + "91972 115375 1 NaN 2014-09-13 23:46:05 0 \n", + "91973 115376 1 NaN 2014-09-14 01:27:54 1 \n", + "91974 115377 2 NaN 2014-09-14 02:03:28 0 \n", + "91975 115378 2 NaN 2014-09-14 02:09:23 0 \n", + "\n", + " ViewCount Body userId \\\n", + "0 1278.0

How should I elicit prior distributions fro... 8.0 \n", + "1 8198.0

In many different statistical methods there... 24.0 \n", + "2 3613.0

What are some valuable Statistical Analysis... 18.0 \n", + "3 5224.0

I have two groups of data. Each with a dif... 23.0 \n", + "4 NaN

The R-project

\\n\\n

This grew too long for a comment, but I thi... 805.0 \n", + "91972 9.0

Assume a classification problem where there... 49365.0 \n", + "91973 5.0

My goal is to create a formula that can giv... 55746.0 \n", + "91974 NaN

As a practical answer to the real questions... 805.0 \n", + "91975 NaN

Decision trees are notoriously unst... 7250.0 \n", + "\n", + " LasActivityDate Title \\\n", + "0 2010-09-15 21:08:26 Eliciting priors from experts \n", + "1 2012-11-12 09:21:54 What is normality? \n", + "2 2013-05-27 14:48:36 What are some valuable Statistical Analysis op... \n", + "3 2010-09-08 03:00:19 Assessing the significance of differences in d... \n", + "4 2010-07-19 19:21:15 None \n", + "... ... ... \n", + "91971 2014-09-14 02:05:41 None \n", + "91972 2014-09-14 02:09:23 Detecting a consistent pattern in a dataset vi... \n", + "91973 2014-09-14 01:40:55 How to project video viewcount based on histor... \n", + "91974 2014-09-14 02:54:13 None \n", + "91975 2014-09-14 02:09:23 None \n", + "\n", + " ... AnswerCount CommentCount FavoriteCount LastEditorUserId \\\n", + "0 ... 5.0 1 14.0 NaN \n", + "1 ... 7.0 1 8.0 88.0 \n", + "2 ... 19.0 4 36.0 183.0 \n", + "3 ... 5.0 2 2.0 NaN \n", + "4 ... NaN 3 NaN 23.0 \n", + "... ... ... ... ... ... \n", + "91971 ... NaN 2 NaN 805.0 \n", + "91972 ... 1.0 0 NaN NaN \n", + "91973 ... 0.0 2 NaN 7290.0 \n", + "91974 ... NaN 0 NaN 805.0 \n", + "91975 ... NaN 0 NaN NaN \n", + "\n", + " LastEditDate CommunityOwnedDate ParentId ClosedDate \\\n", + "0 NaT NaT NaN NaT \n", + "1 2010-08-07 17:56:44 NaT NaN NaT \n", + "2 2011-02-12 05:50:03 2010-07-19 19:13:28 NaN NaT \n", + "3 NaT NaT NaN NaT \n", + "4 2010-07-19 19:21:15 2010-07-19 19:14:43 3.0 NaT \n", + "... ... ... ... ... \n", + "91971 2014-09-14 02:05:41 NaT 115367.0 NaT \n", + "91972 NaT NaT NaN NaT \n", + "91973 2014-09-14 01:40:55 NaT NaN NaT \n", + "91974 2014-09-14 02:54:13 NaT 115358.0 NaT \n", + "91975 NaT NaT 115375.0 NaT \n", + "\n", + " OwnerDisplayName LastEditorDisplayName \n", + "0 None None \n", + "1 None None \n", + "2 None None \n", + "3 None None \n", + "4 None None \n", + "... ... ... \n", + "91971 None None \n", + "91972 None None \n", + "91973 None None \n", + "91974 None None \n", + "91975 None None \n", + "\n", + "[91976 rows x 21 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts_tab.rename(columns={'Id':'postId', 'OwnerUserId':'userId'}, inplace=True)\n", + "posts_tab" + ] }, { "cell_type": "markdown", @@ -109,10 +1562,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "users_columns = user_tab[['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']]\n", + "posts_columns = posts_tab[['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']]" + ] }, { "cell_type": "markdown", @@ -126,8 +1582,217 @@ "cell_type": "code", "execution_count": 11, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotespostIdScoreViewCountCommentCount
0-1105007192021750NaN0
1-1105007192085760NaN0
2-1105007192085780NaN0
3-1105007192089810NaN0
4-1105007192089820NaN0
..............................
90579557341000115352016.00
905805573811000115360240.04
90581557426000115366117.00
90582557446100115370113.02
905835574610610011537615.02
\n", + "

90584 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes postId Score \\\n", + "0 -1 1 0 5007 1920 2175 0 \n", + "1 -1 1 0 5007 1920 8576 0 \n", + "2 -1 1 0 5007 1920 8578 0 \n", + "3 -1 1 0 5007 1920 8981 0 \n", + "4 -1 1 0 5007 1920 8982 0 \n", + "... ... ... ... ... ... ... ... \n", + "90579 55734 1 0 0 0 115352 0 \n", + "90580 55738 11 0 0 0 115360 2 \n", + "90581 55742 6 0 0 0 115366 1 \n", + "90582 55744 6 1 0 0 115370 1 \n", + "90583 55746 106 1 0 0 115376 1 \n", + "\n", + " ViewCount CommentCount \n", + "0 NaN 0 \n", + "1 NaN 0 \n", + "2 NaN 0 \n", + "3 NaN 0 \n", + "4 NaN 0 \n", + "... ... ... \n", + "90579 16.0 0 \n", + "90580 40.0 4 \n", + "90581 17.0 0 \n", + "90582 13.0 2 \n", + "90583 5.0 2 \n", + "\n", + "[90584 rows x 9 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_merge = pd.merge(left=users_columns, right=posts_columns, left_on='userId',\n", + " right_on='userId')\n", + "data_merge" + ] }, { "cell_type": "markdown", @@ -138,10 +1803,67 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 90584 entries, 0 to 90583\n", + "Data columns (total 9 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 userId 90584 non-null int64 \n", + " 1 Reputation 90584 non-null int64 \n", + " 2 Views 90584 non-null int64 \n", + " 3 UpVotes 90584 non-null int64 \n", + " 4 DownVotes 90584 non-null int64 \n", + " 5 postId 90584 non-null int64 \n", + " 6 Score 90584 non-null int64 \n", + " 7 ViewCount 42188 non-null float64\n", + " 8 CommentCount 90584 non-null int64 \n", + "dtypes: float64(1), int64(8)\n", + "memory usage: 6.9 MB\n" + ] + } + ], + "source": [ + "data_merge.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId 0\n", + "Reputation 0\n", + "Views 0\n", + "UpVotes 0\n", + "DownVotes 0\n", + "postId 0\n", + "Score 0\n", + "ViewCount 48396\n", + "CommentCount 0\n", + "dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nulls_cols = data_merge.isnull().sum()\n", + "nulls_cols" + ] }, { "cell_type": "markdown", @@ -153,10 +1875,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_merge[['ViewCount']] = data_merge[['ViewCount']].fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "userId 0\n", + "Reputation 0\n", + "Views 0\n", + "UpVotes 0\n", + "DownVotes 0\n", + "postId 0\n", + "Score 0\n", + "ViewCount 0\n", + "CommentCount 0\n", + "dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_merge.isnull().sum()" + ] }, { "cell_type": "markdown", @@ -167,10 +1920,78 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 90584 entries, 0 to 90583\n", + "Data columns (total 9 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 userId 90584 non-null int64 \n", + " 1 Reputation 90584 non-null int64 \n", + " 2 Views 90584 non-null int64 \n", + " 3 UpVotes 90584 non-null int64 \n", + " 4 DownVotes 90584 non-null int64 \n", + " 5 postId 90584 non-null int64 \n", + " 6 Score 90584 non-null int64 \n", + " 7 ViewCount 90584 non-null float64\n", + " 8 CommentCount 90584 non-null int64 \n", + "dtypes: float64(1), int64(8)\n", + "memory usage: 6.9 MB\n" + ] + } + ], + "source": [ + "data_merge.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_merge = data_merge.astype({'ViewCount': int})" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 90584 entries, 0 to 90583\n", + "Data columns (total 9 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 userId 90584 non-null int64\n", + " 1 Reputation 90584 non-null int64\n", + " 2 Views 90584 non-null int64\n", + " 3 UpVotes 90584 non-null int64\n", + " 4 DownVotes 90584 non-null int64\n", + " 5 postId 90584 non-null int64\n", + " 6 Score 90584 non-null int64\n", + " 7 ViewCount 90584 non-null int64\n", + " 8 CommentCount 90584 non-null int64\n", + "dtypes: int64(9)\n", + "memory usage: 6.9 MB\n" + ] + } + ], + "source": [ + "data_merge.info()" + ] }, { "cell_type": "markdown", @@ -182,7 +2003,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -196,7 +2017,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.9.7" } }, "nbformat": 4, diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 31724c5..d4d08bd 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -9,10 +9,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd" + ] }, { "cell_type": "markdown", @@ -23,10 +25,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pymysql\n", + "from sqlalchemy import create_engine" + ] }, { "cell_type": "markdown", @@ -37,10 +42,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "engine = create_engine('mysql+pymysql://guest:relational@relational.fit.cvut.cz')" + ] }, { "cell_type": "markdown", @@ -51,10 +58,327 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "#user_tab = pd.read_sql_query('SEELECT * FROM stats.users', engine)\n", + "user_tab = pd.read_sql_query('SELECT * FROM stats.users', \n", + " engine)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\n\\n<p>I'm ...050071920-1NaNNone
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0None
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0None
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\n\\n<p>form...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0None
.............................................
403205574312014-09-13 21:03:50AussieMeg2014-09-13 21:18:52NoneNoneNone0005026902NaNhttp://graph.facebook.com/665821703/picture?ty...
403215574462014-09-13 21:39:30Mia Maria2014-09-13 21:39:30NoneNoneNone1005026998NaNNone
40322557451012014-09-13 23:45:27tronbabylove2014-09-13 23:45:27NoneUnited StatesNone000481766NaNhttps://www.gravatar.com/avatar/faa7a3fdbd8308...
40323557461062014-09-14 00:29:41GPP2014-09-14 02:05:17NoneNone<p>Stats noobie, product, marketing &amp; medi...100976289NaNhttps://www.gravatar.com/avatar/6d9e9fa6b783a3...
403245574712014-09-14 01:01:44Shivam Agrawal2014-09-14 01:19:04NoneIndia<p>Maths Enthusiast </p>\\n0005027354NaNhttps://lh4.googleusercontent.com/-ZsXhwVaFmiY...
\n", + "

40325 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " Id Reputation CreationDate DisplayName \\\n", + "0 -1 1 2010-07-19 06:55:26 Community \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon \n", + "3 4 101 2010-07-19 19:03:27 Emmett \n", + "4 5 6792 2010-07-19 19:03:57 Shane \n", + "... ... ... ... ... \n", + "40320 55743 1 2014-09-13 21:03:50 AussieMeg \n", + "40321 55744 6 2014-09-13 21:39:30 Mia Maria \n", + "40322 55745 101 2014-09-13 23:45:27 tronbabylove \n", + "40323 55746 106 2014-09-14 00:29:41 GPP \n", + "40324 55747 1 2014-09-14 01:01:44 Shivam Agrawal \n", + "\n", + " LastAccessDate WebsiteUrl Location \\\n", + "0 2010-07-19 06:55:26 http://meta.stackexchange.com/ on the server farm \n", + "1 2013-11-12 22:07:23 http://stackoverflow.com Corvallis, OR \n", + "2 2014-08-08 06:42:58 http://stackoverflow.com New York, NY \n", + "3 2014-01-02 09:31:02 http://minesweeperonline.com San Francisco, CA \n", + "4 2014-08-13 00:23:47 http://www.statalgo.com New York, NY \n", + "... ... ... ... \n", + "40320 2014-09-13 21:18:52 None None \n", + "40321 2014-09-13 21:39:30 None None \n", + "40322 2014-09-13 23:45:27 None United States \n", + "40323 2014-09-14 02:05:17 None None \n", + "40324 2014-09-14 01:19:04 None India \n", + "\n", + " AboutMe Views UpVotes \\\n", + "0

Hi, I'm not really a person.

\\n\\n

I'm ... 0 5007 \n", + "1

Developer on the StackOverflow team. Find ... 25 3 \n", + "2

\\n\\n

form... 11 0 \n", + "4

Quantitative researcher focusing on statist... 1145 662 \n", + "... ... ... ... \n", + "40320 None 0 0 \n", + "40321 None 1 0 \n", + "40322 None 0 0 \n", + "40323

Stats noobie, product, marketing & medi... 1 0 \n", + "40324

Maths Enthusiast

\\n 0 0 \n", + "\n", + " DownVotes AccountId Age \\\n", + "0 1920 -1 NaN \n", + "1 0 2 37.0 \n", + "2 0 3 35.0 \n", + "3 0 1998 28.0 \n", + "4 5 54503 35.0 \n", + "... ... ... ... \n", + "40320 0 5026902 NaN \n", + "40321 0 5026998 NaN \n", + "40322 0 481766 NaN \n", + "40323 0 976289 NaN \n", + "40324 0 5027354 NaN \n", + "\n", + " ProfileImageUrl \n", + "0 None \n", + "1 None \n", + "2 None \n", + "3 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 None \n", + "... ... \n", + "40320 http://graph.facebook.com/665821703/picture?ty... \n", + "40321 None \n", + "40322 https://www.gravatar.com/avatar/faa7a3fdbd8308... \n", + "40323 https://www.gravatar.com/avatar/6d9e9fa6b783a3... \n", + "40324 https://lh4.googleusercontent.com/-ZsXhwVaFmiY... \n", + "\n", + "[40325 rows x 14 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_tab" + ] }, { "cell_type": "markdown", @@ -65,10 +389,317 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\n\\n<p>I'm ...050071920-1NaNNone
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0None
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0None
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\n\\n<p>form...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0None
.............................................
403205574312014-09-13 21:03:50AussieMeg2014-09-13 21:18:52NoneNoneNone0005026902NaNhttp://graph.facebook.com/665821703/picture?ty...
403215574462014-09-13 21:39:30Mia Maria2014-09-13 21:39:30NoneNoneNone1005026998NaNNone
40322557451012014-09-13 23:45:27tronbabylove2014-09-13 23:45:27NoneUnited StatesNone000481766NaNhttps://www.gravatar.com/avatar/faa7a3fdbd8308...
40323557461062014-09-14 00:29:41GPP2014-09-14 02:05:17NoneNone<p>Stats noobie, product, marketing &amp; medi...100976289NaNhttps://www.gravatar.com/avatar/6d9e9fa6b783a3...
403245574712014-09-14 01:01:44Shivam Agrawal2014-09-14 01:19:04NoneIndia<p>Maths Enthusiast </p>\\n0005027354NaNhttps://lh4.googleusercontent.com/-ZsXhwVaFmiY...
\n", + "

40325 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " userId Reputation CreationDate DisplayName \\\n", + "0 -1 1 2010-07-19 06:55:26 Community \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon \n", + "3 4 101 2010-07-19 19:03:27 Emmett \n", + "4 5 6792 2010-07-19 19:03:57 Shane \n", + "... ... ... ... ... \n", + "40320 55743 1 2014-09-13 21:03:50 AussieMeg \n", + "40321 55744 6 2014-09-13 21:39:30 Mia Maria \n", + "40322 55745 101 2014-09-13 23:45:27 tronbabylove \n", + "40323 55746 106 2014-09-14 00:29:41 GPP \n", + "40324 55747 1 2014-09-14 01:01:44 Shivam Agrawal \n", + "\n", + " LastAccessDate WebsiteUrl Location \\\n", + "0 2010-07-19 06:55:26 http://meta.stackexchange.com/ on the server farm \n", + "1 2013-11-12 22:07:23 http://stackoverflow.com Corvallis, OR \n", + "2 2014-08-08 06:42:58 http://stackoverflow.com New York, NY \n", + "3 2014-01-02 09:31:02 http://minesweeperonline.com San Francisco, CA \n", + "4 2014-08-13 00:23:47 http://www.statalgo.com New York, NY \n", + "... ... ... ... \n", + "40320 2014-09-13 21:18:52 None None \n", + "40321 2014-09-13 21:39:30 None None \n", + "40322 2014-09-13 23:45:27 None United States \n", + "40323 2014-09-14 02:05:17 None None \n", + "40324 2014-09-14 01:19:04 None India \n", + "\n", + " AboutMe Views UpVotes \\\n", + "0

Hi, I'm not really a person.

\\n\\n

I'm ... 0 5007 \n", + "1

Developer on the StackOverflow team. Find ... 25 3 \n", + "2

currently at a startup in SF

\\n\\n

form... 11 0 \n", + "4

Quantitative researcher focusing on statist... 1145 662 \n", + "... ... ... ... \n", + "40320 None 0 0 \n", + "40321 None 1 0 \n", + "40322 None 0 0 \n", + "40323

Stats noobie, product, marketing & medi... 1 0 \n", + "40324

Maths Enthusiast

\\n 0 0 \n", + "\n", + " DownVotes AccountId Age \\\n", + "0 1920 -1 NaN \n", + "1 0 2 37.0 \n", + "2 0 3 35.0 \n", + "3 0 1998 28.0 \n", + "4 5 54503 35.0 \n", + "... ... ... ... \n", + "40320 0 5026902 NaN \n", + "40321 0 5026998 NaN \n", + "40322 0 481766 NaN \n", + "40323 0 976289 NaN \n", + "40324 0 5027354 NaN \n", + "\n", + " ProfileImageUrl \n", + "0 None \n", + "1 None \n", + "2 None \n", + "3 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 None \n", + "... ... \n", + "40320 http://graph.facebook.com/665821703/picture?ty... \n", + "40321 None \n", + "40322 https://www.gravatar.com/avatar/faa7a3fdbd8308... \n", + "40323 https://www.gravatar.com/avatar/6d9e9fa6b783a3... \n", + "40324 https://lh4.googleusercontent.com/-ZsXhwVaFmiY... \n", + "\n", + "[40325 rows x 14 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_tab = user_tab.rename(columns={'Id':'userId'})\n", + "user_tab" + ] }, { "cell_type": "markdown", @@ -79,10 +710,428 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyOwnerUserIdLasActivityDateTitle...AnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts...5.0114.0NaNNaTNaTNaNNaTNoneNone
12159.02010-07-19 19:12:57228198.0<p>In many different statistical methods there...24.02012-11-12 09:21:54What is normality?...7.018.088.02010-08-07 17:56:44NaTNaNNaTNoneNone
2315.02010-07-19 19:13:28543613.0<p>What are some valuable Statistical Analysis...18.02013-05-27 14:48:36What are some valuable Statistical Analysis op......19.0436.0183.02011-02-12 05:50:032010-07-19 19:13:28NaNNaTNoneNone
341135.02010-07-19 19:13:31135224.0<p>I have two groups of data. Each with a dif...23.02010-09-08 03:00:19Assessing the significance of differences in d......5.022.0NaNNaTNaTNaNNaTNoneNone
452NaN2010-07-19 19:14:4381NaN<p>The R-project</p>\\n\\n<p><a href=\"http://www...23.02010-07-19 19:21:15None...NaN3NaN23.02010-07-19 19:21:152010-07-19 19:14:433.0NaTNoneNone
..................................................................
919711153742NaN2014-09-13 23:45:392NaN<p>This grew too long for a comment, but I thi...805.02014-09-14 02:05:41None...NaN2NaN805.02014-09-14 02:05:41NaT115367.0NaTNoneNone
919721153751NaN2014-09-13 23:46:0509.0<p>Assume a classification problem where there...49365.02014-09-14 02:09:23Detecting a consistent pattern in a dataset vi......1.00NaNNaNNaTNaTNaNNaTNoneNone
919731153761NaN2014-09-14 01:27:5415.0<p>My goal is to create a formula that can giv...55746.02014-09-14 01:40:55How to project video viewcount based on histor......0.02NaN7290.02014-09-14 01:40:55NaTNaNNaTNoneNone
919741153772NaN2014-09-14 02:03:280NaN<p>As a practical answer to the real questions...805.02014-09-14 02:54:13None...NaN0NaN805.02014-09-14 02:54:13NaT115358.0NaTNoneNone
919751153782NaN2014-09-14 02:09:230NaN<p>Decision trees are notoriously <strong>unst...7250.02014-09-14 02:09:23None...NaN0NaNNaNNaTNaT115375.0NaTNoneNone
\n", + "

91976 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " Id PostTypeId AcceptedAnswerId CreaionDate Score \\\n", + "0 1 1 15.0 2010-07-19 19:12:12 23 \n", + "1 2 1 59.0 2010-07-19 19:12:57 22 \n", + "2 3 1 5.0 2010-07-19 19:13:28 54 \n", + "3 4 1 135.0 2010-07-19 19:13:31 13 \n", + "4 5 2 NaN 2010-07-19 19:14:43 81 \n", + "... ... ... ... ... ... \n", + "91971 115374 2 NaN 2014-09-13 23:45:39 2 \n", + "91972 115375 1 NaN 2014-09-13 23:46:05 0 \n", + "91973 115376 1 NaN 2014-09-14 01:27:54 1 \n", + "91974 115377 2 NaN 2014-09-14 02:03:28 0 \n", + "91975 115378 2 NaN 2014-09-14 02:09:23 0 \n", + "\n", + " ViewCount Body \\\n", + "0 1278.0

How should I elicit prior distributions fro... \n", + "1 8198.0

In many different statistical methods there... \n", + "2 3613.0

What are some valuable Statistical Analysis... \n", + "3 5224.0

I have two groups of data. Each with a dif... \n", + "4 NaN

The R-project

\\n\\n

This grew too long for a comment, but I thi... \n", + "91972 9.0

Assume a classification problem where there... \n", + "91973 5.0

My goal is to create a formula that can giv... \n", + "91974 NaN

As a practical answer to the real questions... \n", + "91975 NaN

Decision trees are notoriously unst... \n", + "\n", + " OwnerUserId LasActivityDate \\\n", + "0 8.0 2010-09-15 21:08:26 \n", + "1 24.0 2012-11-12 09:21:54 \n", + "2 18.0 2013-05-27 14:48:36 \n", + "3 23.0 2010-09-08 03:00:19 \n", + "4 23.0 2010-07-19 19:21:15 \n", + "... ... ... \n", + "91971 805.0 2014-09-14 02:05:41 \n", + "91972 49365.0 2014-09-14 02:09:23 \n", + "91973 55746.0 2014-09-14 01:40:55 \n", + "91974 805.0 2014-09-14 02:54:13 \n", + "91975 7250.0 2014-09-14 02:09:23 \n", + "\n", + " Title ... AnswerCount \\\n", + "0 Eliciting priors from experts ... 5.0 \n", + "1 What is normality? ... 7.0 \n", + "2 What are some valuable Statistical Analysis op... ... 19.0 \n", + "3 Assessing the significance of differences in d... ... 5.0 \n", + "4 None ... NaN \n", + "... ... ... ... \n", + "91971 None ... NaN \n", + "91972 Detecting a consistent pattern in a dataset vi... ... 1.0 \n", + "91973 How to project video viewcount based on histor... ... 0.0 \n", + "91974 None ... NaN \n", + "91975 None ... NaN \n", + "\n", + " CommentCount FavoriteCount LastEditorUserId LastEditDate \\\n", + "0 1 14.0 NaN NaT \n", + "1 1 8.0 88.0 2010-08-07 17:56:44 \n", + "2 4 36.0 183.0 2011-02-12 05:50:03 \n", + "3 2 2.0 NaN NaT \n", + "4 3 NaN 23.0 2010-07-19 19:21:15 \n", + "... ... ... ... ... \n", + "91971 2 NaN 805.0 2014-09-14 02:05:41 \n", + "91972 0 NaN NaN NaT \n", + "91973 2 NaN 7290.0 2014-09-14 01:40:55 \n", + "91974 0 NaN 805.0 2014-09-14 02:54:13 \n", + "91975 0 NaN NaN NaT \n", + "\n", + " CommunityOwnedDate ParentId ClosedDate OwnerDisplayName \\\n", + "0 NaT NaN NaT None \n", + "1 NaT NaN NaT None \n", + "2 2010-07-19 19:13:28 NaN NaT None \n", + "3 NaT NaN NaT None \n", + "4 2010-07-19 19:14:43 3.0 NaT None \n", + "... ... ... ... ... \n", + "91971 NaT 115367.0 NaT None \n", + "91972 NaT NaN NaT None \n", + "91973 NaT NaN NaT None \n", + "91974 NaT 115358.0 NaT None \n", + "91975 NaT 115375.0 NaT None \n", + "\n", + " LastEditorDisplayName \n", + "0 None \n", + "1 None \n", + "2 None \n", + "3 None \n", + "4 None \n", + "... ... \n", + "91971 None \n", + "91972 None \n", + "91973 None \n", + "91974 None \n", + "91975 None \n", + "\n", + "[91976 rows x 21 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts_tab = pd.read_sql_query('SELECT * FROM stats.posts', \n", + " engine)\n", + "posts_tab" + ] }, { "cell_type": "markdown", @@ -93,10 +1142,414 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postIdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyuserIdLasActivityDateTitle...AnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts...5.0114.0NaNNaTNaTNaNNaTNoneNone
12159.02010-07-19 19:12:57228198.0<p>In many different statistical methods there...24.02012-11-12 09:21:54What is normality?...7.018.088.02010-08-07 17:56:44NaTNaNNaTNoneNone
2315.02010-07-19 19:13:28543613.0<p>What are some valuable Statistical Analysis...18.02013-05-27 14:48:36What are some valuable Statistical Analysis op......19.0436.0183.02011-02-12 05:50:032010-07-19 19:13:28NaNNaTNoneNone
341135.02010-07-19 19:13:31135224.0<p>I have two groups of data. Each with a dif...23.02010-09-08 03:00:19Assessing the significance of differences in d......5.022.0NaNNaTNaTNaNNaTNoneNone
452NaN2010-07-19 19:14:4381NaN<p>The R-project</p>\\n\\n<p><a href=\"http://www...23.02010-07-19 19:21:15None...NaN3NaN23.02010-07-19 19:21:152010-07-19 19:14:433.0NaTNoneNone
..................................................................
919711153742NaN2014-09-13 23:45:392NaN<p>This grew too long for a comment, but I thi...805.02014-09-14 02:05:41None...NaN2NaN805.02014-09-14 02:05:41NaT115367.0NaTNoneNone
919721153751NaN2014-09-13 23:46:0509.0<p>Assume a classification problem where there...49365.02014-09-14 02:09:23Detecting a consistent pattern in a dataset vi......1.00NaNNaNNaTNaTNaNNaTNoneNone
919731153761NaN2014-09-14 01:27:5415.0<p>My goal is to create a formula that can giv...55746.02014-09-14 01:40:55How to project video viewcount based on histor......0.02NaN7290.02014-09-14 01:40:55NaTNaNNaTNoneNone
919741153772NaN2014-09-14 02:03:280NaN<p>As a practical answer to the real questions...805.02014-09-14 02:54:13None...NaN0NaN805.02014-09-14 02:54:13NaT115358.0NaTNoneNone
919751153782NaN2014-09-14 02:09:230NaN<p>Decision trees are notoriously <strong>unst...7250.02014-09-14 02:09:23None...NaN0NaNNaNNaTNaT115375.0NaTNoneNone
\n", + "

91976 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " postId PostTypeId AcceptedAnswerId CreaionDate Score \\\n", + "0 1 1 15.0 2010-07-19 19:12:12 23 \n", + "1 2 1 59.0 2010-07-19 19:12:57 22 \n", + "2 3 1 5.0 2010-07-19 19:13:28 54 \n", + "3 4 1 135.0 2010-07-19 19:13:31 13 \n", + "4 5 2 NaN 2010-07-19 19:14:43 81 \n", + "... ... ... ... ... ... \n", + "91971 115374 2 NaN 2014-09-13 23:45:39 2 \n", + "91972 115375 1 NaN 2014-09-13 23:46:05 0 \n", + "91973 115376 1 NaN 2014-09-14 01:27:54 1 \n", + "91974 115377 2 NaN 2014-09-14 02:03:28 0 \n", + "91975 115378 2 NaN 2014-09-14 02:09:23 0 \n", + "\n", + " ViewCount Body userId \\\n", + "0 1278.0

How should I elicit prior distributions fro... 8.0 \n", + "1 8198.0

In many different statistical methods there... 24.0 \n", + "2 3613.0

What are some valuable Statistical Analysis... 18.0 \n", + "3 5224.0

I have two groups of data. Each with a dif... 23.0 \n", + "4 NaN

The R-project

\\n\\n

This grew too long for a comment, but I thi... 805.0 \n", + "91972 9.0

Assume a classification problem where there... 49365.0 \n", + "91973 5.0

My goal is to create a formula that can giv... 55746.0 \n", + "91974 NaN

As a practical answer to the real questions... 805.0 \n", + "91975 NaN

Decision trees are notoriously unst... 7250.0 \n", + "\n", + " LasActivityDate Title \\\n", + "0 2010-09-15 21:08:26 Eliciting priors from experts \n", + "1 2012-11-12 09:21:54 What is normality? \n", + "2 2013-05-27 14:48:36 What are some valuable Statistical Analysis op... \n", + "3 2010-09-08 03:00:19 Assessing the significance of differences in d... \n", + "4 2010-07-19 19:21:15 None \n", + "... ... ... \n", + "91971 2014-09-14 02:05:41 None \n", + "91972 2014-09-14 02:09:23 Detecting a consistent pattern in a dataset vi... \n", + "91973 2014-09-14 01:40:55 How to project video viewcount based on histor... \n", + "91974 2014-09-14 02:54:13 None \n", + "91975 2014-09-14 02:09:23 None \n", + "\n", + " ... AnswerCount CommentCount FavoriteCount LastEditorUserId \\\n", + "0 ... 5.0 1 14.0 NaN \n", + "1 ... 7.0 1 8.0 88.0 \n", + "2 ... 19.0 4 36.0 183.0 \n", + "3 ... 5.0 2 2.0 NaN \n", + "4 ... NaN 3 NaN 23.0 \n", + "... ... ... ... ... ... \n", + "91971 ... NaN 2 NaN 805.0 \n", + "91972 ... 1.0 0 NaN NaN \n", + "91973 ... 0.0 2 NaN 7290.0 \n", + "91974 ... NaN 0 NaN 805.0 \n", + "91975 ... NaN 0 NaN NaN \n", + "\n", + " LastEditDate CommunityOwnedDate ParentId ClosedDate \\\n", + "0 NaT NaT NaN NaT \n", + "1 2010-08-07 17:56:44 NaT NaN NaT \n", + "2 2011-02-12 05:50:03 2010-07-19 19:13:28 NaN NaT \n", + "3 NaT NaT NaN NaT \n", + "4 2010-07-19 19:21:15 2010-07-19 19:14:43 3.0 NaT \n", + "... ... ... ... ... \n", + "91971 2014-09-14 02:05:41 NaT 115367.0 NaT \n", + "91972 NaT NaT NaN NaT \n", + "91973 2014-09-14 01:40:55 NaT NaN NaT \n", + "91974 2014-09-14 02:54:13 NaT 115358.0 NaT \n", + "91975 NaT NaT 115375.0 NaT \n", + "\n", + " OwnerDisplayName LastEditorDisplayName \n", + "0 None None \n", + "1 None None \n", + "2 None None \n", + "3 None None \n", + "4 None None \n", + "... ... ... \n", + "91971 None None \n", + "91972 None None \n", + "91973 None None \n", + "91974 None None \n", + "91975 None None \n", + "\n", + "[91976 rows x 21 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts_tab.rename(columns={'Id':'postId', 'OwnerUserId':'userId'}, inplace=True)\n", + "posts_tab" + ] }, { "cell_type": "markdown", @@ -109,10 +1562,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "users_columns = user_tab[['userId', 'Reputation', 'Views', 'UpVotes', 'DownVotes']]\n", + "posts_columns = posts_tab[['postId', 'Score', 'userId', 'ViewCount', 'CommentCount']]" + ] }, { "cell_type": "markdown", @@ -126,8 +1582,217 @@ "cell_type": "code", "execution_count": 11, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotespostIdScoreViewCountCommentCount
0-1105007192021750NaN0
1-1105007192085760NaN0
2-1105007192085780NaN0
3-1105007192089810NaN0
4-1105007192089820NaN0
..............................
90579557341000115352016.00
905805573811000115360240.04
90581557426000115366117.00
90582557446100115370113.02
905835574610610011537615.02
\n", + "

90584 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes postId Score \\\n", + "0 -1 1 0 5007 1920 2175 0 \n", + "1 -1 1 0 5007 1920 8576 0 \n", + "2 -1 1 0 5007 1920 8578 0 \n", + "3 -1 1 0 5007 1920 8981 0 \n", + "4 -1 1 0 5007 1920 8982 0 \n", + "... ... ... ... ... ... ... ... \n", + "90579 55734 1 0 0 0 115352 0 \n", + "90580 55738 11 0 0 0 115360 2 \n", + "90581 55742 6 0 0 0 115366 1 \n", + "90582 55744 6 1 0 0 115370 1 \n", + "90583 55746 106 1 0 0 115376 1 \n", + "\n", + " ViewCount CommentCount \n", + "0 NaN 0 \n", + "1 NaN 0 \n", + "2 NaN 0 \n", + "3 NaN 0 \n", + "4 NaN 0 \n", + "... ... ... \n", + "90579 16.0 0 \n", + "90580 40.0 4 \n", + "90581 17.0 0 \n", + "90582 13.0 2 \n", + "90583 5.0 2 \n", + "\n", + "[90584 rows x 9 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_merge = pd.merge(left=users_columns, right=posts_columns, left_on='userId',\n", + " right_on='userId')\n", + "data_merge" + ] }, { "cell_type": "markdown", @@ -138,10 +1803,67 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 90584 entries, 0 to 90583\n", + "Data columns (total 9 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 userId 90584 non-null int64 \n", + " 1 Reputation 90584 non-null int64 \n", + " 2 Views 90584 non-null int64 \n", + " 3 UpVotes 90584 non-null int64 \n", + " 4 DownVotes 90584 non-null int64 \n", + " 5 postId 90584 non-null int64 \n", + " 6 Score 90584 non-null int64 \n", + " 7 ViewCount 42188 non-null float64\n", + " 8 CommentCount 90584 non-null int64 \n", + "dtypes: float64(1), int64(8)\n", + "memory usage: 6.9 MB\n" + ] + } + ], + "source": [ + "data_merge.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId 0\n", + "Reputation 0\n", + "Views 0\n", + "UpVotes 0\n", + "DownVotes 0\n", + "postId 0\n", + "Score 0\n", + "ViewCount 48396\n", + "CommentCount 0\n", + "dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nulls_cols = data_merge.isnull().sum()\n", + "nulls_cols" + ] }, { "cell_type": "markdown", @@ -153,10 +1875,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_merge[['ViewCount']] = data_merge[['ViewCount']].fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "userId 0\n", + "Reputation 0\n", + "Views 0\n", + "UpVotes 0\n", + "DownVotes 0\n", + "postId 0\n", + "Score 0\n", + "ViewCount 0\n", + "CommentCount 0\n", + "dtype: int64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_merge.isnull().sum()" + ] }, { "cell_type": "markdown", @@ -167,10 +1920,78 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 90584 entries, 0 to 90583\n", + "Data columns (total 9 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 userId 90584 non-null int64 \n", + " 1 Reputation 90584 non-null int64 \n", + " 2 Views 90584 non-null int64 \n", + " 3 UpVotes 90584 non-null int64 \n", + " 4 DownVotes 90584 non-null int64 \n", + " 5 postId 90584 non-null int64 \n", + " 6 Score 90584 non-null int64 \n", + " 7 ViewCount 90584 non-null float64\n", + " 8 CommentCount 90584 non-null int64 \n", + "dtypes: float64(1), int64(8)\n", + "memory usage: 6.9 MB\n" + ] + } + ], + "source": [ + "data_merge.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "data_merge = data_merge.astype({'ViewCount': int})" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 90584 entries, 0 to 90583\n", + "Data columns (total 9 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 userId 90584 non-null int64\n", + " 1 Reputation 90584 non-null int64\n", + " 2 Views 90584 non-null int64\n", + " 3 UpVotes 90584 non-null int64\n", + " 4 DownVotes 90584 non-null int64\n", + " 5 postId 90584 non-null int64\n", + " 6 Score 90584 non-null int64\n", + " 7 ViewCount 90584 non-null int64\n", + " 8 CommentCount 90584 non-null int64\n", + "dtypes: int64(9)\n", + "memory usage: 6.9 MB\n" + ] + } + ], + "source": [ + "data_merge.info()" + ] }, { "cell_type": "markdown", @@ -182,7 +2003,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -196,7 +2017,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.9.7" } }, "nbformat": 4,

currently at a startup in SF