From 09e53baa016cc017f3e5280ccb9c02c86998685b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Dias?= Date: Tue, 21 Feb 2023 10:36:26 +0000 Subject: [PATCH] done --- your-code/main.ipynb | 862 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 837 insertions(+), 25 deletions(-) diff --git a/your-code/main.ipynb b/your-code/main.ipynb index bad6d94..7916bc4 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -9,10 +9,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd" + ] }, { "cell_type": "markdown", @@ -23,10 +25,175 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>...050071920-1NaNNaN
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0NaN
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0NaN
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\r\\n\\r\\n<p>...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0NaN
\n", + "
" + ], + "text/plain": [ + " Id Reputation CreationDate DisplayName LastAccessDate \\\n", + "0 -1 1 2010-07-19 06:55:26 Community 2010-07-19 06:55:26 \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas 2013-11-12 22:07:23 \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon 2014-08-08 06:42:58 \n", + "3 4 101 2010-07-19 19:03:27 Emmett 2014-01-02 09:31:02 \n", + "4 5 6792 2010-07-19 19:03:57 Shane 2014-08-13 00:23:47 \n", + "\n", + " WebsiteUrl Location \\\n", + "0 http://meta.stackexchange.com/ on the server farm \n", + "1 http://stackoverflow.com Corvallis, OR \n", + "2 http://stackoverflow.com New York, NY \n", + "3 http://minesweeperonline.com San Francisco, CA \n", + "4 http://www.statalgo.com New York, NY \n", + "\n", + " AboutMe Views UpVotes \\\n", + "0

Hi, I'm not really a person.

\\r\\n\\r\\n

... 0 5007 \n", + "1

Developer on the StackOverflow team. Find ... 25 3 \n", + "2

currently at a startup in SF

\\r\\n\\r\\n

... 11 0 \n", + "4

Quantitative researcher focusing on statist... 1145 662 \n", + "\n", + " DownVotes AccountId Age ProfileImageUrl \n", + "0 1920 -1 NaN NaN \n", + "1 0 2 37.0 NaN \n", + "2 0 3 35.0 NaN \n", + "3 0 1998 28.0 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 5 54503 35.0 NaN " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_df = pd.read_csv(\"users_table.csv\")\n", + "users_df.head()" + ] }, { "cell_type": "markdown", @@ -37,10 +204,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "users_df.rename(columns={\"Id\":\"userId\"}, inplace=True)" + ] }, { "cell_type": "markdown", @@ -51,10 +220,234 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyOwnerUserIdLasActivityDateTitle...AnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts...5.0114.0NaNNaNNaNNaNNaNNaNNaN
12159.02010-07-19 19:12:57228198.0<p>In many different statistical methods there...24.02012-11-12 09:21:54What is normality?...7.018.088.02010-08-07 17:56:44NaNNaNNaNNaNNaN
2315.02010-07-19 19:13:28543613.0<p>What are some valuable Statistical Analysis...18.02013-05-27 14:48:36What are some valuable Statistical Analysis op......19.0436.0183.02011-02-12 05:50:032010-07-19 19:13:28NaNNaNNaNNaN
341135.02010-07-19 19:13:31135224.0<p>I have two groups of data. Each with a dif...23.02010-09-08 03:00:19Assessing the significance of differences in d......5.022.0NaNNaNNaNNaNNaNNaNNaN
452NaN2010-07-19 19:14:4381NaN<p>The R-project</p>\\n\\n<p><a href=\"http://www...23.02010-07-19 19:21:15NaN...NaN3NaN23.02010-07-19 19:21:152010-07-19 19:14:433.0NaNNaNNaN
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " Id PostTypeId AcceptedAnswerId CreaionDate Score ViewCount \\\n", + "0 1 1 15.0 2010-07-19 19:12:12 23 1278.0 \n", + "1 2 1 59.0 2010-07-19 19:12:57 22 8198.0 \n", + "2 3 1 5.0 2010-07-19 19:13:28 54 3613.0 \n", + "3 4 1 135.0 2010-07-19 19:13:31 13 5224.0 \n", + "4 5 2 NaN 2010-07-19 19:14:43 81 NaN \n", + "\n", + " Body OwnerUserId \\\n", + "0

How should I elicit prior distributions fro... 8.0 \n", + "1

In many different statistical methods there... 24.0 \n", + "2

What are some valuable Statistical Analysis... 18.0 \n", + "3

I have two groups of data. Each with a dif... 23.0 \n", + "4

The R-project

\\n\\n

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postIdScoreuserIdViewCountCommentCountReputationViewsUpVotesDownVotes
01238.01278.016764.01089.0604.025.0
122224.08198.01344.048.036.01.0
235418.03613.04128.08.016.00.0
341323.05224.02308.052.034.01.0
458123.0NaN3308.052.034.01.0
\n", + "" + ], + "text/plain": [ + " postId Score userId ViewCount CommentCount Reputation Views \\\n", + "0 1 23 8.0 1278.0 1 6764.0 1089.0 \n", + "1 2 22 24.0 8198.0 1 344.0 48.0 \n", + "2 3 54 18.0 3613.0 4 128.0 8.0 \n", + "3 4 13 23.0 5224.0 2 308.0 52.0 \n", + "4 5 81 23.0 NaN 3 308.0 52.0 \n", + "\n", + " UpVotes DownVotes \n", + "0 604.0 25.0 \n", + "1 36.0 1.0 \n", + "2 16.0 0.0 \n", + "3 34.0 1.0 \n", + "4 34.0 1.0 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged = pd.merge(left=posts, \n", + " right=users, \n", + " left_on='userId', \n", + " right_on='userId', \n", + " how='left')\n", + "merged.head()" + ] }, { "cell_type": "markdown", @@ -110,10 +633,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "postId 0\n", + "Score 0\n", + "userId 1038\n", + "ViewCount 24105\n", + "CommentCount 0\n", + "Reputation 1038\n", + "Views 1038\n", + "UpVotes 1038\n", + "DownVotes 1038\n", + "dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged.isnull().sum()" + ] }, { "cell_type": "markdown", @@ -125,10 +670,227 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "merged.dropna(subset=['userId'], inplace=True)\n", + "merged.dropna(subset=['ViewCount'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postIdScoreuserIdViewCountCommentCountReputationViewsUpVotesDownVotes
01238.01278.016764.01089.0604.025.0
122224.08198.01344.048.036.01.0
235418.03613.04128.08.016.00.0
341323.05224.02308.052.034.01.0
561525.029229.056792.01145.0662.05.0
..............................
399874831121887.0310.0638.031.07.00.0
3998948314418603.062.0042.012.00.00.0
3999148317819960.01894.01153.09.018.00.0
3999248318019962.052.041.01.00.00.0
3999948325-119968.0116.041.01.00.00.0
\n", + "

15390 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " postId Score userId ViewCount CommentCount Reputation Views \\\n", + "0 1 23 8.0 1278.0 1 6764.0 1089.0 \n", + "1 2 22 24.0 8198.0 1 344.0 48.0 \n", + "2 3 54 18.0 3613.0 4 128.0 8.0 \n", + "3 4 13 23.0 5224.0 2 308.0 52.0 \n", + "5 6 152 5.0 29229.0 5 6792.0 1145.0 \n", + "... ... ... ... ... ... ... ... \n", + "39987 48311 2 1887.0 310.0 6 38.0 31.0 \n", + "39989 48314 4 18603.0 62.0 0 42.0 12.0 \n", + "39991 48317 8 19960.0 1894.0 1 153.0 9.0 \n", + "39992 48318 0 19962.0 52.0 4 1.0 1.0 \n", + "39999 48325 -1 19968.0 116.0 4 1.0 1.0 \n", + "\n", + " UpVotes DownVotes \n", + "0 604.0 25.0 \n", + "1 36.0 1.0 \n", + "2 16.0 0.0 \n", + "3 34.0 1.0 \n", + "5 662.0 5.0 \n", + "... ... ... \n", + "39987 7.0 0.0 \n", + "39989 0.0 0.0 \n", + "39991 18.0 0.0 \n", + "39992 0.0 0.0 \n", + "39999 0.0 0.0 \n", + "\n", + "[15390 rows x 9 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged" + ] }, { "cell_type": "markdown", @@ -137,17 +899,62 @@ "#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? " ] }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "postId int64\n", + "Score int64\n", + "userId float64\n", + "ViewCount float64\n", + "CommentCount int64\n", + "Reputation float64\n", + "Views float64\n", + "UpVotes float64\n", + "DownVotes float64\n", + "dtype: object" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged.dtypes" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "#We're going to change the ones that are float64." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "merged[\"userId\"] = merged[\"userId\"].astype(int)\n", + "merged[\"ViewCount\"] = merged[\"ViewCount\"].astype(int)\n", + "merged[\"Reputation\"] = merged[\"Reputation\"].astype(int)\n", + "merged[\"Views\"] = merged[\"Views\"].astype(int)\n", + "merged[\"UpVotes\"] = merged[\"UpVotes\"].astype(int)\n", + "merged[\"DownVotes\"] = merged[\"DownVotes\"].astype(int)" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, @@ -161,7 +968,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.12" + }, + "vscode": { + "interpreter": { + "hash": "b2a2b8698d3474f38e9a1ba9d54ec85347e811d97918b8fc315ec0c82e60adc7" + } } }, "nbformat": 4,