From dfa539f3a1bf38d08b63cd4041d8bfbe68bbac5f Mon Sep 17 00:00:00 2001 From: Milena Perez Date: Wed, 8 Feb 2023 11:50:46 +0000 Subject: [PATCH] comment --- your-code/main.ipynb | 574 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 549 insertions(+), 25 deletions(-) diff --git a/your-code/main.ipynb b/your-code/main.ipynb index bad6d94..9ed2015 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -9,10 +9,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd" + ] }, { "cell_type": "markdown", @@ -23,10 +25,112 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>...050071920-1NaNNaN
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0NaN
\n", + "
" + ], + "text/plain": [ + " Id Reputation CreationDate DisplayName LastAccessDate \\\n", + "0 -1 1 2010-07-19 06:55:26 Community 2010-07-19 06:55:26 \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas 2013-11-12 22:07:23 \n", + "\n", + " WebsiteUrl Location \\\n", + "0 http://meta.stackexchange.com/ on the server farm \n", + "1 http://stackoverflow.com Corvallis, OR \n", + "\n", + " AboutMe Views UpVotes \\\n", + "0

Hi, I'm not really a person.

\\r\\n\\r\\n

... 0 5007 \n", + "1

Developer on the StackOverflow team. Find ... 25 3 \n", + "\n", + " DownVotes AccountId Age ProfileImageUrl \n", + "0 1920 -1 NaN NaN \n", + "1 0 2 37.0 NaN " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users= pd.read_csv(\"C:/Users/milena.xavier/Desktop/Iron/Labs/week2/lab-data-cleaning/your-code/users_table.csv\")\n", + "users.head(2)" + ] }, { "cell_type": "markdown", @@ -37,10 +141,91 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>...050071920-1NaNNaN
\n", + "
" + ], + "text/plain": [ + " userId Reputation CreationDate DisplayName LastAccessDate \\\n", + "0 -1 1 2010-07-19 06:55:26 Community 2010-07-19 06:55:26 \n", + "\n", + " WebsiteUrl Location \\\n", + "0 http://meta.stackexchange.com/ on the server farm \n", + "\n", + " AboutMe Views UpVotes \\\n", + "0

Hi, I'm not really a person.

\\r\\n\\r\\n

... 0 5007 \n", + "\n", + " DownVotes AccountId Age ProfileImageUrl \n", + "0 1920 -1 NaN NaN " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users.rename(columns = {'Id':'userId'}, inplace = True)\n", + "users.head(1)" + ] }, { "cell_type": "markdown", @@ -51,10 +236,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "posts=pd.read_csv(\"C:/Users/milena.xavier/Desktop/Iron/Labs/week2/lab-data-cleaning/your-code/posts_table.csv\")" + ] }, { "cell_type": "markdown", @@ -65,10 +252,114 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postIdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyuserIdLasActivityDateTitle...AnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts...5.0114.0NaNNaNNaNNaNNaNNaNNaN
\n", + "

1 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " postId PostTypeId AcceptedAnswerId CreaionDate Score \\\n", + "0 1 1 15.0 2010-07-19 19:12:12 23 \n", + "\n", + " ViewCount Body userId \\\n", + "0 1278.0

How should I elicit prior distributions fro... 8.0 \n", + "\n", + " LasActivityDate Title ... AnswerCount \\\n", + "0 2010-09-15 21:08:26 Eliciting priors from experts ... 5.0 \n", + "\n", + " CommentCount FavoriteCount LastEditorUserId LastEditDate \\\n", + "0 1 14.0 NaN NaN \n", + "\n", + " CommunityOwnedDate ParentId ClosedDate OwnerDisplayName \\\n", + "0 NaN NaN NaN NaN \n", + "\n", + " LastEditorDisplayName \n", + "0 NaN \n", + "\n", + "[1 rows x 21 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts.rename(columns = {'Id':'postId', 'OwnerUserId': 'userId'}, inplace = True)\n", + "posts.head(1)" + ] }, { "cell_type": "markdown", @@ -81,10 +372,135 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotespostIdScoreViewCountCommentCount
0-1105007192021750NaN0
1-1105007192085760NaN0
2-1105007192085780NaN0
3-1105007192089810NaN0
4-1105007192089820NaN0
\n", + "
" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes postId Score ViewCount \\\n", + "0 -1 1 0 5007 1920 2175 0 NaN \n", + "1 -1 1 0 5007 1920 8576 0 NaN \n", + "2 -1 1 0 5007 1920 8578 0 NaN \n", + "3 -1 1 0 5007 1920 8981 0 NaN \n", + "4 -1 1 0 5007 1920 8982 0 NaN \n", + "\n", + " CommentCount \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merge_users = users[[\"userId\", \"Reputation\", \"Views\", \"UpVotes\", \"DownVotes\"]]\n", + "merge_posts=posts[[\"postId\", \"Score\", \"userId\", \"ViewCount\", \"CommentCount\"]]\n", + "\n", + "\n", + "new_df=pd.merge(merge_users, merge_posts, on=\"userId\")\n", + "new_df.head()" + ] }, { "cell_type": "markdown", @@ -99,7 +515,9 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "new_df=pd.merge(merge_users, merge_posts, on=\"userId\")" + ] }, { "cell_type": "markdown", @@ -110,10 +528,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId 0\n", + "Reputation 0\n", + "Views 0\n", + "UpVotes 0\n", + "DownVotes 0\n", + "postId 0\n", + "Score 0\n", + "ViewCount 23572\n", + "CommentCount 0\n", + "dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df.isna().sum()" + ] }, { "cell_type": "markdown", @@ -125,10 +565,34 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "userId 0\n", + "Reputation 0\n", + "Views 0\n", + "UpVotes 0\n", + "DownVotes 0\n", + "postId 0\n", + "Score 0\n", + "ViewCount 0\n", + "CommentCount 0\n", + "dtype: int64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df['ViewCount'] = new_df['ViewCount'].fillna(0)\n", + "new_df.isna().sum()\n", + "#I replaced by zero because these posts didnt have any Views. It is not because they dont exist at all." + ] }, { "cell_type": "markdown", @@ -137,6 +601,66 @@ "#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? " ] }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "userId int64\n", + "Reputation int64\n", + "Views int64\n", + "UpVotes int64\n", + "DownVotes int64\n", + "postId int64\n", + "Score int64\n", + "ViewCount float64\n", + "CommentCount int64\n", + "dtype: object" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "userId int64\n", + "Reputation int64\n", + "Views int64\n", + "UpVotes int64\n", + "DownVotes int64\n", + "postId int64\n", + "Score int64\n", + "ViewCount int32\n", + "CommentCount int64\n", + "dtype: object" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "new_df['ViewCount'] = new_df['ViewCount'].astype(int)\n", + "new_df.dtypes\n", + "#I changed from float to int since there is no half View :)\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -147,7 +671,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -161,7 +685,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.9.13" } }, "nbformat": 4,