From 52e169e7f387075573f7a24f004a9a16aec799ed Mon Sep 17 00:00:00 2001 From: Sanober Khoso Date: Tue, 24 Oct 2023 23:22:15 +0200 Subject: [PATCH] SanoberKhoso --- your-code/main.ipynb | 1835 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 1666 insertions(+), 169 deletions(-) diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 7900997..6976535 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -1,169 +1,1666 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 1. Import pandas library" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2. Import users table:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 3. Rename Id column to userId" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 4. Import posts table:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 5. Rename Id column to postId and OwnerUserId to userId" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 6. Define new dataframes for users and posts with the following selected columns:\n", - " **users columns**: userId, Reputation,Views,UpVotes,DownVotes\n", - " **posts columns**: postId, Score,userId,ViewCount,CommentCount" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 7. Merge both dataframes, users and posts. \n", - "You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 8. How many missing values do you have in your merged dataframe? On which columns?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 9. You will need to make something with missing values. Will you clean or filling them? Explain. \n", - "**Remember** to check the results of your code before passing to the next step" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1. Import pandas library" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Import users table:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "users_raw = pd.read_csv('users_table.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>...050071920-1NaNNaN
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0NaN
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0NaN
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\r\\n\\r\\n<p>...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0NaN
\n", + "
" + ], + "text/plain": [ + " Id Reputation CreationDate DisplayName LastAccessDate \\\n", + "0 -1 1 2010-07-19 06:55:26 Community 2010-07-19 06:55:26 \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas 2013-11-12 22:07:23 \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon 2014-08-08 06:42:58 \n", + "3 4 101 2010-07-19 19:03:27 Emmett 2014-01-02 09:31:02 \n", + "4 5 6792 2010-07-19 19:03:57 Shane 2014-08-13 00:23:47 \n", + "\n", + " WebsiteUrl Location \\\n", + "0 http://meta.stackexchange.com/ on the server farm \n", + "1 http://stackoverflow.com Corvallis, OR \n", + "2 http://stackoverflow.com New York, NY \n", + "3 http://minesweeperonline.com San Francisco, CA \n", + "4 http://www.statalgo.com New York, NY \n", + "\n", + " AboutMe Views UpVotes \\\n", + "0

Hi, I'm not really a person.

\\r\\n\\r\\n

... 0 5007 \n", + "1

Developer on the StackOverflow team. Find ... 25 3 \n", + "2

currently at a startup in SF

\\r\\n\\r\\n

... 11 0 \n", + "4

Quantitative researcher focusing on statist... 1145 662 \n", + "\n", + " DownVotes AccountId Age ProfileImageUrl \n", + "0 1920 -1 NaN NaN \n", + "1 0 2 37.0 NaN \n", + "2 0 3 35.0 NaN \n", + "3 0 1998 28.0 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 5 54503 35.0 NaN " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_raw.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. Rename Id column to userId" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\r\\n\\r\\n<p>...050071920-1NaNNaN
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0NaN
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0NaN
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\r\\n\\r\\n<p>...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0NaN
\n", + "
" + ], + "text/plain": [ + " userId Reputation CreationDate DisplayName LastAccessDate \\\n", + "0 -1 1 2010-07-19 06:55:26 Community 2010-07-19 06:55:26 \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas 2013-11-12 22:07:23 \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon 2014-08-08 06:42:58 \n", + "3 4 101 2010-07-19 19:03:27 Emmett 2014-01-02 09:31:02 \n", + "4 5 6792 2010-07-19 19:03:57 Shane 2014-08-13 00:23:47 \n", + "\n", + " WebsiteUrl Location \\\n", + "0 http://meta.stackexchange.com/ on the server farm \n", + "1 http://stackoverflow.com Corvallis, OR \n", + "2 http://stackoverflow.com New York, NY \n", + "3 http://minesweeperonline.com San Francisco, CA \n", + "4 http://www.statalgo.com New York, NY \n", + "\n", + " AboutMe Views UpVotes \\\n", + "0

Hi, I'm not really a person.

\\r\\n\\r\\n

... 0 5007 \n", + "1

Developer on the StackOverflow team. Find ... 25 3 \n", + "2

currently at a startup in SF

\\r\\n\\r\\n

... 11 0 \n", + "4

Quantitative researcher focusing on statist... 1145 662 \n", + "\n", + " DownVotes AccountId Age ProfileImageUrl \n", + "0 1920 -1 NaN NaN \n", + "1 0 2 37.0 NaN \n", + "2 0 3 35.0 NaN \n", + "3 0 1998 28.0 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 5 54503 35.0 NaN " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_raw.rename(columns={'Id':'userId'},inplace=True)\n", + "users_raw.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. Import posts table:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyOwnerUserIdLasActivityDateTitle...AnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts...5.0114.0NaNNaNNaNNaNNaNNaNNaN
12159.02010-07-19 19:12:57228198.0<p>In many different statistical methods there...24.02012-11-12 09:21:54What is normality?...7.018.088.02010-08-07 17:56:44NaNNaNNaNNaNNaN
2315.02010-07-19 19:13:28543613.0<p>What are some valuable Statistical Analysis...18.02013-05-27 14:48:36What are some valuable Statistical Analysis op......19.0436.0183.02011-02-12 05:50:032010-07-19 19:13:28NaNNaNNaNNaN
341135.02010-07-19 19:13:31135224.0<p>I have two groups of data. Each with a dif...23.02010-09-08 03:00:19Assessing the significance of differences in d......5.022.0NaNNaNNaNNaNNaNNaNNaN
452NaN2010-07-19 19:14:4381NaN<p>The R-project</p>\\n\\n<p><a href=\"http://www...23.02010-07-19 19:21:15NaN...NaN3NaN23.02010-07-19 19:21:152010-07-19 19:14:433.0NaNNaNNaN
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " Id PostTypeId AcceptedAnswerId CreaionDate Score ViewCount \\\n", + "0 1 1 15.0 2010-07-19 19:12:12 23 1278.0 \n", + "1 2 1 59.0 2010-07-19 19:12:57 22 8198.0 \n", + "2 3 1 5.0 2010-07-19 19:13:28 54 3613.0 \n", + "3 4 1 135.0 2010-07-19 19:13:31 13 5224.0 \n", + "4 5 2 NaN 2010-07-19 19:14:43 81 NaN \n", + "\n", + " Body OwnerUserId \\\n", + "0

How should I elicit prior distributions fro... 8.0 \n", + "1

In many different statistical methods there... 24.0 \n", + "2

What are some valuable Statistical Analysis... 18.0 \n", + "3

I have two groups of data. Each with a dif... 23.0 \n", + "4

The R-project

\\n\\n

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postIdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyuserIdLasActivityDateTitle...AnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts...5.0114.0NaNNaNNaNNaNNaNNaNNaN
12159.02010-07-19 19:12:57228198.0<p>In many different statistical methods there...24.02012-11-12 09:21:54What is normality?...7.018.088.02010-08-07 17:56:44NaNNaNNaNNaNNaN
2315.02010-07-19 19:13:28543613.0<p>What are some valuable Statistical Analysis...18.02013-05-27 14:48:36What are some valuable Statistical Analysis op......19.0436.0183.02011-02-12 05:50:032010-07-19 19:13:28NaNNaNNaNNaN
341135.02010-07-19 19:13:31135224.0<p>I have two groups of data. Each with a dif...23.02010-09-08 03:00:19Assessing the significance of differences in d......5.022.0NaNNaNNaNNaNNaNNaNNaN
452NaN2010-07-19 19:14:4381NaN<p>The R-project</p>\\n\\n<p><a href=\"http://www...23.02010-07-19 19:21:15NaN...NaN3NaN23.02010-07-19 19:21:152010-07-19 19:14:433.0NaNNaNNaN
\n", + "

5 rows × 21 columns

\n", + "" + ], + "text/plain": [ + " postId PostTypeId AcceptedAnswerId CreaionDate Score \\\n", + "0 1 1 15.0 2010-07-19 19:12:12 23 \n", + "1 2 1 59.0 2010-07-19 19:12:57 22 \n", + "2 3 1 5.0 2010-07-19 19:13:28 54 \n", + "3 4 1 135.0 2010-07-19 19:13:31 13 \n", + "4 5 2 NaN 2010-07-19 19:14:43 81 \n", + "\n", + " ViewCount Body userId \\\n", + "0 1278.0

How should I elicit prior distributions fro... 8.0 \n", + "1 8198.0

In many different statistical methods there... 24.0 \n", + "2 3613.0

What are some valuable Statistical Analysis... 18.0 \n", + "3 5224.0

I have two groups of data. Each with a dif... 23.0 \n", + "4 NaN

The R-project

\\n\\n

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotes
0-11050071920
121012530
2310122190
341011100
45679211456625
\n", + "" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes\n", + "0 -1 1 0 5007 1920\n", + "1 2 101 25 3 0\n", + "2 3 101 22 19 0\n", + "3 4 101 11 0 0\n", + "4 5 6792 1145 662 5" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users = users_raw[['userId','Reputation','Views','UpVotes','DownVotes']]\n", + "users.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postIdScoreuserIdViewCountCommentCount
01238.01278.01
122224.08198.01
235418.03613.04
341323.05224.02
458123.0NaN3
\n", + "
" + ], + "text/plain": [ + " postId Score userId ViewCount CommentCount\n", + "0 1 23 8.0 1278.0 1\n", + "1 2 22 24.0 8198.0 1\n", + "2 3 54 18.0 3613.0 4\n", + "3 4 13 23.0 5224.0 2\n", + "4 5 81 23.0 NaN 3" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "posts=posts_raw[['postId', 'Score','userId','ViewCount','CommentCount']]\n", + "posts.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 7. Merge both dataframes, users and posts. \n", + "You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotespostIdScoreViewCountCommentCount
0-1105007192021750NaN0
1-1105007192085760NaN0
2-1105007192085780NaN0
3-1105007192089810NaN0
4-1105007192089820NaN0
\n", + "
" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes postId Score ViewCount \\\n", + "0 -1 1 0 5007 1920 2175 0 NaN \n", + "1 -1 1 0 5007 1920 8576 0 NaN \n", + "2 -1 1 0 5007 1920 8578 0 NaN \n", + "3 -1 1 0 5007 1920 8981 0 NaN \n", + "4 -1 1 0 5007 1920 8982 0 NaN \n", + "\n", + " CommentCount \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_posts = users.merge(posts, how='inner', on='userId')\n", + "users_posts.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 8. How many missing values do you have in your merged dataframe? On which columns?" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ViewCount 23572\n", + "dtype: int64" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "null_cols = users_posts.isnull().sum()\n", + "null_cols[null_cols>0]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotespostIdScoreViewCountCommentCount
0-1105007192021750NaN0
1-1105007192085760NaN0
2-1105007192085780NaN0
3-1105007192089810NaN0
4-1105007192089820NaN0
..............................
212567921145662531685NaN3
213567921145662532234NaN1
2145679211456625323915NaN5
215567921145662533193NaN1
216567921145662534146NaN4
\n", + "

200 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes postId Score ViewCount \\\n", + "0 -1 1 0 5007 1920 2175 0 NaN \n", + "1 -1 1 0 5007 1920 8576 0 NaN \n", + "2 -1 1 0 5007 1920 8578 0 NaN \n", + "3 -1 1 0 5007 1920 8981 0 NaN \n", + "4 -1 1 0 5007 1920 8982 0 NaN \n", + ".. ... ... ... ... ... ... ... ... \n", + "212 5 6792 1145 662 5 3168 5 NaN \n", + "213 5 6792 1145 662 5 3223 4 NaN \n", + "214 5 6792 1145 662 5 3239 15 NaN \n", + "215 5 6792 1145 662 5 3319 3 NaN \n", + "216 5 6792 1145 662 5 3414 6 NaN \n", + "\n", + " CommentCount \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + ".. ... \n", + "212 3 \n", + "213 1 \n", + "214 5 \n", + "215 1 \n", + "216 4 \n", + "\n", + "[200 rows x 9 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_data = users_posts[users_posts['ViewCount'].isnull()==True]\n", + "test_data.head(200)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11032 954\n", + "919 820\n", + "686 650\n", + "930 443\n", + "4505 383\n", + " ... \n", + "7832 1\n", + "7829 1\n", + "7818 1\n", + "7792 1\n", + "55226 1\n", + "Name: userId, Length: 2618, dtype: int64" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_data['userId'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 9. You will need to make something with missing values. Will you clean or filling them? Explain. \n", + "**Remember** to check the results of your code before passing to the next step" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Considering only ViewCount is null and we do have around 2618 records with that value, we can fill the column with zero if it is null and it should be safe for statistical purposes\n" + ] + } + ], + "source": [ + "print(\"Considering only ViewCount is null and we do have around 2618 records with that value, we can fill the column with zero if it is null and it should be safe for statistical purposes\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? " + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotespostIdScoreViewCountCommentCount
0-11050071920217500.00
1-11050071920857600.00
2-11050071920857800.00
3-11050071920898100.00
4-11050071920898200.00
\n", + "
" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes postId Score ViewCount \\\n", + "0 -1 1 0 5007 1920 2175 0 0.0 \n", + "1 -1 1 0 5007 1920 8576 0 0.0 \n", + "2 -1 1 0 5007 1920 8578 0 0.0 \n", + "3 -1 1 0 5007 1920 8981 0 0.0 \n", + "4 -1 1 0 5007 1920 8982 0 0.0 \n", + "\n", + " CommentCount \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_posts['ViewCount'] = users_posts['ViewCount'].fillna(0)\n", + "users_posts.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}