From d5a045ff1427ea83fb54a44b2a9b852eef0ed06c Mon Sep 17 00:00:00 2001 From: Daniel Gonzalez Date: Mon, 22 Nov 2021 14:48:02 -0600 Subject: [PATCH 1/2] entrega lab --- your-code/main.ipynb | 1225 +++++++++++++++++++++++++++++++++++------- 1 file changed, 1023 insertions(+), 202 deletions(-) diff --git a/your-code/main.ipynb b/your-code/main.ipynb index 31724c5..8857238 100644 --- a/your-code/main.ipynb +++ b/your-code/main.ipynb @@ -1,204 +1,1025 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 1. Import pandas library" - ] + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + }, + "colab": { + "name": "main.ipynb", + "provenance": [], + "collapsed_sections": [] + } }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2. Import pymysql and sqlalchemy as you have learnt in the lesson of importing/exporting data \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 3. Create a mysql engine to set the connection to the server. Check the connection details in [this link](https://relational.fit.cvut.cz/search?tableCount%5B%5D=0-10&tableCount%5B%5D=10-30&dataType%5B%5D=Numeric&databaseSize%5B%5D=KB&databaseSize%5B%5D=MB)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 4. 
Import the users table " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 5. Rename Id column to userId" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 6. Import the posts table. " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 7. Rename Id column to postId and OwnerUserId to userId" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 8. Define new dataframes for users and posts with the following selected columns:\n", - " **users columns**: userId, Reputation,Views,UpVotes,DownVotes\n", - " **posts columns**: postId, Score,userID,ViewCount,CommentCount" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 8. Merge both dataframes, users and posts. \n", - "You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 9. How many missing values do you have in your merged dataframe? On which columns?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 10. 
You will need to make something with missing values. Will you clean or filling them? Explain. \n", - "**Remember** to check the results of your code before passing to the next step" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 11. Adjust the data types in order to avoid future issues. Which ones should be changed? " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Bonus: Identify extreme values in your merged dataframe as you have learned in class, create a dataframe called outliers with the same columns as our data set and calculate the bounds. The values of the outliers dataframe will be the values of the merged_df that fall outside that bounds. You will need to save your outliers dataframe to a csv file on your-code folder." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "ibRuvGpz7xaN" + }, + "source": [ + "#### 1. 
Import pandas library" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "q9akui1F7xaW", + "outputId": "a4e32126-f214-489c-ac91-b1b000c1ffb2" + }, + "source": [ + "!pip install pymysql" + ], + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Collecting pymysql\n", + " Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)\n", + "\u001b[?25l\r\u001b[K |███████▌ | 10 kB 21.8 MB/s eta 0:00:01\r\u001b[K |███████████████ | 20 kB 23.9 MB/s eta 0:00:01\r\u001b[K |██████████████████████▍ | 30 kB 12.2 MB/s eta 0:00:01\r\u001b[K |██████████████████████████████ | 40 kB 9.4 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 43 kB 1.8 MB/s \n", + "\u001b[?25hInstalling collected packages: pymysql\n", + "Successfully installed pymysql-1.0.2\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "tdtIReKs7zQQ" + }, + "source": [ + "import pandas as pd" + ], + "execution_count": 45, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3LWluftN7xaY" + }, + "source": [ + "#### 2. Import pymysql and sqlalchemy as you have learnt in the lesson of importing/exporting data \n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "OMkTAepQ7xaY" + }, + "source": [ + "import pymysql\n", + "from sqlalchemy import create_engine" + ], + "execution_count": 46, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kzzXvfxL7xaZ" + }, + "source": [ + "#### 3. Create a mysql engine to set the connection to the server. 
Check the connection details in [this link](https://relational.fit.cvut.cz/search?tableCount%5B%5D=0-10&tableCount%5B%5D=10-30&dataType%5B%5D=Numeric&databaseSize%5B%5D=KB&databaseSize%5B%5D=MB)" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "yJzjhyAs7xaZ" + }, + "source": [ + "engine = create_engine('mysql+pymysql://guest:relational@relational.fit.cvut.cz/stats')" + ], + "execution_count": 47, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BKCX8cM-7xaZ" + }, + "source": [ + "#### 4. Import the users table " + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 348 + }, + "id": "zgO3qWJN7xaa", + "outputId": "a733d330-af41-4a3a-c7f6-d3718592e3a5" + }, + "source": [ + "data = pd.read_sql_query('SELECT * FROM stats.users',engine)\n", + "data.head()" + ], + "execution_count": 48, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\n\\n<p>I'm ...050071920-1NaNNone
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0None
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0None
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\n\\n<p>form...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0None
\n", + "
" + ], + "text/plain": [ + " Id Reputation ... Age ProfileImageUrl\n", + "0 -1 1 ... NaN None\n", + "1 2 101 ... 37.0 None\n", + "2 3 101 ... 35.0 None\n", + "3 4 101 ... 28.0 http://i.stack.imgur.com/d1oHX.jpg\n", + "4 5 6792 ... 35.0 None\n", + "\n", + "[5 rows x 14 columns]" + ] + }, + "metadata": {}, + "execution_count": 48 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iEGINkSQ7xab" + }, + "source": [ + "#### 5. Rename Id column to userId" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 348 + }, + "id": "TCBu6hVW7xab", + "outputId": "0372fb55-8429-4635-e185-644e9431ca35" + }, + "source": [ + "users = data.rename(columns={'Id': 'userId'})  # keep every row; the head() call below is display-only\n", + "users.head()" + ], + "execution_count": 49, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\n\\n<p>I'm ...050071920-1NaNNone
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0None
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0None
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\n\\n<p>form...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0None
\n", + "
" + ], + "text/plain": [ + " userId Reputation ... Age ProfileImageUrl\n", + "0 -1 1 ... NaN None\n", + "1 2 101 ... 37.0 None\n", + "2 3 101 ... 35.0 None\n", + "3 4 101 ... 28.0 http://i.stack.imgur.com/d1oHX.jpg\n", + "4 5 6792 ... 35.0 None\n", + "\n", + "[5 rows x 14 columns]" + ] + }, + "metadata": {}, + "execution_count": 49 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "koTr9ylS7xab" + }, + "source": [ + "#### 6. Import the posts table. " + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 521 + }, + "id": "dFRjmFkh7xac", + "outputId": "41765990-7ea2-4345-f482-44a4be7aa4f3" + }, + "source": [ + "posts = pd.read_sql_query('SELECT * FROM stats.posts',engine)\n", + "posts.head()" + ], + "execution_count": 50, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyOwnerUserIdLasActivityDateTitleTagsAnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts<bayesian><prior><elicitation>5.0114.0NaNNaTNaTNaNNaTNoneNone
12159.02010-07-19 19:12:57228198.0<p>In many different statistical methods there...24.02012-11-12 09:21:54What is normality?<distributions><normality>7.018.088.02010-08-07 17:56:44NaTNaNNaTNoneNone
2315.02010-07-19 19:13:28543613.0<p>What are some valuable Statistical Analysis...18.02013-05-27 14:48:36What are some valuable Statistical Analysis op...<software><open-source>19.0436.0183.02011-02-12 05:50:032010-07-19 19:13:28NaNNaTNoneNone
341135.02010-07-19 19:13:31135224.0<p>I have two groups of data. Each with a dif...23.02010-09-08 03:00:19Assessing the significance of differences in d...<distributions><statistical-significance>5.022.0NaNNaTNaTNaNNaTNoneNone
452NaN2010-07-19 19:14:4381NaN<p>The R-project</p>\\n\\n<p><a href=\"http://www...23.02010-07-19 19:21:15NoneNoneNaN3NaN23.02010-07-19 19:21:152010-07-19 19:14:433.0NaTNoneNone
\n", + "
" + ], + "text/plain": [ + " Id PostTypeId ... OwnerDisplayName LastEditorDisplayName\n", + "0 1 1 ... None None\n", + "1 2 1 ... None None\n", + "2 3 1 ... None None\n", + "3 4 1 ... None None\n", + "4 5 2 ... None None\n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "metadata": {}, + "execution_count": 50 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KdAiFpJD7xac" + }, + "source": [ + "#### 7. Rename Id column to postId and OwnerUserId to userId" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EPGQNuFB7xac" + }, + "source": [ + "posts2 = posts.rename(columns={'Id': 'postID', 'OwnerUserId': 'userId'})  # keep every row; the head() call below is display-only\n", + "posts2.head()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dc2i-tvc7xad" + }, + "source": [ + "#### 8. Define new dataframes for users and posts with the following selected columns:\n", + " **users columns**: userId, Reputation,Views,UpVotes,DownVotes\n", + " **posts columns**: postId, Score,userID,ViewCount,CommentCount" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "pp37FD9l7xad" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "U5CQ5hi2Ezla", + "outputId": "4a2a856a-3ea5-4833-dfee-9cd2a48f24b0" + }, + "source": [ + "users_filt = users[['userId','Reputation','Views','UpVotes','DownVotes']]\n", + "posts_filt = posts2[['postID','Score','userId','ViewCount','CommentCount']]\n", + "posts_filt.head()\n" + ], + "execution_count": 51, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postIDScoreuserIdViewCountCommentCount
01238.01278.01
122224.08198.01
235418.03613.04
341323.05224.02
458123.0NaN3
\n", + "
" + ], + "text/plain": [ + " postID Score userId ViewCount CommentCount\n", + "0 1 23 8.0 1278.0 1\n", + "1 2 22 24.0 8198.0 1\n", + "2 3 54 18.0 3613.0 4\n", + "3 4 13 23.0 5224.0 2\n", + "4 5 81 23.0 NaN 3" + ] + }, + "metadata": {}, + "execution_count": 51 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Jir8RBEO7xad" + }, + "source": [ + "#### 8. Merge both dataframes, users and posts. \n", + "You will need to make a [merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html) of posts and users dataframes." + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 + }, + "id": "qxEO0rH57xae", + "outputId": "8e319777-dcb2-4e3d-cc6e-03e0889843b3" + }, + "source": [ + "merged_df = pd.merge(left=users_filt, right=posts_filt, on='userId', how='inner')  # name the join key instead of relying on shared-column inference\n", + "merged_df.head(4)" + ], + "execution_count": 58, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotespostIDScoreViewCountCommentCount
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [userId, Reputation, Views, UpVotes, DownVotes, postID, Score, ViewCount, CommentCount]\n", + "Index: []" + ] + }, + "metadata": {}, + "execution_count": 58 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ELp32l5u7xae" + }, + "source": [ + "#### 9. How many missing values do you have in your merged dataframe? On which columns?" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xX0em7kc7xae" + }, + "source": [ + "merged_df.isna().sum()  # missing values per column of the merged frame, which is what this step asks about" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Ce0fLZ97JV3M" + }, + "source": [ + "merged_df.info()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "510cRIWe7xaf" + }, + "source": [ + "#### 10. You will need to make something with missing values. Will you clean or filling them? Explain. \n", + "**Remember** to check the results of your code before passing to the next step" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 49 + }, + "id": "2XSJxL5C7xaf", + "outputId": "f28212c4-fc21-48ee-cacb-2c51a14aba22" + }, + "source": [ + "merged_df[['ViewCount']] = merged_df[['ViewCount']].fillna(0)\n", + "merged_df.head()" + ], + "execution_count": 59, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotespostIDScoreViewCountCommentCount
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [userId, Reputation, Views, UpVotes, DownVotes, postID, Score, ViewCount, CommentCount]\n", + "Index: []" + ] + }, + "metadata": {}, + "execution_count": 59 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s0LJUJQW7xaf" + }, + "source": [ + "#### 11. Adjust the data types in order to avoid future issues. Which ones should be changed? " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "k6NUFICr7xaf" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s0Hwo-CO7xaf" + }, + "source": [ + "#### Bonus: Identify extreme values in your merged dataframe as you have learned in class, create a dataframe called outliers with the same columns as our data set and calculate the bounds. The values of the outliers dataframe will be the values of the merged_df that fall outside that bounds. You will need to save your outliers dataframe to a csv file on your-code folder." + ] + } + ] +} \ No newline at end of file From 3b5dfacbeb384cddd06ddb4b9d69b44c64d327d1 Mon Sep 17 00:00:00 2001 From: Daniel Gonzalez Date: Thu, 25 Nov 2021 20:17:34 -0600 Subject: [PATCH 2/2] Solucion Lab --- your-code/weather.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/your-code/weather.ipynb b/your-code/weather.ipynb index 4fc40ab..b419ee1 100644 --- a/your-code/weather.ipynb +++ b/your-code/weather.ipynb @@ -47,7 +47,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -61,7 +61,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.8.11" } }, "nbformat": 4,