diff --git a/main.ipynb b/main.ipynb new file mode 100644 index 0000000..6702a25 --- /dev/null +++ b/main.ipynb @@ -0,0 +1,843 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1. Import pandas library" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Import users table:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdReputationCreationDateDisplayNameLastAccessDateWebsiteUrlLocationAboutMeViewsUpVotesDownVotesAccountIdAgeProfileImageUrl
0-112010-07-19 06:55:26Community2010-07-19 06:55:26http://meta.stackexchange.com/on the server farm<p>Hi, I'm not really a person.</p>\\n\\n<p>I'm ...050071920-1NaNNaN
121012010-07-19 14:01:36Geoff Dalgas2013-11-12 22:07:23http://stackoverflow.comCorvallis, OR<p>Developer on the StackOverflow team. Find ...2530237.0NaN
231012010-07-19 15:34:50Jarrod Dixon2014-08-08 06:42:58http://stackoverflow.comNew York, NY<p><a href=\"http://blog.stackoverflow.com/2009...22190335.0NaN
341012010-07-19 19:03:27Emmett2014-01-02 09:31:02http://minesweeperonline.comSan Francisco, CA<p>currently at a startup in SF</p>\\n\\n<p>form...1100199828.0http://i.stack.imgur.com/d1oHX.jpg
4567922010-07-19 19:03:57Shane2014-08-13 00:23:47http://www.statalgo.comNew York, NY<p>Quantitative researcher focusing on statist...114566255450335.0NaN
\n", + "
" + ], + "text/plain": [ + " Id Reputation CreationDate DisplayName LastAccessDate \\\n", + "0 -1 1 2010-07-19 06:55:26 Community 2010-07-19 06:55:26 \n", + "1 2 101 2010-07-19 14:01:36 Geoff Dalgas 2013-11-12 22:07:23 \n", + "2 3 101 2010-07-19 15:34:50 Jarrod Dixon 2014-08-08 06:42:58 \n", + "3 4 101 2010-07-19 19:03:27 Emmett 2014-01-02 09:31:02 \n", + "4 5 6792 2010-07-19 19:03:57 Shane 2014-08-13 00:23:47 \n", + "\n", + " WebsiteUrl Location \\\n", + "0 http://meta.stackexchange.com/ on the server farm \n", + "1 http://stackoverflow.com Corvallis, OR \n", + "2 http://stackoverflow.com New York, NY \n", + "3 http://minesweeperonline.com San Francisco, CA \n", + "4 http://www.statalgo.com New York, NY \n", + "\n", + " AboutMe Views UpVotes \\\n", + "0

Hi, I'm not really a person.

\\n\\n

I'm ... 0 5007 \n", + "1

Developer on the StackOverflow team. Find ... 25 3 \n", + "2

currently at a startup in SF

\\n\\n

form... 11 0 \n", + "4

Quantitative researcher focusing on statist... 1145 662 \n", + "\n", + " DownVotes AccountId Age ProfileImageUrl \n", + "0 1920 -1 NaN NaN \n", + "1 0 2 37.0 NaN \n", + "2 0 3 35.0 NaN \n", + "3 0 1998 28.0 http://i.stack.imgur.com/d1oHX.jpg \n", + "4 5 54503 35.0 NaN " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users= pd.read_csv('users_table.csv')\n", + "users.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3. Rename Id column to userId" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "users.rename(columns={\"Id\": \"userId\"}, inplace= True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4. Import posts table:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdPostTypeIdAcceptedAnswerIdCreaionDateScoreViewCountBodyOwnerUserIdLasActivityDateTitle...AnswerCountCommentCountFavoriteCountLastEditorUserIdLastEditDateCommunityOwnedDateParentIdClosedDateOwnerDisplayNameLastEditorDisplayName
01115.02010-07-19 19:12:12231278.0<p>How should I elicit prior distributions fro...8.02010-09-15 21:08:26Eliciting priors from experts...5.0114.0NaNNaNNaNNaNNaNNaNNaN
12159.02010-07-19 19:12:57228198.0<p>In many different statistical methods there...24.02012-11-12 09:21:54What is normality?...7.018.088.02010-08-07 17:56:44NaNNaNNaNNaNNaN
2315.02010-07-19 19:13:28543613.0<p>What are some valuable Statistical Analysis...18.02013-05-27 14:48:36What are some valuable Statistical Analysis op......19.0436.0183.02011-02-12 05:50:032010-07-19 19:13:28NaNNaNNaNNaN
341135.02010-07-19 19:13:31135224.0<p>I have two groups of data. Each with a dif...23.02010-09-08 03:00:19Assessing the significance of differences in d......5.022.0NaNNaNNaNNaNNaNNaNNaN
452NaN2010-07-19 19:14:4381NaN<p>The R-project</p>\\n\\n<p><a href=\"http://www...23.02010-07-19 19:21:15NaN...NaN3NaN23.02010-07-19 19:21:152010-07-19 19:14:433.0NaNNaNNaN
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " Id PostTypeId AcceptedAnswerId CreaionDate Score ViewCount \\\n", + "0 1 1 15.0 2010-07-19 19:12:12 23 1278.0 \n", + "1 2 1 59.0 2010-07-19 19:12:57 22 8198.0 \n", + "2 3 1 5.0 2010-07-19 19:13:28 54 3613.0 \n", + "3 4 1 135.0 2010-07-19 19:13:31 13 5224.0 \n", + "4 5 2 NaN 2010-07-19 19:14:43 81 NaN \n", + "\n", + " Body OwnerUserId \\\n", + "0

How should I elicit prior distributions fro... 8.0 \n", + "1

In many different statistical methods there... 24.0 \n", + "2

What are some valuable Statistical Analysis... 18.0 \n", + "3

I have two groups of data. Each with a dif... 23.0 \n", + "4

The R-project

\\n\\n

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdReputationViewsUpVotesDownVotespostIdScoreViewCountCommentCount
0-1105007192021750NaN0
1-1105007192085760NaN0
2-1105007192085780NaN0
3-1105007192089810NaN0
4-1105007192089820NaN0
\n", + "" + ], + "text/plain": [ + " userId Reputation Views UpVotes DownVotes postId Score ViewCount \\\n", + "0 -1 1 0 5007 1920 2175 0 NaN \n", + "1 -1 1 0 5007 1920 8576 0 NaN \n", + "2 -1 1 0 5007 1920 8578 0 NaN \n", + "3 -1 1 0 5007 1920 8981 0 NaN \n", + "4 -1 1 0 5007 1920 8982 0 NaN \n", + "\n", + " CommentCount \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df = users.merge(posts, how='inner', on='userId')\n", + "merged_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 8. How many missing values do you have in your merged dataframe? On which columns?" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "userId 0\n", + "Reputation 0\n", + "Views 0\n", + "UpVotes 0\n", + "DownVotes 0\n", + "postId 0\n", + "Score 0\n", + "ViewCount 23572\n", + "CommentCount 0\n", + "dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df.isna().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ViewCount has 23572 missing values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 9. You will need to make something with missing values. Will you clean or filling them? Explain. 
\n", + "**Remember** to check the results of your code before passing to the next step" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(38962, 9)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "98.0 43\n", + "150.0 43\n", + "122.0 42\n", + "156.0 41\n", + "108.0 41\n", + " ..\n", + "2153.0 1\n", + "4410.0 1\n", + "1682.0 1\n", + "4045.0 1\n", + "2150.0 1\n", + "Name: ViewCount, Length: 3402, dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df['ViewCount'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "merged_df['ViewCount'] = merged_df['ViewCount'].fillna(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ViewCount is missing in 23572 of the 38962 rows (about 60%), so dropping those rows would discard most of the data. The missing values are filled with 0 instead." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 10. Adjust the data types in order to avoid future issues. Which ones should be changed? 
" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 38962 entries, 0 to 38961\n", + "Data columns (total 9 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 userId 38962 non-null int64 \n", + " 1 Reputation 38962 non-null int64 \n", + " 2 Views 38962 non-null int64 \n", + " 3 UpVotes 38962 non-null int64 \n", + " 4 DownVotes 38962 non-null int64 \n", + " 5 postId 38962 non-null int64 \n", + " 6 Score 38962 non-null int64 \n", + " 7 ViewCount 38962 non-null float64\n", + " 8 CommentCount 38962 non-null int64 \n", + "dtypes: float64(1), int64(8)\n", + "memory usage: 3.0 MB\n" + ] + } + ], + "source": [ + "merged_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "merged_df['ViewCount'] = merged_df['ViewCount'].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 38962 entries, 0 to 38961\n", + "Data columns (total 9 columns):\n", + " # Column Non-Null Count Dtype\n", + "--- ------ -------------- -----\n", + " 0 userId 38962 non-null int64\n", + " 1 Reputation 38962 non-null int64\n", + " 2 Views 38962 non-null int64\n", + " 3 UpVotes 38962 non-null int64\n", + " 4 DownVotes 38962 non-null int64\n", + " 5 postId 38962 non-null int64\n", + " 6 Score 38962 non-null int64\n", + " 7 ViewCount 38962 non-null int32\n", + " 8 CommentCount 38962 non-null int64\n", + "dtypes: int32(1), int64(8)\n", + "memory usage: 2.8 MB\n" + ] + } + ], + "source": [ + "merged_df.info()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/weather.ipynb b/weather.ipynb new file mode 100644 index 0000000..9bd2905 --- /dev/null +++ b/weather.ipynb @@ -0,0 +1,742 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global Historical Climatology Network Dataset\n", + "Variables are stored in both rows and columns\n", + "This dataset represents the daily weather records for a weather station (MX17004) in Mexico for five months in 2010." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idyearmonthelementd1d2d3d4d5d6...d22d23d24d25d26d27d28d29d30d31
0MX1700420101tmaxNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN27.8NaN
1MX1700420101tminNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaN14.5NaN
2MX1700420102tmaxNaN27.324.1NaNNaNNaN...NaN29.9NaNNaNNaNNaNNaNNaNNaNNaN
3MX1700420102tminNaN14.414.4NaNNaNNaN...NaN10.7NaNNaNNaNNaNNaNNaNNaNNaN
4MX1700420103tmaxNaNNaNNaNNaN32.1NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 35 columns

\n", + "
" + ], + "text/plain": [ + " id year month element d1 d2 d3 d4 d5 d6 ... d22 d23 \\\n", + "0 MX17004 2010 1 tmax NaN NaN NaN NaN NaN NaN ... NaN NaN \n", + "1 MX17004 2010 1 tmin NaN NaN NaN NaN NaN NaN ... NaN NaN \n", + "2 MX17004 2010 2 tmax NaN 27.3 24.1 NaN NaN NaN ... NaN 29.9 \n", + "3 MX17004 2010 2 tmin NaN 14.4 14.4 NaN NaN NaN ... NaN 10.7 \n", + "4 MX17004 2010 3 tmax NaN NaN NaN NaN 32.1 NaN ... NaN NaN \n", + "\n", + " d24 d25 d26 d27 d28 d29 d30 d31 \n", + "0 NaN NaN NaN NaN NaN NaN 27.8 NaN \n", + "1 NaN NaN NaN NaN NaN NaN 14.5 NaN \n", + "2 NaN NaN NaN NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[5 rows x 35 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mx_df= pd.read_csv('weather-raw.csv')\n", + "mx_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 22 entries, 0 to 21\n", + "Data columns (total 35 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 22 non-null object \n", + " 1 year 22 non-null int64 \n", + " 2 month 22 non-null int64 \n", + " 3 element 22 non-null object \n", + " 4 d1 2 non-null float64\n", + " 5 d2 4 non-null float64\n", + " 6 d3 4 non-null float64\n", + " 7 d4 2 non-null float64\n", + " 8 d5 8 non-null float64\n", + " 9 d6 2 non-null float64\n", + " 10 d7 2 non-null float64\n", + " 11 d8 2 non-null float64\n", + " 12 d9 0 non-null float64\n", + " 13 d10 2 non-null float64\n", + " 14 d11 2 non-null float64\n", + " 15 d12 0 non-null float64\n", + " 16 d13 2 non-null float64\n", + " 17 d14 4 non-null float64\n", + " 18 d15 2 non-null float64\n", + " 19 d16 2 non-null float64\n", + " 20 d17 2 non-null float64\n", + " 21 d18 0 non-null float64\n", + " 22 d19 0 non-null float64\n", + " 23 d20 0 
non-null float64\n", + " 24 d21 0 non-null float64\n", + " 25 d22 0 non-null float64\n", + " 26 d23 4 non-null float64\n", + " 27 d24 0 non-null float64\n", + " 28 d25 2 non-null float64\n", + " 29 d26 2 non-null float64\n", + " 30 d27 6 non-null float64\n", + " 31 d28 2 non-null float64\n", + " 32 d29 4 non-null float64\n", + " 33 d30 2 non-null float64\n", + " 34 d31 2 non-null float64\n", + "dtypes: float64(31), int64(2), object(2)\n", + "memory usage: 6.1+ KB\n" + ] + } + ], + "source": [ + "mx_df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(22, 35)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mx_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "year 0\n", + "month 0\n", + "element 0\n", + "d1 20\n", + "d2 18\n", + "d3 18\n", + "d4 20\n", + "d5 14\n", + "d6 20\n", + "d7 20\n", + "d8 20\n", + "d10 20\n", + "d11 20\n", + "d13 20\n", + "d14 18\n", + "d15 20\n", + "d16 20\n", + "d17 20\n", + "d23 18\n", + "d24 22\n", + "d25 20\n", + "d26 20\n", + "d27 16\n", + "d28 20\n", + "d29 18\n", + "d30 20\n", + "d31 20\n", + "dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mx_df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "mx_df.drop(['d9','d12','d18','d19','d20','d21','d22'], axis= 1, inplace= True)\n", + "#Dropping this columns as all values in the column are null." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Reshape wide -> long: every remaining day column becomes one row per (id, year, month, element).\n", + "# Days 9, 12 and 18-22 are skipped because those columns were dropped in the previous cell.\n", + "day_cols = [f'd{day}' for day in range(1, 32) if day not in (9, 12, 18, 19, 20, 21, 22)]\n", + "mx_df = mx_df.melt(id_vars=['id', 'year', 'month', 'element'], value_vars=day_cols, var_name='prev cols')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idyearmonthelementprev colsvalue
0MX1700420101tmaxd1NaN
1MX1700420101tmind1NaN
2MX1700420102tmaxd1NaN
3MX1700420102tmind1NaN
4MX1700420103tmaxd1NaN
.....................
523MX17004201010tmind31NaN
524MX17004201011tmaxd31NaN
525MX17004201011tmind31NaN
526MX17004201012tmaxd31NaN
527MX17004201012tmind31NaN
\n", + "

528 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id year month element prev cols value\n", + "0 MX17004 2010 1 tmax d1 NaN\n", + "1 MX17004 2010 1 tmin d1 NaN\n", + "2 MX17004 2010 2 tmax d1 NaN\n", + "3 MX17004 2010 2 tmin d1 NaN\n", + "4 MX17004 2010 3 tmax d1 NaN\n", + ".. ... ... ... ... ... ...\n", + "523 MX17004 2010 10 tmin d31 NaN\n", + "524 MX17004 2010 11 tmax d31 NaN\n", + "525 MX17004 2010 11 tmin d31 NaN\n", + "526 MX17004 2010 12 tmax d31 NaN\n", + "527 MX17004 2010 12 tmin d31 NaN\n", + "\n", + "[528 rows x 6 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mx_df" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "mx_df['value'].fillna(np.nan, inplace= True)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idyearmonthelementprev colsvalue
20MX17004201012tmaxd129.9
21MX17004201012tmind113.8
24MX1700420102tmaxd227.3
25MX1700420102tmind214.4
40MX17004201011tmaxd231.3
.....................
477MX1700420108tmind2915.3
484MX1700420101tmaxd3027.8
485MX1700420101tmind3014.5
520MX1700420108tmaxd3125.4
521MX1700420108tmind3115.4
\n", + "

66 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " id year month element prev cols value\n", + "20 MX17004 2010 12 tmax d1 29.9\n", + "21 MX17004 2010 12 tmin d1 13.8\n", + "24 MX17004 2010 2 tmax d2 27.3\n", + "25 MX17004 2010 2 tmin d2 14.4\n", + "40 MX17004 2010 11 tmax d2 31.3\n", + ".. ... ... ... ... ... ...\n", + "477 MX17004 2010 8 tmin d29 15.3\n", + "484 MX17004 2010 1 tmax d30 27.8\n", + "485 MX17004 2010 1 tmin d30 14.5\n", + "520 MX17004 2010 8 tmax d31 25.4\n", + "521 MX17004 2010 8 tmin d31 15.4\n", + "\n", + "[66 rows x 6 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Drop the rows whose value is NaN; dropna returns a new frame, so keep the result.\n", + "mx_df = mx_df.dropna(subset=['value'])\n", + "mx_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mx_df.to_csv('weather-clean.csv', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/your-code/data_cleaning.ipynb b/your-code/data_cleaning.ipynb new file mode 100644 index 0000000..3cc6d26 --- /dev/null +++ b/your-code/data_cleaning.ipynb @@ -0,0 +1,3351 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankCountry ordependent territoryAreaPopulationDensity
RankCountry ordependent territorykm2sq. mi.Populationper km2per sq.mi.
01Macau (China)30.5012.006508342133955268
12Monaco2.020.78375501858948145
23Singapore719.90278.005612300779620192
34Hong Kong (China)1106.30427.007409800669817348
45Gibraltar (UK)[12]6.802.6033140487412624
56Bahrain757.00292.00145120019174965
67Vatican City0.440.1780018184709
78Malta315.00122.0047570115103911
89Maldives298.00115.0037811412693287
910Bermuda (UK)52.0020.006377912273178
\n", + "
" + ], + "text/plain": [ + " Rank Country ordependent territory Area Population Density \\\n", + " Rank Country ordependent territory km2 sq. mi. Population per km2 \n", + "0 1 Macau (China) 30.50 12.00 650834 21339 \n", + "1 2 Monaco 2.02 0.78 37550 18589 \n", + "2 3 Singapore 719.90 278.00 5612300 7796 \n", + "3 4 Hong Kong (China) 1106.30 427.00 7409800 6698 \n", + "4 5 Gibraltar (UK)[12] 6.80 2.60 33140 4874 \n", + "5 6 Bahrain 757.00 292.00 1451200 1917 \n", + "6 7 Vatican City 0.44 0.17 800 1818 \n", + "7 8 Malta 315.00 122.00 475701 1510 \n", + "8 9 Maldives 298.00 115.00 378114 1269 \n", + "9 10 Bermuda (UK) 52.00 20.00 63779 1227 \n", + "\n", + " \n", + " per sq.mi. \n", + "0 55268 \n", + "1 48145 \n", + "2 20192 \n", + "3 17348 \n", + "4 12624 \n", + "5 4965 \n", + "6 4709 \n", + "7 3911 \n", + "8 3287 \n", + "9 3178 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x = pd.read_html('https://en.wikipedia.org/wiki/Population_density')[0]\n", + "x\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Rank', 'Country ordependent territory', 'km2', 'sq. mi.', 'Population',\n", + " 'per km2', 'per sq.mi.'],\n", + " dtype='object')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# droplevel() returns a new Index; assign it back so the two-level header is actually flattened.\n", + "x.columns = x.columns.droplevel()\n", + "x.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RankCountry ordependent territoryAreaPopulationDensity
RankCountry ordependent territorykm2sq. mi.Populationper km2per sq.mi.
01Macau (China)30.5012.006508342133955268
12Monaco2.020.78375501858948145
23Singapore719.90278.005612300779620192
34Hong Kong (China)1106.30427.007409800669817348
45Gibraltar (UK)[12]6.802.6033140487412624
56Bahrain757.00292.00145120019174965
67Vatican City0.440.1780018184709
78Malta315.00122.0047570115103911
89Maldives298.00115.0037811412693287
910Bermuda (UK)52.0020.006377912273178
\n", + "
" + ], + "text/plain": [ + " Rank Country ordependent territory Area Population Density \\\n", + " Rank Country ordependent territory km2 sq. mi. Population per km2 \n", + "0 1 Macau (China) 30.50 12.00 650834 21339 \n", + "1 2 Monaco 2.02 0.78 37550 18589 \n", + "2 3 Singapore 719.90 278.00 5612300 7796 \n", + "3 4 Hong Kong (China) 1106.30 427.00 7409800 6698 \n", + "4 5 Gibraltar (UK)[12] 6.80 2.60 33140 4874 \n", + "5 6 Bahrain 757.00 292.00 1451200 1917 \n", + "6 7 Vatican City 0.44 0.17 800 1818 \n", + "7 8 Malta 315.00 122.00 475701 1510 \n", + "8 9 Maldives 298.00 115.00 378114 1269 \n", + "9 10 Bermuda (UK) 52.00 20.00 63779 1227 \n", + "\n", + " \n", + " per sq.mi. \n", + "0 55268 \n", + "1 48145 \n", + "2 20192 \n", + "3 17348 \n", + "4 12624 \n", + "5 4965 \n", + "6 4709 \n", + "7 3911 \n", + "8 3287 \n", + "9 3178 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
MakeModelYearEngine DisplacementCylindersTransmissionDrivetrainVehicle ClassFuel TypeFuel Barrels/YearCity MPGHighway MPGCombined MPGCO2 Emission Grams/MileFuel Cost/Year
0AM GeneralDJ Po Vehicle 2WD19842.54.0Automatic 3-spd2-Wheel DriveSpecial Purpose Vehicle 2WDRegular19.388824181717522.7647061950
1AM GeneralFJ8c Post Office19844.26.0Automatic 3-spd2-Wheel DriveSpecial Purpose Vehicle 2WDRegular25.354615131313683.6153852550
2AM GeneralPost Office DJ5 2WD19852.54.0Automatic 3-spdRear-Wheel DriveSpecial Purpose Vehicle 2WDRegular20.600625161716555.4375002100
3AM GeneralPost Office DJ8 2WD19854.26.0Automatic 3-spdRear-Wheel DriveSpecial Purpose Vehicle 2WDRegular25.354615131313683.6153852550
4ASC IncorporatedGNX19873.86.0Automatic 4-spdRear-Wheel DriveMidsize CarsPremium20.600625142116555.4375002550
................................................
35947smartfortwo coupe20131.03.0Auto(AM5)Rear-Wheel DriveTwo SeatersPremium9.155833343836244.0000001100
35948smartfortwo coupe20141.03.0Auto(AM5)Rear-Wheel DriveTwo SeatersPremium9.155833343836243.0000001100
35949smartfortwo coupe20151.03.0Auto(AM5)Rear-Wheel DriveTwo SeatersPremium9.155833343836244.0000001100
35950smartfortwo coupe20160.93.0Auto(AM6)Rear-Wheel DriveTwo SeatersPremium9.155833343936246.0000001100
35951smartfortwo coupe20160.93.0Manual 5-spdRear-Wheel DriveTwo SeatersPremium9.417429323935255.0000001150
\n", + "

35952 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " Make Model Year Engine Displacement \\\n", + "0 AM General DJ Po Vehicle 2WD 1984 2.5 \n", + "1 AM General FJ8c Post Office 1984 4.2 \n", + "2 AM General Post Office DJ5 2WD 1985 2.5 \n", + "3 AM General Post Office DJ8 2WD 1985 4.2 \n", + "4 ASC Incorporated GNX 1987 3.8 \n", + "... ... ... ... ... \n", + "35947 smart fortwo coupe 2013 1.0 \n", + "35948 smart fortwo coupe 2014 1.0 \n", + "35949 smart fortwo coupe 2015 1.0 \n", + "35950 smart fortwo coupe 2016 0.9 \n", + "35951 smart fortwo coupe 2016 0.9 \n", + "\n", + " Cylinders Transmission Drivetrain \\\n", + "0 4.0 Automatic 3-spd 2-Wheel Drive \n", + "1 6.0 Automatic 3-spd 2-Wheel Drive \n", + "2 4.0 Automatic 3-spd Rear-Wheel Drive \n", + "3 6.0 Automatic 3-spd Rear-Wheel Drive \n", + "4 6.0 Automatic 4-spd Rear-Wheel Drive \n", + "... ... ... ... \n", + "35947 3.0 Auto(AM5) Rear-Wheel Drive \n", + "35948 3.0 Auto(AM5) Rear-Wheel Drive \n", + "35949 3.0 Auto(AM5) Rear-Wheel Drive \n", + "35950 3.0 Auto(AM6) Rear-Wheel Drive \n", + "35951 3.0 Manual 5-spd Rear-Wheel Drive \n", + "\n", + " Vehicle Class Fuel Type Fuel Barrels/Year City MPG \\\n", + "0 Special Purpose Vehicle 2WD Regular 19.388824 18 \n", + "1 Special Purpose Vehicle 2WD Regular 25.354615 13 \n", + "2 Special Purpose Vehicle 2WD Regular 20.600625 16 \n", + "3 Special Purpose Vehicle 2WD Regular 25.354615 13 \n", + "4 Midsize Cars Premium 20.600625 14 \n", + "... ... ... ... ... \n", + "35947 Two Seaters Premium 9.155833 34 \n", + "35948 Two Seaters Premium 9.155833 34 \n", + "35949 Two Seaters Premium 9.155833 34 \n", + "35950 Two Seaters Premium 9.155833 34 \n", + "35951 Two Seaters Premium 9.417429 32 \n", + "\n", + " Highway MPG Combined MPG CO2 Emission Grams/Mile Fuel Cost/Year \n", + "0 17 17 522.764706 1950 \n", + "1 13 13 683.615385 2550 \n", + "2 17 16 555.437500 2100 \n", + "3 13 13 683.615385 2550 \n", + "4 21 16 555.437500 2550 \n", + "... ... ... ... ... 
\n", + "35947 38 36 244.000000 1100 \n", + "35948 38 36 243.000000 1100 \n", + "35949 38 36 244.000000 1100 \n", + "35950 39 36 246.000000 1100 \n", + "35951 39 35 255.000000 1150 \n", + "\n", + "[35952 rows x 15 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_csv('data_set/vehicles/vehicles_tab.txt', sep='\\t')\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
MakeModelYearEngine DisplacementCylindersTransmissionDrivetrainVehicle ClassFuel TypeFuel Barrels/YearCity MPGHighway MPGCombined MPGCO2 Emission Grams/MileFuel Cost/Year
0AM GeneralDJ Po Vehicle 2WD19842.54.0Automatic 3-spd2-Wheel DriveSpecial Purpose Vehicle 2WDRegular19.388824181717522.7647061950
1AM GeneralFJ8c Post Office19844.26.0Automatic 3-spd2-Wheel DriveSpecial Purpose Vehicle 2WDRegular25.354615131313683.6153852550
2AM GeneralPost Office DJ5 2WD19852.54.0Automatic 3-spdRear-Wheel DriveSpecial Purpose Vehicle 2WDRegular20.600625161716555.4375002100
3AM GeneralPost Office DJ8 2WD19854.26.0Automatic 3-spdRear-Wheel DriveSpecial Purpose Vehicle 2WDRegular25.354615131313683.6153852550
4ASC IncorporatedGNX19873.86.0Automatic 4-spdRear-Wheel DriveMidsize CarsPremium20.600625142116555.4375002550
................................................
35947smartfortwo coupe20131.03.0Auto(AM5)Rear-Wheel DriveTwo SeatersPremium9.155833343836244.0000001100
35948smartfortwo coupe20141.03.0Auto(AM5)Rear-Wheel DriveTwo SeatersPremium9.155833343836243.0000001100
35949smartfortwo coupe20151.03.0Auto(AM5)Rear-Wheel DriveTwo SeatersPremium9.155833343836244.0000001100
35950smartfortwo coupe20160.93.0Auto(AM6)Rear-Wheel DriveTwo SeatersPremium9.155833343936246.0000001100
35951smartfortwo coupe20160.93.0Manual 5-spdRear-Wheel DriveTwo SeatersPremium9.417429323935255.0000001150
\n", + "

35952 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " Make Model Year Engine Displacement \\\n", + "0 AM General DJ Po Vehicle 2WD 1984 2.5 \n", + "1 AM General FJ8c Post Office 1984 4.2 \n", + "2 AM General Post Office DJ5 2WD 1985 2.5 \n", + "3 AM General Post Office DJ8 2WD 1985 4.2 \n", + "4 ASC Incorporated GNX 1987 3.8 \n", + "... ... ... ... ... \n", + "35947 smart fortwo coupe 2013 1.0 \n", + "35948 smart fortwo coupe 2014 1.0 \n", + "35949 smart fortwo coupe 2015 1.0 \n", + "35950 smart fortwo coupe 2016 0.9 \n", + "35951 smart fortwo coupe 2016 0.9 \n", + "\n", + " Cylinders Transmission Drivetrain \\\n", + "0 4.0 Automatic 3-spd 2-Wheel Drive \n", + "1 6.0 Automatic 3-spd 2-Wheel Drive \n", + "2 4.0 Automatic 3-spd Rear-Wheel Drive \n", + "3 6.0 Automatic 3-spd Rear-Wheel Drive \n", + "4 6.0 Automatic 4-spd Rear-Wheel Drive \n", + "... ... ... ... \n", + "35947 3.0 Auto(AM5) Rear-Wheel Drive \n", + "35948 3.0 Auto(AM5) Rear-Wheel Drive \n", + "35949 3.0 Auto(AM5) Rear-Wheel Drive \n", + "35950 3.0 Auto(AM6) Rear-Wheel Drive \n", + "35951 3.0 Manual 5-spd Rear-Wheel Drive \n", + "\n", + " Vehicle Class Fuel Type Fuel Barrels/Year City MPG \\\n", + "0 Special Purpose Vehicle 2WD Regular 19.388824 18 \n", + "1 Special Purpose Vehicle 2WD Regular 25.354615 13 \n", + "2 Special Purpose Vehicle 2WD Regular 20.600625 16 \n", + "3 Special Purpose Vehicle 2WD Regular 25.354615 13 \n", + "4 Midsize Cars Premium 20.600625 14 \n", + "... ... ... ... ... \n", + "35947 Two Seaters Premium 9.155833 34 \n", + "35948 Two Seaters Premium 9.155833 34 \n", + "35949 Two Seaters Premium 9.155833 34 \n", + "35950 Two Seaters Premium 9.155833 34 \n", + "35951 Two Seaters Premium 9.417429 32 \n", + "\n", + " Highway MPG Combined MPG CO2 Emission Grams/Mile Fuel Cost/Year \n", + "0 17 17 522.764706 1950 \n", + "1 13 13 683.615385 2550 \n", + "2 17 16 555.437500 2100 \n", + "3 13 13 683.615385 2550 \n", + "4 21 16 555.437500 2550 \n", + "... ... ... ... ... 
\n", + "35947 38 36 244.000000 1100 \n", + "35948 38 36 243.000000 1100 \n", + "35949 38 36 244.000000 1100 \n", + "35950 39 36 246.000000 1100 \n", + "35951 39 35 255.000000 1150 \n", + "\n", + "[35952 rows x 15 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_csv('data_set/vehicles/vehicles_pipe.txt', sep='|')\n", + "data " + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
MakeModelYearEngine DisplacementCylindersTransmissionDrivetrainVehicle ClassFuel TypeFuel Barrels/YearCity MPGHighway MPGCombined MPGCO2 Emission Grams/MileFuel Cost/Year
0AM GeneralDJ Po Vehicle 2WD19842.54Automatic 3-spd2-Wheel DriveSpecial Purpose Vehicle 2WDRegular19.388824181717522.7647061950
1AM GeneralFJ8c Post Office19844.26Automatic 3-spd2-Wheel DriveSpecial Purpose Vehicle 2WDRegular25.354615131313683.6153852550
2AM GeneralPost Office DJ5 2WD19852.54Automatic 3-spdRear-Wheel DriveSpecial Purpose Vehicle 2WDRegular20.600625161716555.4375002100
3AM GeneralPost Office DJ8 2WD19854.26Automatic 3-spdRear-Wheel DriveSpecial Purpose Vehicle 2WDRegular25.354615131313683.6153852550
4ASC IncorporatedGNX19873.86Automatic 4-spdRear-Wheel DriveMidsize CarsPremium20.600625142116555.4375002550
................................................
35947smartfortwo coupe20131.03Auto(AM5)Rear-Wheel DriveTwo SeatersPremium9.155833343836244.0000001100
35948smartfortwo coupe20141.03Auto(AM5)Rear-Wheel DriveTwo SeatersPremium9.155833343836243.0000001100
35949smartfortwo coupe20151.03Auto(AM5)Rear-Wheel DriveTwo SeatersPremium9.155833343836244.0000001100
35950smartfortwo coupe20160.93Auto(AM6)Rear-Wheel DriveTwo SeatersPremium9.155833343936246.0000001100
35951smartfortwo coupe20160.93Manual 5-spdRear-Wheel DriveTwo SeatersPremium9.417429323935255.0000001150
\n", + "

35952 rows × 15 columns

\n", + "
" + ], + "text/plain": [ + " Make Model Year Engine Displacement \\\n", + "0 AM General DJ Po Vehicle 2WD 1984 2.5 \n", + "1 AM General FJ8c Post Office 1984 4.2 \n", + "2 AM General Post Office DJ5 2WD 1985 2.5 \n", + "3 AM General Post Office DJ8 2WD 1985 4.2 \n", + "4 ASC Incorporated GNX 1987 3.8 \n", + "... ... ... ... ... \n", + "35947 smart fortwo coupe 2013 1.0 \n", + "35948 smart fortwo coupe 2014 1.0 \n", + "35949 smart fortwo coupe 2015 1.0 \n", + "35950 smart fortwo coupe 2016 0.9 \n", + "35951 smart fortwo coupe 2016 0.9 \n", + "\n", + " Cylinders Transmission Drivetrain \\\n", + "0 4 Automatic 3-spd 2-Wheel Drive \n", + "1 6 Automatic 3-spd 2-Wheel Drive \n", + "2 4 Automatic 3-spd Rear-Wheel Drive \n", + "3 6 Automatic 3-spd Rear-Wheel Drive \n", + "4 6 Automatic 4-spd Rear-Wheel Drive \n", + "... ... ... ... \n", + "35947 3 Auto(AM5) Rear-Wheel Drive \n", + "35948 3 Auto(AM5) Rear-Wheel Drive \n", + "35949 3 Auto(AM5) Rear-Wheel Drive \n", + "35950 3 Auto(AM6) Rear-Wheel Drive \n", + "35951 3 Manual 5-spd Rear-Wheel Drive \n", + "\n", + " Vehicle Class Fuel Type Fuel Barrels/Year City MPG \\\n", + "0 Special Purpose Vehicle 2WD Regular 19.388824 18 \n", + "1 Special Purpose Vehicle 2WD Regular 25.354615 13 \n", + "2 Special Purpose Vehicle 2WD Regular 20.600625 16 \n", + "3 Special Purpose Vehicle 2WD Regular 25.354615 13 \n", + "4 Midsize Cars Premium 20.600625 14 \n", + "... ... ... ... ... \n", + "35947 Two Seaters Premium 9.155833 34 \n", + "35948 Two Seaters Premium 9.155833 34 \n", + "35949 Two Seaters Premium 9.155833 34 \n", + "35950 Two Seaters Premium 9.155833 34 \n", + "35951 Two Seaters Premium 9.417429 32 \n", + "\n", + " Highway MPG Combined MPG CO2 Emission Grams/Mile Fuel Cost/Year \n", + "0 17 17 522.764706 1950 \n", + "1 13 13 683.615385 2550 \n", + "2 17 16 555.437500 2100 \n", + "3 13 13 683.615385 2550 \n", + "4 21 16 555.437500 2550 \n", + "... ... ... ... ... 
\n", + "35947 38 36 244.000000 1100 \n", + "35948 38 36 243.000000 1100 \n", + "35949 38 36 244.000000 1100 \n", + "35950 39 36 246.000000 1100 \n", + "35951 39 35 255.000000 1150 \n", + "\n", + "[35952 rows x 15 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_json('data_set/vehicles/vehicles.json')  # relative to the notebook, like the other vehicle loads\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/chuksfolder/opt/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3444: DtypeWarning: Columns (70,71,72,73,74,76,79) have mixed types.Specify dtype option on import or set low_memory=False.\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
barrels08barrelsA08charge120charge240city08city08UcityA08cityA08UcityCDcityE...mfrCodec240Dscrcharge240bc240bDscrcreatedOnmodifiedOnstartStopphevCityphevHwyphevComb
015.6957140.00.00.0190.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
129.9645450.00.00.090.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
212.2077780.00.00.0230.000.00.00.0...NaNNaN0.0NaNTue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013NaN000
\n", + "

3 rows × 83 columns

\n", + "
" + ], + "text/plain": [ + " barrels08 barrelsA08 charge120 charge240 city08 city08U cityA08 \\\n", + "0 15.695714 0.0 0.0 0.0 19 0.0 0 \n", + "1 29.964545 0.0 0.0 0.0 9 0.0 0 \n", + "2 12.207778 0.0 0.0 0.0 23 0.0 0 \n", + "\n", + " cityA08U cityCD cityE ... mfrCode c240Dscr charge240b c240bDscr \\\n", + "0 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n", + "1 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n", + "2 0.0 0.0 0.0 ... NaN NaN 0.0 NaN \n", + "\n", + " createdOn modifiedOn startStop \\\n", + "0 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n", + "1 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n", + "2 Tue Jan 01 00:00:00 EST 2013 Tue Jan 01 00:00:00 EST 2013 NaN \n", + "\n", + " phevCity phevHwy phevComb \n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "\n", + "[3 rows x 83 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = pd.read_csv('data_set/vehicles/vehicles_messy.csv', low_memory=False)  # parse in one pass so mixed-type columns do not raise DtypeWarning\n", + "ds.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(37843, 83)" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + "
barrels08barrelsA08charge120charge240city08city08UcityA08cityA08UcityCDcityE...UCityUCityAUHighwayUHighwayAyearyouSaveSpendcharge240bphevCityphevHwyphevComb
count37843.00000037843.00000037843.037843.00000037843.00000037843.00000037843.00000037843.00000037843.00000037843.000000...37843.00000037843.00000037843.00000037843.00000037843.00000037843.00000037843.00000037843.00000037843.00000037843.000000
mean17.5325060.2161690.00.02353117.9413894.0427370.5201490.3271630.0004060.184790...22.5872290.65238033.6192210.9338452000.064398-2658.9990220.0043600.0693130.0682030.068573
std4.5759501.1415270.00.4276476.6603609.6458203.8378743.5425960.0399182.904558...9.3501635.28454710.0483266.05945610.3905882553.0983290.1427761.9668061.8719861.913647
min0.0600000.0000000.00.0000006.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000001984.000000-22250.0000000.0000000.0000000.0000000.000000
25%14.3308700.0000000.00.00000015.0000000.0000000.0000000.0000000.0000000.000000...18.0000000.00000027.1000000.0000001990.000000-4250.0000000.0000000.0000000.0000000.000000
50%17.3478950.0000000.00.00000017.0000000.0000000.0000000.0000000.0000000.000000...21.0000000.00000033.0000000.0000002001.000000-2500.0000000.0000000.0000000.0000000.000000
75%20.6006250.0000000.00.00000020.0000000.0000000.0000000.0000000.0000000.000000...25.1393000.00000038.1096000.0000002009.000000-750.0000000.0000000.0000000.0000000.000000
max47.08714318.3116670.012.000000138.000000138.304000127.000000127.0930005.350000122.000000...197.577100181.560900159.100000152.1878002017.0000004000.0000007.00000097.00000079.00000088.000000
\n", + "

8 rows × 59 columns

\n", + "
" + ], + "text/plain": [ + " barrels08 barrelsA08 charge120 charge240 city08 \\\n", + "count 37843.000000 37843.000000 37843.0 37843.000000 37843.000000 \n", + "mean 17.532506 0.216169 0.0 0.023531 17.941389 \n", + "std 4.575950 1.141527 0.0 0.427647 6.660360 \n", + "min 0.060000 0.000000 0.0 0.000000 6.000000 \n", + "25% 14.330870 0.000000 0.0 0.000000 15.000000 \n", + "50% 17.347895 0.000000 0.0 0.000000 17.000000 \n", + "75% 20.600625 0.000000 0.0 0.000000 20.000000 \n", + "max 47.087143 18.311667 0.0 12.000000 138.000000 \n", + "\n", + " city08U cityA08 cityA08U cityCD cityE \\\n", + "count 37843.000000 37843.000000 37843.000000 37843.000000 37843.000000 \n", + "mean 4.042737 0.520149 0.327163 0.000406 0.184790 \n", + "std 9.645820 3.837874 3.542596 0.039918 2.904558 \n", + "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "50% 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "75% 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "max 138.304000 127.000000 127.093000 5.350000 122.000000 \n", + "\n", + " ... UCity UCityA UHighway UHighwayA \\\n", + "count ... 37843.000000 37843.000000 37843.000000 37843.000000 \n", + "mean ... 22.587229 0.652380 33.619221 0.933845 \n", + "std ... 9.350163 5.284547 10.048326 6.059456 \n", + "min ... 0.000000 0.000000 0.000000 0.000000 \n", + "25% ... 18.000000 0.000000 27.100000 0.000000 \n", + "50% ... 21.000000 0.000000 33.000000 0.000000 \n", + "75% ... 25.139300 0.000000 38.109600 0.000000 \n", + "max ... 
197.577100 181.560900 159.100000 152.187800 \n", + "\n", + " year youSaveSpend charge240b phevCity phevHwy \\\n", + "count 37843.000000 37843.000000 37843.000000 37843.000000 37843.000000 \n", + "mean 2000.064398 -2658.999022 0.004360 0.069313 0.068203 \n", + "std 10.390588 2553.098329 0.142776 1.966806 1.871986 \n", + "min 1984.000000 -22250.000000 0.000000 0.000000 0.000000 \n", + "25% 1990.000000 -4250.000000 0.000000 0.000000 0.000000 \n", + "50% 2001.000000 -2500.000000 0.000000 0.000000 0.000000 \n", + "75% 2009.000000 -750.000000 0.000000 0.000000 0.000000 \n", + "max 2017.000000 4000.000000 7.000000 97.000000 79.000000 \n", + "\n", + " phevComb \n", + "count 37843.000000 \n", + "mean 0.068573 \n", + "std 1.913647 \n", + "min 0.000000 \n", + "25% 0.000000 \n", + "50% 0.000000 \n", + "75% 0.000000 \n", + "max 88.000000 \n", + "\n", + "[8 rows x 59 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds.describe() # exploring the data " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "barrels08 0\n", + "barrelsA08 0\n", + "charge120 0\n", + "charge240 0\n", + "city08 0\n", + " ... 
\n", + "modifiedOn 0\n", + "startStop 31705\n", + "phevCity 0\n", + "phevHwy 0\n", + "phevComb 0\n", + "Length: 83, dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# isnull() flags the missing (NaN) values; sum() counts them per column\n", + "ds.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "cylinders 123\n", + "displ 120\n", + "drive 1189\n", + "eng_dscr 15403\n", + "trany 11\n", + "guzzler 35562\n", + "trans_dscr 22796\n", + "tCharger 32657\n", + "sCharger 37177\n", + "atvType 34771\n", + "fuelType2 36435\n", + "rangeA 36440\n", + "evMotor 37281\n", + "mfrCode 30818\n", + "c240Dscr 37806\n", + "c240bDscr 37807\n", + "startStop 31705\n", + "dtype: int64" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Missing-value count per column, showing only columns with at least one NaN\n", + "ds_null = ds.isnull().sum()\n", + "ds_null[ds_null>0]  # these columns are the candidates to drop" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "barrels08 0\n", + "barrelsA08 36435\n", + "charge120 37843\n", + "charge240 37691\n", + "city08 0\n", + " ... 
\n", + "modifiedOn 0\n", + "startStop 0\n", + "phevCity 37788\n", + "phevHwy 37788\n", + "phevComb 37788\n", + "Length: 83, dtype: int64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# count the zero values in every column\n", + "ds_zeros = (ds == 0).sum()  # summing the boolean mask avoids building a masked copy of the frame\n", + "ds_zeros" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "barrelsA08 36435\n", + "charge120 37843\n", + "charge240 37691\n", + "city08U 30544\n", + "cityA08 36435\n", + "cityA08U 37103\n", + "cityCD 37831\n", + "cityE 37668\n", + "cityUF 37788\n", + "co2 120\n", + "co2A 15\n", + "co2TailpipeAGpm 36490\n", + "co2TailpipeGpm 120\n", + "comb08U 30544\n", + "combA08 36435\n", + "combA08U 37103\n", + "combE 37668\n", + "combinedCD 37828\n", + "combinedUF 37788\n", + "displ 2\n", + "engId 12600\n", + "fuelCostA08 36471\n", + "ghgScoreA 9\n", + "highway08U 30544\n", + "highwayA08 36435\n", + "highwayA08U 37103\n", + "highwayCD 37833\n", + "highwayE 37668\n", + "highwayUF 37788\n", + "hlv 33228\n", + "hpv 33229\n", + "lv2 31593\n", + "lv4 24449\n", + "phevBlended 37807\n", + "pv2 31604\n", + "pv4 24449\n", + "range 37723\n", + "rangeCity 37750\n", + "rangeCityA 37788\n", + "rangeHwy 37750\n", + "rangeHwyA 37788\n", + "UCity 25\n", + "UCityA 36481\n", + "UHighway 25\n", + "UHighwayA 36481\n", + "youSaveSpend 1234\n", + "charge240b 37807\n", + "phevCity 37788\n", + "phevHwy 37788\n", + "phevComb 37788\n", + "dtype: int64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_zeros[ds_zeros>0]  # only the columns that contain at least one zero" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "eng_dscr 15403\n", + "guzzler 35562\n", + "trans_dscr 22796\n", + "tCharger 32657\n", + "sCharger 37177\n", + "atvType 34771\n", + "fuelType2 36435\n", + "rangeA 36440\n", +
"evMotor 37281\n", + "mfrCode 30818\n", + "c240Dscr 37806\n", + "c240bDscr 37807\n", + "startStop 31705\n", + "dtype: int64" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Columns with more than 10000 missing values:\n", + "# mostly empty, so they are the first candidates to drop\n", + "ds_null = ds.isnull().sum()\n", + "ds_null[ds_null>10000]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
barrels08barrelsA08charge120charge240city08city08UcityA08cityA08UcityCDcityE...UHighwayAVClassyearyouSaveSpendcharge240bcreatedOnmodifiedOnphevCityphevHwyphevComb
015.6957140.00.00.0190.000.00.00.0...0.0Two Seaters1985-12500.0Tue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013000
129.9645450.00.00.090.000.00.00.0...0.0Two Seaters1985-85000.0Tue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013000
212.2077780.00.00.0230.000.00.00.0...0.0Subcompact Cars19855000.0Tue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013000
329.9645450.00.00.0100.000.00.00.0...0.0Vans1985-85000.0Tue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013000
417.3478950.00.00.0170.000.00.00.0...0.0Compact Cars1993-40000.0Tue Jan 01 00:00:00 EST 2013Tue Jan 01 00:00:00 EST 2013000
\n", + "

5 rows × 69 columns

\n", + "
" + ], + "text/plain": [ + " barrels08 barrelsA08 charge120 charge240 city08 city08U cityA08 \\\n", + "0 15.695714 0.0 0.0 0.0 19 0.0 0 \n", + "1 29.964545 0.0 0.0 0.0 9 0.0 0 \n", + "2 12.207778 0.0 0.0 0.0 23 0.0 0 \n", + "3 29.964545 0.0 0.0 0.0 10 0.0 0 \n", + "4 17.347895 0.0 0.0 0.0 17 0.0 0 \n", + "\n", + " cityA08U cityCD cityE ... UHighwayA VClass year \\\n", + "0 0.0 0.0 0.0 ... 0.0 Two Seaters 1985 \n", + "1 0.0 0.0 0.0 ... 0.0 Two Seaters 1985 \n", + "2 0.0 0.0 0.0 ... 0.0 Subcompact Cars 1985 \n", + "3 0.0 0.0 0.0 ... 0.0 Vans 1985 \n", + "4 0.0 0.0 0.0 ... 0.0 Compact Cars 1993 \n", + "\n", + " youSaveSpend charge240b createdOn \\\n", + "0 -1250 0.0 Tue Jan 01 00:00:00 EST 2013 \n", + "1 -8500 0.0 Tue Jan 01 00:00:00 EST 2013 \n", + "2 500 0.0 Tue Jan 01 00:00:00 EST 2013 \n", + "3 -8500 0.0 Tue Jan 01 00:00:00 EST 2013 \n", + "4 -4000 0.0 Tue Jan 01 00:00:00 EST 2013 \n", + "\n", + " modifiedOn phevCity phevHwy phevComb \n", + "0 Tue Jan 01 00:00:00 EST 2013 0 0 0 \n", + "1 Tue Jan 01 00:00:00 EST 2013 0 0 0 \n", + "2 Tue Jan 01 00:00:00 EST 2013 0 0 0 \n", + "3 Tue Jan 01 00:00:00 EST 2013 0 0 0 \n", + "4 Tue Jan 01 00:00:00 EST 2013 0 0 0 \n", + "\n", + "[5 rows x 69 columns]" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Drop every column that has more than NULL_THRESHOLD missing values\n", + "NULL_THRESHOLD = 1000  # the cut-off this drop has always applied (83 -> 69 columns)\n", + "ds_null = ds.isnull().sum()\n", + "li_to_drop = list(ds_null[ds_null > NULL_THRESHOLD].index)\n", + "\n", + "ds_dropped = ds.drop(columns=li_to_drop)\n", + "ds_dropped.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
barrels08barrelsA08charge120charge240city08city08UcityA08cityA08UcityCDcityE...UCityUCityAUHighwayUHighwayAyearyouSaveSpendcharge240bphevCityphevHwyphevComb
015.6957140.00.00.0190.000.00.00.0...23.33330.035.00000.01985-12500.0000
129.9645450.00.00.090.000.00.00.0...11.00000.019.00000.01985-85000.0000
212.2077780.00.00.0230.000.00.00.0...29.00000.047.00000.019855000.0000
329.9645450.00.00.0100.000.00.00.0...12.22220.016.66670.01985-85000.0000
417.3478950.00.00.0170.000.00.00.0...21.00000.032.00000.01993-40000.0000
..................................................................
3783814.9822730.00.00.0190.000.00.00.0...24.00000.037.00000.01993-7500.0000
3783914.3308700.00.00.0200.000.00.00.0...25.00000.039.00000.01993-5000.0000
3784015.6957140.00.00.0180.000.00.00.0...23.00000.034.00000.01993-12500.0000
3784115.6957140.00.00.0180.000.00.00.0...23.00000.034.00000.01993-12500.0000
3784218.3116670.00.00.0160.000.00.00.0...20.00000.029.00000.01993-45000.0000
\n", + "

37843 rows × 60 columns

\n", + "
" + ], + "text/plain": [ + " barrels08 barrelsA08 charge120 charge240 city08 city08U cityA08 \\\n", + "0 15.695714 0.0 0.0 0.0 19 0.0 0 \n", + "1 29.964545 0.0 0.0 0.0 9 0.0 0 \n", + "2 12.207778 0.0 0.0 0.0 23 0.0 0 \n", + "3 29.964545 0.0 0.0 0.0 10 0.0 0 \n", + "4 17.347895 0.0 0.0 0.0 17 0.0 0 \n", + "... ... ... ... ... ... ... ... \n", + "37838 14.982273 0.0 0.0 0.0 19 0.0 0 \n", + "37839 14.330870 0.0 0.0 0.0 20 0.0 0 \n", + "37840 15.695714 0.0 0.0 0.0 18 0.0 0 \n", + "37841 15.695714 0.0 0.0 0.0 18 0.0 0 \n", + "37842 18.311667 0.0 0.0 0.0 16 0.0 0 \n", + "\n", + " cityA08U cityCD cityE ... UCity UCityA UHighway UHighwayA \\\n", + "0 0.0 0.0 0.0 ... 23.3333 0.0 35.0000 0.0 \n", + "1 0.0 0.0 0.0 ... 11.0000 0.0 19.0000 0.0 \n", + "2 0.0 0.0 0.0 ... 29.0000 0.0 47.0000 0.0 \n", + "3 0.0 0.0 0.0 ... 12.2222 0.0 16.6667 0.0 \n", + "4 0.0 0.0 0.0 ... 21.0000 0.0 32.0000 0.0 \n", + "... ... ... ... ... ... ... ... ... \n", + "37838 0.0 0.0 0.0 ... 24.0000 0.0 37.0000 0.0 \n", + "37839 0.0 0.0 0.0 ... 25.0000 0.0 39.0000 0.0 \n", + "37840 0.0 0.0 0.0 ... 23.0000 0.0 34.0000 0.0 \n", + "37841 0.0 0.0 0.0 ... 23.0000 0.0 34.0000 0.0 \n", + "37842 0.0 0.0 0.0 ... 20.0000 0.0 29.0000 0.0 \n", + "\n", + " year youSaveSpend charge240b phevCity phevHwy phevComb \n", + "0 1985 -1250 0.0 0 0 0 \n", + "1 1985 -8500 0.0 0 0 0 \n", + "2 1985 500 0.0 0 0 0 \n", + "3 1985 -8500 0.0 0 0 0 \n", + "4 1993 -4000 0.0 0 0 0 \n", + "... ... ... ... ... ... ... 
\n", + "37838 1993 -750 0.0 0 0 0 \n", + "37839 1993 -500 0.0 0 0 0 \n", + "37840 1993 -1250 0.0 0 0 0 \n", + "37841 1993 -1250 0.0 0 0 0 \n", + "37842 1993 -4500 0.0 0 0 0 \n", + "\n", + "[37843 rows x 60 columns]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## how to check the negatives\n", + "## keep only the non-object columns\n", + "## using select_dtypes()\n", + "\n", + "numeric = ds.select_dtypes(exclude=object)\n", + "numeric\n" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "barrels08 0\n", + "barrelsA08 0\n", + "charge120 0\n", + "charge240 0\n", + "city08 0\n", + "city08U 0\n", + "cityA08 0\n", + "cityA08U 0\n", + "cityCD 0\n", + "cityE 0\n", + "cityUF 0\n", + "co2 31989\n", + "co2A 37318\n", + "co2TailpipeAGpm 0\n", + "co2TailpipeGpm 0\n", + "comb08 0\n", + "comb08U 0\n", + "combA08 0\n", + "combA08U 0\n", + "combE 0\n", + "combinedCD 0\n", + "combinedUF 0\n", + "cylinders 0\n", + "displ 0\n", + "engId 0\n", + "feScore 32028\n", + "fuelCost08 0\n", + "fuelCostA08 0\n", + "ghgScore 32028\n", + "ghgScoreA 37325\n", + "highway08 0\n", + "highway08U 0\n", + "highwayA08 0\n", + "highwayA08U 0\n", + "highwayCD 0\n", + "highwayE 0\n", + "highwayUF 0\n", + "hlv 0\n", + "hpv 0\n", + "id 0\n", + "lv2 0\n", + "lv4 0\n", + "phevBlended 0\n", + "pv2 0\n", + "pv4 0\n", + "range 0\n", + "rangeCity 0\n", + "rangeCityA 0\n", + "rangeHwy 0\n", + "rangeHwyA 0\n", + "UCity 0\n", + "UCityA 0\n", + "UHighway 0\n", + "UHighwayA 0\n", + "year 0\n", + "youSaveSpend 32526\n", + "charge240b 0\n", + "phevCity 0\n", + "phevHwy 0\n", + "phevComb 0\n", + "dtype: int64" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numeric[numeric <0].count() # per-column count of values below zero\n" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "co2 31989\n", + "co2A 37318\n", + "feScore 32028\n", + "ghgScore 32028\n", + "ghgScoreA 37325\n", + "youSaveSpend 32526\n", + "dtype: int64" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_nagatives = numeric[numeric <0].count()\n", + "ds_nagatives[ds_nagatives >0] # keep only the columns that have at least one negative value." + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "condition = ds['displ'].isnull()\n", + "null_displ = ds[condition]\n", + "null_displ = null_displ[['year', 'make', 'model', 'trany', 'fuelType', 'cylinders', 'displ']]" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearmakemodeltranyfuelTypecylindersdispl
71382000NissanAltra EVNaNElectricityNaNNaN
71392000ToyotaRAV4 EVNaNElectricityNaNNaN
81432001ToyotaRAV4 EVNaNElectricityNaNNaN
81442001FordTh!nkNaNElectricityNaNNaN
81462001FordExplorer USPS ElectricNaNElectricityNaNNaN
........................
309692017KiaSoul ElectricAutomatic (A1)ElectricityNaNNaN
309722016TeslaModel S (60 kW-hr battery pack)Automatic (A1)ElectricityNaNNaN
309732016TeslaModel S AWD - 60DAutomatic (A1)ElectricityNaNNaN
309742016TeslaModel S AWD - P100DAutomatic (A1)ElectricityNaNNaN
309752016TeslaModel X AWD - 60DAutomatic (A1)ElectricityNaNNaN
\n", + "

120 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " year make model trany \\\n", + "7138 2000 Nissan Altra EV NaN \n", + "7139 2000 Toyota RAV4 EV NaN \n", + "8143 2001 Toyota RAV4 EV NaN \n", + "8144 2001 Ford Th!nk NaN \n", + "8146 2001 Ford Explorer USPS Electric NaN \n", + "... ... ... ... ... \n", + "30969 2017 Kia Soul Electric Automatic (A1) \n", + "30972 2016 Tesla Model S (60 kW-hr battery pack) Automatic (A1) \n", + "30973 2016 Tesla Model S AWD - 60D Automatic (A1) \n", + "30974 2016 Tesla Model S AWD - P100D Automatic (A1) \n", + "30975 2016 Tesla Model X AWD - 60D Automatic (A1) \n", + "\n", + " fuelType cylinders displ \n", + "7138 Electricity NaN NaN \n", + "7139 Electricity NaN NaN \n", + "8143 Electricity NaN NaN \n", + "8144 Electricity NaN NaN \n", + "8146 Electricity NaN NaN \n", + "... ... ... ... \n", + "30969 Electricity NaN NaN \n", + "30972 Electricity NaN NaN \n", + "30973 Electricity NaN NaN \n", + "30974 Electricity NaN NaN \n", + "30975 Electricity NaN NaN \n", + "\n", + "[120 rows x 7 columns]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "null_displ" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "ds[['displ','cylinders']] = ds[['displ','cylinders']].fillna(0)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
yearmakemodeltranydrivefuelTypecylindersdispl
296702016VolvoXC90 AWDAutomatic (S8)All-Wheel DrivePremium4.02.0
296712016ChevroletSpark EVAutomatic (A1)Front-Wheel DriveElectricity0.00.0
296722016Mercedes-BenzB250eAutomatic (A1)Front-Wheel DriveElectricity0.00.0
296732016HyundaiSonata Plug-in HybridAuto(AM6)Front-Wheel DriveRegular Gas and Electricity4.02.0
296742016JaguarXF AWDAutomatic (S8)All-Wheel DrivePremium6.03.0
...........................
299951987GMCR15 Pickup 2WDAutomatic 3-spdRear-Wheel DriveRegular6.04.3
299962017MitsubishiMirageManual 5-spdFront-Wheel DriveRegular3.01.2
299972017MitsubishiMirageAutomatic (variable gear ratios)Front-Wheel DriveRegular3.01.2
299982017ChryslerPacificaAutomatic 9-spdFront-Wheel DriveRegular6.03.6
299992017Mitsubishii-MiEVAutomatic (A1)Rear-Wheel DriveElectricity0.00.0
\n", + "

330 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " year make model \\\n", + "29670 2016 Volvo XC90 AWD \n", + "29671 2016 Chevrolet Spark EV \n", + "29672 2016 Mercedes-Benz B250e \n", + "29673 2016 Hyundai Sonata Plug-in Hybrid \n", + "29674 2016 Jaguar XF AWD \n", + "... ... ... ... \n", + "29995 1987 GMC R15 Pickup 2WD \n", + "29996 2017 Mitsubishi Mirage \n", + "29997 2017 Mitsubishi Mirage \n", + "29998 2017 Chrysler Pacifica \n", + "29999 2017 Mitsubishi i-MiEV \n", + "\n", + " trany drive \\\n", + "29670 Automatic (S8) All-Wheel Drive \n", + "29671 Automatic (A1) Front-Wheel Drive \n", + "29672 Automatic (A1) Front-Wheel Drive \n", + "29673 Auto(AM6) Front-Wheel Drive \n", + "29674 Automatic (S8) All-Wheel Drive \n", + "... ... ... \n", + "29995 Automatic 3-spd Rear-Wheel Drive \n", + "29996 Manual 5-spd Front-Wheel Drive \n", + "29997 Automatic (variable gear ratios) Front-Wheel Drive \n", + "29998 Automatic 9-spd Front-Wheel Drive \n", + "29999 Automatic (A1) Rear-Wheel Drive \n", + "\n", + " fuelType cylinders displ \n", + "29670 Premium 4.0 2.0 \n", + "29671 Electricity 0.0 0.0 \n", + "29672 Electricity 0.0 0.0 \n", + "29673 Regular Gas and Electricity 4.0 2.0 \n", + "29674 Premium 6.0 3.0 \n", + "... ... ... ... 
\n", + "29995 Regular 6.0 4.3 \n", + "29996 Regular 3.0 1.2 \n", + "29997 Regular 3.0 1.2 \n", + "29998 Regular 6.0 3.6 \n", + "29999 Electricity 0.0 0.0 \n", + "\n", + "[330 rows x 8 columns]" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds[['year', 'make', 'model', 'trany', 'drive','fuelType','cylinders', 'displ']][29670:30000]" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('float64')" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## datatype corrections \n", + "\n", + "ds['cylinders'].dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "barrels08 float64\n", + "barrelsA08 float64\n", + "charge120 float64\n", + "charge240 float64\n", + "city08 int64\n", + " ... \n", + "modifiedOn object\n", + "startStop object\n", + "phevCity int64\n", + "phevHwy int64\n", + "phevComb int64\n", + "Length: 83, dtype: object" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dtype('int64')" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds['cylinders'] = ds['cylinders'].astype('int64')\n", + "ds['cylinders'].dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{nan, 'Auto(AV-S6)', 'Manual(M7)', 'Automatic (S7)', 'Auto(AM-S9)', 'Automatic 9-spd', 'Automatic (A6)', 'Auto(AM7)', 'Automatic (AM6)', 'Auto(AV-S8)', 'Automatic (AV-S6)', 'Auto(AM-S6)', 'Automatic 3-spd', 'Automatic (variable gear ratios)', 
'Automatic (A1)', 'Auto (AV-S8)', 'Manual 7-spd', 'Auto(AM-S8)', 'Automatic 7-spd', 'Manual 4-spd Doubled', 'Automatic 6spd', 'Automatic 5-spd', 'Automatic (S5)', 'Auto (AV)', 'Auto(L4)', 'Manual 3-spd', 'Automatic (S8)', 'Automatic 4-spd', 'Automatic 8-spd', 'Manual 6-spd', 'Automatic (AV)', 'Auto (AV-S6)', 'Auto(L3)', 'Auto(AM-S7)', 'Manual 5 spd', 'Auto(AM5)', 'Auto(A1)', 'Auto(AM6)', 'Automatic (S9)', 'Manual 4-spd', 'Manual 5-spd', 'Automatic (S6)', 'Auto(AM8)', 'Automatic (S4)', 'Auto(AV-S7)', 'Automatic (AM5)', 'Automatic 6-spd'}\n" + ] + } + ], + "source": [ + "## cleaning text and removing special symbols\n", + "print (set(ds['trany']))" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Manual 5-spd', 'Automatic 3-spd', 'Automatic 4-spd',\n", + " 'Automatic 5-spd', 'Manual 4-spd', 'Manual 3-spd', 'Manual 6-spd',\n", + " 'Automatic (S5)', 'Automatic (variable gear ratios)',\n", + " 'Automatic 6-spd', 'Automatic (S6)', nan, 'Automatic (S4)',\n", + " 'Automatic 7-spd', 'Automatic (S7)', 'Automatic (S8)',\n", + " 'Automatic (AM5)', 'Auto(AM6)', 'Auto(AV-S7)', 'Automatic (A6)',\n", + " 'Automatic (AV-S6)', 'Auto(AM7)', 'Manual 4-spd Doubled',\n", + " 'Manual 5 spd', 'Automatic (AM6)', 'Manual 7-spd', 'Auto(L4)',\n", + " 'Auto(L3)', 'Automatic (AV)', 'Auto (AV-S6)', 'Auto(AM5)',\n", + " 'Auto(AV-S6)', 'Auto (AV-S8)', 'Automatic 8-spd', 'Auto(AV-S8)',\n", + " 'Automatic (A1)', 'Auto (AV)', 'Auto(AM-S6)', 'Auto(AM-S7)',\n", + " 'Automatic 6spd', 'Automatic 9-spd', 'Automatic (S9)',\n", + " 'Auto(AM-S8)', 'Auto(A1)', 'Auto(AM8)', 'Manual(M7)',\n", + " 'Auto(AM-S9)'], dtype=object)" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds['trany'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds['trany'] = ds['trany']" + ] + } + ], + 
"metadata": { + "interpreter": { + "hash": "ade50332f43eeb97f8e1fbc4c7d493d3589b4263fef63cb9cf9d26d524eaa4f1" + }, + "kernelspec": { + "display_name": "Python 3.9.7 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}