diff --git a/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb b/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb new file mode 100644 index 0000000..c70fb32 --- /dev/null +++ b/your-code/.ipynb_checkpoints/challenge-1-checkpoint.ipynb @@ -0,0 +1,1208 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 1\n", + "\n", + "In this challenge you will be working on **Pokemon**. You will answer a series of questions in order to practice dataframe calculation, aggregation, and transformation.\n", + "\n", + "![Pokemon](../images/pokemon.jpg)\n", + "\n", + "Follow the instructions below and enter your code.\n", + "\n", + "#### Import all required libraries." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# import libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Import data set.\n", + "\n", + "Read the dataset `pokemon.csv` into a dataframe called `pokemon`.\n", + "\n", + "*Data set attributed to [Alberto Barradas](https://www.kaggle.com/abcsds/pokemon/)*" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# import dataset\n", + "pokemon = pd.read_csv(r'C:\\Users\\gaelm\\Desktop\\lab\\lab-dataframe-calculations\\your-code\\Pokemon.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Print first 10 rows of `pokemon`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
55CharmeleonFireNaN4055864588065801False
66CharizardFireFlying534788478109851001False
76CharizardMega Charizard XFireDragon63478130111130851001False
86CharizardMega Charizard YFireFlying63478104781591151001False
97SquirtleWaterNaN3144448655064431False
\n", + "
" + ], + "text/plain": [ + " # Name Type 1 Type 2 Total HP Attack Defense \\\n", + "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", + "1 2 Ivysaur Grass Poison 405 60 62 63 \n", + "2 3 Venusaur Grass Poison 525 80 82 83 \n", + "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", + "4 4 Charmander Fire NaN 309 39 52 43 \n", + "5 5 Charmeleon Fire NaN 405 58 64 58 \n", + "6 6 Charizard Fire Flying 534 78 84 78 \n", + "7 6 CharizardMega Charizard X Fire Dragon 634 78 130 111 \n", + "8 6 CharizardMega Charizard Y Fire Flying 634 78 104 78 \n", + "9 7 Squirtle Water NaN 314 44 48 65 \n", + "\n", + " Sp. Atk Sp. Def Speed Generation Legendary \n", + "0 65 65 45 1 False \n", + "1 80 80 60 1 False \n", + "2 100 100 80 1 False \n", + "3 122 120 80 1 False \n", + "4 60 50 65 1 False \n", + "5 80 65 80 1 False \n", + "6 109 85 100 1 False \n", + "7 130 85 100 1 False \n", + "8 159 115 100 1 False \n", + "9 50 64 43 1 False " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "pokemon.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When you look at a data set, you often wonder what each column means. Some open-source data sets provide descriptions of the data set. In many cases, data descriptions are extremely useful for data analysts to perform work efficiently and successfully.\n", + "\n", + "For the `Pokemon.csv` data set, fortunately, the owner provided descriptions which you can see [here](https://www.kaggle.com/abcsds/pokemon/home). For your convenience, we are including the descriptions below. Read the descriptions and understand what each column means. 
This knowledge is helpful in your work with the data.\n", + "\n", + "| Column | Description |\n", + "| --- | --- |\n", + "| # | ID for each pokemon |\n", + "| Name | Name of each pokemon |\n", + "| Type 1 | Each pokemon has a type, this determines weakness/resistance to attacks |\n", + "| Type 2 | Some pokemon are dual type and have 2 |\n", + "| Total | A general guide to how strong a pokemon is |\n", + "| HP | Hit points, or health, defines how much damage a pokemon can withstand before fainting |\n", + "| Attack | The base modifier for normal attacks (e.g. Scratch, Punch) |\n", + "| Defense | The base damage resistance against normal attacks |\n", + "| SP Atk | Special attack, the base modifier for special attacks (e.g. fire blast, bubble beam) |\n", + "| SP Def | The base damage resistance against special attacks |\n", + "| Speed | Determines which pokemon attacks first each round |\n", + "| Generation | Number of generation |\n", + "| Legendary | True if Legendary Pokemon False if not |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Obtain the distinct values across `Type 1` and `Type 2`.\n", + "\n", + "Extract all the values in `Type 1` and `Type 2`. Then create an array containing the distinct values across both fields." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Grass', 'Fire', 'Water', 'Bug', 'Normal', 'Poison', 'Electric',\n", + " 'Ground', 'Fairy', 'Fighting', 'Psychic', 'Rock', 'Ghost', 'Ice',\n", + " 'Dragon', 'Dark', 'Steel', 'Flying'], dtype=object)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "type_1_values = pokemon['Type 1'].unique()\n", + "type_1_values" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Poison', nan, 'Flying', 'Dragon', 'Ground', 'Fairy', 'Grass',\n", + " 'Fighting', 'Psychic', 'Steel', 'Ice', 'Rock', 'Dark', 'Water',\n", + " 'Electric', 'Fire', 'Ghost', 'Bug', 'Normal'], dtype=object)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type_2_values = pokemon['Type 2'].unique()\n", + "type_2_values" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Bug', 'Dark', 'Dragon', 'Electric', 'Fairy', 'Fighting', 'Fire',\n", + " 'Flying', 'Ghost', 'Grass', 'Ground', 'Ice', 'Normal', 'Poison',\n", + " 'Psychic', 'Rock', 'Steel', 'Water'], dtype=object)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distinct_types = np.unique(np.concatenate((type_1_values[~pd.isna(type_1_values)], type_2_values[~pd.isna(type_2_values)])))\n", + "distinct_types " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Cleanup `Name` that contain \"Mega\".\n", + "\n", + "If you have checked out the pokemon names carefully enough, you should have found there are junk texts in the pokemon names which contain \"Mega\". We want to clean up the pokemon names. 
For instance, \"VenusaurMega Venusaur\" should be \"Mega Venusaur\", and \"CharizardMega Charizard X\" should be \"Mega Charizard X\"." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Name
0Bulbasaur
1Ivysaur
2Venusaur
3VenusaurMega Venusaur
4Charmander
5Charmeleon
6Charizard
7CharizardMega Charizard X
8CharizardMega Charizard Y
9Squirtle
\n", + "
" + ], + "text/plain": [ + " Name\n", + "0 Bulbasaur\n", + "1 Ivysaur\n", + "2 Venusaur\n", + "3 VenusaurMega Venusaur\n", + "4 Charmander\n", + "5 Charmeleon\n", + "6 Charizard\n", + "7 CharizardMega Charizard X\n", + "8 CharizardMega Charizard Y\n", + "9 Squirtle" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "\n", + "pokemon['Name'] = pokemon['Name'].str.replace(r'Mega\\s*', 'Mega ', regex=True).str.replace(r'Mega\\s*(\\w+)', r'Mega \\1', regex=True)\n", + "\n", + "# test transformed data\n", + "pokemon[['Name']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Create a new column called `A/D Ratio` whose value equals to `Attack` devided by `Defense`.\n", + "\n", + "For instance, if a pokemon has the Attack score 49 and Defense score 49, the corresponding `A/D Ratio` is 49/49=1." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAttackDefenseA/D Ratio
0Bulbasaur49491.000000
1Ivysaur62630.984127
2Venusaur82830.987952
3VenusaurMega Venusaur1001230.813008
4Charmander52431.209302
5Charmeleon64581.103448
6Charizard84781.076923
7CharizardMega Charizard X1301111.171171
8CharizardMega Charizard Y104781.333333
9Squirtle48650.738462
\n", + "
" + ], + "text/plain": [ + " Name Attack Defense A/D Ratio\n", + "0 Bulbasaur 49 49 1.000000\n", + "1 Ivysaur 62 63 0.984127\n", + "2 Venusaur 82 83 0.987952\n", + "3 VenusaurMega Venusaur 100 123 0.813008\n", + "4 Charmander 52 43 1.209302\n", + "5 Charmeleon 64 58 1.103448\n", + "6 Charizard 84 78 1.076923\n", + "7 CharizardMega Charizard X 130 111 1.171171\n", + "8 CharizardMega Charizard Y 104 78 1.333333\n", + "9 Squirtle 48 65 0.738462" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "pokemon['A/D Ratio'] = pokemon['Attack'] / pokemon['Defense']\n", + "pokemon[['Name', 'Attack', 'Defense', 'A/D Ratio']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Identify the pokemon with the highest `A/D Ratio`." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# 386\n", + "Name DeoxysAttack Forme\n", + "Type 1 Psychic\n", + "Type 2 NaN\n", + "Total 600\n", + "HP 50\n", + "Attack 180\n", + "Defense 20\n", + "Sp. Atk 180\n", + "Sp. Def 20\n", + "Speed 150\n", + "Generation 3\n", + "Legendary True\n", + "A/D Ratio 9.0\n", + "Name: 429, dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "highestAD_ratio_index = pokemon['A/D Ratio'].idxmax()\n", + "highestAD_ratio_pokemon = pokemon.loc[highestAD_ratio_index]\n", + "highestAD_ratio_pokemon" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Identify the pokemon with the lowest A/D Ratio." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# 213\n", + "Name Shuckle\n", + "Type 1 Bug\n", + "Type 2 Rock\n", + "Total 505\n", + "HP 20\n", + "Attack 10\n", + "Defense 230\n", + "Sp. Atk 10\n", + "Sp. 
Def 230\n", + "Speed 5\n", + "Generation 2\n", + "Legendary False\n", + "A/D Ratio 0.043478\n", + "Name: 230, dtype: object" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "lowestAD_ratio_index = pokemon['A/D Ratio'].idxmin()\n", + "lowestAD_ratio_pokemon = pokemon.loc[lowestAD_ratio_index]\n", + "lowestAD_ratio_pokemon" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Create a new column called `Combo Type` whose value combines `Type 1` and `Type 2`.\n", + "\n", + "Rules:\n", + "\n", + "* If both `Type 1` and `Type 2` have valid values, the `Combo Type` value should contain both values in the form of `<Type 1>-<Type 2>`. For example, if `Type 1` value is `Grass` and `Type 2` value is `Poison`, `Combo Type` will be `Grass-Poison`.\n", + "\n", + "* If `Type 1` has a valid value but `Type 2` does not, `Combo Type` will be the same as `Type 1`. For example, if `Type 1` is `Fire` whereas `Type 2` is `NaN`, `Combo Type` will be `Fire`." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameType 1Type 2Combo Type
0BulbasaurGrassPoisonGrass-Poison
1IvysaurGrassPoisonGrass-Poison
2VenusaurGrassPoisonGrass-Poison
3VenusaurMega VenusaurGrassPoisonGrass-Poison
4CharmanderFireNaNFire
5CharmeleonFireNaNFire
6CharizardFireFlyingFire-Flying
7CharizardMega Charizard XFireDragonFire-Dragon
8CharizardMega Charizard YFireFlyingFire-Flying
9SquirtleWaterNaNWater
\n", + "
" + ], + "text/plain": [ + " Name Type 1 Type 2 Combo Type\n", + "0 Bulbasaur Grass Poison Grass-Poison\n", + "1 Ivysaur Grass Poison Grass-Poison\n", + "2 Venusaur Grass Poison Grass-Poison\n", + "3 VenusaurMega Venusaur Grass Poison Grass-Poison\n", + "4 Charmander Fire NaN Fire\n", + "5 Charmeleon Fire NaN Fire\n", + "6 Charizard Fire Flying Fire-Flying\n", + "7 CharizardMega Charizard X Fire Dragon Fire-Dragon\n", + "8 CharizardMega Charizard Y Fire Flying Fire-Flying\n", + "9 Squirtle Water NaN Water" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "pokemon['Combo Type'] = np.where(\n", + " pokemon['Type 2'].isna(), \n", + " pokemon['Type 1'], \n", + " pokemon['Type 1'] + '-' + pokemon['Type 2']\n", + ")\n", + "pokemon[['Name', 'Type 1', 'Type 2', 'Combo Type']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Identify the pokemon whose `A/D Ratio` are among the top 5." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAttackDefenseA/D Ratio
429DeoxysAttack Forme180209.000
347Carvanha90204.500
19BeedrillMega Beedrill150403.750
453Cranidos125403.125
348Sharpedo120403.000
\n", + "
" + ], + "text/plain": [ + " Name Attack Defense A/D Ratio\n", + "429 DeoxysAttack Forme 180 20 9.000\n", + "347 Carvanha 90 20 4.500\n", + "19 BeedrillMega Beedrill 150 40 3.750\n", + "453 Cranidos 125 40 3.125\n", + "348 Sharpedo 120 40 3.000" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "top_5AD_ratio = pokemon.nlargest(5, 'A/D Ratio')\n", + "\n", + "top_5AD_ratio[['Name', 'Attack', 'Defense', 'A/D Ratio']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### For the 5 pokemon printed above, aggregate `Combo Type` and use a list to store the unique values.\n", + "\n", + "Your end product is a list containing the distinct `Combo Type` values of the 5 pokemon with the highest `A/D Ratio`." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Psychic', 'Water-Dark', 'Bug-Poison', 'Rock']" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "top_5AD_ratio = pokemon.nlargest(5, 'A/D Ratio')\n", + "unique_combo_types = top_5AD_ratio['Combo Type'].unique().tolist()\n", + "unique_combo_types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### For each of the `Combo Type` values obtained from the previous question, calculate the mean scores of all numeric fields across all pokemon.\n", + "\n", + "Your output should look like below:\n", + "\n", + "![Aggregate](../images/aggregated-mean.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendaryA/D Ratio
Combo Type
Psychic381.973684464.55263272.55263264.94736867.23684298.55263282.39473778.8684213.3421050.2368421.164196
Water-Dark347.666667493.83333369.166667120.00000065.16666788.83333363.50000087.1666673.1666670.0000002.291949
Bug-Poison199.166667347.91666753.75000068.33333358.08333342.50000059.33333365.9166672.3333330.0000001.315989
Rock410.111111409.44444467.111111103.333333107.22222240.55555658.33333332.8888893.8888890.1111111.260091
\n", + "
" + ], + "text/plain": [ + " # Total HP Attack Defense \\\n", + "Combo Type \n", + "Psychic 381.973684 464.552632 72.552632 64.947368 67.236842 \n", + "Water-Dark 347.666667 493.833333 69.166667 120.000000 65.166667 \n", + "Bug-Poison 199.166667 347.916667 53.750000 68.333333 58.083333 \n", + "Rock 410.111111 409.444444 67.111111 103.333333 107.222222 \n", + "\n", + " Sp. Atk Sp. Def Speed Generation Legendary A/D Ratio \n", + "Combo Type \n", + "Psychic 98.552632 82.394737 78.868421 3.342105 0.236842 1.164196 \n", + "Water-Dark 88.833333 63.500000 87.166667 3.166667 0.000000 2.291949 \n", + "Bug-Poison 42.500000 59.333333 65.916667 2.333333 0.000000 1.315989 \n", + "Rock 40.555556 58.333333 32.888889 3.888889 0.111111 1.260091 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "mean_scores_combo_type = pokemon.groupby('Combo Type').mean(numeric_only=True)\n", + "filtered_mean_scores = mean_scores_combo_type.loc[unique_combo_types]\n", + "filtered_mean_scores\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/your-code/.ipynb_checkpoints/challenge-2-checkpoint.ipynb b/your-code/.ipynb_checkpoints/challenge-2-checkpoint.ipynb new file mode 100644 index 0000000..f7c2ba6 --- /dev/null +++ b/your-code/.ipynb_checkpoints/challenge-2-checkpoint.ipynb @@ -0,0 +1,1702 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 2\n", + "\n", + "In 
this challenge we will continue working with the `Pokemon` dataset. We will attempt solving a slightly more complex problem in which we will practice the iterative data analysis process you learned in [this video](https://www.youtube.com/watch?v=xOomNicqbkk).\n", + "\n", + "The problem statement is as follows:\n", + "\n", + "**You are at a Pokemon black market planning to buy a Pokemon for battle. All Pokemon are sold at the same price and you can only afford to buy one. You cannot choose which specific Pokemon to buy. However, you can specify the type of the Pokemon - one type that exists in either `Type 1` or `Type 2`. Which type should you choose in order to maximize your chance of receiving a good Pokemon?**\n", + "\n", + "To remind you about the 3 steps of iterative data analysis, they are:\n", + "\n", + "1. Setting Expectations\n", + "1. Collecting Information\n", + "1. Reacting to Data / Revising Expectations\n", + "\n", + "Following the iterative process, we'll guide you in completing the challenge." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Problem Solving Iteration 1\n", + "\n", + "In this iteration we'll analyze the problem and identify the breakthrough. The original question statement is kind of vague because we don't know what a *good pokemon* really means as represented in the data. We'll start by understanding the dataset and see if we can find some insights." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Import libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Importing the dataset\n", + "pokemon = pd.read_csv(r'C:\\Users\\gaelm\\Desktop\\lab\\lab-dataframe-calculations\\your-code\\Pokemon.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the data it seems whether a pokemon is good depends on its abilities as represented in the fields of `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. Def`, `Speed`, and `Total`. We are not sure about `Generation` and `Legendary` because they are not necessarily the decisive factors of the pokemon abilities.\n", + "\n", + "But `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. Def`, `Speed`, and `Total` are a lot of fields! If we look at them all at once it's very complicated. This isn't Mission Impossible but it's ideal that we tackle this kind of problem after we learn Machine Learning (which you will do in Module 3). For now, is there a way to consolidate the fields we need to look into?\n", + "\n", + "Fortunately there seems to be a way. It appears the `Total` field is computed based on the other 6 fields. But we need to prove our theory. If we can prove that there is a formula to compute `Total` based on the other 6 abilities, we only need to look into `Total`.\n", + "\n", + "We have the following expectation now:\n", + "\n", + "#### The `Total` field is computed based on `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. 
Def`, and `Speed`.\n", + "\n", + "We need to collect the following information:\n", + "\n", + "* **What is the formula to compute `Total`?**\n", + "* **Does the formula work for all pokemon?**\n", + "\n", + "In the cell below, make a hypothesis on how `Total` is computed and test your hypothesis." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# your code here\n", + "pokemon['Calculated Total'] = (\n", + " pokemon['HP'] +\n", + " pokemon['Attack'] +\n", + " pokemon['Defense'] +\n", + " pokemon['Sp. Atk'] +\n", + " pokemon['Sp. Def'] +\n", + " pokemon['Speed']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "pokemon['Total Matches'] = pokemon['Calculated Total'] == pokemon['Total']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Total Matches\n", + "True 800\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "totals_matching= pokemon['Total Matches'].value_counts()\n", + "totals_matching" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Problem Solving Iteration 2\n", + "\n", + "Now that we have consolidated the abilities fields, we can update the problem statement. 
The new problem statement is:\n", + "\n", + "### Which pokemon type is most likely to have the highest `Total` value?\n", + "\n", + "In the updated problem statement, we assume there is a certain relationship between the `Total` and the pokemon type. But we have two *type* fields (`Type 1` and `Type 2`) that have string values. In data analysis, string fields have to be transformed to numerical format in order to be analyzed. \n", + "\n", + "In addition, keep in mind that `Type 1` always has a value but `Type 2` is sometimes empty (having the `NaN` value). Also, the pokemon type we choose may be either in `Type 1` or `Type 2`.\n", + "\n", + "Now our expectation is:\n", + "\n", + "#### `Type 1` and `Type 2` string variables need to be converted to numerical variables in order to identify the relationship between `Total` and the pokemon type.\n", + "\n", + "The information we need to collect is:\n", + "\n", + "#### How to convert two string variables to numerical?\n", + "\n", + "Let's address the first question first. You can use a method called **One Hot Encoding** which is frequently used in machine learning to encode categorical string variables to numerical. The idea is to gather all the possible string values in a categorical field and create a numerical field for each unique string value. Each of those numerical fields uses `1` and `0` to indicate whether the data record has the corresponding categorical value. A detailed explanation of One Hot Encoding can be found in [this article](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f). You will formally learn it in Module 3.\n", + "\n", + "For instance, if a pokemon has `Type 1` as `Poison` and `Type 2` as `Fire`, then its `Poison` and `Fire` fields are `1` whereas all other fields are `0`. 
If a pokemon has `Type 1` as `Water` and `Type 2` as `NaN`, then its `Water` field is `1` whereas all other fields are `0`.\n", + "\n", + "#### In the next cell, use One Hot Encoding to encode `Type 1` and `Type 2`. Use the pokemon type values as the names of the numerical fields you create.\n", + "\n", + "The new numerical variables you create should look like below:\n", + "\n", + "![One Hot Encoding](../images/one-hot-encoding.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. Def...Type2_GhostType2_GrassType2_GroundType2_IceType2_NormalType2_PoisonType2_PsychicType2_RockType2_SteelType2_Water
01BulbasaurGrassPoison3184549496565...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
12IvysaurGrassPoison4056062638080...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
23VenusaurGrassPoison525808283100100...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
33VenusaurMega VenusaurGrassPoison62580100123122120...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
44CharmanderFireNaN3093952436050...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
..................................................................
795719DiancieRockFairy60050100150100150...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
796719DiancieMega DiancieRockFairy70050160110160110...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
797720HoopaHoopa ConfinedPsychicGhost6008011060150130...TrueFalseFalseFalseFalseFalseFalseFalseFalseFalse
798720HoopaHoopa UnboundPsychicDark6808016060170130...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
799721VolcanionFireWater6008011012013090...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
\n", + "

800 rows × 51 columns

\n", + "
" + ], + "text/plain": [ + " # Name Type 1 Type 2 Total HP Attack Defense \\\n", + "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", + "1 2 Ivysaur Grass Poison 405 60 62 63 \n", + "2 3 Venusaur Grass Poison 525 80 82 83 \n", + "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", + "4 4 Charmander Fire NaN 309 39 52 43 \n", + ".. ... ... ... ... ... .. ... ... \n", + "795 719 Diancie Rock Fairy 600 50 100 150 \n", + "796 719 DiancieMega Diancie Rock Fairy 700 50 160 110 \n", + "797 720 HoopaHoopa Confined Psychic Ghost 600 80 110 60 \n", + "798 720 HoopaHoopa Unbound Psychic Dark 680 80 160 60 \n", + "799 721 Volcanion Fire Water 600 80 110 120 \n", + "\n", + " Sp. Atk Sp. Def ... Type2_Ghost Type2_Grass Type2_Ground Type2_Ice \\\n", + "0 65 65 ... False False False False \n", + "1 80 80 ... False False False False \n", + "2 100 100 ... False False False False \n", + "3 122 120 ... False False False False \n", + "4 60 50 ... False False False False \n", + ".. ... ... ... ... ... ... ... \n", + "795 100 150 ... False False False False \n", + "796 160 110 ... False False False False \n", + "797 150 130 ... True False False False \n", + "798 170 130 ... False False False False \n", + "799 130 90 ... False False False False \n", + "\n", + " Type2_Normal Type2_Poison Type2_Psychic Type2_Rock Type2_Steel \\\n", + "0 False True False False False \n", + "1 False True False False False \n", + "2 False True False False False \n", + "3 False True False False False \n", + "4 False False False False False \n", + ".. ... ... ... ... ... \n", + "795 False False False False False \n", + "796 False False False False False \n", + "797 False False False False False \n", + "798 False False False False False \n", + "799 False False False False False \n", + "\n", + " Type2_Water \n", + "0 False \n", + "1 False \n", + "2 False \n", + "3 False \n", + "4 False \n", + ".. ... 
\n", + "795 False \n", + "796 False \n", + "797 False \n", + "798 False \n", + "799 True \n", + "\n", + "[800 rows x 51 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "type1_encoded = pd.get_dummies(pokemon['Type 1'], prefix='Type1')\n", + "type2_encoded = pd.get_dummies(pokemon['Type 2'], prefix='Type2')\n", + "pokemon_encoded = pd.concat([pokemon, type1_encoded, type2_encoded], axis=1)\n", + "pokemon_encoded" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#NameTotalHPAttackDefenseSp. AtkSp. DefSpeedGeneration...Type2_GhostType2_GrassType2_GroundType2_IceType2_NormalType2_PoisonType2_PsychicType2_RockType2_SteelType2_Water
01Bulbasaur3184549496565451...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
12Ivysaur4056062638080601...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
23Venusaur525808283100100801...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
33VenusaurMega Venusaur62580100123122120801...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
44Charmander3093952436050651...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
..................................................................
795719Diancie60050100150100150506...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
796719DiancieMega Diancie700501601101601101106...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
797720HoopaHoopa Confined6008011060150130706...TrueFalseFalseFalseFalseFalseFalseFalseFalseFalse
798720HoopaHoopa Unbound6808016060170130806...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
799721Volcanion6008011012013090706...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
\n", + "

800 rows × 49 columns

\n", + "
" + ], + "text/plain": [ + " # Name Total HP Attack Defense Sp. Atk Sp. Def \\\n", + "0 1 Bulbasaur 318 45 49 49 65 65 \n", + "1 2 Ivysaur 405 60 62 63 80 80 \n", + "2 3 Venusaur 525 80 82 83 100 100 \n", + "3 3 VenusaurMega Venusaur 625 80 100 123 122 120 \n", + "4 4 Charmander 309 39 52 43 60 50 \n", + ".. ... ... ... .. ... ... ... ... \n", + "795 719 Diancie 600 50 100 150 100 150 \n", + "796 719 DiancieMega Diancie 700 50 160 110 160 110 \n", + "797 720 HoopaHoopa Confined 600 80 110 60 150 130 \n", + "798 720 HoopaHoopa Unbound 680 80 160 60 170 130 \n", + "799 721 Volcanion 600 80 110 120 130 90 \n", + "\n", + " Speed Generation ... Type2_Ghost Type2_Grass Type2_Ground \\\n", + "0 45 1 ... False False False \n", + "1 60 1 ... False False False \n", + "2 80 1 ... False False False \n", + "3 80 1 ... False False False \n", + "4 65 1 ... False False False \n", + ".. ... ... ... ... ... ... \n", + "795 50 6 ... False False False \n", + "796 110 6 ... False False False \n", + "797 70 6 ... True False False \n", + "798 80 6 ... False False False \n", + "799 70 6 ... False False False \n", + "\n", + " Type2_Ice Type2_Normal Type2_Poison Type2_Psychic Type2_Rock \\\n", + "0 False False True False False \n", + "1 False False True False False \n", + "2 False False True False False \n", + "3 False False True False False \n", + "4 False False False False False \n", + ".. ... ... ... ... ... \n", + "795 False False False False False \n", + "796 False False False False False \n", + "797 False False False False False \n", + "798 False False False False False \n", + "799 False False False False False \n", + "\n", + " Type2_Steel Type2_Water \n", + "0 False False \n", + "1 False False \n", + "2 False False \n", + "3 False False \n", + "4 False False \n", + ".. ... ... 
\n", + "795 False False \n", + "796 False False \n", + "797 False False \n", + "798 False False \n", + "799 False True \n", + "\n", + "[800 rows x 49 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pokemon_encoded = pokemon_encoded.drop(['Type 1', 'Type 2'], axis=1)\n", + "pokemon_encoded" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#NameTotalHPAttackDefenseSp. AtkSp. DefSpeedGeneration...Type2_GhostType2_GrassType2_GroundType2_IceType2_NormalType2_PoisonType2_PsychicType2_RockType2_SteelType2_Water
01Bulbasaur3184549496565451...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
12Ivysaur4056062638080601...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
23Venusaur525808283100100801...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
33VenusaurMega Venusaur62580100123122120801...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
44Charmander3093952436050651...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", + "

5 rows × 49 columns

\n", + "
" + ], + "text/plain": [ + " # Name Total HP Attack Defense Sp. Atk Sp. Def \\\n", + "0 1 Bulbasaur 318 45 49 49 65 65 \n", + "1 2 Ivysaur 405 60 62 63 80 80 \n", + "2 3 Venusaur 525 80 82 83 100 100 \n", + "3 3 VenusaurMega Venusaur 625 80 100 123 122 120 \n", + "4 4 Charmander 309 39 52 43 60 50 \n", + "\n", + " Speed Generation ... Type2_Ghost Type2_Grass Type2_Ground Type2_Ice \\\n", + "0 45 1 ... False False False False \n", + "1 60 1 ... False False False False \n", + "2 80 1 ... False False False False \n", + "3 80 1 ... False False False False \n", + "4 65 1 ... False False False False \n", + "\n", + " Type2_Normal Type2_Poison Type2_Psychic Type2_Rock Type2_Steel \\\n", + "0 False True False False False \n", + "1 False True False False False \n", + "2 False True False False False \n", + "3 False True False False False \n", + "4 False False False False False \n", + "\n", + " Type2_Water \n", + "0 False \n", + "1 False \n", + "2 False \n", + "3 False \n", + "4 False \n", + "\n", + "[5 rows x 49 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pokemon_encoded.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Problem Solving Iteration 3\n", + "\n", + "Now we have encoded the pokemon types, we will identify the relationship between `Total` and the encoded fields. Our expectation is:\n", + "\n", + "#### There are relationships between `Total` and the encoded pokemon type variables and we need to identify the correlations.\n", + "\n", + "The information we need to collect is:\n", + "\n", + "#### How to identify the relationship between `Total` and the encoded pokemon type fields?\n", + "\n", + "There are multiple ways to answer this question. The easiest way is to use correlation. In the cell below, calculate the correlation of `Total` to each of the encoded fields. 
Rank the correlations and identify the #1 pokemon type that is most likely to have the highest `Total`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#NameTotalHPAttackDefenseSp. AtkSp. DefSpeedGeneration...Type2_GhostType2_GrassType2_GroundType2_IceType2_NormalType2_PoisonType2_PsychicType2_RockType2_SteelType2_Water
01Bulbasaur3184549496565451...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
12Ivysaur4056062638080601...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
23Venusaur525808283100100801...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
33VenusaurMega Venusaur62580100123122120801...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
44Charmander3093952436050651...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
..................................................................
795719Diancie60050100150100150506...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
796719DiancieMega Diancie700501601101601101106...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
797720HoopaHoopa Confined6008011060150130706...TrueFalseFalseFalseFalseFalseFalseFalseFalseFalse
798720HoopaHoopa Unbound6808016060170130806...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
799721Volcanion6008011012013090706...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
\n", + "

800 rows × 49 columns

\n", + "
" + ], + "text/plain": [ + " # Name Total HP Attack Defense Sp. Atk Sp. Def \\\n", + "0 1 Bulbasaur 318 45 49 49 65 65 \n", + "1 2 Ivysaur 405 60 62 63 80 80 \n", + "2 3 Venusaur 525 80 82 83 100 100 \n", + "3 3 VenusaurMega Venusaur 625 80 100 123 122 120 \n", + "4 4 Charmander 309 39 52 43 60 50 \n", + ".. ... ... ... .. ... ... ... ... \n", + "795 719 Diancie 600 50 100 150 100 150 \n", + "796 719 DiancieMega Diancie 700 50 160 110 160 110 \n", + "797 720 HoopaHoopa Confined 600 80 110 60 150 130 \n", + "798 720 HoopaHoopa Unbound 680 80 160 60 170 130 \n", + "799 721 Volcanion 600 80 110 120 130 90 \n", + "\n", + " Speed Generation ... Type2_Ghost Type2_Grass Type2_Ground \\\n", + "0 45 1 ... False False False \n", + "1 60 1 ... False False False \n", + "2 80 1 ... False False False \n", + "3 80 1 ... False False False \n", + "4 65 1 ... False False False \n", + ".. ... ... ... ... ... ... \n", + "795 50 6 ... False False False \n", + "796 110 6 ... False False False \n", + "797 70 6 ... True False False \n", + "798 80 6 ... False False False \n", + "799 70 6 ... False False False \n", + "\n", + " Type2_Ice Type2_Normal Type2_Poison Type2_Psychic Type2_Rock \\\n", + "0 False False True False False \n", + "1 False False True False False \n", + "2 False False True False False \n", + "3 False False True False False \n", + "4 False False False False False \n", + ".. ... ... ... ... ... \n", + "795 False False False False False \n", + "796 False False False False False \n", + "797 False False False False False \n", + "798 False False False False False \n", + "799 False False False False False \n", + "\n", + " Type2_Steel Type2_Water \n", + "0 False False \n", + "1 False False \n", + "2 False False \n", + "3 False False \n", + "4 False False \n", + ".. ... ... 
\n", + "795 False False \n", + "796 False False \n", + "797 False False \n", + "798 False False \n", + "799 False True \n", + "\n", + "[800 rows x 49 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "type1_encoded = pd.get_dummies(pokemon['Type 1'], prefix='Type1')\n", + "type2_encoded = pd.get_dummies(pokemon['Type 2'], prefix='Type2')\n", + "pokemon_encoded = pd.concat([pokemon, type1_encoded, type2_encoded], axis=1)\n", + "pokemon_encoded = pokemon_encoded.drop(['Type 1', 'Type 2'], axis=1)\n", + "pokemon_encoded " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['#', 'Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed',\n", + " 'Generation', 'Calculated Total'],\n", + " dtype='object')" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numeric_columns = pokemon_encoded.select_dtypes(include='number').columns\n", + "numeric_columns " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# 0.119813\n", + "HP 0.618748\n", + "Attack 0.736211\n", + "Defense 0.612787\n", + "Sp. Atk 0.747250\n", + "Sp. Def 0.717609\n", + "Speed 0.575943\n", + "Generation 0.048384\n", + "Calculated Total 1.000000\n", + "Name: Total, dtype: float64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "correlation_matrix = pokemon_encoded[numeric_columns].corr()\n", + "total_correlations = correlation_matrix['Total'].drop('Total')\n", + "total_correlations" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Calculated Total 1.000000\n", + "Sp. Atk 0.747250\n", + "Attack 0.736211\n", + "Sp. 
Def 0.717609\n", + "HP 0.618748\n", + "Defense 0.612787\n", + "Speed 0.575943\n", + "# 0.119813\n", + "Generation 0.048384\n", + "Name: Total, dtype: float64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ranked_correlations = total_correlations.sort_values(ascending=False)\n", + "ranked_correlations" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "top_type = ranked_correlations.idxmax()\n", + "top_correlation_value = ranked_correlations.max()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/your-code/.ipynb_checkpoints/challenge-3-checkpoint.ipynb b/your-code/.ipynb_checkpoints/challenge-3-checkpoint.ipynb new file mode 100644 index 0000000..81c44f0 --- /dev/null +++ b/your-code/.ipynb_checkpoints/challenge-3-checkpoint.ipynb @@ -0,0 +1,767 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 3\n", + "\n", + "In this challenge we will work on the `Orders` data set. In your work you will apply the thinking process and workflow we showed you in Challenge 2.\n", + "\n", + "You are serving as a Business Intelligence Analyst at the headquarter of an international fashion goods chain store. 
Your boss today asked you to do two things for her:\n", + "\n", + "**First, identify two groups of customers from the data set.** The first group is **VIP Customers** whose **aggregated expenses** at your global chain stores are **above the 95th percentile** (a.k.a. 0.95 quantile). The second group is **Preferred Customers** whose **aggregated expenses** are **between the 75th and 95th percentile**.\n", + "\n", + "**Second, identify which country has the most of your VIP customers, and which country has the most of your VIP+Preferred Customers combined.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Q1: How to identify VIP & Preferred Customers?\n", + "\n", + "We start by importing all the required libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# import required libraries\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, extract and import `Orders` dataset into a dataframe variable called `orders`. Print the head of `orders` to overview the data:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Orders.zip sits next to this notebook; a relative path keeps the notebook runnable on any machine\n", + "orders = pd.read_csv('Orders.zip')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "\"Identify VIP and Preferred Customers\" is the non-technical goal of your boss. 
You need to translate that goal into the technical language that data analysts use:\n", + "\n", + "## How to label customers whose aggregated `amount_spent` is in a given quantile range?\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We break down the main problem into several sub-problems:\n", + "\n", + "#### Sub Problem 1: How to aggregate the `amount_spent` for unique customers?\n", + "\n", + "#### Sub Problem 2: How to select customers whose aggregated `amount_spent` is in a given quantile range?\n", + "\n", + "#### Sub Problem 3: How to label selected customers as \"VIP\" or \"Preferred\"?\n", + "\n", + "*Note: If you want to break down the main problem in a different way, please feel free to revise the sub-problems above.*\n", + "\n", + "Now in the workspace below, tackle each of the sub-problems using the iterative problem-solving workflow. Insert cells as necessary to write your code and explain your steps." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idtotal_spent
01234677183.60
1123474310.00
2123481797.24
3123491757.55
412350334.40
\n", + "
" + ], + "text/plain": [ + " customer_id total_spent\n", + "0 12346 77183.60\n", + "1 12347 4310.00\n", + "2 12348 1797.24\n", + "3 12349 1757.55\n", + "4 12350 334.40" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "spending = orders.groupby('CustomerID')['amount_spent'].sum().reset_index()\n", + "spending.columns = ['customer_id', 'total_spent']\n", + "spending.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1661.64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "percentile75 = spending['total_spent'].quantile(0.75)\n", + "percentile75" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5840.181999999982" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "percentile95 = spending['total_spent'].quantile(0.95)\n", + "percentile95" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idtotal_spentcustomer_type
01234677183.60VIP
1123474310.00Preferred
2123481797.24Preferred
3123491757.55Preferred
412350334.40Regular
\n", + "
" + ], + "text/plain": [ + " customer_id total_spent customer_type\n", + "0 12346 77183.60 VIP\n", + "1 12347 4310.00 Preferred\n", + "2 12348 1797.24 Preferred\n", + "3 12349 1757.55 Preferred\n", + "4 12350 334.40 Regular" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spending['customer_type'] = 'Regular' \n", + "spending.loc[spending['total_spent'] > percentile95, 'customer_type'] = 'VIP'\n", + "spending.loc[(spending['total_spent'] > percentile75) & \n", + " (spending['total_spent'] <= percentile95), 'customer_type'] = 'Preferred'\n", + "spending.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we'll leave it to you to solve Q2 & Q3, which you can leverage from your solution for Q1:\n", + "\n", + "## Q2: How to identify which country has the most VIP Customers?" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 12346\n", + "10 12357\n", + "12 12359\n", + "50 12409\n", + "55 12415\n", + " ... \n", + "4207 18109\n", + "4229 18139\n", + "4253 18172\n", + "4292 18223\n", + "4298 18229\n", + "Name: customer_id, Length: 217, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "vip_customer = spending[spending['customer_type'] == 'VIP']['customer_id']\n", + "vip_customer" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0InvoiceNoStockCodeyearmonthdayhourDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountryamount_spent
26265363702272820101238alarm clock bakelike pink242010-12-01 08:45:003.7512583France90.0
27275363702272720101238alarm clock bakelike red242010-12-01 08:45:003.7512583France90.0
28285363702272620101238alarm clock bakelike green122010-12-01 08:45:003.7512583France45.0
29295363702172420101238panda and bunnies sticker sheet122010-12-01 08:45:000.8512583France10.2
30305363702188320101238stars gift tape242010-12-01 08:45:000.6512583France15.6
.............................................
397883541868581584850382011125126 chocolate love heart t-lights482011-12-09 12:25:001.8513777United Kingdom88.8
39790554189058158622061201112512large cake stand hanging strawbery82011-12-09 12:49:002.9513113United Kingdom23.6
39790654189158158623275201112512set of 3 hanging owls ollie beak242011-12-09 12:49:001.2513113United Kingdom30.0
39790754189258158621217201112512red retrospot round cake tins242011-12-09 12:49:008.9513113United Kingdom214.8
39790854189358158620685201112512doormat red retrospot102011-12-09 12:49:007.0813113United Kingdom70.8
\n", + "

104484 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 InvoiceNo StockCode year month day hour \\\n", + "26 26 536370 22728 2010 12 3 8 \n", + "27 27 536370 22727 2010 12 3 8 \n", + "28 28 536370 22726 2010 12 3 8 \n", + "29 29 536370 21724 2010 12 3 8 \n", + "30 30 536370 21883 2010 12 3 8 \n", + "... ... ... ... ... ... ... ... \n", + "397883 541868 581584 85038 2011 12 5 12 \n", + "397905 541890 581586 22061 2011 12 5 12 \n", + "397906 541891 581586 23275 2011 12 5 12 \n", + "397907 541892 581586 21217 2011 12 5 12 \n", + "397908 541893 581586 20685 2011 12 5 12 \n", + "\n", + " Description Quantity InvoiceDate \\\n", + "26 alarm clock bakelike pink 24 2010-12-01 08:45:00 \n", + "27 alarm clock bakelike red 24 2010-12-01 08:45:00 \n", + "28 alarm clock bakelike green 12 2010-12-01 08:45:00 \n", + "29 panda and bunnies sticker sheet 12 2010-12-01 08:45:00 \n", + "30 stars gift tape 24 2010-12-01 08:45:00 \n", + "... ... ... ... \n", + "397883 6 chocolate love heart t-lights 48 2011-12-09 12:25:00 \n", + "397905 large cake stand hanging strawbery 8 2011-12-09 12:49:00 \n", + "397906 set of 3 hanging owls ollie beak 24 2011-12-09 12:49:00 \n", + "397907 red retrospot round cake tins 24 2011-12-09 12:49:00 \n", + "397908 doormat red retrospot 10 2011-12-09 12:49:00 \n", + "\n", + " UnitPrice CustomerID Country amount_spent \n", + "26 3.75 12583 France 90.0 \n", + "27 3.75 12583 France 90.0 \n", + "28 3.75 12583 France 45.0 \n", + "29 0.85 12583 France 10.2 \n", + "30 0.65 12583 France 15.6 \n", + "... ... ... ... ... 
\n", + "397883 1.85 13777 United Kingdom 88.8 \n", + "397905 2.95 13113 United Kingdom 23.6 \n", + "397906 1.25 13113 United Kingdom 30.0 \n", + "397907 8.95 13113 United Kingdom 214.8 \n", + "397908 7.08 13113 United Kingdom 70.8 \n", + "\n", + "[104484 rows x 14 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vip_orders = orders[orders['CustomerID'].isin(vip_customer)]\n", + "vip_orders" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Country United Kingdom\n", + "VIP_Customer_Count 84185\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vip_country_count = vip_orders['Country'].value_counts().reset_index()\n", + "vip_country_count.columns = ['Country', 'VIP_Customer_Count']\n", + "most_vip_country = vip_country_count.loc[vip_country_count['VIP_Customer_Count'].idxmax()]\n", + "most_vip_country" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Q3: How to identify which country has the most VIP+Preferred Customers combined?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# your code here\n", + "preferred_customer = spending[spending['customer_type'] == 'Preferred']['customer_id']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "combined_customer = pd.concat([preferred_customer, vip_customer])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "combined_orders_customer = orders[orders['CustomerID'].isin(combined_customer)]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "combined_country_count = combined_orders_customer['Country'].value_counts().reset_index()\n", + "combined_country_count.columns = ['Country', 'VIP_Preferred_Customer_Count']" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Country United Kingdom\n", + "VIP_Preferred_Customer_Count 221635\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "most_country_combined = combined_country_count.loc[combined_country_count['VIP_Preferred_Customer_Count'].idxmax()]\n", + "most_country_combined" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/your-code/challenge-1.ipynb b/your-code/challenge-1.ipynb index 
cd674cb..c70fb32 100644 --- a/your-code/challenge-1.ipynb +++ b/your-code/challenge-1.ipynb @@ -1,276 +1,1208 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Challenge 1\n", - "\n", - "In this challenge you will be working on **Pokemon**. You will answer a series of questions in order to practice dataframe calculation, aggregation, and transformation.\n", - "\n", - "![Pokemon](../images/pokemon.jpg)\n", - "\n", - "Follow the instructions below and enter your code.\n", - "\n", - "#### Import all required libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import libraries" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Import data set.\n", - "\n", - "Read the dataset `pokemon.csv` into a dataframe called `pokemon`.\n", - "\n", - "*Data set attributed to [Alberto Barradas](https://www.kaggle.com/abcsds/pokemon/)*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Print first 10 rows of `pokemon`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When you look at a data set, you often wonder what each column means. Some open-source data sets provide descriptions of the data set. In many cases, data descriptions are extremely useful for data analysts to perform work efficiently and successfully.\n", - "\n", - "For the `Pokemon.csv` data set, fortunately, the owner provided descriptions which you can see [here](https://www.kaggle.com/abcsds/pokemon/home). For your convenience, we are including the descriptions below. Read the descriptions and understand what each column means. 
This knowledge is helpful in your work with the data.\n", - "\n", - "| Column | Description |\n", - "| --- | --- |\n", - "| # | ID for each pokemon |\n", - "| Name | Name of each pokemon |\n", - "| Type 1 | Each pokemon has a type, this determines weakness/resistance to attacks |\n", - "| Type 2 | Some pokemon are dual type and have 2 |\n", - "| Total | A general guide to how strong a pokemon is |\n", - "| HP | Hit points, or health, defines how much damage a pokemon can withstand before fainting |\n", - "| Attack | The base modifier for normal attacks (eg. Scratch, Punch) |\n", - "| Defense | The base damage resistance against normal attacks |\n", - "| SP Atk | Special attack, the base modifier for special attacks (e.g. fire blast, bubble beam) |\n", - "| SP Def | The base damage resistance against special attacks |\n", - "| Speed | Determines which pokemon attacks first each round |\n", - "| Generation | Number of generation |\n", - "| Legendary | True if Legendary Pokemon False if not |" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Obtain the distinct values across `Type 1` and `Type 2`.\n", - "\n", - "Exctract all the values in `Type 1` and `Type 2`. Then create an array containing the distinct values across both fields." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Cleanup `Name` that contain \"Mega\".\n", - "\n", - "If you have checked out the pokemon names carefully enough, you should have found there are junk texts in the pokemon names which contain \"Mega\". We want to clean up the pokemon names. For instance, \"VenusaurMega Venusaur\" should be \"Mega Venusaur\", and \"CharizardMega Charizard X\" should be \"Mega Charizard X\"." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here\n", - "\n", - "\n", - "# test transformed data\n", - "pokemon.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Create a new column called `A/D Ratio` whose value equals to `Attack` devided by `Defense`.\n", - "\n", - "For instance, if a pokemon has the Attack score 49 and Defense score 49, the corresponding `A/D Ratio` is 49/49=1." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Identify the pokemon with the highest `A/D Ratio`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Identify the pokemon with the lowest A/D Ratio." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Create a new column called `Combo Type` whose value combines `Type 1` and `Type 2`.\n", - "\n", - "Rules:\n", - "\n", - "* If both `Type 1` and `Type 2` have valid values, the `Combo Type` value should contain both values in the form of ` `. For example, if `Type 1` value is `Grass` and `Type 2` value is `Poison`, `Combo Type` will be `Grass-Poison`.\n", - "\n", - "* If `Type 1` has valid value but `Type 2` is not, `Combo Type` will be the same as `Type 1`. For example, if `Type 1` is `Fire` whereas `Type 2` is `NaN`, `Combo Type` will be `Fire`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Identify the pokemon whose `A/D Ratio` are among the top 5." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### For the 5 pokemon printed above, aggregate `Combo Type` and use a list to store the unique values.\n", - "\n", - "Your end product is a list containing the distinct `Combo Type` values of the 5 pokemon with the highest `A/D Ratio`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### For each of the `Combo Type` values obtained from the previous question, calculate the mean scores of all numeric fields across all pokemon.\n", - "\n", - "Your output should look like below:\n", - "\n", - "![Aggregate](../images/aggregated-mean.png)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 1\n", + "\n", + "In this challenge you will be working on **Pokemon**. 
You will answer a series of questions in order to practice dataframe calculation, aggregation, and transformation.\n", + "\n", + "![Pokemon](../images/pokemon.jpg)\n", + "\n", + "Follow the instructions below and enter your code.\n", + "\n", + "#### Import all required libraries." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# import libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Import data set.\n", + "\n", + "Read the dataset `pokemon.csv` into a dataframe called `pokemon`.\n", + "\n", + "*Data set attributed to [Alberto Barradas](https://www.kaggle.com/abcsds/pokemon/)*" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# import dataset\n", + "pokemon = pd.read_csv(r'C:\\Users\\gaelm\\Desktop\\lab\\lab-dataframe-calculations\\your-code\\Pokemon.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Print first 10 rows of `pokemon`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
01BulbasaurGrassPoison3184549496565451False
12IvysaurGrassPoison4056062638080601False
23VenusaurGrassPoison525808283100100801False
33VenusaurMega VenusaurGrassPoison62580100123122120801False
44CharmanderFireNaN3093952436050651False
55CharmeleonFireNaN4055864588065801False
66CharizardFireFlying534788478109851001False
76CharizardMega Charizard XFireDragon63478130111130851001False
86CharizardMega Charizard YFireFlying63478104781591151001False
97SquirtleWaterNaN3144448655064431False
\n", + "
" + ], + "text/plain": [ + " # Name Type 1 Type 2 Total HP Attack Defense \\\n", + "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", + "1 2 Ivysaur Grass Poison 405 60 62 63 \n", + "2 3 Venusaur Grass Poison 525 80 82 83 \n", + "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", + "4 4 Charmander Fire NaN 309 39 52 43 \n", + "5 5 Charmeleon Fire NaN 405 58 64 58 \n", + "6 6 Charizard Fire Flying 534 78 84 78 \n", + "7 6 CharizardMega Charizard X Fire Dragon 634 78 130 111 \n", + "8 6 CharizardMega Charizard Y Fire Flying 634 78 104 78 \n", + "9 7 Squirtle Water NaN 314 44 48 65 \n", + "\n", + " Sp. Atk Sp. Def Speed Generation Legendary \n", + "0 65 65 45 1 False \n", + "1 80 80 60 1 False \n", + "2 100 100 80 1 False \n", + "3 122 120 80 1 False \n", + "4 60 50 65 1 False \n", + "5 80 65 80 1 False \n", + "6 109 85 100 1 False \n", + "7 130 85 100 1 False \n", + "8 159 115 100 1 False \n", + "9 50 64 43 1 False " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "pokemon.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When you look at a data set, you often wonder what each column means. Some open-source data sets provide descriptions of the data set. In many cases, data descriptions are extremely useful for data analysts to perform work efficiently and successfully.\n", + "\n", + "For the `Pokemon.csv` data set, fortunately, the owner provided descriptions which you can see [here](https://www.kaggle.com/abcsds/pokemon/home). For your convenience, we are including the descriptions below. Read the descriptions and understand what each column means. 
This knowledge is helpful in your work with the data.\n", + "\n", + "| Column | Description |\n", + "| --- | --- |\n", + "| # | ID for each pokemon |\n", + "| Name | Name of each pokemon |\n", + "| Type 1 | Each pokemon has a type, this determines weakness/resistance to attacks |\n", + "| Type 2 | Some pokemon are dual type and have 2 |\n", + "| Total | A general guide to how strong a pokemon is |\n", + "| HP | Hit points, or health, defines how much damage a pokemon can withstand before fainting |\n", + "| Attack | The base modifier for normal attacks (e.g. Scratch, Punch) |\n", + "| Defense | The base damage resistance against normal attacks |\n", + "| Sp. Atk | Special attack, the base modifier for special attacks (e.g. fire blast, bubble beam) |\n", + "| Sp. Def | The base damage resistance against special attacks |\n", + "| Speed | Determines which pokemon attacks first each round |\n", + "| Generation | Number of generation |\n", + "| Legendary | True if Legendary Pokemon False if not |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Obtain the distinct values across `Type 1` and `Type 2`.\n", + "\n", + "Extract all the values in `Type 1` and `Type 2`. Then create an array containing the distinct values across both fields." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Grass', 'Fire', 'Water', 'Bug', 'Normal', 'Poison', 'Electric',\n", + " 'Ground', 'Fairy', 'Fighting', 'Psychic', 'Rock', 'Ghost', 'Ice',\n", + " 'Dragon', 'Dark', 'Steel', 'Flying'], dtype=object)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "type_1_values = pokemon['Type 1'].unique()\n", + "type_1_values" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Poison', nan, 'Flying', 'Dragon', 'Ground', 'Fairy', 'Grass',\n", + " 'Fighting', 'Psychic', 'Steel', 'Ice', 'Rock', 'Dark', 'Water',\n", + " 'Electric', 'Fire', 'Ghost', 'Bug', 'Normal'], dtype=object)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type_2_values = pokemon['Type 2'].unique()\n", + "type_2_values" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Bug', 'Dark', 'Dragon', 'Electric', 'Fairy', 'Fighting', 'Fire',\n", + " 'Flying', 'Ghost', 'Grass', 'Ground', 'Ice', 'Normal', 'Poison',\n", + " 'Psychic', 'Rock', 'Steel', 'Water'], dtype=object)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "distinct_types = np.unique(np.concatenate((type_1_values[~pd.isna(type_1_values)], type_2_values[~pd.isna(type_2_values)])))\n", + "distinct_types " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Cleanup `Name` that contain \"Mega\".\n", + "\n", + "If you have checked out the pokemon names carefully enough, you should have found there are junk texts in the pokemon names which contain \"Mega\". We want to clean up the pokemon names. 
For instance, \"VenusaurMega Venusaur\" should be \"Mega Venusaur\", and \"CharizardMega Charizard X\" should be \"Mega Charizard X\"." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Name
0Bulbasaur
1Ivysaur
2Venusaur
3VenusaurMega Venusaur
4Charmander
5Charmeleon
6Charizard
7CharizardMega Charizard X
8CharizardMega Charizard Y
9Squirtle
\n", + "
" + ], + "text/plain": [ + " Name\n", + "0 Bulbasaur\n", + "1 Ivysaur\n", + "2 Venusaur\n", + "3 VenusaurMega Venusaur\n", + "4 Charmander\n", + "5 Charmeleon\n", + "6 Charizard\n", + "7 CharizardMega Charizard X\n", + "8 CharizardMega Charizard Y\n", + "9 Squirtle" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "\n", + "pokemon['Name'] = pokemon['Name'].str.replace(r'Mega\\s*', 'Mega ', regex=True).str.replace(r'Mega\\s*(\\w+)', r'Mega \\1', regex=True)\n", + "\n", + "# test transformed data\n", + "pokemon[['Name']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Create a new column called `A/D Ratio` whose value equals to `Attack` devided by `Defense`.\n", + "\n", + "For instance, if a pokemon has the Attack score 49 and Defense score 49, the corresponding `A/D Ratio` is 49/49=1." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAttackDefenseA/D Ratio
0Bulbasaur49491.000000
1Ivysaur62630.984127
2Venusaur82830.987952
3VenusaurMega Venusaur1001230.813008
4Charmander52431.209302
5Charmeleon64581.103448
6Charizard84781.076923
7CharizardMega Charizard X1301111.171171
8CharizardMega Charizard Y104781.333333
9Squirtle48650.738462
\n", + "
" + ], + "text/plain": [ + " Name Attack Defense A/D Ratio\n", + "0 Bulbasaur 49 49 1.000000\n", + "1 Ivysaur 62 63 0.984127\n", + "2 Venusaur 82 83 0.987952\n", + "3 VenusaurMega Venusaur 100 123 0.813008\n", + "4 Charmander 52 43 1.209302\n", + "5 Charmeleon 64 58 1.103448\n", + "6 Charizard 84 78 1.076923\n", + "7 CharizardMega Charizard X 130 111 1.171171\n", + "8 CharizardMega Charizard Y 104 78 1.333333\n", + "9 Squirtle 48 65 0.738462" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "pokemon['A/D Ratio'] = pokemon['Attack'] / pokemon['Defense']\n", + "pokemon[['Name', 'Attack', 'Defense', 'A/D Ratio']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Identify the pokemon with the highest `A/D Ratio`." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# 386\n", + "Name DeoxysAttack Forme\n", + "Type 1 Psychic\n", + "Type 2 NaN\n", + "Total 600\n", + "HP 50\n", + "Attack 180\n", + "Defense 20\n", + "Sp. Atk 180\n", + "Sp. Def 20\n", + "Speed 150\n", + "Generation 3\n", + "Legendary True\n", + "A/D Ratio 9.0\n", + "Name: 429, dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "highestAD_ratio_index = pokemon['A/D Ratio'].idxmax()\n", + "highestAD_ratio_pokemon = pokemon.loc[highestAD_ratio_index]\n", + "highestAD_ratio_pokemon" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Identify the pokemon with the lowest A/D Ratio." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# 213\n", + "Name Shuckle\n", + "Type 1 Bug\n", + "Type 2 Rock\n", + "Total 505\n", + "HP 20\n", + "Attack 10\n", + "Defense 230\n", + "Sp. Atk 10\n", + "Sp. 
Def 230\n", + "Speed 5\n", + "Generation 2\n", + "Legendary False\n", + "A/D Ratio 0.043478\n", + "Name: 230, dtype: object" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "lowestAD_ratio_index = pokemon['A/D Ratio'].idxmin()\n", + "lowestAD_ratio_pokemon = pokemon.loc[lowestAD_ratio_index]\n", + "lowestAD_ratio_pokemon" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Create a new column called `Combo Type` whose value combines `Type 1` and `Type 2`.\n", + "\n", + "Rules:\n", + "\n", + "* If both `Type 1` and `Type 2` have valid values, the `Combo Type` value should contain both values in the form of `<Type 1>-<Type 2>`. For example, if `Type 1` value is `Grass` and `Type 2` value is `Poison`, `Combo Type` will be `Grass-Poison`.\n", + "\n", + "* If `Type 1` has a valid value but `Type 2` does not, `Combo Type` will be the same as `Type 1`. For example, if `Type 1` is `Fire` whereas `Type 2` is `NaN`, `Combo Type` will be `Fire`." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameType 1Type 2Combo Type
0BulbasaurGrassPoisonGrass-Poison
1IvysaurGrassPoisonGrass-Poison
2VenusaurGrassPoisonGrass-Poison
3VenusaurMega VenusaurGrassPoisonGrass-Poison
4CharmanderFireNaNFire
5CharmeleonFireNaNFire
6CharizardFireFlyingFire-Flying
7CharizardMega Charizard XFireDragonFire-Dragon
8CharizardMega Charizard YFireFlyingFire-Flying
9SquirtleWaterNaNWater
\n", + "
" + ], + "text/plain": [ + " Name Type 1 Type 2 Combo Type\n", + "0 Bulbasaur Grass Poison Grass-Poison\n", + "1 Ivysaur Grass Poison Grass-Poison\n", + "2 Venusaur Grass Poison Grass-Poison\n", + "3 VenusaurMega Venusaur Grass Poison Grass-Poison\n", + "4 Charmander Fire NaN Fire\n", + "5 Charmeleon Fire NaN Fire\n", + "6 Charizard Fire Flying Fire-Flying\n", + "7 CharizardMega Charizard X Fire Dragon Fire-Dragon\n", + "8 CharizardMega Charizard Y Fire Flying Fire-Flying\n", + "9 Squirtle Water NaN Water" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "pokemon['Combo Type'] = np.where(\n", + " pokemon['Type 2'].isna(), \n", + " pokemon['Type 1'], \n", + " pokemon['Type 1'] + '-' + pokemon['Type 2']\n", + ")\n", + "pokemon[['Name', 'Type 1', 'Type 2', 'Combo Type']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Identify the pokemon whose `A/D Ratio` are among the top 5." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameAttackDefenseA/D Ratio
429DeoxysAttack Forme180209.000
347Carvanha90204.500
19BeedrillMega Beedrill150403.750
453Cranidos125403.125
348Sharpedo120403.000
\n", + "
" + ], + "text/plain": [ + " Name Attack Defense A/D Ratio\n", + "429 DeoxysAttack Forme 180 20 9.000\n", + "347 Carvanha 90 20 4.500\n", + "19 BeedrillMega Beedrill 150 40 3.750\n", + "453 Cranidos 125 40 3.125\n", + "348 Sharpedo 120 40 3.000" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "top_5AD_ratio = pokemon.nlargest(5, 'A/D Ratio')\n", + "\n", + "top_5AD_ratio[['Name', 'Attack', 'Defense', 'A/D Ratio']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### For the 5 pokemon printed above, aggregate `Combo Type` and use a list to store the unique values.\n", + "\n", + "Your end product is a list containing the distinct `Combo Type` values of the 5 pokemon with the highest `A/D Ratio`." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Psychic', 'Water-Dark', 'Bug-Poison', 'Rock']" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "top_5AD_ratio = pokemon.nlargest(5, 'A/D Ratio')\n", + "unique_combo_types = top_5AD_ratio['Combo Type'].unique().tolist()\n", + "unique_combo_types" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### For each of the `Combo Type` values obtained from the previous question, calculate the mean scores of all numeric fields across all pokemon.\n", + "\n", + "Your output should look like below:\n", + "\n", + "![Aggregate](../images/aggregated-mean.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#TotalHPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendaryA/D Ratio
Combo Type
Psychic381.973684464.55263272.55263264.94736867.23684298.55263282.39473778.8684213.3421050.2368421.164196
Water-Dark347.666667493.83333369.166667120.00000065.16666788.83333363.50000087.1666673.1666670.0000002.291949
Bug-Poison199.166667347.91666753.75000068.33333358.08333342.50000059.33333365.9166672.3333330.0000001.315989
Rock410.111111409.44444467.111111103.333333107.22222240.55555658.33333332.8888893.8888890.1111111.260091
\n", + "
" + ], + "text/plain": [ + " # Total HP Attack Defense \\\n", + "Combo Type \n", + "Psychic 381.973684 464.552632 72.552632 64.947368 67.236842 \n", + "Water-Dark 347.666667 493.833333 69.166667 120.000000 65.166667 \n", + "Bug-Poison 199.166667 347.916667 53.750000 68.333333 58.083333 \n", + "Rock 410.111111 409.444444 67.111111 103.333333 107.222222 \n", + "\n", + " Sp. Atk Sp. Def Speed Generation Legendary A/D Ratio \n", + "Combo Type \n", + "Psychic 98.552632 82.394737 78.868421 3.342105 0.236842 1.164196 \n", + "Water-Dark 88.833333 63.500000 87.166667 3.166667 0.000000 2.291949 \n", + "Bug-Poison 42.500000 59.333333 65.916667 2.333333 0.000000 1.315989 \n", + "Rock 40.555556 58.333333 32.888889 3.888889 0.111111 1.260091 " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "mean_scores_combo_type = pokemon.groupby('Combo Type').mean(numeric_only=True)\n", + "filtered_mean_scores = mean_scores_combo_type.loc[unique_combo_types]\n", + "filtered_mean_scores\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/your-code/challenge-2.ipynb b/your-code/challenge-2.ipynb index d347731..eb34eb5 100644 --- a/your-code/challenge-2.ipynb +++ b/your-code/challenge-2.ipynb @@ -1,195 +1,1702 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Challenge 2\n", - "\n", - "In this challenge we will continue working with the `Pokemon` dataset. 
We will attempt solving a slightly more complex problem in which we will practice the iterative data analysis process you leaned in [this video](https://www.youtube.com/watch?v=xOomNicqbkk).\n", - "\n", - "The problem statement is as follows:\n", - "\n", - "**You are at a Pokemon black market planning to buy a Pokemon for battle. All Pokemon are sold at the same price and you can only afford to buy one. You cannot choose which specific Pokemon to buy. However, you can specify the type of the Pokemon - one type that exists in either `Type 1` or `Type 2`. Which type should you choose in order to maximize your chance of receiving a good Pokemon?**\n", - "\n", - "To remind you about the 3 steps of iterative data analysis, they are:\n", - "\n", - "1. Setting Expectations\n", - "1. Collecting Information\n", - "1. Reacting to Data / Revising Expectations\n", - "\n", - "Following the iterative process, we'll guide you in completing the challenge." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## Problem Solving Iteration 1\n", - "\n", - "In this iteration we'll analyze the problem and identify the breakthrough. The original question statement is kind of vague because we don't know what a *good pokemon* really means as represented in the data. We'll start by understanding the dataset and see if we can find some insights." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Import libraries\n", - "import numpy as np\n", - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# Importing the dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the data it seems whether a pokemon is good depends on its abilities as represented in the fields of `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. Def`, `Speed`, and `Total`. 
We are not sure about `Generation` and `Legendary` because they are not necessarily the decisive factors of the pokemon abilities.\n", - "\n", - "But `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. Def`, `Speed`, and `Total` are a lot of fields! If we look at them all at once it's very complicated. This isn't Mission Impossible but it's ideal that we tackle this kind of problem after we learn Machine Learning (which you will do in Module 3). For now, is there a way to consolidate the fields we need to look into?\n", - "\n", - "Fortunately there seems to be a way. It appears the `Total` field is computed based on the other 6 fields. But we need to prove our theory. If we can approve there is a formula to compute `Total` based on the other 6 abilities, we only need to look into `Total`.\n", - "\n", - "We have the following expectation now:\n", - "\n", - "#### The `Total` field is computed based on `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. Def`, and `Speed`.\n", - "\n", - "We need to collect the following information:\n", - "\n", - "* **What is the formula to compute `Total`?**\n", - "* **Does the formula work for all pokemon?**\n", - "\n", - "In the cell below, make a hypothesis on how `Total` is computed and test your hypothesis." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Problem Solving Iteration 2\n", - "\n", - "Now that we have consolidated the abilities fields, we can update the problem statement. The new problem statement is:\n", - "\n", - "### Which pokemon type is most likely to have the highest `Total` value?\n", - "\n", - "In the updated problem statement, we assume there is a certain relationship between the `Total` and the pokemon type. But we have two *type* fields (`Type 1` and `Type 2`) that have string values. 
In data analysis, string fields have to be transformed to numerical format in order to be analyzed. \n", - "\n", - "In addition, keep in mind that `Type 1` always has a value but `Type 2` is sometimes empty (having the `NaN` value). Also, the pokemon type we choose may be either in `Type 1` or `Type 2`.\n", - "\n", - "Now our expectation is:\n", - "\n", - "#### `Type 1` and `Type 2` string variables need to be converted to numerical variables in order to identify the relationship between `Total` and the pokemon type.\n", - "\n", - "The information we need to collect is:\n", - "\n", - "#### How to convert two string variables to numerical?\n", - "\n", - "Let's address the first question first. You can use a method called **One Hot Encoding** which is frequently used in machine learning to encode categorical string variables to numerical. The idea is to gather all the possible string values in a categorical field and create a numerical field for each unique string value. Each of those numerical fields uses `1` and `0` to indicate whether the data record has the corresponding categorical value. A detailed explanation of One Hot Encoding can be found in [this article](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f). You will formally learn it in Module 3.\n", - "\n", - "For instance, if a pokemon has `Type 1` as `Poison` and `Type 2` as `Fire`, then its `Poison` and `Fire` fields are `1` whereas all other fields are `0`. If a pokemon has `Type 1` as `Water` and `Type 2` as `NaN`, then its `Water` field is `1` whereas all other fields are `0`.\n", - "\n", - "#### In the next cell, use One Hot Encoding to encode `Type 1` and `Type 2`. 
Use the pokemon type values as the names of the numerical fields you create.\n", - "\n", - "The new numerical variables you create should look like below:\n", - "\n", - "![One Hot Encoding](../images/one-hot-encoding.png)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Problem Solving Iteration 3\n", - "\n", - "Now we have encoded the pokemon types, we will identify the relationship between `Total` and the encoded fields. Our expectation is:\n", - "\n", - "#### There are relationships between `Total` and the encoded pokemon type variables and we need to identify the correlations.\n", - "\n", - "The information we need to collect is:\n", - "\n", - "#### How to identify the relationship between `Total` and the encoded pokemon type fields?\n", - "\n", - "There are multiple ways to answer this question. The easiest way is to use correlation. In the cell below, calculate the correlation of `Total` to each of the encoded fields. Rank the correlations and identify the #1 pokemon type that is most likely to have the highest `Total`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Bonus Question\n", - "\n", - "Say now you can choose both `Type 1` and `Type 2` of the pokemon. In order to receive the best pokemon, which types will you choose?" 
 - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 2\n", + "\n", + "In this challenge we will continue working with the `Pokemon` dataset. We will attempt solving a slightly more complex problem in which we will practice the iterative data analysis process you learned in [this video](https://www.youtube.com/watch?v=xOomNicqbkk).\n", + "\n", + "The problem statement is as follows:\n", + "\n", + "**You are at a Pokemon black market planning to buy a Pokemon for battle. All Pokemon are sold at the same price and you can only afford to buy one. You cannot choose which specific Pokemon to buy. However, you can specify the type of the Pokemon - one type that exists in either `Type 1` or `Type 2`. Which type should you choose in order to maximize your chance of receiving a good Pokemon?**\n", + "\n", + "To remind you about the 3 steps of iterative data analysis, they are:\n", + "\n", + "1. Setting Expectations\n", + "1. Collecting Information\n", + "1. Reacting to Data / Revising Expectations\n", + "\n", + "Following the iterative process, we'll guide you in completing the challenge." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Problem Solving Iteration 1\n", + "\n", + "In this iteration we'll analyze the problem and identify the breakthrough. 
The original question statement is kind of vague because we don't know what a *good pokemon* really means as represented in the data. We'll start by understanding the dataset and see if we can find some insights." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Import libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Importing the dataset\n", + "pokemon = pd.read_csv(r'C:\\Users\\gaelm\\Desktop\\lab\\lab-dataframe-calculations\\your-code\\Pokemon.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the data it seems whether a pokemon is good depends on its abilities as represented in the fields of `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. Def`, `Speed`, and `Total`. We are not sure about `Generation` and `Legendary` because they are not necessarily the decisive factors of the pokemon abilities.\n", + "\n", + "But `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. Def`, `Speed`, and `Total` are a lot of fields! If we look at them all at once it's very complicated. This isn't Mission Impossible but it's ideal that we tackle this kind of problem after we learn Machine Learning (which you will do in Module 3). For now, is there a way to consolidate the fields we need to look into?\n", + "\n", + "Fortunately there seems to be a way. It appears the `Total` field is computed based on the other 6 fields. But we need to prove our theory. If we can prove there is a formula to compute `Total` based on the other 6 abilities, we only need to look into `Total`.\n", + "\n", + "We have the following expectation now:\n", + "\n", + "#### The `Total` field is computed based on `HP`, `Attack`, `Defense`, `Sp. Atk`, `Sp. 
Def`, and `Speed`.\n", + "\n", + "We need to collect the following information:\n", + "\n", + "* **What is the formula to compute `Total`?**\n", + "* **Does the formula work for all pokemon?**\n", + "\n", + "In the cell below, make a hypothesis on how `Total` is computed and test your hypothesis." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# your code here\n", + "pokemon['Calculated Total'] = (\n", + " pokemon['HP'] +\n", + " pokemon['Attack'] +\n", + " pokemon['Defense'] +\n", + " pokemon['Sp. Atk'] +\n", + " pokemon['Sp. Def'] +\n", + " pokemon['Speed']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "pokemon['Total Matches'] = pokemon['Calculated Total'] == pokemon['Total']" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Total Matches\n", + "True 800\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "totals_matching= pokemon['Total Matches'].value_counts()\n", + "totals_matching" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Problem Solving Iteration 2\n", + "\n", + "Now that we have consolidated the abilities fields, we can update the problem statement. 
The new problem statement is:\n", + "\n", + "### Which pokemon type is most likely to have the highest `Total` value?\n", + "\n", + "In the updated problem statement, we assume there is a certain relationship between the `Total` and the pokemon type. But we have two *type* fields (`Type 1` and `Type 2`) that have string values. In data analysis, string fields have to be transformed to numerical format in order to be analyzed. \n", + "\n", + "In addition, keep in mind that `Type 1` always has a value but `Type 2` is sometimes empty (having the `NaN` value). Also, the pokemon type we choose may be either in `Type 1` or `Type 2`.\n", + "\n", + "Now our expectation is:\n", + "\n", + "#### `Type 1` and `Type 2` string variables need to be converted to numerical variables in order to identify the relationship between `Total` and the pokemon type.\n", + "\n", + "The information we need to collect is:\n", + "\n", + "#### How to convert two string variables to numerical?\n", + "\n", + "Let's address the first question first. You can use a method called **One Hot Encoding** which is frequently used in machine learning to encode categorical string variables to numerical. The idea is to gather all the possible string values in a categorical field and create a numerical field for each unique string value. Each of those numerical fields uses `1` and `0` to indicate whether the data record has the corresponding categorical value. A detailed explanation of One Hot Encoding can be found in [this article](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f). You will formally learn it in Module 3.\n", + "\n", + "For instance, if a pokemon has `Type 1` as `Poison` and `Type 2` as `Fire`, then its `Poison` and `Fire` fields are `1` whereas all other fields are `0`. 
If a pokemon has `Type 1` as `Water` and `Type 2` as `NaN`, then its `Water` field is `1` whereas all other fields are `0`.\n", + "\n", + "#### In the next cell, use One Hot Encoding to encode `Type 1` and `Type 2`. Use the pokemon type values as the names of the numerical fields you create.\n", + "\n", + "The new numerical variables you create should look like below:\n", + "\n", + "![One Hot Encoding](../images/one-hot-encoding.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#NameType 1Type 2TotalHPAttackDefenseSp. AtkSp. Def...Type2_GhostType2_GrassType2_GroundType2_IceType2_NormalType2_PoisonType2_PsychicType2_RockType2_SteelType2_Water
01BulbasaurGrassPoison3184549496565...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
12IvysaurGrassPoison4056062638080...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
23VenusaurGrassPoison525808283100100...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
33VenusaurMega VenusaurGrassPoison62580100123122120...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
44CharmanderFireNaN3093952436050...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
..................................................................
795719DiancieRockFairy60050100150100150...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
796719DiancieMega DiancieRockFairy70050160110160110...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
797720HoopaHoopa ConfinedPsychicGhost6008011060150130...TrueFalseFalseFalseFalseFalseFalseFalseFalseFalse
798720HoopaHoopa UnboundPsychicDark6808016060170130...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
799721VolcanionFireWater6008011012013090...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
\n", + "

800 rows × 51 columns

\n", + "
" + ], + "text/plain": [ + " # Name Type 1 Type 2 Total HP Attack Defense \\\n", + "0 1 Bulbasaur Grass Poison 318 45 49 49 \n", + "1 2 Ivysaur Grass Poison 405 60 62 63 \n", + "2 3 Venusaur Grass Poison 525 80 82 83 \n", + "3 3 VenusaurMega Venusaur Grass Poison 625 80 100 123 \n", + "4 4 Charmander Fire NaN 309 39 52 43 \n", + ".. ... ... ... ... ... .. ... ... \n", + "795 719 Diancie Rock Fairy 600 50 100 150 \n", + "796 719 DiancieMega Diancie Rock Fairy 700 50 160 110 \n", + "797 720 HoopaHoopa Confined Psychic Ghost 600 80 110 60 \n", + "798 720 HoopaHoopa Unbound Psychic Dark 680 80 160 60 \n", + "799 721 Volcanion Fire Water 600 80 110 120 \n", + "\n", + " Sp. Atk Sp. Def ... Type2_Ghost Type2_Grass Type2_Ground Type2_Ice \\\n", + "0 65 65 ... False False False False \n", + "1 80 80 ... False False False False \n", + "2 100 100 ... False False False False \n", + "3 122 120 ... False False False False \n", + "4 60 50 ... False False False False \n", + ".. ... ... ... ... ... ... ... \n", + "795 100 150 ... False False False False \n", + "796 160 110 ... False False False False \n", + "797 150 130 ... True False False False \n", + "798 170 130 ... False False False False \n", + "799 130 90 ... False False False False \n", + "\n", + " Type2_Normal Type2_Poison Type2_Psychic Type2_Rock Type2_Steel \\\n", + "0 False True False False False \n", + "1 False True False False False \n", + "2 False True False False False \n", + "3 False True False False False \n", + "4 False False False False False \n", + ".. ... ... ... ... ... \n", + "795 False False False False False \n", + "796 False False False False False \n", + "797 False False False False False \n", + "798 False False False False False \n", + "799 False False False False False \n", + "\n", + " Type2_Water \n", + "0 False \n", + "1 False \n", + "2 False \n", + "3 False \n", + "4 False \n", + ".. ... 
\n", + "795 False \n", + "796 False \n", + "797 False \n", + "798 False \n", + "799 True \n", + "\n", + "[800 rows x 51 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "type1_encoded = pd.get_dummies(pokemon['Type 1'], prefix='Type1')\n", + "type2_encoded = pd.get_dummies(pokemon['Type 2'], prefix='Type2')\n", + "pokemon_encoded = pd.concat([pokemon, type1_encoded, type2_encoded], axis=1)\n", + "pokemon_encoded" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#NameTotalHPAttackDefenseSp. AtkSp. DefSpeedGeneration...Type2_GhostType2_GrassType2_GroundType2_IceType2_NormalType2_PoisonType2_PsychicType2_RockType2_SteelType2_Water
01Bulbasaur3184549496565451...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
12Ivysaur4056062638080601...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
23Venusaur525808283100100801...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
33VenusaurMega Venusaur62580100123122120801...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
44Charmander3093952436050651...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
..................................................................
795719Diancie60050100150100150506...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
796719DiancieMega Diancie700501601101601101106...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
797720HoopaHoopa Confined6008011060150130706...TrueFalseFalseFalseFalseFalseFalseFalseFalseFalse
798720HoopaHoopa Unbound6808016060170130806...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
799721Volcanion6008011012013090706...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
\n", + "

800 rows × 49 columns

\n", + "
" + ], + "text/plain": [ + " # Name Total HP Attack Defense Sp. Atk Sp. Def \\\n", + "0 1 Bulbasaur 318 45 49 49 65 65 \n", + "1 2 Ivysaur 405 60 62 63 80 80 \n", + "2 3 Venusaur 525 80 82 83 100 100 \n", + "3 3 VenusaurMega Venusaur 625 80 100 123 122 120 \n", + "4 4 Charmander 309 39 52 43 60 50 \n", + ".. ... ... ... .. ... ... ... ... \n", + "795 719 Diancie 600 50 100 150 100 150 \n", + "796 719 DiancieMega Diancie 700 50 160 110 160 110 \n", + "797 720 HoopaHoopa Confined 600 80 110 60 150 130 \n", + "798 720 HoopaHoopa Unbound 680 80 160 60 170 130 \n", + "799 721 Volcanion 600 80 110 120 130 90 \n", + "\n", + " Speed Generation ... Type2_Ghost Type2_Grass Type2_Ground \\\n", + "0 45 1 ... False False False \n", + "1 60 1 ... False False False \n", + "2 80 1 ... False False False \n", + "3 80 1 ... False False False \n", + "4 65 1 ... False False False \n", + ".. ... ... ... ... ... ... \n", + "795 50 6 ... False False False \n", + "796 110 6 ... False False False \n", + "797 70 6 ... True False False \n", + "798 80 6 ... False False False \n", + "799 70 6 ... False False False \n", + "\n", + " Type2_Ice Type2_Normal Type2_Poison Type2_Psychic Type2_Rock \\\n", + "0 False False True False False \n", + "1 False False True False False \n", + "2 False False True False False \n", + "3 False False True False False \n", + "4 False False False False False \n", + ".. ... ... ... ... ... \n", + "795 False False False False False \n", + "796 False False False False False \n", + "797 False False False False False \n", + "798 False False False False False \n", + "799 False False False False False \n", + "\n", + " Type2_Steel Type2_Water \n", + "0 False False \n", + "1 False False \n", + "2 False False \n", + "3 False False \n", + "4 False False \n", + ".. ... ... 
\n", + "795 False False \n", + "796 False False \n", + "797 False False \n", + "798 False False \n", + "799 False True \n", + "\n", + "[800 rows x 49 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pokemon_encoded = pokemon_encoded.drop(['Type 1', 'Type 2'], axis=1)\n", + "pokemon_encoded" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#NameTotalHPAttackDefenseSp. AtkSp. DefSpeedGeneration...Type2_GhostType2_GrassType2_GroundType2_IceType2_NormalType2_PoisonType2_PsychicType2_RockType2_SteelType2_Water
01Bulbasaur3184549496565451...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
12Ivysaur4056062638080601...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
23Venusaur525808283100100801...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
33VenusaurMega Venusaur62580100123122120801...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
44Charmander3093952436050651...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", + "

5 rows × 49 columns

\n", + "
" + ], + "text/plain": [ + " # Name Total HP Attack Defense Sp. Atk Sp. Def \\\n", + "0 1 Bulbasaur 318 45 49 49 65 65 \n", + "1 2 Ivysaur 405 60 62 63 80 80 \n", + "2 3 Venusaur 525 80 82 83 100 100 \n", + "3 3 VenusaurMega Venusaur 625 80 100 123 122 120 \n", + "4 4 Charmander 309 39 52 43 60 50 \n", + "\n", + " Speed Generation ... Type2_Ghost Type2_Grass Type2_Ground Type2_Ice \\\n", + "0 45 1 ... False False False False \n", + "1 60 1 ... False False False False \n", + "2 80 1 ... False False False False \n", + "3 80 1 ... False False False False \n", + "4 65 1 ... False False False False \n", + "\n", + " Type2_Normal Type2_Poison Type2_Psychic Type2_Rock Type2_Steel \\\n", + "0 False True False False False \n", + "1 False True False False False \n", + "2 False True False False False \n", + "3 False True False False False \n", + "4 False False False False False \n", + "\n", + " Type2_Water \n", + "0 False \n", + "1 False \n", + "2 False \n", + "3 False \n", + "4 False \n", + "\n", + "[5 rows x 49 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pokemon_encoded.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Problem Solving Iteration 3\n", + "\n", + "Now we have encoded the pokemon types, we will identify the relationship between `Total` and the encoded fields. Our expectation is:\n", + "\n", + "#### There are relationships between `Total` and the encoded pokemon type variables and we need to identify the correlations.\n", + "\n", + "The information we need to collect is:\n", + "\n", + "#### How to identify the relationship between `Total` and the encoded pokemon type fields?\n", + "\n", + "There are multiple ways to answer this question. The easiest way is to use correlation. In the cell below, calculate the correlation of `Total` to each of the encoded fields. 
Rank the correlations and identify the #1 pokemon type that is most likely to have the highest `Total`." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
#NameTotalHPAttackDefenseSp. AtkSp. DefSpeedGeneration...Type2_GhostType2_GrassType2_GroundType2_IceType2_NormalType2_PoisonType2_PsychicType2_RockType2_SteelType2_Water
01Bulbasaur3184549496565451...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
12Ivysaur4056062638080601...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
23Venusaur525808283100100801...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
33VenusaurMega Venusaur62580100123122120801...FalseFalseFalseFalseFalseTrueFalseFalseFalseFalse
44Charmander3093952436050651...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
..................................................................
795719Diancie60050100150100150506...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
796719DiancieMega Diancie700501601101601101106...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
797720HoopaHoopa Confined6008011060150130706...TrueFalseFalseFalseFalseFalseFalseFalseFalseFalse
798720HoopaHoopa Unbound6808016060170130806...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
799721Volcanion6008011012013090706...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
\n", + "

800 rows × 49 columns

\n", + "
" + ], + "text/plain": [ + " # Name Total HP Attack Defense Sp. Atk Sp. Def \\\n", + "0 1 Bulbasaur 318 45 49 49 65 65 \n", + "1 2 Ivysaur 405 60 62 63 80 80 \n", + "2 3 Venusaur 525 80 82 83 100 100 \n", + "3 3 VenusaurMega Venusaur 625 80 100 123 122 120 \n", + "4 4 Charmander 309 39 52 43 60 50 \n", + ".. ... ... ... .. ... ... ... ... \n", + "795 719 Diancie 600 50 100 150 100 150 \n", + "796 719 DiancieMega Diancie 700 50 160 110 160 110 \n", + "797 720 HoopaHoopa Confined 600 80 110 60 150 130 \n", + "798 720 HoopaHoopa Unbound 680 80 160 60 170 130 \n", + "799 721 Volcanion 600 80 110 120 130 90 \n", + "\n", + " Speed Generation ... Type2_Ghost Type2_Grass Type2_Ground \\\n", + "0 45 1 ... False False False \n", + "1 60 1 ... False False False \n", + "2 80 1 ... False False False \n", + "3 80 1 ... False False False \n", + "4 65 1 ... False False False \n", + ".. ... ... ... ... ... ... \n", + "795 50 6 ... False False False \n", + "796 110 6 ... False False False \n", + "797 70 6 ... True False False \n", + "798 80 6 ... False False False \n", + "799 70 6 ... False False False \n", + "\n", + " Type2_Ice Type2_Normal Type2_Poison Type2_Psychic Type2_Rock \\\n", + "0 False False True False False \n", + "1 False False True False False \n", + "2 False False True False False \n", + "3 False False True False False \n", + "4 False False False False False \n", + ".. ... ... ... ... ... \n", + "795 False False False False False \n", + "796 False False False False False \n", + "797 False False False False False \n", + "798 False False False False False \n", + "799 False False False False False \n", + "\n", + " Type2_Steel Type2_Water \n", + "0 False False \n", + "1 False False \n", + "2 False False \n", + "3 False False \n", + "4 False False \n", + ".. ... ... 
\n", + "795 False False \n", + "796 False False \n", + "797 False False \n", + "798 False False \n", + "799 False True \n", + "\n", + "[800 rows x 49 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "type1_encoded = pd.get_dummies(pokemon['Type 1'], prefix='Type1')\n", + "type2_encoded = pd.get_dummies(pokemon['Type 2'], prefix='Type2')\n", + "pokemon_encoded = pd.concat([pokemon, type1_encoded, type2_encoded], axis=1)\n", + "pokemon_encoded = pokemon_encoded.drop(['Type 1', 'Type 2'], axis=1)\n", + "pokemon_encoded " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The get_dummies columns are boolean, so select_dtypes(include='number') skips them;\n", + "# pick the encoded type columns by their prefix instead.\n", + "type_columns = [col for col in pokemon_encoded.columns if col.startswith(('Type1_', 'Type2_'))]\n", + "type_columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Correlate Total with the encoded type fields only; cast the boolean dummies to int first.\n", + "correlation_matrix = pokemon_encoded[['Total'] + type_columns].astype(int).corr()\n", + "total_correlations = correlation_matrix['Total'].drop('Total')\n", + "total_correlations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ranked_correlations = total_correlations.sort_values(ascending=False)\n", + "ranked_correlations" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "top_type = ranked_correlations.idxmax()\n", + "top_correlation_value = ranked_correlations.max()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}
diff --git a/your-code/challenge-3.ipynb b/your-code/challenge-3.ipynb index a42a586..81c44f0 100644 --- a/your-code/challenge-3.ipynb +++ b/your-code/challenge-3.ipynb @@ -1,147 +1,767 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Challenge 3\n", - "\n", - "In this challenge we will work on the `Orders` data set. In your work you will apply the thinking process and workflow we showed you in Challenge 2.\n", - "\n", - "You are serving as a Business Intelligence Analyst at the headquarter of an international fashion goods chain store. Your boss today asked you to do two things for her:\n", - "\n", - "**First, identify two groups of customers from the data set.** The first group is **VIP Customers** whose **aggregated expenses** at your global chain stores are **above the 95th percentile** (aka. 0.95 quantile).
The second group is **Preferred Customers** whose **aggregated expenses** are **between the 75th and 95th percentile**.\n", - "\n", - "**Second, identify which country has the most of your VIP customers, and which country has the most of your VIP+Preferred Customers combined.**" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Q1: How to identify VIP & Preferred Customers?\n", - "\n", - "We start by importing all the required libraries:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import required libraries\n", - "import numpy as np\n", - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, extract and import `Orders` dataset into a dataframe variable called `orders`. Print the head of `orders` to overview the data:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "\n", - "\"Identify VIP and Preferred Customers\" is the non-technical goal of your boss. 
You need to translate that goal into technical languages that data analysts use:\n", - "\n", - "## How to label customers whose aggregated `amount_spent` is in a given quantile range?\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We break down the main problem into several sub problems:\n", - "\n", - "#### Sub Problem 1: How to aggregate the `amount_spent` for unique customers?\n", - "\n", - "#### Sub Problem 2: How to select customers whose aggregated `amount_spent` is in a given quantile range?\n", - "\n", - "#### Sub Problem 3: How to label selected customers as \"VIP\" or \"Preferred\"?\n", - "\n", - "*Note: If you want to break down the main problem in a different way, please feel free to revise the sub problems above.*\n", - "\n", - "Now in the workspace below, tackle each of the sub problems using the iterative problem solving workflow. Insert cells as necessary to write your codes and explain your steps." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we'll leave it to you to solve Q2 & Q3, which you can leverage from your solution for Q1:\n", - "\n", - "## Q2: How to identify which country has the most VIP Customers?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Q3: How to identify which country has the most VIP+Preferred Customers combined?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# your code here" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Challenge 3\n", + "\n", + "In this challenge we will work on the `Orders` data set. In your work you will apply the thinking process and workflow we showed you in Challenge 2.\n", + "\n", + "You are serving as a Business Intelligence Analyst at the headquarter of an international fashion goods chain store. Your boss today asked you to do two things for her:\n", + "\n", + "**First, identify two groups of customers from the data set.** The first group is **VIP Customers** whose **aggregated expenses** at your global chain stores are **above the 95th percentile** (aka. 0.95 quantile). 
The second group is **Preferred Customers** whose **aggregated expenses** are **between the 75th and 95th percentile**.\n", + "\n", + "**Second, identify which country has the most of your VIP customers, and which country has the most of your VIP+Preferred Customers combined.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Q1: How to identify VIP & Preferred Customers?\n", + "\n", + "We start by importing all the required libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# import required libraries\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, extract and import `Orders` dataset into a dataframe variable called `orders`. Print the head of `orders` to overview the data:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# your code here\n", + "# relative path: resolved against the kernel's working directory instead of one user's desktop\n", + "orders = pd.read_csv('Orders.zip')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "\"Identify VIP and Preferred Customers\" is the non-technical goal of your boss.
You need to translate that goal into technical languages that data analysts use:\n", + "\n", + "## How to label customers whose aggregated `amount_spent` is in a given quantile range?\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We break down the main problem into several sub problems:\n", + "\n", + "#### Sub Problem 1: How to aggregate the `amount_spent` for unique customers?\n", + "\n", + "#### Sub Problem 2: How to select customers whose aggregated `amount_spent` is in a given quantile range?\n", + "\n", + "#### Sub Problem 3: How to label selected customers as \"VIP\" or \"Preferred\"?\n", + "\n", + "*Note: If you want to break down the main problem in a different way, please feel free to revise the sub problems above.*\n", + "\n", + "Now in the workspace below, tackle each of the sub problems using the iterative problem solving workflow. Insert cells as necessary to write your codes and explain your steps." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idtotal_spent
01234677183.60
1123474310.00
2123481797.24
3123491757.55
412350334.40
\n", + "
" + ], + "text/plain": [ + " customer_id total_spent\n", + "0 12346 77183.60\n", + "1 12347 4310.00\n", + "2 12348 1797.24\n", + "3 12349 1757.55\n", + "4 12350 334.40" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "spending = orders.groupby('CustomerID')['amount_spent'].sum().reset_index()\n", + "spending.columns = ['customer_id', 'total_spent']\n", + "spending.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1661.64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "percentile75 = spending['total_spent'].quantile(0.75)\n", + "percentile75" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5840.181999999982" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "percentile95 = spending['total_spent'].quantile(0.95)\n", + "percentile95" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idtotal_spentcustomer_type
01234677183.60VIP
1123474310.00Preferred
2123481797.24Preferred
3123491757.55Preferred
412350334.40Regular
\n", + "
" + ], + "text/plain": [ + " customer_id total_spent customer_type\n", + "0 12346 77183.60 VIP\n", + "1 12347 4310.00 Preferred\n", + "2 12348 1797.24 Preferred\n", + "3 12349 1757.55 Preferred\n", + "4 12350 334.40 Regular" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spending['customer_type'] = 'Regular' \n", + "spending.loc[spending['total_spent'] > percentile95, 'customer_type'] = 'VIP'\n", + "spending.loc[(spending['total_spent'] > percentile75) & \n", + " (spending['total_spent'] <= percentile95), 'customer_type'] = 'Preferred'\n", + "spending.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we'll leave it to you to solve Q2 & Q3, which you can leverage from your solution for Q1:\n", + "\n", + "## Q2: How to identify which country has the most VIP Customers?" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 12346\n", + "10 12357\n", + "12 12359\n", + "50 12409\n", + "55 12415\n", + " ... \n", + "4207 18109\n", + "4229 18139\n", + "4253 18172\n", + "4292 18223\n", + "4298 18229\n", + "Name: customer_id, Length: 217, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# your code here\n", + "vip_customer = spending[spending['customer_type'] == 'VIP']['customer_id']\n", + "vip_customer" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0InvoiceNoStockCodeyearmonthdayhourDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountryamount_spent
26265363702272820101238alarm clock bakelike pink242010-12-01 08:45:003.7512583France90.0
27275363702272720101238alarm clock bakelike red242010-12-01 08:45:003.7512583France90.0
28285363702272620101238alarm clock bakelike green122010-12-01 08:45:003.7512583France45.0
29295363702172420101238panda and bunnies sticker sheet122010-12-01 08:45:000.8512583France10.2
30305363702188320101238stars gift tape242010-12-01 08:45:000.6512583France15.6
.............................................
397883541868581584850382011125126 chocolate love heart t-lights482011-12-09 12:25:001.8513777United Kingdom88.8
39790554189058158622061201112512large cake stand hanging strawbery82011-12-09 12:49:002.9513113United Kingdom23.6
39790654189158158623275201112512set of 3 hanging owls ollie beak242011-12-09 12:49:001.2513113United Kingdom30.0
39790754189258158621217201112512red retrospot round cake tins242011-12-09 12:49:008.9513113United Kingdom214.8
39790854189358158620685201112512doormat red retrospot102011-12-09 12:49:007.0813113United Kingdom70.8
\n", + "

104484 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 InvoiceNo StockCode year month day hour \\\n", + "26 26 536370 22728 2010 12 3 8 \n", + "27 27 536370 22727 2010 12 3 8 \n", + "28 28 536370 22726 2010 12 3 8 \n", + "29 29 536370 21724 2010 12 3 8 \n", + "30 30 536370 21883 2010 12 3 8 \n", + "... ... ... ... ... ... ... ... \n", + "397883 541868 581584 85038 2011 12 5 12 \n", + "397905 541890 581586 22061 2011 12 5 12 \n", + "397906 541891 581586 23275 2011 12 5 12 \n", + "397907 541892 581586 21217 2011 12 5 12 \n", + "397908 541893 581586 20685 2011 12 5 12 \n", + "\n", + " Description Quantity InvoiceDate \\\n", + "26 alarm clock bakelike pink 24 2010-12-01 08:45:00 \n", + "27 alarm clock bakelike red 24 2010-12-01 08:45:00 \n", + "28 alarm clock bakelike green 12 2010-12-01 08:45:00 \n", + "29 panda and bunnies sticker sheet 12 2010-12-01 08:45:00 \n", + "30 stars gift tape 24 2010-12-01 08:45:00 \n", + "... ... ... ... \n", + "397883 6 chocolate love heart t-lights 48 2011-12-09 12:25:00 \n", + "397905 large cake stand hanging strawbery 8 2011-12-09 12:49:00 \n", + "397906 set of 3 hanging owls ollie beak 24 2011-12-09 12:49:00 \n", + "397907 red retrospot round cake tins 24 2011-12-09 12:49:00 \n", + "397908 doormat red retrospot 10 2011-12-09 12:49:00 \n", + "\n", + " UnitPrice CustomerID Country amount_spent \n", + "26 3.75 12583 France 90.0 \n", + "27 3.75 12583 France 90.0 \n", + "28 3.75 12583 France 45.0 \n", + "29 0.85 12583 France 10.2 \n", + "30 0.65 12583 France 15.6 \n", + "... ... ... ... ... 
\n", + "397883 1.85 13777 United Kingdom 88.8 \n", + "397905 2.95 13113 United Kingdom 23.6 \n", + "397906 1.25 13113 United Kingdom 30.0 \n", + "397907 8.95 13113 United Kingdom 214.8 \n", + "397908 7.08 13113 United Kingdom 70.8 \n", + "\n", + "[104484 rows x 14 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vip_orders = orders[orders['CustomerID'].isin(vip_customer)]\n", + "vip_orders" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Count distinct customers per country; value_counts() on the order rows counts invoice lines instead.\n", + "vip_country_count = vip_orders.groupby('Country')['CustomerID'].nunique().sort_values(ascending=False).reset_index()\n", + "vip_country_count.columns = ['Country', 'VIP_Customer_Count']\n", + "most_vip_country = vip_country_count.loc[vip_country_count['VIP_Customer_Count'].idxmax()]\n", + "most_vip_country" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Q3: How to identify which country has the most VIP+Preferred Customers combined?"
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# your code here\n", + "preferred_customer = spending[spending['customer_type'] == 'Preferred']['customer_id']" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "combined_customer = pd.concat([preferred_customer, vip_customer])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "combined_orders_customer = orders[orders['CustomerID'].isin(combined_customer)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Count distinct customers per country; value_counts() on the order rows counts invoice lines instead.\n", + "combined_country_count = combined_orders_customer.groupby('Country')['CustomerID'].nunique().sort_values(ascending=False).reset_index()\n", + "combined_country_count.columns = ['Country', 'VIP_Preferred_Customer_Count']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "most_country_combined = combined_country_count.loc[combined_country_count['VIP_Preferred_Customer_Count'].idxmax()]\n", + "most_country_combined" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}