From 2427b64cbb5c3f4e8a5d4ce6e09a75ce0f99990a Mon Sep 17 00:00:00 2001 From: GildaRIA <114354041+GildaRIA@users.noreply.github.com> Date: Tue, 28 Mar 2023 17:41:04 -0600 Subject: [PATCH] Add files via upload --- proyecto5_Gilda.ipynb | 1673 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1673 insertions(+) create mode 100644 proyecto5_Gilda.ipynb diff --git a/proyecto5_Gilda.ipynb b/proyecto5_Gilda.ipynb new file mode 100644 index 0000000..3ceb79f --- /dev/null +++ b/proyecto5_Gilda.ipynb @@ -0,0 +1,1673 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "da56bcca-c2bf-4826-a785-5ceb0d38fe80", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f6b72270-36e1-47b1-88ad-b794f2f6d941", + "metadata": {}, + "outputs": [], + "source": [ + "df_train = pd.read_csv('z_train.csv')\n", + "df_test = pd.read_csv('z_test.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7b63d2d6-3408-4f91-8941-347ca9c0c136", + "metadata": {}, + "outputs": [], + "source": [ + "df_train_copy = df_train.copy()\n", + "df_test_copy = df_test.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8520d3c1-c3ec-41e5-953a-debd4408ce89", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(3824, 14)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train_copy[df_train_copy.target==1].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4cc727e6-d72e-48fc-b585-7a28f2d52d96", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(11502, 14)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train_copy[df_train_copy.target==0].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 
# Columns removed from both frames before modelling.
COLUMNAS_DESCARTADAS = ['enrollee_id', 'city', 'company_size']


def cambiar_mayor(num):
    """Normalise one raw ``experience`` value to a numeric-looking string.

    The value is stringified and exactly one rule is applied:

    * contains ``'>'``   -> the ``'>'`` is removed, so ``'>20'`` becomes
      ``'20'`` (the same result as a literal ``'20'``);
    * contains ``'<'``   -> ``'<1'`` is replaced by ``'0'``;
    * contains ``'nan'`` -> ``'nan'`` is removed (a missing value becomes ``''``);
    * otherwise the string is returned unchanged.

    Returns a ``str``; numeric conversion is done by the caller.
    """
    texto = str(num)
    if '>' in texto:
        return texto.replace('>', '')
    if '<' in texto:
        return texto.replace('<1', '0')
    if 'nan' in texto:
        return texto.replace('nan', '')
    return texto


def cambiar_num(num):
    """Normalise one raw ``last_new_job`` value to a numeric-looking string.

    The value is stringified and exactly one rule is applied:

    * contains ``'>4'``    -> replaced by ``'5'``;
    * contains ``'never'`` -> replaced by ``'0'``;
    * contains ``'nan'``   -> ``'nan'`` is removed (a missing value becomes ``''``);
    * otherwise the string is returned unchanged.

    Returns a ``str``; numeric conversion is done by the caller.
    """
    texto = str(num)
    if '>4' in texto:
        return texto.replace('>4', '5')
    if 'never' in texto:
        return texto.replace('never', '0')
    if 'nan' in texto:
        return texto.replace('nan', '')
    return texto


def convertir_ordinales(df):
    """Convert ``experience`` and ``last_new_job`` of *df* to numbers, in place.

    Every column is rebuilt from *df*'s OWN values.  The previous cells
    assigned the converted training columns to the test frame, which replaced
    the test features with index-aligned training values; taking a single
    frame here makes that mix-up impossible.

    Entries that were missing before the conversion are kept as NaN instead
    of going through the empty-string placeholder returned by the converters.

    Returns *df* (the same object) for convenience.
    """
    conversores = (('experience', cambiar_mayor), ('last_new_job', cambiar_num))
    for columna, convertir in conversores:
        originales = df[columna]
        textos = originales.apply(convertir)
        # where() blanks out the rows that were missing in the raw column.
        df[columna] = pd.to_numeric(textos.where(originales.notna()))
    return df


# True both when the cells run in the notebook and when the file runs as a
# script; it only keeps the frame-mutating statements from running when the
# helpers above are imported from elsewhere.
if __name__ == "__main__":
    for _frame in (df_test_copy, df_train_copy):
        _frame.drop(COLUMNAS_DESCARTADAS, axis=1, inplace=True)

    # Training rows with any missing value are discarded; the recorded run
    # went from (15326, 14) raw rows/columns to (7405, 11).
    df_train_copy.dropna(inplace=True)
    print(df_train_copy.shape, df_train.shape)

    convertir_ordinales(df_train_copy)
    convertir_ordinales(df_test_copy)
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['city_development_index', 'gender', 'relevent_experience',\n", + " 'enrolled_university', 'education_level', 'major_discipline',\n", + " 'experience', 'company_type', 'last_new_job', 'training_hours',\n", + " 'target'],\n", + " dtype='object')" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train_copy.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "7fc6324a-788b-4035-a27a-68a058c1fe0f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "20 1614\n", + "5 480\n", + "10 453\n", + "6 447\n", + "9 442\n", + "7 412\n", + "4 399\n", + "3 364\n", + "15 333\n", + "8 317\n", + "11 314\n", + "14 284\n", + "2 253\n", + "16 251\n", + "12 220\n", + "13 184\n", + "17 170\n", + "19 146\n", + "18 134\n", + "1 107\n", + "0 81\n", + "Name: experience, dtype: int64" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train_copy.experience.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "b50a5da1-f135-4b7f-8a05-9b816ba24d5c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: feature_engine in c:\\anaconda\\lib\\site-packages (1.5.2)\n", + "Requirement already satisfied: scipy>=1.4.1 in c:\\anaconda\\lib\\site-packages (from feature_engine) (1.7.3)\n", + "Requirement already satisfied: statsmodels>=0.11.1 in c:\\anaconda\\lib\\site-packages (from feature_engine) (0.13.2)\n", + "Requirement already satisfied: numpy>=1.18.2 in c:\\anaconda\\lib\\site-packages (from feature_engine) (1.21.5)\n", + "Requirement already satisfied: scikit-learn>=1.0.0 in c:\\anaconda\\lib\\site-packages (from feature_engine) (1.0.2)\n", + "Requirement already satisfied: pandas>=1.0.3 in c:\\anaconda\\lib\\site-packages (from 
feature_engine) (1.4.2)\n", + "Requirement already satisfied: pytz>=2020.1 in c:\\anaconda\\lib\\site-packages (from pandas>=1.0.3->feature_engine) (2021.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in c:\\anaconda\\lib\\site-packages (from pandas>=1.0.3->feature_engine) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in c:\\anaconda\\lib\\site-packages (from python-dateutil>=2.8.1->pandas>=1.0.3->feature_engine) (1.16.0)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\anaconda\\lib\\site-packages (from scikit-learn>=1.0.0->feature_engine) (2.2.0)\n", + "Requirement already satisfied: joblib>=0.11 in c:\\anaconda\\lib\\site-packages (from scikit-learn>=1.0.0->feature_engine) (1.1.0)\n", + "Requirement already satisfied: patsy>=0.5.2 in c:\\anaconda\\lib\\site-packages (from statsmodels>=0.11.1->feature_engine) (0.5.2)\n", + "Requirement already satisfied: packaging>=21.3 in c:\\anaconda\\lib\\site-packages (from statsmodels>=0.11.1->feature_engine) (21.3)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in c:\\anaconda\\lib\\site-packages (from packaging>=21.3->statsmodels>=0.11.1->feature_engine) (3.0.4)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install feature_engine" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "d113959a-8d72-4500-9b6e-e4f332d2b529", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from sklearn.linear_model import LinearRegression, Ridge, Lasso\n", + "from sklearn.preprocessing import RobustScaler, OrdinalEncoder\n", + "from sklearn.model_selection import train_test_split\n", + "from feature_engine import imputation as imp\n", + "from feature_engine import encoding as enc\n", + "from feature_engine.wrappers import SklearnTransformerWrapper\n", + "from feature_engine.encoding import OneHotEncoder\n", + "from feature_engine import imputation as mdi\n", 
# %% Second half of the modelling import cell.
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from feature_engine.wrappers import SklearnTransformerWrapper


from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import sklearn.metrics

# %% Hold out 20 % of the cleaned training rows, with a fixed seed.
features = df_train_copy.drop(columns=['target'])
labels = df_train_copy.target
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=830,
)

# %%
df_train_copy.info()

# %%
df_train_copy.training_hours

# %% Preprocessing pipeline, fitted on the training split only.
columnas_numericas = ['experience', 'last_new_job', 'training_hours']
columnas_categoricas = [
    'enrolled_university',
    'education_level',
    'major_discipline',
    'relevent_experience',
    'gender',
    'company_type',
]

# Each transformer receives its own copy of the column list.
imputar_numericas = mdi.MeanMedianImputer(
    imputation_method='median',
    variables=list(columnas_numericas),
)
imputar_categoricas = mdi.CategoricalImputer(
    imputation_method='missing',
    variables=list(columnas_categoricas),
)
escalar_numericas = SklearnTransformerWrapper(
    StandardScaler(),
    variables=list(columnas_numericas),
)
codificar_categoricas = enc.OneHotEncoder(variables=list(columnas_categoricas))

# Step names are kept as in the original (they appear in the verbose log).
pasos = [
    ('mean_imputation', imputar_numericas),
    ('mode_imputation', imputar_categoricas),
    ('escalar', escalar_numericas),
    ('one_hot', codificar_categoricas),
]
pipeline_gilda = Pipeline(pasos, verbose=True)
pipeline_gilda.fit(X_train, y_train)

# %% Competition test set through the fitted pipeline.
test_1 = pipeline_gilda.transform(df_test_copy)

# %%
test_1.shape

# %% Training split through the same fitted pipeline.
data_1 = pipeline_gilda.transform(X_train)

# %%
data_1.shape
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city_development_indexgenderrelevent_experienceenrolled_universityeducation_levelmajor_disciplineexperiencecompany_typelast_new_jobtraining_hours
27120.920MaleHas relevent experienceno_enrollmentGraduateArts15Funded Startup5102
123660.910FemaleNo relevent experienceno_enrollmentGraduateArts3Pvt Ltd23
38530.926MaleHas relevent experienceno_enrollmentGraduateSTEM9Pvt Ltd0122
44950.910MaleHas relevent experienceno_enrollmentGraduateSTEM10Pvt Ltd140
6410.802MaleHas relevent experienceno_enrollmentGraduateSTEM4Pvt Ltd131
\n", + "
" + ], + "text/plain": [ + " city_development_index gender relevent_experience \\\n", + "2712 0.920 Male Has relevent experience \n", + "12366 0.910 Female No relevent experience \n", + "3853 0.926 Male Has relevent experience \n", + "4495 0.910 Male Has relevent experience \n", + "641 0.802 Male Has relevent experience \n", + "\n", + " enrolled_university education_level major_discipline experience \\\n", + "2712 no_enrollment Graduate Arts 15 \n", + "12366 no_enrollment Graduate Arts 3 \n", + "3853 no_enrollment Graduate STEM 9 \n", + "4495 no_enrollment Graduate STEM 10 \n", + "641 no_enrollment Graduate STEM 4 \n", + "\n", + " company_type last_new_job training_hours \n", + "2712 Funded Startup 5 102 \n", + "12366 Pvt Ltd 2 3 \n", + "3853 Pvt Ltd 0 122 \n", + "4495 Pvt Ltd 1 40 \n", + "641 Pvt Ltd 1 31 " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "b131a746-b7f9-4e14-a6f3-e209602ae8c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
city_development_indexexperiencelast_new_jobtraining_hoursenrolled_university_no_enrollmentenrolled_university_Full time courseenrolled_university_Part time courseeducation_level_Graduateeducation_level_Masterseducation_level_Phd...relevent_experience_No relevent experiencegender_Malegender_Femalegender_Othercompany_type_Funded Startupcompany_type_Pvt Ltdcompany_type_Public Sectorcompany_type_Early Stage Startupcompany_type_NGOcompany_type_Other
27120.9200.5785431.6057970.595879100100...0100100000
123660.910-1.343031-0.188947-1.032405100100...1010010000
38530.926-0.382244-1.3854430.924825100100...0100010000
44950.910-0.222113-0.787195-0.423854100100...0100010000
6410.802-1.182900-0.787195-0.571880100100...0100010000
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " city_development_index experience last_new_job training_hours \\\n", + "2712 0.920 0.578543 1.605797 0.595879 \n", + "12366 0.910 -1.343031 -0.188947 -1.032405 \n", + "3853 0.926 -0.382244 -1.385443 0.924825 \n", + "4495 0.910 -0.222113 -0.787195 -0.423854 \n", + "641 0.802 -1.182900 -0.787195 -0.571880 \n", + "\n", + " enrolled_university_no_enrollment \\\n", + "2712 1 \n", + "12366 1 \n", + "3853 1 \n", + "4495 1 \n", + "641 1 \n", + "\n", + " enrolled_university_Full time course \\\n", + "2712 0 \n", + "12366 0 \n", + "3853 0 \n", + "4495 0 \n", + "641 0 \n", + "\n", + " enrolled_university_Part time course education_level_Graduate \\\n", + "2712 0 1 \n", + "12366 0 1 \n", + "3853 0 1 \n", + "4495 0 1 \n", + "641 0 1 \n", + "\n", + " education_level_Masters education_level_Phd ... \\\n", + "2712 0 0 ... \n", + "12366 0 0 ... \n", + "3853 0 0 ... \n", + "4495 0 0 ... \n", + "641 0 0 ... \n", + "\n", + " relevent_experience_No relevent experience gender_Male gender_Female \\\n", + "2712 0 1 0 \n", + "12366 1 0 1 \n", + "3853 0 1 0 \n", + "4495 0 1 0 \n", + "641 0 1 0 \n", + "\n", + " gender_Other company_type_Funded Startup company_type_Pvt Ltd \\\n", + "2712 0 1 0 \n", + "12366 0 0 1 \n", + "3853 0 0 1 \n", + "4495 0 0 1 \n", + "641 0 0 1 \n", + "\n", + " company_type_Public Sector company_type_Early Stage Startup \\\n", + "2712 0 0 \n", + "12366 0 0 \n", + "3853 0 0 \n", + "4495 0 0 \n", + "641 0 0 \n", + "\n", + " company_type_NGO company_type_Other \n", + "2712 0 0 \n", + "12366 0 0 \n", + "3853 0 0 \n", + "4495 0 0 \n", + "641 0 0 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_1.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "7e27a342-3379-43ab-88d8-3e58ba8e396b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['city_development_index', 'experience', 
# %%
data_1.columns

# %%
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import tree

# %%
from sklearn.ensemble import RandomForestClassifier

# %% Hyper-parameter search for the random forest.
# A fixed seed makes the search repeatable (same seed as the hold-out split),
# and candidates are ranked by ROC AUC, the metric the notebook reports later,
# instead of the default accuracy.
grid_forestrest = RandomForestClassifier(random_state=830)
params = {'max_depth': [4, 7, 10, 15], 'n_estimators': [21, 32, 59, 78]}
grid_forest = GridSearchCV(grid_forestrest, params, scoring='roc_auc').fit(data_1, y_train)

# %%
grid_forest.best_estimator_

# %% Use the model the search actually selected.
# The original cell refitted a forest with hard-coded max_depth=10 and
# n_estimators=59 although the recorded search result was max_depth=7,
# n_estimators=32, so the tuning had no effect.  With the default refit=True
# the best estimator is already fitted on all of data_1 / y_train.
best_forest = grid_forest.best_estimator_
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0
city_development_index0.537338
experience0.123796
last_new_job0.053632
training_hours0.107324
enrolled_university_no_enrollment0.009371
enrolled_university_Full time course0.016756
enrolled_university_Part time course0.007740
education_level_Graduate0.011346
education_level_Masters0.011078
education_level_Phd0.005274
major_discipline_Arts0.004499
major_discipline_STEM0.009114
major_discipline_Other0.002683
major_discipline_Humanities0.004782
major_discipline_Business Degree0.002970
major_discipline_No Major0.003115
relevent_experience_Has relevent experience0.009909
relevent_experience_No relevent experience0.009217
gender_Male0.008947
gender_Female0.008828
gender_Other0.002535
company_type_Funded Startup0.008748
company_type_Pvt Ltd0.011570
company_type_Public Sector0.010419
company_type_Early Stage Startup0.006576
company_type_NGO0.008553
company_type_Other0.003877
# %% Feature importances of the selected forest, one row per feature.
pd.DataFrame([best_forest.feature_importances_], columns=best_forest.feature_names_in_).T

# %% Hard class predictions on the training split (used for accuracy only).
pred_y = best_forest.predict(data_1)

# %%
pred_y

# %% Training ROC AUC.
# ROC AUC needs a ranking score; feeding the 0/1 labels from predict()
# collapses the curve to a single threshold.  Column 1 of predict_proba is
# the probability of the larger class label, i.e. target == 1.
proba_train = best_forest.predict_proba(data_1)[:, 1]
metrics.roc_auc_score(y_train, proba_train)

# %%
from sklearn.metrics import roc_auc_score

# %%
metrics.accuracy_score(y_train, pred_y)

# %% Hold-out split through the fitted preprocessing pipeline.
test_2 = pipeline_gilda.transform(X_test)

# %% Hold-out probabilities (own name; pred_y_test is reserved for the
# submission probabilities below instead of being reused for two things).
proba_holdout = best_forest.predict_proba(test_2)[:, 1]

# %% Hold-out ROC AUC, computed from probabilities as above.
metrics.roc_auc_score(y_test, proba_holdout)

# %% Submission frame: only the identifier column of the raw test data.
# Selecting the column directly replaces the two shallow copies followed by
# a twelve-column drop, and no longer rebinds df_test_copy to the raw data.
prediccion = df_test[['enrollee_id']].copy()

# %% Class probabilities for the competition test set.
pred_y_test = best_forest.predict_proba(test_1)

# %%
best_forest.predict(test_1)

# %%
pred_y_test

# %% Probability of target == 1 for every test row.
datos = pred_y_test[:, 1]

# %% Rows of test_1 and df_test are in the same order, so a positional
# assignment lines the scores up with their enrollee_id.
prediccion['target'] = datos
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
enrollee_idtarget
0236030.177397
1224990.101758
2104650.149400
382930.109397
442460.097448
.........
382788800.104415
382878860.070128
3829122790.092114
383053260.044073
383140170.047552
\n", + "

3832 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " enrollee_id target\n", + "0 23603 0.177397\n", + "1 22499 0.101758\n", + "2 10465 0.149400\n", + "3 8293 0.109397\n", + "4 4246 0.097448\n", + "... ... ...\n", + "3827 8880 0.104415\n", + "3828 7886 0.070128\n", + "3829 12279 0.092114\n", + "3830 5326 0.044073\n", + "3831 4017 0.047552\n", + "\n", + "[3832 rows x 2 columns]" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prediccion" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "88fdbbf8-a1ff-46d1-8dd3-db78c55d6db5", + "metadata": {}, + "outputs": [], + "source": [ + "prediccion.to_csv('prediccion1_gilda.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5c528bc-a3e9-4206-9b4e-3e3af3c60375", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}