From 78a82c53a0944ba308b6416962033c81ffb5f499 Mon Sep 17 00:00:00 2001
From: yuliearad1 <73402260+yuliearad1@users.noreply.github.com>
Date: Fri, 30 Sep 2022 12:33:50 -0500
Subject: [PATCH] Add files via upload

Data Exploration!
---
 Quant_Data_Tech.ipynb | 477 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 477 insertions(+)
 create mode 100644 Quant_Data_Tech.ipynb

diff --git a/Quant_Data_Tech.ipynb b/Quant_Data_Tech.ipynb
new file mode 100644
index 0000000..9b6f5eb
--- /dev/null
+++ b/Quant_Data_Tech.ipynb
@@ -0,0 +1,477 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "8d7081f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 142,
+   "id": "0e9b76ad",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/h6/9qtkdjdj5nj72hzhyqcrqhcw0000gn/T/ipykernel_37283/597542118.py:1: DtypeWarning: Columns (5,11) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+      "  consumer_complaints = pd.read_csv(\"consumer_complaints.csv\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "consumer_complaints = pd.read_csv(\"consumer_complaints.csv\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 143,
+   "id": "83b8e232",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_populations = pd.read_csv(\"PEPPOP2021.NST_EST2021_POP-2022-09-29T183532.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 146,
+   "id": "8418559b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "12/31/2015\n"
+     ]
+    }
+   ],
+   "source": [
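+    "#Largest date_sent_to_company value (NOTE: values look like MM/DD/YYYY strings, so max() compares them lexicographically and this may not be the true last date)\n",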
+    "print(max(consumer_complaints[\"date_sent_to_company\"]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 147,
+   "id": "df9819fd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "01/01/2013\n"
+     ]
+    }
+   ],
+   "source": [
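+    "#Smallest date_sent_to_company value (NOTE: values look like MM/DD/YYYY strings, so min() compares them lexicographically and this may not be the true first date)\n",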
+    "print(min(consumer_complaints[\"date_sent_to_company\"]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 148,
+   "id": "f2f91d4d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#I took this list and map from JeffPaine/us_state_abbreviations.py \n",
+    "states = [ 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',\n",
+    "           'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',\n",
+    "           'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',\n",
+    "           'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',\n",
+    "           'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 149,
+   "id": "ad72f310",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Count per state, based on states list above\n",
+      "[638, 5635, 2385, 12348, 81700, 9495, 6445, 3224, 2917, 53673, 24548, 1937, 2290, 1935, 19624, 6139, 2694, 3992, 5403, 11105, 17703, 1942, 14486, 6424, 7233, 2570, 893, 15501, 476, 1742, 2826, 22408, 2776, 6779, 38266, 17380, 3630, 6604, 20015, 1899, 7013, 757, 8523, 41352, 2951, 18150, 943, 11554, 6125, 1457, 551]\n",
+      "Percentage per state, based on states list above\n",
+      "[0.00116199 0.01026307 0.00434382 0.02248951 0.14880085 0.01729332\n",
+      " 0.01173833 0.0058719  0.00531275 0.09775506 0.04470946 0.00352787\n",
+      " 0.00417079 0.00352423 0.03574135 0.01118101 0.0049066  0.00727066\n",
+      " 0.00984053 0.02022562 0.03224261 0.00353698 0.02638347 0.01170008\n",
+      " 0.01317352 0.00468076 0.00162643 0.02823209 0.00086694 0.00317272\n",
+      " 0.00514702 0.04081187 0.00505595 0.01234665 0.06969417 0.03165433\n",
+      " 0.00661135 0.01202792 0.03645348 0.00345866 0.01277283 0.00137873\n",
+      " 0.01552301 0.07531472 0.00537468 0.03305674 0.00171749 0.02104339\n",
+      " 0.01115551 0.00265365 0.00100354]\n"
+     ]
+    }
+   ],
+   "source": [
+    "count_per_state = []\n",
+    "\n",
+    "for state in states:\n",
+    "    count_per_state.append(sum(consumer_complaints.state == state))\n",
+    "total = np.array(count_per_state).sum()\n",
+    "print(\"Count per state, based on states list above\")\n",
+    "print(count_per_state)\n",
+    "print(\"Percentage per state, based on states list above\")\n",
+    "print(count_per_state/total)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 150,
+   "id": "f2dd1044",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Most common product complaint: \n",
+      "0    Mortgage\n",
+      "Name: product, dtype: object\n",
+      "Most common sub-product complaint: \n",
+      "0    Other mortgage\n",
+      "Name: sub_product, dtype: object\n"
+     ]
+    }
+   ],
+   "source": [
+    "#Most common product and subproduct complaint\n",
+    "print(\"Most common product complaint: \")\n",
+    "print(consumer_complaints[\"product\"].mode())\n",
+    "\n",
+    "print(\"Most common sub-product complaint: \")\n",
+    "print(consumer_complaints[\"sub_product\"].mode())\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 184,
+   "id": "e5bc9eaa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   date_received          product                             sub_product  \\\n",
+      "6     08/30/2013      Credit card                                     NaN   \n",
+      "9     09/17/2013         Mortgage  Conventional adjustable mortgage (ARM)   \n",
+      "10    08/30/2013  Debt collection   Other (i.e. phone, health club, etc.)   \n",
+      "13    08/30/2013         Mortgage                          Other mortgage   \n",
+      "14    09/10/2013  Debt collection                             Credit card   \n",
+      "\n",
+      "                                       issue                  sub_issue  \\\n",
+      "6              Credit line increase/decrease                        NaN   \n",
+      "9   Loan modification,collection,foreclosure                        NaN   \n",
+      "10     Cont'd attempts collect debt not owed              Debt was paid   \n",
+      "13  Application, originator, mortgage broker                        NaN   \n",
+      "14                     Communication tactics  Called outside of 8am-9pm   \n",
+      "\n",
+      "   consumer_complaint_narrative company_public_response  \\\n",
+      "6                           NaN                     NaN   \n",
+      "9                           NaN                     NaN   \n",
+      "10                          NaN                     NaN   \n",
+      "13                          NaN                     NaN   \n",
+      "14                          NaN                     NaN   \n",
+      "\n",
+      "                       company state zipcode tags consumer_consent_provided  \\\n",
+      "6        Wells Fargo & Company    AZ   85730  NaN                       NaN   \n",
+      "9         SunTrust Banks, Inc.    CA   94551  NaN                       NaN   \n",
+      "10  Convergent Resources, Inc.    NC   27545  NaN                       NaN   \n",
+      "13       Wells Fargo & Company    TX   78244  NaN                       NaN   \n",
+      "14             Bank of America    GA   30132  NaN                       NaN   \n",
+      "\n",
+      "   submitted_via date_sent_to_company company_response_to_consumer  \\\n",
+      "6    Postal mail           09/05/2013      Closed with explanation   \n",
+      "9            Web           09/18/2013      Closed with explanation   \n",
+      "10           Web           08/30/2013      Closed with explanation   \n",
+      "13           Fax           09/03/2013      Closed with explanation   \n",
+      "14           Web           09/14/2013      Closed with explanation   \n",
+      "\n",
+      "   timely_response consumer_disputed?  complaint_id  \n",
+      "6              Yes                 No        511062  \n",
+      "9              Yes                Yes        530602  \n",
+      "10             Yes                 No        509988  \n",
+      "13             Yes                Yes        510129  \n",
+      "14             Yes                 No        521353  \n"
+     ]
+    }
+   ],
+   "source": [
+    "#create training (~80%) and testing (~20%) sets using a random boolean mask\n",
+    "rand_int = np.random.rand(len(consumer_complaints)) < 0.8\n",
+    "train = consumer_complaints[rand_int]\n",
+    "test = consumer_complaints[~rand_int]\n",
+    "print(test.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 200,
+   "id": "f91005c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Arrays of unique products, companies, states, and company responses\n",
+    "unique_product = consumer_complaints[\"product\"].unique()\n",
+    "unique_company = consumer_complaints[\"company\"].unique()\n",
+    "unique_state = consumer_complaints[\"state\"].unique()\n",
+    "unique_company_response = consumer_complaints[\"company_response_to_consumer\"].unique()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 201,
+   "id": "85bf3356",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#probabilities of each unique response \n",
+    "prob_responses = []\n",
+    "for response in unique_company_response:\n",
+    "    prob_responses.append(sum(train[\"company_response_to_consumer\"] == response))\n",
+    "prob_responses = np.array(prob_responses)\n",
+    "prob_responses = prob_responses/sum(prob_responses)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 202,
+   "id": "fe256eab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#probabilities of each unique state\n",
+    "prob_states = []\n",
+    "for state in unique_state:\n",
+    "    prob_states.append(sum(train[\"state\"] == state))\n",
+    "prob_states = np.array(prob_states)\n",
+    "prob_states = prob_states/sum(prob_states)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 203,
+   "id": "722643c6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#probabilities of each unique product\n",
+    "prob_product = []\n",
+    "for product in unique_product:\n",
+    "    prob_product.append(sum(train[\"product\"] == product))\n",
+    "prob_product = np.array(prob_product)\n",
+    "prob_product = prob_product/sum(prob_product)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 204,
+   "id": "0e650398",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#probabilities of each unique company\n",
+    "#This has a long runtime unfortunately, due to the large number of unique companies\n",
+    "prob_company = []\n",
+    "for company in unique_company:\n",
+    "    prob_company.append(sum(train[\"company\"] == company))\n",
+    "prob_company = np.array(prob_company)\n",
+    "prob_company = prob_company/sum(prob_company)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 205,
+   "id": "895ac6e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#probability of state given company response\n",
+    "#Here, we iterate through every response and separate the matching training rows into a new dataframe\n",
+    "#Then, we iterate through each state and get the probability of seeing it with the current response\n",
+    "prob_state_given_response = []\n",
+    "\n",
+    "for response in unique_company_response:\n",
+    "    curr_df = train.loc[train[\"company_response_to_consumer\"] == response]\n",
+    "    curr_total = sum(train[\"company_response_to_consumer\"] == response)\n",
+    "    curr_probs = []\n",
+    "    for state in unique_state:\n",
+    "        curr_probs.append((sum(curr_df[\"state\"] == state))/curr_total)\n",
+    "    prob_state_given_response.append(curr_probs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 206,
+   "id": "2072d203",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#probability of product given company response\n",
+    "#We do the same thing as above \n",
+    "prob_product_given_response = []\n",
+    "\n",
+    "for response in unique_company_response:\n",
+    "    curr_df = train.loc[train[\"company_response_to_consumer\"] == response]\n",
+    "    curr_total = sum(train[\"company_response_to_consumer\"] == response)\n",
+    "    curr_probs = []\n",
+    "    for product in unique_product:\n",
+    "        curr_probs.append((sum(curr_df[\"product\"] == product))/curr_total)\n",
+    "    prob_product_given_response.append(curr_probs)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 207,
+   "id": "756e72aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#probability of company given company response\n",
+    "#This also has a long run time because of the number of unique companies\n",
+    "#We do the same thing as above\n",
+    "prob_company_given_response = []\n",
+    "\n",
+    "for response in unique_company_response:\n",
+    "    curr_df = train.loc[train[\"company_response_to_consumer\"] == response]\n",
+    "    curr_total = sum(train[\"company_response_to_consumer\"] == response)\n",
+    "    curr_probs = []\n",
+    "    for company in unique_company:\n",
+    "        curr_probs.append((sum(curr_df[\"company\"] == company))/curr_total)\n",
+    "    prob_company_given_response.append(curr_probs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 208,
+   "id": "938e7389",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Convert to a list \n",
+    "unique_product = unique_product.tolist()\n",
+    "unique_company = unique_company.tolist()\n",
+    "unique_state = unique_state.tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 209,
+   "id": "4284b131",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Naive Bayes Classifier \n",
+    "def naiveBayesClassifier(product, company, state):\n",
+    "    prob_of_response = []\n",
+    "    i = 0\n",
+    "    #Index of product, company and state\n",
+    "    p_index = unique_product.index(product)\n",
+    "    c_index = unique_company.index(company)\n",
+    "    s_index = unique_state.index(state)\n",
+    "    #Iterate through responses \n",
+    "    for response in unique_company_response:\n",
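+    "        #Unnormalized score: probability of the response times the probability of the product, state and company given that response\n",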
+    "        curr_result = prob_responses[i] * prob_product_given_response[i][p_index] * prob_state_given_response[i][s_index] * prob_company_given_response[i][c_index]\n",
+    "        prob_of_response.append(curr_result)\n",
+    "        i += 1\n",
+    "    #Find max index\n",
+    "    max_index = prob_of_response.index(max(prob_of_response))\n",
+    "    #return value \n",
+    "#     print(unique_company_response[max_index])\n",
+    "    return unique_company_response[max_index]\n",
+    "        \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 211,
+   "id": "9eec3d43",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "percentage of correctness: \n",
+      "0.7121438336451288\n"
+     ]
+    }
+   ],
+   "source": [
+    "#Testing naive bayes classifier \n",
+    "count_correct = 0\n",
+    "count_total = 0\n",
+    "#print(test.iloc[[3]])\n",
+    "for i in range(len(test)):\n",
+    "    count_total += 1\n",
+    "    curr_row = test.iloc[i]\n",
+    "    count_correct += (naiveBayesClassifier(curr_row.iat[1], curr_row.iat[7], curr_row.iat[8]) == curr_row.iat[14])\n",
+    "print(\"percentage of correctness: \")\n",
+    "print(count_correct/count_total)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b56ebd4b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3eec0baa",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d0f5e003",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}