From 78a82c53a0944ba308b6416962033c81ffb5f499 Mon Sep 17 00:00:00 2001 From: yuliearad1 <73402260+yuliearad1@users.noreply.github.com> Date: Fri, 30 Sep 2022 12:33:50 -0500 Subject: [PATCH] Add files via upload Data Exploration! --- Quant_Data_Tech.ipynb | 477 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 477 insertions(+) create mode 100644 Quant_Data_Tech.ipynb diff --git a/Quant_Data_Tech.ipynb b/Quant_Data_Tech.ipynb new file mode 100644 index 0000000..9b6f5eb --- /dev/null +++ b/Quant_Data_Tech.ipynb @@ -0,0 +1,477 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 26, + "id": "8d7081f8", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "id": "0e9b76ad", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/h6/9qtkdjdj5nj72hzhyqcrqhcw0000gn/T/ipykernel_37283/597542118.py:1: DtypeWarning: Columns (5,11) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " consumer_complaints = pd.read_csv(\"consumer_complaints.csv\")\n" + ] + } + ], + "source": [ + "consumer_complaints = pd.read_csv(\"consumer_complaints.csv\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "id": "83b8e232", + "metadata": {}, + "outputs": [], + "source": [ + "state_populations = pd.read_csv(\"PEPPOP2021.NST_EST2021_POP-2022-09-29T183532.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "id": "8418559b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "12/31/2015\n" + ] + } + ], + "source": [ + "#Data ends on this date\n", + "print(max(consumer_complaints[\"date_sent_to_company\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "id": "df9819fd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "01/01/2013\n" + ] + } + ], + "source": [ + "#Data starts on this date\n", + "print(min(consumer_complaints[\"date_sent_to_company\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "id": "f2f91d4d", + "metadata": {}, + "outputs": [], + "source": [ + "#I took this list and map from JeffPaine/us_state_abbreviations.py \n", + "states = [ 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',\n", + " 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',\n", + " 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',\n", + " 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',\n", + " 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "ad72f310", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Count per state, based on states list above\n", + "[638, 5635, 2385, 12348, 81700, 9495, 6445, 3224, 2917, 53673, 24548, 1937, 2290, 1935, 19624, 6139, 2694, 3992, 5403, 11105, 17703, 1942, 14486, 6424, 7233, 2570, 893, 15501, 476, 1742, 2826, 22408, 2776, 6779, 38266, 17380, 3630, 6604, 20015, 1899, 7013, 757, 8523, 41352, 2951, 18150, 943, 11554, 6125, 1457, 551]\n", + "Percentage per state, based on states list above\n", + "[0.00116199 0.01026307 0.00434382 0.02248951 0.14880085 0.01729332\n", + " 0.01173833 0.0058719 0.00531275 0.09775506 0.04470946 0.00352787\n", + " 0.00417079 0.00352423 0.03574135 0.01118101 0.0049066 0.00727066\n", + " 0.00984053 0.02022562 0.03224261 0.00353698 0.02638347 0.01170008\n", + " 0.01317352 0.00468076 0.00162643 0.02823209 0.00086694 0.00317272\n", + " 0.00514702 0.04081187 0.00505595 0.01234665 0.06969417 0.03165433\n", + " 0.00661135 0.01202792 0.03645348 0.00345866 0.01277283 0.00137873\n", + " 0.01552301 0.07531472 0.00537468 0.03305674 0.00171749 0.02104339\n", + " 0.01115551 0.00265365 0.00100354]\n" + ] + } + ], + "source": [ + "count_per_state = []\n", + "\n", + "for state in states:\n", + " count_per_state.append(sum(consumer_complaints.state == state))\n", + "total = np.array(count_per_state).sum()\n", + "print(\"Count per state, based on states list above\")\n", + "print(count_per_state)\n", + "print(\"Percentage per state, based on states list above\")\n", + "print(count_per_state/total)" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "id": "f2dd1044", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Most common product complaint: \n", + "0 Mortgage\n", + "Name: product, dtype: object\n", + "Most common sub-product complaint: \n", + "0 Other mortgage\n", + "Name: sub_product, dtype: object\n" + ] + } + ], + "source": [ + "#Most common product and subproduct complaint\n", + "print(\"Most common product complaint: \")\n", + "print(consumer_complaints[\"product\"].mode())\n", + "\n", + "print(\"Most common sub-product complaint: \")\n", + "print(consumer_complaints[\"sub_product\"].mode())\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 184, + "id": "e5bc9eaa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " date_received product sub_product \\\n", + "6 08/30/2013 Credit card NaN \n", + "9 09/17/2013 Mortgage Conventional adjustable mortgage (ARM) \n", + "10 08/30/2013 Debt collection Other (i.e. phone, health club, etc.) \n", + "13 08/30/2013 Mortgage Other mortgage \n", + "14 09/10/2013 Debt collection Credit card \n", + "\n", + " issue sub_issue \\\n", + "6 Credit line increase/decrease NaN \n", + "9 Loan modification,collection,foreclosure NaN \n", + "10 Cont'd attempts collect debt not owed Debt was paid \n", + "13 Application, originator, mortgage broker NaN \n", + "14 Communication tactics Called outside of 8am-9pm \n", + "\n", + " consumer_complaint_narrative company_public_response \\\n", + "6 NaN NaN \n", + "9 NaN NaN \n", + "10 NaN NaN \n", + "13 NaN NaN \n", + "14 NaN NaN \n", + "\n", + " company state zipcode tags consumer_consent_provided \\\n", + "6 Wells Fargo & Company AZ 85730 NaN NaN \n", + "9 SunTrust Banks, Inc. CA 94551 NaN NaN \n", + "10 Convergent Resources, Inc. NC 27545 NaN NaN \n", + "13 Wells Fargo & Company TX 78244 NaN NaN \n", + "14 Bank of America GA 30132 NaN NaN \n", + "\n", + " submitted_via date_sent_to_company company_response_to_consumer \\\n", + "6 Postal mail 09/05/2013 Closed with explanation \n", + "9 Web 09/18/2013 Closed with explanation \n", + "10 Web 08/30/2013 Closed with explanation \n", + "13 Fax 09/03/2013 Closed with explanation \n", + "14 Web 09/14/2013 Closed with explanation \n", + "\n", + " timely_response consumer_disputed? complaint_id \n", + "6 Yes No 511062 \n", + "9 Yes Yes 530602 \n", + "10 Yes No 509988 \n", + "13 Yes Yes 510129 \n", + "14 Yes No 521353 \n" + ] + } + ], + "source": [ + "#create testing and training sets\n", + "rand_int = np.random.rand(len(consumer_complaints)) < 0.8\n", + "train = consumer_complaints[rand_int]\n", + "test = consumer_complaints[~rand_int]\n", + "print(test.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "id": "f91005c3", + "metadata": {}, + "outputs": [], + "source": [ + "#Arrays of unique products, companies, states, and company responses\n", + "unique_product = consumer_complaints[\"product\"].unique()\n", + "unique_company = consumer_complaints[\"company\"].unique()\n", + "unique_state = consumer_complaints[\"state\"].unique()\n", + "unique_company_response = consumer_complaints[\"company_response_to_consumer\"].unique()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "id": "85bf3356", + "metadata": {}, + "outputs": [], + "source": [ + "#probabilities of each unique response \n", + "prob_responses = []\n", + "for response in unique_company_response:\n", + " prob_responses.append(sum(train[\"company_response_to_consumer\"] == response))\n", + "prob_responses = np.array(prob_responses)\n", + "prob_responses = prob_responses/sum(prob_responses)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "id": "fe256eab", + "metadata": {}, + "outputs": [], + "source": [ + "#probabilities of each unique state\n", + "prob_states = []\n", + "for state in unique_state:\n", + " prob_states.append(sum(train[\"state\"] == state))\n", + "prob_states = np.array(prob_states)\n", + "prob_states = prob_states/sum(prob_states)" + ] + }, + { + "cell_type": "code", + "execution_count": 203, + "id": "722643c6", + "metadata": {}, + "outputs": [], + "source": [ + "#probabilities of each unique product\n", + "prob_product = []\n", + "for product in unique_product:\n", + " prob_product.append(sum(train[\"product\"] == product))\n", + "prob_product = np.array(prob_product)\n", + "prob_product = prob_product/sum(prob_product)" + ] + }, + { + "cell_type": "code", + "execution_count": 204, + "id": "0e650398", + "metadata": {}, + "outputs": [], + "source": [ + "#probabilities of each unique company\n", + "#This has a long runtime unfortunately, due to the sheer size of the number of unique companies\n", + "prob_company = []\n", + "for company in unique_company:\n", + " prob_company.append(sum(train[\"company\"] == company))\n", + "prob_company = np.array(prob_company)\n", + "prob_company = prob_company/sum(prob_company)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 205, + "id": "895ac6e9", + "metadata": {}, + "outputs": [], + "source": [ + "#probability of state given company response\n", + "#Here, we iterate through every response and and seperate into new dataframes\n", + "#Then, we iterate through each state and get the probability of seeing it with the current response\n", + "prob_state_given_response = []\n", + "\n", + "for response in unique_company_response:\n", + " curr_df = train.loc[train[\"company_response_to_consumer\"] == response]\n", + " curr_total = sum(train[\"company_response_to_consumer\"] == response)\n", + " curr_probs = []\n", + " for state in unique_state:\n", + " curr_probs.append((sum(curr_df[\"state\"] == state))/curr_total)\n", + " prob_state_given_response.append(curr_probs)" + ] + }, + { + "cell_type": "code", + "execution_count": 206, + "id": "2072d203", + "metadata": {}, + "outputs": [], + "source": [ + "#probability of product given company response\n", + "#We do the same thing as above \n", + "prob_product_given_response = []\n", + "\n", + "for response in unique_company_response:\n", + " curr_df = train.loc[train[\"company_response_to_consumer\"] == response]\n", + " curr_total = sum(train[\"company_response_to_consumer\"] == response)\n", + " curr_probs = []\n", + " for product in unique_product:\n", + " curr_probs.append((sum(curr_df[\"product\"] == product))/curr_total)\n", + " prob_product_given_response.append(curr_probs)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 207, + "id": "756e72aa", + "metadata": {}, + "outputs": [], + "source": [ + "#probability of company given company response\n", + "#This also has a long run time because of the number of unique companies\n", + "#We do the same thing as above\n", + "prob_company_given_response = []\n", + "\n", + "for response in unique_company_response:\n", + " curr_df = train.loc[train[\"company_response_to_consumer\"] == response]\n", + " curr_total = sum(train[\"company_response_to_consumer\"] == response)\n", + " curr_probs = []\n", + " for company in unique_company:\n", + " curr_probs.append((sum(curr_df[\"company\"] == company))/curr_total)\n", + " prob_company_given_response.append(curr_probs)" + ] + }, + { + "cell_type": "code", + "execution_count": 208, + "id": "938e7389", + "metadata": {}, + "outputs": [], + "source": [ + "#Convert to a list \n", + "unique_product = unique_product.tolist()\n", + "unique_company = unique_company.tolist()\n", + "unique_state = unique_state.tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "id": "4284b131", + "metadata": {}, + "outputs": [], + "source": [ + "#Naive Bayes Classifier \n", + "def naiveBayesClassifier(product, company, state):\n", + " prob_of_response = []\n", + " i = 0\n", + " #Index of product, company and state\n", + " p_index = unique_product.index(product)\n", + " c_index = unique_company.index(company)\n", + " s_index = unique_state.index(state)\n", + " #Iterate through responses \n", + " for response in unique_company_response:\n", + " #Probability\n", + " curr_result = prob_responses[i] * prob_product_given_response[i][p_index] * prob_state_given_response[i][s_index] * prob_company_given_response[i][c_index]\n", + " prob_of_response.append(curr_result)\n", + " i += 1\n", + " #Find max index\n", + " max_index = prob_of_response.index(max(prob_of_response))\n", + " #return value \n", + "# print(unique_company_response[max_index])\n", + " return unique_company_response[max_index]\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "id": "9eec3d43", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "percentage of correctness: \n", + "0.7121438336451288\n" + ] + } + ], + "source": [ + "#Testing naive bayes classifier \n", + "count_correct = 0\n", + "count_total = 0\n", + "#print(test.iloc[[3]])\n", + "for i in range(len(test)):\n", + " count_total += 1\n", + " curr_row = test.iloc[i]\n", + " count_correct += (naiveBayesClassifier(curr_row.iat[1], curr_row.iat[7], curr_row.iat[8]) == curr_row.iat[14])\n", + "print(\"percentage of correctness: \")\n", + "print(count_correct/count_total)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b56ebd4b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3eec0baa", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0f5e003", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}