From 49b8d40d6aa39081b1bd56ef6ed9fcfba0795e4d Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Wed, 24 Jun 2015 15:57:35 -0400 Subject: [PATCH 1/6] setup --- .envrc | 1 + .gitignore | 5 +++++ 2 files changed, 6 insertions(+) create mode 100644 .envrc diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..94840b3 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +layout python3 diff --git a/.gitignore b/.gitignore index f00dbf2..39a82ff 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,8 @@ docs/_build/ # PyBuilder target/ +# direnv +.direnv/ + +# Data +spambase/ From c5ad9ed2c6af819e718ca7b945ed4391f6f0d58d Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Wed, 24 Jun 2015 15:57:46 -0400 Subject: [PATCH 2/6] initial data import --- Spam_Detector.ipynb | 2049 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2049 insertions(+) create mode 100644 Spam_Detector.ipynb diff --git a/Spam_Detector.ipynb b/Spam_Detector.ipynb new file mode 100644 index 0000000..0495ff6 --- /dev/null +++ b/Spam_Detector.ipynb @@ -0,0 +1,2049 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import sklearn\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['word_freq_make',\n", + " 'word_freq_address',\n", + " 'word_freq_all',\n", + " 'word_freq_3d',\n", + " 'word_freq_our',\n", + " 'word_freq_over',\n", + " 'word_freq_remove',\n", + " 'word_freq_internet',\n", + " 'word_freq_order',\n", + " 'word_freq_mail',\n", + " 'word_freq_receive',\n", + " 'word_freq_will',\n", + " 'word_freq_people',\n", + " 'word_freq_report',\n", + " 'word_freq_addresses',\n", + " 
'word_freq_free',\n", + " 'word_freq_business',\n", + " 'word_freq_email',\n", + " 'word_freq_you',\n", + " 'word_freq_credit',\n", + " 'word_freq_your',\n", + " 'word_freq_font',\n", + " 'word_freq_000',\n", + " 'word_freq_money',\n", + " 'word_freq_hp',\n", + " 'word_freq_hpl',\n", + " 'word_freq_george',\n", + " 'word_freq_650',\n", + " 'word_freq_lab',\n", + " 'word_freq_labs',\n", + " 'word_freq_telnet',\n", + " 'word_freq_857',\n", + " 'word_freq_data',\n", + " 'word_freq_415',\n", + " 'word_freq_85',\n", + " 'word_freq_technology',\n", + " 'word_freq_1999',\n", + " 'word_freq_parts',\n", + " 'word_freq_pm',\n", + " 'word_freq_direct',\n", + " 'word_freq_cs',\n", + " 'word_freq_meeting',\n", + " 'word_freq_original',\n", + " 'word_freq_project',\n", + " 'word_freq_re',\n", + " 'word_freq_edu',\n", + " 'word_freq_table',\n", + " 'word_freq_conference',\n", + " 'char_freq_;',\n", + " 'char_freq_(',\n", + " 'char_freq_[',\n", + " 'char_freq_!',\n", + " 'char_freq_$',\n", + " 'char_freq_#',\n", + " 'capital_run_length_average',\n", + " 'capital_run_length_longest',\n", + " 'capital_run_length_total',\n", + " 'spam_flag']" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "with open(\"spambase/spambase.names\") as file:\n", + " names = file.readlines()[33:]\n", + "names = [name.split(\":\")[0] for name in names]\n", + "names.append(\"spam_flag\")\n", + "names" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = pd.read_csv(\"spambase/spambase.data\", names=names)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
word_freq_makeword_freq_addressword_freq_allword_freq_3dword_freq_ourword_freq_overword_freq_removeword_freq_internetword_freq_orderword_freq_mail...char_freq_;char_freq_(char_freq_[char_freq_!char_freq_$char_freq_#capital_run_length_averagecapital_run_length_longestcapital_run_length_totalspam_flag
00.000.640.6400.320.000.000.000.000.00...0.0000.0000.0000.7780.0000.0003.756612781
10.210.280.5000.140.280.210.070.000.94...0.0000.1320.0000.3720.1800.0485.11410110281
20.060.000.7101.230.190.190.120.640.25...0.0100.1430.0000.2760.1840.0109.82148522591
30.000.000.0000.630.000.310.630.310.63...0.0000.1370.0000.1370.0000.0003.537401911
40.000.000.0000.630.000.310.630.310.63...0.0000.1350.0000.1350.0000.0003.537401911
50.000.000.0001.850.000.001.850.000.00...0.0000.2230.0000.0000.0000.0003.00015541
60.000.000.0001.920.000.000.000.000.64...0.0000.0540.0000.1640.0540.0001.67141121
70.000.000.0001.880.000.001.880.000.00...0.0000.2060.0000.0000.0000.0002.45011491
80.150.000.4600.610.000.300.000.920.76...0.0000.2710.0000.1810.2030.0229.74444512571
90.060.120.7700.190.320.380.000.060.00...0.0400.0300.0000.2440.0810.0001.729437491
100.000.000.0000.000.000.960.000.001.92...0.0000.0000.0000.4620.0000.0001.3126211
110.000.000.2500.380.250.250.000.000.00...0.0220.0440.0000.6630.0000.0001.243111841
120.000.690.3400.340.000.000.000.000.00...0.0000.0560.0000.7860.0000.0003.728612611
130.000.000.0000.900.000.900.000.000.90...0.0000.0000.0000.0000.0000.0002.0837251
140.000.001.4200.710.350.000.350.000.71...0.0000.1020.0000.3570.0000.0001.971242051
150.000.420.4201.270.000.420.000.001.27...0.0000.0630.0000.5720.0630.0005.659552491
160.000.000.0000.940.000.000.000.000.00...0.0000.0000.0000.4280.0000.0004.652311071
170.000.000.0000.000.000.000.000.000.00...0.0000.0000.0001.9750.3700.00035.461954611
180.000.000.5501.110.000.180.000.000.00...0.0000.1820.0000.4550.0000.0001.3204701
190.000.630.0001.590.310.000.000.310.00...0.0000.2750.0000.0550.4960.0003.509911861
200.000.000.0000.000.000.000.000.000.00...0.0000.7290.0000.7290.0000.0003.8339231
210.050.070.1000.760.050.150.020.550.00...0.0420.1010.0160.2500.0460.0592.5696622591
220.000.000.0002.940.000.000.000.000.00...0.4040.4040.0000.8090.0000.0004.85712341
230.000.000.0001.160.000.000.000.000.00...0.0000.1330.0000.6670.0000.0001.1315691
240.000.000.0000.000.000.000.000.000.00...0.0000.1960.0000.3920.1960.0005.46622821
250.050.070.1000.760.050.150.020.550.00...0.0420.1010.0160.2500.0460.0592.5656622581
260.000.000.0000.000.000.000.000.000.00...0.0000.1960.0000.3920.1960.0005.46622821
270.000.000.0000.000.001.660.000.000.00...0.0000.0000.0000.3680.0000.0002.61112471
280.000.000.0000.000.000.000.000.000.00...0.0000.3520.0000.3520.0000.0004.00011361
290.000.000.0000.650.000.650.000.000.00...0.0000.4590.0000.0910.0000.0002.687661291
..................................................................
45710.000.000.4600.230.230.000.000.000.00...0.0000.0820.0000.0820.0000.0001.2565980
45720.000.000.0000.000.000.000.000.000.00...0.0000.2540.0000.0000.0000.0001.0001130
45730.000.000.1800.180.180.000.000.000.00...0.0330.0330.0000.0990.0000.0001.489111370
45740.290.000.2900.000.000.000.000.000.29...0.0000.1070.0000.0000.0000.0001.2206610
45750.000.000.0000.000.000.000.000.001.38...0.0000.2130.0000.0000.0000.0001.72011430
45760.000.000.0000.000.000.000.000.000.00...0.0000.1310.0000.0000.0000.0001.4885640
45770.000.001.2000.000.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.2003240
45780.000.000.4000.000.000.000.000.000.00...0.0000.0000.1450.0000.0000.0001.3725700
45790.270.050.1000.000.000.000.000.000.00...0.6070.0640.0360.0550.0000.2023.7664317890
45800.000.000.0000.000.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.5715110
45810.000.000.0000.000.510.000.000.000.00...0.0000.0910.0000.0910.0000.0001.5864460
45820.000.000.0000.000.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.2663190
45830.000.001.2300.000.000.000.000.000.00...0.0000.0000.4060.0000.0000.0001.66613700
45840.000.000.4500.000.220.000.000.000.00...0.0000.0820.0000.0410.0000.0001.50071230
45850.000.000.0000.000.000.000.000.000.00...0.0000.6250.0000.0000.0000.0001.3754110
45860.000.000.0000.360.000.000.000.000.00...0.0000.1120.0000.0000.0000.0561.793211740
45870.000.000.0000.000.000.000.000.000.00...0.0000.1250.0000.0000.1250.0001.2724280
45880.000.003.0300.000.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.1112100
45890.000.000.0000.540.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.0001220
45900.000.000.0000.000.000.000.000.000.00...0.0000.1850.0000.0000.0000.0922.46811790
45910.000.000.0000.000.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.000180
45920.000.001.2502.500.000.000.000.000.00...0.0000.1110.0000.0000.0000.0001.2854270
45930.000.000.0000.000.000.000.000.000.00...0.0000.0000.0001.0520.0000.0001.000160
45940.000.000.0000.000.000.000.000.000.00...0.0000.6300.0000.0000.0000.0001.7275190
45950.000.001.1900.000.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.0001240
45960.310.000.6200.000.310.000.000.000.00...0.0000.2320.0000.0000.0000.0001.1423880
45970.000.000.0000.000.000.000.000.000.00...0.0000.0000.0000.3530.0000.0001.5554140
45980.300.000.3000.000.000.000.000.000.00...0.1020.7180.0000.0000.0000.0001.40461180
45990.960.000.0000.320.000.000.000.000.00...0.0000.0570.0000.0000.0000.0001.1475780
46000.000.000.6500.000.000.000.000.000.00...0.0000.0000.0000.1250.0000.0001.2505400
\n", + "

4601 rows × 58 columns

\n", + "
" + ], + "text/plain": [ + " word_freq_make word_freq_address word_freq_all word_freq_3d \\\n", + "0 0.00 0.64 0.64 0 \n", + "1 0.21 0.28 0.50 0 \n", + "2 0.06 0.00 0.71 0 \n", + "3 0.00 0.00 0.00 0 \n", + "4 0.00 0.00 0.00 0 \n", + "5 0.00 0.00 0.00 0 \n", + "6 0.00 0.00 0.00 0 \n", + "7 0.00 0.00 0.00 0 \n", + "8 0.15 0.00 0.46 0 \n", + "9 0.06 0.12 0.77 0 \n", + "10 0.00 0.00 0.00 0 \n", + "11 0.00 0.00 0.25 0 \n", + "12 0.00 0.69 0.34 0 \n", + "13 0.00 0.00 0.00 0 \n", + "14 0.00 0.00 1.42 0 \n", + "15 0.00 0.42 0.42 0 \n", + "16 0.00 0.00 0.00 0 \n", + "17 0.00 0.00 0.00 0 \n", + "18 0.00 0.00 0.55 0 \n", + "19 0.00 0.63 0.00 0 \n", + "20 0.00 0.00 0.00 0 \n", + "21 0.05 0.07 0.10 0 \n", + "22 0.00 0.00 0.00 0 \n", + "23 0.00 0.00 0.00 0 \n", + "24 0.00 0.00 0.00 0 \n", + "25 0.05 0.07 0.10 0 \n", + "26 0.00 0.00 0.00 0 \n", + "27 0.00 0.00 0.00 0 \n", + "28 0.00 0.00 0.00 0 \n", + "29 0.00 0.00 0.00 0 \n", + "... ... ... ... ... \n", + "4571 0.00 0.00 0.46 0 \n", + "4572 0.00 0.00 0.00 0 \n", + "4573 0.00 0.00 0.18 0 \n", + "4574 0.29 0.00 0.29 0 \n", + "4575 0.00 0.00 0.00 0 \n", + "4576 0.00 0.00 0.00 0 \n", + "4577 0.00 0.00 1.20 0 \n", + "4578 0.00 0.00 0.40 0 \n", + "4579 0.27 0.05 0.10 0 \n", + "4580 0.00 0.00 0.00 0 \n", + "4581 0.00 0.00 0.00 0 \n", + "4582 0.00 0.00 0.00 0 \n", + "4583 0.00 0.00 1.23 0 \n", + "4584 0.00 0.00 0.45 0 \n", + "4585 0.00 0.00 0.00 0 \n", + "4586 0.00 0.00 0.00 0 \n", + "4587 0.00 0.00 0.00 0 \n", + "4588 0.00 0.00 3.03 0 \n", + "4589 0.00 0.00 0.00 0 \n", + "4590 0.00 0.00 0.00 0 \n", + "4591 0.00 0.00 0.00 0 \n", + "4592 0.00 0.00 1.25 0 \n", + "4593 0.00 0.00 0.00 0 \n", + "4594 0.00 0.00 0.00 0 \n", + "4595 0.00 0.00 1.19 0 \n", + "4596 0.31 0.00 0.62 0 \n", + "4597 0.00 0.00 0.00 0 \n", + "4598 0.30 0.00 0.30 0 \n", + "4599 0.96 0.00 0.00 0 \n", + "4600 0.00 0.00 0.65 0 \n", + "\n", + " word_freq_our word_freq_over word_freq_remove word_freq_internet \\\n", + "0 0.32 0.00 0.00 0.00 \n", + "1 0.14 0.28 0.21 0.07 \n", + 
"2 1.23 0.19 0.19 0.12 \n", + "3 0.63 0.00 0.31 0.63 \n", + "4 0.63 0.00 0.31 0.63 \n", + "5 1.85 0.00 0.00 1.85 \n", + "6 1.92 0.00 0.00 0.00 \n", + "7 1.88 0.00 0.00 1.88 \n", + "8 0.61 0.00 0.30 0.00 \n", + "9 0.19 0.32 0.38 0.00 \n", + "10 0.00 0.00 0.96 0.00 \n", + "11 0.38 0.25 0.25 0.00 \n", + "12 0.34 0.00 0.00 0.00 \n", + "13 0.90 0.00 0.90 0.00 \n", + "14 0.71 0.35 0.00 0.35 \n", + "15 1.27 0.00 0.42 0.00 \n", + "16 0.94 0.00 0.00 0.00 \n", + "17 0.00 0.00 0.00 0.00 \n", + "18 1.11 0.00 0.18 0.00 \n", + "19 1.59 0.31 0.00 0.00 \n", + "20 0.00 0.00 0.00 0.00 \n", + "21 0.76 0.05 0.15 0.02 \n", + "22 2.94 0.00 0.00 0.00 \n", + "23 1.16 0.00 0.00 0.00 \n", + "24 0.00 0.00 0.00 0.00 \n", + "25 0.76 0.05 0.15 0.02 \n", + "26 0.00 0.00 0.00 0.00 \n", + "27 0.00 0.00 1.66 0.00 \n", + "28 0.00 0.00 0.00 0.00 \n", + "29 0.65 0.00 0.65 0.00 \n", + "... ... ... ... ... \n", + "4571 0.23 0.23 0.00 0.00 \n", + "4572 0.00 0.00 0.00 0.00 \n", + "4573 0.18 0.18 0.00 0.00 \n", + "4574 0.00 0.00 0.00 0.00 \n", + "4575 0.00 0.00 0.00 0.00 \n", + "4576 0.00 0.00 0.00 0.00 \n", + "4577 0.00 0.00 0.00 0.00 \n", + "4578 0.00 0.00 0.00 0.00 \n", + "4579 0.00 0.00 0.00 0.00 \n", + "4580 0.00 0.00 0.00 0.00 \n", + "4581 0.00 0.51 0.00 0.00 \n", + "4582 0.00 0.00 0.00 0.00 \n", + "4583 0.00 0.00 0.00 0.00 \n", + "4584 0.00 0.22 0.00 0.00 \n", + "4585 0.00 0.00 0.00 0.00 \n", + "4586 0.36 0.00 0.00 0.00 \n", + "4587 0.00 0.00 0.00 0.00 \n", + "4588 0.00 0.00 0.00 0.00 \n", + "4589 0.54 0.00 0.00 0.00 \n", + "4590 0.00 0.00 0.00 0.00 \n", + "4591 0.00 0.00 0.00 0.00 \n", + "4592 2.50 0.00 0.00 0.00 \n", + "4593 0.00 0.00 0.00 0.00 \n", + "4594 0.00 0.00 0.00 0.00 \n", + "4595 0.00 0.00 0.00 0.00 \n", + "4596 0.00 0.31 0.00 0.00 \n", + "4597 0.00 0.00 0.00 0.00 \n", + "4598 0.00 0.00 0.00 0.00 \n", + "4599 0.32 0.00 0.00 0.00 \n", + "4600 0.00 0.00 0.00 0.00 \n", + "\n", + " word_freq_order word_freq_mail ... char_freq_; char_freq_( \\\n", + "0 0.00 0.00 ... 
0.000 0.000 \n", + "1 0.00 0.94 ... 0.000 0.132 \n", + "2 0.64 0.25 ... 0.010 0.143 \n", + "3 0.31 0.63 ... 0.000 0.137 \n", + "4 0.31 0.63 ... 0.000 0.135 \n", + "5 0.00 0.00 ... 0.000 0.223 \n", + "6 0.00 0.64 ... 0.000 0.054 \n", + "7 0.00 0.00 ... 0.000 0.206 \n", + "8 0.92 0.76 ... 0.000 0.271 \n", + "9 0.06 0.00 ... 0.040 0.030 \n", + "10 0.00 1.92 ... 0.000 0.000 \n", + "11 0.00 0.00 ... 0.022 0.044 \n", + "12 0.00 0.00 ... 0.000 0.056 \n", + "13 0.00 0.90 ... 0.000 0.000 \n", + "14 0.00 0.71 ... 0.000 0.102 \n", + "15 0.00 1.27 ... 0.000 0.063 \n", + "16 0.00 0.00 ... 0.000 0.000 \n", + "17 0.00 0.00 ... 0.000 0.000 \n", + "18 0.00 0.00 ... 0.000 0.182 \n", + "19 0.31 0.00 ... 0.000 0.275 \n", + "20 0.00 0.00 ... 0.000 0.729 \n", + "21 0.55 0.00 ... 0.042 0.101 \n", + "22 0.00 0.00 ... 0.404 0.404 \n", + "23 0.00 0.00 ... 0.000 0.133 \n", + "24 0.00 0.00 ... 0.000 0.196 \n", + "25 0.55 0.00 ... 0.042 0.101 \n", + "26 0.00 0.00 ... 0.000 0.196 \n", + "27 0.00 0.00 ... 0.000 0.000 \n", + "28 0.00 0.00 ... 0.000 0.352 \n", + "29 0.00 0.00 ... 0.000 0.459 \n", + "... ... ... ... ... ... \n", + "4571 0.00 0.00 ... 0.000 0.082 \n", + "4572 0.00 0.00 ... 0.000 0.254 \n", + "4573 0.00 0.00 ... 0.033 0.033 \n", + "4574 0.00 0.29 ... 0.000 0.107 \n", + "4575 0.00 1.38 ... 0.000 0.213 \n", + "4576 0.00 0.00 ... 0.000 0.131 \n", + "4577 0.00 0.00 ... 0.000 0.000 \n", + "4578 0.00 0.00 ... 0.000 0.000 \n", + "4579 0.00 0.00 ... 0.607 0.064 \n", + "4580 0.00 0.00 ... 0.000 0.000 \n", + "4581 0.00 0.00 ... 0.000 0.091 \n", + "4582 0.00 0.00 ... 0.000 0.000 \n", + "4583 0.00 0.00 ... 0.000 0.000 \n", + "4584 0.00 0.00 ... 0.000 0.082 \n", + "4585 0.00 0.00 ... 0.000 0.625 \n", + "4586 0.00 0.00 ... 0.000 0.112 \n", + "4587 0.00 0.00 ... 0.000 0.125 \n", + "4588 0.00 0.00 ... 0.000 0.000 \n", + "4589 0.00 0.00 ... 0.000 0.000 \n", + "4590 0.00 0.00 ... 0.000 0.185 \n", + "4591 0.00 0.00 ... 0.000 0.000 \n", + "4592 0.00 0.00 ... 0.000 0.111 \n", + "4593 0.00 0.00 ... 
0.000 0.000 \n", + "4594 0.00 0.00 ... 0.000 0.630 \n", + "4595 0.00 0.00 ... 0.000 0.000 \n", + "4596 0.00 0.00 ... 0.000 0.232 \n", + "4597 0.00 0.00 ... 0.000 0.000 \n", + "4598 0.00 0.00 ... 0.102 0.718 \n", + "4599 0.00 0.00 ... 0.000 0.057 \n", + "4600 0.00 0.00 ... 0.000 0.000 \n", + "\n", + " char_freq_[ char_freq_! char_freq_$ char_freq_# \\\n", + "0 0.000 0.778 0.000 0.000 \n", + "1 0.000 0.372 0.180 0.048 \n", + "2 0.000 0.276 0.184 0.010 \n", + "3 0.000 0.137 0.000 0.000 \n", + "4 0.000 0.135 0.000 0.000 \n", + "5 0.000 0.000 0.000 0.000 \n", + "6 0.000 0.164 0.054 0.000 \n", + "7 0.000 0.000 0.000 0.000 \n", + "8 0.000 0.181 0.203 0.022 \n", + "9 0.000 0.244 0.081 0.000 \n", + "10 0.000 0.462 0.000 0.000 \n", + "11 0.000 0.663 0.000 0.000 \n", + "12 0.000 0.786 0.000 0.000 \n", + "13 0.000 0.000 0.000 0.000 \n", + "14 0.000 0.357 0.000 0.000 \n", + "15 0.000 0.572 0.063 0.000 \n", + "16 0.000 0.428 0.000 0.000 \n", + "17 0.000 1.975 0.370 0.000 \n", + "18 0.000 0.455 0.000 0.000 \n", + "19 0.000 0.055 0.496 0.000 \n", + "20 0.000 0.729 0.000 0.000 \n", + "21 0.016 0.250 0.046 0.059 \n", + "22 0.000 0.809 0.000 0.000 \n", + "23 0.000 0.667 0.000 0.000 \n", + "24 0.000 0.392 0.196 0.000 \n", + "25 0.016 0.250 0.046 0.059 \n", + "26 0.000 0.392 0.196 0.000 \n", + "27 0.000 0.368 0.000 0.000 \n", + "28 0.000 0.352 0.000 0.000 \n", + "29 0.000 0.091 0.000 0.000 \n", + "... ... ... ... ... 
\n", + "4571 0.000 0.082 0.000 0.000 \n", + "4572 0.000 0.000 0.000 0.000 \n", + "4573 0.000 0.099 0.000 0.000 \n", + "4574 0.000 0.000 0.000 0.000 \n", + "4575 0.000 0.000 0.000 0.000 \n", + "4576 0.000 0.000 0.000 0.000 \n", + "4577 0.000 0.000 0.000 0.000 \n", + "4578 0.145 0.000 0.000 0.000 \n", + "4579 0.036 0.055 0.000 0.202 \n", + "4580 0.000 0.000 0.000 0.000 \n", + "4581 0.000 0.091 0.000 0.000 \n", + "4582 0.000 0.000 0.000 0.000 \n", + "4583 0.406 0.000 0.000 0.000 \n", + "4584 0.000 0.041 0.000 0.000 \n", + "4585 0.000 0.000 0.000 0.000 \n", + "4586 0.000 0.000 0.000 0.056 \n", + "4587 0.000 0.000 0.125 0.000 \n", + "4588 0.000 0.000 0.000 0.000 \n", + "4589 0.000 0.000 0.000 0.000 \n", + "4590 0.000 0.000 0.000 0.092 \n", + "4591 0.000 0.000 0.000 0.000 \n", + "4592 0.000 0.000 0.000 0.000 \n", + "4593 0.000 1.052 0.000 0.000 \n", + "4594 0.000 0.000 0.000 0.000 \n", + "4595 0.000 0.000 0.000 0.000 \n", + "4596 0.000 0.000 0.000 0.000 \n", + "4597 0.000 0.353 0.000 0.000 \n", + "4598 0.000 0.000 0.000 0.000 \n", + "4599 0.000 0.000 0.000 0.000 \n", + "4600 0.000 0.125 0.000 0.000 \n", + "\n", + " capital_run_length_average capital_run_length_longest \\\n", + "0 3.756 61 \n", + "1 5.114 101 \n", + "2 9.821 485 \n", + "3 3.537 40 \n", + "4 3.537 40 \n", + "5 3.000 15 \n", + "6 1.671 4 \n", + "7 2.450 11 \n", + "8 9.744 445 \n", + "9 1.729 43 \n", + "10 1.312 6 \n", + "11 1.243 11 \n", + "12 3.728 61 \n", + "13 2.083 7 \n", + "14 1.971 24 \n", + "15 5.659 55 \n", + "16 4.652 31 \n", + "17 35.461 95 \n", + "18 1.320 4 \n", + "19 3.509 91 \n", + "20 3.833 9 \n", + "21 2.569 66 \n", + "22 4.857 12 \n", + "23 1.131 5 \n", + "24 5.466 22 \n", + "25 2.565 66 \n", + "26 5.466 22 \n", + "27 2.611 12 \n", + "28 4.000 11 \n", + "29 2.687 66 \n", + "... ... ... 
\n", + "4571 1.256 5 \n", + "4572 1.000 1 \n", + "4573 1.489 11 \n", + "4574 1.220 6 \n", + "4575 1.720 11 \n", + "4576 1.488 5 \n", + "4577 1.200 3 \n", + "4578 1.372 5 \n", + "4579 3.766 43 \n", + "4580 1.571 5 \n", + "4581 1.586 4 \n", + "4582 1.266 3 \n", + "4583 1.666 13 \n", + "4584 1.500 7 \n", + "4585 1.375 4 \n", + "4586 1.793 21 \n", + "4587 1.272 4 \n", + "4588 1.111 2 \n", + "4589 1.000 1 \n", + "4590 2.468 11 \n", + "4591 1.000 1 \n", + "4592 1.285 4 \n", + "4593 1.000 1 \n", + "4594 1.727 5 \n", + "4595 1.000 1 \n", + "4596 1.142 3 \n", + "4597 1.555 4 \n", + "4598 1.404 6 \n", + "4599 1.147 5 \n", + "4600 1.250 5 \n", + "\n", + " capital_run_length_total spam_flag \n", + "0 278 1 \n", + "1 1028 1 \n", + "2 2259 1 \n", + "3 191 1 \n", + "4 191 1 \n", + "5 54 1 \n", + "6 112 1 \n", + "7 49 1 \n", + "8 1257 1 \n", + "9 749 1 \n", + "10 21 1 \n", + "11 184 1 \n", + "12 261 1 \n", + "13 25 1 \n", + "14 205 1 \n", + "15 249 1 \n", + "16 107 1 \n", + "17 461 1 \n", + "18 70 1 \n", + "19 186 1 \n", + "20 23 1 \n", + "21 2259 1 \n", + "22 34 1 \n", + "23 69 1 \n", + "24 82 1 \n", + "25 2258 1 \n", + "26 82 1 \n", + "27 47 1 \n", + "28 36 1 \n", + "29 129 1 \n", + "... ... ... 
\n", + "4571 98 0 \n", + "4572 13 0 \n", + "4573 137 0 \n", + "4574 61 0 \n", + "4575 43 0 \n", + "4576 64 0 \n", + "4577 24 0 \n", + "4578 70 0 \n", + "4579 1789 0 \n", + "4580 11 0 \n", + "4581 46 0 \n", + "4582 19 0 \n", + "4583 70 0 \n", + "4584 123 0 \n", + "4585 11 0 \n", + "4586 174 0 \n", + "4587 28 0 \n", + "4588 10 0 \n", + "4589 22 0 \n", + "4590 79 0 \n", + "4591 8 0 \n", + "4592 27 0 \n", + "4593 6 0 \n", + "4594 19 0 \n", + "4595 24 0 \n", + "4596 88 0 \n", + "4597 14 0 \n", + "4598 118 0 \n", + "4599 78 0 \n", + "4600 40 0 \n", + "\n", + "[4601 rows x 58 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 334072ac28c8649eeea8793a07857fdb9dd73e7d Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Wed, 24 Jun 2015 16:13:39 -0400 Subject: [PATCH 3/6] split data into train and test subsets --- Spam_Detector.ipynb | 1896 +------------------------------------------ 1 file changed, 16 insertions(+), 1880 deletions(-) diff --git a/Spam_Detector.ipynb b/Spam_Detector.ipynb index 0495ff6..27c3e02 100644 --- a/Spam_Detector.ipynb +++ b/Spam_Detector.ipynb @@ -120,1895 +120,31 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train = data.sample(frac=.7)\n", + "data.index\n", + "test = data.drop(train.index)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
word_freq_makeword_freq_addressword_freq_allword_freq_3dword_freq_ourword_freq_overword_freq_removeword_freq_internetword_freq_orderword_freq_mail...char_freq_;char_freq_(char_freq_[char_freq_!char_freq_$char_freq_#capital_run_length_averagecapital_run_length_longestcapital_run_length_totalspam_flag
00.000.640.6400.320.000.000.000.000.00...0.0000.0000.0000.7780.0000.0003.756612781
10.210.280.5000.140.280.210.070.000.94...0.0000.1320.0000.3720.1800.0485.11410110281
20.060.000.7101.230.190.190.120.640.25...0.0100.1430.0000.2760.1840.0109.82148522591
30.000.000.0000.630.000.310.630.310.63...0.0000.1370.0000.1370.0000.0003.537401911
40.000.000.0000.630.000.310.630.310.63...0.0000.1350.0000.1350.0000.0003.537401911
50.000.000.0001.850.000.001.850.000.00...0.0000.2230.0000.0000.0000.0003.00015541
60.000.000.0001.920.000.000.000.000.64...0.0000.0540.0000.1640.0540.0001.67141121
70.000.000.0001.880.000.001.880.000.00...0.0000.2060.0000.0000.0000.0002.45011491
80.150.000.4600.610.000.300.000.920.76...0.0000.2710.0000.1810.2030.0229.74444512571
90.060.120.7700.190.320.380.000.060.00...0.0400.0300.0000.2440.0810.0001.729437491
100.000.000.0000.000.000.960.000.001.92...0.0000.0000.0000.4620.0000.0001.3126211
110.000.000.2500.380.250.250.000.000.00...0.0220.0440.0000.6630.0000.0001.243111841
120.000.690.3400.340.000.000.000.000.00...0.0000.0560.0000.7860.0000.0003.728612611
130.000.000.0000.900.000.900.000.000.90...0.0000.0000.0000.0000.0000.0002.0837251
140.000.001.4200.710.350.000.350.000.71...0.0000.1020.0000.3570.0000.0001.971242051
150.000.420.4201.270.000.420.000.001.27...0.0000.0630.0000.5720.0630.0005.659552491
160.000.000.0000.940.000.000.000.000.00...0.0000.0000.0000.4280.0000.0004.652311071
170.000.000.0000.000.000.000.000.000.00...0.0000.0000.0001.9750.3700.00035.461954611
180.000.000.5501.110.000.180.000.000.00...0.0000.1820.0000.4550.0000.0001.3204701
190.000.630.0001.590.310.000.000.310.00...0.0000.2750.0000.0550.4960.0003.509911861
200.000.000.0000.000.000.000.000.000.00...0.0000.7290.0000.7290.0000.0003.8339231
210.050.070.1000.760.050.150.020.550.00...0.0420.1010.0160.2500.0460.0592.5696622591
220.000.000.0002.940.000.000.000.000.00...0.4040.4040.0000.8090.0000.0004.85712341
230.000.000.0001.160.000.000.000.000.00...0.0000.1330.0000.6670.0000.0001.1315691
240.000.000.0000.000.000.000.000.000.00...0.0000.1960.0000.3920.1960.0005.46622821
250.050.070.1000.760.050.150.020.550.00...0.0420.1010.0160.2500.0460.0592.5656622581
260.000.000.0000.000.000.000.000.000.00...0.0000.1960.0000.3920.1960.0005.46622821
270.000.000.0000.000.001.660.000.000.00...0.0000.0000.0000.3680.0000.0002.61112471
280.000.000.0000.000.000.000.000.000.00...0.0000.3520.0000.3520.0000.0004.00011361
290.000.000.0000.650.000.650.000.000.00...0.0000.4590.0000.0910.0000.0002.687661291
..................................................................
45710.000.000.4600.230.230.000.000.000.00...0.0000.0820.0000.0820.0000.0001.2565980
45720.000.000.0000.000.000.000.000.000.00...0.0000.2540.0000.0000.0000.0001.0001130
45730.000.000.1800.180.180.000.000.000.00...0.0330.0330.0000.0990.0000.0001.489111370
45740.290.000.2900.000.000.000.000.000.29...0.0000.1070.0000.0000.0000.0001.2206610
45750.000.000.0000.000.000.000.000.001.38...0.0000.2130.0000.0000.0000.0001.72011430
45760.000.000.0000.000.000.000.000.000.00...0.0000.1310.0000.0000.0000.0001.4885640
45770.000.001.2000.000.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.2003240
45780.000.000.4000.000.000.000.000.000.00...0.0000.0000.1450.0000.0000.0001.3725700
45790.270.050.1000.000.000.000.000.000.00...0.6070.0640.0360.0550.0000.2023.7664317890
45800.000.000.0000.000.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.5715110
45810.000.000.0000.000.510.000.000.000.00...0.0000.0910.0000.0910.0000.0001.5864460
45820.000.000.0000.000.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.2663190
45830.000.001.2300.000.000.000.000.000.00...0.0000.0000.4060.0000.0000.0001.66613700
45840.000.000.4500.000.220.000.000.000.00...0.0000.0820.0000.0410.0000.0001.50071230
45850.000.000.0000.000.000.000.000.000.00...0.0000.6250.0000.0000.0000.0001.3754110
45860.000.000.0000.360.000.000.000.000.00...0.0000.1120.0000.0000.0000.0561.793211740
45870.000.000.0000.000.000.000.000.000.00...0.0000.1250.0000.0000.1250.0001.2724280
45880.000.003.0300.000.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.1112100
45890.000.000.0000.540.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.0001220
45900.000.000.0000.000.000.000.000.000.00...0.0000.1850.0000.0000.0000.0922.46811790
45910.000.000.0000.000.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.000180
45920.000.001.2502.500.000.000.000.000.00...0.0000.1110.0000.0000.0000.0001.2854270
45930.000.000.0000.000.000.000.000.000.00...0.0000.0000.0001.0520.0000.0001.000160
45940.000.000.0000.000.000.000.000.000.00...0.0000.6300.0000.0000.0000.0001.7275190
45950.000.001.1900.000.000.000.000.000.00...0.0000.0000.0000.0000.0000.0001.0001240
45960.310.000.6200.000.310.000.000.000.00...0.0000.2320.0000.0000.0000.0001.1423880
45970.000.000.0000.000.000.000.000.000.00...0.0000.0000.0000.3530.0000.0001.5554140
45980.300.000.3000.000.000.000.000.000.00...0.1020.7180.0000.0000.0000.0001.40461180
45990.960.000.0000.320.000.000.000.000.00...0.0000.0570.0000.0000.0000.0001.1475780
46000.000.000.6500.000.000.000.000.000.00...0.0000.0000.0000.1250.0000.0001.2505400
\n", - "

4601 rows × 58 columns

\n", - "
" - ], "text/plain": [ - " word_freq_make word_freq_address word_freq_all word_freq_3d \\\n", - "0 0.00 0.64 0.64 0 \n", - "1 0.21 0.28 0.50 0 \n", - "2 0.06 0.00 0.71 0 \n", - "3 0.00 0.00 0.00 0 \n", - "4 0.00 0.00 0.00 0 \n", - "5 0.00 0.00 0.00 0 \n", - "6 0.00 0.00 0.00 0 \n", - "7 0.00 0.00 0.00 0 \n", - "8 0.15 0.00 0.46 0 \n", - "9 0.06 0.12 0.77 0 \n", - "10 0.00 0.00 0.00 0 \n", - "11 0.00 0.00 0.25 0 \n", - "12 0.00 0.69 0.34 0 \n", - "13 0.00 0.00 0.00 0 \n", - "14 0.00 0.00 1.42 0 \n", - "15 0.00 0.42 0.42 0 \n", - "16 0.00 0.00 0.00 0 \n", - "17 0.00 0.00 0.00 0 \n", - "18 0.00 0.00 0.55 0 \n", - "19 0.00 0.63 0.00 0 \n", - "20 0.00 0.00 0.00 0 \n", - "21 0.05 0.07 0.10 0 \n", - "22 0.00 0.00 0.00 0 \n", - "23 0.00 0.00 0.00 0 \n", - "24 0.00 0.00 0.00 0 \n", - "25 0.05 0.07 0.10 0 \n", - "26 0.00 0.00 0.00 0 \n", - "27 0.00 0.00 0.00 0 \n", - "28 0.00 0.00 0.00 0 \n", - "29 0.00 0.00 0.00 0 \n", - "... ... ... ... ... \n", - "4571 0.00 0.00 0.46 0 \n", - "4572 0.00 0.00 0.00 0 \n", - "4573 0.00 0.00 0.18 0 \n", - "4574 0.29 0.00 0.29 0 \n", - "4575 0.00 0.00 0.00 0 \n", - "4576 0.00 0.00 0.00 0 \n", - "4577 0.00 0.00 1.20 0 \n", - "4578 0.00 0.00 0.40 0 \n", - "4579 0.27 0.05 0.10 0 \n", - "4580 0.00 0.00 0.00 0 \n", - "4581 0.00 0.00 0.00 0 \n", - "4582 0.00 0.00 0.00 0 \n", - "4583 0.00 0.00 1.23 0 \n", - "4584 0.00 0.00 0.45 0 \n", - "4585 0.00 0.00 0.00 0 \n", - "4586 0.00 0.00 0.00 0 \n", - "4587 0.00 0.00 0.00 0 \n", - "4588 0.00 0.00 3.03 0 \n", - "4589 0.00 0.00 0.00 0 \n", - "4590 0.00 0.00 0.00 0 \n", - "4591 0.00 0.00 0.00 0 \n", - "4592 0.00 0.00 1.25 0 \n", - "4593 0.00 0.00 0.00 0 \n", - "4594 0.00 0.00 0.00 0 \n", - "4595 0.00 0.00 1.19 0 \n", - "4596 0.31 0.00 0.62 0 \n", - "4597 0.00 0.00 0.00 0 \n", - "4598 0.30 0.00 0.30 0 \n", - "4599 0.96 0.00 0.00 0 \n", - "4600 0.00 0.00 0.65 0 \n", - "\n", - " word_freq_our word_freq_over word_freq_remove word_freq_internet \\\n", - "0 0.32 0.00 0.00 0.00 \n", - "1 0.14 0.28 0.21 0.07 \n", - 
"2 1.23 0.19 0.19 0.12 \n", - "3 0.63 0.00 0.31 0.63 \n", - "4 0.63 0.00 0.31 0.63 \n", - "5 1.85 0.00 0.00 1.85 \n", - "6 1.92 0.00 0.00 0.00 \n", - "7 1.88 0.00 0.00 1.88 \n", - "8 0.61 0.00 0.30 0.00 \n", - "9 0.19 0.32 0.38 0.00 \n", - "10 0.00 0.00 0.96 0.00 \n", - "11 0.38 0.25 0.25 0.00 \n", - "12 0.34 0.00 0.00 0.00 \n", - "13 0.90 0.00 0.90 0.00 \n", - "14 0.71 0.35 0.00 0.35 \n", - "15 1.27 0.00 0.42 0.00 \n", - "16 0.94 0.00 0.00 0.00 \n", - "17 0.00 0.00 0.00 0.00 \n", - "18 1.11 0.00 0.18 0.00 \n", - "19 1.59 0.31 0.00 0.00 \n", - "20 0.00 0.00 0.00 0.00 \n", - "21 0.76 0.05 0.15 0.02 \n", - "22 2.94 0.00 0.00 0.00 \n", - "23 1.16 0.00 0.00 0.00 \n", - "24 0.00 0.00 0.00 0.00 \n", - "25 0.76 0.05 0.15 0.02 \n", - "26 0.00 0.00 0.00 0.00 \n", - "27 0.00 0.00 1.66 0.00 \n", - "28 0.00 0.00 0.00 0.00 \n", - "29 0.65 0.00 0.65 0.00 \n", - "... ... ... ... ... \n", - "4571 0.23 0.23 0.00 0.00 \n", - "4572 0.00 0.00 0.00 0.00 \n", - "4573 0.18 0.18 0.00 0.00 \n", - "4574 0.00 0.00 0.00 0.00 \n", - "4575 0.00 0.00 0.00 0.00 \n", - "4576 0.00 0.00 0.00 0.00 \n", - "4577 0.00 0.00 0.00 0.00 \n", - "4578 0.00 0.00 0.00 0.00 \n", - "4579 0.00 0.00 0.00 0.00 \n", - "4580 0.00 0.00 0.00 0.00 \n", - "4581 0.00 0.51 0.00 0.00 \n", - "4582 0.00 0.00 0.00 0.00 \n", - "4583 0.00 0.00 0.00 0.00 \n", - "4584 0.00 0.22 0.00 0.00 \n", - "4585 0.00 0.00 0.00 0.00 \n", - "4586 0.36 0.00 0.00 0.00 \n", - "4587 0.00 0.00 0.00 0.00 \n", - "4588 0.00 0.00 0.00 0.00 \n", - "4589 0.54 0.00 0.00 0.00 \n", - "4590 0.00 0.00 0.00 0.00 \n", - "4591 0.00 0.00 0.00 0.00 \n", - "4592 2.50 0.00 0.00 0.00 \n", - "4593 0.00 0.00 0.00 0.00 \n", - "4594 0.00 0.00 0.00 0.00 \n", - "4595 0.00 0.00 0.00 0.00 \n", - "4596 0.00 0.31 0.00 0.00 \n", - "4597 0.00 0.00 0.00 0.00 \n", - "4598 0.00 0.00 0.00 0.00 \n", - "4599 0.32 0.00 0.00 0.00 \n", - "4600 0.00 0.00 0.00 0.00 \n", - "\n", - " word_freq_order word_freq_mail ... char_freq_; char_freq_( \\\n", - "0 0.00 0.00 ... 
0.000 0.000 \n", - "1 0.00 0.94 ... 0.000 0.132 \n", - "2 0.64 0.25 ... 0.010 0.143 \n", - "3 0.31 0.63 ... 0.000 0.137 \n", - "4 0.31 0.63 ... 0.000 0.135 \n", - "5 0.00 0.00 ... 0.000 0.223 \n", - "6 0.00 0.64 ... 0.000 0.054 \n", - "7 0.00 0.00 ... 0.000 0.206 \n", - "8 0.92 0.76 ... 0.000 0.271 \n", - "9 0.06 0.00 ... 0.040 0.030 \n", - "10 0.00 1.92 ... 0.000 0.000 \n", - "11 0.00 0.00 ... 0.022 0.044 \n", - "12 0.00 0.00 ... 0.000 0.056 \n", - "13 0.00 0.90 ... 0.000 0.000 \n", - "14 0.00 0.71 ... 0.000 0.102 \n", - "15 0.00 1.27 ... 0.000 0.063 \n", - "16 0.00 0.00 ... 0.000 0.000 \n", - "17 0.00 0.00 ... 0.000 0.000 \n", - "18 0.00 0.00 ... 0.000 0.182 \n", - "19 0.31 0.00 ... 0.000 0.275 \n", - "20 0.00 0.00 ... 0.000 0.729 \n", - "21 0.55 0.00 ... 0.042 0.101 \n", - "22 0.00 0.00 ... 0.404 0.404 \n", - "23 0.00 0.00 ... 0.000 0.133 \n", - "24 0.00 0.00 ... 0.000 0.196 \n", - "25 0.55 0.00 ... 0.042 0.101 \n", - "26 0.00 0.00 ... 0.000 0.196 \n", - "27 0.00 0.00 ... 0.000 0.000 \n", - "28 0.00 0.00 ... 0.000 0.352 \n", - "29 0.00 0.00 ... 0.000 0.459 \n", - "... ... ... ... ... ... \n", - "4571 0.00 0.00 ... 0.000 0.082 \n", - "4572 0.00 0.00 ... 0.000 0.254 \n", - "4573 0.00 0.00 ... 0.033 0.033 \n", - "4574 0.00 0.29 ... 0.000 0.107 \n", - "4575 0.00 1.38 ... 0.000 0.213 \n", - "4576 0.00 0.00 ... 0.000 0.131 \n", - "4577 0.00 0.00 ... 0.000 0.000 \n", - "4578 0.00 0.00 ... 0.000 0.000 \n", - "4579 0.00 0.00 ... 0.607 0.064 \n", - "4580 0.00 0.00 ... 0.000 0.000 \n", - "4581 0.00 0.00 ... 0.000 0.091 \n", - "4582 0.00 0.00 ... 0.000 0.000 \n", - "4583 0.00 0.00 ... 0.000 0.000 \n", - "4584 0.00 0.00 ... 0.000 0.082 \n", - "4585 0.00 0.00 ... 0.000 0.625 \n", - "4586 0.00 0.00 ... 0.000 0.112 \n", - "4587 0.00 0.00 ... 0.000 0.125 \n", - "4588 0.00 0.00 ... 0.000 0.000 \n", - "4589 0.00 0.00 ... 0.000 0.000 \n", - "4590 0.00 0.00 ... 0.000 0.185 \n", - "4591 0.00 0.00 ... 0.000 0.000 \n", - "4592 0.00 0.00 ... 0.000 0.111 \n", - "4593 0.00 0.00 ... 
0.000 0.000 \n", - "4594 0.00 0.00 ... 0.000 0.630 \n", - "4595 0.00 0.00 ... 0.000 0.000 \n", - "4596 0.00 0.00 ... 0.000 0.232 \n", - "4597 0.00 0.00 ... 0.000 0.000 \n", - "4598 0.00 0.00 ... 0.102 0.718 \n", - "4599 0.00 0.00 ... 0.000 0.057 \n", - "4600 0.00 0.00 ... 0.000 0.000 \n", - "\n", - " char_freq_[ char_freq_! char_freq_$ char_freq_# \\\n", - "0 0.000 0.778 0.000 0.000 \n", - "1 0.000 0.372 0.180 0.048 \n", - "2 0.000 0.276 0.184 0.010 \n", - "3 0.000 0.137 0.000 0.000 \n", - "4 0.000 0.135 0.000 0.000 \n", - "5 0.000 0.000 0.000 0.000 \n", - "6 0.000 0.164 0.054 0.000 \n", - "7 0.000 0.000 0.000 0.000 \n", - "8 0.000 0.181 0.203 0.022 \n", - "9 0.000 0.244 0.081 0.000 \n", - "10 0.000 0.462 0.000 0.000 \n", - "11 0.000 0.663 0.000 0.000 \n", - "12 0.000 0.786 0.000 0.000 \n", - "13 0.000 0.000 0.000 0.000 \n", - "14 0.000 0.357 0.000 0.000 \n", - "15 0.000 0.572 0.063 0.000 \n", - "16 0.000 0.428 0.000 0.000 \n", - "17 0.000 1.975 0.370 0.000 \n", - "18 0.000 0.455 0.000 0.000 \n", - "19 0.000 0.055 0.496 0.000 \n", - "20 0.000 0.729 0.000 0.000 \n", - "21 0.016 0.250 0.046 0.059 \n", - "22 0.000 0.809 0.000 0.000 \n", - "23 0.000 0.667 0.000 0.000 \n", - "24 0.000 0.392 0.196 0.000 \n", - "25 0.016 0.250 0.046 0.059 \n", - "26 0.000 0.392 0.196 0.000 \n", - "27 0.000 0.368 0.000 0.000 \n", - "28 0.000 0.352 0.000 0.000 \n", - "29 0.000 0.091 0.000 0.000 \n", - "... ... ... ... ... 
\n", - "4571 0.000 0.082 0.000 0.000 \n", - "4572 0.000 0.000 0.000 0.000 \n", - "4573 0.000 0.099 0.000 0.000 \n", - "4574 0.000 0.000 0.000 0.000 \n", - "4575 0.000 0.000 0.000 0.000 \n", - "4576 0.000 0.000 0.000 0.000 \n", - "4577 0.000 0.000 0.000 0.000 \n", - "4578 0.145 0.000 0.000 0.000 \n", - "4579 0.036 0.055 0.000 0.202 \n", - "4580 0.000 0.000 0.000 0.000 \n", - "4581 0.000 0.091 0.000 0.000 \n", - "4582 0.000 0.000 0.000 0.000 \n", - "4583 0.406 0.000 0.000 0.000 \n", - "4584 0.000 0.041 0.000 0.000 \n", - "4585 0.000 0.000 0.000 0.000 \n", - "4586 0.000 0.000 0.000 0.056 \n", - "4587 0.000 0.000 0.125 0.000 \n", - "4588 0.000 0.000 0.000 0.000 \n", - "4589 0.000 0.000 0.000 0.000 \n", - "4590 0.000 0.000 0.000 0.092 \n", - "4591 0.000 0.000 0.000 0.000 \n", - "4592 0.000 0.000 0.000 0.000 \n", - "4593 0.000 1.052 0.000 0.000 \n", - "4594 0.000 0.000 0.000 0.000 \n", - "4595 0.000 0.000 0.000 0.000 \n", - "4596 0.000 0.000 0.000 0.000 \n", - "4597 0.000 0.353 0.000 0.000 \n", - "4598 0.000 0.000 0.000 0.000 \n", - "4599 0.000 0.000 0.000 0.000 \n", - "4600 0.000 0.125 0.000 0.000 \n", - "\n", - " capital_run_length_average capital_run_length_longest \\\n", - "0 3.756 61 \n", - "1 5.114 101 \n", - "2 9.821 485 \n", - "3 3.537 40 \n", - "4 3.537 40 \n", - "5 3.000 15 \n", - "6 1.671 4 \n", - "7 2.450 11 \n", - "8 9.744 445 \n", - "9 1.729 43 \n", - "10 1.312 6 \n", - "11 1.243 11 \n", - "12 3.728 61 \n", - "13 2.083 7 \n", - "14 1.971 24 \n", - "15 5.659 55 \n", - "16 4.652 31 \n", - "17 35.461 95 \n", - "18 1.320 4 \n", - "19 3.509 91 \n", - "20 3.833 9 \n", - "21 2.569 66 \n", - "22 4.857 12 \n", - "23 1.131 5 \n", - "24 5.466 22 \n", - "25 2.565 66 \n", - "26 5.466 22 \n", - "27 2.611 12 \n", - "28 4.000 11 \n", - "29 2.687 66 \n", - "... ... ... 
\n", - "4571 1.256 5 \n", - "4572 1.000 1 \n", - "4573 1.489 11 \n", - "4574 1.220 6 \n", - "4575 1.720 11 \n", - "4576 1.488 5 \n", - "4577 1.200 3 \n", - "4578 1.372 5 \n", - "4579 3.766 43 \n", - "4580 1.571 5 \n", - "4581 1.586 4 \n", - "4582 1.266 3 \n", - "4583 1.666 13 \n", - "4584 1.500 7 \n", - "4585 1.375 4 \n", - "4586 1.793 21 \n", - "4587 1.272 4 \n", - "4588 1.111 2 \n", - "4589 1.000 1 \n", - "4590 2.468 11 \n", - "4591 1.000 1 \n", - "4592 1.285 4 \n", - "4593 1.000 1 \n", - "4594 1.727 5 \n", - "4595 1.000 1 \n", - "4596 1.142 3 \n", - "4597 1.555 4 \n", - "4598 1.404 6 \n", - "4599 1.147 5 \n", - "4600 1.250 5 \n", - "\n", - " capital_run_length_total spam_flag \n", - "0 278 1 \n", - "1 1028 1 \n", - "2 2259 1 \n", - "3 191 1 \n", - "4 191 1 \n", - "5 54 1 \n", - "6 112 1 \n", - "7 49 1 \n", - "8 1257 1 \n", - "9 749 1 \n", - "10 21 1 \n", - "11 184 1 \n", - "12 261 1 \n", - "13 25 1 \n", - "14 205 1 \n", - "15 249 1 \n", - "16 107 1 \n", - "17 461 1 \n", - "18 70 1 \n", - "19 186 1 \n", - "20 23 1 \n", - "21 2259 1 \n", - "22 34 1 \n", - "23 69 1 \n", - "24 82 1 \n", - "25 2258 1 \n", - "26 82 1 \n", - "27 47 1 \n", - "28 36 1 \n", - "29 129 1 \n", - "... ... ... 
\n", - "4571 98 0 \n", - "4572 13 0 \n", - "4573 137 0 \n", - "4574 61 0 \n", - "4575 43 0 \n", - "4576 64 0 \n", - "4577 24 0 \n", - "4578 70 0 \n", - "4579 1789 0 \n", - "4580 11 0 \n", - "4581 46 0 \n", - "4582 19 0 \n", - "4583 70 0 \n", - "4584 123 0 \n", - "4585 11 0 \n", - "4586 174 0 \n", - "4587 28 0 \n", - "4588 10 0 \n", - "4589 22 0 \n", - "4590 79 0 \n", - "4591 8 0 \n", - "4592 27 0 \n", - "4593 6 0 \n", - "4594 19 0 \n", - "4595 24 0 \n", - "4596 88 0 \n", - "4597 14 0 \n", - "4598 118 0 \n", - "4599 78 0 \n", - "4600 40 0 \n", - "\n", - "[4601 rows x 58 columns]" + "0" ] }, - "execution_count": 31, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } From 654aa14b5728e9a9ca3a1a6262c6806946ce877c Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Wed, 24 Jun 2015 17:15:45 -0400 Subject: [PATCH 4/6] basic bayes complete --- Spam_Detector.ipynb | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/Spam_Detector.ipynb b/Spam_Detector.ipynb index 27c3e02..458f99a 100644 --- a/Spam_Detector.ipynb +++ b/Spam_Detector.ipynb @@ -2,15 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": 7, + "execution_count": 63, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import sklearn\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "classifier = MultinomialNB()\n", "import matplotlib.pyplot as plt\n", - "import pandas as pd" + "import pandas as pd\n", + "from sklearn.cross_validation import train_test_split " ] }, { @@ -120,36 +123,42 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 61, "metadata": { "collapsed": false }, "outputs": [], "source": [ - "train = data.sample(frac=.7)\n", - "data.index\n", - "test = data.drop(train.index)" + "train_data, test_data = train_test_split(data, test_size = .4)" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 65, "metadata": { "collapsed": false }, "outputs": [ { 
- "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" + "ename": "ValueError", + "evalue": "Found arrays with inconsistent numbers of samples: [ 57 2760]", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mbayes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mMultinomialNB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mbayes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mbayes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/pjpassalacqua/PythonEngineering/Assignments/spambase/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/naive_bayes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 472\u001b[0m \u001b[0mReturns\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 473\u001b[0m \"\"\"\n\u001b[0;32m--> 474\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'csr'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 475\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_features\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/pjpassalacqua/PythonEngineering/Assignments/spambase/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric)\u001b[0m\n\u001b[1;32m 452\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat64\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 453\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 454\u001b[0;31m \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 455\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 456\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/Users/pjpassalacqua/PythonEngineering/Assignments/spambase/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_consistent_length\u001b[0;34m(*arrays)\u001b[0m\n\u001b[1;32m 172\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muniques\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m 
\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 173\u001b[0m raise ValueError(\"Found arrays with inconsistent numbers of samples: \"\n\u001b[0;32m--> 174\u001b[0;31m \"%s\" % str(uniques))\n\u001b[0m\u001b[1;32m 175\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 176\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: Found arrays with inconsistent numbers of samples: [ 57 2760]" + ] } ], - "source": [] + "source": [ + "bayes = MultinomialNB()\n", + "bayes.fit(train_data, names[:-1])\n", + "bayes.score(test_data, names[:-1])" + ] }, { "cell_type": "code", From 052e7633f11a03d72cefdb3f740ac2ed1a09ce9a Mon Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Thu, 25 Jun 2015 10:30:03 -0400 Subject: [PATCH 5/6] normal mode finished --- Spam_Detector.ipynb | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/Spam_Detector.ipynb b/Spam_Detector.ipynb index 458f99a..388ddab 100644 --- a/Spam_Detector.ipynb +++ b/Spam_Detector.ipynb @@ -134,30 +134,26 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 67, "metadata": { "collapsed": false }, "outputs": [ { - "ename": "ValueError", - "evalue": "Found arrays with inconsistent numbers of samples: [ 57 2760]", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mbayes\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mMultinomialNB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mbayes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_data\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mnames\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mbayes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Users/pjpassalacqua/PythonEngineering/Assignments/spambase/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/naive_bayes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 472\u001b[0m \u001b[0mReturns\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 473\u001b[0m \"\"\"\n\u001b[0;32m--> 474\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'csr'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 475\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_features\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 476\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Users/pjpassalacqua/PythonEngineering/Assignments/spambase/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric)\u001b[0m\n\u001b[1;32m 452\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0my\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfloat64\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 453\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 454\u001b[0;31m \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 455\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 456\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Users/pjpassalacqua/PythonEngineering/Assignments/spambase/.direnv/python-3.4.3/lib/python3.4/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_consistent_length\u001b[0;34m(*arrays)\u001b[0m\n\u001b[1;32m 172\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muniques\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 173\u001b[0m raise ValueError(\"Found arrays with inconsistent numbers of samples: \"\n\u001b[0;32m--> 174\u001b[0;31m \"%s\" % str(uniques))\n\u001b[0m\u001b[1;32m 175\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 176\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: Found arrays with inconsistent numbers of samples: [ 57 2760]" - ] + "data": { + "text/plain": [ + "0.77675176534492119" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "bayes = MultinomialNB()\n", - "bayes.fit(train_data, names[:-1])\n", - "bayes.score(test_data, names[:-1])" + "bayes.fit(train_data[names[:-1]], train_data.spam_flag)\n", + "bayes.score(test_data[names[:-1]], test_data.spam_flag)" ] }, { From 0bfc12d1e7830c910b7352c7d52c88d4c4981cbf Mon 
Sep 17 00:00:00 2001 From: PJ Passalacqua Date: Tue, 21 Jul 2015 14:44:38 -0400 Subject: [PATCH 6/6] added comment on findings --- Spam_Detector.ipynb | 98 +++++++++------------------------------------ 1 file changed, 19 insertions(+), 79 deletions(-) diff --git a/Spam_Detector.ipynb b/Spam_Detector.ipynb index 388ddab..55f5d36 100644 --- a/Spam_Detector.ipynb +++ b/Spam_Detector.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 63, + "execution_count": 1, "metadata": { "collapsed": false }, @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 2, "metadata": { "collapsed": false }, @@ -29,90 +29,21 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 3, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/plain": [ - "['word_freq_make',\n", - " 'word_freq_address',\n", - " 'word_freq_all',\n", - " 'word_freq_3d',\n", - " 'word_freq_our',\n", - " 'word_freq_over',\n", - " 'word_freq_remove',\n", - " 'word_freq_internet',\n", - " 'word_freq_order',\n", - " 'word_freq_mail',\n", - " 'word_freq_receive',\n", - " 'word_freq_will',\n", - " 'word_freq_people',\n", - " 'word_freq_report',\n", - " 'word_freq_addresses',\n", - " 'word_freq_free',\n", - " 'word_freq_business',\n", - " 'word_freq_email',\n", - " 'word_freq_you',\n", - " 'word_freq_credit',\n", - " 'word_freq_your',\n", - " 'word_freq_font',\n", - " 'word_freq_000',\n", - " 'word_freq_money',\n", - " 'word_freq_hp',\n", - " 'word_freq_hpl',\n", - " 'word_freq_george',\n", - " 'word_freq_650',\n", - " 'word_freq_lab',\n", - " 'word_freq_labs',\n", - " 'word_freq_telnet',\n", - " 'word_freq_857',\n", - " 'word_freq_data',\n", - " 'word_freq_415',\n", - " 'word_freq_85',\n", - " 'word_freq_technology',\n", - " 'word_freq_1999',\n", - " 'word_freq_parts',\n", - " 'word_freq_pm',\n", - " 'word_freq_direct',\n", - " 'word_freq_cs',\n", - " 'word_freq_meeting',\n", - " 'word_freq_original',\n", - " 
'word_freq_project',\n", - " 'word_freq_re',\n", - " 'word_freq_edu',\n", - " 'word_freq_table',\n", - " 'word_freq_conference',\n", - " 'char_freq_;',\n", - " 'char_freq_(',\n", - " 'char_freq_[',\n", - " 'char_freq_!',\n", - " 'char_freq_$',\n", - " 'char_freq_#',\n", - " 'capital_run_length_average',\n", - " 'capital_run_length_longest',\n", - " 'capital_run_length_total',\n", - " 'spam_flag']" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "with open(\"spambase/spambase.names\") as file:\n", " names = file.readlines()[33:]\n", "names = [name.split(\":\")[0] for name in names]\n", - "names.append(\"spam_flag\")\n", - "names" + "names.append(\"spam_flag\")" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 4, "metadata": { "collapsed": false }, @@ -123,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -134,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 6, "metadata": { "collapsed": false }, @@ -142,10 +73,10 @@ { "data": { "text/plain": [ - "0.77675176534492119" + "0.80825638240086906" ] }, - "execution_count": 67, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -156,6 +87,15 @@ "bayes.score(test_data[names[:-1]], test_data.spam_flag)" ] }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "The spam detector appears to be fairly accurate. It has a mean accuracy of .808 on the held-out test data." + ] + }, { "cell_type": "code", "execution_count": null,