From f7c1f99d6373383d5024064598eb37ce62b7c2f1 Mon Sep 17 00:00:00 2001 From: Jeff Hacker Date: Thu, 25 Jun 2015 00:20:53 -0400 Subject: [PATCH 1/2] homework complete --- Jeff spam homework.ipynb | 465 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 465 insertions(+) create mode 100644 Jeff spam homework.ipynb diff --git a/Jeff spam homework.ipynb b/Jeff spam homework.ipynb new file mode 100644 index 0000000..fe37e91 --- /dev/null +++ b/Jeff spam homework.ipynb @@ -0,0 +1,465 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 254, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import sklearn\n", + "from sklearn.cross_validation import train_test_split \n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.naive_bayes import MultinomialNB as mnb" + ] + }, + { + "cell_type": "code", + "execution_count": 255, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = pd.read_csv(\"spambase/spambase.data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 256, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
00.640.64.10.10.320.20.30.40.50.6...0.400.410.420.7780.430.443.756612781
00.210.280.50000.280.210.070.000.94...0.000.13200.3720.1800.0485.11410110281
10.060.000.71000.190.190.120.640.25...0.010.14300.2760.1840.0109.82148522591
\n", + "

2 rows × 58 columns

\n", + "
" + ], + "text/plain": [ + " 0 0.64 0.64.1 0.1 0.32 0.2 0.3 0.4 0.5 0.6 ... 0.40 \\\n", + "0 0.21 0.28 0.50 0 0 0.28 0.21 0.07 0.00 0.94 ... 0.00 \n", + "1 0.06 0.00 0.71 0 0 0.19 0.19 0.12 0.64 0.25 ... 0.01 \n", + "\n", + " 0.41 0.42 0.778 0.43 0.44 3.756 61 278 1 \n", + "0 0.132 0 0.372 0.180 0.048 5.114 101 1028 1 \n", + "1 0.143 0 0.276 0.184 0.010 9.821 485 2259 1 \n", + "\n", + "[2 rows x 58 columns]" + ] + }, + "execution_count": 256, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 257, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "spam_train, spam_test = train_test_split(df, test_size=.4)" + ] + }, + { + "cell_type": "code", + "execution_count": 258, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
00.640.64.10.10.320.20.30.40.50.6...0.400.410.420.7780.430.443.756612781
154600.290.29000.590.291.041.042.22...0.0000.08400.1050.210.02110.81788712441
290600.000.00000.000.000.000.000.00...0.6751.35100.0000.000.0003.70026370
\n", + "

2 rows × 58 columns

\n", + "
" + ], + "text/plain": [ + " 0 0.64 0.64.1 0.1 0.32 0.2 0.3 0.4 0.5 0.6 ... 0.40 \\\n", + "1546 0 0.29 0.29 0 0 0.59 0.29 1.04 1.04 2.22 ... 0.000 \n", + "2906 0 0.00 0.00 0 0 0.00 0.00 0.00 0.00 0.00 ... 0.675 \n", + "\n", + " 0.41 0.42 0.778 0.43 0.44 3.756 61 278 1 \n", + "1546 0.084 0 0.105 0.21 0.021 10.817 887 1244 1 \n", + "2906 1.351 0 0.000 0.00 0.000 3.700 26 37 0 \n", + "\n", + "[2 rows x 58 columns]" + ] + }, + "execution_count": 258, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spam_train.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 259, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
00.640.64.10.10.320.20.30.40.50.6...0.400.410.420.7780.430.443.756612781
430200000.0000002.77...0.0000.0000.0000.438001.2143170
356200001.3500000.00...0.3320.7470.1660.000004.054192960
\n", + "

2 rows × 58 columns

\n", + "
" + ], + "text/plain": [ + " 0 0.64 0.64.1 0.1 0.32 0.2 0.3 0.4 0.5 0.6 ... 0.40 0.41 \\\n", + "4302 0 0 0 0 0.00 0 0 0 0 2.77 ... 0.000 0.000 \n", + "3562 0 0 0 0 1.35 0 0 0 0 0.00 ... 0.332 0.747 \n", + "\n", + " 0.42 0.778 0.43 0.44 3.756 61 278 1 \n", + "4302 0.000 0.438 0 0 1.214 3 17 0 \n", + "3562 0.166 0.000 0 0 4.054 19 296 0 \n", + "\n", + "[2 rows x 58 columns]" + ] + }, + "execution_count": 259, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spam_test.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 260, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "training_run = mnb()" + ] + }, + { + "cell_type": "code", + "execution_count": 261, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" + ] + }, + "execution_count": 261, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_run.fit(spam_train.iloc[:, :57], spam_train.iloc[:, -1])" + ] + }, + { + "cell_type": "code", + "execution_count": 262, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.80652173913043479" + ] + }, + "execution_count": 262, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_run.score(spam_test.iloc[:, :57], spam_test.iloc[:, -1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 34401c072e8547d97dba922d26ea2d0007fc667a Mon Sep 17 00:00:00 2001 From: Jeff Hacker Date: Fri, 21 Aug 2015 14:52:26 -0400 Subject: [PATCH 2/2] Added markdown results to notebook --- Jeff spam homework.ipynb | 197 ++++++++++++++++++++------------------- README.md | 4 +- 2 files changed, 105 insertions(+), 96 deletions(-) diff --git a/Jeff spam homework.ipynb b/Jeff spam homework.ipynb index fe37e91..94acdf5 100644 --- a/Jeff spam homework.ipynb +++ b/Jeff spam homework.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 254, + "execution_count": 1, "metadata": { "collapsed": false }, @@ -17,7 +17,7 @@ }, { "cell_type": "code", - "execution_count": 255, + "execution_count": 2, "metadata": { "collapsed": false }, @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 256, + "execution_count": 3, "metadata": { "collapsed": false }, @@ -36,7 +36,7 @@ { "data": { "text/html": [ - "
\n", + "
\n", "\n", " \n", " \n", @@ -130,7 +130,7 @@ "[2 rows x 58 columns]" ] }, - "execution_count": 256, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -141,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 257, + "execution_count": 4, "metadata": { "collapsed": false }, @@ -152,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 258, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -160,7 +160,7 @@ { "data": { "text/html": [ - "
\n", + "
\n", "
\n", " \n", " \n", @@ -190,51 +190,51 @@ " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -243,18 +243,18 @@ "" ], "text/plain": [ - " 0 0.64 0.64.1 0.1 0.32 0.2 0.3 0.4 0.5 0.6 ... 0.40 \\\n", - "1546 0 0.29 0.29 0 0 0.59 0.29 1.04 1.04 2.22 ... 0.000 \n", - "2906 0 0.00 0.00 0 0 0.00 0.00 0.00 0.00 0.00 ... 0.675 \n", + " 0 0.64 0.64.1 0.1 0.32 0.2 0.3 0.4 0.5 0.6 ... 0.40 0.41 \\\n", + "2215 0 0 0 0 0 0 0 0 0 0 ... 1.408 0 \n", + "3310 0 0 0 0 0 0 0 0 0 0 ... 0.000 0 \n", "\n", - " 0.41 0.42 0.778 0.43 0.44 3.756 61 278 1 \n", - "1546 0.084 0 0.105 0.21 0.021 10.817 887 1244 1 \n", - "2906 1.351 0 0.000 0.00 0.000 3.700 26 37 0 \n", + " 0.42 0.778 0.43 0.44 3.756 61 278 1 \n", + "2215 0 0 0 0 2.6 6 13 0 \n", + "3310 0 0 0 0 1.4 3 7 0 \n", "\n", "[2 rows x 58 columns]" ] }, - "execution_count": 258, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -265,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 259, + "execution_count": 6, "metadata": { "collapsed": false }, @@ -273,7 +273,7 @@ { "data": { "text/html": [ - "
\n", + "
\n", "
15462215000000000.290.29000.590.291.041.042.22...0.0000.0841.408000002.661300.1050.210.02110.81788712441
2906331000.000.00000.000.000.000.000.00...0.6751.3510000000...0.0000.000.0003.7002637000001.4370
\n", " \n", " \n", @@ -303,52 +303,52 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
430200008710.450.000.6700002.770.670.00.670.220.22...0.0000.0000.0000.43800.11101.21431701.5990.1480.0004.9471025641
356200001.350010740.000.550.55000.552.20.000.000.55...0.3320.7470.1660.000004.054192960.16500.4960.0000.08216.8261483871
\n", @@ -356,18 +356,18 @@ "
" ], "text/plain": [ - " 0 0.64 0.64.1 0.1 0.32 0.2 0.3 0.4 0.5 0.6 ... 0.40 0.41 \\\n", - "4302 0 0 0 0 0.00 0 0 0 0 2.77 ... 0.000 0.000 \n", - "3562 0 0 0 0 1.35 0 0 0 0 0.00 ... 0.332 0.747 \n", + " 0 0.64 0.64.1 0.1 0.32 0.2 0.3 0.4 0.5 0.6 ... 0.40 \\\n", + "871 0.45 0.00 0.67 0 0 0.67 0.0 0.67 0.22 0.22 ... 0 \n", + "1074 0.00 0.55 0.55 0 0 0.55 2.2 0.00 0.00 0.55 ... 0 \n", "\n", - " 0.42 0.778 0.43 0.44 3.756 61 278 1 \n", - "4302 0.000 0.438 0 0 1.214 3 17 0 \n", - "3562 0.166 0.000 0 0 4.054 19 296 0 \n", + " 0.41 0.42 0.778 0.43 0.44 3.756 61 278 1 \n", + "871 0.111 0 1.599 0.148 0.000 4.947 102 564 1 \n", + "1074 0.165 0 0.496 0.000 0.082 16.826 148 387 1 \n", "\n", "[2 rows x 58 columns]" ] }, - "execution_count": 259, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -378,7 +378,7 @@ }, { "cell_type": "code", - "execution_count": 260, + "execution_count": 7, "metadata": { "collapsed": false }, @@ -389,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 261, + "execution_count": 14, "metadata": { "collapsed": false }, @@ -400,7 +400,7 @@ "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" ] }, - "execution_count": 261, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -411,7 +411,7 @@ }, { "cell_type": "code", - "execution_count": 262, + "execution_count": 15, "metadata": { "collapsed": false }, @@ -419,10 +419,10 @@ { "data": { "text/plain": [ - "0.80652173913043479" + "0.79619565217391308" ] }, - "execution_count": 262, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -431,6 +431,13 @@ "training_run.score(spam_test.iloc[:, :57], spam_test.iloc[:, -1])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Spam detector is coming back with 80% accuracy score." + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/README.md b/README.md index 3a8b6f2..0821c82 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -# Classifying spam +# Run Jeff spam homework.ipynb using Ipython Notebook + +## Classifying spam ## Description