diff --git a/Jeff spam homework.ipynb b/Jeff spam homework.ipynb new file mode 100644 index 0000000..94acdf5 --- /dev/null +++ b/Jeff spam homework.ipynb @@ -0,0 +1,472 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import sklearn\n", + "from sklearn.cross_validation import train_test_split \n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.naive_bayes import MultinomialNB as mnb" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = pd.read_csv(\"spambase/spambase.data\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
00.640.64.10.10.320.20.30.40.50.6...0.400.410.420.7780.430.443.756612781
00.210.280.50000.280.210.070.000.94...0.000.13200.3720.1800.0485.11410110281
10.060.000.71000.190.190.120.640.25...0.010.14300.2760.1840.0109.82148522591
\n", + "

2 rows × 58 columns

\n", + "
" + ], + "text/plain": [ + " 0 0.64 0.64.1 0.1 0.32 0.2 0.3 0.4 0.5 0.6 ... 0.40 \\\n", + "0 0.21 0.28 0.50 0 0 0.28 0.21 0.07 0.00 0.94 ... 0.00 \n", + "1 0.06 0.00 0.71 0 0 0.19 0.19 0.12 0.64 0.25 ... 0.01 \n", + "\n", + " 0.41 0.42 0.778 0.43 0.44 3.756 61 278 1 \n", + "0 0.132 0 0.372 0.180 0.048 5.114 101 1028 1 \n", + "1 0.143 0 0.276 0.184 0.010 9.821 485 2259 1 \n", + "\n", + "[2 rows x 58 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "spam_train, spam_test = train_test_split(df, test_size=.4)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
00.640.64.10.10.320.20.30.40.50.6...0.400.410.420.7780.430.443.756612781
22150000000000...1.408000002.66130
33100000000000...0.000000001.4370
\n", + "

2 rows × 58 columns

\n", + "
" + ], + "text/plain": [ + " 0 0.64 0.64.1 0.1 0.32 0.2 0.3 0.4 0.5 0.6 ... 0.40 0.41 \\\n", + "2215 0 0 0 0 0 0 0 0 0 0 ... 1.408 0 \n", + "3310 0 0 0 0 0 0 0 0 0 0 ... 0.000 0 \n", + "\n", + " 0.42 0.778 0.43 0.44 3.756 61 278 1 \n", + "2215 0 0 0 0 2.6 6 13 0 \n", + "3310 0 0 0 0 1.4 3 7 0 \n", + "\n", + "[2 rows x 58 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spam_train.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
00.640.64.10.10.320.20.30.40.50.6...0.400.410.420.7780.430.443.756612781
8710.450.000.67000.670.00.670.220.22...00.11101.5990.1480.0004.9471025641
10740.000.550.55000.552.20.000.000.55...00.16500.4960.0000.08216.8261483871
\n", + "

2 rows × 58 columns

\n", + "
" + ], + "text/plain": [ + " 0 0.64 0.64.1 0.1 0.32 0.2 0.3 0.4 0.5 0.6 ... 0.40 \\\n", + "871 0.45 0.00 0.67 0 0 0.67 0.0 0.67 0.22 0.22 ... 0 \n", + "1074 0.00 0.55 0.55 0 0 0.55 2.2 0.00 0.00 0.55 ... 0 \n", + "\n", + " 0.41 0.42 0.778 0.43 0.44 3.756 61 278 1 \n", + "871 0.111 0 1.599 0.148 0.000 4.947 102 564 1 \n", + "1074 0.165 0 0.496 0.000 0.082 16.826 148 387 1 \n", + "\n", + "[2 rows x 58 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spam_test.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "training_run = mnb()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_run.fit(spam_train.iloc[:, :57], spam_train.iloc[:, -1])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.79619565217391308" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "training_run.score(spam_test.iloc[:, :57], spam_test.iloc[:, -1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Spam detector is coming back with 80% accuracy score." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/README.md b/README.md index 3a8b6f2..0821c82 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ -# Classifying spam +# Run `Jeff spam homework.ipynb` using IPython Notebook + +## Classifying spam ## Description