diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..94840b3 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +layout python3 diff --git a/.gitignore b/.gitignore index f00dbf2..39a82ff 100644 --- a/.gitignore +++ b/.gitignore @@ -65,3 +65,8 @@ docs/_build/ # PyBuilder target/ +# direnv +.direnv/ + +# Data +spambase/ diff --git a/Spam_Detector.ipynb b/Spam_Detector.ipynb new file mode 100644 index 0000000..55f5d36 --- /dev/null +++ b/Spam_Detector.ipynb @@ -0,0 +1,130 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import sklearn\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "classifier = MultinomialNB()\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from sklearn.cross_validation import train_test_split " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "with open(\"spambase/spambase.names\") as file:\n", + " names = file.readlines()[33:]\n", + "names = [name.split(\":\")[0] for name in names]\n", + "names.append(\"spam_flag\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = pd.read_csv(\"spambase/spambase.data\", names=names)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train_data, test_data = train_test_split(data, test_size = .4)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.80825638240086906" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "bayes = MultinomialNB()\n", + "bayes.fit(train_data[names[:-1]], train_data.spam_flag)\n", + "bayes.score(test_data[names[:-1]], test_data.spam_flag)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "The spam detector appears to be fairly accurate. It has a mean accuracy of about 0.808 on the held-out test data (for a classifier, `score` reports accuracy, not $R^2$)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}