diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000..286dcb3 --- /dev/null +++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000..eb2c037 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,56 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Project', \"Gutenberg's\", 'The', 'Hound', 'of', 'the', 'Baskervilles,', 'by', 'A.', 'Conan', 'Doyle', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever.', 'You', 'may', 'copy', 'it,', 'give', 'it', 'away', 'or', 're-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'Project', 'Gutenberg', 'License', 'included', 'with', 'this', 'eBook', 'or', 'online', 'at', 'www.gutenberg.org', 'Title:', 'The', 'Hound', 'of', 'the', 'Baskervilles', 'Author:', 'A.', 'Conan', 'Doyle', 'Posting', 'Date:', 'December', '8,', '2008', '[EBook', '#2852]', 'Release', 'Date:', 'October,', '2001', 'Language:', 'English']\n" + ] + } + ], + "source": [ + "def word_frequency(text):\n", + " with open(text) as sample:\n", + " return sample.read()\n", + "print(word_frequency('text.txt').split())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.4.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/__pycache__/word_frequency.cpython-34.pyc b/__pycache__/word_frequency.cpython-34.pyc new file mode 100644 index 0000000..f11cffb Binary files /dev/null and b/__pycache__/word_frequency.cpython-34.pyc differ diff --git a/word_frequency.py b/word_frequency.py new file mode 100644 index 0000000..59ab826 --- /dev/null +++ b/word_frequency.py @@ -0,0 +1,21 @@ +import re + + +with open('sample.txt') as sample: + new_text = sample.read() + +def word_frequency(new_text): + edict = {} + regex_text = re.sub(r'[^A-Za-z\s]', "", new_text).lower().split() + for word in regex_text: + if word in edict: + edict[word] += 1 + else: + edict[word] = 1 + return edict + +print(word_frequency(new_text)) + +def max_dict(a_dict): + t_list = sorted(list(edict.items()), key=lambda x: x[1], reverse=True) + return a[:20]