"""Print the 20 most frequent words found in ``sample.txt``."""

import re
from collections import Counter, OrderedDict


def word_frequency(x):
    """Count the words in *x* and order them from most to least frequent.

    Args:
        x: An iterable of already-cleaned words (the driver below passes
            lower-cased, punctuation-free tokens).

    Returns:
        An ``OrderedDict`` mapping each word to its number of occurrences,
        highest count first. Words with equal counts come out in reverse
        order of first appearance (on Python 3.7+, where dicts keep
        insertion order), exactly as the original
        ascending-sort-then-reverse did.
    """
    counts = Counter(x)
    # Stable ascending sort followed by a reversal keeps the original
    # tie ordering; sorting with reverse=True would flip ties.
    ranked = sorted(counts.items(), key=lambda item: item[1])
    ranked.reverse()
    return OrderedDict(ranked)


# ``__name__`` is "__main__" both in a notebook cell and when run as a
# script, so the behaviour there is unchanged; the guard only stops a plain
# ``import`` from touching the file system.
if __name__ == "__main__":
    with open('sample.txt') as in_file:
        # Drop everything that is not an ASCII letter or whitespace,
        # then normalise case and split on whitespace.
        x = re.sub(r'[^A-Za-z\s]', '', in_file.read()).lower().split()

    # (word, count) tuples, most frequent first.
    top_20_list = list(word_frequency(x).items())
    print(top_20_list[:20])
"""Print the 20 most frequent words found in ``sample.txt``."""

import re
from collections import Counter, OrderedDict


def word_frequency(x):
    """Count the words in *x* and order them from most to least frequent.

    Args:
        x: An iterable of already-cleaned words (the driver below passes
            lower-cased, punctuation-free tokens).

    Returns:
        An ``OrderedDict`` mapping each word to its number of occurrences,
        highest count first. Words with equal counts come out in reverse
        order of first appearance (on Python 3.7+, where dicts keep
        insertion order), exactly as the original
        ascending-sort-then-reverse did.
    """
    counts = Counter(x)
    # Stable ascending sort followed by a reversal keeps the original
    # tie ordering; sorting with reverse=True would flip ties.
    ranked = sorted(counts.items(), key=lambda item: item[1])
    ranked.reverse()
    return OrderedDict(ranked)


# ``__name__`` is "__main__" both in a notebook cell and when run as a
# script, so the behaviour there is unchanged; the guard only stops a plain
# ``import`` from touching the file system.
if __name__ == "__main__":
    with open('sample.txt') as in_file:
        # Drop everything that is not an ASCII letter or whitespace,
        # then normalise case and split on whitespace.
        x = re.sub(r'[^A-Za-z\s]', '', in_file.read()).lower().split()

    # (word, count) tuples, most frequent first.
    top_20_list = list(word_frequency(x).items())
    print(top_20_list[:20])
"""Build a word histogram and print the 20 most common words."""

import re
from collections import Counter


def histo_gram(phrase):
    """Return a dict mapping each lower-cased word to its occurrence count.

    Args:
        phrase: The text to analyse. For backward compatibility, any
            non-string value (the original call sites pass ``1``) means
            "read the text from ``sample.txt`` in the current working
            directory", which is what the function always did before.

    Returns:
        A plain ``dict`` of ``word -> count``.

    Raises:
        OSError: If the fallback file ``sample.txt`` cannot be opened.
    """
    if isinstance(phrase, str):
        raw = phrase
    else:
        with open('sample.txt') as in_file:
            raw = in_file.read()

    # Remove punctuation but keep every kind of whitespace. The previous
    # pattern kept only spaces and newlines, so it deleted tabs and glued
    # tab-separated words together.
    words = re.sub(r'[^\w\s]', '', raw).lower().split()
    return dict(Counter(words))


def one_list(text):
    """Turn a ``word -> count`` mapping into a list sorted by count.

    Args:
        text: A mapping of word to count, as returned by ``histo_gram``.

    Returns:
        A list of ``[word, count]`` lists, highest count first. The sort
        is stable, so words with equal counts keep the mapping's own
        iteration order.
    """
    pairs = [[word, count] for word, count in text.items()]
    # Sort once, after the list is complete.
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs


# The original script printed ``newer_list`` while the two lines that
# define it were commented out, so it always ended in a NameError. They
# are restored here, and the guard keeps an import free of file I/O.
if __name__ == "__main__":
    newer_dict = histo_gram(1)
    newer_list = one_list(newer_dict)
    print(newer_list[0:20])