tiy-gvl-python · pnitto · May 29, 2015 · Jul 21, 2015
diff --git a/.ipynb_checkpoints/Word Frequency-checkpoint.ipynb b/.ipynb_checkpoints/Word Frequency-checkpoint.ipynb
@@ -0,0 +1,77 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 98,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('the', 3496), ('of', 1715), ('and', 1685), ('to', 1487), ('i', 1465), ('a', 1363), ('that', 1143), ('it', 985), ('in', 964), ('he', 914), ('you', 894), ('was', 803), ('his', 690), ('is', 649), ('have', 547), ('had', 505), ('with', 487), ('my', 477), ('we', 462), ('for', 445)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import re \n",
+    "from collections import OrderedDict\n",
+    "\n",
+    "top_20_list = []\n",
+    "\n",
+    "# Keep only ASCII letters and whitespace, then lower-case and split into words.\n",
+    "with open('sample.txt') as in_file:\n",
+    "    x = re.sub(r'[^A-Za-z\\s]', '', in_file.read()).lower().split()\n",
+    "\n",
+    "\n",
+    "def word_frequency(x):\n",
+    "    \"\"\"Return an OrderedDict mapping each word in x to its count, highest count first.\"\"\"\n",
+    "    counts = {}\n",
+    "    for token in x:\n",
+    "        counts[token] = counts.get(token, 0) + 1\n",
+    "    # Sort ascending by count, then reverse so the most frequent words lead.\n",
+    "    by_count = sorted(counts.items(), key=lambda pair: pair[1])\n",
+    "    return OrderedDict(reversed(by_count))\n",
+    "\n",
+    "\n",
+    "# Flatten the ordered mapping into a list of (word, count) tuples.\n",
+    "for key, value in word_frequency(x).items():\n",
+    "    top_20_list.append((key, value))\n",
+    "\n",
+    "print(top_20_list[:20])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/Word Frequency.ipynb b/Word Frequency.ipynb
@@ -0,0 +1,77 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('the', 3496), ('of', 1715), ('and', 1685), ('to', 1487), ('i', 1465), ('a', 1363), ('that', 1143), ('it', 985), ('in', 964), ('he', 914), ('you', 894), ('was', 803), ('his', 690), ('is', 649), ('have', 547), ('had', 505), ('with', 487), ('my', 477), ('we', 462), ('for', 445)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import re \n",
+    "from collections import OrderedDict\n",
+    "\n",
+    "top_20_list = []\n",
+    "\n",
+    "# Keep only ASCII letters and whitespace, then lower-case and split into words.\n",
+    "with open('sample.txt') as in_file:\n",
+    "    x = re.sub(r'[^A-Za-z\\s]', '', in_file.read()).lower().split()\n",
+    "\n",
+    "\n",
+    "def word_frequency(x):\n",
+    "    \"\"\"Return an OrderedDict mapping each word in x to its count, highest count first.\"\"\"\n",
+    "    counts = {}\n",
+    "    for token in x:\n",
+    "        counts[token] = counts.get(token, 0) + 1\n",
+    "    # Sort ascending by count, then reverse so the most frequent words lead.\n",
+    "    by_count = sorted(counts.items(), key=lambda pair: pair[1])\n",
+    "    return OrderedDict(reversed(by_count))\n",
+    "\n",
+    "\n",
+    "# Flatten the ordered mapping into a list of (word, count) tuples.\n",
+    "for key, value in word_frequency(x).items():\n",
+    "    top_20_list.append((key, value))\n",
+    "\n",
+    "print(top_20_list[:20])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/word.py b/word.py
@@ -0,0 +1,29 @@
+
+import re
+
+def histo_gram(phrase):
+    """Count the lower-cased words of 'sample.txt' as a {word: count} dict.
+
+    `phrase` is not used; it is kept so existing calls keep working.
+    """
+    with open('sample.txt') as in_file:
+        raw = in_file.read()
+    # Strip punctuation but keep all whitespace so tab-separated words
+    # are not glued together before splitting.
+    text = {}
+    for word in re.sub(r'[^\s\w]', '', raw).lower().split():
+        text[word] = text.get(word, 0) + 1
+    return text
+histo_gram(1)  # NOTE(review): return value is discarded here; the call only re-reads sample.txt
+def one_list(text):
+    """Return [word, count] pairs from `text`, highest count first.
+
+    Ties keep the iteration order of `text` because the sort is stable.
+    """
+    new_list = [[key, value] for key, value in text.items()]
+    new_list.sort(key=lambda pair: pair[1], reverse=True)  # sort once
+    return new_list
+
+newer_dict = histo_gram(1)  # word -> count for sample.txt
+newer_list = one_list(newer_dict)  # [word, count] pairs, most frequent first
+print(newer_list[0:20])  # the 20 most frequent words