From a0c90776eeb80d84b935ac76b0b734156b65dc5d Mon Sep 17 00:00:00 2001
From: ckean02 <catie.kean@tufts.edu>
Date: Tue, 11 Mar 2025 03:58:47 +0000
Subject: [PATCH] HW

---
 lab.ipynb | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 113 insertions(+), 7 deletions(-)

diff --git a/lab.ipynb b/lab.ipynb
index 56262e7..97c8bb3 100644
--- a/lab.ipynb
+++ b/lab.ipynb
@@ -14,21 +14,127 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "execution_count": 13,
+   "metadata": {},
    "outputs": [],
    "source": [
     "# your code here"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total tokens: 6452\n",
+      "Unique tokens (types): 2620\n",
+      "\n",
+      "Most frequent tokens:\n",
+      "the                  304\n",
+      "and                  181\n",
+      "of                   113\n",
+      "to                   104\n",
+      "And                  91\n",
+      "his                  90\n",
+      "in                   87\n",
+      "The                  78\n",
+      "my                   69\n",
+      "with                 64\n",
+      "a                    61\n",
+      "thy                  49\n",
+      "all                  44\n",
+      "our                  44\n",
+      "he                   40\n",
+      "I                    39\n",
+      "from                 32\n",
+      "for                  30\n",
+      "that                 29\n",
+      "her                  29\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Token and type counts for a plain-text file.\n",
+    "#\n",
+    "# Tokenization is whitespace-only, so tokens are case-sensitive and keep\n",
+    "# any attached punctuation (\"night;\" is a single token).\n",
+    "#\n",
+    "# Prints the total number of tokens, the number of distinct tokens\n",
+    "# (types), and the top_n most frequent tokens with their counts.\n",
+    "\n",
+    "file_path = \"book_9.txt\"\n",
+    "top_n = 20  # how many of the most frequent tokens to print\n",
+    "\n",
+    "# The context manager closes the file even if read() raises.\n",
+    "with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "    file_text = file.read()\n",
+    "\n",
+    "# split() with no arguments splits on runs of whitespace and never\n",
+    "# yields empty strings.\n",
+    "tokens = file_text.split()\n",
+    "\n",
+    "# Map each distinct token to the number of times it occurs.\n",
+    "token_counts = {}\n",
+    "\n",
+    "for token in tokens:\n",
+    "    token_counts[token] = token_counts.get(token, 0) + 1\n",
+    "\n",
+    "# Tokens are occurrences; types are distinct tokens.\n",
+    "total_tokens = len(tokens)\n",
+    "unique_tokens = len(token_counts)\n",
+    "\n",
+    "# Sanity check: every occurrence was counted exactly once.\n",
+    "assert sum(token_counts.values()) == total_tokens\n",
+    "\n",
+    "# Order tokens by descending count. sorted() is stable (also with\n",
+    "# reverse=True), so tokens with equal counts stay in first-occurrence\n",
+    "# order, which is the order the dict iterates in.\n",
+    "sorted_tokens = sorted(token_counts, key=token_counts.get, reverse=True)\n",
+    "\n",
+    "print(f\"Total tokens: {total_tokens}\")\n",
+    "\n",
+    "print(f\"Unique tokens (types): {unique_tokens}\")\n",
+    "\n",
+    "print(\"\\nMost frequent tokens:\")\n",
+    "\n",
+    "# Slicing copes with texts that have fewer than top_n distinct tokens.\n",
+    "for token in sorted_tokens[:top_n]:\n",
+    "    count = token_counts[token]\n",
+    "    # Left-align the token in a 20-character column, then the count.\n",
+    "    print(f\"{token:<20} {count}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "I split the text using Python's built-in split() method. This basic tokenization produces approximately 6,500 tokens, with around 2,600 unique types. The process highlights several limitations of simple tokenization. Since I only split on whitespace, punctuation remains attached to words (for example, \"night;\" is treated as a single token). The code doesn't handle apostrophes in any special way; words containing them are kept as whole tokens. The same is true for hyphenated compounds. The code also doesn't make any special provisions for footnote markers or special characters (like \"Æ\" in \"Ægean\").\n",
+    "While I initially attempted to use regular expressions and exception handling, I switched to a more streamlined approach. The final code tracks how many times each token appears in the text and sorts tokens by frequency. The most frequent tokens are typical function words like \"the,\" \"and,\" \"of,\" and \"to,\" which is consistent with general English.\n",
+    "\n"
+   ]
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.1"
   }
  },
  "nbformat": 4,