TuftsIntroDH · lnguyen2693 · Mar 10, 2025
diff --git a/lab.ipynb b/lab.ipynb
@@ -12,23 +12,79 @@
     "- How many tokens does your tokenization method obtain? How many types? (Remember what tokens and types are?)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Difficulties and experimentation:\n",
+    "- I experimented with different regexes and how they affect my output when I use them with `re.findall` and `re.split`\n",
+    "- I also tried reformatting the code and moving it around to make it cleaner\n",
+    "\n",
+    "My tokenization method obtains **4459 tokens** and **1930 types**"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "execution_count": 43,
+   "metadata": {},
    "outputs": [],
    "source": [
-    "# your code here"
+    "import re\n",
+    "\n",
+    "# Token delimiters: runs of whitespace, '.', ',', '!', ';' and curly double quotes.\n",
+    "word_regex = re.compile(r\"[\\s\\.,!;”“]+\")\n",
+    "\n",
+    "with open('book_9.txt', 'r') as f:\n",
+    "    lines = f.read().splitlines()\n",
+    "\n",
+    "stopwords = [\"i\", \"me\", \"my\", \"myself\", \"we\", \"our\", \"ours\", \"ourselves\", \"you\", \"your\", \n",
+    "            \"yours\", \"yourself\", \"yourselves\", \"he\", \"him\", \"his\", \"himself\", \"she\", \"her\", \n",
+    "            \"hers\", \"herself\", \"it\", \"its\", \"itself\", \"they\", \"them\", \"their\", \"theirs\", \n",
+    "            \"themselves\", \"what\", \"which\", \"who\", \"whom\", \"this\", \"that\", \"these\", \"those\", \n",
+    "            \"am\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\", \"has\", \"had\", \n",
+    "            \"having\", \"do\", \"does\", \"did\", \"doing\", \"a\", \"an\", \"the\", \"and\", \"but\", \"if\", \"or\", \n",
+    "            \"because\", \"as\", \"until\", \"while\", \"of\", \"at\", \"by\", \"for\", \"with\", \"about\", \"against\", \n",
+    "            \"between\", \"into\", \"through\", \"during\", \"before\", \"after\", \"above\", \"below\", \"to\", \n",
+    "            \"from\", \"up\", \"down\", \"in\", \"out\", \"on\", \"off\", \"over\", \"under\", \"again\", \"further\", \n",
+    "            \"then\", \"once\", \"here\", \"there\", \"when\", \"where\", \"why\", \"how\", \"all\", \"any\", \"both\", \n",
+    "            \"each\", \"few\", \"more\", \"most\", \"other\", \"some\", \"such\", \"no\", \"nor\", \"not\", \"only\", \n",
+    "            \"own\", \"same\", \"so\", \"than\", \"too\", \"very\", \"s\", \"t\", \"can\", \"will\", \"just\", \"don\", \"should\", \"now\"]\n",
+    "\n",
+    "# Lowercase each token BEFORE the stopword test: the stopword list is all lowercase, so\n",
+    "# testing the raw token let capitalized stopwords ('The', 'And', 'I') slip into the counts.\n",
+    "word_lines = [[lw for lw in (w.lower() for w in word_regex.split(l)) if lw != '' and lw not in stopwords] for l in lines]\n",
+    "\n",
+    "wcount = 0   # total number of tokens kept\n",
+    "tcount = {}  # type -> number of occurrences\n",
+    "\n",
+    "for l in word_lines:\n",
+    "    for w in l:\n",
+    "        tcount[w] = tcount.get(w, 0) + 1\n",
+    "        wcount += 1\n",
+    "\n",
+    "# NOTE: token/type totals computed before the stopword fix above are stale; re-run to refresh.\n",
+    "# print(len(tcount), wcount)\n",
+    "# print(tcount)"
    ]
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.1"
   }
  },
  "nbformat": 4,