From b6744b603c9b6951b6db86a29e590110d355d44a Mon Sep 17 00:00:00 2001
From: Linh Nguyen <137933787+lnguyen2693@users.noreply.github.com>
Date: Mon, 10 Mar 2025 14:49:28 +0000
Subject: [PATCH] hw

---
 lab.ipynb | 72 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 64 insertions(+), 8 deletions(-)

diff --git a/lab.ipynb b/lab.ipynb
index 56262e7..7793cca 100644
--- a/lab.ipynb
+++ b/lab.ipynb
@@ -12,23 +12,79 @@
     "- How many tokens does your tokenization method obtain? How many types? (Remember what tokens and types are?)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Difficulties and experimentation:\n",
+    "- I experimented with different regexes and how they affect my output when I use them with `re.findall` and `re.split`.\n",
+    "- I also tried reformatting the code and moving it around to make it look cleaner.\n",
+    "\n",
+    "My tokenization method obtains **4459 tokens** and **1930 types**. (TODO: these counts were recorded before the stopword filter was made case-insensitive; re-run the cell below and update them.)"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "execution_count": 43,
+   "metadata": {},
    "outputs": [],
    "source": [
-    "# your code here"
+    "import re\n",
+    "from collections import Counter\n",
+    "\n",
+    "# Delimiters: whitespace . , ! ; ” “ -- NOTE(review): ? : ( ) and apostrophes stay attached to tokens.\n",
+    "word_regex = re.compile(r\"[\\s\\.,!;”“]+\")\n",
+    "\n",
+    "# Decode explicitly instead of relying on the platform default; assumes book_9.txt is UTF-8 -- TODO confirm.\n",
+    "with open('book_9.txt', 'r', encoding='utf-8') as f:\n",
+    "    lines = f.read().splitlines()\n",
+    "\n",
+    "# Lowercase words that are excluded from the token and type counts.\n",
+    "stopwords = [\"i\", \"me\", \"my\", \"myself\", \"we\", \"our\", \"ours\", \"ourselves\", \"you\", \"your\",\n",
+    "             \"yours\", \"yourself\", \"yourselves\", \"he\", \"him\", \"his\", \"himself\", \"she\", \"her\",\n",
+    "             \"hers\", \"herself\", \"it\", \"its\", \"itself\", \"they\", \"them\", \"their\", \"theirs\",\n",
+    "             \"themselves\", \"what\", \"which\", \"who\", \"whom\", \"this\", \"that\", \"these\", \"those\",\n",
+    "             \"am\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\", \"has\", \"had\",\n",
+    "             \"having\", \"do\", \"does\", \"did\", \"doing\", \"a\", \"an\", \"the\", \"and\", \"but\", \"if\", \"or\",\n",
+    "             \"because\", \"as\", \"until\", \"while\", \"of\", \"at\", \"by\", \"for\", \"with\", \"about\", \"against\",\n",
+    "             \"between\", \"into\", \"through\", \"during\", \"before\", \"after\", \"above\", \"below\", \"to\",\n",
+    "             \"from\", \"up\", \"down\", \"in\", \"out\", \"on\", \"off\", \"over\", \"under\", \"again\", \"further\",\n",
+    "             \"then\", \"once\", \"here\", \"there\", \"when\", \"where\", \"why\", \"how\", \"all\", \"any\", \"both\",\n",
+    "             \"each\", \"few\", \"more\", \"most\", \"other\", \"some\", \"such\", \"no\", \"nor\", \"not\", \"only\",\n",
+    "             \"own\", \"same\", \"so\", \"than\", \"too\", \"very\", \"s\", \"t\", \"can\", \"will\", \"just\", \"don\", \"should\", \"now\"]\n",
+    "stopword_set = frozenset(stopwords)  # constant-time membership tests\n",
+    "\n",
+    "# Lowercase BEFORE filtering so capitalized stopwords (\"The\", \"He\") are removed too.\n",
+    "word_lines = [\n",
+    "    [w for w in word_regex.split(line.lower()) if w and w not in stopword_set]\n",
+    "    for line in lines\n",
+    "]\n",
+    "\n",
+    "# tcount maps each type to its frequency; wcount is the total number of tokens.\n",
+    "tcount = Counter(w for words in word_lines for w in words)\n",
+    "wcount = sum(tcount.values())\n",
+    "\n",
+    "print(f\"{wcount} tokens, {len(tcount)} types\")"
    ]
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.1"
   }
  },
  "nbformat": 4,