Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 64 additions & 8 deletions lab.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,79 @@
"- How many tokens does your tokenization method obtain? How many types? (Remember what tokens and types are?)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Difficulties and experimentation:\n",
"- I experimented with different regexes and how they affect my output when I use them with `re.findall` and `re.split`\n",
"- I also tried reformatting the code and moving it around to make it look cleaner\n",
"\n",
"My tokenization method obtains **4459 tokens** and **1930 types**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
}
},
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# your code here"
"import re\n",
"\n",
"# Token delimiters: any run of whitespace, periods, commas, exclamation marks,\n",
"# semicolons or curly double quotes.\n",
"word_regex = re.compile(r\"[\\s\\.,!;”“]+\")\n",
"\n",
"# NOTE(review): assumes book_9.txt is UTF-8 encoded (the regex targets curly quotes,\n",
"# so the text is not plain ASCII); adjust `encoding` if the file was saved differently.\n",
"with open('book_9.txt', 'r', encoding='utf-8') as f:\n",
"    lines = f.read().splitlines()\n",
"\n",
"stopwords = [\"i\", \"me\", \"my\", \"myself\", \"we\", \"our\", \"ours\", \"ourselves\", \"you\", \"your\",\n",
"    \"yours\", \"yourself\", \"yourselves\", \"he\", \"him\", \"his\", \"himself\", \"she\", \"her\",\n",
"    \"hers\", \"herself\", \"it\", \"its\", \"itself\", \"they\", \"them\", \"their\", \"theirs\",\n",
"    \"themselves\", \"what\", \"which\", \"who\", \"whom\", \"this\", \"that\", \"these\", \"those\",\n",
"    \"am\", \"is\", \"are\", \"was\", \"were\", \"be\", \"been\", \"being\", \"have\", \"has\", \"had\",\n",
"    \"having\", \"do\", \"does\", \"did\", \"doing\", \"a\", \"an\", \"the\", \"and\", \"but\", \"if\", \"or\",\n",
"    \"because\", \"as\", \"until\", \"while\", \"of\", \"at\", \"by\", \"for\", \"with\", \"about\", \"against\",\n",
"    \"between\", \"into\", \"through\", \"during\", \"before\", \"after\", \"above\", \"below\", \"to\",\n",
"    \"from\", \"up\", \"down\", \"in\", \"out\", \"on\", \"off\", \"over\", \"under\", \"again\", \"further\",\n",
"    \"then\", \"once\", \"here\", \"there\", \"when\", \"where\", \"why\", \"how\", \"all\", \"any\", \"both\",\n",
"    \"each\", \"few\", \"more\", \"most\", \"other\", \"some\", \"such\", \"no\", \"nor\", \"not\", \"only\",\n",
"    \"own\", \"same\", \"so\", \"than\", \"too\", \"very\", \"s\", \"t\", \"can\", \"will\", \"just\", \"don\", \"should\", \"now\"]\n",
"\n",
"# Frozen copy for constant-time membership tests; `stopwords` itself stays a list.\n",
"stopword_set = frozenset(stopwords)\n",
"\n",
"# One list of tokens per input line. Each piece is lowercased BEFORE the stopword\n",
"# test: the stopword list is all lowercase, so testing the raw piece would let\n",
"# capitalised stopwords (The, And, He, ...) slip through and inflate the counts.\n",
"word_lines = [\n",
"    [\n",
"        token\n",
"        for token in (piece.lower() for piece in word_regex.split(line))\n",
"        if token != '' and token not in stopword_set\n",
"    ]\n",
"    for line in lines\n",
"]\n",
"\n",
"# tcount maps each type (distinct token) to its frequency.\n",
"tcount = {}\n",
"for tokens in word_lines:\n",
"    for token in tokens:\n",
"        tcount[token] = tcount.get(token, 0) + 1\n",
"\n",
"# wcount is the total number of tokens, i.e. the sum of all type frequencies.\n",
"wcount = sum(tcount.values())\n",
"\n",
"# Display (number of tokens, number of types) as the cell output.\n",
"wcount, len(tcount)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
Expand Down