from collections import Counter


def read_text(path, encoding="utf-8"):
    """Return the entire contents of the text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed even if
    reading raises.
    """
    with open(path, "r", encoding=encoding) as handle:
        return handle.read()


def count_tokens(tokens):
    """Map each distinct token to the number of times it occurs.

    Keys appear in first-seen order. A plain ``dict`` is returned so that a
    lookup of an unknown token still raises ``KeyError``.
    """
    return dict(Counter(tokens))


def rank_by_frequency(token_counts):
    """Return the tokens ordered from most to least frequent.

    ``sorted`` is stable (also with ``reverse=True``), so tokens with equal
    counts keep the order in which they appear in ``token_counts``.
    """
    return sorted(token_counts, key=token_counts.get, reverse=True)


# In a Jupyter kernel ``__name__`` is "__main__", so this branch runs whenever
# the cell is executed; the guard only keeps the helpers importable without
# touching the file system. Names assigned here are still module-level.
if __name__ == "__main__":
    file_path = "book_9.txt"

    file_text = read_text(file_path)

    # Whitespace-only tokenization: punctuation stays attached to words and
    # case is not folded.
    tokens = file_text.split()

    token_counts = count_tokens(tokens)

    total_tokens = len(tokens)
    unique_tokens = len(token_counts)

    sorted_tokens = rank_by_frequency(token_counts)
"print(f\"Total tokens: {total_tokens}\")\n", + "\n", + "print(f\"Unique tokens (types): {unique_tokens}\")\n", + "\n", + "print(\"\\nMost frequent tokens:\")\n", + "\n", + "for i in range(min(20, len(sorted_tokens))):\n", + "\n", + " token = sorted_tokens[i]\n", + "\n", + " count = token_counts[token]\n", + " \n", + " print(f\"{token:<20} {count}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I split the text using Python's built-in split() method. This basic tokenization produces 6,452 tokens, of which 2,620 are unique types. The process highlights several limitations of simple tokenization. Since I only split on whitespace, punctuation remains attached to words (for example, \"night;\" is treated as a single token). The code doesn't handle apostrophes in any special way; words containing them are kept as whole tokens. The same is true for hyphenated compounds. The code also doesn't make any special provisions for footnote markers or special characters (like \"Æ\" in \"Ægean\").\n", + "While I initially attempted to use regular expressions and exception handling, I switched to a more streamlined approach. The final code tracks how many times each token appears in the text and sorts tokens by frequency. The most frequent tokens are typical function words like \"the,\" \"and,\" \"of,\" and \"to,\" which is consistent with general English.\n", + "\n" + ] } ], "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" } }, "nbformat": 4,