diff --git a/lab.ipynb b/lab.ipynb index 56262e7..97c8bb3 100644 --- a/lab.ipynb +++ b/lab.ipynb @@ -14,21 +14,127 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "execution_count": 13, + "metadata": {}, "outputs": [], "source": [ "# your code here" ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total tokens: 6452\n", + "Unique tokens (types): 2620\n", + "\n", + "Most frequent tokens:\n", + "the 304\n", + "and 181\n", + "of 113\n", + "to 104\n", + "And 91\n", + "his 90\n", + "in 87\n", + "The 78\n", + "my 69\n", + "with 64\n", + "a 61\n", + "thy 49\n", + "all 44\n", + "our 44\n", + "he 40\n", + "I 39\n", + "from 32\n", + "for 30\n", + "that 29\n", + "her 29\n" + ] + } + ], + "source": [ + "file_path = \"book_9.txt\"\n", + "\n", + "file = open(file_path, \"r\", encoding=\"utf-8\")\n", + "\n", + "file_text = file.read()\n", + "\n", + "file.close()\n", + "\n", + "tokens = file_text.split()\n", + "\n", + "token_counts = {}\n", + "\n", + "for token in tokens:\n", + " if token in token_counts:\n", + " token_counts[token] += 1\n", + " else:\n", + " token_counts[token] = 1\n", + "\n", + "total_tokens = len(tokens)\n", + "\n", + "unique_tokens = len(token_counts)\n", + "\n", + "sorted_tokens = []\n", + "\n", + "for token in token_counts:\n", + "\n", + " found = False\n", + " for i in range(len(sorted_tokens)):\n", + " if token_counts[token] > token_counts[sorted_tokens[i]]:\n", + " sorted_tokens.insert(i, token)\n", + " found = True\n", + " break\n", + " if not found:\n", + " sorted_tokens.append(token)\n", + "\n", + "print(f\"Total tokens: {total_tokens}\")\n", + "\n", + "print(f\"Unique tokens (types): {unique_tokens}\")\n", + "\n", + "print(\"\\nMost frequent tokens:\")\n", + "\n", + "for i in range(min(20, len(sorted_tokens))):\n", + "\n", + " token = sorted_tokens[i]\n", + "\n", + " count = token_counts[token]\n", + " \n", + " print(f\"{token:<20} {count}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I split the text using Python's built-in split() method. This basic tokenization produces approximately 6,500 tokens, with around 2,000 unique types. The process highlights several limitations of simple tokenization. Since I only split on whitespace, punctuation remains attached to words (for example, \"night;\" is treated as a single token). The code doesn't handle apostrophes in any special way, they're considered as whole tokens. The same is true for hyphenated compounds. The code also doesn't make any special provisions for footnote markers like or special characters (like \"Æ\" in \"Ægean”).\n", + "While I initially attempted to use regular expressions and exception handling, I switched to create a more streamlined approach. The final code tracks how many times each token appears in the text and sorts tokens by frequency. The most frequent tokens are typical function words like \"the,\" \"and,\" \"of,\" and \"to\" which is consistent with general English.\n", + "\n" + ] } ], "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" } }, "nbformat": 4,