Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 77 additions & 8 deletions lab.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,90 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "plaintext"
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tokens saved to tokens.txt\n",
"\n",
"Found 6452 tokens.\n",
"\n",
"First 20 tokens:\n",
"['thus', 'joyful', 'troy', 'maintained', 'the', 'watch', 'of', 'night', 'while', 'fear', 'pale', 'comrade', 'of', 'inglorious', 'flight', 'and', 'heavenbred', 'horror', 'on', 'the']\n"
]
}
},
"outputs": [],
],
"source": [
"# your code here"
"import re\n",
"\n",
"def tokenize_text(text):\n",
" \"\"\"Convert text to lowercase and split into words (tokens)\"\"\"\n",
" text = text.lower() # convert to lowercase\n",
" text = re.sub(r\"[^\\w\\s']\", '', text) # remove punctuation except apostrophes\n",
" tokens = text.split() # split into words\n",
" return tokens\n",
"\n",
"def tokenize_file(input_file, output_file=None):\n",
" \"\"\"Read a text file and return its tokens\"\"\"\n",
" try:\n",
" with open(input_file, 'r') as f:\n",
" text = f.read()\n",
" except FileNotFoundError:\n",
" print(f\"Error: File '{input_file}' not found.\")\n",
" return None\n",
" except Exception as e:\n",
" print(f\"Error reading file: {e}\")\n",
" return None\n",
" \n",
" tokens = tokenize_text(text)\n",
" \n",
" if output_file:\n",
" try:\n",
" with open(output_file, 'w') as f:\n",
" f.write('\\n'.join(tokens))\n",
" print(f\"Tokens saved to {output_file}\")\n",
" except Exception as e:\n",
" print(f\"Error writing to output file: {e}\")\n",
" \n",
" return tokens\n",
"\n",
"\n",
"# Set your input file path here\n",
"input_filename = \"book_9.txt\" \n",
"output_filename = \"tokens.txt\" \n",
"\n",
"\n",
"# Tokenize the file\n",
"tokens = tokenize_file(input_filename, output_filename)\n",
"\n",
"# Print results\n",
"if tokens is not None:\n",
" print(f\"\\nFound {len(tokens)} tokens.\")\n",
" print(\"\\nFirst 20 tokens:\")\n",
" print(tokens[:20])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
Expand Down
Loading