TuftsIntroDH · katabasist · Apr 29, 2025
diff --git a/lab.ipynb b/lab.ipynb
@@ -14,21 +14,90 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Tokens saved to tokens.txt\n",
+      "\n",
+      "Found 6452 tokens.\n",
+      "\n",
+      "First 20 tokens:\n",
+      "['thus', 'joyful', 'troy', 'maintained', 'the', 'watch', 'of', 'night', 'while', 'fear', 'pale', 'comrade', 'of', 'inglorious', 'flight', 'and', 'heavenbred', 'horror', 'on', 'the']\n"
+     ]
     }
-   },
-   "outputs": [],
+   ],
    "source": [
-    "# your code here"
+    "import re\n",
+    "\n",
+    "def tokenize_text(text):\n",
+    "    \"\"\"Convert text to lowercase and split into words (tokens)\"\"\"\n",
+    "    text = text.lower()  # convert to lowercase\n",
+    "    text = re.sub(r\"[^\\w\\s']\", '', text)  # remove punctuation except apostrophes\n",
+    "    tokens = text.split()  # split into words\n",
+    "    return tokens\n",
+    "\n",
+    "def tokenize_file(input_file, output_file=None):\n",
+    "    \"\"\"Read a text file and return its tokens\"\"\"\n",
+    "    try:\n",
+    "        with open(input_file, 'r') as f:\n",
+    "            text = f.read()\n",
+    "    except FileNotFoundError:\n",
+    "        print(f\"Error: File '{input_file}' not found.\")\n",
+    "        return None\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error reading file: {e}\")\n",
+    "        return None\n",
+    "    \n",
+    "    tokens = tokenize_text(text)\n",
+    "    \n",
+    "    if output_file:\n",
+    "        try:\n",
+    "            with open(output_file, 'w') as f:\n",
+    "                f.write('\\n'.join(tokens))\n",
+    "            print(f\"Tokens saved to {output_file}\")\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error writing to output file: {e}\")\n",
+    "    \n",
+    "    return tokens\n",
+    "\n",
+    "\n",
+    "# Set your input file path here\n",
+    "input_filename = \"book_9.txt\"  \n",
+    "output_filename = \"tokens.txt\"  \n",
+    "\n",
+    "\n",
+    "# Tokenize the file\n",
+    "tokens = tokenize_file(input_filename, output_filename)\n",
+    "\n",
+    "# Print results\n",
+    "if tokens is not None:\n",
+    "    print(f\"\\nFound {len(tokens)} tokens.\")\n",
+    "    print(\"\\nFirst 20 tokens:\")\n",
+    "    print(tokens[:20])"
    ]
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.1"
   }
  },
  "nbformat": 4,