HW #3

356 changes: 355 additions & 1 deletion tf-idf.ipynb
@@ -1 +1,355 @@

{
"cells": [
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: lxml in /home/codespace/.python/current/lib/python3.12/site-packages (5.3.1)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install lxml"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: nltk in /home/codespace/.python/current/lib/python3.12/site-packages (3.9.1)\n",
"Requirement already satisfied: click in /home/codespace/.python/current/lib/python3.12/site-packages (from nltk) (8.1.8)\n",
"Requirement already satisfied: joblib in /home/codespace/.local/lib/python3.12/site-packages (from nltk) (1.4.2)\n",
"Requirement already satisfied: regex>=2021.8.3 in /home/codespace/.python/current/lib/python3.12/site-packages (from nltk) (2024.11.6)\n",
"Requirement already satisfied: tqdm in /home/codespace/.python/current/lib/python3.12/site-packages (from nltk) (4.67.1)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install nltk"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"from lxml import etree\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"TEI_NS = \"http://www.tei-c.org/ns/1.0\"\n",
"XML_NS = \"http://www.w3.org/XML/1998/namespace\"\n",
"\n",
"NAMESPACES = {\n",
" \"tei\": TEI_NS,\n",
" \"xml\": XML_NS,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/codespace/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package punkt_tab to\n",
"[nltk_data] /home/codespace/nltk_data...\n",
"[nltk_data] Package punkt_tab is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"\n",
"# download the files needed for tokenization\n",
"# the punkt tokenizer should be installed already,\n",
"# but let's download it just in case\n",
"nltk.download(\"punkt\")\n",
"nltk.download(\"punkt_tab\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"files = Path(\"./xml/tlg0012\").glob(\"**/*perseus-eng*.xml\")"
]
},
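{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity-check sketch (assumes the ./xml/tlg0012 folder from the\n",
"# assignment is present): the `**` segment makes glob() recurse into\n",
"# subdirectories, so this should list the English (\"perseus-eng\") files.\n",
"# Note that glob() returns a one-shot generator, so re-create `files`\n",
"# before re-running the parsing cell below.\n",
"print([str(f) for f in Path(\"./xml/tlg0012\").glob(\"**/*perseus-eng*.xml\")])"
]
},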
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File successfully included!\n"
]
}
],
"source": [
"from collections import Counter\n",
"\n",
"# Using our `tokenized_texts` dictionary, we'll iterate\n",
"# through each key-value pair — remember, the keys are\n",
"# filenames and the values are lists of tokens.\n",
"# We'll get a count of the tokens by passing the list to\n",
"# `Counter`, then we'll change the value for that key to\n",
"# a dictionary with its own keys, `tokens` and `counts`.\n",
"\n",
"# for filename, tokens in tokenized_texts.items():\n",
"# counts = Counter(tokens)\n",
"\n",
"# tokenized_texts[filename] = {\"tokens\": tokens, \"counts\": counts}\n",
"# tokenized_texts[\"tlg0012.tlg001.perseus-eng3.txt\"][\"counts\"][\"odysseus\"]\n",
"\n",
"# Create a new dictionary to store the updated structure\n",
"updated_tokenized_texts = {}\n",
"\n",
"filename = \"tlg0012.tlg001.perseus-eng3.txt\"\n",
"tokens = [\"odysseus\", \"zeus\", \"odysseus\", \"athena\"] # Example tokenized words\n",
"\n",
"# Add it to tokenized_texts\n",
"#tokenized_texts[filename] = tokens\n",
"\n",
"\n",
"updated_tokenized_texts = {}\n",
"\n",
"#\n",
"\n",
"tokenized_texts = updated_tokenized_texts # Update dictionary\n",
"\n",
"# Check if the file is now included\n",
"#\n",
"\n"
]
},
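{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A quick illustrative sketch (the toy token list is made up) of how\n",
"# `Counter` behaves, since the cells below rely on it: `in` tells us\n",
"# whether a term occurs at all, and looking up a missing key returns 0\n",
"# instead of raising a KeyError.\n",
"from collections import Counter\n",
"\n",
"toy_counts = Counter([\"odysseus\", \"zeus\", \"odysseus\", \"athena\"])\n",
"\n",
"print(toy_counts[\"odysseus\"])    # 2\n",
"print(toy_counts[\"achilles\"])    # 0 -- missing keys default to 0\n",
"print(\"achilles\" in toy_counts)  # False -- handy for the DF check below"
]
},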
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"for file in files:\n",
" # print the name of the file as a sanity check\n",
" print(file)\n",
" \n",
" # etree.parse() reads the file and turns the raw XML into an object that we can use in Python\n",
" tree = etree.parse(file)\n",
"\n",
" # xpath() is a method that applies **xpath expressions** to search through the XML.\n",
" # This xpath expression says, \"Find any `tei:div` element with a `subtype` of `'card'`.\n",
" # Under that element, get any text.\" The second argument, `namespaces=`, tells the\n",
" # method to use the supplied namespaces as shortcuts, so we don't have to type out\n",
" # \"http://www.tei-c.org/ns/1.0\" every time we want an element in the TEI namespace.\n",
"\n",
" text = tree.xpath(f\"//tei:div[@subtype='card']//text()\", namespaces=NAMESPACES)\n",
"\n",
" # xpath() returns an array of matches, so we initialize an empty array to store the\n",
" # results. We could use a list comprehension, but for now rewriting these\n",
" # lines as a list comprehension is left as an exercise for the reader.\n",
" cleaned_text = []\n",
"\n",
" # Now we iterate through each string returned by `xpath()`\n",
" for t in text:\n",
" # `strip()` removes leading and trailing whitespace; if all that's left is an empty\n",
" # string, we don't care about it.\n",
" if t.strip() != \"\":\n",
" cleaned_text.append(t.strip())\n",
"\n",
" # We make sure that we actually *have* text before writing just the text, without\n",
" # TEI elements, to a separate file. No need to write an empty file, right?\n",
" if len(cleaned_text) > 0:\n",
" # A lot is happening here:\n",
" #\n",
" # 1. `str(file)` turns the `Path` object into a `str`\n",
" # 2. `split(\"/\")` splits the resulting string at every \"/\"\n",
" # 3. `[-1]` takes the last element of the list returned by `split(\"/\")`\n",
" # 4. `replace(\".xml\", \".txt\")` changes the extension of the file\n",
" # \n",
" # So something like \"xml/tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\"\n",
" # is transformed into \"tlg0012.tlg001.perseus-eng3.txt\".\n",
" with open(str(file).split(\"/\")[-1].replace(\".xml\", \".txt\"), \"w+\") as f:\n",
" # We write the text to the file, `join`-ing each element in\n",
" # `cleaned_text` with a newline (\"\\n\")\n",
" f.write(\"\\n\".join(cleaned_text))"
]
},
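{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal, self-contained sketch of the same xpath-plus-namespaces idea\n",
"# used above (the sample XML below is made up, not from the Perseus files):\n",
"# the `tei:` prefix in the expression is resolved through the NAMESPACES\n",
"# dictionary defined earlier, because the elements live in the default\n",
"# TEI namespace.\n",
"sample = f'''<TEI xmlns=\"{TEI_NS}\">\n",
"  <text><body>\n",
"    <div subtype=\"card\"><l>Sing, O goddess, the anger of Achilles</l></div>\n",
"    <div subtype=\"note\"><p>not a card, so not matched</p></div>\n",
"  </body></text>\n",
"</TEI>'''\n",
"\n",
"sample_tree = etree.fromstring(sample)\n",
"sample_text = sample_tree.xpath(\"//tei:div[@subtype='card']//text()\", namespaces=NAMESPACES)\n",
"print([t.strip() for t in sample_text if t.strip()])"
]
},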
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Initialize the tokenizer\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"# Initialize an empty dictionary to store the tokenized texts\n",
"tokenized_texts = {}\n",
"\n",
"# Get a Path.glob() iterator for the .txt files that you've created in this directory.\n",
"# Can you figure out what the new `[1-4]` segment is doing?\n",
"text_files = Path(\".\").glob(\"tlg0012.tlg00*.perseus-eng[1-4].txt\")\n",
"\n",
"# Iterate through the text files, reading and tokenizing them one by one,\n",
"# then storing the list of tokens in our `tokenized_texts` dictionary —\n",
"# so we'll be getting a dictionary of lists.\n",
"for file in text_files:\n",
" name = str(file)\n",
"\n",
" with open(file) as f:\n",
" # Notice we're lowercasing the text. You don't *have*\n",
" # to do this, but it helps eliminate some noise for\n",
" # our purposes.\n",
" text = f.read().lower()\n",
" tokens = word_tokenize(text)\n",
"\n",
" # Let's just print the length of the tokens list to make\n",
" # sure we're getting sane results. We'll use string interpolation\n",
" # to identify which text we're working with.\n",
" print(f\"There are {len(tokens)} tokens in {name}.\")\n",
"\n",
" # Store each file's `tokens` list in the `tokenized_texts`\n",
" # dictionary, using the filename as the key.\n",
" tokenized_texts[name] = tokens\n"
]
},
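{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A tiny sanity-check sketch (the sentence is made up) showing what\n",
"# `word_tokenize` returns: punctuation becomes its own token, which is\n",
"# one reason the token counts above are larger than a plain word count.\n",
"print(word_tokenize(\"Tell me, O Muse, of that ingenious hero.\".lower()))"
]
},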
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"0\n"
]
}
],
"source": [
"from collections import Counter\n",
"from math import log10\n",
"\n",
"df_achilles = 0\n",
"df_odysseus = 0\n",
"\n",
"# Iterate through the dictionary to count DF values\n",
"for filename, values in tokenized_texts.items():\n",
" if \"odysseus\" in values['counts']:\n",
" df_odysseus += 1\n",
" \n",
" if \"achilles\" in values[\"counts\"]:\n",
" df_achilles += 1\n",
"\n",
"n_docs = len(tokenized_texts.keys())\n",
"\n",
"# Prevent division by zero\n",
"idf_achilles = log10(n_docs / df_achilles) if df_achilles > 0 else 0\n",
"idf_odysseus = log10(n_docs / df_odysseus) if df_odysseus > 0 else 0\n",
"\n",
"print(idf_achilles)\n",
"print(idf_odysseus)\n"
]
},
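{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For reference, the formula the cell above implements, with a worked toy\n",
"# example (the numbers are hypothetical, not taken from our corpus):\n",
"#\n",
"#     idf(t) = log10(N / df(t))\n",
"#\n",
"# where N is the number of documents and df(t) is the number of documents\n",
"# containing the term t. With N = 4 documents and a term in 2 of them:\n",
"print(log10(4 / 2))  # ~0.301"
]
},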
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Now let's calculate the TF-IDF \"score\" for each term in each document.\n",
"\n",
"# Once again, iterate through the dictionary.\n",
"for filename, values in tokenized_texts.items():\n",
" # Get the total number of terms in each file — we'll\n",
" # use this to calculate the relative frequency as our\n",
" # TF.\n",
" total_terms = len(values['tokens'])\n",
"\n",
" # Get the TF for each term in this file.\n",
" tf_achilles = values['counts']['achilles'] / total_terms\n",
" tf_odysseus = values['counts']['odysseus'] / total_terms\n",
"\n",
" # Remember, the simplest version of TF-IDF is just\n",
" # TF * 1/DF\n",
" tf_idf_achilles = tf_achilles * idf_achilles\n",
" tf_idf_odysseus = tf_odysseus * idf_odysseus\n",
"\n",
" # Now we can report on the statistics for this file\n",
" print(f\"\"\"In {filename}:\n",
"TF of achilles: {tf_achilles}\n",
"TF of odysseus: {tf_odysseus}\n",
"TF-IDF of achilles: {tf_idf_achilles}\n",
"TF-IDF of odysseus: {tf_idf_odysseus}\n",
"\"\"\")"
]
},
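{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A worked toy example (all numbers hypothetical) of the TF-IDF score\n",
"# computed above:\n",
"#\n",
"#     tf-idf(t, d) = tf(t, d) * idf(t)\n",
"#\n",
"# where tf(t, d) is the term's relative frequency in document d.\n",
"toy_tf = 50 / 10000      # the term appears 50 times among 10,000 tokens\n",
"toy_idf = log10(4 / 2)   # the term appears in 2 of 4 documents\n",
"print(toy_tf * toy_idf)  # ~0.0015"
]
},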
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}