diff --git a/tf-idf.ipynb b/tf-idf.ipynb
index 8b13789..7be663d 100644
--- a/tf-idf.ipynb
+++ b/tf-idf.ipynb
@@ -1 +1,355 @@
-
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: lxml in /home/codespace/.python/current/lib/python3.12/site-packages (5.3.1)\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install lxml"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: nltk in /home/codespace/.python/current/lib/python3.12/site-packages (3.9.1)\n",
+      "Requirement already satisfied: click in /home/codespace/.python/current/lib/python3.12/site-packages (from nltk) (8.1.8)\n",
+      "Requirement already satisfied: joblib in /home/codespace/.local/lib/python3.12/site-packages (from nltk) (1.4.2)\n",
+      "Requirement already satisfied: regex>=2021.8.3 in /home/codespace/.python/current/lib/python3.12/site-packages (from nltk) (2024.11.6)\n",
+      "Requirement already satisfied: tqdm in /home/codespace/.python/current/lib/python3.12/site-packages (from nltk) (4.67.1)\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install nltk"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from lxml import etree\n",
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "TEI_NS = \"http://www.tei-c.org/ns/1.0\"\n",
+    "XML_NS = \"http://www.w3.org/XML/1998/namespace\"\n",
+    "\n",
+    "NAMESPACES = {\n",
+    "    \"tei\": TEI_NS,\n",
+    "    \"xml\": XML_NS,\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package punkt to /home/codespace/nltk_data...\n",
+      "[nltk_data]   Package punkt is already up-to-date!\n",
+      "[nltk_data] Downloading package punkt_tab to\n",
+      "[nltk_data]     /home/codespace/nltk_data...\n",
+      "[nltk_data]   Package punkt_tab is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "\n",
+    "# download the files needed for tokenization\n",
+    "# the punkt tokenizer should be installed already,\n",
+    "# but let's download it just in case\n",
+    "nltk.download(\"punkt\")\n",
+    "nltk.download(\"punkt_tab\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files = Path(\"./xml/tlg0012\").glob(\"**/*perseus-eng*.xml\")"
+   ]
+  },
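+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A quick sanity check, not part of the original pipeline: parse a tiny,\n",
+    "# made-up TEI fragment (the quoted line of verse is purely illustrative)\n",
+    "# and query it with the same namespace shortcuts defined above. This is\n",
+    "# only a sketch of how the `namespaces=` argument lets the `tei:` prefix\n",
+    "# stand in for the full TEI namespace URI.\n",
+    "sample = b\"\"\"<TEI xmlns=\"http://www.tei-c.org/ns/1.0\">\n",
+    "  <text><body>\n",
+    "    <div subtype=\"card\"><l>Sing, O goddess, the anger of Achilles</l></div>\n",
+    "  </body></text>\n",
+    "</TEI>\"\"\"\n",
+    "\n",
+    "root = etree.fromstring(sample)\n",
+    "\n",
+    "# Expect the line of verse back (plus any whitespace-only text nodes)\n",
+    "root.xpath(\"//tei:div[@subtype='card']//text()\", namespaces=NAMESPACES)"
+   ]
+  },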
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for file in files:\n",
+    "    # print the name of the file as a sanity check\n",
+    "    print(file)\n",
+    "\n",
+    "    # etree.parse() reads the file and turns the raw XML into an object that we can use in Python\n",
+    "    tree = etree.parse(file)\n",
+    "\n",
+    "    # xpath() is a method that applies **xpath expressions** to search through the XML.\n",
+    "    # This xpath expression says, \"Find any `tei:div` element with a `subtype` of `'card'`.\n",
+    "    # Under that element, get any text.\" The second argument, `namespaces=`, tells the\n",
+    "    # method to use the supplied namespaces as shortcuts, so we don't have to type out\n",
+    "    # \"http://www.tei-c.org/ns/1.0\" every time we want an element in the TEI namespace.\n",
+    "    text = tree.xpath(\"//tei:div[@subtype='card']//text()\", namespaces=NAMESPACES)\n",
+    "\n",
+    "    # xpath() returns a list of matches, so we initialize an empty list to store the\n",
+    "    # results. We could use a list comprehension, but for now rewriting these\n",
+    "    # lines as a list comprehension is left as an exercise for the reader.\n",
+    "    cleaned_text = []\n",
+    "\n",
+    "    # Now we iterate through each string returned by `xpath()`\n",
+    "    for t in text:\n",
+    "        # `strip()` removes leading and trailing whitespace; if all that's left is an empty\n",
+    "        # string, we don't care about it.\n",
+    "        if t.strip() != \"\":\n",
+    "            cleaned_text.append(t.strip())\n",
+    "\n",
+    "    # We make sure that we actually *have* text before writing just the text, without\n",
+    "    # TEI elements, to a separate file. No need to write an empty file, right?\n",
+    "    if len(cleaned_text) > 0:\n",
+    "        # A lot is happening here:\n",
+    "        #\n",
+    "        # 1. `str(file)` turns the `Path` object into a `str`\n",
+    "        # 2. `split(\"/\")` splits the resulting string at every \"/\"\n",
+    "        # 3. `[-1]` takes the last element of the list returned by `split(\"/\")`\n",
+    "        # 4. `replace(\".xml\", \".txt\")` changes the extension of the file\n",
+    "        #\n",
+    "        # So something like \"xml/tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\"\n",
+    "        # is transformed into \"tlg0012.tlg001.perseus-eng3.txt\".\n",
+    "        with open(str(file).split(\"/\")[-1].replace(\".xml\", \".txt\"), \"w\") as f:\n",
+    "            # We write the text to the file, `join`-ing each element in\n",
+    "            # `cleaned_text` with a newline (\"\\n\")\n",
+    "            f.write(\"\\n\".join(cleaned_text))"
+   ]
+  },
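+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# One possible answer to the list-comprehension exercise mentioned in the\n",
+    "# previous cell. This is a sketch on made-up sample strings, not the\n",
+    "# notebook's canonical solution; with the real `text` list returned by\n",
+    "# xpath(), only the comprehension line itself is needed.\n",
+    "text = [\"  Sing, O goddess  \", \"\\n\", \"\", \"the anger of Achilles\\n\"]\n",
+    "\n",
+    "cleaned_text = [t.strip() for t in text if t.strip() != \"\"]\n",
+    "cleaned_text"
+   ]
+  },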
`replace(\".xml\", \".txt\")` changes the extension of the file\n", + " # \n", + " # So something like \"xml/tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\"\n", + " # is transformed into \"tlg0012.tlg001.perseus-eng3.txt\".\n", + " with open(str(file).split(\"/\")[-1].replace(\".xml\", \".txt\"), \"w+\") as f:\n", + " # We write the text to the file, `join`-ing each element in\n", + " # `cleaned_text` with a newline (\"\\n\")\n", + " f.write(\"\\n\".join(cleaned_text))" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the tokenizer\n", + "from nltk.tokenize import word_tokenize\n", + "\n", + "# Initialize an empty dictionary to store the tokenized texts\n", + "tokenized_texts = {}\n", + "\n", + "# Get a Path.glob() iterator for the .txt files that you've created in this directory.\n", + "# Can you figure out what the new `[1-4]` segment is doing?\n", + "text_files = Path(\".\").glob(\"tlg0012.tlg00*.perseus-eng[1-4].txt\")\n", + "\n", + "# Iterate through the text files, reading and tokenizing them one by one,\n", + "# then storing the list of tokens in our `tokenized_texts` dictionary —\n", + "# so we'll be getting a dictionary of lists.\n", + "for file in text_files:\n", + " name = str(file)\n", + "\n", + " with open(file) as f:\n", + " # Notice we're lowercasing the text. You don't *have*\n", + " # to do this, but it helps eliminate some noise for\n", + " # our purposes.\n", + " text = f.read().lower()\n", + " tokens = word_tokenize(text)\n", + "\n", + " # Let's just print the length of the tokens list to make\n", + " # sure we're getting sane results. We'll use string interpolation\n", + " # to identify which text we're working with.\n", + " print(f\"There are {len(tokens)} tokens in {name}.\")\n", + "\n", + " # Store each file's `tokens` list in the `tokenized_texts`\n", + " # dictionary, using the filename as the key.\n", + " tokenized_texts[name] = tokens\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n", + "0\n" + ] + } + ], + "source": [ + "from collections import Counter\n", + "from math import log10\n", + "\n", + "df_achilles = 0\n", + "df_odysseus = 0\n", + "\n", + "# Iterate through the dictionary to count DF values\n", + "for filename, values in tokenized_texts.items():\n", + " if \"odysseus\" in values['counts']:\n", + " df_odysseus += 1\n", + " \n", + " if \"achilles\" in values[\"counts\"]:\n", + " df_achilles += 1\n", + "\n", + "n_docs = len(tokenized_texts.keys())\n", + "\n", + "# Prevent division by zero\n", + "idf_achilles = log10(n_docs / df_achilles) if df_achilles > 0 else 0\n", + "idf_odysseus = log10(n_docs / df_odysseus) if df_odysseus > 0 else 0\n", + "\n", + "print(idf_achilles)\n", + "print(idf_odysseus)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "# Now let's calculate the TF-IDF \"score\" for each term in each document.\n", + "\n", + "# Once again, iterate through the dictionary.\n", + "for filename, values in tokenized_texts.items():\n", + " # Get the total number of terms in each file — we'll\n", + " # use this to calculate the relative frequency as our\n", + " # TF.\n", + " total_terms = len(values['tokens'])\n", + "\n", + " # Get the TF for each term in this file.\n", + " tf_achilles = values['counts']['achilles'] / total_terms\n", + " tf_odysseus = values['counts']['odysseus'] / 
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Now let's calculate the TF-IDF \"score\" for each term in each document.\n",
+    "\n",
+    "# Once again, iterate through the dictionary.\n",
+    "for filename, values in tokenized_texts.items():\n",
+    "    # Get the total number of terms in each file — we'll\n",
+    "    # use this to calculate the relative frequency as our\n",
+    "    # TF.\n",
+    "    total_terms = len(values[\"tokens\"])\n",
+    "\n",
+    "    # Get the TF for each term in this file. (A Counter returns 0 for\n",
+    "    # missing keys, so these lookups are safe even if a term never\n",
+    "    # appears in a given file.)\n",
+    "    tf_achilles = values[\"counts\"][\"achilles\"] / total_terms\n",
+    "    tf_odysseus = values[\"counts\"][\"odysseus\"] / total_terms\n",
+    "\n",
+    "    # Remember, the simplest version of TF-IDF is just\n",
+    "    # TF * IDF, where IDF here is log10(n_docs / DF).\n",
+    "    tf_idf_achilles = tf_achilles * idf_achilles\n",
+    "    tf_idf_odysseus = tf_odysseus * idf_odysseus\n",
+    "\n",
+    "    # Now we can report on the statistics for this file\n",
+    "    print(f\"\"\"In {filename}:\n",
+    "TF of achilles: {tf_achilles}\n",
+    "TF of odysseus: {tf_odysseus}\n",
+    "TF-IDF of achilles: {tf_idf_achilles}\n",
+    "TF-IDF of odysseus: {tf_idf_odysseus}\n",
+    "\"\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}