HW #3

356 changes: 355 additions & 1 deletion tf-idf.ipynb
@@ -1 +1,355 @@

{
"cells": [
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: lxml in /home/codespace/.python/current/lib/python3.12/site-packages (5.3.1)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install lxml"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: nltk in /home/codespace/.python/current/lib/python3.12/site-packages (3.9.1)\n",
"Requirement already satisfied: click in /home/codespace/.python/current/lib/python3.12/site-packages (from nltk) (8.1.8)\n",
"Requirement already satisfied: joblib in /home/codespace/.local/lib/python3.12/site-packages (from nltk) (1.4.2)\n",
"Requirement already satisfied: regex>=2021.8.3 in /home/codespace/.python/current/lib/python3.12/site-packages (from nltk) (2024.11.6)\n",
"Requirement already satisfied: tqdm in /home/codespace/.python/current/lib/python3.12/site-packages (from nltk) (4.67.1)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install nltk"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"from lxml import etree\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"TEI_NS = \"http://www.tei-c.org/ns/1.0\"\n",
"XML_NS = \"http://www.w3.org/XML/1998/namespace\"\n",
"\n",
"NAMESPACES = {\n",
" \"tei\": TEI_NS,\n",
" \"xml\": XML_NS,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/codespace/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package punkt_tab to\n",
"[nltk_data] /home/codespace/nltk_data...\n",
"[nltk_data] Package punkt_tab is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"\n",
"# download the files needed for tokenization\n",
"# the punkt tokenizer should be installed already,\n",
"# but let's download it just in case\n",
"nltk.download(\"punkt\")\n",
"nltk.download(\"punkt_tab\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"files = Path(\"./xml/tlg0012\").glob(\"**/*perseus-eng*.xml\")"
]
},
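{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity-check sketch (assumes the ./xml/tlg0012 folder from the\n",
"# assignment is present): the `**` segment makes glob() recurse into\n",
"# subdirectories, so this should list the English (\"perseus-eng\") files.\n",
"# Note that glob() returns a one-shot generator, so re-create `files`\n",
"# before re-running the parsing cell below.\n",
"print([str(f) for f in Path(\"./xml/tlg0012\").glob(\"**/*perseus-eng*.xml\")])"
]
},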
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File successfully included!\n"
]
}
],
"source": [
"from collections import Counter\n",
"\n",
"# Using our `tokenized_texts` dictionary, we'll iterate\n",
"# through each key-value pair — remember, the keys are\n",
"# filenames and the values are lists of tokens.\n",
"# We'll get a count of the tokens by passing the list to\n",
"# `Counter`, then we'll change the value for that key to\n",
"# a dictionary with its own keys, `tokens` and `counts`.\n",
"\n",
"# for filename, tokens in tokenized_texts.items():\n",
"# counts = Counter(tokens)\n",
"\n",
"# tokenized_texts[filename] = {\"tokens\": tokens, \"counts\": counts}\n",
"# tokenized_texts[\"tlg0012.tlg001.perseus-eng3.txt\"][\"counts\"][\"odysseus\"]\n",
"\n",
"# Create a new dictionary to store the updated structure\n",
"updated_tokenized_texts = {}\n",
"\n",
"filename = \"tlg0012.tlg001.perseus-eng3.txt\"\n",
"tokens = [\"odysseus\", \"zeus\", \"odysseus\", \"athena\"] # Example tokenized words\n",
"\n",
"# Add it to tokenized_texts\n",
"#tokenized_texts[filename] = tokens\n",
"\n",
"\n",
"updated_tokenized_texts = {}\n",
"\n",
"#\n",
"\n",
"tokenized_texts = updated_tokenized_texts # Update dictionary\n",
"\n",
"# Check if the file is now included\n",
"#\n",
"\n"
]
},
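{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A quick illustrative sketch (the toy token list is made up) of how\n",
"# `Counter` behaves, since the cells below rely on it: `in` tells us\n",
"# whether a term occurs at all, and looking up a missing key returns 0\n",
"# instead of raising a KeyError.\n",
"from collections import Counter\n",
"\n",
"toy_counts = Counter([\"odysseus\", \"zeus\", \"odysseus\", \"athena\"])\n",
"\n",
"print(toy_counts[\"odysseus\"])    # 2\n",
"print(toy_counts[\"achilles\"])    # 0 -- missing keys default to 0\n",
"print(\"achilles\" in toy_counts)  # False -- handy for the DF check below"
]
},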
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"for file in files:\n",
" # print the name of the file as a sanity check\n",
" print(file)\n",
" \n",
" # etree.parse() reads the file and turns the raw XML into an object that we can use in Python\n",
" tree = etree.parse(file)\n",
"\n",
" # xpath() is a method that applies **xpath expressions** to search through the XML.\n",
" # This xpath expression says, \"Find any `tei:div` element with a `subtype` of `'card'`.\n",
" # Under that element, get any text.\" The second argument, `namespaces=`, tells the\n",
" # method to use the supplied namespaces as shortcuts, so we don't have to type out\n",
" # \"http://www.tei-c.org/ns/1.0\" every time we want an element in the TEI namespace.\n",
"\n",
" text = tree.xpath(f\"//tei:div[@subtype='card']//text()\", namespaces=NAMESPACES)\n",
"\n",
" # xpath() returns an array of matches, so we initialize an empty array to store the\n",
" # results. We could use a list comprehension, but for now rewriting these\n",
" # lines as a list comprehension is left as an exercise for the reader.\n",
" cleaned_text = []\n",
"\n",
" # Now we iterate through each string returned by `xpath()`\n",
" for t in text:\n",
" # `strip()` removes leading and trailing whitespace; if all that's left is an empty\n",
" # string, we don't care about it.\n",
" if t.strip() != \"\":\n",
" cleaned_text.append(t.strip())\n",
"\n",
" # We make sure that we actually *have* text before writing just the text, without\n",
" # TEI elements, to a separate file. No need to write an empty file, right?\n",
" if len(cleaned_text) > 0:\n",
" # A lot is happening here:\n",
" #\n",
" # 1. `str(file)` turns the `Path` object into a `str`\n",
" # 2. `split(\"/\")` splits the resulting string at every \"/\"\n",
" # 3. `[-1]` takes the last element of the list returned by `split(\"/\")`\n",
" # 4. `replace(\".xml\", \".txt\")` changes the extension of the file\n",
" # \n",
" # So something like \"xml/tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\"\n",
" # is transformed into \"tlg0012.tlg001.perseus-eng3.txt\".\n",
" with open(str(file).split(\"/\")[-1].replace(\".xml\", \".txt\"), \"w+\") as f:\n",
" # We write the text to the file, `join`-ing each element in\n",
" # `cleaned_text` with a newline (\"\\n\")\n",
" f.write(\"\\n\".join(cleaned_text))"
]
},
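{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal, self-contained sketch of the same xpath-plus-namespaces idea\n",
"# used above (the sample XML below is made up, not from the Perseus files):\n",
"# the `tei:` prefix in the expression is resolved through the NAMESPACES\n",
"# dictionary defined earlier, because the elements live in the default\n",
"# TEI namespace.\n",
"sample = f'''<TEI xmlns=\"{TEI_NS}\">\n",
"  <text><body>\n",
"    <div subtype=\"card\"><l>Sing, O goddess, the anger of Achilles</l></div>\n",
"    <div subtype=\"note\"><p>not a card, so not matched</p></div>\n",
"  </body></text>\n",
"</TEI>'''\n",
"\n",
"sample_tree = etree.fromstring(sample)\n",
"sample_text = sample_tree.xpath(\"//tei:div[@subtype='card']//text()\", namespaces=NAMESPACES)\n",
"print([t.strip() for t in sample_text if t.strip()])"
]
},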
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"# Initialize the tokenizer\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"# Initialize an empty dictionary to store the tokenized texts\n",
"tokenized_texts = {}\n",
"\n",
"# Get a Path.glob() iterator for the .txt files that you've created in this directory.\n",
"# Can you figure out what the new `[1-4]` segment is doing?\n",
"text_files = Path(\".\").glob(\"tlg0012.tlg00*.perseus-eng[1-4].txt\")\n",
"\n",
"# Iterate through the text files, reading and tokenizing them one by one,\n",
"# then storing the list of tokens in our `tokenized_texts` dictionary —\n",
"# so we'll be getting a dictionary of lists.\n",
"for file in text_files:\n",
" name = str(file)\n",
"\n",
" with open(file) as f:\n",
" # Notice we're lowercasing the text. You don't *have*\n",
" # to do this, but it helps eliminate some noise for\n",
" # our purposes.\n",
" text = f.read().lower()\n",
" tokens = word_tokenize(text)\n",
"\n",
" # Let's just print the length of the tokens list to make\n",
" # sure we're getting sane results. We'll use string interpolation\n",
" # to identify which text we're working with.\n",
" print(f\"There are {len(tokens)} tokens in {name}.\")\n",
"\n",
" # Store each file's `tokens` list in the `tokenized_texts`\n",
" # dictionary, using the filename as the key.\n",
" tokenized_texts[name] = tokens\n"
]
},
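{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A tiny sanity-check sketch (the sentence is made up) showing what\n",
"# `word_tokenize` returns: punctuation becomes its own token, which is\n",
"# one reason the token counts above are larger than a plain word count.\n",
"print(word_tokenize(\"Tell me, O Muse, of that ingenious hero.\".lower()))"
]
},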
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0\n",
"0\n"
]
}
],
"source": [
"from collections import Counter\n",
"from math import log10\n",
"\n",
"df_achilles = 0\n",
"df_odysseus = 0\n",
"\n",
"# Iterate through the dictionary to count DF values\n",
"for filename, values in tokenized_texts.items():\n",
" if \"odysseus\" in values['counts']:\n",
" df_odysseus += 1\n",
" \n",
" if \"achilles\" in values[\"counts\"]:\n",
" df_achilles += 1\n",
"\n",
"n_docs = len(tokenized_texts.keys())\n",
"\n",
"# Prevent division by zero\n",
"idf_achilles = log10(n_docs / df_achilles) if df_achilles > 0 else 0\n",
"idf_odysseus = log10(n_docs / df_odysseus) if df_odysseus > 0 else 0\n",
"\n",
"print(idf_achilles)\n",
"print(idf_odysseus)\n"
]
},
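{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# For reference, the formula the cell above implements, with a worked toy\n",
"# example (the numbers are hypothetical, not taken from our corpus):\n",
"#\n",
"#     idf(t) = log10(N / df(t))\n",
"#\n",
"# where N is the number of documents and df(t) is the number of documents\n",
"# containing the term t. With N = 4 documents and a term in 2 of them:\n",
"print(log10(4 / 2))  # ~0.301"
]
},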
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Now let's calculate the TF-IDF \"score\" for each term in each document.\n",
"\n",
"# Once again, iterate through the dictionary.\n",
"for filename, values in tokenized_texts.items():\n",
" # Get the total number of terms in each file — we'll\n",
" # use this to calculate the relative frequency as our\n",
" # TF.\n",
" total_terms = len(values['tokens'])\n",
"\n",
" # Get the TF for each term in this file.\n",
" tf_achilles = values['counts']['achilles'] / total_terms\n",
" tf_odysseus = values['counts']['odysseus'] / total_terms\n",
"\n",
" # Remember, the simplest version of TF-IDF is just\n",
" # TF * 1/DF\n",
" tf_idf_achilles = tf_achilles * idf_achilles\n",
" tf_idf_odysseus = tf_odysseus * idf_odysseus\n",
"\n",
" # Now we can report on the statistics for this file\n",
" print(f\"\"\"In {filename}:\n",
"TF of achilles: {tf_achilles}\n",
"TF of odysseus: {tf_odysseus}\n",
"TF-IDF of achilles: {tf_idf_achilles}\n",
"TF-IDF of odysseus: {tf_idf_odysseus}\n",
"\"\"\")"
]
},
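{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A worked toy example (all numbers hypothetical) of the TF-IDF score\n",
"# computed above:\n",
"#\n",
"#     tf-idf(t, d) = tf(t, d) * idf(t)\n",
"#\n",
"# where tf(t, d) is the term's relative frequency in document d.\n",
"toy_tf = 50 / 10000      # the term appears 50 times among 10,000 tokens\n",
"toy_idf = log10(4 / 2)   # the term appears in 2 of 4 documents\n",
"print(toy_tf * toy_idf)  # ~0.0015"
]
},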
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}