diff --git a/tf-idf.ipynb b/tf-idf.ipynb
index 6f041cc..dd26037 100644
--- a/tf-idf.ipynb
+++ b/tf-idf.ipynb
@@ -9,15 +9,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Collecting lxml\n",
- " Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)\n",
- "Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hInstalling collected packages: lxml\n",
- "Successfully installed lxml-5.3.1\n",
- "\n",
- "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n",
- "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+ "Requirement already satisfied: lxml in /home/codespace/.python/current/lib/python3.12/site-packages (5.3.1)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
@@ -28,7 +20,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -38,7 +30,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -47,7 +39,7 @@
},
{
"cell_type": "code",
- "execution_count": 58,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -62,18 +54,24 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
+ "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n",
"tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n",
"tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n",
- "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n",
- "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n",
- "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n"
+ "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n"
]
}
],
@@ -96,9 +94,20 @@
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 23,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "185\n",
+ "141\n",
+ "178\n",
+ "168\n"
+ ]
+ }
+ ],
"source": [
"from collections import Counter\n",
"\n",
@@ -111,12 +120,13 @@
"\n",
" with open(t) as f:\n",
" text = f.read().lower().split()\n",
- " counts[name] = Counter(text)\n"
+ " counts[name] = Counter(text)\n",
+ " print(counts[name]['gods'])"
]
},
{
"cell_type": "code",
- "execution_count": 71,
+ "execution_count": 24,
"metadata": {},
"outputs": [
{
@@ -125,13 +135,13 @@
"4"
]
},
- "execution_count": 71,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "term = 'odysseus'\n",
+ "term = 'gods'\n",
"\n",
"df_ulysses = 0\n",
"\n",
@@ -141,6 +151,90 @@
"\n",
"df_ulysses"
]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Notes when completing the assignment:\n",
+ "1. Review TF-IDF and understand the code:\n",
+ "- Re-inspect the code with:  \n",
+ " `print(counts)`  \n",
+ " `print(counts[name]['odysseus'])`  \n",
+ " then change the term to other words  \n",
+ " --> Figure out how to calculate the TF-IDF score for a term\n",
+ "\n",
+ "2. When calculating the TF-IDF score, I received a negative value for `tf_idf_ulysses`, which confused me because I assumed that was impossible. I revisited the code I wrote for the IDF score and realized that I had added 1 to `df_ulysses` to prevent division by zero. However, when `df_ulysses` equals the number of documents, `# docs / (df_ulysses + 1)` is less than 1, so its logarithm, `idf_ulysses`, is negative. I re-read about TF-IDF [here](https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency/) to check whether the correct formula adds 1 to `df_ulysses` when calculating `idf_ulysses`. The link mentions that the formula can include adding 1 to the denominator, so I could have kept the `+ 1`, but I decided to remove it because the worked example in the link does not add 1 to the denominator. Note that without the `+ 1`, the division fails if `term` appears in none of the documents.\n",
+ "\n",
+ "3. I ran the code to calculate the TF-IDF scores of these terms:\n",
+ "\n",
+ "- hector:\n",
+ "\n",
+ "| TF | IDF | TF-IDF |\n",
+ "| --- | ---- | ------- |\n",
+ "| 0.0015694050991501417 | 0.6020599913279624 | 0.0009448760203843942\n",
+ "| 0.0 | 0.6020599913279624 | 0.0\n",
+ "| 0.0 | 0.6020599913279624 | 0.0\n",
+ "| 0.0 | 0.6020599913279624 | 0.0\n",
+ "\n",
+ "This term has a high TF-IDF score in documents related to The Iliad but not in those related to The Odyssey\n",
+ "\n",
+ "- trojans:\n",
+ "\n",
+ "| TF | IDF | TF-IDF |\n",
+ "| --- | ---- | ------- |\n",
+ "| 0.0019886685552407933 | 0.0 | 0.0\n",
+ "| 8.359945827551038e-05 | 0.0 | 0.0\n",
+ "| 0.0022201973793970816 | 0.0 | 0.0\n",
+ "| 9.012730481805551e-05 | 0.0 | 0.0\n",
+ "\n",
+ "The TF-IDF score is 0 because the term appears in all 4 documents, which makes the IDF 0. However, the TF values show that this term appears far more often in the first and the third documents\n",
+ "\n",
+ "- gods:\n",
+ "\n",
+ "| TF | IDF | TF-IDF |\n",
+ "| --- | ---- | ------- |\n",
+ "| 0.0010481586402266289 | 0.0 | 0.0\n",
+ "| 0.0011787523616846962 | 0.0 | 0.0\n",
+ "| 0.0011421824668574581 | 0.0 | 0.0\n",
+ "| 0.001261782267452777 | 0.0 | 0.0\n",
+ "\n",
+ "This term appears with a similar frequency (TF of roughly 0.001) in all 4 documents, so its IDF, and therefore its TF-IDF score, is 0\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "4 4\n",
+ "0.0010481586402266289 0.0 0.0\n",
+ "0.0011787523616846962 0.0 0.0\n",
+ "0.0011421824668574581 0.0 0.0\n",
+ "0.001261782267452777 0.0 0.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "# Calculate IDF score for 'term'\n",
+ "idf_ulysses = np.log10(len(counts) / df_ulysses)\n",
+ "print(len(counts), df_ulysses)\n",
+ "\n",
+ "# Calculate TF-IDF score for 'term' in each document\n",
+ "tf_idf_ulysses = {}\n",
+ "\n",
+ "for doc in counts:\n",
+ " words = counts[doc]\n",
+ " tf_score = words[term] / sum(words.values())\n",
+ " tf_idf_score = tf_score * idf_ulysses\n",
+ " print(tf_score, idf_ulysses, tf_idf_score)"
+ ]
}
],
"metadata": {