From e4a7c7d0d1453c8c55799014748cd2121cd70f5a Mon Sep 17 00:00:00 2001
From: Linh Nguyen <137933787+lnguyen2693@users.noreply.github.com>
Date: Sun, 30 Mar 2025 20:03:38 +0000
Subject: [PATCH 1/2] calculate tf-idf

---
 tf-idf.ipynb | 101 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 78 insertions(+), 23 deletions(-)

diff --git a/tf-idf.ipynb b/tf-idf.ipynb
index 6f041cc..ac5b560 100644
--- a/tf-idf.ipynb
+++ b/tf-idf.ipynb
@@ -9,15 +9,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Collecting lxml\n",
-      "  Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)\n",
-      "Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n",
-      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
-      "\u001b[?25hInstalling collected packages: lxml\n",
-      "Successfully installed lxml-5.3.1\n",
-      "\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
+      "Requirement already satisfied: lxml in /home/codespace/.python/current/lib/python3.12/site-packages (5.3.1)\n",
       "Note: you may need to restart the kernel to use updated packages.\n"
      ]
     }
@@ -28,7 +20,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,7 +30,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -47,7 +39,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -62,18 +54,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n",
+      "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n",
       "tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n",
       "tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n",
-      "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n",
-      "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n",
-      "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n"
+      "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n"
      ]
     }
    ],
@@ -96,9 +88,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
+   "execution_count": 46,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "71\n",
+      "0\n",
+      "5\n",
+      "15\n"
+     ]
+    }
+   ],
    "source": [
     "from collections import Counter\n",
     "\n",
@@ -111,27 +114,28 @@
     "\n",
     "    with open(t) as f:\n",
     "        text = f.read().lower().split()\n",
-    "        counts[name] = Counter(text)\n"
+    "        counts[name] = Counter(text)\n",
+    "    print(counts[name]['throng'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 60,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "4"
+       "3"
       ]
      },
-     "execution_count": 71,
+     "execution_count": 60,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "term = 'odysseus'\n",
+    "term = 'throng'\n",
     "\n",
     "df_ulysses = 0\n",
     "\n",
@@ -141,6 +145,57 @@
     "\n",
     "df_ulysses"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Notes when completing the assignment:\n",
+    "1. Review TF-IDF and understand the code:\n",
+    "- Re-inspect the code with: <br>\n",
+    "    `print(counts)` <br>\n",
+    "    `print(counts[name]['odysseus'])` <br>\n",
+    "    then repeat with other terms <br>\n",
+    "    --> Figure out calculating TF-IDF score for a term\n",
+    "\n",
+    "2. When calculating tf-idf score, I receive a negative score for `tf_idf_ulysses`. I was confused because I assumed that was impossible. I revisited the code I wrote for calculating IDF score, and I realize I added 1 to `df_ulysses` to prevent division by 0, but that means when `df_ulysses` = the # of docs, `# docs / (df_ulysses + 1)` < 1, which causes `idf_ulysses` to be negative. I re-read about TF-IDF [here](https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency/) to make sure the correct formula does add 1 into `df_ulysses` when calculating `idf_ulysses`. Though the link mentions that the formula can contain adding one into the denominator, which means I can keep the `+ 1` in calculating `idf_ulysses`, I decided to remove it as the example in the link doesn't show them adding one into the denominator.\n",
+    "\n",
+    "3. To test for my code, I tried to find a term that does not appear in all 4 documents."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "4 3\n",
+      "0.0004022662889518414 0.12493873660829992 5.025864192175238e-05\n",
+      "0.0 0.12493873660829992 0.0\n",
+      "3.2083777158917363e-05 0.12493873660829992 4.0085065838573656e-06\n",
+      "0.00011265913102256937 0.12493873660829992 1.4075489497348745e-05\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# IDF = log10(N / df); use 0 when no document contains 'term' (df == 0 would divide by zero)\n",
+    "idf_ulysses = np.log10(len(counts) / df_ulysses) if df_ulysses else 0.0\n",
+    "print(len(counts), df_ulysses)\n",
+    "\n",
+    "# TF-IDF score for 'term' in each document, keyed by document name\n",
+    "tf_idf_ulysses = {}\n",
+    "\n",
+    "for doc, words in counts.items():\n",
+    "    # TF = occurrences of 'term' / total tokens; max(..., 1) keeps an empty document at TF = 0\n",
+    "    tf_score = words[term] / max(sum(words.values()), 1)\n",
+    "    tf_idf_ulysses[doc] = tf_score * idf_ulysses\n",
+    "    print(tf_score, idf_ulysses, tf_idf_ulysses[doc])"
+   ]
   }
  ],
  "metadata": {

From 14b8d48e81cd8abad7f4741e5f946c6e77cb19f2 Mon Sep 17 00:00:00 2001
From: Linh Nguyen <137933787+lnguyen2693@users.noreply.github.com>
Date: Sun, 30 Mar 2025 23:02:22 +0000
Subject: [PATCH 2/2] hw

---
 tf-idf.ipynb | 75 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 57 insertions(+), 18 deletions(-)

diff --git a/tf-idf.ipynb b/tf-idf.ipynb
index ac5b560..dd26037 100644
--- a/tf-idf.ipynb
+++ b/tf-idf.ipynb
@@ -61,7 +61,13 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n",
+      "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
       "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n",
       "tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n",
       "tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n",
@@ -88,17 +94,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "71\n",
-      "0\n",
-      "5\n",
-      "15\n"
+      "185\n",
+      "141\n",
+      "178\n",
+      "168\n"
      ]
     }
    ],
@@ -115,27 +121,27 @@
     "    with open(t) as f:\n",
     "        text = f.read().lower().split()\n",
     "        counts[name] = Counter(text)\n",
-    "    print(counts[name]['throng'])"
+    "    print(counts[name]['gods'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "3"
+       "4"
       ]
      },
-     "execution_count": 60,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "term = 'throng'\n",
+    "term = 'gods'\n",
     "\n",
     "df_ulysses = 0\n",
     "\n",
@@ -160,23 +166,56 @@
     "\n",
     "2. When calculating tf-idf score, I receive a negative score for `tf_idf_ulysses`. I was confused because I assumed that was impossible. I revisited the code I wrote for calculating IDF score, and I realize I added 1 to `df_ulysses` to prevent division by 0, but that means when `df_ulysses` = the # of docs, `# docs / (df_ulysses + 1)` < 1, which causes `idf_ulysses` to be negative. I re-read about TF-IDF [here](https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency/) to make sure the correct formula does add 1 into `df_ulysses` when calculating `idf_ulysses`. Though the link mentions that the formula can contain adding one into the denominator, which means I can keep the `+ 1` in calculating `idf_ulysses`, I decided to remove it as the example in the link doesn't show them adding one into the denominator.\n",
     "\n",
-    "3. To test for my code, I tried to find a term that does not appear in all 4 documents."
+    "3. I ran the code to calculate the TF-IDF scores of these terms:\n",
+    "\n",
+    "- hector:\n",
+    "\n",
+    "| TF | IDF | TF-IDF |\n",
+    "| --- | ---- | ------- |\n",
+    "| 0.0015694050991501417 | 0.6020599913279624 | 0.0009448760203843942\n",
+    "| 0.0 | 0.6020599913279624 | 0.0\n",
+    "| 0.0 | 0.6020599913279624 | 0.0\n",
+    "| 0.0 | 0.6020599913279624 | 0.0\n",
+    "\n",
+    "This term has a high TF-IDF score in documents related to The Iliad but not in those related to The Odyssey.\n",
+    "\n",
+    "- trojans:\n",
+    "\n",
+    "| TF | IDF | TF-IDF |\n",
+    "| --- | ---- | ------- |\n",
+    "| 0.0019886685552407933 | 0.0 | 0.0\n",
+    "| 8.359945827551038e-05 | 0.0 | 0.0\n",
+    "| 0.0022201973793970816 | 0.0 | 0.0\n",
+    "| 9.012730481805551e-05 | 0.0 | 0.0\n",
+    "\n",
+    "The TF-IDF score is 0 because the term appears in all 4 documents (IDF = log10(4/4) = 0). However, the TF column shows that the term appears far more often in the first and third documents.\n",
+    "\n",
+    "- gods:\n",
+    "\n",
+    "| TF | IDF | TF-IDF |\n",
+    "| --- | ---- | ------- |\n",
+    "| 0.0010481586402266289 | 0.0 | 0.0\n",
+    "| 0.0011787523616846962 | 0.0 | 0.0\n",
+    "| 0.0011421824668574581 | 0.0 | 0.0\n",
+    "| 0.001261782267452777 | 0.0 | 0.0\n",
+    "\n",
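+    "The TF values are similar (about 0.001) in all 4 documents, so this term appears frequently across the whole corpus; its IDF, and therefore its TF-IDF score, is 0.\n"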
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "4 3\n",
-      "0.0004022662889518414 0.12493873660829992 5.025864192175238e-05\n",
-      "0.0 0.12493873660829992 0.0\n",
-      "3.2083777158917363e-05 0.12493873660829992 4.0085065838573656e-06\n",
-      "0.00011265913102256937 0.12493873660829992 1.4075489497348745e-05\n"
+      "4 4\n",
+      "0.0010481586402266289 0.0 0.0\n",
+      "0.0011787523616846962 0.0 0.0\n",
+      "0.0011421824668574581 0.0 0.0\n",
+      "0.001261782267452777 0.0 0.0\n"
      ]
     }
    ],