From e4a7c7d0d1453c8c55799014748cd2121cd70f5a Mon Sep 17 00:00:00 2001 From: Linh Nguyen <137933787+lnguyen2693@users.noreply.github.com> Date: Sun, 30 Mar 2025 20:03:38 +0000 Subject: [PATCH 1/2] calculate tf-idf --- tf-idf.ipynb | 101 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 23 deletions(-) diff --git a/tf-idf.ipynb b/tf-idf.ipynb index 6f041cc..ac5b560 100644 --- a/tf-idf.ipynb +++ b/tf-idf.ipynb @@ -9,15 +9,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Collecting lxml\n", - " Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)\n", - "Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: lxml\n", - "Successfully installed lxml-5.3.1\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Requirement already satisfied: lxml in /home/codespace/.python/current/lib/python3.12/site-packages (5.3.1)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } @@ -28,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -47,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -62,18 +54,18 @@ }, { "cell_type": 
"code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n", + "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n", "tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n", "tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n", - "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n", - "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n", - "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n" + "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n" ] } ], @@ -96,9 +88,20 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 46, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "71\n", + "0\n", + "5\n", + "15\n" + ] + } + ], "source": [ "from collections import Counter\n", "\n", @@ -111,27 +114,28 @@ "\n", " with open(t) as f:\n", " text = f.read().lower().split()\n", - " counts[name] = Counter(text)\n" + " counts[name] = Counter(text)\n", + " print(counts[name]['throng'])" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "4" + "3" ] }, - "execution_count": 71, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "term = 'odysseus'\n", + "term = 'throng'\n", "\n", "df_ulysses = 0\n", "\n", @@ -141,6 +145,57 @@ "\n", "df_ulysses" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notes when completing the assignment:\n", + "1. Review TF-IDF and understand the code:\n", + "- Re-inspect the code with:
\n", + " `print(counts)`
\n", + " `print(counts[name]['odysseus'])`
\n", + " change other term
\n", + " --> Figure out calculating TF-IDF score for a term\n", + "\n", + "2. When calculating tf-idf score, I receive a negative score for `tf_idf_ulysses`. I was confused because I assumed that was impossible. I revisited the code I wrote for calculating IDF score, and I realize I added 1 to `df_ulysses` to prevent division by 0, but that means when `df_ulysses` = the # of docs, `# docs / (df_ulysses + 1)` < 1, which causes `idf_ulysses` to be negative. I re-read about TF-IDF [here](https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency/) to make sure the correct formula does add 1 into `df_ulysses` when calculating `idf_ulysses`. Though the link mentions that the formula can contain adding one into the denominator, which means I can keep the `+ 1` in calculating `idf_ulysses`, I decided to remove it as the example in the link doesn't show them adding one into the denominator.\n", + "\n", + "3. To test for my code, I tried to find a term that does not appear in all 4 documents." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4 3\n", + "0.0004022662889518414 0.12493873660829992 5.025864192175238e-05\n", + "0.0 0.12493873660829992 0.0\n", + "3.2083777158917363e-05 0.12493873660829992 4.0085065838573656e-06\n", + "0.00011265913102256937 0.12493873660829992 1.4075489497348745e-05\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# Calculate IDF score for 'term'\n", + "idf_ulysses = np.log10(len(counts) / df_ulysses)\n", + "print(len(counts), df_ulysses)\n", + "\n", + "# Calculate TF-IDF score for 'term' in each document\n", + "tf_idf_ulysses = {}\n", + "\n", + "for doc in counts:\n", + " words = counts[doc]\n", + " tf_score = words[term] / sum(words.values())\n", + " tf_idf_score = tf_score * idf_ulysses\n", + " print(tf_score, idf_ulysses, tf_idf_score)" + ] } ], "metadata": { From 14b8d48e81cd8abad7f4741e5f946c6e77cb19f2 Mon Sep 17 00:00:00 2001 From: Linh Nguyen <137933787+lnguyen2693@users.noreply.github.com> Date: Sun, 30 Mar 2025 23:02:22 +0000 Subject: [PATCH 2/2] hw --- tf-idf.ipynb | 75 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/tf-idf.ipynb b/tf-idf.ipynb index ac5b560..dd26037 100644 --- a/tf-idf.ipynb +++ b/tf-idf.ipynb @@ -61,7 +61,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n", + "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n", "tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n", "tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n", @@ -88,17 +94,17 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "71\n", - "0\n", - "5\n", - "15\n" 
+ "185\n", + "141\n", + "178\n", + "168\n" ] } ], @@ -115,27 +121,27 @@ " with open(t) as f:\n", " text = f.read().lower().split()\n", " counts[name] = Counter(text)\n", - " print(counts[name]['throng'])" + " print(counts[name]['gods'])" ] }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "3" + "4" ] }, - "execution_count": 60, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "term = 'throng'\n", + "term = 'gods'\n", "\n", "df_ulysses = 0\n", "\n", @@ -160,23 +166,56 @@ "\n", "2. When calculating tf-idf score, I receive a negative score for `tf_idf_ulysses`. I was confused because I assumed that was impossible. I revisited the code I wrote for calculating IDF score, and I realize I added 1 to `df_ulysses` to prevent division by 0, but that means when `df_ulysses` = the # of docs, `# docs / (df_ulysses + 1)` < 1, which causes `idf_ulysses` to be negative. I re-read about TF-IDF [here](https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency/) to make sure the correct formula does add 1 into `df_ulysses` when calculating `idf_ulysses`. Though the link mentions that the formula can contain adding one into the denominator, which means I can keep the `+ 1` in calculating `idf_ulysses`, I decided to remove it as the example in the link doesn't show them adding one into the denominator.\n", "\n", - "3. To test for my code, I tried to find a term that does not appear in all 4 documents." + "3. 
I ran the code to calculate the TF-IDF of these terms:\n", + "\n", + "- hector:\n", + "\n", + "| TF | IDF | TF-IDF |\n", + "| --- | ---- | ------- |\n", + "| 0.0015694050991501417 | 0.6020599913279624 | 0.0009448760203843942\n", + "| 0.0 | 0.6020599913279624 | 0.0\n", + "| 0.0 | 0.6020599913279624 | 0.0\n", + "| 0.0 | 0.6020599913279624 | 0.0\n", + "\n", + "This term has a high TF-IDF score in the document related to The Iliad (the only one where it appears) but a score of 0 elsewhere, including The Odyssey\n", + "\n", + "- trojans:\n", + "\n", + "| TF | IDF | TF-IDF |\n", + "| --- | ---- | ------- |\n", + "| 0.0019886685552407933 | 0.0 | 0.0\n", + "| 8.359945827551038e-05 | 0.0 | 0.0\n", + "| 0.0022201973793970816 | 0.0 | 0.0\n", + "| 9.012730481805551e-05 | 0.0 | 0.0\n", + "\n", + "This term has a TF-IDF score of 0 because it appears in all 4 documents, so IDF = log10(4/4) = 0. However, the TF column shows that it appears far more often in the first and the third documents\n", + "\n", + "- gods:\n", + "\n", + "| TF | IDF | TF-IDF |\n", + "| --- | ---- | ------- |\n", + "| 0.0010481586402266289 | 0.0 | 0.0\n", + "| 0.0011787523616846962 | 0.0 | 0.0\n", + "| 0.0011421824668574581 | 0.0 | 0.0\n", + "| 0.001261782267452777 | 0.0 | 0.0\n", + "\n", + "This term appears with a similar frequency in all 4 documents, so its TF-IDF score is 0 everywhere\n" ] }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "4 3\n", - "0.0004022662889518414 0.12493873660829992 5.025864192175238e-05\n", - "0.0 0.12493873660829992 0.0\n", - "3.2083777158917363e-05 0.12493873660829992 4.0085065838573656e-06\n", - "0.00011265913102256937 0.12493873660829992 1.4075489497348745e-05\n" + "4 4\n", + "0.0010481586402266289 0.0 0.0\n", + "0.0011787523616846962 0.0 0.0\n", + "0.0011421824668574581 0.0 0.0\n", + "0.001261782267452777 0.0 0.0\n" ] } ],