diff --git a/tf-idf.ipynb b/tf-idf.ipynb index 6f041cc..dd26037 100644 --- a/tf-idf.ipynb +++ b/tf-idf.ipynb @@ -9,15 +9,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Collecting lxml\n", - " Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)\n", - "Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: lxml\n", - "Successfully installed lxml-5.3.1\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Requirement already satisfied: lxml in /home/codespace/.python/current/lib/python3.12/site-packages (5.3.1)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } @@ -28,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -47,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -62,18 +54,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n", 
"tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n", "tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n", - "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n", - "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n", - "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n" + "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n" ] } ], @@ -96,9 +94,20 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "185\n", + "141\n", + "178\n", + "168\n" + ] + } + ], "source": [ "from collections import Counter\n", "\n", @@ -111,12 +120,13 @@ "\n", " with open(t) as f:\n", " text = f.read().lower().split()\n", - " counts[name] = Counter(text)\n" + " counts[name] = Counter(text)\n", + " print(counts[name]['gods'])" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -125,13 +135,13 @@ "4" ] }, - "execution_count": 71, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "term = 'odysseus'\n", + "term = 'gods'\n", "\n", "df_ulysses = 0\n", "\n", @@ -141,6 +151,90 @@ "\n", "df_ulysses" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notes when completing the assignment:\n", + "1. Review TF-IDF and understand the code:\n", + "- Re-inspect the code with:
\n", + " `print(counts)`
\n", + " `print(counts[name]['odysseus'])`
\n", + " then change the term to a different one
\n", + " --> Figure out how to calculate the TF-IDF score for a term\n", + "\n", + "2. When calculating the TF-IDF score, I received a negative score for `tf_idf_ulysses`. I was confused because I assumed that was impossible. I revisited the code I wrote for calculating the IDF score and realized I had added 1 to `df_ulysses` to prevent division by 0, but that means when `df_ulysses` = the # of docs, `# docs / (df_ulysses + 1)` < 1, which causes `idf_ulysses` to be negative. I re-read about TF-IDF [here](https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency/) to check whether the correct formula adds 1 to `df_ulysses` when calculating `idf_ulysses`. Though the link mentions that the formula can add one to the denominator, which means I could keep the `+ 1` when calculating `idf_ulysses`, I decided to remove it because the example in the link doesn't add one to the denominator. Note that without the `+ 1`, a term that appears in no document gives `df_ulysses` = 0 and the division fails, so the term must occur in at least one document.\n", + "\n", + "3. I ran the code to calculate the TF-IDF of these terms:\n", + "\n", + "- hector:\n", + "\n", + "| TF | IDF | TF-IDF |\n", + "| --- | ---- | ------- |\n", + "| 0.0015694050991501417 | 0.6020599913279624 | 0.0009448760203843942 |\n", + "| 0.0 | 0.6020599913279624 | 0.0 |\n", + "| 0.0 | 0.6020599913279624 | 0.0 |\n", + "| 0.0 | 0.6020599913279624 | 0.0 |\n", + "\n", + "The term has a non-zero TF-IDF score only in the first document, which suggests it is distinctive of The Iliad rather than The Odyssey\n", + "\n", + "- trojans:\n", + "\n", + "| TF | IDF | TF-IDF |\n", + "| --- | ---- | ------- |\n", + "| 0.0019886685552407933 | 0.0 | 0.0 |\n", + "| 8.359945827551038e-05 | 0.0 | 0.0 |\n", + "| 0.0022201973793970816 | 0.0 | 0.0 |\n", + "| 9.012730481805551e-05 | 0.0 | 0.0 |\n", + "\n", + "The TF-IDF score is 0 because the term appears in all 4 documents, so its IDF is log10(4/4) = 0. 
However, the TF values show that this term appears more often in the first and the third documents\n", + "\n", + "- gods:\n", + "\n", + "| TF | IDF | TF-IDF |\n", + "| --- | ---- | ------- |\n", + "| 0.0010481586402266289 | 0.0 | 0.0 |\n", + "| 0.0011787523616846962 | 0.0 | 0.0 |\n", + "| 0.0011421824668574581 | 0.0 | 0.0 |\n", + "| 0.001261782267452777 | 0.0 | 0.0 |\n", + "\n", + "This term appears with a similar frequency in all 4 documents, so its TF-IDF score of 0 does not distinguish between them\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4 4\n", + "0.0010481586402266289 0.0 0.0\n", + "0.0011787523616846962 0.0 0.0\n", + "0.0011421824668574581 0.0 0.0\n", + "0.001261782267452777 0.0 0.0\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "# Calculate IDF score for 'term'\n", + "idf_ulysses = np.log10(len(counts) / df_ulysses)\n", + "print(len(counts), df_ulysses)\n", + "\n", + "# Calculate TF-IDF score for 'term' in each document\n", + "tf_idf_ulysses = {}\n", + "\n", + "for doc in counts:\n", + " words = counts[doc]\n", + " tf_score = words[term] / sum(words.values())\n", + " tf_idf_score = tf_score * idf_ulysses\n", + " print(tf_score, idf_ulysses, tf_idf_score)" + ] } ], "metadata": {