Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 116 additions & 22 deletions tf-idf.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting lxml\n",
" Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)\n",
"Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: lxml\n",
"Successfully installed lxml-5.3.1\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Requirement already satisfied: lxml in /home/codespace/.python/current/lib/python3.12/site-packages (5.3.1)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
Expand All @@ -28,7 +20,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -38,7 +30,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -47,7 +39,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -62,18 +54,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n",
"tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n",
"tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n",
"tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n",
"tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n",
"tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n"
"tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n"
]
}
],
Expand All @@ -96,9 +94,20 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 23,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"185\n",
"141\n",
"178\n",
"168\n"
]
}
],
"source": [
"from collections import Counter\n",
"\n",
Expand All @@ -111,12 +120,13 @@
"\n",
" with open(t) as f:\n",
" text = f.read().lower().split()\n",
" counts[name] = Counter(text)\n"
" counts[name] = Counter(text)\n",
" print(counts[name]['gods'])"
]
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 24,
"metadata": {},
"outputs": [
{
Expand All @@ -125,13 +135,13 @@
"4"
]
},
"execution_count": 71,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"term = 'odysseus'\n",
"term = 'gods'\n",
"\n",
"df_ulysses = 0\n",
"\n",
Expand All @@ -141,6 +151,90 @@
"\n",
"df_ulysses"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notes when completing the assignment:\n",
"1. Review TF-IDF and understand the code:\n",
"- Re-inspect the code with: <br>\n",
" `print(counts)` <br>\n",
" `print(counts[name]['odysseus'])` <br>\n",
" then try other terms <br>\n",
" --> Figure out how to calculate the TF-IDF score for a term\n",
"\n",
"2. When calculating the TF-IDF score, I received a negative value for `tf_idf_ulysses`. I was confused because I assumed that was impossible. I revisited the code I wrote for calculating the IDF score and realized that I had added 1 to `df_ulysses` to prevent division by 0. That means that when `df_ulysses` equals the number of docs, `# docs / (df_ulysses + 1)` < 1, so its logarithm, `idf_ulysses`, is negative. I re-read about TF-IDF [here](https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency/) to check whether the correct formula adds 1 to `df_ulysses` when calculating `idf_ulysses`. The link mentions that the formula can add one to the denominator, which means I could keep the `+ 1` when calculating `idf_ulysses`, but I decided to remove it because the example in the link doesn't add one to the denominator.\n",
"\n",
"3. I ran the code to calculate the TF-IDF scores of these terms:\n",
"\n",
"- hector:\n",
"\n",
"| TF | IDF | TF-IDF |\n",
"| --- | ---- | ------- |\n",
"| 0.0015694050991501417 | 0.6020599913279624 | 0.0009448760203843942\n",
"| 0.0 | 0.6020599913279624 | 0.0\n",
"| 0.0 | 0.6020599913279624 | 0.0\n",
"| 0.0 | 0.6020599913279624 | 0.0\n",
"\n",
"This term has a non-zero TF-IDF score only in one document related to The Iliad; the score is 0 in the other documents, including The Odyssey.\n",
"\n",
"- trojans:\n",
"\n",
"| TF | IDF | TF-IDF |\n",
"| --- | ---- | ------- |\n",
"| 0.0019886685552407933 | 0.0 | 0.0\n",
"| 8.359945827551038e-05 | 0.0 | 0.0\n",
"| 0.0022201973793970816 | 0.0 | 0.0\n",
"| 9.012730481805551e-05 | 0.0 | 0.0\n",
"\n",
"The TF-IDF score is 0 since the term appears in all 4 documents (IDF = 0). However, the TF column shows that this term appears more often in the first and the third documents.\n",
"\n",
"- gods:\n",
"\n",
"| TF | IDF | TF-IDF |\n",
"| --- | ---- | ------- |\n",
"| 0.0010481586402266289 | 0.0 | 0.0\n",
"| 0.0011787523616846962 | 0.0 | 0.0\n",
"| 0.0011421824668574581 | 0.0 | 0.0\n",
"| 0.001261782267452777 | 0.0 | 0.0\n",
"\n",
"This term appears with a similar frequency in all 4 documents, so its IDF, and therefore its TF-IDF score, is 0.\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4 4\n",
"0.0010481586402266289 0.0 0.0\n",
"0.0011787523616846962 0.0 0.0\n",
"0.0011421824668574581 0.0 0.0\n",
"0.001261782267452777 0.0 0.0\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"# Calculate the IDF score for 'term': log10(number of docs / number of docs containing the term).\n",
"# If no document contains the term the ratio would divide by zero; the term's TF is 0 in every\n",
"# document in that case, so an IDF of 0.0 yields a TF-IDF score of 0 instead of an exception.\n",
"idf_ulysses = np.log10(len(counts) / df_ulysses) if df_ulysses else 0.0\n",
"print(len(counts), df_ulysses)\n",
"\n",
"# Calculate the TF-IDF score for 'term' in each document, keyed by document name\n",
"tf_idf_ulysses = {}\n",
"\n",
"for doc in counts:\n",
"    words = counts[doc]\n",
"    # TF = occurrences of the term / total number of tokens in the document\n",
"    total_tokens = sum(words.values())\n",
"    tf_score = words[term] / total_tokens if total_tokens else 0.0\n",
"    tf_idf_score = tf_score * idf_ulysses\n",
"    # Store the score so it can be reused after the loop, not only printed\n",
"    tf_idf_ulysses[doc] = tf_idf_score\n",
"    print(tf_score, idf_ulysses, tf_idf_score)"
]
}
],
"metadata": {
Expand Down