Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 52 additions & 17 deletions tf-idf.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting lxml\n",
" Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)\n",
"Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
" Downloading lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.5 kB)\n",
"Downloading lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m42.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hInstalling collected packages: lxml\n",
"Successfully installed lxml-5.3.1\n",
"Successfully installed lxml-5.4.0\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
Expand All @@ -28,7 +28,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -38,7 +38,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -47,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -62,7 +62,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand All @@ -71,9 +71,9 @@
"text": [
"tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n",
"tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n",
"tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n",
"tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n",
"tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n"
"tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n",
"tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n"
]
}
],
Expand All @@ -96,7 +96,7 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -116,7 +116,7 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 11,
"metadata": {},
"outputs": [
{
Expand All @@ -125,7 +125,7 @@
"4"
]
},
"execution_count": 71,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -141,11 +141,46 @@
"\n",
"df_ulysses"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"term = 'home'\n",
"\n",
"df = 0\n",
"\n",
"for _, els in counts.items():\n",
" if term in els:\n",
" df += 1\n",
"\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the future, I could improve my analysis by broadening the textual source material. TF-IDF essentially tells us how important a given word is to a given document by weighing how frequently the word appears in that document against how rare it is across the corpus at large. TF-IDF could be a useful measure for searching documents in historical studies or for categorizing information from large bodies of data."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -159,7 +194,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
"version": "3.12.3"
}
},
"nbformat": 4,
Expand Down