TuftsIntroDH · bbobb · May 9, 2025
diff --git a/tf-idf.ipynb b/tf-idf.ipynb
@@ -2,21 +2,21 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Collecting lxml\n",
-      "  Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)\n",
-      "Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n",
-      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "  Downloading lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.5 kB)\n",
+      "Downloading lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m42.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
       "\u001b[?25hInstalling collected packages: lxml\n",
-      "Successfully installed lxml-5.3.1\n",
+      "Successfully installed lxml-5.4.0\n",
       "\n",
-      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n",
+      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n",
       "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
       "Note: you may need to restart the kernel to use updated packages.\n"
      ]
@@ -28,7 +28,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -47,7 +47,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -62,7 +62,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -71,9 +71,9 @@
      "text": [
       "tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n",
       "tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n",
-      "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n",
       "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n",
-      "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n"
+      "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n",
+      "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n"
      ]
     }
    ],
@@ -96,7 +96,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -116,7 +116,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -125,7 +125,7 @@
        "4"
       ]
      },
-     "execution_count": 71,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -141,11 +141,46 @@
     "\n",
     "df_ulysses"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "term = 'home'\n",
+    "\n",
+    "df = 0\n",
+    "\n",
+    "for _, els in counts.items():\n",
+    "    if term in els:\n",
+    "        df += 1\n",
+    "\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "I could improve my analysis in the future by broadening the textual source material. TF-IDF essentially tells us how important a given word is to a given document by weighing how frequently the word appears in that document against how rare it is in the corpus at large. TF-IDF could be a useful measure for searching documents in historical studies or for categorizing information from large bodies of data."
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -159,7 +194,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.1"
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,