From 93945a333fc548f688a7331ab24f63d921704362 Mon Sep 17 00:00:00 2001 From: bbobb <124208046+bbobb@users.noreply.github.com> Date: Fri, 9 May 2025 07:15:17 +0000 Subject: [PATCH] tf idf hw --- tf-idf.ipynb | 69 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 17 deletions(-) diff --git a/tf-idf.ipynb b/tf-idf.ipynb index 6f041cc..4586143 100644 --- a/tf-idf.ipynb +++ b/tf-idf.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -10,13 +10,13 @@ "output_type": "stream", "text": [ "Collecting lxml\n", - " Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)\n", - "Downloading lxml-5.3.1-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m50.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + " Downloading lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.5 kB)\n", + "Downloading lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.0/5.0 MB\u001b[0m \u001b[31m42.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: lxml\n", - "Successfully installed lxml-5.3.1\n", + "Successfully installed lxml-5.4.0\n", "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.1.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip 
install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -71,9 +71,9 @@ "text": [ "tlg0012/tlg001/tlg0012.tlg001.perseus-eng3.xml\n", "tlg0012/tlg001/tlg0012.tlg001.perseus-eng4.xml\n", - "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n", "tlg0012/tlg002/tlg0012.tlg002.perseus-eng4.xml\n", - "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n" + "tlg0012/tlg002/tlg0012.tlg002.perseus-eng3.xml\n", + "tlg0012/tlg003/tlg0012.tlg003.perseus-eng1.xml\n" ] } ], @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -116,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -125,7 +125,7 @@ "4" ] }, - "execution_count": 71, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -141,11 +141,46 @@ "\n", "df_ulysses" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "term = 'home'\n", + "\n", + "df = 0\n", + "\n", + "for _, els in counts.items():\n", + " if term in els:\n", + " df += 1\n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I could improve my analysis by broadening the textual source 
material in the future. TF-IDF essentially tells us how important/prevalent a given word is in a given document by calculating how frequent the word is in that document and how rare it is in the corpus at large. TF-IDF could be a useful measure for searching documents in historical studies or categorizing information from large bodies of data. " + ] + } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, @@ -159,7 +194,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.12.3" } }, "nbformat": 4,