From d754eaf269e0d5d27b3fafb1788dec14ff373112 Mon Sep 17 00:00:00 2001 From: David Reguera <33068707+nablabits@users.noreply.github.com> Date: Mon, 13 Mar 2023 08:13:27 +0100 Subject: [PATCH 1/4] Update .gitignore --- .gitignore | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0323ef1..71ec0fe 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ .ipynb_checkpoints .DS_Store -**/.DS_Store \ No newline at end of file +**/.DS_Store + +classification_training.zip +classification/ +comprehend_train_data.csv +ref_output.txt From 4d145af805f1476c8175b682dab3f6105023aca7 Mon Sep 17 00:00:00 2001 From: David Reguera <33068707+nablabits@users.noreply.github.com> Date: Mon, 13 Mar 2023 08:15:38 +0100 Subject: [PATCH 2/4] Update .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 71ec0fe..f6c9d1d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,6 @@ **/.DS_Store classification_training.zip -classification/ +classification-training/ comprehend_train_data.csv ref_output.txt From 6ee3035414d89aba76513e03b594152783950392 Mon Sep 17 00:00:00 2001 From: David Reguera <33068707+nablabits@users.noreply.github.com> Date: Mon, 13 Mar 2023 08:16:04 +0100 Subject: [PATCH 3/4] Update .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index f6c9d1d..748a40d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ .DS_Store **/.DS_Store -classification_training.zip +classification-training.zip classification-training/ comprehend_train_data.csv ref_output.txt From 25b6dc2e2b58102e27f8d200c8942f23dbf5e56a Mon Sep 17 00:00:00 2001 From: nablabits Date: Mon, 13 Mar 2023 07:26:50 +0000 Subject: [PATCH 4/4] checkpoint 1 --- 01-idp-document-classification.ipynb | 976 ++++++++++++++++++++++++++- 1 file changed, 948 insertions(+), 28 deletions(-) diff --git a/01-idp-document-classification.ipynb b/01-idp-document-classification.ipynb index babb453..62cd9ee 100644 --- a/01-idp-document-classification.ipynb +++ b/01-idp-document-classification.ipynb @@ -54,13 +54,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "0a67449e", "metadata": { "scrolled": true, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], "source": [ "!python -m pip install -q amazon-textract-response-parser --upgrade\n", "!python -m pip install -q amazon-textract-caller --upgrade\n", @@ -69,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "f7f8a235", "metadata": { "tags": [] @@ -91,12 +110,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "66660511", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SageMaker role is: arn:aws:iam::981147895656:role/idp-temp-sagemaker-SageMakerExecutionRole-1TSM55YJNFLJR\n", + "Default SageMaker Bucket: s3://sagemaker-eu-west-1-981147895656\n" + ] + } + ], "source": [ "import boto3\n", "import botocore\n", @@ -110,7 +138,7 @@ "import multiprocessing as mp\n", "from IPython.display import Image, display, HTML, JSON\n", "\n", - "# variables\n", + "# Connect to the S3 bucket where the data will be stored.\n", "data_bucket = sagemaker.Session().default_bucket()\n", "region = boto3.session.Session().region_name\n", "\n", @@ -135,24 +163,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "cc74349d", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + "100 21.0M 100 21.0M 0 0 13.6M 0 0:00:01 0:00:01 --:--:-- 13.6M\n" + ] + } + ], "source": [ "!curl https://idp-assets-wwso.s3.us-east-2.amazonaws.com/workshop-data/classification-training.zip --output classification-training.zip" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "582ed56f", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document archive extracted successfully...\n", + "Removing hidden files/directories: ./classification-training/invoices/.DS_Store\n", + "Removing hidden files/directories: ./classification-training/invoices/.ipynb_checkpoints\n", + "Removing hidden files/directories: ./classification-training/bank-statements/.DS_Store\n", + "Removing hidden files/directories: ./classification-training/bank-statements/.ipynb_checkpoints\n", + "Removing hidden files/directories: ./classification-training/receipts/.ipynb_checkpoints\n" + ] + } + ], "source": [ "import shutil\n", "\n", @@ -188,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "33b5e387", "metadata": { "scrolled": true, @@ -212,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "ba331a70", "metadata": { "tags": [] @@ -246,13 +297,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "b58e56f6", "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, "scrolled": true, "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(['bank-statements', 'receipts', 'invoices'],\n", + " ['idp/textract/receipts/receipt_90.png',\n", + " 'idp/textract/receipts/receipt_91.png',\n", + " 'idp/textract/receipts/receipt_92.png',\n", + " 'idp/textract/receipts/receipt_93.png',\n", + " 'idp/textract/receipts/receipt_94.png',\n", + " 'idp/textract/receipts/receipt_95.png',\n", + " 'idp/textract/receipts/receipt_96.png',\n", + " 'idp/textract/receipts/receipt_97.png',\n", + " 'idp/textract/receipts/receipt_98.png',\n", + " 'idp/textract/receipts/receipt_99.png'],\n", + " ['idp/textract/bank-statements/bank_stmt_0.png',\n", + " 'idp/textract/bank-statements/bank_stmt_1.png',\n", + " 'idp/textract/bank-statements/bank_stmt_10.png',\n", + " 'idp/textract/bank-statements/bank_stmt_11.png',\n", + " 'idp/textract/bank-statements/bank_stmt_12.png',\n", + " 'idp/textract/bank-statements/bank_stmt_13.png',\n", + " 'idp/textract/bank-statements/bank_stmt_14.png',\n", + " 'idp/textract/bank-statements/bank_stmt_15.png',\n", + " 'idp/textract/bank-statements/bank_stmt_16.png',\n", + " 'idp/textract/bank-statements/bank_stmt_17.png'])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "docs=[]\n", "\n", @@ -261,8 +347,10 @@ "\n", "if type(docs[0]) is list:\n", " docs=[item for sublist in docs for item in sublist]\n", - " \n", - "names, docs[-10:], docs[:10]" + " \n", + "assert len(docs) == 300 # we have 300 samples, 100 per category\n", + "\n", + "names, docs[-10:], docs[:10]\n" ] }, { @@ -280,9 +368,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, + "id": "3462c0e6-db16-4a13-9941-2ef202965aab", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Let's see how textract works with our docs\n", + "ref = f\"s3://{data_bucket}/ref-2911-25357.pdf\"\n", + "response = call_textract(input_document=ref)\n", + "lines = get_string(textract_json=response, output_type=[Textract_Pretty_Print.LINES])\n", + "\n", + "with open(\"ref_output.txt\", \"w\") as f:\n", + " f.write(lines)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "id": "3f93877e", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def textract_extract_text(document, bucket=data_bucket): \n", @@ -318,12 +426,320 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "dd4608d2", "metadata": { - "scrolled": true + "scrolled": true, + "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing document: idp/textract/bank-statements/bank_stmt_0.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_1.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_10.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_11.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_12.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_13.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_14.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_15.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_16.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_17.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_18.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_19.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_2.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_20.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_21.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_22.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_23.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_24.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_25.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_26.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_27.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_28.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_29.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_3.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_30.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_31.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_32.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_33.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_34.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_35.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_36.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_37.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_38.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_39.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_4.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_40.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_41.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_42.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_43.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_44.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_45.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_46.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_47.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_48.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_49.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_5.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_50.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_51.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_52.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_53.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_54.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_55.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_56.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_57.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_58.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_59.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_6.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_60.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_61.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_62.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_63.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_64.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_65.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_66.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_67.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_68.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_69.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_7.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_70.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_71.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_72.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_73.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_74.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_75.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_76.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_77.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_78.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_79.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_8.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_80.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_81.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_82.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_83.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_84.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_85.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_86.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_87.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_88.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_89.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_9.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_90.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_91.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_92.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_93.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_94.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_95.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_96.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_97.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_98.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_99.png\n", + "Processing document: idp/textract/invoices/invoice_0.png\n", + "Processing document: idp/textract/invoices/invoice_1.png\n", + "Processing document: idp/textract/invoices/invoice_10.png\n", + "Processing document: idp/textract/invoices/invoice_11.png\n", + "Processing document: idp/textract/invoices/invoice_12.png\n", + "Processing document: idp/textract/invoices/invoice_13.png\n", + "Processing document: idp/textract/invoices/invoice_14.png\n", + "Processing document: idp/textract/invoices/invoice_15.png\n", + "Processing document: idp/textract/invoices/invoice_16.png\n", + "Processing document: idp/textract/invoices/invoice_17.png\n", + "Processing document: idp/textract/invoices/invoice_18.png\n", + "Processing document: idp/textract/invoices/invoice_19.png\n", + "Processing document: idp/textract/invoices/invoice_2.png\n", + "Processing document: idp/textract/invoices/invoice_20.png\n", + "Processing document: idp/textract/invoices/invoice_21.png\n", + "Processing document: idp/textract/invoices/invoice_22.png\n", + "Processing document: idp/textract/invoices/invoice_23.png\n", + "Processing document: idp/textract/invoices/invoice_24.png\n", + "Processing document: idp/textract/invoices/invoice_25.png\n", + "Processing document: idp/textract/invoices/invoice_26.png\n", + "Processing document: idp/textract/invoices/invoice_27.png\n", + "Processing document: idp/textract/invoices/invoice_28.png\n", + "Processing document: idp/textract/invoices/invoice_29.png\n", + "Processing document: idp/textract/invoices/invoice_3.png\n", + "Processing document: idp/textract/invoices/invoice_30.png\n", + "Processing document: idp/textract/invoices/invoice_31.png\n", + "Processing document: idp/textract/invoices/invoice_32.png\n", + "Processing document: idp/textract/invoices/invoice_33.png\n", + "Processing document: idp/textract/invoices/invoice_34.png\n", + "Processing document: idp/textract/invoices/invoice_35.png\n", + "Processing document: idp/textract/invoices/invoice_36.png\n", + "Processing document: idp/textract/invoices/invoice_37.png\n", + "Processing document: idp/textract/invoices/invoice_38.png\n", + "Processing document: idp/textract/invoices/invoice_39.png\n", + "Processing document: idp/textract/invoices/invoice_4.png\n", + "Processing document: idp/textract/invoices/invoice_40.png\n", + "Processing document: idp/textract/invoices/invoice_41.png\n", + "Processing document: idp/textract/invoices/invoice_42.png\n", + "Processing document: idp/textract/invoices/invoice_43.png\n", + "Processing document: idp/textract/invoices/invoice_44.png\n", + "Processing document: idp/textract/invoices/invoice_45.png\n", + "Processing document: idp/textract/invoices/invoice_46.png\n", + "Processing document: idp/textract/invoices/invoice_47.png\n", + "Processing document: idp/textract/invoices/invoice_48.png\n", + "Processing document: idp/textract/invoices/invoice_49.png\n", + "Processing document: idp/textract/invoices/invoice_5.png\n", + "Processing document: idp/textract/invoices/invoice_50.png\n", + "Processing document: idp/textract/invoices/invoice_51.png\n", + "Processing document: idp/textract/invoices/invoice_52.png\n", + "Processing document: idp/textract/invoices/invoice_53.png\n", + "Processing document: idp/textract/invoices/invoice_54.png\n", + "Processing document: idp/textract/invoices/invoice_55.png\n", + "Processing document: idp/textract/invoices/invoice_56.png\n", + "Processing document: idp/textract/invoices/invoice_57.png\n", + "Processing document: idp/textract/invoices/invoice_58.png\n", + "Processing document: idp/textract/invoices/invoice_59.png\n", + "Processing document: idp/textract/invoices/invoice_6.png\n", + "Processing document: idp/textract/invoices/invoice_60.png\n", + "Processing document: idp/textract/invoices/invoice_61.png\n", + "Processing document: idp/textract/invoices/invoice_62.png\n", + "Processing document: idp/textract/invoices/invoice_63.png\n", + "Processing document: idp/textract/invoices/invoice_64.png\n", + "Processing document: idp/textract/invoices/invoice_65.png\n", + "Processing document: idp/textract/invoices/invoice_66.png\n", + "Processing document: idp/textract/invoices/invoice_67.png\n", + "Processing document: idp/textract/invoices/invoice_68.png\n", + "Processing document: idp/textract/invoices/invoice_69.png\n", + "Processing document: idp/textract/invoices/invoice_7.png\n", + "Processing document: idp/textract/invoices/invoice_70.png\n", + "Processing document: idp/textract/invoices/invoice_71.png\n", + "Processing document: idp/textract/invoices/invoice_72.png\n", + "Processing document: idp/textract/invoices/invoice_73.png\n", + "Processing document: idp/textract/invoices/invoice_74.png\n", + "Processing document: idp/textract/invoices/invoice_75.png\n", + "Processing document: idp/textract/invoices/invoice_76.png\n", + "Processing document: idp/textract/invoices/invoice_77.png\n", + "Processing document: idp/textract/invoices/invoice_78.png\n", + "Processing document: idp/textract/invoices/invoice_79.png\n", + "Processing document: idp/textract/invoices/invoice_8.png\n", + "Processing document: idp/textract/invoices/invoice_80.png\n", + "Processing document: idp/textract/invoices/invoice_81.png\n", + "Processing document: idp/textract/invoices/invoice_82.png\n", + "Processing document: idp/textract/invoices/invoice_83.png\n", + "Processing document: idp/textract/invoices/invoice_84.png\n", + "Processing document: idp/textract/invoices/invoice_85.png\n", + "Processing document: idp/textract/invoices/invoice_86.png\n", + "Processing document: idp/textract/invoices/invoice_87.png\n", + "Processing document: idp/textract/invoices/invoice_88.png\n", + "Processing document: idp/textract/invoices/invoice_89.png\n", + "Processing document: idp/textract/invoices/invoice_9.png\n", + "Processing document: idp/textract/invoices/invoice_90.png\n", + "Processing document: idp/textract/invoices/invoice_91.png\n", + "Processing document: idp/textract/invoices/invoice_92.png\n", + "Processing document: idp/textract/invoices/invoice_93.png\n", + "Processing document: idp/textract/invoices/invoice_94.png\n", + "Processing document: idp/textract/invoices/invoice_95.png\n", + "Processing document: idp/textract/invoices/invoice_96.png\n", + "Processing document: idp/textract/invoices/invoice_97.png\n", + "Processing document: idp/textract/invoices/invoice_98.png\n", + "Processing document: idp/textract/invoices/invoice_99.png\n", + "Processing document: idp/textract/receipts/receipt_0.png\n", + "Processing document: idp/textract/receipts/receipt_1.png\n", + "Processing document: idp/textract/receipts/receipt_10.png\n", + "Processing document: idp/textract/receipts/receipt_11.png\n", + "Processing document: idp/textract/receipts/receipt_12.png\n", + "Processing document: idp/textract/receipts/receipt_13.png\n", + "Processing document: idp/textract/receipts/receipt_14.png\n", + "Processing document: idp/textract/receipts/receipt_15.png\n", + "Processing document: idp/textract/receipts/receipt_16.png\n", + "Processing document: idp/textract/receipts/receipt_17.png\n", + "Processing document: idp/textract/receipts/receipt_18.png\n", + "Processing document: idp/textract/receipts/receipt_19.png\n", + "Processing document: idp/textract/receipts/receipt_2.png\n", + "Processing document: idp/textract/receipts/receipt_20.png\n", + "Processing document: idp/textract/receipts/receipt_21.png\n", + "Processing document: idp/textract/receipts/receipt_22.png\n", + "Processing document: idp/textract/receipts/receipt_23.png\n", + "Processing document: idp/textract/receipts/receipt_24.png\n", + "Processing document: idp/textract/receipts/receipt_25.png\n", + "Processing document: idp/textract/receipts/receipt_26.png\n", + "Processing document: idp/textract/receipts/receipt_27.png\n", + "Processing document: idp/textract/receipts/receipt_28.png\n", + "Processing document: idp/textract/receipts/receipt_29.png\n", + "Processing document: idp/textract/receipts/receipt_3.png\n", + "Processing document: idp/textract/receipts/receipt_30.png\n", + "Processing document: idp/textract/receipts/receipt_31.png\n", + "Processing document: idp/textract/receipts/receipt_32.png\n", + "Processing document: idp/textract/receipts/receipt_33.png\n", + "Processing document: idp/textract/receipts/receipt_34.png\n", + "Processing document: idp/textract/receipts/receipt_35.png\n", + "Processing document: idp/textract/receipts/receipt_36.png\n", + "Processing document: idp/textract/receipts/receipt_37.png\n", + "Processing document: idp/textract/receipts/receipt_38.png\n", + "Processing document: idp/textract/receipts/receipt_39.png\n", + "Processing document: idp/textract/receipts/receipt_4.png\n", + "Processing document: idp/textract/receipts/receipt_40.png\n", + "Processing document: idp/textract/receipts/receipt_41.png\n", + "Processing document: idp/textract/receipts/receipt_42.png\n", + "Processing document: idp/textract/receipts/receipt_43.png\n", + "Processing document: idp/textract/receipts/receipt_44.png\n", + "Processing document: idp/textract/receipts/receipt_45.png\n", + "Processing document: idp/textract/receipts/receipt_46.png\n", + "Processing document: idp/textract/receipts/receipt_47.png\n", + "Processing document: idp/textract/receipts/receipt_48.png\n", + "Processing document: idp/textract/receipts/receipt_49.png\n", + "Processing document: idp/textract/receipts/receipt_5.png\n", + "Processing document: idp/textract/receipts/receipt_50.png\n", + "Processing document: idp/textract/receipts/receipt_51.png\n", + "Processing document: idp/textract/receipts/receipt_52.png\n", + "Processing document: idp/textract/receipts/receipt_53.png\n", + "Processing document: idp/textract/receipts/receipt_54.png\n", + "Processing document: idp/textract/receipts/receipt_55.png\n", + "Processing document: idp/textract/receipts/receipt_56.png\n", + "Processing document: idp/textract/receipts/receipt_57.png\n", + "Processing document: idp/textract/receipts/receipt_58.png\n", + "Processing document: idp/textract/receipts/receipt_59.png\n", + "Processing document: idp/textract/receipts/receipt_6.png\n", + "Processing document: idp/textract/receipts/receipt_60.png\n", + "Processing document: idp/textract/receipts/receipt_61.png\n", + "Processing document: idp/textract/receipts/receipt_62.png\n", + "Processing document: idp/textract/receipts/receipt_63.png\n", + "Processing document: idp/textract/receipts/receipt_64.png\n", + "Processing document: idp/textract/receipts/receipt_65.png\n", + "Processing document: idp/textract/receipts/receipt_66.png\n", + "Processing document: idp/textract/receipts/receipt_67.png\n", + "Processing document: idp/textract/receipts/receipt_68.png\n", + "Processing document: idp/textract/receipts/receipt_69.png\n", + "Processing document: idp/textract/receipts/receipt_7.png\n", + "Processing document: idp/textract/receipts/receipt_70.png\n", + "Processing document: idp/textract/receipts/receipt_71.png\n", + "Processing document: idp/textract/receipts/receipt_72.png\n", + "Processing document: idp/textract/receipts/receipt_73.png\n", + "Processing document: idp/textract/receipts/receipt_74.png\n", + "Processing document: idp/textract/receipts/receipt_75.png\n", + "Processing document: idp/textract/receipts/receipt_76.png\n", + "Processing document: idp/textract/receipts/receipt_77.png\n", + "Processing document: idp/textract/receipts/receipt_78.png\n", + "Processing document: idp/textract/receipts/receipt_79.png\n", + "Processing document: idp/textract/receipts/receipt_8.png\n", + "Processing document: idp/textract/receipts/receipt_80.png\n", + "Processing document: idp/textract/receipts/receipt_81.png\n", + "Processing document: idp/textract/receipts/receipt_82.png\n", + "Processing document: idp/textract/receipts/receipt_83.png\n", + "Processing document: idp/textract/receipts/receipt_84.png\n", + "Processing document: idp/textract/receipts/receipt_85.png\n", + "Processing document: idp/textract/receipts/receipt_86.png\n", + "Processing document: idp/textract/receipts/receipt_87.png\n", + "Processing document: idp/textract/receipts/receipt_88.png\n", + "Processing document: idp/textract/receipts/receipt_89.png\n", + "Processing document: idp/textract/receipts/receipt_9.png\n", + "Processing document: idp/textract/receipts/receipt_90.png\n", + "Processing document: idp/textract/receipts/receipt_91.png\n", + "Processing document: idp/textract/receipts/receipt_92.png\n", + "Processing document: idp/textract/receipts/receipt_93.png\n", + "Processing document: idp/textract/receipts/receipt_94.png\n", + "Processing document: idp/textract/receipts/receipt_95.png\n", + "Processing document: idp/textract/receipts/receipt_96.png\n", + "Processing document: idp/textract/receipts/receipt_97.png\n", + "Processing document: idp/textract/receipts/receipt_98.png\n", + "Processing document: idp/textract/receipts/receipt_99.png\n" + ] + } + ], "source": [ "pool = mp.Pool(mp.cpu_count())\n", "pool_results = [pool.apply_async(textract_extract_text, (document,data_bucket)) for document in docs]\n", @@ -344,13 +760,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "31957382", - "metadata": {}, - "outputs": [], + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Page 1 of 5 03/02/2022\\nDC 1090001004290\\nAnyCompany Bank\\n999-99-99-99 16769 3 C 001 11 S 66 002\\nJANE DOE\\n7972 JERMAINE MOUNTAIN, EAST JONATHANFORT, MD 24167-5046\\nYour consolidated statement\\nContact us\\n(858) LLL-0101 or\\nFor 03/02/2022\\nexample.com\\n(858) 555-0101\\nDo more with digital banking\\nBank without having to leave home. Check your account balances, make transfers, pay bills and deposit checks with your mobile device. If\\nyou are not enrolled in digital banking, it only lakes a minute Gel started today at example.com/U\\nExample Bank, Member FDIC. To learn more, visit example.com/ABCXYZ ©2020 AnyCompany Bank.\\nIf you are traveling outside of the USA and have concerns about accessing your account while you are traveling, please contact your\\nBranch Banker or call us at 858-LLL-0101.\\nSummary of your accounts\\nACCOUNT NAME\\nACCOUNT NUMBER\\nBALANCE ($)\\nDETAILS ON\\nCHECKING\\n003884257406\\n19,102.60\\npage 1\\nTotal checking and money market savings accounts\\n$19,102.60\\nSAVINGS\\n388425740636\\n9,762.71\\npage 3\\nTotal savings accounts\\n$9,762.71\\nChecking and money market savings accounts\\nCHECKING 003884257406\\nAccount summary\\nYour previous balance as of 03/02/2022\\n$15,572.91\\nChecks\\n1,895.76\\nOther withdrawals, debits and service charges\\n887.69\\nDeposits, credits and interest\\n+2,871.72\\nYour new balance as of 06/17/2020\\n=\\n$19,102.60\\nAverage Posted Balance in Statement Cycle\\n$1,499.02\\nChecks\\nDATE\\nCHECK #\\nAMOUNT ($)\\nDATE\\nCHECK #\\nAMOUNT ($)\\nDATE\\nCHECK #\\nAMOUNT ($)\\n05/26\\n1401\\n450.00\\n06/05\\n*965025\\n101.39\\n06/09\\n985026\\n150.00\\n* indicates a skip in sequential check numbers above this item\\nTotal checks\\n= $701.39\\nOther windrawals, debits and service charges can be found in full statement\\nPage 1 of 1\\n0000667\\n'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "comprehend_df = pd.DataFrame(labeled_collection, columns=['label','document'])\n", - "comprehend_df" + "# the document column contains the extracted data, let's compare it against `bank_stmt_0.png`\n", + "comprehend_df.at[0, 'document']" ] }, { @@ -365,9 +795,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "ac163d35", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Upload Comprehend training data to S3\n", @@ -1096,11 +1528,499 @@ } ], "metadata": { + "availableInstances": [ + { + "_defaultOrder": 0, + "_isFastLaunch": true, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 4, + "name": "ml.t3.medium", + "vcpuNum": 2 + }, + { + "_defaultOrder": 1, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 8, + "name": "ml.t3.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 2, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 16, + "name": "ml.t3.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 3, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 32, + "name": "ml.t3.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 4, + "_isFastLaunch": true, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 8, + "name": "ml.m5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 5, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 16, + "name": "ml.m5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 6, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 32, + "name": "ml.m5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 7, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 64, + "name": "ml.m5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 8, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 128, + "name": "ml.m5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 9, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 192, + "name": "ml.m5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 10, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 256, + "name": "ml.m5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 11, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 384, + "name": "ml.m5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 12, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 8, + "name": "ml.m5d.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 13, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 16, + "name": "ml.m5d.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 14, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 32, + "name": "ml.m5d.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 15, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 64, + "name": "ml.m5d.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 16, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 128, + "name": "ml.m5d.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 17, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 192, + "name": "ml.m5d.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 18, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 256, + "name": "ml.m5d.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 19, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 384, + "name": "ml.m5d.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 20, + "_isFastLaunch": true, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 4, + "name": "ml.c5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 21, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 8, + "name": "ml.c5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 22, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 16, + "name": "ml.c5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 23, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 32, + "name": "ml.c5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 24, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 72, + "name": "ml.c5.9xlarge", + "vcpuNum": 36 + }, + { + "_defaultOrder": 25, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 96, + "name": "ml.c5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 26, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 144, + "name": "ml.c5.18xlarge", + "vcpuNum": 72 + }, + { + "_defaultOrder": 27, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 192, + "name": "ml.c5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 28, + "_isFastLaunch": true, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 16, + "name": "ml.g4dn.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 29, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 32, + "name": "ml.g4dn.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 30, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 64, + "name": "ml.g4dn.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 31, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 128, + "name": "ml.g4dn.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 32, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "memoryGiB": 192, + "name": "ml.g4dn.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 33, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 256, + "name": "ml.g4dn.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 34, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 61, + "name": "ml.p3.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 35, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "memoryGiB": 244, + "name": "ml.p3.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 36, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "memoryGiB": 488, + "name": "ml.p3.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 37, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "memoryGiB": 768, + "name": "ml.p3dn.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 38, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 16, + "name": "ml.r5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 39, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 32, + "name": "ml.r5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 40, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 64, + "name": "ml.r5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 41, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 128, + "name": "ml.r5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 42, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 256, + "name": "ml.r5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 43, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 384, + "name": "ml.r5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 44, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 512, + "name": "ml.r5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 45, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 768, + "name": "ml.r5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 46, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 16, + "name": "ml.g5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 47, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 32, + "name": "ml.g5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 48, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 64, + "name": "ml.g5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 49, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 128, + "name": "ml.g5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 50, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 256, + "name": "ml.g5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 51, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "memoryGiB": 192, + "name": "ml.g5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 52, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "memoryGiB": 384, + "name": "ml.g5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 53, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "memoryGiB": 768, + "name": "ml.g5.48xlarge", + "vcpuNum": 192 + } + ], "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:470317259841:image/datascience-1.0" }, "language_info": { "codemirror_mode": {