diff --git a/.gitignore b/.gitignore index 0323ef1..748a40d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ .ipynb_checkpoints .DS_Store -**/.DS_Store \ No newline at end of file +**/.DS_Store + +classification-training.zip +classification-training/ +comprehend_train_data.csv +ref_output.txt diff --git a/01-idp-document-classification.ipynb b/01-idp-document-classification.ipynb index babb453..62cd9ee 100644 --- a/01-idp-document-classification.ipynb +++ b/01-idp-document-classification.ipynb @@ -54,13 +54,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "0a67449e", "metadata": { "scrolled": true, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], "source": [ "!python -m pip install -q amazon-textract-response-parser --upgrade\n", "!python -m pip install -q amazon-textract-caller --upgrade\n", @@ -69,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "f7f8a235", "metadata": { "tags": [] @@ -91,12 +110,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "66660511", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SageMaker role is: arn:aws:iam::981147895656:role/idp-temp-sagemaker-SageMakerExecutionRole-1TSM55YJNFLJR\n", + "Default SageMaker Bucket: s3://sagemaker-eu-west-1-981147895656\n" + ] + } + ], "source": [ "import boto3\n", "import botocore\n", @@ -110,7 +138,7 @@ "import multiprocessing as mp\n", "from IPython.display import Image, display, HTML, JSON\n", "\n", - "# variables\n", + "# Connect to the S3 bucket where the data will be stored.\n", "data_bucket = sagemaker.Session().default_bucket()\n", "region = boto3.session.Session().region_name\n", "\n", @@ -135,24 +163,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "cc74349d", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " % Total % Received % Xferd Average Speed Time Time Time Current\n", + " Dload Upload Total Spent Left Speed\n", + "100 21.0M 100 21.0M 0 0 13.6M 0 0:00:01 0:00:01 --:--:-- 13.6M\n" + ] + } + ], "source": [ "!curl https://idp-assets-wwso.s3.us-east-2.amazonaws.com/workshop-data/classification-training.zip --output classification-training.zip" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "582ed56f", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Document archive extracted successfully...\n", + "Removing hidden files/directories: ./classification-training/invoices/.DS_Store\n", + "Removing hidden files/directories: ./classification-training/invoices/.ipynb_checkpoints\n", + "Removing hidden files/directories: ./classification-training/bank-statements/.DS_Store\n", + "Removing hidden files/directories: ./classification-training/bank-statements/.ipynb_checkpoints\n", + "Removing hidden files/directories: ./classification-training/receipts/.ipynb_checkpoints\n" + ] + } + ], "source": [ "import shutil\n", "\n", @@ -188,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "33b5e387", "metadata": { "scrolled": true, @@ -212,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "ba331a70", "metadata": { "tags": [] @@ -246,13 +297,48 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "b58e56f6", "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, "scrolled": true, "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(['bank-statements', 'receipts', 'invoices'],\n", + " ['idp/textract/receipts/receipt_90.png',\n", + " 'idp/textract/receipts/receipt_91.png',\n", + " 'idp/textract/receipts/receipt_92.png',\n", + " 'idp/textract/receipts/receipt_93.png',\n", + " 'idp/textract/receipts/receipt_94.png',\n", + " 'idp/textract/receipts/receipt_95.png',\n", + " 'idp/textract/receipts/receipt_96.png',\n", + " 'idp/textract/receipts/receipt_97.png',\n", + " 'idp/textract/receipts/receipt_98.png',\n", + " 'idp/textract/receipts/receipt_99.png'],\n", + " ['idp/textract/bank-statements/bank_stmt_0.png',\n", + " 'idp/textract/bank-statements/bank_stmt_1.png',\n", + " 'idp/textract/bank-statements/bank_stmt_10.png',\n", + " 'idp/textract/bank-statements/bank_stmt_11.png',\n", + " 'idp/textract/bank-statements/bank_stmt_12.png',\n", + " 'idp/textract/bank-statements/bank_stmt_13.png',\n", + " 'idp/textract/bank-statements/bank_stmt_14.png',\n", + " 'idp/textract/bank-statements/bank_stmt_15.png',\n", + " 'idp/textract/bank-statements/bank_stmt_16.png',\n", + " 'idp/textract/bank-statements/bank_stmt_17.png'])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "docs=[]\n", "\n", @@ -261,8 +347,10 @@ "\n", "if type(docs[0]) is list:\n", " docs=[item for sublist in docs for item in sublist]\n", - " \n", - "names, docs[-10:], docs[:10]" + " \n", + "assert len(docs) == 300 # we have 300 samples, 100 per category\n", + "\n", + "names, docs[-10:], docs[:10]\n" ] }, { @@ -280,9 +368,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, + "id": "3462c0e6-db16-4a13-9941-2ef202965aab", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Let's see how textract works with our docs\n", + "ref = f\"s3://{data_bucket}/ref-2911-25357.pdf\"\n", + "response = call_textract(input_document=ref)\n", + "lines = get_string(textract_json=response, output_type=[Textract_Pretty_Print.LINES])\n", + "\n", + "with open(\"ref_output.txt\", \"w\") as f:\n", + " f.write(lines)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "id": "3f93877e", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def textract_extract_text(document, bucket=data_bucket): \n", @@ -318,12 +426,320 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "dd4608d2", "metadata": { - "scrolled": true + "scrolled": true, + "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing document: idp/textract/bank-statements/bank_stmt_0.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_1.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_10.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_11.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_12.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_13.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_14.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_15.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_16.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_17.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_18.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_19.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_2.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_20.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_21.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_22.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_23.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_24.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_25.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_26.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_27.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_28.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_29.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_3.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_30.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_31.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_32.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_33.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_34.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_35.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_36.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_37.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_38.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_39.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_4.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_40.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_41.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_42.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_43.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_44.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_45.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_46.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_47.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_48.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_49.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_5.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_50.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_51.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_52.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_53.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_54.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_55.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_56.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_57.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_58.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_59.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_6.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_60.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_61.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_62.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_63.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_64.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_65.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_66.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_67.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_68.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_69.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_7.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_70.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_71.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_72.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_73.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_74.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_75.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_76.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_77.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_78.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_79.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_8.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_80.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_81.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_82.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_83.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_84.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_85.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_86.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_87.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_88.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_89.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_9.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_90.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_91.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_92.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_93.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_94.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_95.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_96.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_97.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_98.png\n", + "Processing document: idp/textract/bank-statements/bank_stmt_99.png\n", + "Processing document: idp/textract/invoices/invoice_0.png\n", + "Processing document: idp/textract/invoices/invoice_1.png\n", + "Processing document: idp/textract/invoices/invoice_10.png\n", + "Processing document: idp/textract/invoices/invoice_11.png\n", + "Processing document: idp/textract/invoices/invoice_12.png\n", + "Processing document: idp/textract/invoices/invoice_13.png\n", + "Processing document: idp/textract/invoices/invoice_14.png\n", + "Processing document: idp/textract/invoices/invoice_15.png\n", + "Processing document: idp/textract/invoices/invoice_16.png\n", + "Processing document: idp/textract/invoices/invoice_17.png\n", + "Processing document: idp/textract/invoices/invoice_18.png\n", + "Processing document: idp/textract/invoices/invoice_19.png\n", + "Processing document: idp/textract/invoices/invoice_2.png\n", + "Processing document: idp/textract/invoices/invoice_20.png\n", + "Processing document: idp/textract/invoices/invoice_21.png\n", + "Processing document: idp/textract/invoices/invoice_22.png\n", + "Processing document: idp/textract/invoices/invoice_23.png\n", + "Processing document: idp/textract/invoices/invoice_24.png\n", + "Processing document: idp/textract/invoices/invoice_25.png\n", + "Processing document: idp/textract/invoices/invoice_26.png\n", + "Processing document: idp/textract/invoices/invoice_27.png\n", + "Processing document: idp/textract/invoices/invoice_28.png\n", + "Processing document: idp/textract/invoices/invoice_29.png\n", + "Processing document: idp/textract/invoices/invoice_3.png\n", + "Processing document: idp/textract/invoices/invoice_30.png\n", + "Processing document: idp/textract/invoices/invoice_31.png\n", + "Processing document: idp/textract/invoices/invoice_32.png\n", + "Processing document: idp/textract/invoices/invoice_33.png\n", + "Processing document: idp/textract/invoices/invoice_34.png\n", + "Processing document: idp/textract/invoices/invoice_35.png\n", + "Processing document: idp/textract/invoices/invoice_36.png\n", + "Processing document: idp/textract/invoices/invoice_37.png\n", + "Processing document: idp/textract/invoices/invoice_38.png\n", + "Processing document: idp/textract/invoices/invoice_39.png\n", + "Processing document: idp/textract/invoices/invoice_4.png\n", + "Processing document: idp/textract/invoices/invoice_40.png\n", + "Processing document: idp/textract/invoices/invoice_41.png\n", + "Processing document: idp/textract/invoices/invoice_42.png\n", + "Processing document: idp/textract/invoices/invoice_43.png\n", + "Processing document: idp/textract/invoices/invoice_44.png\n", + "Processing document: idp/textract/invoices/invoice_45.png\n", + "Processing document: idp/textract/invoices/invoice_46.png\n", + "Processing document: idp/textract/invoices/invoice_47.png\n", + "Processing document: idp/textract/invoices/invoice_48.png\n", + "Processing document: idp/textract/invoices/invoice_49.png\n", + "Processing document: idp/textract/invoices/invoice_5.png\n", + "Processing document: idp/textract/invoices/invoice_50.png\n", + "Processing document: idp/textract/invoices/invoice_51.png\n", + "Processing document: idp/textract/invoices/invoice_52.png\n", + "Processing document: idp/textract/invoices/invoice_53.png\n", + "Processing document: idp/textract/invoices/invoice_54.png\n", + "Processing document: idp/textract/invoices/invoice_55.png\n", + "Processing document: idp/textract/invoices/invoice_56.png\n", + "Processing document: idp/textract/invoices/invoice_57.png\n", + "Processing document: idp/textract/invoices/invoice_58.png\n", + "Processing document: idp/textract/invoices/invoice_59.png\n", + "Processing document: idp/textract/invoices/invoice_6.png\n", + "Processing document: idp/textract/invoices/invoice_60.png\n", + "Processing document: idp/textract/invoices/invoice_61.png\n", + "Processing document: idp/textract/invoices/invoice_62.png\n", + "Processing document: idp/textract/invoices/invoice_63.png\n", + "Processing document: idp/textract/invoices/invoice_64.png\n", + "Processing document: idp/textract/invoices/invoice_65.png\n", + "Processing document: idp/textract/invoices/invoice_66.png\n", + "Processing document: idp/textract/invoices/invoice_67.png\n", + "Processing document: idp/textract/invoices/invoice_68.png\n", + "Processing document: idp/textract/invoices/invoice_69.png\n", + "Processing document: idp/textract/invoices/invoice_7.png\n", + "Processing document: idp/textract/invoices/invoice_70.png\n", + "Processing document: idp/textract/invoices/invoice_71.png\n", + "Processing document: idp/textract/invoices/invoice_72.png\n", + "Processing document: idp/textract/invoices/invoice_73.png\n", + "Processing document: idp/textract/invoices/invoice_74.png\n", + "Processing document: idp/textract/invoices/invoice_75.png\n", + "Processing document: idp/textract/invoices/invoice_76.png\n", + "Processing document: idp/textract/invoices/invoice_77.png\n", + "Processing document: idp/textract/invoices/invoice_78.png\n", + "Processing document: idp/textract/invoices/invoice_79.png\n", + "Processing document: idp/textract/invoices/invoice_8.png\n", + "Processing document: idp/textract/invoices/invoice_80.png\n", + "Processing document: idp/textract/invoices/invoice_81.png\n", + "Processing document: idp/textract/invoices/invoice_82.png\n", + "Processing document: idp/textract/invoices/invoice_83.png\n", + "Processing document: idp/textract/invoices/invoice_84.png\n", + "Processing document: idp/textract/invoices/invoice_85.png\n", + "Processing document: idp/textract/invoices/invoice_86.png\n", + "Processing document: idp/textract/invoices/invoice_87.png\n", + "Processing document: idp/textract/invoices/invoice_88.png\n", + "Processing document: idp/textract/invoices/invoice_89.png\n", + "Processing document: idp/textract/invoices/invoice_9.png\n", + "Processing document: idp/textract/invoices/invoice_90.png\n", + "Processing document: idp/textract/invoices/invoice_91.png\n", + "Processing document: idp/textract/invoices/invoice_92.png\n", + "Processing document: idp/textract/invoices/invoice_93.png\n", + "Processing document: idp/textract/invoices/invoice_94.png\n", + "Processing document: idp/textract/invoices/invoice_95.png\n", + "Processing document: idp/textract/invoices/invoice_96.png\n", + "Processing document: idp/textract/invoices/invoice_97.png\n", + "Processing document: idp/textract/invoices/invoice_98.png\n", + "Processing document: idp/textract/invoices/invoice_99.png\n", + "Processing document: idp/textract/receipts/receipt_0.png\n", + "Processing document: idp/textract/receipts/receipt_1.png\n", + "Processing document: idp/textract/receipts/receipt_10.png\n", + "Processing document: idp/textract/receipts/receipt_11.png\n", + "Processing document: idp/textract/receipts/receipt_12.png\n", + "Processing document: idp/textract/receipts/receipt_13.png\n", + "Processing document: idp/textract/receipts/receipt_14.png\n", + "Processing document: idp/textract/receipts/receipt_15.png\n", + "Processing document: idp/textract/receipts/receipt_16.png\n", + "Processing document: idp/textract/receipts/receipt_17.png\n", + "Processing document: idp/textract/receipts/receipt_18.png\n", + "Processing document: idp/textract/receipts/receipt_19.png\n", + "Processing document: idp/textract/receipts/receipt_2.png\n", + "Processing document: idp/textract/receipts/receipt_20.png\n", + "Processing document: idp/textract/receipts/receipt_21.png\n", + "Processing document: idp/textract/receipts/receipt_22.png\n", + "Processing document: idp/textract/receipts/receipt_23.png\n", + "Processing document: idp/textract/receipts/receipt_24.png\n", + "Processing document: idp/textract/receipts/receipt_25.png\n", + "Processing document: idp/textract/receipts/receipt_26.png\n", + "Processing document: idp/textract/receipts/receipt_27.png\n", + "Processing document: idp/textract/receipts/receipt_28.png\n", + "Processing document: idp/textract/receipts/receipt_29.png\n", + "Processing document: idp/textract/receipts/receipt_3.png\n", + "Processing document: idp/textract/receipts/receipt_30.png\n", + "Processing document: idp/textract/receipts/receipt_31.png\n", + "Processing document: idp/textract/receipts/receipt_32.png\n", + "Processing document: idp/textract/receipts/receipt_33.png\n", + "Processing document: idp/textract/receipts/receipt_34.png\n", + "Processing document: idp/textract/receipts/receipt_35.png\n", + "Processing document: idp/textract/receipts/receipt_36.png\n", + "Processing document: idp/textract/receipts/receipt_37.png\n", + "Processing document: idp/textract/receipts/receipt_38.png\n", + "Processing document: idp/textract/receipts/receipt_39.png\n", + "Processing document: idp/textract/receipts/receipt_4.png\n", + "Processing document: idp/textract/receipts/receipt_40.png\n", + "Processing document: idp/textract/receipts/receipt_41.png\n", + "Processing document: idp/textract/receipts/receipt_42.png\n", + "Processing document: idp/textract/receipts/receipt_43.png\n", + "Processing document: idp/textract/receipts/receipt_44.png\n", + "Processing document: idp/textract/receipts/receipt_45.png\n", + "Processing document: idp/textract/receipts/receipt_46.png\n", + "Processing document: idp/textract/receipts/receipt_47.png\n", + "Processing document: idp/textract/receipts/receipt_48.png\n", + "Processing document: idp/textract/receipts/receipt_49.png\n", + "Processing document: idp/textract/receipts/receipt_5.png\n", + "Processing document: idp/textract/receipts/receipt_50.png\n", + "Processing document: idp/textract/receipts/receipt_51.png\n", + "Processing document: idp/textract/receipts/receipt_52.png\n", + "Processing document: idp/textract/receipts/receipt_53.png\n", + "Processing document: idp/textract/receipts/receipt_54.png\n", + "Processing document: idp/textract/receipts/receipt_55.png\n", + "Processing document: idp/textract/receipts/receipt_56.png\n", + "Processing document: idp/textract/receipts/receipt_57.png\n", + "Processing document: idp/textract/receipts/receipt_58.png\n", + "Processing document: idp/textract/receipts/receipt_59.png\n", + "Processing document: idp/textract/receipts/receipt_6.png\n", + "Processing document: idp/textract/receipts/receipt_60.png\n", + "Processing document: idp/textract/receipts/receipt_61.png\n", + "Processing document: idp/textract/receipts/receipt_62.png\n", + "Processing document: idp/textract/receipts/receipt_63.png\n", + "Processing document: idp/textract/receipts/receipt_64.png\n", + "Processing document: idp/textract/receipts/receipt_65.png\n", + "Processing document: idp/textract/receipts/receipt_66.png\n", + "Processing document: idp/textract/receipts/receipt_67.png\n", + "Processing document: idp/textract/receipts/receipt_68.png\n", + "Processing document: idp/textract/receipts/receipt_69.png\n", + "Processing document: idp/textract/receipts/receipt_7.png\n", + "Processing document: idp/textract/receipts/receipt_70.png\n", + "Processing document: idp/textract/receipts/receipt_71.png\n", + "Processing document: idp/textract/receipts/receipt_72.png\n", + "Processing document: idp/textract/receipts/receipt_73.png\n", + "Processing document: idp/textract/receipts/receipt_74.png\n", + "Processing document: idp/textract/receipts/receipt_75.png\n", + "Processing document: idp/textract/receipts/receipt_76.png\n", + "Processing document: idp/textract/receipts/receipt_77.png\n", + "Processing document: idp/textract/receipts/receipt_78.png\n", + "Processing document: idp/textract/receipts/receipt_79.png\n", + "Processing document: idp/textract/receipts/receipt_8.png\n", + "Processing document: idp/textract/receipts/receipt_80.png\n", + "Processing document: idp/textract/receipts/receipt_81.png\n", + "Processing document: idp/textract/receipts/receipt_82.png\n", + "Processing document: idp/textract/receipts/receipt_83.png\n", + "Processing document: idp/textract/receipts/receipt_84.png\n", + "Processing document: idp/textract/receipts/receipt_85.png\n", + "Processing document: idp/textract/receipts/receipt_86.png\n", + "Processing document: idp/textract/receipts/receipt_87.png\n", + "Processing document: idp/textract/receipts/receipt_88.png\n", + "Processing document: idp/textract/receipts/receipt_89.png\n", + "Processing document: idp/textract/receipts/receipt_9.png\n", + "Processing document: idp/textract/receipts/receipt_90.png\n", + "Processing document: idp/textract/receipts/receipt_91.png\n", + "Processing document: idp/textract/receipts/receipt_92.png\n", + "Processing document: idp/textract/receipts/receipt_93.png\n", + "Processing document: idp/textract/receipts/receipt_94.png\n", + "Processing document: idp/textract/receipts/receipt_95.png\n", + "Processing document: idp/textract/receipts/receipt_96.png\n", + "Processing document: idp/textract/receipts/receipt_97.png\n", + "Processing document: idp/textract/receipts/receipt_98.png\n", + "Processing document: idp/textract/receipts/receipt_99.png\n" + ] + } + ], "source": [ "pool = mp.Pool(mp.cpu_count())\n", "pool_results = [pool.apply_async(textract_extract_text, (document,data_bucket)) for document in docs]\n", @@ -344,13 +760,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "31957382", - "metadata": {}, - "outputs": [], + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'Page 1 of 5 03/02/2022\\nDC 1090001004290\\nAnyCompany Bank\\n999-99-99-99 16769 3 C 001 11 S 66 002\\nJANE DOE\\n7972 JERMAINE MOUNTAIN, EAST JONATHANFORT, MD 24167-5046\\nYour consolidated statement\\nContact us\\n(858) LLL-0101 or\\nFor 03/02/2022\\nexample.com\\n(858) 555-0101\\nDo more with digital banking\\nBank without having to leave home. Check your account balances, make transfers, pay bills and deposit checks with your mobile device. If\\nyou are not enrolled in digital banking, it only lakes a minute Gel started today at example.com/U\\nExample Bank, Member FDIC. To learn more, visit example.com/ABCXYZ ©2020 AnyCompany Bank.\\nIf you are traveling outside of the USA and have concerns about accessing your account while you are traveling, please contact your\\nBranch Banker or call us at 858-LLL-0101.\\nSummary of your accounts\\nACCOUNT NAME\\nACCOUNT NUMBER\\nBALANCE ($)\\nDETAILS ON\\nCHECKING\\n003884257406\\n19,102.60\\npage 1\\nTotal checking and money market savings accounts\\n$19,102.60\\nSAVINGS\\n388425740636\\n9,762.71\\npage 3\\nTotal savings accounts\\n$9,762.71\\nChecking and money market savings accounts\\nCHECKING 003884257406\\nAccount summary\\nYour previous balance as of 03/02/2022\\n$15,572.91\\nChecks\\n1,895.76\\nOther withdrawals, debits and service charges\\n887.69\\nDeposits, credits and interest\\n+2,871.72\\nYour new balance as of 06/17/2020\\n=\\n$19,102.60\\nAverage Posted Balance in Statement Cycle\\n$1,499.02\\nChecks\\nDATE\\nCHECK #\\nAMOUNT ($)\\nDATE\\nCHECK #\\nAMOUNT ($)\\nDATE\\nCHECK #\\nAMOUNT ($)\\n05/26\\n1401\\n450.00\\n06/05\\n*965025\\n101.39\\n06/09\\n985026\\n150.00\\n* indicates a skip in sequential check numbers above this item\\nTotal checks\\n= $701.39\\nOther windrawals, debits and service charges can be found in full statement\\nPage 1 of 1\\n0000667\\n'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "comprehend_df = pd.DataFrame(labeled_collection, columns=['label','document'])\n", - "comprehend_df" + "# the document column contains the extracted data, let's compare it against `bank_stmt_0.png`\n", + "comprehend_df.at[0, 'document']" ] }, { @@ -365,9 +795,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "ac163d35", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# Upload Comprehend training data to S3\n", @@ -1096,11 +1528,499 @@ } ], "metadata": { + "availableInstances": [ + { + "_defaultOrder": 0, + "_isFastLaunch": true, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 4, + "name": "ml.t3.medium", + "vcpuNum": 2 + }, + { + "_defaultOrder": 1, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 8, + "name": "ml.t3.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 2, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 16, + "name": "ml.t3.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 3, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 32, + "name": "ml.t3.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 4, + "_isFastLaunch": true, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 8, + "name": "ml.m5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 5, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 16, + "name": "ml.m5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 6, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 32, + "name": "ml.m5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 7, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 64, + "name": "ml.m5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 8, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 128, + "name": "ml.m5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 9, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 192, + "name": "ml.m5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 10, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 256, + "name": "ml.m5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 11, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 384, + "name": "ml.m5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 12, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 8, + "name": "ml.m5d.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 13, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 16, + "name": "ml.m5d.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 14, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 32, + "name": "ml.m5d.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 15, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 64, + "name": "ml.m5d.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 16, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 128, + "name": "ml.m5d.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 17, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 192, + "name": "ml.m5d.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 18, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 256, + "name": "ml.m5d.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 19, + "_isFastLaunch": false, + "category": "General purpose", + "gpuNum": 0, + "memoryGiB": 384, + "name": "ml.m5d.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 20, + "_isFastLaunch": true, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 4, + "name": "ml.c5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 21, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 8, + "name": "ml.c5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 22, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 16, + "name": "ml.c5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 23, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 32, + "name": "ml.c5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 24, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 72, + "name": "ml.c5.9xlarge", + "vcpuNum": 36 + }, + { + "_defaultOrder": 25, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 96, + "name": "ml.c5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 26, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 144, + "name": "ml.c5.18xlarge", + "vcpuNum": 72 + }, + { + "_defaultOrder": 27, + "_isFastLaunch": false, + "category": "Compute optimized", + "gpuNum": 0, + "memoryGiB": 192, + "name": "ml.c5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 28, + "_isFastLaunch": true, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 16, + "name": "ml.g4dn.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 29, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 32, + "name": "ml.g4dn.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 30, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 64, + "name": "ml.g4dn.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 31, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 128, + "name": "ml.g4dn.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 32, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "memoryGiB": 192, + "name": "ml.g4dn.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 33, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 256, + "name": "ml.g4dn.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 34, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 61, + "name": "ml.p3.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 35, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "memoryGiB": 244, + "name": "ml.p3.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 36, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "memoryGiB": 488, + "name": "ml.p3.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 37, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "memoryGiB": 768, + "name": "ml.p3dn.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 38, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 16, + "name": "ml.r5.large", + "vcpuNum": 2 + }, + { + "_defaultOrder": 39, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 32, + "name": "ml.r5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 40, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 64, + "name": "ml.r5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 41, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 128, + "name": "ml.r5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 42, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 256, + "name": "ml.r5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 43, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 384, + "name": "ml.r5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 44, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 512, + "name": "ml.r5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 45, + "_isFastLaunch": false, + "category": "Memory Optimized", + "gpuNum": 0, + "memoryGiB": 768, + "name": "ml.r5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 46, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 16, + "name": "ml.g5.xlarge", + "vcpuNum": 4 + }, + { + "_defaultOrder": 47, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 32, + "name": "ml.g5.2xlarge", + "vcpuNum": 8 + }, + { + "_defaultOrder": 48, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 64, + "name": "ml.g5.4xlarge", + "vcpuNum": 16 + }, + { + "_defaultOrder": 49, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 128, + "name": "ml.g5.8xlarge", + "vcpuNum": 32 + }, + { + "_defaultOrder": 50, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 1, + "memoryGiB": 256, + "name": "ml.g5.16xlarge", + "vcpuNum": 64 + }, + { + "_defaultOrder": 51, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "memoryGiB": 192, + "name": "ml.g5.12xlarge", + "vcpuNum": 48 + }, + { + "_defaultOrder": 52, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 4, + "memoryGiB": 384, + "name": "ml.g5.24xlarge", + "vcpuNum": 96 + }, + { + "_defaultOrder": 53, + "_isFastLaunch": false, + "category": "Accelerated computing", + "gpuNum": 8, + "memoryGiB": 768, + "name": "ml.g5.48xlarge", + "vcpuNum": 192 + } + ], "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0" + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:470317259841:image/datascience-1.0" }, "language_info": { "codemirror_mode": {