1 change: 1 addition & 0 deletions README.md
@@ -32,6 +32,7 @@ Before deploying the solution, you need to create an OpenAI resource and deploy

 2. **Deploy a Vision-Capable Model**:
    - Ensure the deployed model supports vision, such as GPT-4T-0125, GPT-4T-0409 or GPT-4-Omni.
+   - Currently, this solution also assumes the deployed model supports setting the `response_format` option to `json_object`.
 
 
 ## Deployment
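For context on the new README line: a model that "supports `response_format`" accepts a JSON-mode request like the sketch below. The snippet is illustrative only, not part of the diff; the environment variable names simply mirror the ones this repo reads in `config.py`.

```python
# Illustrative only (not part of this PR): a model that supports JSON mode
# accepts the response_format setting below; the service rejects it otherwise.
import os
from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_KEY"],
    api_version="2024-07-01-preview",
)
response = client.chat.completions.create(
    model=os.environ["AZURE_OPENAI_MODEL_DEPLOYMENT_NAME"],
    response_format={"type": "json_object"},
    messages=[
        {"role": "system", "content": "Reply with valid JSON."},
        {"role": "user", "content": 'Return {"ok": true}.'},
    ],
)
print(response.choices[0].message.content)
```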
10 changes: 9 additions & 1 deletion infra/main.bicep
Collaborator: Is there a reason that we allow only these four locations for the appservice?

@@ -4,6 +4,14 @@ param functionAppDockerImage string = 'DOCKER|argus.azurecr.io/argus-backend:lat
 // Define the resource group location
 param location string = resourceGroup().location
 
+@allowed([
+  'westeurope'
+  'westus2'
+  'westus3'
+  'eastus2'
+])
+param appserviceLocation string
 
 // Define the storage account name
 param storageAccountName string = 'sa${uniqueString(resourceGroup().id)}'

@@ -190,7 +198,7 @@ resource applicationInsights 'Microsoft.Insights/components@2020-02-02' = {
 // Define the App Service Plan
 resource appServicePlan 'Microsoft.Web/serverfarms@2021-03-01' = {
   name: appServicePlanName
-  location: location
+  location: appserviceLocation
   kind: 'Linux'
   sku: {
     name: 'B1'
527 changes: 43 additions & 484 deletions notebooks/evaluator.ipynb

Large diffs are not rendered by default.

7 changes: 3 additions & 4 deletions notebooks/requirements.txt
@@ -12,14 +12,13 @@ python-dotenv
 azure-ai-documentintelligence
 azure-identity
 PyMuPDF
-langchain
-langchain_core
-langchain_community
-langchain_openai
 tiktoken
 python-multipart
 promptflow-evals
 jsonpath-ng
 thefuzz
 seaborn
+semantic-kernel
+jupyter
+azure-ai-formrecognizer
1 change: 0 additions & 1 deletion src/.funcignore

This file was deleted.

2 changes: 1 addition & 1 deletion src/functionapp/ai_ocr/azure/config.py
@@ -9,7 +9,7 @@ def get_config():
"doc_intelligence_key": os.getenv("DOCUMENT_INTELLIGENCE_KEY", None),
"openai_api_key": os.getenv("AZURE_OPENAI_KEY", None),
"openai_api_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT", None),
"openai_api_version": "2023-12-01-preview",
"openai_api_version": "2024-07-01-preview",
"openai_model_deployment": os.getenv("AZURE_OPENAI_MODEL_DEPLOYMENT_NAME", None),
"temp_images_outdir" : os.getenv("TEMP_IMAGES_OUTDIR", "/tmp/")
}
1 change: 0 additions & 1 deletion src/functionapp/ai_ocr/azure/images.py
@@ -1,6 +1,5 @@
 import fitz  # PyMuPDF
 from PIL import Image
-from pathlib import Path
 import io
 import os
 
36 changes: 16 additions & 20 deletions src/functionapp/ai_ocr/azure/openai_ops.py
@@ -1,25 +1,28 @@
 import base64
 
-from langchain.chains.transform import TransformChain
-from langchain_openai import AzureChatOpenAI
+from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
+from semantic_kernel.connectors.ai import PromptExecutionSettings
 
 from ai_ocr.azure.config import get_config
 
 
-def get_llm():
+def get_completion_service():
     api_key = get_config()['openai_api_key']
     if not api_key:
         raise ValueError("openai_api_key environment variable is not set.")
 
-    return AzureChatOpenAI(
-        model=get_config()["openai_model_deployment"],
-        temperature=0,
-        max_tokens=4000,
-        verbose=True,
-        api_key=api_key,
-        api_version=get_config()["openai_api_version"]
-    )
+    chat_completion_service = AzureChatCompletion(
+        deployment_name=get_config()["openai_model_deployment"],
+        api_key=api_key,
+        endpoint=get_config()["openai_api_endpoint"],
+        api_version=get_config()["openai_api_version"])
+
+    req_settings = PromptExecutionSettings(
+        extension_data = {
+            "max_tokens": 4000,
+            "temperature": 0,
+        }
+    )
+    return chat_completion_service, req_settings
 
 
 def load_image(image_path) -> str:
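A quick usage sketch of the new helper pair (not in the diff; it simply mirrors the `ChatHistory` + `get_chat_message_content` pattern this PR introduces in the chat-completion code further down):

```python
# Hypothetical standalone usage of get_completion_service().
import asyncio
from semantic_kernel.contents import ChatHistory

async def demo() -> None:
    service, settings = get_completion_service()
    history = ChatHistory(system_message="You are a terse assistant.")
    history.add_user_message("Say hello.")
    reply = await service.get_chat_message_content(history, settings)
    print(reply.content)

asyncio.run(demo())
```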
@@ -33,10 +36,3 @@ def get_size_of_base64_images(images):
     for img in images:
         total_size += len(img)
     return total_size
-
-
-load_image_chain = TransformChain(
-    input_variables=["image_path"],
-    output_variables=["image"],
-    transform=load_image
-)

Author: I could not find any place where this chain was referenced or used, so I removed it. If this was necessary for a larger piece of the solution, let me know so that I can be sure to include a Semantic Kernel replacement.
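Should the chain turn out to be needed after all, a Semantic Kernel replacement could be a native function wrapping the existing `load_image` helper. A minimal sketch, assuming the current `semantic_kernel` 1.x Python API; the plugin class and function names are hypothetical:

```python
# Hypothetical stand-in for the removed TransformChain;
# load_image is the existing helper in this module.
from semantic_kernel.functions import kernel_function

class ImagePlugin:
    @kernel_function(name="load_image", description="Load an image file as a base64 string.")
    def load_image_file(self, image_path: str) -> str:
        return load_image(image_path)
```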
Collaborator: We will need to incorporate the newest changes that are utilizing HTML instead of markdown.

@@ -1,15 +1,11 @@
-from langchain_core.messages import HumanMessage
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.messages import SystemMessage
-from langchain_core.prompts import HumanMessagePromptTemplate
-from langchain_core.output_parsers import JsonOutputParser
+from semantic_kernel.contents import ChatHistory, ChatMessageContent, ImageContent
+from ai_ocr.azure.openai_ops import get_completion_service
 
-from ai_ocr.azure.openai_ops import get_llm
-import logging
-import json
-
+import logging, json
 
 
-def get_structured_data(html_content: str, prompt: str, json_schema: str, images=[]) -> any:
+async def get_structured_data(markdown_content: str, prompt: str, json_schema: str, images=[]) -> any:
     system_message = f"""
     Your task is to extract the JSON contents from a document using the provided materials:
     1. Custom instructions for the extraction process
@@ -35,31 +31,30 @@ def get_structured_data(html_content: str, prompt: str, json_schema: str, images
     ```
     """

-    chat_template = ChatPromptTemplate.from_messages(
-        [
-            SystemMessage(content=system_message),
-            HumanMessagePromptTemplate.from_template("Here is the Document content (in html format):\n{html}"),
-        ]
-    )
-
-    messages = chat_template.format_messages(html=html_content)
+    chat_history = ChatHistory(system_message = system_message)
+    chat_history.add_user_message(f"Here is the Document content (in markdown format):\n{markdown_content}")
 
     if images:
-        messages.append(HumanMessage(content="Here are the images from the document:"))
+        chat_history.add_user_message("Here are the images from the document:")
         for img in images:
-            messages.append(HumanMessage(content=[{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}]))
+            chat_history.add_message(
+                ChatMessageContent(
+                    role="user",
+                    items=[ImageContent(uri=f"data:image/png;base64,{img}")]
+                )
+            )
 
-    model = get_llm()
-    return model.invoke(messages)
+    service, req_params = get_completion_service()
+    req_params.extension_data["response_format"] = {"type": "json_object"}
+    return await service.get_chat_message_content(
+        chat_history,
+        req_params
+    )




-def perform_gpt_evaluation_and_enrichment(images: list, extracted_data: dict, json_schema: str) -> dict:
-    model = get_llm()
-
-    parser = JsonOutputParser()
-
+async def perform_gpt_evaluation_and_enrichment(images: list, extracted_data: dict, json_schema: str) -> dict:
     system_message = f"""
     You are an AI assistant tasked with evaluating extracted data from a document.
 
@@ -105,28 +100,32 @@ def perform_gpt_evaluation_and_enrichment(images: list, extracted_data: dict, js
     Here is the JSON schema template that was used for the extraction:
 
     {json_schema}
-
-    ------
-
-    {parser.get_format_instructions()}
     """

-    chat_template = ChatPromptTemplate.from_messages(
-        [
-            SystemMessage(content=system_message),
-            HumanMessagePromptTemplate.from_template("Here is the extracted data :\n{extracted}"),
-        ]
-    )
-    messages = chat_template.format_messages(extracted=json.dumps(extracted_data, indent=2))
+    chat_history = ChatHistory(system_message = system_message)
+    chat_history.add_user_message(f"Here is the extracted data :\n{json.dumps(extracted_data, indent=2)}")
 
     if images:
-        messages.append(HumanMessage(content="Here are the images from the document:"))
+        chat_history.add_user_message("Here are the images from the document:")
         for img in images:
-            messages.append(HumanMessage(content=[{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}]))
+            chat_history.add_message(
+                ChatMessageContent(
+                    role="user",
+                    items=[ImageContent(uri=f"data:image/png;base64,{img}")]
+                )
+            )
 
+    service, req_params = get_completion_service()
+    # Set the response format to JSON object
+    req_params.extension_data["response_format"] = {"type": "json_object"}
Author: This assumes using a model that supports structured outputs; if this is not the desired route, this will require refactoring.
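If JSON mode turns out not to be available on the target model, one refactoring option is a tolerant parser that strips a markdown code fence before `json.loads` (roughly what the removed `parse_json_markdown` did). A minimal sketch, not part of this PR:

```python
# Hypothetical fallback for models without response_format support:
# strip an optional ```json fence, then parse.
import json
import re

def parse_model_json(text: str) -> dict:
    text = text.strip()
    match = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text, re.DOTALL)
    if match:
        text = match.group(1)
    return json.loads(text)
```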

+    evaluation_result = await service.get_chat_message_content(
+        chat_history,
+        req_params
+    )
 
-    evaluation_result = model.invoke(messages)
     try:
-        return parser.parse(evaluation_result.content)
+        return json.loads(evaluation_result.content)
     except Exception as e:
         logging.error(f"Failed to parse GPT evaluation and enrichment result: {e}")
         return {
@@ -135,44 +134,33 @@ def perform_gpt_evaluation_and_enrichment(images: list, extracted_data: dict, js
         }
 
 
-def get_summary_with_gpt(mkd_output_json: str) -> any:
+async def get_summary_with_gpt(mkd_output_json: str) -> any:
     reasoning_prompt = """
     Use the provided data represented in the schema to produce a summary in natural language. The format should be a few sentences summary of the document.
 
     As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
     the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
     """

-    chat_template = ChatPromptTemplate.from_messages(
-        [
-            SystemMessage(
-                content=(
-                    reasoning_prompt
-                )
-            ),
-            HumanMessagePromptTemplate.from_template("{text}"),
-        ]
-    )
-    messages = chat_template.format_messages(text=mkd_output_json)
-
-    model = get_llm()
-    return model.invoke(messages)
+    chat_history = ChatHistory(system_message = reasoning_prompt)
+    chat_history.add_user_message(f"{mkd_output_json}")
+
+    service, req_params = get_completion_service()
+    return await service.get_chat_message_content(
+        chat_history,
+        req_params
+    )



-def classify_doc_with_llm(ocr_input: str, classification_system_prompt) -> any:
+async def classify_doc_with_llm(ocr_input: str, classification_system_prompt) -> any:
     prompt = classification_system_prompt
 
-    chat_template = ChatPromptTemplate.from_messages(
-        [
-            SystemMessage(
-                content=(
-                    prompt
-                )
-            ),
-            HumanMessagePromptTemplate.from_template("{text}"),
-        ]
-    )
-    messages = chat_template.format_messages(text=ocr_input)
+    chat_history = ChatHistory(system_message = prompt)
+    chat_history.add_user_message(f"{ocr_input}")
 
-    model = get_llm()
-    return model.invoke(messages)
+    service, req_params = get_completion_service()
+    return await service.get_chat_message_content(
+        chat_history,
+        req_params
+    )
27 changes: 17 additions & 10 deletions src/functionapp/ai_ocr/process.py
@@ -1,18 +1,24 @@
-import glob, logging, json, os, sys
+import glob
+import logging
+import json
+import os
+import sys
+from typing import Tuple
 from datetime import datetime
 import tempfile
 from azure.identity import DefaultAzureCredential
 from azure.cosmos import CosmosClient, exceptions
 from azure.core.exceptions import ResourceNotFoundError
 from PyPDF2 import PdfReader
-from langchain_core.output_parsers.json import parse_json_markdown
 
 from ai_ocr.azure.doc_intelligence import get_ocr_results
-from ai_ocr.azure.openai_ops import load_image, get_size_of_base64_images
-from ai_ocr.chains import get_structured_data, get_summary_with_gpt, perform_gpt_evaluation_and_enrichment
 from ai_ocr.model import Config
 from ai_ocr.azure.images import convert_pdf_into_image
 
+from ai_ocr.azure.openai_ops import load_image, get_size_of_base64_images
+from ai_ocr.genai_operations import get_structured_data, get_summary_with_gpt, perform_gpt_evaluation_and_enrichment
 
 
 def connect_to_cosmos():
     endpoint = os.environ['COSMOS_DB_ENDPOINT']
     key = os.environ['COSMOS_DB_KEY']
@@ -139,7 +145,7 @@ def fetch_model_prompt_and_schema(dataset_type):
         example_schema = config_item[dataset_type]['example_schema']
     return model_prompt, example_schema
 
-def run_ocr_and_gpt(file_to_ocr: str, prompt: str, json_schema: str, document: dict, container: any, config: Config = Config()) -> (any, dict, dict):
+async def run_ocr_and_gpt(file_to_ocr: str, prompt: str, json_schema: str, document: dict, container: any, config: Config = Config()) -> Tuple[any, dict, dict]:
     processing_times = {}
 
     # Get OCR results
@@ -179,19 +185,20 @@ def run_ocr_and_gpt(file_to_ocr: str, prompt: str, json_schema: str, document: d

     # Get structured data
     gpt_extraction_start_time = datetime.now()
-    structured = get_structured_data(ocr_result, prompt, json_schema, imgs)
+    structured = await get_structured_data(ocr_result.content, prompt, json_schema, imgs)
     gpt_extraction_time = (datetime.now() - gpt_extraction_start_time).total_seconds()
     processing_times['gpt_extraction_time'] = gpt_extraction_time
 
     # Update state after GPT extraction
     update_state(document, container, 'gpt_extraction_completed', True, gpt_extraction_time)
 
     # Parse structured data
-    extracted_data = parse_json_markdown(structured.content)
+    stripped_content = structured.content.strip()
+    extracted_data = json.loads(stripped_content)
 
     # Perform GPT evaluation and enrichment
     evaluation_start_time = datetime.now()
-    enriched_data = perform_gpt_evaluation_and_enrichment(imgs, extracted_data, json_schema)
+    enriched_data = await perform_gpt_evaluation_and_enrichment(imgs, extracted_data, json_schema)
     evaluation_time = (datetime.now() - evaluation_start_time).total_seconds()
     processing_times['gpt_evaluation_time'] = evaluation_time
 
@@ -211,15 +218,15 @@ def run_ocr_and_gpt(file_to_ocr: str, prompt: str, json_schema: str, document: d



-def process_gpt_summary(ocr_response, document, container):
+async def process_gpt_summary(ocr_response, document, container):
     try:
         classification = 'N/A'
         try:
             classification = ocr_response.categorization
         except AttributeError:
             logging.warning("Cannot find 'categorization' in output schema! Logging it as N/A...")
         summary_start_time = datetime.now()
-        gpt_summary = get_summary_with_gpt(ocr_response)
+        gpt_summary = await get_summary_with_gpt(ocr_response)
         summary_processing_time = (datetime.now() - summary_start_time).total_seconds()
         update_state(document, container, 'gpt_summary_completed', True, summary_processing_time)
         document['extracted_data']['classification'] = classification
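Since `run_ocr_and_gpt` and `process_gpt_summary` are now coroutines, any synchronous caller in the function app will need an event loop. A minimal sketch of a call site, assuming the trigger function itself is not already `async def` (in which case a plain `await` suffices); the variable names mirror the function's parameters:

```python
# Hypothetical synchronous call site for the now-async pipeline entry point.
# The three-element result follows the new Tuple[any, dict, dict] annotation.
import asyncio

result_tuple = asyncio.run(
    run_ocr_and_gpt(file_to_ocr, prompt, json_schema, document, container)
)
```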