1 change: 1 addition & 0 deletions README.md
@@ -32,6 +32,7 @@ Before deploying the solution, you need to create an OpenAI resource and deploy

 2. **Deploy a Vision-Capable Model**:
    - Ensure the deployed model supports vision, such as GPT-4T-0125, GPT-4T-0409 or GPT-4-Omni.
+   - Currently, this solution also assumes the deployed model supports setting the `response_format` option to `json_object`.
 
 
 ## Deployment
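For context on the new README line: a model that "supports `response_format`" accepts a JSON-mode request like the sketch below. The snippet is illustrative only, not part of the diff; the environment variable names simply mirror the ones this repo reads in `config.py`.

```python
# Illustrative only (not part of this PR): a model that supports JSON mode
# accepts the response_format setting below; the service rejects it otherwise.
import os
from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_KEY"],
    api_version="2024-07-01-preview",
)
response = client.chat.completions.create(
    model=os.environ["AZURE_OPENAI_MODEL_DEPLOYMENT_NAME"],
    response_format={"type": "json_object"},
    messages=[
        {"role": "system", "content": "Reply with valid JSON."},
        {"role": "user", "content": 'Return {"ok": true}.'},
    ],
)
print(response.choices[0].message.content)
```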
10 changes: 9 additions & 1 deletion infra/main.bicep
Collaborator: Is there a reason that we allow only these four locations for the appservice?

@@ -4,6 +4,14 @@ param functionAppDockerImage string = 'DOCKER|argus.azurecr.io/argus-backend:lat
 // Define the resource group location
 param location string = resourceGroup().location
 
+@allowed([
+  'westeurope'
+  'westus2'
+  'westus3'
+  'eastus2'
+])
+param appserviceLocation string
 
 // Define the storage account name
 param storageAccountName string = 'sa${uniqueString(resourceGroup().id)}'

@@ -190,7 +198,7 @@ resource applicationInsights 'Microsoft.Insights/components@2020-02-02' = {
 // Define the App Service Plan
 resource appServicePlan 'Microsoft.Web/serverfarms@2021-03-01' = {
   name: appServicePlanName
-  location: location
+  location: appserviceLocation
   kind: 'Linux'
   sku: {
     name: 'B1'
527 changes: 43 additions & 484 deletions notebooks/evaluator.ipynb

Large diffs are not rendered by default.

7 changes: 3 additions & 4 deletions notebooks/requirements.txt
@@ -12,14 +12,13 @@ python-dotenv
 azure-ai-documentintelligence
 azure-identity
 PyMuPDF
-langchain
-langchain_core
-langchain_community
-langchain_openai
 tiktoken
 python-multipart
 promptflow-evals
 jsonpath-ng
 thefuzz
 seaborn
+semantic-kernel
+jupyter
+azure-ai-formrecognizer
1 change: 0 additions & 1 deletion src/.funcignore

This file was deleted.

2 changes: 1 addition & 1 deletion src/functionapp/ai_ocr/azure/config.py
@@ -9,7 +9,7 @@ def get_config():
"doc_intelligence_key": os.getenv("DOCUMENT_INTELLIGENCE_KEY", None),
"openai_api_key": os.getenv("AZURE_OPENAI_KEY", None),
"openai_api_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT", None),
"openai_api_version": "2023-12-01-preview",
"openai_api_version": "2024-07-01-preview",
"openai_model_deployment": os.getenv("AZURE_OPENAI_MODEL_DEPLOYMENT_NAME", None),
"temp_images_outdir" : os.getenv("TEMP_IMAGES_OUTDIR", "/tmp/")
}
1 change: 0 additions & 1 deletion src/functionapp/ai_ocr/azure/images.py
@@ -1,6 +1,5 @@
 import fitz  # PyMuPDF
 from PIL import Image
-from pathlib import Path
 import io
 import os
 
36 changes: 16 additions & 20 deletions src/functionapp/ai_ocr/azure/openai_ops.py
@@ -1,25 +1,28 @@
 import base64
 
-from langchain.chains.transform import TransformChain
-from langchain_openai import AzureChatOpenAI
+from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
+from semantic_kernel.connectors.ai import PromptExecutionSettings
 
 from ai_ocr.azure.config import get_config
 
 
-def get_llm():
+def get_completion_service():
     api_key = get_config()['openai_api_key']
     if not api_key:
         raise ValueError("openai_api_key environment variable is not set.")
 
-    return AzureChatOpenAI(
-        model=get_config()["openai_model_deployment"],
-        temperature=0,
-        max_tokens=4000,
-        verbose=True,
-        api_key=api_key,
-        api_version=get_config()["openai_api_version"]
-    )
+    chat_completion_service = AzureChatCompletion(
+        deployment_name=get_config()["openai_model_deployment"],
+        api_key=api_key,
+        endpoint=get_config()["openai_api_endpoint"],
+        api_version=get_config()["openai_api_version"])
+
+    req_settings = PromptExecutionSettings(
+        extension_data = {
+            "max_tokens": 4000,
+            "temperature": 0,
+        }
+    )
+    return chat_completion_service, req_settings
 
 
 def load_image(image_path) -> str:
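A quick usage sketch of the new helper pair (not in the diff; it simply mirrors the `ChatHistory` + `get_chat_message_content` pattern this PR introduces in the chat-completion code further down):

```python
# Hypothetical standalone usage of get_completion_service().
import asyncio
from semantic_kernel.contents import ChatHistory

async def demo() -> None:
    service, settings = get_completion_service()
    history = ChatHistory(system_message="You are a terse assistant.")
    history.add_user_message("Say hello.")
    reply = await service.get_chat_message_content(history, settings)
    print(reply.content)

asyncio.run(demo())
```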
@@ -33,10 +36,3 @@ def get_size_of_base64_images(images):
     for img in images:
         total_size += len(img)
     return total_size
-
-
-load_image_chain = TransformChain(
-    input_variables=["image_path"],
-    output_variables=["image"],
-    transform=load_image
-)

Author: I could not find any place where this chain was referenced or used, so I removed it. If this was necessary for a larger piece of the solution, let me know so that I can be sure to include a Semantic Kernel replacement.
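Should the chain turn out to be needed after all, a Semantic Kernel replacement could be a native function wrapping the existing `load_image` helper. A minimal sketch, assuming the current `semantic_kernel` 1.x Python API; the plugin class and function names are hypothetical:

```python
# Hypothetical stand-in for the removed TransformChain;
# load_image is the existing helper in this module.
from semantic_kernel.functions import kernel_function

class ImagePlugin:
    @kernel_function(name="load_image", description="Load an image file as a base64 string.")
    def load_image_file(self, image_path: str) -> str:
        return load_image(image_path)
```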
Collaborator: We will need to incorporate the newest changes that are utilizing HTML instead of markdown.

@@ -1,15 +1,11 @@
-from langchain_core.messages import HumanMessage
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.messages import SystemMessage
-from langchain_core.prompts import HumanMessagePromptTemplate
-from langchain_core.output_parsers import JsonOutputParser
+from semantic_kernel.contents import ChatHistory, ChatMessageContent, ImageContent
+from ai_ocr.azure.openai_ops import get_completion_service
 
-from ai_ocr.azure.openai_ops import get_llm
-import logging
-import json
-
+import logging, json
 
 
-def get_structured_data(html_content: str, prompt: str, json_schema: str, images=[]) -> any:
+async def get_structured_data(markdown_content: str, prompt: str, json_schema: str, images=[]) -> any:
     system_message = f"""
     Your task is to extract the JSON contents from a document using the provided materials:
     1. Custom instructions for the extraction process
@@ -35,31 +31,30 @@ def get_structured_data(html_content: str, prompt: str, json_schema: str, images
     ```
     """

-    chat_template = ChatPromptTemplate.from_messages(
-        [
-            SystemMessage(content=system_message),
-            HumanMessagePromptTemplate.from_template("Here is the Document content (in html format):\n{html}"),
-        ]
-    )
-
-    messages = chat_template.format_messages(html=html_content)
+    chat_history = ChatHistory(system_message = system_message)
+    chat_history.add_user_message(f"Here is the Document content (in markdown format):\n{markdown_content}")
 
     if images:
-        messages.append(HumanMessage(content="Here are the images from the document:"))
+        chat_history.add_user_message("Here are the images from the document:")
         for img in images:
-            messages.append(HumanMessage(content=[{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}]))
+            chat_history.add_message(
+                ChatMessageContent(
+                    role="user",
+                    items=[ImageContent(uri=f"data:image/png;base64,{img}")]
+                )
+            )
 
-    model = get_llm()
-    return model.invoke(messages)
+    service, req_params = get_completion_service()
+    req_params.extension_data["response_format"] = {"type": "json_object"}
+    return await service.get_chat_message_content(
+        chat_history,
+        req_params
+    )




-def perform_gpt_evaluation_and_enrichment(images: list, extracted_data: dict, json_schema: str) -> dict:
-    model = get_llm()
-
-    parser = JsonOutputParser()
-
+async def perform_gpt_evaluation_and_enrichment(images: list, extracted_data: dict, json_schema: str) -> dict:
     system_message = f"""
     You are an AI assistant tasked with evaluating extracted data from a document.
 
@@ -105,28 +100,32 @@ def perform_gpt_evaluation_and_enrichment(images: list, extracted_data: dict, js
     Here is the JSON schema template that was used for the extraction:
 
     {json_schema}
-
-    ------
-
-    {parser.get_format_instructions()}
     """

-    chat_template = ChatPromptTemplate.from_messages(
-        [
-            SystemMessage(content=system_message),
-            HumanMessagePromptTemplate.from_template("Here is the extracted data :\n{extracted}"),
-        ]
-    )
-    messages = chat_template.format_messages(extracted=json.dumps(extracted_data, indent=2))
+    chat_history = ChatHistory(system_message = system_message)
+    chat_history.add_user_message(f"Here is the extracted data :\n{json.dumps(extracted_data, indent=2)}")
 
     if images:
-        messages.append(HumanMessage(content="Here are the images from the document:"))
+        chat_history.add_user_message("Here are the images from the document:")
         for img in images:
-            messages.append(HumanMessage(content=[{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}]))
+            chat_history.add_message(
+                ChatMessageContent(
+                    role="user",
+                    items=[ImageContent(uri=f"data:image/png;base64,{img}")]
+                )
+            )
 
+    service, req_params = get_completion_service()
+    # Set the response format to JSON object
+    req_params.extension_data["response_format"] = {"type": "json_object"}
Author: This assumes using a model that supports structured outputs; if this is not the desired route, this will require refactoring.
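If JSON mode turns out not to be available on the target model, one refactoring option is a tolerant parser that strips a markdown code fence before `json.loads` (roughly what the removed `parse_json_markdown` did). A minimal sketch, not part of this PR:

```python
# Hypothetical fallback for models without response_format support:
# strip an optional ```json fence, then parse.
import json
import re

def parse_model_json(text: str) -> dict:
    text = text.strip()
    match = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text, re.DOTALL)
    if match:
        text = match.group(1)
    return json.loads(text)
```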

+    evaluation_result = await service.get_chat_message_content(
+        chat_history,
+        req_params
+    )
 
-    evaluation_result = model.invoke(messages)
     try:
-        return parser.parse(evaluation_result.content)
+        return json.loads(evaluation_result.content)
     except Exception as e:
         logging.error(f"Failed to parse GPT evaluation and enrichment result: {e}")
         return {
@@ -135,44 +134,33 @@ def perform_gpt_evaluation_and_enrichment(images: list, extracted_data: dict, js
         }
 
 
-def get_summary_with_gpt(mkd_output_json: str) -> any:
+async def get_summary_with_gpt(mkd_output_json: str) -> any:
     reasoning_prompt = """
     Use the provided data represented in the schema to produce a summary in natural language. The format should be a few sentences summary of the document.
 
     As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
     the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.
     """

-    chat_template = ChatPromptTemplate.from_messages(
-        [
-            SystemMessage(
-                content=(
-                    reasoning_prompt
-                )
-            ),
-            HumanMessagePromptTemplate.from_template("{text}"),
-        ]
-    )
-    messages = chat_template.format_messages(text=mkd_output_json)
-
-    model = get_llm()
-    return model.invoke(messages)
+    chat_history = ChatHistory(system_message = reasoning_prompt)
+    chat_history.add_user_message(f"{mkd_output_json}")
+
+    service, req_params = get_completion_service()
+    return await service.get_chat_message_content(
+        chat_history,
+        req_params
+    )



-def classify_doc_with_llm(ocr_input: str, classification_system_prompt) -> any:
+async def classify_doc_with_llm(ocr_input: str, classification_system_prompt) -> any:
     prompt = classification_system_prompt
 
-    chat_template = ChatPromptTemplate.from_messages(
-        [
-            SystemMessage(
-                content=(
-                    prompt
-                )
-            ),
-            HumanMessagePromptTemplate.from_template("{text}"),
-        ]
-    )
-    messages = chat_template.format_messages(text=ocr_input)
+    chat_history = ChatHistory(system_message = prompt)
+    chat_history.add_user_message(f"{ocr_input}")
 
-    model = get_llm()
-    return model.invoke(messages)
+    service, req_params = get_completion_service()
+    return await service.get_chat_message_content(
+        chat_history,
+        req_params
+    )
27 changes: 17 additions & 10 deletions src/functionapp/ai_ocr/process.py
@@ -1,18 +1,24 @@
-import glob, logging, json, os, sys
+import glob
+import logging
+import json
+import os
+import sys
+from typing import Tuple
 from datetime import datetime
 import tempfile
 from azure.identity import DefaultAzureCredential
 from azure.cosmos import CosmosClient, exceptions
 from azure.core.exceptions import ResourceNotFoundError
 from PyPDF2 import PdfReader
-from langchain_core.output_parsers.json import parse_json_markdown
 
 from ai_ocr.azure.doc_intelligence import get_ocr_results
-from ai_ocr.azure.openai_ops import load_image, get_size_of_base64_images
-from ai_ocr.chains import get_structured_data, get_summary_with_gpt, perform_gpt_evaluation_and_enrichment
 from ai_ocr.model import Config
 from ai_ocr.azure.images import convert_pdf_into_image
 
+from ai_ocr.azure.openai_ops import load_image, get_size_of_base64_images
+from ai_ocr.genai_operations import get_structured_data, get_summary_with_gpt, perform_gpt_evaluation_and_enrichment
 
 
 def connect_to_cosmos():
     endpoint = os.environ['COSMOS_DB_ENDPOINT']
     key = os.environ['COSMOS_DB_KEY']
@@ -139,7 +145,7 @@ def fetch_model_prompt_and_schema(dataset_type):
         example_schema = config_item[dataset_type]['example_schema']
     return model_prompt, example_schema
 
-def run_ocr_and_gpt(file_to_ocr: str, prompt: str, json_schema: str, document: dict, container: any, config: Config = Config()) -> (any, dict, dict):
+async def run_ocr_and_gpt(file_to_ocr: str, prompt: str, json_schema: str, document: dict, container: any, config: Config = Config()) -> Tuple[any, dict, dict]:
     processing_times = {}
 
     # Get OCR results
@@ -179,19 +185,20 @@ def run_ocr_and_gpt(file_to_ocr: str, prompt: str, json_schema: str, document: d

     # Get structured data
     gpt_extraction_start_time = datetime.now()
-    structured = get_structured_data(ocr_result, prompt, json_schema, imgs)
+    structured = await get_structured_data(ocr_result.content, prompt, json_schema, imgs)
     gpt_extraction_time = (datetime.now() - gpt_extraction_start_time).total_seconds()
     processing_times['gpt_extraction_time'] = gpt_extraction_time
 
     # Update state after GPT extraction
     update_state(document, container, 'gpt_extraction_completed', True, gpt_extraction_time)
 
     # Parse structured data
-    extracted_data = parse_json_markdown(structured.content)
+    stripped_content = structured.content.strip()
+    extracted_data = json.loads(stripped_content)
 
     # Perform GPT evaluation and enrichment
     evaluation_start_time = datetime.now()
-    enriched_data = perform_gpt_evaluation_and_enrichment(imgs, extracted_data, json_schema)
+    enriched_data = await perform_gpt_evaluation_and_enrichment(imgs, extracted_data, json_schema)
     evaluation_time = (datetime.now() - evaluation_start_time).total_seconds()
     processing_times['gpt_evaluation_time'] = evaluation_time
 
@@ -211,15 +218,15 @@ def run_ocr_and_gpt(file_to_ocr: str, prompt: str, json_schema: str, document: d



-def process_gpt_summary(ocr_response, document, container):
+async def process_gpt_summary(ocr_response, document, container):
     try:
         classification = 'N/A'
         try:
             classification = ocr_response.categorization
         except AttributeError:
             logging.warning("Cannot find 'categorization' in output schema! Logging it as N/A...")
         summary_start_time = datetime.now()
-        gpt_summary = get_summary_with_gpt(ocr_response)
+        gpt_summary = await get_summary_with_gpt(ocr_response)
         summary_processing_time = (datetime.now() - summary_start_time).total_seconds()
         update_state(document, container, 'gpt_summary_completed', True, summary_processing_time)
         document['extracted_data']['classification'] = classification
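Since `run_ocr_and_gpt` and `process_gpt_summary` are now coroutines, any synchronous caller in the function app will need an event loop. A minimal sketch of a call site, assuming the trigger function itself is not already `async def` (in which case a plain `await` suffices); the variable names mirror the function's parameters:

```python
# Hypothetical synchronous call site for the now-async pipeline entry point.
# The three-element result follows the new Tuple[any, dict, dict] annotation.
import asyncio

result_tuple = asyncio.run(
    run_ocr_and_gpt(file_to_ocr, prompt, json_schema, document, container)
)
```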