# embedder.py -- from the Optomatica/FRA-Backend repository (main branch).

import os
import base64
from typing import List, Dict, Any, Tuple
from dotenv import load_dotenv
from pinecone import Pinecone
from langchain_community.document_loaders import (
    UnstructuredWordDocumentLoader,
    UnstructuredExcelLoader,
    PyPDFLoader,
    TextLoader,
    CSVLoader
)
from mistralai.client import MistralClient
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_mistralai import MistralAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.schema import Document
import fitz  # PyMuPDF for PDF image extraction
from PIL import Image
import io
from openai import OpenAI
import pandas as pd

# Populate the process environment (python-dotenv) before the keys below
# are read.
load_dotenv()

# API credentials read from the environment; each is None when unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Name of the Pinecone index every document is stored in.
INDEX_NAME = "opto-fra"

# Module-wide Mistral client, used to describe extracted images.
mistral_client = MistralClient(api_key=MISTRAL_API_KEY)

def initialize_pinecone() -> Pinecone:
    """Return a Pinecone client, creating the ``INDEX_NAME`` index if absent.

    When ``INDEX_NAME`` is not among the client's existing indexes, a
    1024-dimension, cosine-metric serverless index is created on AWS
    ``us-east-1``.

    Returns:
        The ``Pinecone`` client built from ``PINECONE_API_KEY``.
    """
    # ServerlessSpec is not imported at module level; without this import the
    # index-creation branch below failed with NameError.
    from pinecone import ServerlessSpec

    pc = Pinecone(api_key=PINECONE_API_KEY)
    if INDEX_NAME not in pc.list_indexes().names():
        print("Index not found, creating new index")
        pc.create_index(
            name=INDEX_NAME,
            dimension=1024,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
    else:
        print("Index already exists")
    return pc

def get_file_extension(file_path: str) -> str:
    """Return the lower-cased extension of *file_path*, including the dot.

    An empty string is returned when the path has no extension.
    """
    _, extension = os.path.splitext(file_path)
    return extension.lower()

def encode_image_to_base64(image_bytes: bytes) -> str:
    """Return the base64 encoding of *image_bytes* as a text string."""
    encoded = base64.b64encode(image_bytes)
    return str(encoded, 'utf-8')

def _guess_image_mime_type(image_bytes: bytes) -> str:
    """Guess an image MIME type from its leading magic bytes (JPEG fallback)."""
    if image_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if image_bytes.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if image_bytes[:4] == b"RIFF" and image_bytes[8:12] == b"WEBP":
        return "image/webp"
    # Unknown signatures keep the JPEG label that was previously hard-coded.
    return "image/jpeg"


def describe_image_with_mistral(image_bytes: bytes) -> str:
    """Use Mistral's vision model to describe an image.

    The image is sent inline as a base64 data URL, together with the text
    instruction, in a single multimodal user message. The model name is read
    from the ``MISTRAL_MODEL`` environment variable.
    NOTE(review): that variable presumably must name a vision-capable model --
    confirm in deployment configuration.

    Args:
        image_bytes: Raw encoded image data (e.g. PNG or JPEG bytes).

    Returns:
        The model's description, or ``"Image description unavailable"`` when
        anything in the request fails (best-effort; the error is printed).
    """
    try:
        # Label the data URL with the detected format: images extracted from
        # PDFs are PNG, so a fixed image/jpeg label would be wrong for them.
        mime_type = _guess_image_mime_type(image_bytes)
        data_url = f"data:{mime_type};base64,{encode_image_to_base64(image_bytes)}"
        response = mistral_client.chat(
            model=os.getenv("MISTRAL_MODEL"),
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {
                    "role": "user",
                    # Multimodal content must be a list of typed chunks; a
                    # bare dict as "content" is not a valid message body.
                    "content": [
                        {"type": "text", "text": "Describe the following image in detail."},
                        {"type": "image_url", "image_url": data_url},
                    ],
                },
            ],
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error describing image: {e}")
        return "Image description unavailable"

def extract_images_from_pdf(file_path: str) -> List[Tuple[bytes, int, Dict]]:
    """Extract images from a PDF and return them as PNG bytes with page info.

    Args:
        file_path: Path of the PDF file to scan.

    Returns:
        A list of ``(png_bytes, page_number, metadata)`` tuples, where
        ``page_number`` is 1-indexed and ``metadata`` carries the keys
        ``image_index``, ``source``, ``page`` and ``type``. On error the
        problem is printed and whatever was collected so far is returned
        (an empty list if the file could not be opened).
    """
    images = []
    try:
        # The context manager closes the document even if extraction fails
        # part-way through; previously close() was skipped on any exception.
        with fitz.open(file_path) as pdf_document:
            for page_index, page in enumerate(pdf_document):
                page_number = page_index + 1  # 1-indexed page number

                for img_index, img in enumerate(page.get_images()):
                    xref = img[0]
                    pix = fitz.Pixmap(pdf_document, xref)

                    # CMYK pixmaps (4+ colour components) are converted to
                    # RGB before PNG export instead of being silently dropped.
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    images.append((
                        pix.tobytes("png"),
                        page_number,
                        {
                            "image_index": img_index,
                            "source": file_path,
                            "page": page_number,
                            "type": "image",
                        },
                    ))
                    pix = None  # Drop the pixmap reference promptly
    except Exception as e:
        print(f"Error extracting images from PDF: {e}")

    return images

def extract_images_from_docx(file_path: str) -> List[Tuple[bytes, str, Dict]]:
    """Extract embedded images from a DOCX file.

    A DOCX file is a zip archive; its embedded media live under
    ``word/media/``. Only the standard-library ``zipfile`` module is needed,
    so no python-docx import is performed here (the previous unused import
    made extraction return nothing when that package was missing).

    Args:
        file_path: Path of the DOCX file to scan.

    Returns:
        A list of ``(image_bytes, "image_<idx>", metadata)`` tuples in archive
        order, where ``metadata`` carries the keys ``image_index``,
        ``source``, ``image_file`` and ``type``. On error the problem is
        printed and the images collected so far are returned.
    """
    import zipfile

    images = []
    try:
        with zipfile.ZipFile(file_path, 'r') as docx_zip:
            # Keep real media members only; a directory entry such as
            # "word/media/" has no image data.
            image_files = [
                name for name in docx_zip.namelist()
                if name.startswith('word/media/') and not name.endswith('/')
            ]

            for idx, img_file in enumerate(image_files):
                images.append((
                    docx_zip.read(img_file),
                    f"image_{idx}",
                    {
                        "image_index": idx,
                        "source": file_path,
                        "image_file": img_file,
                        "type": "image",
                    },
                ))
    except Exception as e:
        print(f"Error extracting images from DOCX: {e}")

    return images

def load_document_with_images(file_path: str) -> Tuple[List[Document], List[Tuple[bytes, Any, Dict]]]:
    """Load a file's text as documents and pull out its embedded images.

    The loader is chosen from the file extension. Images are extracted only
    for ``.pdf`` and ``.docx``; ``.xlsx``, ``.txt`` and ``.csv`` yield an
    empty image list.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    extension = get_file_extension(file_path)

    # Pick the text loader and, where available, the matching image extractor.
    image_extractor = None
    if extension == ".pdf":
        loader_cls = PyPDFLoader
        image_extractor = extract_images_from_pdf
    elif extension == ".docx":
        loader_cls = UnstructuredWordDocumentLoader
        image_extractor = extract_images_from_docx
    elif extension == ".xlsx":
        # Charts/images inside Excel files are not extracted; text only.
        loader_cls = UnstructuredExcelLoader
    elif extension == ".txt":
        loader_cls = TextLoader
    elif extension == ".csv":
        loader_cls = CSVLoader
    else:
        raise ValueError(f"Unsupported file format: {extension}")

    # Text is loaded first, then images, for every format.
    documents = loader_cls(file_path).load()
    images = image_extractor(file_path) if image_extractor is not None else []

    return documents, images

def create_image_documents(images: List[Tuple[bytes, Any, Dict]]) -> List[Document]:
    """Turn extracted images into ``Document`` objects holding descriptions.

    Each ``(image_bytes, location, metadata)`` entry is described via
    ``describe_image_with_mistral``. The resulting document keeps the
    original metadata and adds ``content_type`` and ``location`` keys.
    An entry that fails is reported with a printed message and skipped.
    """
    described = []

    for image_bytes, where, base_metadata in images:
        try:
            caption = describe_image_with_mistral(image_bytes)

            # Copy first so the caller's metadata dict is left untouched.
            doc_metadata = dict(base_metadata)
            doc_metadata["content_type"] = "image_description"
            doc_metadata["location"] = str(where)

            described.append(
                Document(
                    page_content=f"[IMAGE DESCRIPTION] {caption}",
                    metadata=doc_metadata,
                )
            )
            print(f"Processed image at location {where}")
        except Exception as e:
            print(f"Failed to process image at location {where}: {e}")

    return described

def process_document_with_images(file_path: str) -> List[Document]:
    """Return every chunk for *file_path*: split text plus image descriptions.

    Text is split into chunks of 1500 characters with a 250-character
    overlap; the image description documents are appended after them.
    """
    raw_documents, extracted_images = load_document_with_images(file_path)

    # Text first: chunk the loaded documents.
    text_chunks = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=250,
    ).split_documents(raw_documents)

    # Then one description document per extracted image.
    image_documents = create_image_documents(extracted_images)

    combined = text_chunks + image_documents

    print(f"Created {len(text_chunks)} text chunks and {len(image_documents)} image descriptions")
    return combined

def store_embeddings(index_name: str, chunks: List[Document], embeddings):
    """Embed *chunks* and store them in the Pinecone index ``INDEX_NAME``.

    Note that *index_name* is passed as the Pinecone **namespace**; the index
    itself is always the module-level ``INDEX_NAME``.

    Args:
        index_name: Namespace under which the vectors are stored.
        chunks: Text and image-description documents to embed.
        embeddings: Embedding model handed to ``PineconeVectorStore``.
    """
    texts = []
    metadatas = []
    for chunk in chunks:
        texts.append(chunk.page_content)
        metadatas.append(chunk.metadata)

    PineconeVectorStore.from_texts(
        texts=texts,
        embedding=embeddings,
        metadatas=metadatas,
        index_name=INDEX_NAME,
        namespace=index_name,
    )

    stored_count = len(texts)
    print(f"Stored {stored_count} chunks (text + image descriptions) into index: {index_name}")

def process_and_embed_document(file_path: str, company_name: str) -> str:
    """Run the full pipeline: ensure the index, chunk the file, embed, store.

    Args:
        file_path: Path of the document to process.
        company_name: Passed to ``store_embeddings`` as its first argument,
            where it is used as the Pinecone namespace.

    Returns:
        A success message naming the processed file.

    Raises:
        Exception: Any failure is printed and then re-raised to the caller.
    """
    try:
        # Make sure the target index exists before doing any work.
        initialize_pinecone()

        print(f"Processing document: {file_path}")
        document_chunks = process_document_with_images(file_path)

        embedding_model = MistralAIEmbeddings(
            model="mistral-embed",
            mistral_api_key=MISTRAL_API_KEY,
        )

        print("Finished creating embeddings model")
        print(f"Total chunks to store: {len(document_chunks)}")

        store_embeddings(company_name, document_chunks, embedding_model)
    except Exception as e:
        print(f"Error processing document {file_path}: {e}")
        raise e
    else:
        return f"Successfully processed and embedded document: {file_path}"