# Source: DocScanAI/ocr_processor.py (Parisinghchauhan/DocScanAI, main branch)
import os
import re
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import numpy as np
from ai_processor import AIProcessor

class OCRProcessor:
    def __init__(self):
        # Check if Tesseract is available
        try:
            pytesseract.get_tesseract_version()
        except Exception as e:
            print(f"Tesseract not properly configured: {e}")
            # In a production system, we might raise an exception here

        # Initialize AI processor if available
        try:
            self.ai_processor = AIProcessor()
            self.use_ai = True
            print("AI processing enabled for enhanced OCR accuracy")
        except Exception as e:
            print(f"AI processing not available: {e}")
            self.use_ai = False

    def process_file(self, file_path):
        """
        Process an uploaded file (image or PDF) using OCR

        Args:
            file_path (str): Path to the uploaded file

        Returns:
            str: Extracted text from the file
        """
        file_ext = os.path.splitext(file_path)[1].lower()

        try:
            if file_ext in ['.pdf']:
                return self._process_pdf(file_path)
            elif file_ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif']:
                return self._process_image(file_path)
            else:
                raise ValueError(f"Unsupported file format: {file_ext}")
        except Exception as e:
            print(f"Error processing file: {e}")
            return ""

    def _process_image(self, image_path):
        """
        Process a single image using OCR

        Args:
            image_path (str): Path to the image file

        Returns:
            str: Extracted text from the image
        """
        try:
            # Open image
            image = Image.open(image_path)

            # Convert to grayscale for better OCR results
            if image.mode != 'L':
                image = image.convert('L')

            # Apply some pre-processing for better OCR results
            # This is a simple approach; more sophisticated methods could be used
            # image = self._preprocess_image(image)

            # Perform OCR
            extracted_text = pytesseract.image_to_string(image, lang='eng')

            return extracted_text
        except Exception as e:
            print(f"Error processing image: {e}")
            return ""

    def _process_pdf(self, pdf_path):
        """
        Process a PDF file using OCR

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            str: Extracted text from the PDF
        """
        try:
            # Convert PDF to images
            images = convert_from_path(pdf_path)

            # Process each image
            all_text = []
            for image in images:
                # Convert to grayscale
                if image.mode != 'L':
                    image = image.convert('L')

                # Apply pre-processing
                # image = self._preprocess_image(image)

                # Perform OCR
                text = pytesseract.image_to_string(image, lang='eng')
                all_text.append(text)

            # Combine text from all pages
            return "\n\n".join(all_text)
        except Exception as e:
            print(f"Error processing PDF: {e}")
            return ""

    def _preprocess_image(self, image):
        """
        Apply pre-processing to improve OCR results

        Args:
            image (PIL.Image): Image to preprocess

        Returns:
            PIL.Image: Preprocessed image
        """
        # This is a placeholder for more sophisticated preprocessing
        # In a production system, we would use more advanced techniques
        return image

    def extract_items(self, text):
        """
        Extract structured item data from OCR text

        Args:
            text (str): Raw OCR text

        Returns:
            list: List of dictionaries containing item details
        """
        # First, enhance the OCR text with AI if available
        if self.use_ai:
            try:
                # Enhance the raw OCR text
                enhanced_text = self.ai_processor.enhance_ocr_text(text)
                print("OCR text enhanced with AI")

                # Try AI-based extraction first
                ai_items = self.ai_processor.extract_structured_data(enhanced_text)
                if ai_items and len(ai_items) > 0:
                    print(f"AI successfully extracted {len(ai_items)} items")
                    return ai_items

                # If AI extraction fails, fall back to traditional methods but use the enhanced text
                text = enhanced_text
            except Exception as e:
                print(f"AI-based extraction failed: {e}")
                # Continue with traditional methods

        items = []

        # Clean the text
        text = self._clean_text(text)

        # Split into lines
        lines = text.split('\n')

        # Different invoice formats will require different parsing strategies
        # This is a simple approach that looks for lines with quantity, price, and total
        for i, line in enumerate(lines):
            # Skip short lines
            if len(line.strip()) < 5:
                continue

            # Try to extract item details using regex
            item_data = self._extract_item_from_line(line)

            if item_data:
                items.append(item_data)
                continue

            # Try multi-line approach if single line fails
            if i < len(lines) - 1:
                combined_line = line + " " + lines[i + 1]
                item_data = self._extract_item_from_line(combined_line)

                if item_data:
                    items.append(item_data)

        # If no items found, try a different approach
        if not items:
            items = self._extract_items_table_format(text)

        return items

    def _clean_text(self, text):
        """
        Clean and normalize OCR text

        Args:
            text (str): Raw OCR text

        Returns:
            str: Cleaned text
        """
        # Replace common OCR errors
        text = text.replace('|', '1')
        text = text.replace('l', '1')
        text = text.replace('O', '0')
        text = text.replace('o', '0')

        # Replace multiple spaces with a single space
        text = re.sub(r'\s+', ' ', text)

        # Replace common currency symbols
        text = text.replace('₹', '')
        text = text.replace('$', '')
        text = text.replace('€', '')

        # Remove other special characters
        text = re.sub(r'[^\w\s\.\-\,]', '', text)

        return text

    def _extract_item_from_line(self, line):
        """
        Extract item details from a single line

        Args:
            line (str): Line of text

        Returns:
            dict: Item details or None if no match
        """
        # This regex pattern tries to match common invoice line item formats
        # It's simplified and would need to be adapted to specific invoice formats
        patterns = [
            # Pattern 1: Item name followed by quantity, unit price, and total
            r'([\w\s\-]+)\s+(\d+(?:\.\d+)?)\s+(?:x\s+)?(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)',

            # Pattern 2: Item name followed by quantity and total (no unit price)
            r'([\w\s\-]+)\s+(\d+(?:\.\d+)?)\s+(?:pcs|units|nos)?\s+(\d+(?:\.\d+)?)',

            # Pattern 3: Just item name and price
            r'([\w\s\-]+)\s+(\d+(?:\.\d+)?)'
        ]

        for pattern in patterns:
            match = re.search(pattern, line)
            if match:
                groups = match.groups()

                if len(groups) == 4:  # Pattern 1
                    item_name = groups[0].strip()
                    qty = float(groups[1])
                    unit_price = float(groups[2])
                    total = float(groups[3])

                    # Verify that qty * unit_price is approximately equal to total
                    calculated_total = qty * unit_price
                    if abs(calculated_total - total) > 1:  # Allow for small rounding differences
                        # If totals don't match, this might not be a valid item line
                        continue

                    return {
                        "item": item_name,
                        "qty": qty,
                        "unit_price": unit_price,
                        "total": total
                    }

                elif len(groups) == 3:  # Pattern 2
                    item_name = groups[0].strip()
                    qty = float(groups[1])
                    total = float(groups[2])
                    unit_price = total / qty if qty else 0

                    return {
                        "item": item_name,
                        "qty": qty,
                        "unit_price": unit_price,
                        "total": total
                    }

                elif len(groups) == 2:  # Pattern 3
                    item_name = groups[0].strip()
                    total = float(groups[1])

                    return {
                        "item": item_name,
                        "qty": 1,  # Assume quantity of 1
                        "unit_price": total,
                        "total": total
                    }

        return None

    def _extract_items_table_format(self, text):
        """
        Extract items assuming a table format in the invoice

        Args:
            text (str): OCR text

        Returns:
            list: List of item dictionaries
        """
        items = []

        # Split into lines
        lines = text.split('\n')

        # Look for lines that might represent table rows
        for line in lines:
            # Skip short lines
            if len(line.strip()) < 10:
                continue

            # Split line by multiple spaces
            parts = re.split(r'\s{2,}', line.strip())

            if len(parts) >= 3:
                # Try to identify which parts correspond to item, qty, price, total
                item_name = parts[0]

                # Look for numbers in the other parts
                numbers = []
                for part in parts[1:]:
                    try:
                        num = float(re.sub(r'[^\d\.]', '', part))
                        numbers.append(num)
                    except:
                        pass

                if len(numbers) >= 2:
                    # Assume last number is total
                    total = numbers[-1]

                    # If we have 3 numbers, assume they are qty, unit_price, total
                    if len(numbers) >= 3:
                        qty = numbers[-3]
                        unit_price = numbers[-2]
                    else:
                        # If we have 2 numbers, assume they are qty and total
                        qty = numbers[-2]
                        unit_price = total / qty if qty else 0

                    items.append({
                        "item": item_name,
                        "qty": qty,
                        "unit_price": unit_price,
                        "total": total
                    })

        return items