-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathocr_processor.py
More file actions
352 lines (281 loc) · 11.8 KB
/
ocr_processor.py
File metadata and controls
352 lines (281 loc) · 11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
import os
import re
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import numpy as np
from ai_processor import AIProcessor
class OCRProcessor:
    """
    Run Tesseract OCR on images/PDFs and parse invoice line items from the text.

    ``process_file`` turns an image or PDF into raw text; ``extract_items``
    turns raw text into a list of ``{"item", "qty", "unit_price", "total"}``
    dictionaries, using the optional AI processor first and regex heuristics
    as the fallback.
    """

    # File extensions routed to each OCR path by ``process_file``.
    _PDF_EXTENSIONS = ('.pdf',)
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.tif')

    # Characters that are rewritten to digits, but only inside numeric runs.
    _DIGIT_LOOKALIKES = str.maketrans({'|': '1', 'l': '1', 'O': '0', 'o': '0'})
    # A run of digits / look-alikes / separators that contains at least one
    # real digit and is not glued to any other letter ([^\W\d_] == "a letter").
    # This keeps words such as "Cola" or "Total" intact while fixing "1O.5O".
    _NUMERIC_RUN = re.compile(
        r'(?<![^\W\d_])[\dOol|.,]*\d[\dOol|.,]*(?![^\W\d_])'
    )
    # Everything except word characters, whitespace, '.', '-' and ','.
    # Currency symbols fall into this set as well.
    _SPECIAL_CHARS = re.compile(r'[^\w\s\.\-\,]')
    _NON_NUMERIC = re.compile(r'[^\d\.]')
    _COLUMN_GAP = re.compile(r'\s{2,}')

    # Largest accepted |qty * unit_price - total| for a verified line item.
    _TOTAL_TOLERANCE = 1

    _NAME = r'[\w\s\-]+'
    _NUM = r'\d+(?:\.\d+)?'
    # Tried in order; the first pattern that yields a plausible item wins.
    _LINE_ITEM_PATTERNS = (
        # Item name followed by quantity, unit price, and total
        ("qty_price_total",
         re.compile(rf'({_NAME})\s+({_NUM})\s+(?:x\s+)?({_NUM})\s+({_NUM})')),
        # Item name followed by quantity and total (no unit price). The unit
        # word and its trailing space are optional *together*, so a single
        # space between quantity and total is enough.
        ("qty_total",
         re.compile(rf'({_NAME})\s+({_NUM})\s+(?:(?:pcs|units|nos)\s+)?({_NUM})')),
        # Just item name and price
        ("total_only",
         re.compile(rf'({_NAME})\s+({_NUM})')),
    )

    def __init__(self):
        """
        Check that Tesseract is reachable and try to enable the AI processor.

        Neither failure is fatal: a message is printed and the instance stays
        usable. ``use_ai`` records whether ``ai_processor`` is available.
        """
        # Check if Tesseract is available
        try:
            pytesseract.get_tesseract_version()
        except Exception as e:
            print(f"Tesseract not properly configured: {e}")
            # In a production system, we might raise an exception here

        # Initialize AI processor if available
        try:
            self.ai_processor = AIProcessor()
            self.use_ai = True
            print("AI processing enabled for enhanced OCR accuracy")
        except Exception as e:
            print(f"AI processing not available: {e}")
            # Always define the attribute so later access cannot raise
            # AttributeError.
            self.ai_processor = None
            self.use_ai = False

    def process_file(self, file_path):
        """
        Process an uploaded file (image or PDF) using OCR

        Args:
            file_path (str): Path to the uploaded file

        Returns:
            str: Extracted text from the file, or "" when the format is
                unsupported or processing fails (a message is printed).
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        try:
            if file_ext in self._PDF_EXTENSIONS:
                return self._process_pdf(file_path)
            if file_ext in self._IMAGE_EXTENSIONS:
                return self._process_image(file_path)
            raise ValueError(f"Unsupported file format: {file_ext}")
        except Exception as e:
            print(f"Error processing file: {e}")
            return ""

    def _ocr_image(self, image):
        """
        Run OCR on one already-opened image.

        Args:
            image (PIL.Image): Image to recognise

        Returns:
            str: Text returned by Tesseract for the image
        """
        # Convert to grayscale for better OCR results
        if image.mode != 'L':
            image = image.convert('L')
        # Pre-processing hook (currently returns the image unchanged)
        image = self._preprocess_image(image)
        return pytesseract.image_to_string(image, lang='eng')

    def _process_image(self, image_path):
        """
        Process a single image using OCR

        Args:
            image_path (str): Path to the image file

        Returns:
            str: Extracted text from the image, or "" on any error
        """
        try:
            # The context manager releases the file handle once OCR is done.
            with Image.open(image_path) as image:
                return self._ocr_image(image)
        except Exception as e:
            print(f"Error processing image: {e}")
            return ""

    def _process_pdf(self, pdf_path):
        """
        Process a PDF file using OCR

        Args:
            pdf_path (str): Path to the PDF file

        Returns:
            str: Text of all pages joined by blank lines, or "" on any error
        """
        try:
            # Convert PDF to images, one per page
            pages = convert_from_path(pdf_path)
            # Combine text from all pages
            return "\n\n".join(self._ocr_image(page) for page in pages)
        except Exception as e:
            print(f"Error processing PDF: {e}")
            return ""

    def _preprocess_image(self, image):
        """
        Apply pre-processing to improve OCR results

        Args:
            image (PIL.Image): Image to preprocess

        Returns:
            PIL.Image: Preprocessed image
        """
        # This is a placeholder for more sophisticated preprocessing
        # In a production system, we would use more advanced techniques
        return image

    def extract_items(self, text):
        """
        Extract structured item data from OCR text

        The AI processor is tried first when enabled. Otherwise (or when it
        returns nothing / raises) the text is parsed line by line, and if that
        finds nothing, as a table whose columns are separated by 2+ spaces.

        Args:
            text (str): Raw OCR text

        Returns:
            list: List of dictionaries containing item details
        """
        if not text or not text.strip():
            return []

        # First, enhance the OCR text with AI if available
        if self.use_ai:
            try:
                # Enhance the raw OCR text
                enhanced_text = self.ai_processor.enhance_ocr_text(text)
                print("OCR text enhanced with AI")
                # Try AI-based extraction first
                ai_items = self.ai_processor.extract_structured_data(enhanced_text)
                if ai_items:
                    print(f"AI successfully extracted {len(ai_items)} items")
                    return ai_items
                # If AI extraction fails, fall back to traditional methods
                # but use the enhanced text
                text = enhanced_text
            except Exception as e:
                print(f"AI-based extraction failed: {e}")
                # Continue with traditional methods

        items = []
        lines = self._clean_text(text).split('\n')
        index = 0
        while index < len(lines):
            line = lines[index]
            index += 1
            # Skip short lines
            if len(line.strip()) < 5:
                continue

            item_data = self._extract_item_from_line(line)
            # Try multi-line approach if single line fails
            if item_data is None and index < len(lines):
                item_data = self._extract_item_from_line(line + " " + lines[index])
                if item_data is not None:
                    # The following line was consumed by the merge; do not
                    # parse it a second time on its own.
                    index += 1
            if item_data is not None:
                items.append(item_data)

        # If no items found, try a different approach. Column gaps must be
        # preserved for the table parser, so clean without collapsing spaces.
        if not items:
            items = self._extract_items_table_format(
                self._clean_text(text, collapse_spaces=False)
            )
        return items

    def _clean_text(self, text, collapse_spaces=True):
        """
        Clean and normalize OCR text

        Line breaks are always preserved. Digit look-alikes (``|``, ``l``,
        ``O``, ``o``) are converted only inside numeric runs, so ordinary
        words are left untouched.

        Args:
            text (str): Raw OCR text
            collapse_spaces (bool): When True (default) each run of whitespace
                inside a line becomes one space and the line is stripped.
                When False, spacing is kept (tabs become two spaces and
                trailing whitespace is removed) so column gaps survive.

        Returns:
            str: Cleaned text
        """
        if not text:
            return ""

        def _fix_digits(match):
            return match.group().translate(self._DIGIT_LOOKALIKES)

        cleaned_lines = []
        normalized = text.replace('\r\n', '\n').replace('\r', '\n')
        for line in normalized.split('\n'):
            # Replace common OCR errors inside numbers
            line = self._NUMERIC_RUN.sub(_fix_digits, line)
            # Remove currency symbols and other special characters
            line = self._SPECIAL_CHARS.sub('', line)
            if collapse_spaces:
                line = ' '.join(line.split())
            else:
                line = line.replace('\t', '  ').rstrip()
            cleaned_lines.append(line)
        return '\n'.join(cleaned_lines)

    def _extract_item_from_line(self, line):
        """
        Extract item details from a single line

        Args:
            line (str): Line of text

        Returns:
            dict: Item details or None if no match
        """
        if not line:
            return None

        # These patterns try to match common invoice line item formats.
        # They are simplified and would need to be adapted to specific
        # invoice formats.
        for kind, pattern in self._LINE_ITEM_PATTERNS:
            match = pattern.search(line)
            if match is None:
                continue

            raw_name, *raw_numbers = match.groups()
            item_name = ' '.join(raw_name.split())
            numbers = [float(value) for value in raw_numbers]

            if kind == "qty_price_total":
                qty, unit_price, total = numbers
                if not item_name:
                    continue
                # Verify that qty * unit_price is approximately equal to
                # total; otherwise this might not be a valid item line.
                if abs(qty * unit_price - total) > self._TOTAL_TOLERANCE:
                    continue
            else:
                # Without a total to cross-check, a name with no letters is
                # most likely a stray run of numbers rather than an item.
                if not any(ch.isalpha() for ch in item_name):
                    continue
                if kind == "qty_total":
                    qty, total = numbers
                    unit_price = total / qty if qty else 0
                else:
                    (total,) = numbers
                    qty = 1  # Assume quantity of 1
                    unit_price = total

            return {
                "item": item_name,
                "qty": qty,
                "unit_price": unit_price,
                "total": total
            }
        return None

    def _extract_items_table_format(self, text):
        """
        Extract items assuming a table format in the invoice

        A row is a line of at least 10 characters whose cells are separated
        by two or more whitespace characters. The first cell is the item name;
        the last number is the total, preceded by unit price and quantity
        when three or more numbers are present.

        Args:
            text (str): OCR text

        Returns:
            list: List of item dictionaries
        """
        items = []
        if not text:
            return items

        for line in text.split('\n'):
            row = line.strip()
            # Skip short lines
            if len(row) < 10:
                continue

            # Split line by multiple spaces
            cells = self._COLUMN_GAP.split(row)
            if len(cells) < 3:
                continue

            # Look for numbers in every cell after the item name
            numbers = []
            for cell in cells[1:]:
                try:
                    numbers.append(float(self._NON_NUMERIC.sub('', cell)))
                except ValueError:
                    # Cell holds no parseable number (e.g. text only)
                    continue
            if len(numbers) < 2:
                continue

            # Assume last number is total
            total = numbers[-1]
            if len(numbers) >= 3:
                # qty, unit_price, total
                qty = numbers[-3]
                unit_price = numbers[-2]
            else:
                # Only qty and total are present
                qty = numbers[-2]
                unit_price = total / qty if qty else 0

            items.append({
                "item": cells[0],
                "qty": qty,
                "unit_price": unit_price,
                "total": total
            })
        return items