Automation-Challenge-Python/ocr_processor.py at master · lucasromaodev/Automation-Challenge-Python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pytesseract
import os
from PIL import Image
import re

# Path to the Tesseract installation folder (adjust if necessary).
# NOTE: this is a hard-coded Windows location; both settings below are
# derived from it and are applied as soon as this module is imported.
tesseract_path = r"C:\Program Files\Tesseract-OCR"
# Point pytesseract at the tesseract executable inside that folder.
pytesseract.pytesseract.tesseract_cmd = os.path.join(tesseract_path, "tesseract.exe")

# Path to the tessdata folder. This assignment overwrites any
# TESSDATA_PREFIX value already present in this process's environment.
os.environ["TESSDATA_PREFIX"] = os.path.join(tesseract_path, "tessdata")

# Patterns used by the parsing helpers below, compiled once at import time.
_COMPANY_NAME_RE = re.compile(r'[A-Za-z]{2,}')
_INVOICE_NUMBER_RE = re.compile(
    r'Invoice\s*(?:Number)?\s*[#:]*\s*([A-Za-z0-9\-]+)', re.I)
# Year-first dates such as "2019-06-03" or "2019/06/03".
_YEAR_FIRST_DATE_RE = re.compile(r'\b(\d{4}[-/]\d{2}[-/]\d{2})\b')
# Year-last dates such as "03/06/2019" or "3-6-2019".
_YEAR_LAST_DATE_RE = re.compile(r'\b(\d{1,2}[-/]\d{1,2}[-/]\d{4})\b')
_TOTAL_WORD_RE = re.compile(r'\bTotal\b', re.I)
_NOT_TOTAL_RE = re.compile(r'\bSubtotal\b|\bSales\s*Tax\b', re.I)
_TOTAL_VALUE_RE = re.compile(r'Total\s*[:\-]?\s*\€?\$?\s*([0-9.,]+)', re.I)


def _find_company_name(lines):
    """Return the first of the top three lines holding 2+ consecutive letters, else ""."""
    for line in lines[:3]:
        if _COMPANY_NAME_RE.search(line):
            return line
    return ""


def _find_invoice_number(text):
    """Return the token following an "Invoice" label, preferring one with a digit.

    A label such as "Invoice Date" also matches the pattern and would yield
    the word "Date", so every match is inspected and the first candidate
    that contains a digit wins. If no candidate contains a digit, the first
    match is returned; "" is returned when nothing matches at all.
    """
    fallback = ""
    pos = 0
    while True:
        match = _INVOICE_NUMBER_RE.search(text, pos)
        if match is None:
            return fallback
        candidate = match.group(1).strip()
        if any(ch.isdigit() for ch in candidate):
            return candidate
        if not fallback:
            fallback = candidate
        # Resume one character past the start of this match so that a label
        # swallowed as the previous candidate (e.g. "INVOICE\nInvoice #12")
        # is still examined on its own.
        pos = match.start() + 1


def _find_invoice_date(text):
    """Return the first year-first date in *text*, else the first year-last date, else ""."""
    # Year-first is tried on the whole text before the fallback so that any
    # text that already produced a date keeps producing the same one.
    match = _YEAR_FIRST_DATE_RE.search(text) or _YEAR_LAST_DATE_RE.search(text)
    return match.group(1).strip() if match else ""


def _find_total_due(lines):
    """Return the amount after "Total" on the first qualifying line, else "".

    Lines that mention "Subtotal" or "Sales Tax" are ignored.
    """
    for line in lines:
        if not _TOTAL_WORD_RE.search(line) or _NOT_TOTAL_RE.search(line):
            continue
        match = _TOTAL_VALUE_RE.search(line)
        if match:
            return match.group(1).strip()
    return ""


def extract_invoice_data(invoice_files):
    """Run OCR on each invoice image and extract its key fields.

    Args:
        invoice_files: Iterable of dict-like records. Each must have a
            'file' entry with the image path (a missing 'file' key raises
            KeyError). Optional 'ID' and 'Due Date' entries are copied to
            the output unchanged.

    Returns:
        A list with one dict per invoice whose OCR step succeeded, with the
        keys 'ID', 'Due Date', 'Invoice Number', 'Invoice Date',
        'Company Name', 'Total Due' and 'File Name'. A field that could not
        be found in the OCR text is an empty string (and an error line is
        printed). Invoices whose image cannot be opened or OCR'd are
        reported on stdout and skipped.
    """
    extracted_data = []

    for invoice in invoice_files:
        file_path = invoice['file']
        print(f"[INFO] Processando fatura: {file_path}")

        try:
            # The context manager closes the image file once OCR is done
            # instead of leaving the handle open for every processed invoice.
            with Image.open(file_path) as image:
                text = pytesseract.image_to_string(image)
        except Exception as e:
            # Deliberately broad: a single unreadable image must not abort
            # the whole batch.
            print(f"[ERRO] Falha ao processar OCR da imagem {file_path}: {e}")
            continue

        print(f"[DEBUG] Texto OCR extraído de {file_path}:\n{text}\n{'-'*40}")

        # Split the text into stripped, non-empty lines.
        lines = [line.strip() for line in text.strip().split('\n') if line.strip()]

        company_name = _find_company_name(lines)
        if not company_name:
            print(f"[ERRO] Não encontrou Company Name em {file_path}")

        invoice_number = _find_invoice_number(text)
        if not invoice_number:
            print(f"[ERRO] Não encontrou Invoice Number em {file_path}")

        invoice_date = _find_invoice_date(text)
        if not invoice_date:
            print(f"[ERRO] Não encontrou Invoice Date em {file_path}")

        total_due = _find_total_due(lines)
        if not total_due:
            print(f"[ERRO] Não encontrou Total Due em {file_path}")

        # ID and Due Date are taken from the input record as-is and combined
        # with the fields extracted from the OCR text.
        extracted_data.append({
            'ID': invoice.get('ID', ''),
            'Due Date': invoice.get('Due Date', ''),
            'Invoice Number': invoice_number,
            'Invoice Date': invoice_date,
            'Company Name': company_name,
            'Total Due': total_due,
            'File Name': file_path
        })

    return extracted_data