Pythonation · hmdqr · Aug 25, 2025 · Aug 25, 2025 · Aug 25, 2025 · Copilot
diff --git a/.gitignore b/.gitignore
@@ -175,6 +175,8 @@ conversion.log
 # Database files
 *.csv
 processed_files.csv
+*.db
+processed_files.db
 
 # Document directories (if they contain sensitive or large files)
 docs_import/

diff --git a/BatchPdfConv.py b/BatchPdfConv.py
@@ -1,18 +1,23 @@
+# Standard library imports
 import os
 import sys
 import base64
-import csv
 import time
 import logging
+import sqlite3
+
+# Third-party imports
 from mistralai import Mistral
+import pypandoc
 from dotenv import load_dotenv
-load_dotenv() 
 
+# Load environment variables
+load_dotenv()
 
 # Configuration
 DOC_DIR = "docs_import"
 EXPORT_DIR = "docs_exports"
-DB_CSV = "processed_files.csv"
+DB_FILE = "processed_files.db"
 LOG_FILE = "conversion.log"
 MAX_RETRIES = 5
 INITIAL_BACKOFF = 1  # in seconds
@@ -21,7 +26,7 @@
 logging.basicConfig(
     filename=LOG_FILE,
     level=logging.INFO,
-    format='%(asctime)s %(levelname)s: %(message)s',
+    format="%(asctime)s %(levelname)s: %(message)s",
 )
 
 # Ensure API key is set via environment variable
@@ -32,66 +37,101 @@
 
 client = Mistral(api_key=API_KEY)
 
-FIELDNAMES = ['filename', 'status', 'attempts', 'error']
 
-
-def ensure_export_directory():
-    """Ensure the export directory exists."""
-    if not os.path.exists(EXPORT_DIR):
-        os.makedirs(EXPORT_DIR)
-        print(f"Created export directory: {EXPORT_DIR}")
+def init_database():
+    """Create the SQLite tracking table if it does not already exist.
+
+    Opens ``DB_FILE``, issues ``CREATE TABLE IF NOT EXISTS`` for the
+    ``processed_files`` table and commits. The connection is closed even
+    when the statement or the commit raises, so no handle is leaked.
+    """
+    conn = sqlite3.connect(DB_FILE)
+    try:
+        conn.execute('''
+            CREATE TABLE IF NOT EXISTS processed_files (
+                filename TEXT PRIMARY KEY,
+                status TEXT NOT NULL,
+                attempts INTEGER NOT NULL,
+                error TEXT,
+                processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+        ''')
+        conn.commit()
+    finally:
+        # Release the handle on both the success and the failure path.
+        conn.close()
 
 
 def load_processed():
-    """Load processed file records from CSV into a dict."""
+    """Load processed file records from SQLite database into a dict."""
+    init_database()
     processed = {}
-    if os.path.exists(DB_CSV):
-        with open(DB_CSV, newline='', encoding='utf-8') as csvfile:
-            reader = csv.DictReader(csvfile)
-            for row in reader:
-                processed[row['filename']] = row
+
+    conn = sqlite3.connect(DB_FILE)
+    cursor = conn.cursor()
+    cursor.execute('SELECT filename, status, attempts, error FROM processed_files')
+
+    for row in cursor.fetchall():
+        filename, status, attempts, error = row
+        processed[filename] = {
+            'filename': filename,
+            'status': status,
+            'attempts': str(attempts),
+            'error': error or ''
+        }
+
+    conn.close()
     return processed
 
 
-def append_to_db(record):
-    """Append a processing record to the CSV database."""
-    file_exists = os.path.exists(DB_CSV)
-    with open(DB_CSV, 'a', newline='', encoding='utf-8') as csvfile:
-        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)
-        if not file_exists:
-            writer.writeheader()
-        writer.writerow(record)
+def update_db(record):
+    """Insert or replace one processing record in the SQLite database.
+
+    The table is created first if needed. The connection is closed even
+    when the statement or the commit raises, so no handle is leaked.
+
+    Args:
+        record: Mapping with the keys ``filename``, ``status``,
+            ``attempts`` (any value ``int()`` accepts) and ``error``.
+    """
+    init_database()
+
+    conn = sqlite3.connect(DB_FILE)
+    try:
+        # INSERT OR REPLACE acts as an upsert: a row whose filename already
+        # exists is replaced instead of raising a uniqueness error.
+        conn.execute('''
+            INSERT OR REPLACE INTO processed_files (filename, status, attempts, error)
+            VALUES (?, ?, ?, ?)
+        ''', (record['filename'], record['status'], int(record['attempts']), record['error']))
+        conn.commit()
+    finally:
+        # Release the handle on both the success and the failure path.
+        conn.close()
+
+
+def ensure_export_directory():
+    """Create EXPORT_DIR when it is missing and report that it was created."""
+    if os.path.exists(EXPORT_DIR):
+        # Nothing to do: the path is already present.
+        return
+    os.makedirs(EXPORT_DIR)
+    print(f"Created export directory: {EXPORT_DIR}")
 
 
 def get_pdf_files():
     """List all PDF files in the DOC_DIR folder and its subdirectories."""
     if not os.path.isdir(DOC_DIR):
-        print(f"Error: Directory '{DOC_DIR}' not found.")
+        os.makedirs(DOC_DIR)
+        print(
+            f"Directory '{DOC_DIR}' was not found, so I created it for you. Please put your PDF files inside and run again."
+        )
         sys.exit(1)
-    
+
     pdf_files = []
     for root, dirs, files in os.walk(DOC_DIR):
         for file in files:
-            if file.lower().endswith('.pdf'):
-                # Get relative path from DOC_DIR
+            if file.lower().endswith(".pdf"):
                 rel_path = os.path.relpath(os.path.join(root, file), DOC_DIR)
                 pdf_files.append(rel_path)
-    
+
     return pdf_files
 
 
 def encode_pdf(pdf_path):
     """Encode the PDF file to a base64 string."""
     try:
         with open(pdf_path, "rb") as pdf_file:
-            return base64.b64encode(pdf_file.read()).decode('utf-8')
+            return base64.b64encode(pdf_file.read()).decode("utf-8")
     except Exception as e:
         logging.error(f"Failed to encode {pdf_path}: {e}")
         return None
 
 
 def convert_pdf_to_markdown(pdf_filename):
-    """Perform OCR on the PDF and write the output as a markdown file in the export directory."""
+    """Perform OCR on the PDF, save as Markdown, and convert to Word safely."""
     full_path = os.path.join(DOC_DIR, pdf_filename)
     b64 = encode_pdf(full_path)
     if not b64:
@@ -102,39 +142,51 @@ def convert_pdf_to_markdown(pdf_filename):
         model="mistral-ocr-latest",
         document={
             "type": "document_url",
-            "document_url": f"data:application/pdf;base64,{b64}"
+            "document_url": f"data:application/pdf;base64,{b64}",
         },
-        include_image_base64=False
+        include_image_base64=False,
     )
 
-    # Create output directory structure
-    output_name = pdf_filename.rsplit('.', 1)[0] + '.md'
+    # Markdown output
+    output_name = pdf_filename.rsplit(".", 1)[0] + ".md"
     output_path = os.path.join(EXPORT_DIR, output_name)
-
-    # Ensure the output directory exists
     output_dir = os.path.dirname(output_path)
     if output_dir and not os.path.exists(output_dir):
         os.makedirs(output_dir)
-    
-    with open(output_path, 'w', encoding='utf-8') as md_file:
+
+    with open(output_path, "w", encoding="utf-8") as md_file:
         for page in response.pages:
             md_file.write(f"## Page {page.index + 1}\n\n")
             md_file.write(page.markdown + "\n\n")
-    
+
     print(f"Saved markdown file: {output_path}")
 
+    if os.path.exists(output_path):
+        try:
+            docx_path = os.path.join(
+                EXPORT_DIR, pdf_filename.rsplit(".", 1)[0] + ".docx"
+            )
+            pypandoc.convert_file(output_path, "docx", outputfile=docx_path)
+            print(f"Converted to Word: {docx_path}")
+        except Exception as e:
+            logging.error(f"Word conversion failed for {pdf_filename}: {e}")
+            print(f"Word conversion failed for {pdf_filename}: {e}")
+    else:
+        print(f"Markdown file not found, skipping Word conversion for {pdf_filename}")
-    if os.path.exists(output_path):
-        try:
-            docx_path = os.path.join(
-                EXPORT_DIR, pdf_filename.rsplit(".", 1)[0] + ".docx"
-            )
-            pypandoc.convert_file(output_path, "docx", outputfile=docx_path)
-            print(f"Converted to Word: {docx_path}")
-        except Exception as e:
-            logging.error(f"Word conversion failed for {pdf_filename}: {e}")
-            print(f"Word conversion failed for {pdf_filename}: {e}")
-    else:
-        print(f"Markdown file not found, skipping Word conversion for {pdf_filename}")
+    if os.path.exists(output_path):
+        try:
+            docx_path = os.path.join(
+                EXPORT_DIR, pdf_filename.rsplit(".", 1)[0] + ".docx"
+            )
+            pypandoc.convert_file(output_path, "docx", outputfile=docx_path)
+            print(f"Converted to Word: {docx_path}")
+        except RuntimeError as e:
+            logging.error(f"Pandoc not found or failed for {pdf_filename}: {e}")
+            print(f"Word conversion failed for {pdf_filename}: {e}\nPlease ensure that Pandoc is installed and available in your PATH. See https://pandoc.org/installing.html")
+        except Exception as e:
+            logging.error(f"Word conversion failed for {pdf_filename}: {e}")
+            print(f"Word conversion failed for {pdf_filename}: {e}")
+    else:
+        print(f"Markdown file not found, skipping Word conversion for {pdf_filename}")
-    if os.path.exists(output_path):
-        try:
-            docx_path = os.path.join(
-                EXPORT_DIR, pdf_filename.rsplit(".", 1)[0] + ".docx"
-            )
-            pypandoc.convert_file(output_path, "docx", outputfile=docx_path)
-            print(f"Converted to Word: {docx_path}")
-        except Exception as e:
-            logging.error(f"Word conversion failed for {pdf_filename}: {e}")
-            print(f"Word conversion failed for {pdf_filename}: {e}")
-    else:
-        print(f"Markdown file not found, skipping Word conversion for {pdf_filename}")
+    if os.path.exists(output_path):
+        try:
+            docx_path = os.path.join(
+                EXPORT_DIR, pdf_filename.rsplit(".", 1)[0] + ".docx"
+            )
+            pypandoc.convert_file(output_path, "docx", outputfile=docx_path)
+            print(f"Converted to Word: {docx_path}")
+        except RuntimeError as e:
+            logging.error(f"Pandoc not found or failed for {pdf_filename}: {e}")
+            print(f"Word conversion failed for {pdf_filename}: {e}\nPlease ensure that Pandoc is installed and available in your PATH. See https://pandoc.org/installing.html")
+        except Exception as e:
+            logging.error(f"Word conversion failed for {pdf_filename}: {e}")
+            print(f"Word conversion failed for {pdf_filename}: {e}")
+    else:
+        print(f"Markdown file not found, skipping Word conversion for {pdf_filename}")
+
 
 def main():
-    # Ensure export directory exists
     ensure_export_directory()
-    
+
     processed = load_processed()
     all_files = get_pdf_files()
     total = len(all_files)
-    succeeded = sum(1 for r in processed.values() if r['status'] == 'success')
-    to_do = [f for f in all_files if processed.get(f, {}).get('status') != 'success']
+    succeeded = sum(1 for r in processed.values() if r["status"] == "success")
+    to_do = [f for f in all_files if processed.get(f, {}).get("status") != "success"]
 
-    print(f"Found {total} PDF files in '{DOC_DIR}/'. {succeeded} already converted. {len(to_do)} remaining.")
+    print(
+        f"Found {total} PDF files in '{DOC_DIR}/'. {succeeded} already converted. {len(to_do)} remaining."
+    )
     print(f"Output will be saved to '{EXPORT_DIR}/' directory.")
 
     converted_count = 0
@@ -150,13 +202,27 @@ def main():
                 convert_pdf_to_markdown(pdf)
                 success = True
                 converted_count += 1
-                append_to_db({'filename': pdf, 'status': 'success', 'attempts': attempts, 'error': ''})
+                update_db(
+                    {
+                        "filename": pdf,
+                        "status": "success",
+                        "attempts": attempts,
+                        "error": "",
+                    }
+                )
                 print(f"Success: {pdf} (attempt {attempts})")
                 print(f"Waiting for the next file...")
                 time.sleep(3)
             except Exception as e:
                 error_msg = str(e)
-                append_to_db({'filename': pdf, 'status': 'error', 'attempts': attempts, 'error': error_msg})
+                update_db(
+                    {
+                        "filename": pdf,
+                        "status": "error",
+                        "attempts": attempts,
+                        "error": error_msg,
+                    }
+                )
                 logging.error(f"{pdf} attempt {attempts} failed: {error_msg}")
                 print(f"Error converting {pdf} on attempt {attempts}: {error_msg}")
                 if attempts < MAX_RETRIES:
@@ -167,9 +233,11 @@ def main():
         if not success:
             print(f"Failed: {pdf} after {attempts} attempts.")
 
-    print(f"\nConversion complete. Total successful conversions: {converted_count} out of {len(to_do)}.")
+    print(
+        f"\nConversion complete. Total successful conversions: {converted_count} out of {len(to_do)}."
+    )
     print(f"All converted files are saved in '{EXPORT_DIR}/' directory.")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/README.md b/README.md
@@ -17,8 +17,8 @@
 - **🔤 دقة فائقة للغة العربية**: يتفوق على الحلول المنافسة مثل Google Document AI و Azure OCR في فهم بنية الخط العربي
 - **⚡ سرعة هائلة**: قادر على معالجة آلاف الصفحات في دقائق
 - **🧠 فهم هيكلي عميق**: لا يقرأ الحروف فقط، بل يفهم العناوين، الفقرات، الجداول، والقوائم ويحافظ على تنسيقها
-- **📄 مخرجات منظمة**: يسلم البيانات بصيغة `Markdown`، مما يسهل عملية **تحويل PDF إلى Word**
-- **🔄 كود متقدم للمعالجة الدفعية**: يتضمن السكربت `BatchPdfConv.py` ميزات احترافية مثل تتبع الحالة، وتسجيل الأخطاء، وإعادة المحاولة التلقائية
+- **📄 مخرجات منظمة**: يحول الملفات تلقائياً إلى صيغتي `Markdown` و `Word (.docx)`، مما يوفر مرونة كاملة في التعامل مع النصوص المستخرجة
+- **🔄 كود متقدم للمعالجة الدفعية**: يتضمن السكربت `BatchPdfConv.py` ميزات احترافية مثل تتبع الحالة، وتسجيل الأخطاء، وإعادة المحاولة التلقائية، والتحويل التلقائي إلى Word
 
 ## 🛠️ المتطلبات الأساسية
 
@@ -37,7 +37,7 @@ cd Mistral-Arabic-OCR-test
 
 ### 2. تثبيت المكتبات المطلوبة
 ```bash
-pip install mistralai python-dotenv
+pip install mistralai python-dotenv pypandoc
 ```
 
 ### 3. إعداد مفتاح الـ API (للسكربت المتقدم `BatchPdfConv.py`)
@@ -78,20 +78,23 @@ python docconv.py
 هذا هو السكربت الأكثر قوة واحترافية، مصمم لمعالجة عدد كبير من الملفات بكفاءة وموثوقية.
 
 **الميزات المتقدمة:**
-- **📊 إدارة الحالة**: يستخدم ملف `processed_files.csv` لتسجيل حالة كل ملف (ناجح/فاشل). إذا توقف السكربت، سيكمل من حيث توقف عند تشغيله مرة أخرى دون إعادة معالجة الملفات الناجحة
+- **📊 إدارة الحالة**: يستخدم قاعدة بيانات SQLite (`processed_files.db`) لتسجيل حالة كل ملف (ناجح/فاشل). إذا توقف السكربت، سيكمل من حيث توقف عند تشغيله مرة أخرى دون إعادة معالجة الملفات الناجحة
 - **📝 تسجيل الأخطاء**: يتم تسجيل جميع الأخطاء وتفاصيل العمليات في ملف `conversion.log` للمساعدة في تصحيح الأخطاء
 - **🔄 إعادة المحاولة التلقائية**: في حال فشل طلب الـ API، سيحاول السكربت إعادة الطلب عدة مرات مع زيادة فترة الانتظار بين المحاولات
 - **🔒 الأمان**: يقرأ مفتاح الـ API من ملف `.env` بدلاً من كتابته مباشرة في الكود
+- **📄 تحويل تلقائي إلى Word**: يحول كل ملف PDF إلى Markdown ثم إلى Word (.docx) تلقائياً
+- **🛠️ إنشاء المجلدات تلقائياً**: ينشئ المجلدات المطلوبة إذا لم تكن موجودة
+- **🗄️ قاعدة بيانات آمنة**: يستخدم SQLite لضمان سلامة البيانات وتجنب فقدانها
 
 **طريقة التشغيل:**
-1. أنشئ مجلداً باسم `doc`
-2. ضع جميع ملفات الـ PDF التي تريد معالجتها داخل مجلد `doc`
+1. أنشئ مجلداً باسم `docs_import` (أو سيتم إنشاؤه تلقائياً)
+2. ضع جميع ملفات الـ PDF التي تريد معالجتها داخل مجلد `docs_import`
 3. تأكد من إعداد ملف `.env` كما هو موضح في قسم التثبيت
 4. قم بتشغيل السكربت:
 ```bash
 python BatchPdfConv.py
 ```
-5. سيتم إنشاء ملفات `Markdown` الناتجة في المجلد الرئيسي للمشروع
+5. سيتم إنشاء ملفات `Markdown` و `Word` الناتجة في مجلد `docs_exports`
 
 ## 📁 هيكل المشروع
 
@@ -102,8 +105,11 @@ Mistral-Arabic-OCR-test/
 ├── BatchPdfConv.py         # سكربت المعالجة الدفعية المتقدم
 ├── document.pdf            # ملف PDF للاختبار
 ├── docs_import/            # مجلد ملفات PDF المدخلة
-├── docs_exports/           # مجلد ملفات Markdown المخرجة
+├── docs_exports/           # مجلد ملفات Markdown و Word المخرجة
+├── processed_files.db      # قاعدة بيانات SQLite لتتبع حالة الملفات
+├── conversion.log          # ملف سجل الأخطاء والعمليات
 ├── .env                    # ملف إعدادات API (يجب إنشاؤه)
+├── .gitignore              # ملف إعدادات Git
 └── README.md               # هذا الملف
 ```
 
@@ -118,20 +124,29 @@ Mistral-Arabic-OCR-test/
 2. **خطأ في تثبيت المكتبات**:
    ```bash
    pip install --upgrade pip
-   pip install mistralai python-dotenv
+   pip install mistralai python-dotenv pypandoc
    ```
 
-3. **مشكلة في قراءة ملف PDF**:
-   - تأكد من أن الملف موجود في المسار الصحيح
+3. **مشكلة في تثبيت pypandoc**:
+   - في Windows: ثبّت Pandoc أولاً من https://pandoc.org/installing.html ثم `pip install pypandoc`
+   - في macOS: `brew install pandoc` ثم `pip install pypandoc`
+   - في Linux: `sudo apt-get install pandoc` ثم `pip install pypandoc`
+
+4. **مشكلة في قراءة ملف PDF**:
+   - تأكد من أن الملف موجود في المسار الصحيح (`docs_import/`)
    - تأكد من أن الملف غير تالف
 
+5. **مشكلة في التحويل إلى Word**:
+   - تأكد من تثبيت pandoc بشكل صحيح
+   - تحقق من صلاحيات الكتابة في مجلد `docs_exports`
+
 ## 🤝 المساهمة والدعم
 
 هذا المشروع يهدف إلى خدمة المجتمع العربي والمطورين. يمكنك المساهمة بعدة طرق:
 
 - **🐛 فتح قضية (Issue)**: إذا واجهت مشكلة أو كان لديك اقتراح لتحسين الكود
 - **🔧 طلب سحب (Pull Request)**: إذا قمت بتطوير ميزة جديدة أو إصلاح خطأ، نرحب بمساهماتك
-- **💝 دعم القناة**: الدعم المادي عبر [باتريون](https://www.patreon.com/YourPatreon) أو Super Thanks في يوتيوب يساعدنا على توفير المزيد من هذه الموارد والمعرفة للمجتمع
+- **💝 دعم القناة**: الدعم المادي عبر Super Thanks في يوتيوب يساعدنا على توفير المزيد من هذه الموارد والمعرفة للمجتمع
 
 ## 📄 الترخيص