# Source: parsemypdf/parser/markitdown/markitdown_pdf.py (Ibrahim01110/parsemypdf on GitHub, branch main)
"""
PDF Content Extraction Script using Microsoft's open-source MarkItDown library

Dependencies:
    - markitdown

Usage:
    Run the script directly to convert the selected sample PDF; the extracted content is written to output.txt.
    Different sample files can be uncommented in the main function to test various PDF types.

Note:
    MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc). It supports:
    PDF, PowerPoint, Word, Excel, Images (EXIF metadata and OCR), Audio (EXIF metadata and speech transcription), HTML
    Text-based formats (CSV, JSON, XML), ZIP files (iterates over contents)
"""
import os
from markitdown import MarkItDown

# Get the project root directory (three levels above this file).
# The path is made absolute first: when __file__ is relative (e.g. a bare
# script name on interpreters older than 3.9), repeated dirname() calls
# collapse to "", which would turn project_root + "/input/..." into a path
# at the filesystem root instead of inside the project.
project_root = os.path.dirname(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)

def main():
    """
    Convert one sample PDF with MarkItDown and save the extracted text.

    Available sample files (under ``<project_root>/input``):
        - sample-1.pdf: Contains simple tables
        - sample-2.pdf: Contains image-based simple tables
        - sample-3.pdf: Contains image-based complex tables
        - sample-4.pdf: Contains mixed content (text, images, complex tables)
        - sample-5.pdf: Multi-column text

    Returns:
        None: The extracted content is written to ``output.txt`` in the
        current working directory; nothing is printed to the console.
    """
    md = MarkItDown()
    input_dir = os.path.join(project_root, "input")

    # Select PDF file to process - uncomment desired sample file
    #file_path = os.path.join(input_dir, "sample-1.pdf")  # Table in pdf
    #file_path = os.path.join(input_dir, "sample-2.pdf")  # Image based simple table in pdf
    #file_path = os.path.join(input_dir, "sample-3.pdf")  # Image based complex table in pdf
    file_path = os.path.join(input_dir, "sample-4.pdf")  # Complex PDF with mixed content types
    #file_path = os.path.join(input_dir, "sample-5.pdf")  # Multi-column Texts

    result = md.convert(file_path)

    # Output extracted content to output.txt.
    # Extracted PDF text often contains characters outside the platform's
    # default encoding (bullets, ligatures, typographic quotes), so the file
    # is written as UTF-8 explicitly rather than with the locale default.
    with open("output.txt", "w", encoding="utf-8") as file:
        file.write(result.text_content)

# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()