Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,7 @@ gp/*
dist/*
.idea
venv
alita/*
test_output
.vscode/settings.json
.DS_Store
53 changes: 37 additions & 16 deletions gptpdf/parse.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import os
import re
import base64
from typing import List, Tuple, Optional, Dict
from typing import List, Tuple, Optional, Dict, Literal
import fitz
import shapely.geometry as sg
from shapely.geometry.base import BaseGeometry
from shapely.validation import explain_validity
import concurrent.futures
import logging
# LLM imports
import google.generativeai as genai
from PIL import Image
from openai import OpenAI

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
Expand Down Expand Up @@ -213,16 +216,25 @@ def parse_pdf(
output_dir: str = './',
api_key: Optional[str] = None,
base_url: Optional[str] = None,
model: str = 'gpt-4o',
model: str = 'gemini-2.5-flash',
gpt_worker: int = 1,
prompt = DEFAULT_PROMPT,
rect_prompt = DEFAULT_RECT_PROMPT,
role_prompt = DEFAULT_ROLE_PROMPT,
llm_provider: Literal['gemini', 'openai'] = 'gemini',
) -> Tuple[str, List[str]]:
"""
解析PDF文件到markdown文件。
@param pdf_path: PDF文件路径
@param output_dir: 输出目录
@param api_key: LLM API key
@param base_url: LLM base URL (only used when llm_provider is 'openai', for OpenAI or compatible endpoints; ignored by the 'gemini' backend)
@param model: LLM model name (e.g., 'gpt-4o' for OpenAI, 'gemini-2.5-flash' for Gemini)
@param gpt_worker: 并发线程数
@param prompt: 主提示词
@param rect_prompt: 矩形区域提示词
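@param role_prompt: 角色提示词 (system role prompt; only sent when llm_provider is 'openai' — the 'gemini' branch does not use it)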
@param llm_provider: 'gemini' (default) or 'openai' to select backend
@return: 解析后的markdown内容, 矩形图片路径列表
"""

Expand All @@ -232,39 +244,47 @@ def parse_pdf(
image_infos = _parse_pdf_to_images(pdf_path, output_dir=output_dir)

# Process page images with the selected LLM backend (Gemini or OpenAI)
def _process_page(index: int, image_info: Tuple[str, List[str]]) -> Tuple[int, str]:
# 使用 OpenAI 客户端替代 Agent
client = OpenAI(api_key=api_key, base_url=base_url)
def _process_page(index: int, image_info: Tuple[str, List[str]], model_name=model) -> Tuple[int, str]:
page_image, rect_images = image_info
local_prompt = prompt
if rect_images:
local_prompt += rect_prompt + ', '.join(rect_images)

# 打开图片文件
with open(page_image, "rb") as image_file:
# 调用 OpenAI API
if llm_provider == 'gemini':
genai.configure(api_key=api_key)
try:
image = Image.open(page_image)
gemini_model = genai.GenerativeModel(model_name=model_name)
response = gemini_model.generate_content([
local_prompt,
image
])
content = response.text
return index, content
except Exception as e:
return index, f"Error processing page {index+1}: {str(e)}"
elif llm_provider == 'openai':
try:
client = OpenAI(api_key=api_key, base_url=base_url)
with open(page_image, "rb") as image_file:
image_data = image_file.read()
response = client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": role_prompt},
{"role": "user", "content": [
{"type": "text", "text": local_prompt},
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64.b64encode(image_file.read()).decode('utf-8')}"}}
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64.b64encode(image_data).decode('utf-8')}"}}
]}
]
)

# 检查 response.choices 是否为 None
if not response.choices:
print(response)
return index, f"Error: Empty choices in API response for page {index+1}"

content = response.choices[0].message.content
return index, content
except Exception as e:
# 捕获所有异常并返回错误信息
return index, f"Error processing page {index+1}: {str(e)}"
else:
return index, f"Unknown llm_provider: {llm_provider}"

contents = [None] * len(image_infos)
with concurrent.futures.ThreadPoolExecutor(max_workers=gpt_worker) as executor:
Expand All @@ -287,4 +307,5 @@ def _process_page(index: int, image_info: Tuple[str, List[str]]) -> Tuple[int, s
os.remove(page_image)
all_rect_images.extend(rect_images)

return content, all_rect_images
return content, all_rect_images

Loading