diff --git a/.gitignore b/.gitignore index 3fbd4dea9..47ccd423c 100644 --- a/.gitignore +++ b/.gitignore @@ -204,4 +204,16 @@ conf/local.service_conf.yaml docker/.env docker/launch_backend_service.sh docker/.env.oceanbase -local.service_conf.yaml \ No newline at end of file +local.service_conf.yaml + +# Generated by scripts/deploy.sh (runtime configs) +conf/service_conf_ragflow_*.yaml +nginx_conf/ + +logs/ +pods/ +upload_wiki_json.pid +.ragflow_secret_key +setup_tools_venv.sh +build_tools_bundle.sh +upload_snapshot.json \ No newline at end of file diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index bef03d38e..7ee238e3d 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -14,11 +14,13 @@ # limitations under the License. # import datetime +import io import json import logging import pathlib import re from io import BytesIO +import time import xxhash from quart import request, send_file @@ -34,7 +36,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.llm_service import LLMBundle from api.db.services.tenant_llm_service import TenantLLMService -from api.db.services.task_service import TaskService, queue_tasks, cancel_all_task_of +from api.db.services.task_service import TaskService, queue_tasks, cancel_all_task_of, queue_tasks_batch from common.metadata_utils import meta_filter, convert_conditions from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_parser_config, get_result, server_error_response, token_required, \ get_request_json @@ -181,6 +183,82 @@ async def upload(dataset_id, tenant_id): return get_result(data=renamed_doc_list) +@manager.route("/datasets//documents_with_meta", methods=["POST"]) # noqa: F821 +@token_required +async def upload_with_meta(dataset_id, tenant_id): + e, kb = KnowledgebaseService.get_by_id(dataset_id) + if not e: + raise LookupError(f"Can't find the dataset with ID {dataset_id}!") + + req = await request.json + docs = req.get("docs") + if not docs: + return get_error_data_result( + message="No docs in request params!", code=RetCode.ARGUMENT_ERROR + ) + parse = req.get("parse", True) + + group_id_field = req.get("group_id_field") + file_extension = req.get("file_extension", "html") + + file_objs = [] + for doc in docs: + title = doc["title"] + file_obj = io.BytesIO(doc["content"].encode("utf-8")) + # If the title already has an extension, do not add another extension + if "." 
in title: + filename = title + else: + filename = f"{title}.{file_extension}" + file_obj.filename = filename + metadata = doc.get("metadata", {}) + if not metadata.get("_group_id") and group_id_field and group_id_field in metadata: + metadata["_group_id"] = metadata[group_id_field] + if not metadata.get("_title"): + metadata["_title"] = title + file_objs.append(( + file_obj, + metadata, + )) + err, files = FileService.upload_document(kb, file_objs, tenant_id) + if err: + return get_result(message="\n".join(err), code=RetCode.SERVER_ERROR) + # rename key's name + renamed_doc_list = [] + docs_to_parse = [] + for file in files: + doc = file[0] + key_mapping = { + "chunk_num": "chunk_count", + "kb_id": "dataset_id", + "token_num": "token_count", + "parser_id": "chunk_method", + } + renamed_doc = {} + for key, value in doc.items(): + new_key = key_mapping.get(key, key) + renamed_doc[new_key] = value + renamed_doc["run"] = "UNSTART" + renamed_doc_list.append(renamed_doc) + if parse: + doc["tenant_id"] = tenant_id + docs_to_parse.append(doc) + + # Batch parse documents + if docs_to_parse: + doc_ids = [doc["id"] for doc in docs_to_parse] + storage_addresses = File2DocumentService.get_storage_addresses(doc_ids) + docs_with_storage = [] + for doc in docs_to_parse: + bucket, name = storage_addresses.get(doc["id"], (None, None)) + if bucket and name: + docs_with_storage.append((doc, bucket, name)) + if docs_with_storage: + queue_tasks_batch(docs_with_storage, 0) + + return get_result(data=renamed_doc_list) + + @manager.route("/datasets//documents/", methods=["PUT"]) # noqa: F821 @token_required async def update_doc(tenant_id, dataset_id, document_id): @@ -825,27 +903,59 @@ async def parse(tenant_id, dataset_id): unique_doc_ids, duplicate_messages = check_duplicate_ids(doc_list, "document") doc_list = unique_doc_ids - not_found = [] - success_count = 0 - for id in doc_list: - doc = DocumentService.query(id=id, kb_id=dataset_id) - if not doc: - not_found.append(id) - continue - if not doc: - return get_error_data_result(message=f"You don't own the document {id}.") - if 0.0 < doc[0].progress < 1.0: - return get_error_data_result("Can't parse document that is currently being processed") - info = {"run": "1", "progress": 0, "progress_msg": "", "chunk_num": 0, "token_num": 0} - DocumentService.update_by_id(id, info) - settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), dataset_id) - TaskService.filter_delete([Task.doc_id == id]) - e, doc = DocumentService.get_by_id(id) - doc = doc.to_dict() - doc["tenant_id"] = tenant_id - bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"]) - queue_tasks(doc, bucket, name, 0) - success_count += 1 + if not doc_list: + if duplicate_messages: + return get_error_data_result(message=";".join(duplicate_messages)) + return get_error_data_result("No valid document IDs provided") + + # Batch query all documents + docs = list(DocumentService.model.select().where( + DocumentService.model.id.in_(doc_list), + DocumentService.model.kb_id == dataset_id + ).dicts()) + + found_doc_ids = {doc["id"] for doc in docs} + not_found = [doc_id for doc_id in doc_list if doc_id not in found_doc_ids] + + # Check for documents currently being processed + processing_docs = [doc for doc in docs if 0.0 < doc.get("progress", 0) < 1.0] + if processing_docs: + processing_ids = [doc["id"] for doc in processing_docs] + return get_error_data_result(f"Can't parse documents that are currently being processed: {processing_ids}") + + # All found documents are ready to 
parse + docs_to_parse = docs + + if not docs_to_parse: + if not_found: + return get_result(message=f"Documents not found: {not_found}", code=RetCode.DATA_ERROR) + if duplicate_messages: + return get_error_data_result(message=";".join(duplicate_messages)) + return get_error_data_result("No documents available for parsing") + + doc_ids_to_parse = [doc["id"] for doc in docs_to_parse] + + # Batch delete chunks from index (before queue_tasks_batch, which handles old task chunks) + settings.docStoreConn.delete({"doc_id": doc_ids_to_parse}, search.index_name(tenant_id), dataset_id) + + # Batch get storage addresses + storage_addresses = File2DocumentService.get_storage_addresses(doc_ids_to_parse) + + # Prepare documents with storage addresses for batch processing + docs_with_storage = [] + for doc in docs_to_parse: + bucket, name = storage_addresses.get(doc["id"], (None, None)) + if bucket and name: + doc["tenant_id"] = tenant_id + docs_with_storage.append((doc, bucket, name)) + + # Batch queue tasks (queue_tasks_batch handles task deletion internally) + if docs_with_storage: + queue_tasks_batch(docs_with_storage, 0) + + success_count = len(docs_with_storage) + + # Handle response with errors if not_found: return get_result(message=f"Documents not found: {not_found}", code=RetCode.DATA_ERROR) if duplicate_messages: @@ -1493,6 +1603,7 @@ async def retrieval_test(tenant_id): format: float description: Similarity score. """ + start_time = time.time() req = await get_request_json() if not req.get("dataset_ids"): return get_error_data_result("`dataset_ids` is required.") @@ -1511,6 +1622,11 @@ async def retrieval_test(tenant_id): ) if "question" not in req: return get_error_data_result("`question` is required.") + + end_time = time.time() + logging.info(f"retrieval_test prepare1 elapsed time: {end_time - start_time:.3f} seconds") + start_time = time.time() + page = int(req.get("page", 1)) size = int(req.get("page_size", 30)) question = req["question"] @@ -1520,14 +1636,17 @@ async def retrieval_test(tenant_id): langs = req.get("cross_languages", []) if not isinstance(doc_ids, list): return get_error_data_result("`documents` should be a list") - doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids) - for doc_id in doc_ids: - if doc_id not in doc_ids_list: - return get_error_data_result(f"The datasets don't own the document {doc_id}") + + if doc_ids: + is_valid, _, invalid_doc_ids = KnowledgebaseService.verify_documents_belong_to_kbs(doc_ids, kb_ids) + if not is_valid: + return get_error_data_result(f"The datasets don't own the documents {invalid_doc_ids}") + if not doc_ids: metadata_condition = req.get("metadata_condition", {}) or {} - metas = DocumentService.get_meta_by_kbs(kb_ids) - doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")) + if metadata_condition: + metas = DocumentService.get_meta_by_kbs(kb_ids) + doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")) # If metadata_condition has conditions but no docs match, return empty result if not doc_ids and metadata_condition.get("conditions"): return get_result(data={"total": 0, "chunks": [], "doc_aggs": {}}) @@ -1536,6 +1655,11 @@ async def retrieval_test(tenant_id): similarity_threshold = float(req.get("similarity_threshold", 0.2)) vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3)) top = int(req.get("top_k", 1024)) + + end_time = time.time() + logging.info(f"retrieval_test prepare2 elapsed time: 
{end_time - start_time:.3f} seconds") + start_time = time.time() + if req.get("highlight") == "False" or req.get("highlight") == "false": highlight = False else: @@ -1550,7 +1674,6 @@ async def retrieval_test(tenant_id): rerank_mdl = None if req.get("rerank_id"): rerank_mdl = LLMBundle(kb.tenant_id, LLMType.RERANK, llm_name=req["rerank_id"]) - if langs: question = await cross_languages(kb.tenant_id, None, question, langs) @@ -1573,6 +1696,10 @@ async def retrieval_test(tenant_id): highlight=highlight, rank_feature=label_question(question, kbs), ) + end_time = time.time() + logging.info(f"retrieval_test retrieval elapsed time: {end_time - start_time:.3f} seconds") + start_time = time.time() + if toc_enhance: chat_mdl = LLMBundle(kb.tenant_id, LLMType.CHAT) cks = settings.retriever.retrieval_by_toc(question, ranks["chunks"], tenant_ids, chat_mdl, size) @@ -1604,6 +1731,10 @@ async def retrieval_test(tenant_id): rename_chunk[new_key] = value renamed_chunks.append(rename_chunk) ranks["chunks"] = renamed_chunks + + end_time = time.time() + logging.info(f"retrieval_test postprocess elapsed time: {end_time - start_time:.3f} seconds") + return get_result(data=ranks) except Exception as e: if str(e).find("not_found") > 0: @@ -1611,4 +1742,4 @@ async def retrieval_test(tenant_id): message="No chunk found! Check the chunk status please!", code=RetCode.DATA_ERROR, ) - return server_error_response(e) + return server_error_response(e) \ No newline at end of file diff --git a/sdk/python/README.md b/sdk/python/README.md new file mode 100644 index 000000000..ad64dcde0 --- /dev/null +++ b/sdk/python/README.md @@ -0,0 +1,65 @@ +# ragflow-sdk + +RAGFlow Python SDK 提供了与 RAGFlow 服务交互的 Python 接口,包括数据集管理、文档上传、对话等功能。 + +## 安装 + +```shell +pip install ragflow-sdk +``` + +## 快速开始 + +```python +from ragflow_sdk import RAGFlow + +# 初始化客户端 +rag = RAGFlow(api_key="YOUR_API_KEY", base_url="http://localhost:9380") + +# 创建数据集 +dataset = rag.create_dataset(name="My Dataset") + +# 上传文档 +documents = dataset.upload_documents_with_meta([ + { + "title": "Document Title", + "content": "Document content...", + "metadata": { + "tags": ["tag1", "tag2"] + } + } +]) +``` + +## 文档 + +- [工具模块 API 参考](ragflow_sdk/tools/README.md) - FileReader, DocumentExtractor, FieldMapper, BatchUploader 等工具的详细 API 文档 +- [示例脚本](examples/README.md) - 批量上传等示例脚本的使用说明 + +## 工具模块 + +SDK 提供了强大的工具模块,用于批量处理和文档管理: + +- **FileReader**: 支持多种文件格式的文件读取器 +- **DocumentExtractor**: 从文件/目录中提取文档 +- **FieldMapper**: 灵活的字段映射器,支持自动字段检测 +- **BatchUploader**: 批量上传器,支持断点续传和自动重试 + +详细文档请参考 [工具模块 API 参考](ragflow_sdk/tools/README.md)。 + +## 构建和发布 + +### 构建 Python SDK + +```shell +uv build +``` + +### 发布到 PyPI + +```shell +uv pip install twine +export TWINE_USERNAME="__token__" +export TWINE_PASSWORD=$YOUR_PYPI_API_TOKEN +twine upload dist/*.whl +``` diff --git a/sdk/python/examples/README.md b/sdk/python/examples/README.md new file mode 100644 index 000000000..8ef8f9028 --- /dev/null +++ b/sdk/python/examples/README.md @@ -0,0 +1,554 @@ +# RAGFlow SDK Examples + +本目录包含使用 RAGFlow SDK 的示例脚本。 + +## 示例脚本 + +### reparse_failed_documents.py + +用于重新解析数据集中所有失败文档的工具。 + +#### 功能特性 + +- ✅ 自动查找失败的文档(状态为 "FAIL") +- ✅ 分页获取文档,支持大数据集 +- ✅ 批量重新解析,可配置批次大小 +- ✅ 自动重试机制(指数退避) +- ✅ 进度跟踪和日志记录 + +#### 使用方法 + +##### 基本用法 + +```bash +python examples/reparse_failed_documents.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -i DATASET_ID +``` + +##### 自定义批次大小 + +```bash +python examples/reparse_failed_documents.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -i DATASET_ID \ + -b 100 +``` + 
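+##### Custom log file
+
+The `--log-file` flag (documented in the parameter table below, default `./logs/reparse_failed_documents.log`) redirects the run log; the log path used here is only an illustrative example:
+
+```bash
+python examples/reparse_failed_documents.py \
+  -k YOUR_API_KEY \
+  -H http://localhost:9380 \
+  -i DATASET_ID \
+  --log-file ./logs/reparse_run_01.log
+```
+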
+##### 自定义页面大小 + +```bash +python examples/reparse_failed_documents.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -i DATASET_ID \ + --page-size 5000 +``` + +#### 参数说明 + +| 参数 | 简写 | 必需 | 说明 | +|------|------|------|------| +| `--api-key` | `-k` | 是 | RAGFlow API 密钥 | +| `--host-address` | `-H` | 是 | RAGFlow 服务器地址(如:http://localhost:9380) | +| `--dataset-id` | `-i` | 是 | 要重新解析的数据集 ID | +| `--batch-size` | `-b` | 否 | 重新解析文档的批次大小(默认:50) | +| `--page-size` | - | 否 | 获取文档的页面大小(默认:10000) | +| `--log-file` | - | 否 | 日志文件路径(默认:./logs/reparse_failed_documents.log) | + +#### 编程方式使用 + +```python +from ragflow_sdk import RAGFlow +from ragflow_sdk.tools import FailedDocumentReparser + +# 初始化 +rag = RAGFlow(api_key="YOUR_API_KEY", base_url="http://localhost:9380") + +# 创建重新解析器 +reparser = FailedDocumentReparser(rag) + +# 重新解析失败的文档 +total_failed, total_reparsed = reparser.reparse_failed_documents( + dataset_id="DATASET_ID", + reparse_batch_size=50, + page_size=10000 +) + +print(f"Total failed documents: {total_failed}") +print(f"Total reparsed: {total_reparsed}") +``` + +#### 工作原理 + +1. **分页获取文档**:从数据集的第一页开始,逐页获取所有文档 +2. **过滤失败文档**:检查每个文档的 `run` 状态,筛选出状态为 "FAIL" 的文档 +3. **批量重新解析**:当累积的失败文档数量达到 `reparse_batch_size` 时,批量调用 `async_parse_documents` 重新解析 +4. **自动重试**:如果重新解析失败,会自动重试(最多 10 次,使用指数退避策略) +5. **处理剩余文档**:处理完所有页面后,处理剩余的失败文档(如果数量不足一个批次) + +#### 注意事项 + +1. **批次大小**:较大的批次大小可以提高效率,但会增加单次 API 调用的负担 +2. **页面大小**:较大的页面大小可以减少 API 调用次数,但会增加内存使用 +3. **网络稳定性**:如果网络不稳定,建议使用较小的批次大小 +4. **数据集权限**:确保 API 密钥有权限访问指定的数据集 + +#### 故障排除 + +**问题:找不到数据集** +- 检查数据集 ID 是否正确 +- 确认 API 密钥有权限访问该数据集 + +**问题:重新解析失败** +- 检查网络连接 +- 查看日志中的错误信息 +- 检查文档是否真的处于失败状态 + +--- + +### batch_upload.py + +通用的批量文档上传工具,支持多种文件格式和灵活的字段映射。 + +#### 功能特性 + +- ✅ 支持多种文件格式: + - **多文档格式**(一个文件包含多个文档): + - **JSON**: 数组格式,包含多个文档对象 + - **JSONL**: 每行一个 JSON 对象 + - **CSV**: 每行一个文档(第一行为表头) + - **XLSX/XLS**: Excel 文件,每行一个文档(第一行为表头) + - **单文档格式**(一个文件对应一个文档): + - **PDF**: PDF 文档 + - **Office**: Word (.docx, .doc), PowerPoint (.pptx, .ppt), Excel (.xlsx, .xls) + - **HTML**: HTML 文件 (.html, .htm) + - **Markdown**: Markdown 文件 (.md, .markdown) + - **文本**: 文本文件 (.txt) + - **图片**: 图片文件 (.jpg, .jpeg, .png, .gif, .bmp, .tiff, .webp) + - **其他**: 邮件 (.eml), EPUB (.epub) 等 PowerRAG 支持的格式 +- ✅ 迭代器模式,懒加载(不会一次性加载所有文件到内存) +- ✅ 支持断点续传(resume) +- ✅ 字段映射器,灵活映射数据源字段到标准格式 +- ✅ 自动重试机制(指数退避) +- ✅ Snapshot 管理,防止重复处理 + +#### 标准字段格式 + +上传的文档会被映射为以下标准格式: + +```python +{ + "title": "文档标题", + "content": "文档内容", + "metadata": { + "doc_id": "文档ID(可选)", + "doc_url": "文档URL(可选)", + "tags": ["标签1", "标签2"] # 可选 + } +} +``` + +#### 使用方法 + +##### 基本用法 + +```bash +python examples/batch_upload.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -d /path/to/files +``` + +##### 上传到现有数据集 + +```bash +python examples/batch_upload.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -d /path/to/files \ + -i DATASET_ID +``` + +##### 自定义字段映射 + +如果数据源的字段名与默认不同,可以指定字段映射: + +```bash +python examples/batch_upload.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -d /path/to/files \ + --title-field "article_title" \ + --content-field "article_body" \ + --doc-id-field "article_id" \ + --tags-field "categories" \ + --tags-separator ";" +``` + +##### 自定义批次大小 + +```bash +python examples/batch_upload.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -d /path/to/files \ + -b 20 +``` + +##### 断点续传 + +如果上传过程中断,可以使用 `--resume` 参数从上次中断的地方继续: + +```bash +python examples/batch_upload.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -d /path/to/files \ + --resume +``` + +##### 后台运行 + +使用 
`nohup` 在后台运行,不输出日志: + +```bash +nohup python examples/batch_upload.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -d /path/to/files \ + &> /dev/null & +``` + +**说明**: +- `nohup`:确保进程在终端关闭后继续运行 +- `&> /dev/null`:将标准输出和标准错误都重定向到 `/dev/null`,不保存日志 +- `&`:在后台运行进程 + +如果需要查看进程状态,可以使用: +```bash +# 查看进程 +ps aux | grep batch_upload.py + +# 查看进程 ID(PID) +pgrep -f batch_upload.py +``` + +##### 指定文件类型 + +只处理特定类型的文件: + +```bash +python examples/batch_upload.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -d /path/to/files \ + --file-patterns "*.json" "*.jsonl" +``` + +##### 控制多文档格式 + +控制哪些文件扩展名应被当作多文档格式处理: + +```bash +# 默认:json 和 jsonl 作为多文档格式 +python examples/batch_upload.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -d /path/to/files + +# 只将 json 作为多文档格式(jsonl 将被当作单文档格式) +python examples/batch_upload.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -d /path/to/files \ + --multi-doc-extensions json + +# 将 json、jsonl 和 csv 都作为多文档格式 +python examples/batch_upload.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -d /path/to/files \ + --multi-doc-extensions json jsonl csv + +# 将 CSV、XLSX、XLS 作为单文档格式处理(只将 json 和 jsonl 作为多文档格式) +python examples/batch_upload.py \ + -k YOUR_API_KEY \ + -H http://localhost:9380 \ + -d /path/to/files \ + --multi-doc-extensions json jsonl +``` + +**注意**: +- 默认情况下,`json`、`jsonl`、`csv`、`xlsx`、`xls` 被当作多文档格式 +- 可以通过 `--multi-doc-extensions` 参数控制哪些扩展名应被当作多文档格式 +- 如果某个扩展名不在列表中,对应的文件将被当作单文档格式处理 +- 对于 `json` 文件,只有数组格式(以 `[` 开头)才会被当作多文档格式 +- 对于 `jsonl` 文件,如果在其扩展名列表中,始终被当作多文档格式 +- **示例**:如果设置 `--multi-doc-extensions json jsonl`,则 `csv`、`xlsx`、`xls` 文件将被当作单文档格式处理(文件名作为 title,文件内容作为 content) + +#### 参数说明 + +| 参数 | 简写 | 必需 | 说明 | +|------|------|------|------| +| `--api-key` | `-k` | 是 | RAGFlow API 密钥 | +| `--host-address` | `-H` | 是 | RAGFlow 服务器地址(如:http://localhost:9380) | +| `--data-dir` | `-d` | 是 | 包含文件的目录路径 | +| `--dataset-id` | `-i` | 否 | 要使用的数据集 ID(如果不提供,将创建新数据集) | +| `--dataset-name` | `-n` | 否 | 新数据集的名称(默认:自动生成) | +| `--batch-size` | `-b` | 否 | 上传文档的批次大小(默认:5) | +| `--snapshot-file` | `-s` | 否 | 用于断点续传的快照文件路径(默认:upload_snapshot.json) | +| `--resume` | - | 否 | 从上次快照恢复上传 | +| `--file-extension` | - | 否 | 上传文档的文件扩展名(默认:txt) | +| `--title-field` | - | 否 | 标题字段名(默认:自动检测) | +| `--content-field` | - | 否 | 内容字段名(默认:自动检测) | +| `--doc-id-field` | - | 否 | 文档ID字段名(默认:自动检测) | +| `--doc-url-field` | - | 否 | 文档URL字段名(默认:自动检测) | +| `--tags-field` | - | 否 | 标签字段名(默认:自动检测) | +| `--tags-separator` | - | 否 | 标签分隔符(默认:,) | +| `--file-patterns` | - | 否 | 文件匹配模式(如:*.json *.csv) | +| `--multi-doc-extensions` | - | 否 | 指定哪些文件扩展名应被当作多文档格式处理(默认:json jsonl csv xlsx xls)。不在列表中的扩展名将被当作单文档格式处理 | + +#### 文件格式说明 + +##### 多文档格式(一个文件包含多个文档) + +**JSON 格式**(数组格式): +```json +[ + { + "id": "doc_001", + "title": "文档标题", + "text": "文档内容...", + "tags": "tag1, tag2, tag3" // 逗号分隔字符串(可以有空格) + }, + { + "id": "doc_002", + "title": "另一个文档", + "text": "更多内容...", + "tags": ["tag2", "tag4"] // 数组格式 + } +] +``` + +**JSONL 格式**(每行一个 JSON 对象): +```jsonl +{"id": "doc_001", "title": "文档1", "text": "内容1", "tags": "tag1, tag2"} // 逗号分隔字符串 +{"id": "doc_002", "title": "文档2", "text": "内容2", "tags": ["tag3", "tag4"]} // 数组格式 +``` + +**CSV 格式**(第一行为表头,每行一个文档): +```csv +id,title,content,tags +doc_001,文档1,内容1,"tag1, tag2" // 逗号分隔字符串(可以有空格) +doc_002,文档2,内容2,"tag3" +``` + +**Excel 格式**(第一行为表头,每行一个文档): +| id | title | content | tags | +|----|-------|---------|------| +| doc_001 | 文档1 | 内容1 | tag1, tag2 | +| doc_002 | 文档2 | 内容2 | tag3 | + +**Tags 字段格式说明**: +- **逗号分隔字符串**:`"tag1, tag2, tag3"` 或 
`"tag1,tag2,tag3"`(空格会被自动去除) +- **数组格式**:`["tag1", "tag2", "tag3"]` +- 两种格式都支持,工具会自动识别并解析 + +##### 单文档格式(一个文件对应一个文档) + +对于单文档格式(PDF、Office、HTML、Markdown、文本、图片等),工具会自动处理: +- **title**: 自动使用文件名(不含扩展名)作为标题 +- **content**: 文件内容(文本文件直接读取内容,二进制文件由 PowerRAG 解析) +- **metadata**: 为空(不包含 doc_id、doc_url、tags 等字段) + +**示例**: +- `document.pdf` → title: "document", content: PDF 解析后的内容 +- `report.docx` → title: "report", content: Word 文档解析后的内容 +- `article.md` → title: "article", content: Markdown 文件内容 +- `image.png` → title: "image", content: 图片 OCR 识别后的内容 + +#### 字段自动检测 + +如果不指定字段映射,工具会自动检测以下字段名: + +- **title**: `title`, `name`, `subject`, `heading`, `header` +- **content**: `content`, `text`, `body`, `description`, `desc`, `data` +- **doc_id**: `id`, `doc_id`, `_id`, `document_id`, `docid` +- **doc_url**: `url`, `link`, `uri`, `doc_url`, `source_url` +- **tags**: `tags`, `tag`, `categories`, `category`, `labels`, `label` + +#### 断点续传机制 + +工具支持断点续传功能: + +1. **快照保存**:每次成功上传批次后自动保存进度快照 +2. **文件级恢复**:记录已完全处理的文件,避免重复处理 +3. **恢复上传**:使用 `--resume` 参数可以从上次中断的地方继续 +4. **快照文件**:默认保存在 `upload_snapshot.json`,可通过 `-s` 参数自定义 + +快照文件包含以下信息: +- `processed_files`: 已完全处理的文件列表 +- `total_processed`: 已处理的文档总数 +- `dataset_id`: 数据集 ID(如果使用现有数据集) +- `timestamp`: 快照时间戳 + +#### 迭代器模式(编程方式) + +对于需要更多控制的场景,可以使用迭代器模式,将文档提取和上传逻辑分离: + +```python +from ragflow_sdk import RAGFlow +from ragflow_sdk.tools import BatchUploader, DocumentExtractor, FieldMapper + +# 初始化 +rag = RAGFlow(api_key="YOUR_API_KEY", base_url="http://localhost:9380") + +# 创建字段映射器 +field_mapper = FieldMapper( + title_field="article_title", + content_field="article_body", + tags_field="categories" +) + +# 创建文档提取器 +extractor = DocumentExtractor(field_mapper=field_mapper) + +# 创建上传器 +uploader = BatchUploader(rag) + +# 获取或创建数据集 +uploader.get_or_create_dataset(dataset_name="My Dataset") + +# 方式1: 使用文档提取器的迭代器,然后手动上传 +for batch, file_path, is_file_complete in extractor.extract_batches( + data_dir="/path/to/files", + batch_size=10 +): + # 处理批次 + print(f"Processing batch from {file_path}: {len(batch)} documents") + + # 可以在这里添加自定义处理逻辑 + # ... + + # 手动上传批次 + uploader.dataset.upload_documents_with_meta(batch) + + # 如果文件处理完成,可以执行额外操作 + if is_file_complete: + print(f"File {file_path} completed") + +# 方式2: 手动控制文档提取(如果需要自定义处理) +batch_iterator = extractor.extract_batches( + data_dir="/path/to/files", + batch_size=10 +) + +for batch, file_path, is_file_complete in batch_iterator: + # 可以在这里添加自定义处理逻辑 + print(f"Processing batch from {file_path}: {len(batch)} documents") + + # 手动上传批次 + uploader.dataset.upload_documents_with_meta(batch, file_extension="txt") + + if is_file_complete: + print(f"File {file_path} completed") +``` + +#### 错误处理 + +工具包含自动重试机制: +- 默认最多重试 10 次 +- 使用指数退避策略(最大等待时间 8 秒) +- 失败时会记录错误信息并保存快照 + +#### 内存优化 + +工具使用迭代器模式,具有以下内存优化特性: + +1. **懒加载**:文件只在需要时读取,不会一次性加载所有文件到内存 +2. **批次处理**:文档按批次处理,每批次大小可配置 +3. **流式处理**:大文件可以流式读取,不会占用过多内存 + +#### 注意事项 + +1. **文件顺序**:工具会按文件名排序处理文件,这对于断点续传很重要 +2. **批次大小**:较小的批次大小可以提高容错性,但会增加 API 调用次数 +3. **网络稳定性**:如果网络不稳定,建议使用较小的批次大小和启用断点续传 +4. **数据集权限**:确保 API 密钥有权限访问指定的数据集 +5. 
**Excel 支持**:需要安装 `pandas` 和 `openpyxl`:`pip install pandas openpyxl` + +#### 故障排除 + +**问题:找不到数据集** +- 检查数据集 ID 是否正确 +- 确认 API 密钥有权限访问该数据集 + +**问题:上传失败** +- 检查网络连接 +- 查看日志中的错误信息 +- 尝试使用 `--resume` 参数恢复上传 + +**问题:字段映射错误** +- 检查数据源字段名是否正确 +- 使用 `--title-field` 等参数显式指定字段映射 +- 查看日志中的字段检测信息 + +**问题:Excel 文件读取失败** +- 确保已安装 `pandas` 和 `openpyxl`:`pip install pandas openpyxl` +- 检查 Excel 文件格式是否正确 +- 尝试将 Excel 文件转换为 CSV 格式 + +## 迁移指南 + +### 从 WikiUploader 迁移到 BatchUploader + +如果你之前使用 `WikiUploader` 上传 Wiki JSON 文件,可以按以下方式迁移到新的 `BatchUploader`: + +**旧代码(WikiUploader):** +```python +from ragflow_sdk import RAGFlow +from ragflow_sdk.tools import WikiUploader + +rag = RAGFlow(api_key="YOUR_API_KEY", base_url="http://localhost:9380") +uploader = WikiUploader(rag) + +total_docs, total_files = uploader.upload_wiki_json_files( + data_dir="/path/to/json/files", + batch_size=10 +) +``` + +**新代码(BatchUploader):** +```python +from ragflow_sdk import RAGFlow +from ragflow_sdk.tools import BatchUploader, DocumentExtractor, FieldMapper + +rag = RAGFlow(api_key="YOUR_API_KEY", base_url="http://localhost:9380") + +# 创建字段映射器(Wiki JSON 格式:title, text, id, tags) +field_mapper = FieldMapper( + title_field="title", + content_field="text", + doc_id_field="id", + tags_field="tags" +) + +# 创建文档提取器和上传器 +extractor = DocumentExtractor(field_mapper=field_mapper) +uploader = BatchUploader(rag) + +total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir="/path/to/json/files", + batch_size=10 +) +``` + +新工具的优势: +- ✅ 支持更多文件格式(JSON、JSONL、CSV、XLSX、XLS 等) +- ✅ 更灵活的字段映射配置 +- ✅ 更好的断点续传机制(文件级 + 文档索引级) +- ✅ 统一的 API 接口 diff --git a/sdk/python/examples/batch_upload.py b/sdk/python/examples/batch_upload.py new file mode 100644 index 000000000..02558374e --- /dev/null +++ b/sdk/python/examples/batch_upload.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python3 +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Example script for batch uploading documents to RAGFlow using BatchUploader. + +This script demonstrates how to use the BatchUploader tool to upload +documents from various file formats (JSON, JSONL, CSV, XLSX, XLS) to RAGFlow. +""" + +import argparse +import logging +import os +import sys +from logging.handlers import RotatingFileHandler + +# Add parent directory to path to import ragflow_sdk +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if parent_dir not in sys.path: + sys.path.insert(0, parent_dir) + +from ragflow_sdk import RAGFlow +from ragflow_sdk.tools import BatchUploader, DocumentExtractor, FieldMapper + + +def setup_logging(log_file: str = None): + """ + Configure logging to output to both file and console. + + Args: + log_file: Path to log file. 
If None, defaults to './logs/batch_upload.log' + """ + if log_file is None: + log_file = './logs/batch_upload.log' + + # Ensure log directory exists + log_dir = os.path.dirname(log_file) + if log_dir: + os.makedirs(log_dir, exist_ok=True) + + # Create formatter + formatter = logging.Formatter( + '%(asctime)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + # Get root logger + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + # Clear existing handlers + logger.handlers.clear() + + # File handler with rotation + file_handler = RotatingFileHandler( + log_file, + maxBytes=10*1024*1024, # 10MB + backupCount=5 + ) + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + # Console handler + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(logging.INFO) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + + logging.info(f"Logging to file: {os.path.abspath(log_file)}") + + +def main(): + parser = argparse.ArgumentParser( + description='Batch upload documents to RAGFlow', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Upload to a new dataset + python batch_upload.py -k YOUR_API_KEY -H http://localhost:9380 -d /path/to/files + + # Upload to an existing dataset + python batch_upload.py -k YOUR_API_KEY -H http://localhost:9380 -d /path/to/files -i DATASET_ID + + # Upload with custom field mapping + python batch_upload.py -k YOUR_API_KEY -H http://localhost:9380 -d /path/to/files \\ + --title-field "article_title" --content-field "article_body" + + # Upload with custom batch size and resume support + python batch_upload.py -k YOUR_API_KEY -H http://localhost:9380 -d /path/to/files -b 10 --resume + """ + ) + parser.add_argument( + '-k', '--api-key', + required=True, + help='RAGFlow API key' + ) + parser.add_argument( + '-H', '--host-address', + required=True, + help='RAGFlow host address (e.g., http://localhost:9380)' + ) + parser.add_argument( + '-d', '--data-dir', + required=True, + help='Directory containing files to upload' + ) + parser.add_argument( + '-i', '--dataset-id', + help='Dataset ID to use (if not provided, a new dataset will be created)' + ) + parser.add_argument( + '-n', '--dataset-name', + help='Name for new dataset (default: auto-generated)' + ) + parser.add_argument( + '-b', '--batch-size', + type=int, + default=5, + help='Batch size for uploading documents (default: 5)' + ) + parser.add_argument( + '-s', '--snapshot-file', + default='upload_snapshot.json', + help='Snapshot file for resume support (default: upload_snapshot.json)' + ) + parser.add_argument( + '--resume', + action='store_true', + help='Resume from last snapshot' + ) + parser.add_argument( + '--file-extension', + default='txt', + help='File extension for uploaded documents (default: txt)' + ) + parser.add_argument( + '--title-field', + help='Source field name for title (default: auto-detect)' + ) + parser.add_argument( + '--content-field', + help='Source field name for content (default: auto-detect)' + ) + parser.add_argument( + '--doc-id-field', + help='Source field name for doc_id (default: auto-detect)' + ) + parser.add_argument( + '--doc-url-field', + help='Source field name for doc_url (default: auto-detect)' + ) + parser.add_argument( + '--tags-field', + help='Source field name for tags (default: auto-detect)' + ) + parser.add_argument( + '--tags-separator', + default=',', + help='Separator for tags string (default: ,)' + ) + 
parser.add_argument( + '--file-patterns', + nargs='+', + help='File patterns to match (e.g., *.json *.csv)' + ) + parser.add_argument( + '--multi-doc-extensions', + nargs='+', + default=['json', 'jsonl', 'csv', 'xlsx', 'xls'], + help='File extensions (without dot) to treat as multi-document formats. Default: json jsonl csv xlsx xls' + ) + parser.add_argument( + '--log-file', + default='./logs/batch_upload.log', + help='Path to log file (default: ./logs/batch_upload.log)' + ) + + args = parser.parse_args() + + # Setup logging after parsing arguments + setup_logging(args.log_file) + + # Validate data directory + if not os.path.isdir(args.data_dir): + logging.error(f"Error: Data directory '{args.data_dir}' does not exist") + sys.exit(1) + + # Initialize RAGFlow client + try: + rag = RAGFlow(args.api_key, args.host_address) + logging.info(f"Connected to RAGFlow at {args.host_address}") + except Exception as e: + logging.error(f"Failed to initialize RAGFlow client: {e}") + sys.exit(1) + + # Create field mapper if custom mappings provided + field_mapper = None + if any([args.title_field, args.content_field, args.doc_id_field, + args.doc_url_field, args.tags_field]): + field_mapper = FieldMapper( + title_field=args.title_field, + content_field=args.content_field, + doc_id_field=args.doc_id_field, + doc_url_field=args.doc_url_field, + tags_field=args.tags_field, + tags_separator=args.tags_separator + ) + + # Create document extractor + extractor = DocumentExtractor( + field_mapper=field_mapper, + multi_doc_extensions=args.multi_doc_extensions + ) + + # Initialize BatchUploader + uploader = BatchUploader(rag) + + # Upload files + try: + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=args.data_dir, + dataset_id=args.dataset_id, + dataset_name=args.dataset_name, + batch_size=args.batch_size, + snapshot_file=args.snapshot_file, + resume=args.resume, + file_extension=args.file_extension, + file_patterns=args.file_patterns + ) + + logging.info(f"\n✅ Upload completed successfully!") + logging.info(f" Total documents: {total_docs}") + logging.info(f" Total files: {total_files}") + logging.info(f" Dataset ID: {uploader.dataset.id}") + logging.info(f" Dataset Name: {uploader.dataset.name}") + + except KeyboardInterrupt: + logging.warning("\n⚠️ Upload interrupted by user") + logging.info(f"💡 You can resume later using: --resume") + sys.exit(1) + except Exception as e: + logging.error(f"\n❌ Upload failed: {e}") + logging.info(f"💡 You can resume later using: --resume") + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/sdk/python/examples/reparse_failed_documents.py b/sdk/python/examples/reparse_failed_documents.py new file mode 100644 index 000000000..e4115c02a --- /dev/null +++ b/sdk/python/examples/reparse_failed_documents.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +""" +Example script for reparsing failed documents in a RAGFlow dataset. + +This script demonstrates how to use the FailedDocumentReparser tool to +find and reparse all failed documents in a dataset. +""" + +import argparse +import logging +import os +import sys +from logging.handlers import RotatingFileHandler + +# Add parent directory to path to import ragflow_sdk +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if parent_dir not in sys.path: + sys.path.insert(0, parent_dir) + +from ragflow_sdk import RAGFlow +from ragflow_sdk.tools import FailedDocumentReparser + + +def setup_logging(log_file: str = None): + """ + Configure logging to output to both file and console. + + Args: + log_file: Path to log file. If None, defaults to './logs/reparse_failed_documents.log' + """ + if log_file is None: + log_file = './logs/reparse_failed_documents.log' + + # Ensure log directory exists + log_dir = os.path.dirname(log_file) + if log_dir: + os.makedirs(log_dir, exist_ok=True) + + # Create formatter + formatter = logging.Formatter( + '%(asctime)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + # Get root logger + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + # Clear existing handlers + logger.handlers.clear() + + # File handler with rotation + file_handler = RotatingFileHandler( + log_file, + maxBytes=10*1024*1024, # 10MB + backupCount=5 + ) + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + # Console handler + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(logging.INFO) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + + logging.info(f"Logging to file: {os.path.abspath(log_file)}") + + +def main(): + parser = argparse.ArgumentParser( + description='Reparse all failed documents in a RAGFlow dataset', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Reparse failed documents with default batch size (50) + python reparse_failed_documents.py -k YOUR_API_KEY -H http://localhost:9380 -i DATASET_ID + + # Reparse with custom batch size + python reparse_failed_documents.py -k YOUR_API_KEY -H http://localhost:9380 -i DATASET_ID -b 100 + + # Reparse with custom page size for fetching documents + python reparse_failed_documents.py -k YOUR_API_KEY -H http://localhost:9380 -i DATASET_ID --page-size 5000 + """ + ) + parser.add_argument( + '-k', '--api-key', + required=True, + help='RAGFlow API key' + ) + parser.add_argument( + '-H', '--host-address', + required=True, + help='RAGFlow host address (e.g., http://localhost:9380)' + ) + parser.add_argument( + '-i', '--dataset-id', + required=True, + help='Dataset ID to reparse failed documents from' + ) + parser.add_argument( + '-b', '--batch-size', + type=int, + default=1000, + help='Batch size for reparsing documents (default: 50)' + ) + parser.add_argument( + '--page-size', + type=int, + default=10000, + help='Page size for fetching documents (default: 10000)' + ) + parser.add_argument( + '--log-file', + default='./logs/reparse_failed_documents.log', + help='Path to log file (default: ./logs/reparse_failed_documents.log)' + ) + + args = parser.parse_args() + + # Setup logging after parsing arguments + setup_logging(args.log_file) + + # Initialize RAGFlow client + try: + rag = RAGFlow(args.api_key, args.host_address) + logging.info(f"Connected to RAGFlow at {args.host_address}") + except Exception as e: + logging.error(f"Failed to 
initialize RAGFlow client: {e}") + sys.exit(1) + + # Initialize FailedDocumentReparser + reparser = FailedDocumentReparser(rag) + + # Reparse failed documents + try: + total_failed, total_reparsed = reparser.reparse_failed_documents( + dataset_id=args.dataset_id, + reparse_batch_size=args.batch_size, + page_size=args.page_size + ) + + logging.info(f"\n✅ Reparsing completed successfully!") + logging.info(f" Total failed documents: {total_failed}") + logging.info(f" Total reparsed: {total_reparsed}") + if total_failed > 0: + logging.info(f" Success rate: {total_reparsed / total_failed * 100:.2f}%") + + except KeyboardInterrupt: + logging.warning("\n⚠️ Reparsing interrupted by user") + sys.exit(1) + except Exception as e: + logging.error(f"\n❌ Reparsing failed: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/sdk/python/pyproject.toml b/sdk/python/pyproject.toml index ba6facbfe..a62fec1ac 100644 --- a/sdk/python/pyproject.toml +++ b/sdk/python/pyproject.toml @@ -8,18 +8,44 @@ readme = "README.md" requires-python = ">=3.12,<3.15" dependencies = ["requests>=2.30.0,<3.0.0", "beartype>=0.20.0,<1.0.0"] +[project.optional-dependencies] +test = [ + "hypothesis>=6.131.9", + "openpyxl>=3.1.5", + "pandas>=2.0.0", + "pillow>=11.1.0", + "pytest>=8.3.5", + "pytest-cov>=4.1.0", + "pytest-xdist>=3.5.0", + "pytest-mock>=3.12.0", + "pytest-timeout>=2.2.0", + "python-docx>=1.1.2", + "python-pptx>=1.0.2", + "reportlab>=4.3.1", + "requests>=2.32.3", + "requests-toolbelt>=1.0.0", + "xlwt>=1.3.0", + "xlrd>=2.0.1", +] [dependency-groups] test = [ "hypothesis>=6.131.9", "openpyxl>=3.1.5", + "pandas>=2.0.0", "pillow>=11.1.0", "pytest>=8.3.5", + "pytest-cov>=4.1.0", + "pytest-xdist>=3.5.0", + "pytest-mock>=3.12.0", + "pytest-timeout>=2.2.0", "python-docx>=1.1.2", "python-pptx>=1.0.2", "reportlab>=4.3.1", "requests>=2.32.3", "requests-toolbelt>=1.0.0", + "xlwt>=1.3.0", + "xlrd>=2.0.1", ] diff --git a/sdk/python/ragflow_sdk/__init__.py b/sdk/python/ragflow_sdk/__init__.py index ea383cfc3..c040f42a9 100644 --- a/sdk/python/ragflow_sdk/__init__.py +++ b/sdk/python/ragflow_sdk/__init__.py @@ -26,6 +26,7 @@ from .modules.document import Document from .modules.chunk import Chunk from .modules.agent import Agent +from .tools import BatchUploader, DocumentExtractor, FileReader, FieldMapper __version__ = importlib.metadata.version("ragflow_sdk") @@ -36,5 +37,9 @@ "Session", "Document", "Chunk", - "Agent" + "Agent", + "BatchUploader", + "DocumentExtractor", + "FileReader", + "FieldMapper", ] diff --git a/sdk/python/ragflow_sdk/modules/dataset.py b/sdk/python/ragflow_sdk/modules/dataset.py index d2d689da3..6f3646bd0 100644 --- a/sdk/python/ragflow_sdk/modules/dataset.py +++ b/sdk/python/ragflow_sdk/modules/dataset.py @@ -54,14 +54,61 @@ def upload_documents(self, document_list: list[dict]): url = f"/datasets/{self.id}/documents" files = [("file", (ele["display_name"], ele["blob"])) for ele in document_list] res = self.post(path=url, json=None, files=files) - res = res.json() - if res.get("code") == 0: + + # Check response status code + if res.status_code != 200: + raise Exception(f"API request failed with status code {res.status_code}: {res.text[:500]}") + + # Check if response body is empty + if not res.text or not res.text.strip(): + raise Exception(f"API returned empty response (status {res.status_code}). URL: {res.url}") + + # Try to parse JSON + try: + res_json = res.json() + except Exception as e: + raise Exception(f"Failed to parse JSON response (status {res.status_code}): {str(e)}. 
Response text: {res.text[:500]}") + + if res_json.get("code") == 0: doc_list = [] - for doc in res["data"]: + for doc in res_json.get("data", []): document = Document(self.rag, doc) doc_list.append(document) return doc_list - raise Exception(res.get("message")) + raise Exception(res_json.get("message", "Unknown error")) + + def upload_documents_with_meta(self, document_list: list[dict], group_id_field: str = None, file_extension: str = "html"): + url = f"/datasets/{self.id}/documents_with_meta" + docs = [] + for ele in document_list: + docs.append({ + "title": ele["title"], + "content": ele["content"], + "metadata": ele.get("metadata", {}), + }) + res = self.post(path=url, json={"docs": docs, "group_id_field": group_id_field, "file_extension": file_extension}) + + # Check response status code + if res.status_code != 200: + raise Exception(f"API request failed with status code {res.status_code}: {res.text[:500]}") + + # Check if response body is empty + if not res.text or not res.text.strip(): + raise Exception(f"API returned empty response (status {res.status_code}). URL: {res.url}") + + # Try to parse JSON + try: + res_json = res.json() + except Exception as e: + raise Exception(f"Failed to parse JSON response (status {res.status_code}): {str(e)}. Response text: {res.text[:500]}") + + if res_json.get("code") == 0: + doc_list = [] + for doc in res_json.get("data", []): + document = Document(self.rag, doc) + doc_list.append(document) + return doc_list + raise Exception(res_json.get("message", "Unknown error")) def list_documents( self, @@ -87,13 +134,27 @@ def list_documents( "create_time_to": create_time_to, } res = self.get(f"/datasets/{self.id}/documents", params=params) - res = res.json() + + # Check response status code + if res.status_code != 200: + raise Exception(f"API request failed with status code {res.status_code}: {res.text[:500]}") + + # Check if response body is empty + if not res.text or not res.text.strip(): + raise Exception(f"API returned empty response (status {res.status_code}). URL: {res.url}") + + # Try to parse JSON + try: + res_json = res.json() + except Exception as e: + raise Exception(f"Failed to parse JSON response (status {res.status_code}): {str(e)}. Response text: {res.text[:500]}") + documents = [] - if res.get("code") == 0: - for document in res["data"].get("docs"): + if res_json.get("code") == 0: + for document in res_json.get("data", {}).get("docs", []): documents.append(Document(self.rag, document)) return documents - raise Exception(res["message"]) + raise Exception(res_json.get("message", "Unknown error")) def delete_documents(self, ids: list[str] | None = None): res = self.rm(f"/datasets/{self.id}/documents", {"ids": ids}) @@ -150,4 +211,4 @@ def async_cancel_parse_documents(self, document_ids): res = self.rm(f"/datasets/{self.id}/chunks", {"document_ids": document_ids}) res = res.json() if res.get("code") != 0: - raise Exception(res.get("message")) + raise Exception(res.get("message")) \ No newline at end of file diff --git a/sdk/python/ragflow_sdk/ragflow.py b/sdk/python/ragflow_sdk/ragflow.py index da8a3d336..0cb6615c3 100644 --- a/sdk/python/ragflow_sdk/ragflow.py +++ b/sdk/python/ragflow_sdk/ragflow.py @@ -48,6 +48,36 @@ def put(self, path, json): res = requests.put(url=self.api_url + path, json=json, headers=self.authorization_header) return res + def _parse_response(self, res, url=None): + """ + Parse API response with proper error handling. 
+ + Args: + res: requests.Response object + url: Optional URL for error messages + + Returns: + Parsed JSON response as dict + + Raises: + Exception: If response is invalid or contains error + """ + # Check response status code + if res.status_code != 200: + raise Exception(f"API request failed with status code {res.status_code}: {res.text[:500]}") + + # Check if response body is empty + if not res.text or not res.text.strip(): + error_url = url or res.url if hasattr(res, 'url') else 'unknown' + raise Exception(f"API returned empty response (status {res.status_code}). URL: {error_url}") + + # Try to parse JSON + try: + return res.json() + except Exception as e: + error_url = url or res.url if hasattr(res, 'url') else 'unknown' + raise Exception(f"Failed to parse JSON response (status {res.status_code}): {str(e)}. Response text: {res.text[:500]}") + def create_dataset( self, name: str, @@ -70,10 +100,10 @@ def create_dataset( payload["parser_config"] = parser_config.to_json() res = self.post("/datasets", payload) - res = res.json() - if res.get("code") == 0: - return DataSet(self, res["data"]) - raise Exception(res["message"]) + res_json = self._parse_response(res) + if res_json.get("code") == 0: + return DataSet(self, res_json["data"]) + raise Exception(res_json.get("message", "Unknown error")) def delete_datasets(self, ids: list[str] | None = None): res = self.delete("/datasets", {"ids": ids}) @@ -99,13 +129,13 @@ def list_datasets(self, page: int = 1, page_size: int = 30, orderby: str = "crea "name": name, }, ) - res = res.json() + res_json = self._parse_response(res) result_list = [] - if res.get("code") == 0: - for data in res["data"]: + if res_json.get("code") == 0: + for data in res_json.get("data", []): result_list.append(DataSet(self, data)) return result_list - raise Exception(res["message"]) + raise Exception(res_json.get("message", "Unknown error")) def create_chat(self, name: str, avatar: str = "", dataset_ids=None, llm: Chat.LLM | None = None, prompt: Chat.Prompt | None = None) -> Chat: if dataset_ids is None: diff --git a/sdk/python/ragflow_sdk/tools/README.md b/sdk/python/ragflow_sdk/tools/README.md new file mode 100644 index 000000000..a65ea7dc3 --- /dev/null +++ b/sdk/python/ragflow_sdk/tools/README.md @@ -0,0 +1,645 @@ +# RAGFlow SDK Tools API Reference + +本文档介绍 RAGFlow SDK 工具模块的 API 参考。 + +## 目录 + +- [FileReader](#filereader) - 文件读取器 +- [DocumentExtractor](#documentextractor) - 文档提取器 +- [FieldMapper](#fieldmapper) - 字段映射器 +- [BatchUploader](#batchuploader) - 批量上传器 +- [Models](#models) - 数据模型 + +--- + +## FileReader + +文件读取器,支持多种文件格式的批量迭代读取。 + +### 类定义 + +```python +class FileReader: + def __init__( + self, + field_mapper: Optional[FieldMapper] = None, + multi_doc_extensions: Optional[List[str]] = None + ) +``` + +### 参数 + +- `field_mapper` (Optional[FieldMapper]): 字段映射器实例,用于字段映射 +- `multi_doc_extensions` (Optional[List[str]]): 应被视为多文档格式的文件扩展名列表(不含点号) + - 默认: `['json', 'jsonl']` + - 不在列表中的扩展名将被视为单文档格式 + +### 方法 + +#### `is_multi_document_format(file_path: str) -> bool` + +检查文件是否为多文档格式。 + +**参数:** +- `file_path` (str): 文件路径 + +**返回:** +- `bool`: 如果文件包含多个文档返回 `True`,否则返回 `False` + +**示例:** +```python +reader = FileReader() +is_multi = reader.is_multi_document_format("data.json") +``` + +#### `read_file(file_path: str, start_index: int = 0) -> Iterator[Dict[str, Any]]` + +读取文件并生成文档。 + +**参数:** +- `file_path` (str): 文件路径 +- `start_index` (int): 起始文档索引(对于多文档格式,跳过此索引之前的文档) + +**返回:** +- `Iterator[Dict[str, Any]]`: 文档字典的迭代器 + +**示例:** +```python +reader = FileReader() +for doc 
in reader.read_file("data.json"): + print(doc) +``` + +#### `read_files_batch(file_paths: List[str], batch_size: int, processed_files: Optional[List[str]] = None) -> Iterator[List[Dict[str, Any]]]` + +批量读取多个文件。 + +**参数:** +- `file_paths` (List[str]): 要读取的文件路径列表 +- `batch_size` (int): 每批文档数量 +- `processed_files` (Optional[List[str]]): 已处理文件路径列表(将被跳过) + +**返回:** +- `Iterator[List[Dict[str, Any]]]`: 文档批次列表的迭代器 + +**示例:** +```python +reader = FileReader() +file_paths = ["file1.json", "file2.json"] +for batch in reader.read_files_batch(file_paths, batch_size=10): + print(f"Batch size: {len(batch)}") +``` + +### 支持的文件格式 + +#### 多文档格式 +- **JSON**: 数组格式(以 `[` 开头) +- **JSONL**: 每行一个 JSON 对象 +- **CSV**: 每行一个文档(第一行为表头) +- **XLSX/XLS**: Excel 文件,每行一个文档(第一行为表头) + +#### 单文档格式 +- **PDF**: PDF 文档 +- **Office**: Word (.docx, .doc), PowerPoint (.pptx, .ppt) +- **HTML**: HTML 文件 (.html, .htm) +- **Markdown**: Markdown 文件 (.md, .markdown) +- **文本**: 文本文件 (.txt) +- **图片**: 图片文件 (.jpg, .jpeg, .png, .gif, .bmp, .tiff, .webp) +- **其他**: 邮件 (.eml), EPUB (.epub) + +--- + +## DocumentExtractor + +文档提取器,从文件/目录中提取文档。 + +### 类定义 + +```python +class DocumentExtractor: + def __init__( + self, + field_mapper: Optional[FieldMapper] = None, + multi_doc_extensions: Optional[List[str]] = None + ) +``` + +### 参数 + +- `field_mapper` (Optional[FieldMapper]): 字段映射器实例 +- `multi_doc_extensions` (Optional[List[str]]): 多文档格式扩展名列表 + - 默认: `['json', 'jsonl', 'csv', 'xlsx', 'xls']` + +### 方法 + +#### `extract_documents(data_dir: str, file_patterns: Optional[List[str]] = None, processed_files: Optional[List[str]] = None) -> Iterator[Tuple[Dict[str, Any], str]]` + +从目录中的文件提取文档。 + +**参数:** +- `data_dir` (str): 包含文件的目录 +- `file_patterns` (Optional[List[str]]): 可选的文件匹配模式列表 +- `processed_files` (Optional[List[str]]): 已处理文件路径列表(将被跳过) + +**返回:** +- `Iterator[Tuple[Dict[str, Any], str]]`: 文档字典和文件路径的元组迭代器 + +**示例:** +```python +extractor = DocumentExtractor() +for doc, file_path in extractor.extract_documents("/path/to/files"): + print(f"Document from {file_path}: {doc['title']}") +``` + +#### `extract_batches(data_dir: str, batch_size: int, file_patterns: Optional[List[str]] = None, file_cursor: Optional[Dict[str, int]] = None) -> Iterator[Tuple[List[Dict[str, Any]], str, bool]]` + +从目录中的文件批量提取文档。 + +**参数:** +- `data_dir` (str): 包含文件的目录 +- `batch_size` (int): 每批文档数量 +- `file_patterns` (Optional[List[str]]): 可选的文件匹配模式列表 +- `file_cursor` (Optional[Dict[str, int]]): 可选的文件路径到文档索引的映射字典,用于从指定索引恢复 + +**返回:** +- `Iterator[Tuple[List[Dict[str, Any]], str, bool]]`: 文档批次列表、当前文件路径和文件是否完成的元组迭代器 + +**示例:** +```python +extractor = DocumentExtractor() +for batch, file_path, is_complete in extractor.extract_batches( + data_dir="/path/to/files", + batch_size=10 +): + print(f"Batch from {file_path}: {len(batch)} documents") + if is_complete: + print(f"File {file_path} completed") +``` + +--- + +## FieldMapper + +字段映射器,将源文档字段转换为 RAGFlow 标准格式。 + +### 类定义 + +```python +class FieldMapper: + def __init__( + self, + title_field: Optional[str] = None, + content_field: Optional[str] = None, + doc_id_field: Optional[str] = None, + doc_url_field: Optional[str] = None, + tags_field: Optional[str] = None, + tags_separator: str = ',', + config: Optional[FieldMappingConfig] = None + ) +``` + +### 参数 + +- `title_field` (Optional[str]): 标题字段名(None = 自动检测) +- `content_field` (Optional[str]): 内容字段名(None = 自动检测) +- `doc_id_field` (Optional[str]): 文档ID字段名(None = 自动检测) +- `doc_url_field` (Optional[str]): 文档URL字段名(None = 自动检测) +- `tags_field` (Optional[str]): 标签字段名(None = 自动检测) +- 
`tags_separator` (str): 标签分隔符(默认: `','`) +- `config` (Optional[FieldMappingConfig]): 字段映射配置实例(如果提供,将覆盖单独字段) + +### 方法 + +#### `map(doc: Dict[str, Any]) -> Dict[str, Any]` + +将源文档映射为 RAGFlow 标准格式。 + +**参数:** +- `doc` (Dict[str, Any]): 源文档字典 + +**返回:** +- `Dict[str, Any]`: RAGFlow 格式的映射文档 + +**标准格式:** +```python +{ + "title": str, + "content": str, + "metadata": { + "doc_id": str (optional), + "doc_url": str (optional), + "tags": List[str] (optional) + } +} +``` + +**示例:** +```python +mapper = FieldMapper( + title_field="article_title", + content_field="article_body", + tags_field="categories" +) + +source_doc = { + "article_title": "My Article", + "article_body": "Content here...", + "categories": "tech, python" +} + +mapped_doc = mapper.map(source_doc) +# Result: +# { +# "title": "My Article", +# "content": "Content here...", +# "metadata": { +# "tags": ["tech", "python"] +# } +# } +``` + +### 自动字段检测 + +如果不指定字段映射,工具会自动检测以下字段名: + +- **title**: `title`, `name`, `subject`, `heading`, `header` +- **content**: `content`, `text`, `body`, `description`, `desc`, `data` +- **doc_id**: `id`, `doc_id`, `_id`, `document_id`, `docid` +- **doc_url**: `url`, `link`, `uri`, `doc_url`, `source_url` +- **tags**: `tags`, `tag`, `categories`, `category`, `labels`, `label` + +### 标签格式支持 + +支持以下标签格式: + +- **数组格式**: `["tag1", "tag2", "tag3"]` +- **逗号分隔字符串**: `"tag1, tag2, tag3"` 或 `"tag1,tag2,tag3"`(空格会被自动去除) + +--- + +## BatchUploader + +批量上传器,用于将大量文档上传到 RAGFlow。 + +### 类定义 + +```python +class BatchUploader: + def __init__(self, rag: RAGFlow, dataset: Optional[DataSet] = None) +``` + +### 参数 + +- `rag` (RAGFlow): RAGFlow 客户端实例 +- `dataset` (Optional[DataSet]): 可选的数据集实例 + +### 方法 + +#### `set_dataset(dataset: DataSet)` + +设置要用于上传的数据集。 + +**参数:** +- `dataset` (DataSet): 数据集实例 + +#### `get_or_create_dataset(dataset_id: Optional[str] = None, dataset_name: Optional[str] = None) -> DataSet` + +获取现有数据集或创建新数据集。 + +**参数:** +- `dataset_id` (Optional[str]): 可选的现有数据集 ID +- `dataset_name` (Optional[str]): 可选的新数据集名称(默认: 自动生成) + +**返回:** +- `DataSet`: 数据集实例 + +**示例:** +```python +uploader = BatchUploader(rag) +dataset = uploader.get_or_create_dataset(dataset_name="My Dataset") +``` + +#### `upload(document_extractor: DocumentExtractor, data_dir: str, dataset_id: Optional[str] = None, dataset_name: Optional[str] = None, batch_size: int = 5, snapshot_file: str = "upload_snapshot.json", resume: bool = False, file_extension: str = "txt", file_patterns: Optional[List[str]] = None) -> Tuple[int, int]` + +从目录上传文档到 RAGFlow。 + +**参数:** +- `document_extractor` (DocumentExtractor): 文档提取器实例 +- `data_dir` (str): 包含要上传文件的目录 +- `dataset_id` (Optional[str]): 可选的数据集 ID +- `dataset_name` (Optional[str]): 可选的新数据集名称 +- `batch_size` (int): 每批文档数量(默认: 5) +- `snapshot_file` (str): 用于断点续传的快照文件路径(默认: "upload_snapshot.json") +- `resume` (bool): 是否从快照恢复(默认: False) +- `file_extension` (str): 上传文档的文件扩展名(默认: "txt") +- `file_patterns` (Optional[List[str]]): 可选的文件匹配模式列表 + +**返回:** +- `Tuple[int, int]`: (已处理文档总数, 已处理文件总数) + +**示例:** +```python +from ragflow_sdk import RAGFlow +from ragflow_sdk.tools import BatchUploader, DocumentExtractor + +rag = RAGFlow(api_key="YOUR_API_KEY", base_url="http://localhost:9380") +extractor = DocumentExtractor() +uploader = BatchUploader(rag) + +total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir="/path/to/files", + batch_size=10, + resume=True +) +``` + +#### `retry_with_backoff(func, max_retries: int = 10, max_backoff: int = 8)` + +带指数退避的重试包装器(静态方法)。 + +**参数:** +- `func`: 要重试的函数 +- 
`max_retries` (int): 最大重试次数(默认: 10) +- `max_backoff` (int): 最大重试间隔(秒)(默认: 8) + +**返回:** +- 带重试逻辑的包装函数 + +#### `save_snapshot(snapshot_file: str, file_cursors: List[FileCursor], total_processed: int, dataset_id: Optional[str] = None)` + +保存处理快照到文件(静态方法)。 + +**参数:** +- `snapshot_file` (str): 快照文件路径 +- `file_cursors` (List[FileCursor]): 文件游标实体列表,每个游标跟踪一个文件的处理进度 +- `total_processed` (int): 已处理文档总数 +- `dataset_id` (Optional[str]): 可选的数据集 ID + +#### `load_snapshot(snapshot_file: str) -> Optional[Snapshot]` + +从文件加载处理快照(静态方法)。 + +**参数:** +- `snapshot_file` (str): 快照文件路径 + +**返回:** +- `Optional[Snapshot]`: 快照实体,如果文件不存在或无效则返回 None + +### 特性 + +- ✅ 迭代器模式的批量处理 +- ✅ 基于快照的断点续传支持 +- ✅ 自动重试(指数退避) +- ✅ 文件级和文档索引级的恢复支持 + +--- + +## Models + +### FileType + +文件类型枚举。 + +```python +class FileType(Enum): + JSON_ARRAY = "json_array" + JSONL = "jsonl" + CSV = "csv" + EXCEL = "excel" + SINGLE = "single" +``` + +### DocumentFormat + +文档格式枚举。 + +```python +class DocumentFormat(Enum): + MULTI_DOC = "multi_doc" # 一个文件包含多个文档 + SINGLE_DOC = "single_doc" # 一个文件对应一个文档 +``` + +### DocumentMetadata + +文档元数据实体。 + +```python +@dataclass +class DocumentMetadata: + doc_id: Optional[str] = None + doc_url: Optional[str] = None + tags: Optional[List[str]] = None + + def to_dict(self) -> Dict[str, Any] + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "DocumentMetadata" + def is_empty(self) -> bool +``` + +**方法说明:** +- `to_dict()`: 将元数据转换为字典格式 +- `from_dict()`: 从字典创建元数据实体 +- `is_empty()`: 检查元数据是否为空(所有字段都为 None) + +### Document + +RAGFlow 标准格式的文档实体。 + +```python +@dataclass +class Document: + title: str + content: str + metadata: Optional[DocumentMetadata] = None + + def to_dict(self) -> Dict + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "Document" +``` + +### FieldMappingConfig + +字段映射配置。 + +```python +@dataclass +class FieldMappingConfig: + title_field: Optional[str] = None + content_field: Optional[str] = None + doc_id_field: Optional[str] = None + doc_url_field: Optional[str] = None + tags_field: Optional[str] = None + tags_separator: str = ',' +``` + +### FileCursor + +文件游标实体,用于跟踪单个文件的文档处理进度。 + +```python +@dataclass +class FileCursor: + file_path: str + doc_index: int + + def to_dict(self) -> Dict[str, Any] + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "FileCursor" +``` + +**字段说明:** +- `file_path` (str): 文件路径 +- `doc_index` (int): 下一个要处理的文档索引(索引 >= 此值的文档已处理) + +**方法说明:** +- `to_dict()`: 将文件游标转换为字典格式 +- `from_dict()`: 从字典创建文件游标实体 + +### Snapshot + +处理快照实体,用于断点续传支持。 + +```python +@dataclass +class Snapshot: + file_cursors: List[FileCursor] = field(default_factory=list) + total_processed: int = 0 + timestamp: float = 0.0 + dataset_id: Optional[str] = None + + def get_cursor(self, file_path: str) -> Optional[FileCursor] + def set_cursor(self, file_path: str, doc_index: int) -> None + def remove_cursor(self, file_path: str) -> None + def to_dict(self) -> Dict[str, Any] + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "Snapshot" +``` + +**字段说明:** +- `file_cursors` (List[FileCursor]): 文件游标实体列表,每个游标跟踪一个文件的处理进度 +- `total_processed` (int): 已处理文档总数 +- `timestamp` (float): 快照创建时间戳 +- `dataset_id` (Optional[str]): 可选的数据集 ID + +**方法说明:** +- `get_cursor(file_path)`: 获取指定文件路径的游标 +- `set_cursor(file_path, doc_index)`: 设置或更新文件路径的游标 +- `remove_cursor(file_path)`: 移除文件路径的游标 +- `to_dict()`: 将快照转换为字典格式 +- `from_dict()`: 从字典创建快照实体 + +### BatchInfo + +批量处理信息。 + +```python +@dataclass +class BatchInfo: + batch_documents: List[Dict[str, Any]] + file_path: str + is_file_complete: bool +``` + +--- + 
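+### Example: inspecting and updating a snapshot
+
+The `Snapshot` and `FileCursor` entities above are what `--resume` is built on. The sketch below uses only the `BatchUploader`, `Snapshot`, and `FileCursor` signatures documented in this file; the file path and document index passed to `set_cursor` are illustrative values, not fixed defaults.
+
+```python
+from ragflow_sdk.tools import BatchUploader
+
+# Load an existing snapshot; returns None if the file is missing or invalid.
+snapshot = BatchUploader.load_snapshot("upload_snapshot.json")
+
+if snapshot is not None:
+    print(f"Documents processed so far: {snapshot.total_processed}")
+    for cursor in snapshot.file_cursors:
+        print(f"  {cursor.file_path}: next document index {cursor.doc_index}")
+
+    # Move the cursor for one file forward, e.g. after uploading a batch manually.
+    snapshot.set_cursor("/path/to/files/data.jsonl", 120)
+
+    # Persist the updated progress so a later --resume run starts from here.
+    BatchUploader.save_snapshot(
+        "upload_snapshot.json",
+        snapshot.file_cursors,
+        snapshot.total_processed,
+        dataset_id=snapshot.dataset_id,
+    )
+```
+
+---
+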
+## 使用示例 + +### 基本用法 + +```python +from ragflow_sdk import RAGFlow +from ragflow_sdk.tools import BatchUploader, DocumentExtractor, FieldMapper + +# 初始化 RAGFlow 客户端 +rag = RAGFlow(api_key="YOUR_API_KEY", base_url="http://localhost:9380") + +# 创建字段映射器(可选) +field_mapper = FieldMapper( + title_field="article_title", + content_field="article_body", + tags_field="categories" +) + +# 创建文档提取器 +extractor = DocumentExtractor(field_mapper=field_mapper) + +# 创建批量上传器 +uploader = BatchUploader(rag) + +# 上传文档 +total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir="/path/to/files", + batch_size=10, + resume=True +) + +print(f"Uploaded {total_docs} documents from {total_files} files") +``` + +### 使用迭代器模式 + +```python +from ragflow_sdk import RAGFlow +from ragflow_sdk.tools import BatchUploader, DocumentExtractor + +rag = RAGFlow(api_key="YOUR_API_KEY", base_url="http://localhost:9380") +extractor = DocumentExtractor() +uploader = BatchUploader(rag) + +# 获取或创建数据集 +uploader.get_or_create_dataset(dataset_name="My Dataset") + +# 使用迭代器模式提取和上传 +for batch, file_path, is_file_complete in extractor.extract_batches( + data_dir="/path/to/files", + batch_size=10 +): + # 可以在这里添加自定义处理逻辑 + print(f"Processing batch from {file_path}: {len(batch)} documents") + + # 手动上传批次 + uploader.dataset.upload_documents_with_meta(batch, file_extension="txt") + + if is_file_complete: + print(f"File {file_path} completed") +``` + +### 自定义字段映射 + +```python +from ragflow_sdk.tools import FieldMapper + +# 创建自定义字段映射器 +mapper = FieldMapper( + title_field="article_title", + content_field="article_body", + doc_id_field="article_id", + doc_url_field="article_url", + tags_field="categories", + tags_separator=";" +) + +# 映射文档 +source_doc = { + "article_title": "My Article", + "article_body": "Content...", + "article_id": "123", + "article_url": "https://example.com/article", + "categories": "tech;python;ai" +} + +mapped_doc = mapper.map(source_doc) +``` + +--- + +## 注意事项 + +1. **内存优化**: 工具使用迭代器模式,支持懒加载,不会一次性加载所有文件到内存 +2. **断点续传**: 支持文件级和文档索引级的断点续传,适合处理大量文档 +3. **错误处理**: 包含自动重试机制(指数退避),默认最多重试 10 次 +4. **文件顺序**: 工具会按文件名排序处理文件,这对于断点续传很重要 +5. **Excel 支持**: 需要安装 `pandas` 和 `openpyxl`:`pip install pandas openpyxl` + diff --git a/sdk/python/ragflow_sdk/tools/__init__.py b/sdk/python/ragflow_sdk/tools/__init__.py new file mode 100644 index 000000000..1625654c4 --- /dev/null +++ b/sdk/python/ragflow_sdk/tools/__init__.py @@ -0,0 +1,39 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from .batch_uploader import BatchUploader +from .document_extractor import DocumentExtractor +from .file_reader import FileReader +from .field_mapper import FieldMapper +from .reparse_failed_documents import FailedDocumentReparser +from .models import FileType, DocumentFormat, Document, DocumentMetadata, FieldMappingConfig, FileCursor, Snapshot, BatchInfo + +__all__ = [ + "BatchUploader", + "DocumentExtractor", + "FileReader", + "FieldMapper", + "FailedDocumentReparser", + "FileType", + "DocumentFormat", + "Document", + "DocumentMetadata", + "FieldMappingConfig", + "FileCursor", + "Snapshot", + "BatchInfo", +] + diff --git a/sdk/python/ragflow_sdk/tools/batch_uploader.py b/sdk/python/ragflow_sdk/tools/batch_uploader.py new file mode 100644 index 000000000..eab4b2976 --- /dev/null +++ b/sdk/python/ragflow_sdk/tools/batch_uploader.py @@ -0,0 +1,373 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import json +import os +import time +import logging +from typing import Optional, Dict, List, Tuple, Iterator + +from ..ragflow import RAGFlow +from ..modules.dataset import DataSet +from .document_extractor import DocumentExtractor +from .models import Snapshot, FileCursor + + +class BatchUploader: + """ + Batch uploader for uploading large volumes of documents to RAGFlow. + + This class handles the upload logic, receiving document batches from + a DocumentExtractor iterator and uploading them to RAGFlow. + + Features: + - Iterator-based batch processing + - Snapshot-based resume support + - Automatic retry with exponential backoff + """ + + def __init__(self, rag: RAGFlow, dataset: Optional[DataSet] = None): + """ + Initialize BatchUploader. + + Args: + rag: RAGFlow client instance + dataset: Optional dataset instance + """ + self.rag = rag + self.dataset = dataset + self.logger = logging.getLogger(__name__) + + @staticmethod + def retry_with_backoff(func, max_retries: int = 10, max_backoff: int = 8): + """ + Retry wrapper with exponential backoff. + + Args: + func: Function to retry + max_retries: Maximum number of retry attempts (default: 10) + max_backoff: Maximum retry interval in seconds (default: 8) + + Returns: + Wrapped function with retry logic + """ + def wrapper(*args, **kwargs): + logger = logging.getLogger(__name__) + for attempt in range(max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + if attempt < max_retries - 1: + backoff_time = min(2 ** attempt, max_backoff) + logger.warning(f" ⚠️ Error: {str(e)}") + logger.info(f" Retrying in {backoff_time}s... (attempt {attempt + 1}/{max_retries})") + time.sleep(backoff_time) + else: + logger.error(f" ❌ Failed after {max_retries} attempts: {str(e)}") + raise + return wrapper + + @staticmethod + def save_snapshot(snapshot_file: str, file_cursors: List[FileCursor], + total_processed: int, dataset_id: Optional[str] = None): + """ + Save processing snapshot to file. 
+ + Args: + snapshot_file: Path to snapshot file + file_cursors: List of FileCursor entities, each tracking a file's processing progress + total_processed: Total number of documents processed + dataset_id: Optional dataset ID + """ + snapshot = Snapshot( + file_cursors=file_cursors, + total_processed=total_processed, + timestamp=time.time(), + dataset_id=dataset_id + ) + + # Ensure directory exists + os.makedirs(os.path.dirname(snapshot_file) if os.path.dirname(snapshot_file) else '.', exist_ok=True) + + with open(snapshot_file, 'w', encoding='utf-8') as f: + json.dump(snapshot.to_dict(), f, indent=2) + + @staticmethod + def load_snapshot(snapshot_file: str) -> Optional[Snapshot]: + """ + Load processing snapshot from file. + + Args: + snapshot_file: Path to snapshot file + + Returns: + Snapshot entity or None if file doesn't exist or is invalid + """ + if not os.path.exists(snapshot_file): + return None + try: + with open(snapshot_file, 'r', encoding='utf-8') as f: + data = json.load(f) + return Snapshot.from_dict(data) + except Exception as e: + logger = logging.getLogger(__name__) + logger.warning(f"Warning: Failed to load snapshot: {e}") + return None + + def set_dataset(self, dataset: DataSet): + """Set the dataset to use for uploading.""" + self.dataset = dataset + + def get_or_create_dataset(self, dataset_id: Optional[str] = None, + dataset_name: Optional[str] = None) -> DataSet: + """ + Get existing dataset or create a new one. + + Args: + dataset_id: Optional existing dataset ID + dataset_name: Optional name for new dataset (default: auto-generated) + + Returns: + Dataset instance + """ + if dataset_id: + datasets = self.rag.list_datasets(id=dataset_id) + if not datasets: + raise Exception(f"Dataset with ID '{dataset_id}' not found") + self.dataset = datasets[0] + self.logger.info(f"Using existing dataset: {self.dataset.name} (ID: {self.dataset.id})") + else: + if not dataset_name: + dataset_name = f"batch_upload_{time.strftime('%Y%m%d_%H%M%S')}" + self.dataset = self.rag.create_dataset(name=dataset_name) + self.logger.info(f"Created new dataset: {self.dataset.name} (ID: {self.dataset.id})") + + return self.dataset + + def _upload_batches( + self, + batch_iterator: Iterator[Tuple[List[Dict], str, bool]], + snapshot_file: str = "upload_snapshot.json", + file_extension: str = "txt" + ) -> Iterator[Tuple[List[Dict], str, bool]]: + """ + Internal method: Upload document batches from a document extractor iterator. + + This is a private method used internally by upload(). It receives batches + from a DocumentExtractor iterator and uploads them to RAGFlow. 
+ + Args: + batch_iterator: Iterator from DocumentExtractor.extract_batches() + snapshot_file: Path to snapshot file for resume support + file_extension: File extension for uploaded documents + + Yields: + Tuples of (batch_documents, current_file_path, is_file_complete) + - batch_documents: List of document dictionaries (already uploaded) + - current_file_path: Path of the file currently being processed + - is_file_complete: True if this is the last batch from the current file + """ + # Upload function with retry + def upload_batch(batch: List[Dict]): + """Upload a batch of documents.""" + if not batch: + return + + self.logger.info(f" Uploading batch of {len(batch)} documents...") + docs = self.dataset.upload_documents_with_meta(batch, file_extension=file_extension) + self.logger.info(f" Successfully uploaded {len(docs)} documents") + return len(docs) + + upload_batch_with_retry = self.retry_with_backoff(upload_batch, max_retries=10, max_backoff=8) + + # Process batches from iterator + for batch, file_path, is_file_complete in batch_iterator: + # Upload batch with retry + upload_batch_with_retry(batch) + + # Yield batch info (for tracking purposes) + yield batch, file_path, is_file_complete + + def upload( + self, + document_extractor: DocumentExtractor, + data_dir: str, + dataset_id: Optional[str] = None, + dataset_name: Optional[str] = None, + batch_size: int = 5, + snapshot_file: str = "upload_snapshot.json", + resume: bool = False, + file_extension: str = "txt", + file_patterns: Optional[List[str]] = None + ) -> Tuple[int, int]: + """ + Upload documents from directory to RAGFlow. + + This is a convenience method that uses DocumentExtractor to extract + documents and then uploads them automatically. + + Args: + document_extractor: DocumentExtractor instance + data_dir: Directory containing files to upload + dataset_id: Optional dataset ID to use + dataset_name: Optional name for new dataset + batch_size: Number of documents per batch + snapshot_file: Path to snapshot file for resume support + resume: Whether to resume from snapshot + file_extension: File extension for uploaded documents if title does not have an extension + file_patterns: Optional list of file patterns to match + + Returns: + Tuple of (total_processed_docs, total_files_processed) + """ + if not self.dataset: + self.get_or_create_dataset(dataset_id, dataset_name) + + # Load snapshot + snapshot = None + file_cursors: List[FileCursor] = [] + total_processed = 0 + + if resume: + snapshot = self.load_snapshot(snapshot_file) + if snapshot: + file_cursors = snapshot.file_cursors + total_processed = snapshot.total_processed + + # Restore dataset_id from snapshot if not provided + if not dataset_id and snapshot.dataset_id: + dataset_id = snapshot.dataset_id + if not self.dataset or self.dataset.id != dataset_id: + datasets = self.rag.list_datasets(id=dataset_id) + if datasets: + self.dataset = datasets[0] + self.logger.info(f"📍 Restored dataset_id from snapshot: {dataset_id}") + + self.logger.info(f"📍 Resuming: {len(file_cursors)} files in progress, {total_processed} documents processed") + else: + self.logger.warning("⚠️ No valid snapshot found, starting from beginning") + + # Track current file being processed and document index + current_file_path = None + current_file_doc_index = {} # Track current doc index for each file + + # Convert FileCursor list to dict for backward compatibility with extract_batches + file_cursor_dict = None + if resume and file_cursors: + file_cursor_dict = {cursor.file_path: cursor.doc_index for 
cursor in file_cursors} + + # Exclude snapshot file from processing + exclude_files = [snapshot_file] if os.path.exists(snapshot_file) else [] + + # Get batch iterator from document extractor + batch_iterator = document_extractor.extract_batches( + data_dir=data_dir, + batch_size=batch_size, + file_patterns=file_patterns, + file_cursor=file_cursor_dict, + exclude_files=exclude_files + ) + + # Create a snapshot instance to manage file cursors + current_snapshot = Snapshot( + file_cursors=file_cursors.copy() if file_cursors else [], + total_processed=total_processed, + dataset_id=self.dataset.id if self.dataset else None + ) + + # Track documents processed in this session (for resume mode) + docs_processed_this_session = 0 + + # Process batches using internal _upload_batches method + try: + for batch, file_path, is_file_complete in self._upload_batches( + batch_iterator=batch_iterator, + snapshot_file=snapshot_file, + file_extension=file_extension + ): + current_file_path = file_path + current_snapshot.total_processed += len(batch) + docs_processed_this_session += len(batch) + + # Update file cursor: track how many documents we've processed from this file + if file_path not in current_file_doc_index: + # Start from cursor position or 0 + cursor = current_snapshot.get_cursor(file_path) + current_file_doc_index[file_path] = cursor.doc_index if cursor else 0 + + # Increment document index for this file + current_file_doc_index[file_path] += len(batch) + + # Update snapshot after successful upload + # For multi-doc files, update cursor to next index + # For single-doc files, if complete, set cursor to 1 (or any > 0) + if is_file_complete: + # File is complete, mark as fully processed + # For single-doc files, cursor > 0 means processed + # For multi-doc files, cursor = total docs means processed + current_snapshot.set_cursor(file_path, current_file_doc_index[file_path]) + self.logger.info(f" File completed: {os.path.basename(file_path)} (processed {current_file_doc_index[file_path]} documents)") + else: + # File not complete yet, update cursor to current position + current_snapshot.set_cursor(file_path, current_file_doc_index[file_path]) + + # Save snapshot after each successful batch upload + self.save_snapshot(snapshot_file, current_snapshot.file_cursors, current_snapshot.total_processed, current_snapshot.dataset_id) + + except Exception as e: + self.logger.error(f"Upload failed: {e}") + # Save snapshot before re-raising + self.save_snapshot(snapshot_file, current_snapshot.file_cursors, current_snapshot.total_processed, current_snapshot.dataset_id) + raise + + # Count fully processed files (files with cursor > 0 for single-doc, or cursor >= total docs for multi-doc) + # Count only files that were processed or completed in this session + fully_processed_files = 0 + initial_file_cursors = {cursor.file_path: cursor.doc_index for cursor in file_cursors} if resume else {} + + for cursor in current_snapshot.file_cursors: + if cursor.doc_index > 0: + # Check if this file was processed in this session + # If resume mode, only count files that were processed/completed in this session + if resume: + initial_index = initial_file_cursors.get(cursor.file_path, 0) + if cursor.doc_index > initial_index: + fully_processed_files += 1 + else: + fully_processed_files += 1 + + # Return documents processed in this session (not total) + # For resume mode, return only newly processed documents + # For normal mode, return total processed documents + if resume: + total_processed = docs_processed_this_session + else: + 
total_processed = current_snapshot.total_processed + + self.logger.info("\n" + "=" * 60) + self.logger.info(f"✅ Upload completed!") + self.logger.info(f" Documents processed: {total_processed}") + if resume: + self.logger.info(f" Total documents (including previous): {current_snapshot.total_processed}") + self.logger.info(f" Files processed: {fully_processed_files}") + + # Clean up snapshot on successful completion + if os.path.exists(snapshot_file): + os.remove(snapshot_file) + self.logger.info(f"🧹 Snapshot file removed: {snapshot_file}") + + return total_processed, fully_processed_files + diff --git a/sdk/python/ragflow_sdk/tools/document_extractor.py b/sdk/python/ragflow_sdk/tools/document_extractor.py new file mode 100644 index 000000000..ef69a0591 --- /dev/null +++ b/sdk/python/ragflow_sdk/tools/document_extractor.py @@ -0,0 +1,222 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import json +import os +import glob +import logging +from typing import Iterator, List, Dict, Optional, Tuple, Any +from pathlib import Path + +from .file_reader import FileReader +from .field_mapper import FieldMapper + + +class DocumentExtractor: + """ + Document extractor that extracts documents from files/directories. + + This class provides an iterator interface for extracting documents + from various file formats with lazy loading support. + """ + + def __init__(self, field_mapper: Optional[FieldMapper] = None, + multi_doc_extensions: Optional[List[str]] = ['json', 'jsonl']): + """ + Initialize DocumentExtractor. + + Args: + field_mapper: Optional FieldMapper instance for field mapping + multi_doc_extensions: List of file extensions (without dot) that should be + treated as multi-document formats. + Default: ['json', 'jsonl', 'csv', 'xlsx', 'xls'] + Files with these extensions will be parsed as containing + multiple documents. If not in this list, files will be + treated as single-document format. + """ + self.field_mapper = field_mapper or FieldMapper() + self.multi_doc_extensions = multi_doc_extensions + self.file_reader = FileReader(field_mapper=self.field_mapper, + multi_doc_extensions=multi_doc_extensions) + self.logger = logging.getLogger(__name__) + + def _get_file_list(self, data_dir: str, file_patterns: Optional[List[str]] = None) -> List[str]: + """ + Get list of files to process. 
+ + Args: + data_dir: Directory containing files + file_patterns: Optional list of file patterns (e.g., ['*.json', '*.csv']) + If None, uses all supported patterns + + Returns: + Sorted list of file paths (sorted by filename for consistent processing order) + """ + if file_patterns is None: + # Include all PowerRAG supported formats + # Multi-doc formats + file_patterns = ['*.json', '*.jsonl', '*.csv', '*.xlsx', '*.xls'] + # Single-doc formats (PowerRAG supported) + file_patterns.extend([ + '*.pdf', '*.docx', '*.doc', '*.pptx', '*.ppt', + '*.html', '*.htm', '*.md', '*.markdown', '*.txt', + '*.eml', '*.epub', '*.jpg', '*.jpeg', '*.png', '*.gif', '*.bmp', '*.tiff', '*.webp' + ]) + + file_list = [] + for pattern in file_patterns: + pattern_path = os.path.join(data_dir, pattern) + file_list.extend(glob.glob(pattern_path)) + + # Remove duplicates and sort by filename (important for snapshot consistency) + file_list = sorted(list(set(file_list))) + return file_list + + def extract_documents( + self, + data_dir: str, + file_patterns: Optional[List[str]] = None, + processed_files: Optional[List[str]] = None + ) -> Iterator[Tuple[Dict[str, Any], str]]: + """ + Extract documents from files in a directory. + + This method returns an iterator that yields documents lazily. + Each document is yielded as a tuple of (document_dict, file_path). + + Args: + data_dir: Directory containing files to extract + file_patterns: Optional list of file patterns to match + processed_files: Optional list of already processed file paths (will be skipped) + + Yields: + Tuples of (document_dict, file_path) + - document_dict: Document dictionary in RAGFlow standard format + - file_path: Path of the file containing this document + """ + # Get file list + file_list = self._get_file_list(data_dir, file_patterns) + self.logger.info(f"Found {len(file_list)} files to process") + + processed_set = set(processed_files or []) + + for file_path in file_list: + # Skip already processed files + if file_path in processed_set: + self.logger.debug(f"Skipping already processed file: {file_path}") + continue + + self.logger.info(f"Processing file: {os.path.basename(file_path)}") + + try: + # Read file and yield documents lazily + for doc in self.file_reader.read_file(file_path): + yield doc, file_path + except Exception as e: + self.logger.error(f"Error reading file {file_path}: {e}") + # Continue with next file + continue + + def extract_batches( + self, + data_dir: str, + batch_size: int, + file_patterns: Optional[List[str]] = None, + file_cursor: Optional[Dict[str, int]] = None, + exclude_files: Optional[List[str]] = None + ) -> Iterator[Tuple[List[Dict[str, Any]], str, bool]]: + """ + Extract documents in batches from files in a directory. + + This method returns an iterator that yields batches of documents. + Each batch is loaded lazily only when requested. 
+ + Args: + data_dir: Directory containing files to extract + batch_size: Number of documents per batch + file_patterns: Optional list of file patterns to match + file_cursor: Optional dictionary mapping file paths to document indices + {file_path: doc_index} - resume from this index for each file + exclude_files: Optional list of file paths to exclude from processing + + Yields: + Tuples of (batch_documents, current_file_path, is_file_complete) + - batch_documents: List of document dictionaries ready for upload + - current_file_path: Path of the file currently being processed + - is_file_complete: True if this is the last batch from the current file + """ + # Get file list (sorted by filename) + file_list = self._get_file_list(data_dir, file_patterns) + + # Exclude specified files (e.g., snapshot files) + if exclude_files: + exclude_set = set(exclude_files) + file_list = [f for f in file_list if f not in exclude_set] + + self.logger.info(f"Found {len(file_list)} files to process") + + file_cursor = file_cursor or {} + current_batch = [] + current_file_path = None + + for file_path in file_list: + # Get start index for this file (0 if not in cursor, or cursor value) + start_index = file_cursor.get(file_path, 0) + + # Check if file is fully processed + # For multi-doc files, we need to check if there are more docs + # For single-doc files, if start_index > 0, it's processed + if start_index > 0 and not self.file_reader.is_multi_document_format(file_path): + # Single-doc file already processed + self.logger.debug(f"Skipping already processed file: {file_path}") + continue + + current_file_path = file_path + + if start_index > 0: + self.logger.info(f"Resuming file: {os.path.basename(file_path)} from document index {start_index}") + else: + self.logger.info(f"Processing file: {os.path.basename(file_path)}") + + try: + # Read file lazily (iterator-based, doesn't load entire file into memory) + # Pass start_index to skip already processed documents + doc_iterator = self.file_reader.read_file(file_path, start_index=start_index) + doc_count = 0 + + for doc in doc_iterator: + current_batch.append(doc) + doc_count += 1 + + # Yield batch when it reaches batch_size + if len(current_batch) >= batch_size: + yield current_batch, current_file_path, False + current_batch = [] + + # Yield remaining documents in final batch for this file + if current_batch: + yield current_batch, current_file_path, True + current_batch = [] + + except Exception as e: + self.logger.error(f"Error reading file {file_path}: {e}") + # Continue with next file + continue + + # Yield remaining documents in final batch (should not happen, but just in case) + if current_batch: + yield current_batch, current_file_path, True + diff --git a/sdk/python/ragflow_sdk/tools/field_mapper.py b/sdk/python/ragflow_sdk/tools/field_mapper.py new file mode 100644 index 000000000..9c7623a50 --- /dev/null +++ b/sdk/python/ragflow_sdk/tools/field_mapper.py @@ -0,0 +1,226 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Dict, Any, Optional, List, Union +import logging + +from .models import FieldMappingConfig, Document, DocumentMetadata + + +class FieldMapper: + """ + Field mapper for converting source document fields to RAGFlow standard format. + + Standard fields: + - title: Document title + - content: Document content + - metadata.doc_id: Document ID + - metadata.doc_url: Document URL + - metadata.tags: List of tags + """ + + # Default field mappings + DEFAULT_MAPPINGS = { + 'title': ['title', 'name', 'subject', 'heading', 'header'], + 'content': ['content', 'text', 'body', 'description', 'desc', 'data'], + 'doc_id': ['id', 'doc_id', '_id', 'document_id', 'docid'], + 'doc_url': ['url', 'link', 'uri', 'doc_url', 'source_url'], + 'tags': ['tags', 'tag', 'categories', 'category', 'labels', 'label'] + } + + def __init__( + self, + title_field: Optional[str] = None, + content_field: Optional[str] = None, + doc_id_field: Optional[str] = None, + doc_url_field: Optional[str] = None, + tags_field: Optional[str] = None, + tags_separator: str = ',', + config: Optional[FieldMappingConfig] = None + ): + """ + Initialize FieldMapper with custom field mappings. + + Args: + title_field: Source field name for title (None = auto-detect) + content_field: Source field name for content (None = auto-detect) + doc_id_field: Source field name for doc_id (None = auto-detect) + doc_url_field: Source field name for doc_url (None = auto-detect) + tags_field: Source field name for tags (None = auto-detect) + tags_separator: Separator for tags string (default: ',') + config: Optional FieldMappingConfig instance (if provided, overrides individual fields) + """ + if config: + self.title_field = config.title_field + self.content_field = config.content_field + self.doc_id_field = config.doc_id_field + self.doc_url_field = config.doc_url_field + self.tags_field = config.tags_field + self.tags_separator = config.tags_separator + else: + self.title_field = title_field + self.content_field = content_field + self.doc_id_field = doc_id_field + self.doc_url_field = doc_url_field + self.tags_field = tags_field + self.tags_separator = tags_separator + self.logger = logging.getLogger(__name__) + + def _find_field(self, doc: Dict[str, Any], candidates: List[str]) -> Optional[str]: + """ + Find field name in document using candidate names. + + Args: + doc: Document dictionary + candidates: List of candidate field names + + Returns: + Found field name or None + """ + # Check exact match first (case-insensitive) + doc_keys_lower = {k.lower(): k for k in doc.keys()} + for candidate in candidates: + if candidate.lower() in doc_keys_lower: + return doc_keys_lower[candidate.lower()] + return None + + def _extract_field(self, doc: Dict[str, Any], field_name: Optional[str], + candidates: List[str], default: Any = None) -> Any: + """ + Extract field value from document. 
+ + Args: + doc: Document dictionary + field_name: Explicit field name (if provided) + candidates: List of candidate field names for auto-detection + default: Default value if field not found + + Returns: + Field value or default + """ + if field_name: + # Use explicit field name + return doc.get(field_name, default) + + # Auto-detect field name + found_field = self._find_field(doc, candidates) + if found_field: + return doc.get(found_field, default) + + return default + + def _parse_tags(self, tags_value: Any) -> List[str]: + """ + Parse tags from various formats. + + Supports: + - Array format: ["tag1", "tag2", "tag3"] + - Comma-separated string: "tag1, tag2, tag3" or "tag1,tag2,tag3" (spaces are trimmed) + + Args: + tags_value: Tags value (can be string, list, or None) + + Returns: + List of tag strings (empty strings and None values are filtered out) + """ + if tags_value is None: + return [] + + if isinstance(tags_value, list): + # Array format: ["tag1", "tag2", "tag3"] + return [str(tag).strip() for tag in tags_value if tag and str(tag).strip()] + + if isinstance(tags_value, str): + # Comma-separated string: "tag1, tag2, tag3" or "tag1,tag2,tag3" + if not tags_value.strip(): + return [] + # Split by separator and strip whitespace from each tag + return [tag.strip() for tag in tags_value.split(self.tags_separator) if tag.strip()] + + # Convert to string and parse as single tag + tag_str = str(tags_value).strip() + return [tag_str] if tag_str else [] + + def map(self, doc: Dict[str, Any]) -> Dict[str, Any]: + """ + Map source document to RAGFlow standard format. + + Args: + doc: Source document dictionary + + Returns: + Mapped document in RAGFlow format: + { + 'title': str, + 'content': str, + 'metadata': { + 'doc_id': str (optional), + 'doc_url': str (optional), + 'tags': List[str] (optional) + } + } + """ + # Extract fields + title = self._extract_field( + doc, self.title_field, + self.DEFAULT_MAPPINGS['title'], + default='' + ) + + content = self._extract_field( + doc, self.content_field, + self.DEFAULT_MAPPINGS['content'], + default='' + ) + + doc_id = self._extract_field( + doc, self.doc_id_field, + self.DEFAULT_MAPPINGS['doc_id'], + default=None + ) + + doc_url = self._extract_field( + doc, self.doc_url_field, + self.DEFAULT_MAPPINGS['doc_url'], + default=None + ) + + tags_value = self._extract_field( + doc, self.tags_field, + self.DEFAULT_MAPPINGS['tags'], + default=None + ) + + tags = self._parse_tags(tags_value) + + # Build metadata entity + metadata = None + if doc_id is not None or doc_url is not None or tags: + metadata = DocumentMetadata( + doc_id=str(doc_id) if doc_id is not None else None, + doc_url=str(doc_url) if doc_url is not None else None, + tags=tags if tags else None + ) + + # Build result document + doc = Document( + title=str(title) if title else '', + content=str(content) if content else '', + metadata=metadata + ) + + return doc.to_dict() + diff --git a/sdk/python/ragflow_sdk/tools/file_reader.py b/sdk/python/ragflow_sdk/tools/file_reader.py new file mode 100644 index 000000000..85c5052cb --- /dev/null +++ b/sdk/python/ragflow_sdk/tools/file_reader.py @@ -0,0 +1,323 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import json +import os +import csv +import logging +from typing import Iterator, List, Dict, Optional, Any +from pathlib import Path + +try: + import pandas as pd + PANDAS_AVAILABLE = True +except ImportError: + PANDAS_AVAILABLE = False + +from .field_mapper import FieldMapper +from .models import FileType + + +class FileReader: + """ + File reader for various file formats supporting batch iteration. + + Supports: + - Multi-document formats: JSON (array), JSONL, CSV, XLSX, XLS + - Single-document formats: PDF, Office, HTML, Markdown, images, etc. + """ + + # PowerRAG supported single-document formats + # Based on powerrag/server/services/parse_service.py and powerrag/app/title.py + SINGLE_DOC_FORMATS = { + '.pdf', '.docx', '.doc', '.pptx', '.ppt', '.html', '.htm', + '.md', '.markdown', '.txt', '.eml', '.epub', + '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp' + } + + def __init__(self, field_mapper: Optional[FieldMapper] = None, + multi_doc_extensions: Optional[List[str]] = None): + """ + Initialize FileReader. + + Args: + field_mapper: Optional FieldMapper instance for field mapping + multi_doc_extensions: List of file extensions (without dot) that should be + treated as multi-document formats. + Files with these extensions will be parsed as containing + multiple documents. If not in this list, files will be + treated as single-document format. + """ + self.field_mapper = field_mapper or FieldMapper() + # Convert to set with dots for easier comparison + self.multi_doc_extensions = {f'.{ext.lower().lstrip(".")}' for ext in multi_doc_extensions} + self.logger = logging.getLogger(__name__) + + def is_multi_document_format(self, file_path: str) -> bool: + """ + Check if file is a multi-document format. + + Args: + file_path: Path to the file + + Returns: + True if file contains multiple documents, False if single document + """ + ext = Path(file_path).suffix.lower() + + # Check if extension is in configured multi-doc extensions + if ext in self.multi_doc_extensions: + # For JSON, check if it's an array (multi-doc) or single object (single-doc) + if ext == '.json': + try: + with open(file_path, 'r', encoding='utf-8') as f: + first_char = f.read(1).strip() + if first_char == '[': + return True # Array JSON - multi-doc + else: + return False # Single object JSON - single-doc + except Exception: + # Default to single-doc if can't determine + return False + elif ext == '.jsonl': + # JSONL is always multi-doc (one JSON object per line) + return True + elif ext in ['.csv', '.xlsx', '.xls']: + # CSV and Excel files are treated as multi-doc if in the list + return True + else: + # Other configured extensions are signle-doc + return False + + # All other formats are single-doc + return False + + def _detect_file_type(self, file_path: str) -> FileType: + """ + Detect file type based on extension and content. 
+ + Args: + file_path: Path to the file + + Returns: + FileType enumeration value + """ + ext = Path(file_path).suffix.lower() + + # Check if it's a multi-doc format first + if not self.is_multi_document_format(file_path): + return FileType.SINGLE + + # Determine specific multi-doc type + if ext == '.json': + return FileType.JSON_ARRAY + elif ext == '.jsonl': + return FileType.JSONL + elif ext == '.csv': + return FileType.CSV + elif ext in ['.xlsx', '.xls']: + return FileType.EXCEL + else: + # Should not reach here, but treat as single if unknown + return FileType.SINGLE + + def _read_json_array(self, file_path: str, start_index: int = 0) -> Iterator[Dict[str, Any]]: + """Read JSON file (array of documents).""" + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + if isinstance(data, list): + for idx, doc in enumerate(data): + if idx >= start_index: + yield doc + else: + # Should not happen if is_multi_document_format is correct + # But if it does, return empty iterator (single object should be handled as single-doc) + return + + def _read_jsonl(self, file_path: str, start_index: int = 0) -> Iterator[Dict[str, Any]]: + """Read JSONL file (one JSON object per line).""" + with open(file_path, 'r', encoding='utf-8') as f: + for line_num, line in enumerate(f, 1): + if line_num - 1 < start_index: + continue + line = line.strip() + if not line: + continue + try: + yield json.loads(line) + except json.JSONDecodeError as e: + self.logger.warning(f"Failed to parse JSONL line {line_num} in {file_path}: {e}") + continue + + def _read_csv(self, file_path: str, start_index: int = 0) -> Iterator[Dict[str, Any]]: + """Read CSV file (each row is a document).""" + with open(file_path, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + for row_idx, row in enumerate(reader): + if row_idx < start_index: + continue + # Convert empty strings to None for consistency + doc = {k: (v if v else None) for k, v in row.items()} + yield doc + + def _read_excel(self, file_path: str, start_index: int = 0) -> Iterator[Dict[str, Any]]: + """Read Excel file (each row is a document).""" + if not PANDAS_AVAILABLE: + raise ImportError("pandas is required for Excel file support. Install it with: pip install pandas openpyxl") + + try: + # Determine engine based on file extension + file_ext = Path(file_path).suffix.lower() + if file_ext == '.xls': + # For .xls files, use xlrd engine + engine = 'xlrd' + else: + # For .xlsx and other formats, use openpyxl + engine = 'openpyxl' + + # Try reading all sheets + excel_file = pd.ExcelFile(file_path, engine=engine) + row_count = 0 + for sheet_name in excel_file.sheet_names: + df = pd.read_excel(excel_file, sheet_name=sheet_name) + # Convert DataFrame to list of dicts + for _, row in df.iterrows(): + if row_count < start_index: + row_count += 1 + continue + # Convert NaN to None and convert to dict + doc = {k: (None if pd.isna(v) else v) for k, v in row.to_dict().items()} + yield doc + row_count += 1 + except Exception as e: + self.logger.error(f"Failed to read Excel file {file_path}: {e}") + raise + + def _read_single_file(self, file_path: str) -> Iterator[Dict[str, Any]]: + """ + Read single file as a single document. 
+ + For single-file-single-document format: + - title: filename (with extension) + - content: file content (as string for text files, empty string for binary files) + - Other fields: empty (no metadata, no doc_id, no doc_url, no tags) + """ + # Get filename as title (with extension) + title = Path(file_path).name # Use filename with extension as title + + # Try reading as text first + content = '' + try: + with open(file_path, 'r', encoding='utf-8') as f: + content = f.read() + except UnicodeDecodeError: + # If text reading fails, it's likely a binary file (PDF, images, etc.) + # For binary files, content will be empty string + # The actual binary content will be handled by the uploader when uploading the file + content = '' + except Exception as e: + self.logger.warning(f"Failed to read file {file_path} as text: {e}") + content = '' + + # Return document with only title and content, no other fields + # Field mapper will extract title and content, and won't add metadata since + # doc_id, doc_url, and tags are all None/empty + yield { + 'title': title, + 'content': content + } + + def read_file(self, file_path: str, start_index: int = 0) -> Iterator[Dict[str, Any]]: + """ + Read a file and yield documents. + + Args: + file_path: Path to the file + start_index: Starting document index (for multi-doc formats, skip documents before this index) + + Yields: + Document dictionaries + """ + file_type = self._detect_file_type(file_path) + + try: + if file_type == FileType.JSON_ARRAY: + iterator = self._read_json_array(file_path, start_index) + elif file_type == FileType.JSONL: + iterator = self._read_jsonl(file_path, start_index) + elif file_type == FileType.CSV: + iterator = self._read_csv(file_path, start_index) + elif file_type == FileType.EXCEL: + iterator = self._read_excel(file_path, start_index) + else: # FileType.SINGLE + # For single-doc files, start_index should be 0 (or we skip if already processed) + if start_index > 0: + # File already processed, skip + return + iterator = self._read_single_file(file_path) + + for doc in iterator: + # Apply field mapping if mapper is provided + mapped_doc = self.field_mapper.map(doc) + yield mapped_doc + + except Exception as e: + self.logger.error(f"Error reading file {file_path}: {e}") + raise + + def read_files_batch( + self, + file_paths: List[str], + batch_size: int, + processed_files: Optional[List[str]] = None + ) -> Iterator[List[Dict[str, Any]]]: + """ + Read multiple files in batches. 
+ + Args: + file_paths: List of file paths to read + batch_size: Number of documents per batch + processed_files: Optional list of already processed file paths (will be skipped) + + Yields: + Batches of documents (each batch is a list of document dicts) + """ + processed_set = set(processed_files or []) + batch = [] + + for file_path in file_paths: + # Skip already processed files + if file_path in processed_set: + self.logger.debug(f"Skipping already processed file: {file_path}") + continue + + try: + for doc in self.read_file(file_path): + batch.append(doc) + + # Yield batch when it reaches batch_size + if len(batch) >= batch_size: + yield batch + batch = [] + except Exception as e: + self.logger.error(f"Error processing file {file_path}: {e}") + # Continue with next file even if current file fails + continue + + # Yield remaining documents + if batch: + yield batch + diff --git a/sdk/python/ragflow_sdk/tools/models.py b/sdk/python/ragflow_sdk/tools/models.py new file mode 100644 index 000000000..3fea6fdb0 --- /dev/null +++ b/sdk/python/ragflow_sdk/tools/models.py @@ -0,0 +1,260 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from enum import Enum +from typing import Dict, List, Optional, Any +from dataclasses import dataclass, field + + +class FileType(Enum): + """File type enumeration.""" + JSON_ARRAY = "json_array" + JSONL = "jsonl" + CSV = "csv" + EXCEL = "excel" + SINGLE = "single" + + +class DocumentFormat(Enum): + """Document format enumeration.""" + MULTI_DOC = "multi_doc" # One file contains multiple documents + SINGLE_DOC = "single_doc" # One file corresponds to one document + + +@dataclass +class DocumentMetadata: + """ + Document metadata entity in RAGFlow format. + + Attributes: + doc_id: Optional document ID + doc_url: Optional document URL + tags: Optional list of tags + """ + doc_id: Optional[str] = None + doc_url: Optional[str] = None + tags: Optional[List[str]] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert metadata to dictionary format.""" + result = {} + if self.doc_id is not None: + result["doc_id"] = self.doc_id + if self.doc_url is not None: + result["doc_url"] = self.doc_url + if self.tags is not None: + result["tags"] = self.tags + return result + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "DocumentMetadata": + """Create metadata from dictionary.""" + return cls( + doc_id=data.get("doc_id"), + doc_url=data.get("doc_url"), + tags=data.get("tags") + ) + + def is_empty(self) -> bool: + """Check if metadata is empty (all fields are None).""" + return self.doc_id is None and self.doc_url is None and ( + self.tags is None or len(self.tags) == 0 + ) + + +@dataclass +class Document: + """ + Standard document entity in RAGFlow format. 
+ + Attributes: + title: Document title + content: Document content + metadata: Optional metadata entity containing doc_id, doc_url, tags + """ + title: str + content: str + metadata: Optional[DocumentMetadata] = None + + def to_dict(self) -> Dict: + """Convert document to dictionary format.""" + result = { + "title": self.title, + "content": self.content + } + if self.metadata and not self.metadata.is_empty(): + result["metadata"] = self.metadata.to_dict() + return result + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "Document": + """Create document from dictionary.""" + metadata_dict = data.get("metadata", {}) + metadata = None + if metadata_dict: + if isinstance(metadata_dict, DocumentMetadata): + metadata = metadata_dict + else: + metadata = DocumentMetadata.from_dict(metadata_dict) + return cls( + title=data.get("title", ""), + content=data.get("content", ""), + metadata=metadata + ) + + +@dataclass +class FieldMappingConfig: + """ + Field mapping configuration. + + Attributes: + title_field: Source field name for title (None = auto-detect) + content_field: Source field name for content (None = auto-detect) + doc_id_field: Source field name for doc_id (None = auto-detect) + doc_url_field: Source field name for doc_url (None = auto-detect) + tags_field: Source field name for tags (None = auto-detect) + tags_separator: Separator for tags string (default: ',') + """ + title_field: Optional[str] = None + content_field: Optional[str] = None + doc_id_field: Optional[str] = None + doc_url_field: Optional[str] = None + tags_field: Optional[str] = None + tags_separator: str = ',' + + +@dataclass +class FileCursor: + """ + File cursor entity for tracking document processing progress. + + This entity represents a single file's processing cursor, indicating + how many documents have been processed from this file. + + Attributes: + file_path: Path to the file + doc_index: Next document index to process (documents with index >= this are processed) + """ + file_path: str + doc_index: int + + def to_dict(self) -> Dict[str, Any]: + """Convert file cursor to dictionary format.""" + return { + "file_path": self.file_path, + "doc_index": self.doc_index + } + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "FileCursor": + """Create file cursor from dictionary.""" + return cls( + file_path=data.get("file_path", ""), + doc_index=data.get("doc_index", 0) + ) + + +@dataclass +class Snapshot: + """ + Processing snapshot entity for resume support. 
+ + Attributes: + file_cursors: List of file cursor entities, each tracking a file's processing progress + total_processed: Total number of documents processed + timestamp: Snapshot creation timestamp + dataset_id: Optional dataset ID + """ + file_cursors: List[FileCursor] = field(default_factory=list) + total_processed: int = 0 + timestamp: float = 0.0 + dataset_id: Optional[str] = None + + def get_cursor(self, file_path: str) -> Optional[FileCursor]: + """Get file cursor for a specific file path.""" + for cursor in self.file_cursors: + if cursor.file_path == file_path: + return cursor + return None + + def set_cursor(self, file_path: str, doc_index: int) -> None: + """Set or update file cursor for a file path.""" + cursor = self.get_cursor(file_path) + if cursor: + cursor.doc_index = doc_index + else: + self.file_cursors.append(FileCursor(file_path=file_path, doc_index=doc_index)) + + def remove_cursor(self, file_path: str) -> None: + """Remove file cursor for a file path.""" + self.file_cursors = [c for c in self.file_cursors if c.file_path != file_path] + + def to_dict(self) -> Dict[str, Any]: + """Convert snapshot to dictionary format.""" + result = { + "file_cursors": [cursor.to_dict() for cursor in self.file_cursors], + "total_processed": self.total_processed, + "timestamp": self.timestamp + } + if self.dataset_id: + result["dataset_id"] = self.dataset_id + return result + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "Snapshot": + """Create snapshot from dictionary.""" + file_cursors_data = data.get("file_cursors", []) + # Support backward compatibility: if file_cursor (singular) exists, convert it + if "file_cursor" in data and "file_cursors" not in data: + # Old format: file_cursor is a dict {file_path: doc_index} + file_cursor_dict = data.get("file_cursor", {}) + if isinstance(file_cursor_dict, dict): + file_cursors_data = [ + {"file_path": path, "doc_index": idx} + for path, idx in file_cursor_dict.items() + ] + + file_cursors = [] + if file_cursors_data: + for cursor_data in file_cursors_data: + if isinstance(cursor_data, FileCursor): + file_cursors.append(cursor_data) + else: + file_cursors.append(FileCursor.from_dict(cursor_data)) + + return cls( + file_cursors=file_cursors, + total_processed=data.get("total_processed", 0), + timestamp=data.get("timestamp", 0.0), + dataset_id=data.get("dataset_id") + ) + + +@dataclass +class BatchInfo: + """ + Batch processing information. + + Attributes: + batch_documents: List of documents in the batch + file_path: Path of the file currently being processed + is_file_complete: True if this is the last batch from the current file + """ + batch_documents: List[Dict[str, Any]] + file_path: str + is_file_complete: bool + diff --git a/sdk/python/ragflow_sdk/tools/reparse_failed_documents.py b/sdk/python/ragflow_sdk/tools/reparse_failed_documents.py new file mode 100644 index 000000000..d2e32d119 --- /dev/null +++ b/sdk/python/ragflow_sdk/tools/reparse_failed_documents.py @@ -0,0 +1,221 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import time +import logging +from typing import Optional, Tuple + +from ..ragflow import RAGFlow +from ..modules.dataset import DataSet + + +class FailedDocumentReparser: + """ + Tool for reparsing failed documents in a RAGFlow dataset. + + This class handles the logic for finding failed documents in a dataset + and reparsing them in batches with automatic retry and exponential backoff. + + Features: + - Stream processing: fetch documents page by page + - Batch reparsing with configurable batch size + - Automatic retry with exponential backoff + - Progress tracking and logging + """ + + def __init__(self, rag: RAGFlow, dataset: Optional[DataSet] = None): + """ + Initialize FailedDocumentReparser. + + Args: + rag: RAGFlow client instance + dataset: Optional dataset instance + """ + self.rag = rag + self.dataset = dataset + self.logger = logging.getLogger(__name__) + + @staticmethod + def retry_with_backoff(func, max_retries: int = 10, max_backoff: int = 8): + """ + Retry wrapper with exponential backoff. + + Args: + func: Function to retry + max_retries: Maximum number of retry attempts (default: 10) + max_backoff: Maximum retry interval in seconds (default: 8) + + Returns: + Wrapped function with retry logic + """ + def wrapper(*args, **kwargs): + logger = logging.getLogger(__name__) + for attempt in range(max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + if attempt < max_retries - 1: + backoff_time = min(2 ** attempt, max_backoff) + logger.warning(f" ⚠️ Error: {str(e)}") + logger.info(f" Retrying in {backoff_time}s... (attempt {attempt + 1}/{max_retries})") + time.sleep(backoff_time) + else: + logger.error(f" ❌ Failed after {max_retries} attempts: {str(e)}") + raise + return wrapper + + def set_dataset(self, dataset: DataSet): + """Set the dataset to use for reparsing.""" + self.dataset = dataset + + def get_dataset(self, dataset_id: str) -> DataSet: + """ + Get dataset by ID. + + Args: + dataset_id: Dataset ID + + Returns: + Dataset instance + """ + datasets = self.rag.list_datasets(id=dataset_id) + if not datasets: + raise Exception(f"Dataset with ID '{dataset_id}' not found") + self.dataset = datasets[0] + self.logger.info(f"Using dataset: {self.dataset.name} (ID: {self.dataset.id})") + return self.dataset + + def reparse_failed_documents( + self, + dataset_id: Optional[str] = None, + reparse_batch_size: int = 50, + page_size: int = 10000 + ) -> Tuple[int, int]: + """ + Reparse all failed documents in a dataset. + + This method fetches documents page by page, filters failed ones, + and reparses them in batches. 
+ + Args: + dataset_id: Dataset ID to reparse documents from (if dataset not set) + reparse_batch_size: Batch size for reparsing documents (default: 50) + page_size: Page size for fetching documents (default: 10000) + + Returns: + Tuple of (total_failed, total_reparsed) + """ + # Validate reparse_batch_size + if reparse_batch_size <= 0: + raise ValueError(f"reparse_batch_size must be greater than 0, got {reparse_batch_size}") + + if page_size <= 0: + raise ValueError(f"page_size must be greater than 0, got {page_size}") + + # Get dataset if not set + if not self.dataset: + if not dataset_id: + raise ValueError("Either dataset must be set or dataset_id must be provided") + self.get_dataset(dataset_id) + + # Stream processing: fetch documents page by page, filter failed ones, and reparse when reparse_batch_size is reached + self.logger.info("Fetching documents from dataset and processing failed ones in batches...") + page = 1 + total_documents = 0 + total_failed = 0 + total_reparsed = 0 + pending_failed_doc_ids = [] # Accumulate failed document IDs until reparse_batch_size is reached + + def reparse_batch(doc_ids): + """Reparse a batch of documents.""" + nonlocal total_reparsed + # This check should not be needed if the calling code is correct, + # but kept as a safety measure + if not doc_ids: + self.logger.warning(" ⚠️ Skipping empty batch (this should not happen)") + return + + self.logger.info(f" Reparsing batch of {len(doc_ids)} documents...") + self.dataset.async_parse_documents(doc_ids) + total_reparsed += len(doc_ids) + self.logger.info(f" Batch completed: {len(doc_ids)} documents parsed successfully") + self.logger.info(f" Total reparsed so far: {total_reparsed}") + + # Wrap reparse_batch with retry logic + reparse_batch_with_retry = self.retry_with_backoff(reparse_batch, max_retries=10, max_backoff=8) + + # Helper function to fetch documents with retry + def fetch_documents_page(page_num, page_sz): + """Fetch a page of documents with retry logic.""" + return self.dataset.list_documents(page=page_num, page_size=page_sz, orderby="id", desc=False) + + fetch_documents_with_retry = self.retry_with_backoff(fetch_documents_page, max_retries=10, max_backoff=8) + batch_number = 0 + + while True: + # Fetch a page of documents + self.logger.info(f"Fetching page {page} (page_size={page_size})...") + documents = fetch_documents_with_retry(page, page_size) + + if not documents: + break + + total_documents += len(documents) + self.logger.info(f" Page {page}: fetched {len(documents)} documents (total: {total_documents})") + + # Filter failed documents from this page + page_failed_docs = [] + for doc in documents: + # Check if document parsing failed + # run status can be "FAIL", "DONE", "CANCEL", etc. 
+ if isinstance(doc.run, str) and doc.run.upper() == "FAIL": + page_failed_docs.append(doc) + pending_failed_doc_ids.append(doc.id) + total_failed += 1 + + if page_failed_docs: + self.logger.info(f" Found {len(page_failed_docs)} failed documents in this page (total failed: {total_failed})") + + # If we've accumulated enough failed documents, reparse them + while len(pending_failed_doc_ids) >= reparse_batch_size: + batch_number += 1 + batch = pending_failed_doc_ids[:reparse_batch_size] + pending_failed_doc_ids = pending_failed_doc_ids[reparse_batch_size:] # Remove only the processed batch + self.logger.info(f"\nProcessing batch {batch_number} ({len(batch)} documents)...") + reparse_batch_with_retry(batch) + + # Check if this is the last page + if len(documents) < page_size: + break + page += 1 + + self.logger.info(f"\nFinished fetching all documents. Total: {total_documents}, Failed: {total_failed}") + + # Process remaining failed documents (if any) + if pending_failed_doc_ids: + batch_number += 1 + self.logger.info(f"\nProcessing final batch {batch_number} ({len(pending_failed_doc_ids)} documents)...") + reparse_batch_with_retry(pending_failed_doc_ids) + + if total_failed == 0: + self.logger.info("✅ No failed documents found in the dataset") + return 0, 0 + + self.logger.info("\n" + "=" * 60) + self.logger.info(f"✅ Reparsing completed! Successfully reparsed: {total_reparsed}/{total_failed} documents") + + return total_failed, total_reparsed + diff --git a/sdk/python/test/conftest.py b/sdk/python/test/conftest.py index 5aaaf8c1b..95142ad7d 100644 --- a/sdk/python/test/conftest.py +++ b/sdk/python/test/conftest.py @@ -15,14 +15,35 @@ # import os +import sys +from pathlib import Path + +# CRITICAL: Monkey patch beartype before importing ragflow_sdk +# This is a workaround because ragflow_sdk/__init__.py calls beartype_this_package() +# without checking BEARTYPE_DISABLE environment variable +def _disable_beartype(): + """Disable beartype by monkey patching beartype_this_package to do nothing.""" + try: + import beartype.claw + original_beartype_this_package = beartype.claw.beartype_this_package + + def noop_beartype_this_package(*args, **kwargs): + """No-op version of beartype_this_package that does nothing.""" + pass + + beartype.claw.beartype_this_package = noop_beartype_this_package + os.environ['BEARTYPE_DISABLE'] = '1' + except ImportError: + pass + +# Call before any imports that might trigger ragflow_sdk import +_disable_beartype() import pytest import requests HOST_ADDRESS = os.getenv("HOST_ADDRESS", "http://127.0.0.1:9380") ZHIPU_AI_API_KEY = os.getenv("ZHIPU_AI_API_KEY") -if ZHIPU_AI_API_KEY is None: - pytest.exit("Error: Environment variable ZHIPU_AI_API_KEY must be set") # def generate_random_email(): # return 'user_' + ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))+'@1.com' @@ -62,6 +83,8 @@ def login(): @pytest.fixture(scope="session") def get_api_key_fixture(): + if ZHIPU_AI_API_KEY is None: + pytest.skip("ZHIPU_AI_API_KEY environment variable is not set") try: register() except Exception as e: @@ -104,6 +127,7 @@ def get_my_llms(auth, name): def add_models(auth): + # This function is only called from set_tenant_info which already checks ZHIPU_AI_API_KEY url = HOST_ADDRESS + "/v1/llm/set_api_key" authorization = {"Authorization": auth} models_info = { @@ -130,6 +154,9 @@ def get_tenant_info(auth): @pytest.fixture(scope="session", autouse=True) def set_tenant_info(get_auth): + # Skip if ZHIPU_AI_API_KEY is not set (for unit tests that don't need it) + if 
ZHIPU_AI_API_KEY is None: + return auth = get_auth try: add_models(auth) diff --git a/sdk/python/test/test_tools/__init__.py b/sdk/python/test/test_tools/__init__.py new file mode 100644 index 000000000..d991ce776 --- /dev/null +++ b/sdk/python/test/test_tools/__init__.py @@ -0,0 +1,16 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + diff --git a/sdk/python/test/test_tools/test_batch_uploader.py b/sdk/python/test/test_tools/test_batch_uploader.py new file mode 100644 index 000000000..6e5260469 --- /dev/null +++ b/sdk/python/test/test_tools/test_batch_uploader.py @@ -0,0 +1,916 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import json +import os +import tempfile +import pytest +from unittest.mock import Mock, MagicMock, patch, call +from pathlib import Path + +from ragflow_sdk.tools import BatchUploader, DocumentExtractor, FieldMapper +from ragflow_sdk.tools.models import Snapshot, FileCursor + + +class TestBatchUploader: + """Unit tests for BatchUploader with mocked PowerRAG interfaces.""" + + @pytest.fixture + def mock_rag(self): + """Create a mock RAGFlow client.""" + rag = Mock() + rag.api_url = "http://test.com/api/v1" + rag.user_key = "test_key" + return rag + + @pytest.fixture + def mock_dataset(self, mock_rag): + """Create a mock DataSet.""" + dataset = Mock() + dataset.id = "test_dataset_id" + dataset.name = "test_dataset" + dataset.rag = mock_rag + + # Mock upload_documents_with_meta to return mock documents + def mock_upload(docs, group_id_field=None, file_extension="txt"): + mock_docs = [] + for i, doc in enumerate(docs): + mock_doc = Mock() + mock_doc.id = f"doc_{i}" + mock_doc.title = doc.get("title", "") + mock_doc.content = doc.get("content", "") + mock_docs.append(mock_doc) + return mock_docs + + dataset.upload_documents_with_meta = Mock(side_effect=mock_upload) + return dataset + + @pytest.fixture + def temp_dir(self): + """Create a temporary directory for test files.""" + with tempfile.TemporaryDirectory() as tmpdir: + yield tmpdir + + def _create_json_file(self, temp_dir, filename, data): + """Helper to create a JSON file.""" + filepath = os.path.join(temp_dir, filename) + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False) + return filepath + + def _create_jsonl_file(self, temp_dir, filename, data_list): + """Helper to create a JSONL file.""" + filepath = os.path.join(temp_dir, filename) + with open(filepath, 'w', encoding='utf-8') as f: + for item in data_list: + f.write(json.dumps(item, ensure_ascii=False) + '\n') + return filepath + + def _create_csv_file(self, temp_dir, filename, rows): + """Helper to create a CSV file.""" + import csv + filepath = os.path.join(temp_dir, filename) + if rows: + with open(filepath, 'w', encoding='utf-8', newline='') as f: + writer = csv.DictWriter(f, fieldnames=rows[0].keys()) + writer.writeheader() + writer.writerows(rows) + else: + # Create empty CSV with header + with open(filepath, 'w', encoding='utf-8', newline='') as f: + writer = csv.DictWriter(f, fieldnames=['title', 'content']) + writer.writeheader() + return filepath + + def _create_text_file(self, temp_dir, filename, content): + """Helper to create a text file.""" + filepath = os.path.join(temp_dir, filename) + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + return filepath + + @pytest.mark.parametrize("extension,create_func,test_data", [ + # Multi-doc formats - all extensions must be covered + ("json", "_create_json_file", [ + {"title": "Doc 1", "content": "Content 1", "id": "1"}, + {"title": "Doc 2", "content": "Content 2", "id": "2"}, + {"title": "Doc 3", "content": "Content 3", "id": "3"} + ]), + ("jsonl", "_create_jsonl_file", [ + {"title": "Doc 1", "content": "Content 1", "id": "1"}, + {"title": "Doc 2", "content": "Content 2", "id": "2"}, + {"title": "Doc 3", "content": "Content 3", "id": "3"} + ]), + ("csv", "_create_csv_file", [ + {"title": "Doc 1", "content": "Content 1"}, + {"title": "Doc 2", "content": "Content 2"}, + {"title": "Doc 3", "content": "Content 3"} + ]), + ]) + def test_upload_multi_doc_formats(self, mock_rag, mock_dataset, temp_dir, extension, create_func, test_data): + """Test uploading multi-document 
formats (json, jsonl, csv, xlsx, xls).""" + # Create test file + filename = f"test.{extension}" + create_method = getattr(self, create_func) + filepath = create_method(temp_dir, filename, test_data) + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor(multi_doc_extensions=["json", "jsonl", "csv", "xlsx", "xls"]) + + # Upload documents + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=2, + file_extension="txt" + ) + + # Verify upload was called + assert mock_dataset.upload_documents_with_meta.called + assert total_docs == len(test_data) + assert total_files == 1 + + # Verify uploaded documents + call_args_list = mock_dataset.upload_documents_with_meta.call_args_list + all_uploaded_docs = [] + for call_args in call_args_list: + docs = call_args[0][0] + all_uploaded_docs.extend(docs) + + assert len(all_uploaded_docs) == len(test_data) + for i, doc in enumerate(all_uploaded_docs): + assert doc["title"] == test_data[i].get("title", "") + assert doc["content"] == test_data[i].get("content", "") + + @pytest.mark.parametrize("extension", ["xlsx", "xls"]) + def test_upload_excel_formats(self, mock_rag, mock_dataset, temp_dir, extension): + """Test uploading Excel formats (xlsx, xls).""" + pytest.importorskip("pandas") + + import pandas as pd + + # Create test Excel file + filename = f"test.{extension}" + filepath = os.path.join(temp_dir, filename) + test_data = [ + {"title": "Doc 1", "content": "Content 1"}, + {"title": "Doc 2", "content": "Content 2"}, + {"title": "Doc 3", "content": "Content 3"} + ] + + if extension == 'xlsx': + # Use pandas with openpyxl for xlsx + df = pd.DataFrame(test_data) + df.to_excel(filepath, index=False, engine='openpyxl') + else: + # For xls, use xlwt directly (pandas 2.0+ doesn't support xlwt writer) + try: + import xlwt + workbook = xlwt.Workbook() + worksheet = workbook.add_sheet('Sheet1') + + # Write header + headers = list(test_data[0].keys()) + for col, header in enumerate(headers): + worksheet.write(0, col, header) + + # Write data rows + for row, data in enumerate(test_data, start=1): + for col, header in enumerate(headers): + worksheet.write(row, col, data[header]) + + workbook.save(filepath) + except ImportError: + pytest.skip("xlwt is not installed") + + # Create uploader and extractor with xlsx/xls as multi-doc format + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor(multi_doc_extensions=['json', 'jsonl', 'csv', 'xlsx', 'xls']) + + # Upload documents + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=2, + file_extension=extension + ) + + # Verify upload was called + assert mock_dataset.upload_documents_with_meta.called + assert total_docs == 3 + assert total_files == 1 + + def _create_excel_file(self, temp_dir, filename, test_data): + """Helper to create Excel file (xlsx or xls).""" + pytest.importorskip("pandas") + import pandas as pd + + filepath = os.path.join(temp_dir, filename) + extension = os.path.splitext(filename)[1].lower() + + if extension == '.xlsx': + df = pd.DataFrame(test_data) + df.to_excel(filepath, index=False, engine='openpyxl') + else: # .xls + try: + import xlwt + workbook = xlwt.Workbook() + worksheet = workbook.add_sheet('Sheet1') + + headers = list(test_data[0].keys()) + for col, header in enumerate(headers): + worksheet.write(0, col, header) + + for row, 
data in enumerate(test_data, start=1): + for col, header in enumerate(headers): + worksheet.write(row, col, data[header]) + + workbook.save(filepath) + except ImportError: + pytest.skip("xlwt is not installed") + + return filepath + + @pytest.mark.parametrize("extension,batch_size,total_docs,scenario", [ + # Test different batch size scenarios for each format + ("json", 5, 5, "exact"), # Exactly batch_size + ("json", 3, 10, "exceeds"), # More than batch_size (multiple batches) + ("json", 10, 3, "less"), # Less than batch_size + ("json", 3, 7, "partial"), # Partial last batch (7 docs, batch_size=3 -> 3 batches) + ("jsonl", 5, 5, "exact"), + ("jsonl", 3, 10, "exceeds"), + ("jsonl", 10, 3, "less"), + ("jsonl", 3, 7, "partial"), + ("csv", 5, 5, "exact"), + ("csv", 3, 10, "exceeds"), + ("csv", 10, 3, "less"), + ("csv", 3, 7, "partial"), + ("xlsx", 5, 5, "exact"), + ("xlsx", 3, 10, "exceeds"), + ("xlsx", 10, 3, "less"), + ("xlsx", 3, 7, "partial"), + ("xls", 5, 5, "exact"), + ("xls", 3, 10, "exceeds"), + ("xls", 10, 3, "less"), + ("xls", 3, 7, "partial"), + ]) + def test_single_file_multi_doc_batch_scenarios(self, mock_rag, mock_dataset, temp_dir, + extension, batch_size, total_docs, scenario): + """Test single file with multiple documents across different batch size scenarios. + + Covers: + - exact: document count equals batch_size (single full batch) + - exceeds: document count exceeds batch_size (multiple batches) + - less: document count less than batch_size (single partial batch) + - partial: document count creates partial last batch + """ + # Create test data + test_data = [ + {"title": f"Doc {i}", "content": f"Content {i}"} + for i in range(total_docs) + ] + + # Create file based on extension + filename = f"test.{extension}" + if extension in ["json", "jsonl", "csv"]: + create_method = getattr(self, f"_create_{extension}_file" if extension != "jsonl" else "_create_jsonl_file") + filepath = create_method(temp_dir, filename, test_data) + else: # xlsx or xls + filepath = self._create_excel_file(temp_dir, filename, test_data) + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor(multi_doc_extensions=["json", "jsonl", "csv", "xlsx", "xls"]) + + # Upload documents + total_docs_uploaded, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=batch_size, + file_extension="txt" + ) + + # Verify upload was called + assert mock_dataset.upload_documents_with_meta.called + assert total_docs_uploaded == total_docs + assert total_files == 1 + + # Verify batch count based on scenario + expected_batch_count = (total_docs + batch_size - 1) // batch_size # Ceiling division + assert mock_dataset.upload_documents_with_meta.call_count == expected_batch_count + + # Verify all documents were uploaded correctly + call_args_list = mock_dataset.upload_documents_with_meta.call_args_list + all_uploaded_docs = [] + for call_args in call_args_list: + docs = call_args[0][0] + all_uploaded_docs.extend(docs) + + assert len(all_uploaded_docs) == total_docs + for i, doc in enumerate(all_uploaded_docs): + assert doc["title"] == f"Doc {i}" + assert doc["content"] == f"Content {i}" + + # Verify batch sizes (all batches except last should be full) + for i, call_args in enumerate(call_args_list): + docs = call_args[0][0] + if i < len(call_args_list) - 1: + assert len(docs) == batch_size, f"Batch {i} should be full (size={batch_size})" + else: + # Last batch may be smaller + 
expected_last_batch_size = total_docs % batch_size + if expected_last_batch_size == 0: + expected_last_batch_size = batch_size + assert len(docs) == expected_last_batch_size, \ + f"Last batch should have {expected_last_batch_size} docs, got {len(docs)}" + + @pytest.mark.parametrize("extension", ["json", "jsonl", "csv"]) + def test_single_file_large_document_count(self, mock_rag, mock_dataset, temp_dir, extension): + """Test single file with large number of documents to verify batch processing works correctly.""" + batch_size = 7 + total_docs = 100 # Large number of documents + + # Create test data with many documents + test_data = [ + {"title": f"Doc {i}", "content": f"Content {i}", "id": str(i)} + for i in range(total_docs) + ] + + # Create file + filename = f"large_test.{extension}" + create_method = getattr(self, f"_create_{extension}_file" if extension != "jsonl" else "_create_jsonl_file") + filepath = create_method(temp_dir, filename, test_data) + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor(multi_doc_extensions=["json", "jsonl", "csv", "xlsx", "xls"]) + + # Upload documents + total_docs_uploaded, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=batch_size, + file_extension="txt" + ) + + # Verify all documents were uploaded + assert total_docs_uploaded == total_docs + assert total_files == 1 + + # Verify correct number of batches + expected_batch_count = (total_docs + batch_size - 1) // batch_size + assert mock_dataset.upload_documents_with_meta.call_count == expected_batch_count + + # Verify document order and completeness + call_args_list = mock_dataset.upload_documents_with_meta.call_args_list + all_uploaded_docs = [] + for call_args in call_args_list: + docs = call_args[0][0] + all_uploaded_docs.extend(docs) + + assert len(all_uploaded_docs) == total_docs + # Verify first and last documents + assert all_uploaded_docs[0]["title"] == "Doc 0" + assert all_uploaded_docs[-1]["title"] == f"Doc {total_docs - 1}" + # Verify sequential order + for i, doc in enumerate(all_uploaded_docs): + assert doc["title"] == f"Doc {i}", f"Document order mismatch at index {i}" + + @pytest.mark.parametrize("extension,content", [ + # Single-doc formats - select a few for testing + ("txt", "This is a test document content."), + ("md", "# Test Document\n\nThis is markdown content."), + ("pdf", b"PDF content"), # Binary content + ]) + def test_upload_single_doc_formats(self, mock_rag, mock_dataset, temp_dir, extension, content): + """Test uploading single-document formats.""" + filename = f"test.{extension}" + filepath = os.path.join(temp_dir, filename) + + if isinstance(content, bytes): + # For binary files like PDF + with open(filepath, 'wb') as f: + f.write(content) + else: + # For text files + with open(filepath, 'w', encoding='utf-8') as f: + f.write(content) + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor() + + # Upload documents + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=5, + file_extension="txt" + ) + + # Verify upload was called + assert mock_dataset.upload_documents_with_meta.called + assert total_docs == 1 + assert total_files == 1 + + # Verify uploaded document + call_args = mock_dataset.upload_documents_with_meta.call_args + docs = call_args[0][0] + assert len(docs) == 1 + assert 
docs[0]["title"] == f"test.{extension}" # Filename with extension + if isinstance(content, str): + assert docs[0]["content"] == content + + def test_upload_with_field_mapper(self, mock_rag, mock_dataset, temp_dir): + """Test uploading with field mapper for multi-doc format.""" + # Create JSON file with custom field names + test_data = [ + {"name": "Doc 1", "text": "Content 1", "docid": "1", "link": "http://example.com/1", "tag": "tag1,tag2"}, + {"name": "Doc 2", "text": "Content 2", "docid": "2", "link": "http://example.com/2", "tag": "tag3"} + ] + filepath = self._create_json_file(temp_dir, "test.json", test_data) + + # Create field mapper with custom mappings + field_mapper = FieldMapper( + title_field="name", + content_field="text", + doc_id_field="docid", + doc_url_field="link", + tags_field="tag" + ) + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor(field_mapper=field_mapper) + + # Upload documents + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=5, + file_extension="txt" + ) + + # Verify upload was called + assert mock_dataset.upload_documents_with_meta.called + assert total_docs == 2 + + # Verify field mapping + call_args = mock_dataset.upload_documents_with_meta.call_args + docs = call_args[0][0] + assert len(docs) == 2 + assert docs[0]["title"] == "Doc 1" + assert docs[0]["content"] == "Content 1" + assert docs[0]["metadata"]["doc_id"] == "1" + assert docs[0]["metadata"]["doc_url"] == "http://example.com/1" + assert docs[0]["metadata"]["tags"] == ["tag1", "tag2"] + + def test_snapshot_generation_and_resume(self, mock_rag, mock_dataset, temp_dir): + """Test snapshot generation and resuming from snapshot.""" + # Create JSON file with multiple documents + test_data = [ + {"title": f"Doc {i}", "content": f"Content {i}"} for i in range(10) + ] + filepath = self._create_json_file(temp_dir, "test.json", test_data) + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor() + + snapshot_file = os.path.join(temp_dir, "snapshot.json") + + # First upload - process all documents + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=5, + snapshot_file=snapshot_file, + resume=False, + file_extension="txt" + ) + + # Verify snapshot was created (but will be cleaned up after completion) + # Since upload completes successfully, snapshot is removed + # So we verify the upload worked correctly + assert total_docs == 10 + assert total_files == 1 + + # Verify all documents were uploaded + call_args_list = mock_dataset.upload_documents_with_meta.call_args_list + all_uploaded_docs = [] + for call_args in call_args_list: + docs = call_args[0][0] + all_uploaded_docs.extend(docs) + assert len(all_uploaded_docs) == 10 + + def test_snapshot_resume_partial_processing(self, mock_rag, mock_dataset, temp_dir): + """Test resuming from snapshot with partial file processing.""" + # Create JSON file with multiple documents + test_data = [ + {"title": f"Doc {i}", "content": f"Content {i}"} for i in range(10) + ] + filepath = self._create_json_file(temp_dir, "test.json", test_data) + + # Manually create a snapshot with partial progress + snapshot_file = os.path.join(temp_dir, "snapshot.json") + file_cursors = [FileCursor(file_path=filepath, doc_index=5)] + BatchUploader.save_snapshot(snapshot_file, file_cursors, 
total_processed=5, dataset_id="test_dataset_id") + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor() + + # Resume from snapshot + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=3, + snapshot_file=snapshot_file, + resume=True, + file_extension="txt" + ) + + # Verify that remaining documents were uploaded + # Should process documents starting from index 5 + assert mock_dataset.upload_documents_with_meta.called + + # Check that documents were resumed from index 5 + call_args_list = mock_dataset.upload_documents_with_meta.call_args_list + all_uploaded_docs = [] + for call_args in call_args_list: + docs = call_args[0][0] + all_uploaded_docs.extend(docs) + + # Should have uploaded remaining 5 documents (indices 5-9) + assert len(all_uploaded_docs) == 5 + assert all_uploaded_docs[0]["title"] == "Doc 5" + assert all_uploaded_docs[-1]["title"] == "Doc 9" + + @pytest.mark.parametrize("multi_doc_extensions,file_extension,expected_multi_doc", [ + # Test multi_doc_extensions configuration + (["json", "jsonl", "csv"], "json", True), + (["json", "jsonl", "csv"], "jsonl", True), + (["json", "jsonl", "csv"], "csv", True), + (["json", "jsonl", "csv"], "txt", False), # txt not in multi_doc_extensions + (["json"], "csv", False), # csv not in multi_doc_extensions when only json is specified + ([], "json", False), # Empty list means all are single-doc + ]) + def test_multi_doc_extensions_config(self, mock_rag, mock_dataset, temp_dir, + multi_doc_extensions, file_extension, expected_multi_doc): + """Test that multi_doc_extensions configuration is respected.""" + # Create test file based on extension + filename = f"test.{file_extension}" + + if file_extension == "json": + test_data = [ + {"title": "Doc 1", "content": "Content 1"}, + {"title": "Doc 2", "content": "Content 2"} + ] + filepath = self._create_json_file(temp_dir, filename, test_data) + elif file_extension == "jsonl": + test_data = [ + {"title": "Doc 1", "content": "Content 1"}, + {"title": "Doc 2", "content": "Content 2"} + ] + filepath = self._create_jsonl_file(temp_dir, filename, test_data) + elif file_extension == "csv": + test_data = [ + {"title": "Doc 1", "content": "Content 1"}, + {"title": "Doc 2", "content": "Content 2"} + ] + filepath = self._create_csv_file(temp_dir, filename, test_data) + else: # txt or other single-doc format + filepath = self._create_text_file(temp_dir, filename, "Single document content") + + # Create extractor with custom multi_doc_extensions + extractor = DocumentExtractor(multi_doc_extensions=multi_doc_extensions) + + # Verify file type detection + is_multi = extractor.file_reader.is_multi_document_format(filepath) + assert is_multi == expected_multi_doc, \ + f"Expected {file_extension} to be {'multi-doc' if expected_multi_doc else 'single-doc'} " \ + f"with multi_doc_extensions={multi_doc_extensions}, but got {is_multi}" + + # Create uploader + uploader = BatchUploader(mock_rag, mock_dataset) + + # Upload documents + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=5, + file_extension="txt" + ) + + # Verify upload was called + assert mock_dataset.upload_documents_with_meta.called + + if expected_multi_doc: + # Multi-doc files should yield multiple documents + assert total_docs >= 1 + else: + # Single-doc files should yield exactly 1 document + assert total_docs == 1 + 
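+    # Illustrative sketch only (not executed by these mocked tests): against a real
+    # RAGFlow instance the flow covered above is assumed to look roughly like the
+    # commented snippet below. RAGFlow(api_key=..., base_url=...) is the standard
+    # SDK client; the BatchUploader/DocumentExtractor arguments mirror the ones used
+    # with the mocks in this file and may differ in detail from the shipped tool.
+    #
+    #   from ragflow_sdk import RAGFlow
+    #   from ragflow_sdk.tools import BatchUploader, DocumentExtractor
+    #
+    #   rag = RAGFlow(api_key="<api key>", base_url="http://127.0.0.1:9380")
+    #   dataset = rag.list_datasets(id="<dataset id>")[0]
+    #   uploader = BatchUploader(rag, dataset)
+    #   extractor = DocumentExtractor(multi_doc_extensions=["json", "jsonl", "csv", "xlsx", "xls"])
+    #   total_docs, total_files = uploader.upload(
+    #       document_extractor=extractor,
+    #       data_dir="./data",
+    #       dataset_id=dataset.id,
+    #       batch_size=50,
+    #       snapshot_file="upload_snapshot.json",  # enables resume after interruption
+    #       resume=True,
+    #   )
+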
+ def test_batch_upload_with_field_mapper_snapshot(self, mock_rag, mock_dataset, temp_dir): + """Test batch upload with field mapper and snapshot for multi-doc format.""" + # Create JSON file with custom fields + test_data = [ + {"name": f"Doc {i}", "text": f"Content {i}", "id": str(i)} + for i in range(8) + ] + filepath = self._create_json_file(temp_dir, "test.json", test_data) + + # Create field mapper + field_mapper = FieldMapper( + title_field="name", + content_field="text", + doc_id_field="id" + ) + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor(field_mapper=field_mapper) + + snapshot_file = os.path.join(temp_dir, "snapshot.json") + + # Upload with batch_size=3 to create multiple batches + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=3, + snapshot_file=snapshot_file, + resume=False, + file_extension="txt" + ) + + # Verify all documents were uploaded + assert total_docs == 8 + assert total_files == 1 + + # Verify field mapping was applied + call_args_list = mock_dataset.upload_documents_with_meta.call_args_list + all_uploaded_docs = [] + for call_args in call_args_list: + docs = call_args[0][0] + all_uploaded_docs.extend(docs) + + assert len(all_uploaded_docs) == 8 + for i, doc in enumerate(all_uploaded_docs): + assert doc["title"] == f"Doc {i}" + assert doc["content"] == f"Content {i}" + assert doc["metadata"]["doc_id"] == str(i) + + # Note: snapshot file is cleaned up after successful completion + # This is expected behavior - snapshot is only kept for resume scenarios + + def test_snapshot_resume_with_field_mapper(self, mock_rag, mock_dataset, temp_dir): + """Test resuming from snapshot with field mapper for multi-doc format.""" + # Create JSON file with custom fields + test_data = [ + {"name": f"Doc {i}", "text": f"Content {i}", "id": str(i)} + for i in range(10) + ] + filepath = self._create_json_file(temp_dir, "test.json", test_data) + + # Create field mapper + field_mapper = FieldMapper( + title_field="name", + content_field="text", + doc_id_field="id" + ) + + # Manually create snapshot with partial progress + snapshot_file = os.path.join(temp_dir, "snapshot.json") + file_cursors = [FileCursor(file_path=filepath, doc_index=4)] + BatchUploader.save_snapshot(snapshot_file, file_cursors, total_processed=4, dataset_id="test_dataset_id") + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor(field_mapper=field_mapper) + + # Resume from snapshot + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=3, + snapshot_file=snapshot_file, + resume=True, + file_extension="txt" + ) + + # Verify remaining documents were uploaded with field mapping + assert mock_dataset.upload_documents_with_meta.called + + call_args_list = mock_dataset.upload_documents_with_meta.call_args_list + all_uploaded_docs = [] + for call_args in call_args_list: + docs = call_args[0][0] + all_uploaded_docs.extend(docs) + + # Should have uploaded remaining 6 documents (indices 4-9) + assert len(all_uploaded_docs) == 6 + assert all_uploaded_docs[0]["title"] == "Doc 4" + assert all_uploaded_docs[0]["metadata"]["doc_id"] == "4" + assert all_uploaded_docs[-1]["title"] == "Doc 9" + assert all_uploaded_docs[-1]["metadata"]["doc_id"] == "9" + + def test_upload_empty_file(self, mock_rag, mock_dataset, temp_dir): + 
"""Test uploading empty file.""" + # Create empty JSON file + filepath = self._create_json_file(temp_dir, "empty.json", []) + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor() + + # Upload documents + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=5, + file_extension="txt" + ) + + # Empty file should result in 0 documents + assert total_docs == 0 + # File was processed but had no documents, so no cursor is created + # fully_processed_files counts files with cursor > 0, so should be 0 + assert total_files == 0 + + def test_upload_multiple_files(self, mock_rag, mock_dataset, temp_dir): + """Test uploading multiple files.""" + # Create multiple JSON files + for i in range(3): + test_data = [ + {"title": f"Doc {i}-{j}", "content": f"Content {i}-{j}"} + for j in range(2) + ] + self._create_json_file(temp_dir, f"test_{i}.json", test_data) + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor() + + # Upload documents + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=5, + file_extension="txt" + ) + + # Verify all files were processed + assert total_docs == 6 # 3 files * 2 docs each + assert total_files == 3 + + # Verify all documents were uploaded + call_args_list = mock_dataset.upload_documents_with_meta.call_args_list + all_uploaded_docs = [] + for call_args in call_args_list: + docs = call_args[0][0] + all_uploaded_docs.extend(docs) + assert len(all_uploaded_docs) == 6 + + def test_snapshot_single_doc_file(self, mock_rag, mock_dataset, temp_dir): + """Test snapshot with single-document file.""" + # Create a single-doc text file + filepath = self._create_text_file(temp_dir, "test.txt", "Single document content") + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor() + + snapshot_file = os.path.join(temp_dir, "snapshot.json") + + # Upload document + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=5, + snapshot_file=snapshot_file, + resume=False, + file_extension="txt" + ) + + # Verify upload + assert total_docs == 1 + assert total_files == 1 + + # Manually create snapshot to test resume + file_cursors = [FileCursor(file_path=filepath, doc_index=1)] + BatchUploader.save_snapshot(snapshot_file, file_cursors, total_processed=1, dataset_id="test_dataset_id") + + # Reset mock + mock_dataset.upload_documents_with_meta.reset_mock() + + # Resume from snapshot - single-doc file should be skipped + total_docs_resume, total_files_resume = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=5, + snapshot_file=snapshot_file, + resume=True, + file_extension="txt" + ) + + # Single-doc file with doc_index > 0 should be skipped + # So no new uploads should occur + assert not mock_dataset.upload_documents_with_meta.called or total_docs_resume == 0 + + def test_snapshot_mixed_files(self, mock_rag, mock_dataset, temp_dir): + """Test snapshot with mixed single-doc and multi-doc files.""" + # Create a single-doc file + txt_file = self._create_text_file(temp_dir, "test.txt", "Single doc content") + + # Create a multi-doc JSON file + json_data = [ + {"title": f"Doc {i}", "content": 
f"Content {i}"} for i in range(5) + ] + json_file = self._create_json_file(temp_dir, "test.json", json_data) + + # Create uploader and extractor + uploader = BatchUploader(mock_rag, mock_dataset) + extractor = DocumentExtractor() + + snapshot_file = os.path.join(temp_dir, "snapshot.json") + + # Upload all documents + total_docs, total_files = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=3, + snapshot_file=snapshot_file, + resume=False, + file_extension="txt" + ) + + # Verify upload + assert total_docs == 6 # 1 single-doc + 5 multi-doc + assert total_files == 2 + + # Create partial snapshot: txt file processed, json file partially processed + file_cursors = [ + FileCursor(file_path=txt_file, doc_index=1), # Single-doc: processed + FileCursor(file_path=json_file, doc_index=3) # Multi-doc: 3 of 5 processed + ] + BatchUploader.save_snapshot(snapshot_file, file_cursors, total_processed=4, dataset_id="test_dataset_id") + + # Reset mock + mock_dataset.upload_documents_with_meta.reset_mock() + + # Resume from snapshot + total_docs_resume, total_files_resume = uploader.upload( + document_extractor=extractor, + data_dir=temp_dir, + dataset_id="test_dataset_id", + batch_size=3, + snapshot_file=snapshot_file, + resume=True, + file_extension="txt" + ) + + # Should resume json file from index 3, skip txt file + assert mock_dataset.upload_documents_with_meta.called + + # Verify remaining 2 documents from json file were uploaded + call_args_list = mock_dataset.upload_documents_with_meta.call_args_list + all_uploaded_docs = [] + for call_args in call_args_list: + docs = call_args[0][0] + all_uploaded_docs.extend(docs) + + assert len(all_uploaded_docs) == 2 # Remaining 2 docs from json file + assert all_uploaded_docs[0]["title"] == "Doc 3" + assert all_uploaded_docs[1]["title"] == "Doc 4" + diff --git a/sdk/python/test/test_tools/test_reparse_failed_documents.py b/sdk/python/test/test_tools/test_reparse_failed_documents.py new file mode 100644 index 000000000..c496080ed --- /dev/null +++ b/sdk/python/test/test_tools/test_reparse_failed_documents.py @@ -0,0 +1,353 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pytest +from unittest.mock import Mock, MagicMock, patch, call + +from ragflow_sdk.tools import FailedDocumentReparser + + +class TestFailedDocumentReparser: + """Unit tests for FailedDocumentReparser with mocked PowerRAG interfaces.""" + + @pytest.fixture + def mock_rag(self): + """Create a mock RAGFlow client.""" + rag = Mock() + rag.api_url = "http://test.com/api/v1" + rag.user_key = "test_key" + return rag + + @pytest.fixture + def mock_dataset(self, mock_rag): + """Create a mock DataSet.""" + dataset = Mock() + dataset.id = "test_dataset_id" + dataset.name = "test_dataset" + dataset.rag = mock_rag + + # Mock async_parse_documents + dataset.async_parse_documents = Mock() + + # Mock list_documents to return empty list by default + dataset.list_documents = Mock(return_value=[]) + + return dataset + + def test_init(self, mock_rag): + """Test FailedDocumentReparser initialization.""" + reparser = FailedDocumentReparser(mock_rag) + assert reparser.rag == mock_rag + assert reparser.dataset is None + + def test_init_with_dataset(self, mock_rag, mock_dataset): + """Test FailedDocumentReparser initialization with dataset.""" + reparser = FailedDocumentReparser(mock_rag, mock_dataset) + assert reparser.rag == mock_rag + assert reparser.dataset == mock_dataset + + def test_set_dataset(self, mock_rag, mock_dataset): + """Test set_dataset method.""" + reparser = FailedDocumentReparser(mock_rag) + reparser.set_dataset(mock_dataset) + assert reparser.dataset == mock_dataset + + def test_get_dataset(self, mock_rag, mock_dataset): + """Test get_dataset method.""" + mock_rag.list_datasets = Mock(return_value=[mock_dataset]) + + reparser = FailedDocumentReparser(mock_rag) + dataset = reparser.get_dataset("test_dataset_id") + + assert dataset == mock_dataset + assert reparser.dataset == mock_dataset + mock_rag.list_datasets.assert_called_once_with(id="test_dataset_id") + + def test_get_dataset_not_found(self, mock_rag): + """Test get_dataset when dataset is not found.""" + mock_rag.list_datasets = Mock(return_value=[]) + + reparser = FailedDocumentReparser(mock_rag) + + with pytest.raises(Exception) as excinfo: + reparser.get_dataset("non_existent_id") + + assert "Dataset with ID 'non_existent_id' not found" in str(excinfo.value) + + def test_reparse_failed_documents_no_failed(self, mock_rag, mock_dataset): + """Test reparse_failed_documents when there are no failed documents.""" + # Mock documents with all successful + mock_doc1 = Mock() + mock_doc1.id = "doc1" + mock_doc1.run = "DONE" + + mock_doc2 = Mock() + mock_doc2.id = "doc2" + mock_doc2.run = "DONE" + + mock_dataset.list_documents = Mock(return_value=[mock_doc1, mock_doc2]) + + reparser = FailedDocumentReparser(mock_rag, mock_dataset) + total_failed, total_reparsed = reparser.reparse_failed_documents() + + assert total_failed == 0 + assert total_reparsed == 0 + assert not mock_dataset.async_parse_documents.called + + def test_reparse_failed_documents_with_failed(self, mock_rag, mock_dataset): + """Test reparse_failed_documents with failed documents.""" + # Mock documents: some failed, some successful + mock_doc1 = Mock() + mock_doc1.id = "doc1" + mock_doc1.run = "FAIL" + + mock_doc2 = Mock() + mock_doc2.id = "doc2" + mock_doc2.run = "DONE" + + mock_doc3 = Mock() + mock_doc3.id = "doc3" + mock_doc3.run = "FAIL" + + mock_dataset.list_documents = Mock(return_value=[mock_doc1, mock_doc2, mock_doc3]) + + reparser = FailedDocumentReparser(mock_rag, mock_dataset) + total_failed, total_reparsed = 
reparser.reparse_failed_documents(reparse_batch_size=2) + + assert total_failed == 2 + assert total_reparsed == 2 + # Should be called once with batch of 2 failed documents + mock_dataset.async_parse_documents.assert_called_once_with(["doc1", "doc3"]) + + def test_reparse_failed_documents_batch_processing(self, mock_rag, mock_dataset): + """Test reparse_failed_documents with batch processing.""" + # Create 5 failed documents + failed_docs = [] + for i in range(5): + mock_doc = Mock() + mock_doc.id = f"doc{i}" + mock_doc.run = "FAIL" + failed_docs.append(mock_doc) + + mock_dataset.list_documents = Mock(return_value=failed_docs) + + reparser = FailedDocumentReparser(mock_rag, mock_dataset) + total_failed, total_reparsed = reparser.reparse_failed_documents(reparse_batch_size=2) + + assert total_failed == 5 + assert total_reparsed == 5 + # Should be called 3 times: 2 batches of 2, then 1 batch of 1 + assert mock_dataset.async_parse_documents.call_count == 3 + # Check batch contents + calls = mock_dataset.async_parse_documents.call_args_list + assert calls[0][0][0] == ["doc0", "doc1"] + assert calls[1][0][0] == ["doc2", "doc3"] + assert calls[2][0][0] == ["doc4"] + + def test_reparse_failed_documents_pagination(self, mock_rag, mock_dataset): + """Test reparse_failed_documents with pagination.""" + # First page: 2 failed documents + page1_docs = [] + for i in range(2): + mock_doc = Mock() + mock_doc.id = f"doc{i}" + mock_doc.run = "FAIL" + page1_docs.append(mock_doc) + + # Second page: 1 failed document + page2_docs = [] + mock_doc = Mock() + mock_doc.id = "doc2" + mock_doc.run = "FAIL" + page2_docs.append(mock_doc) + + # Third page: empty (end of pagination) + page3_docs = [] + + mock_dataset.list_documents = Mock(side_effect=[page1_docs, page2_docs, page3_docs]) + + reparser = FailedDocumentReparser(mock_rag, mock_dataset) + total_failed, total_reparsed = reparser.reparse_failed_documents(page_size=2) + + assert total_failed == 3 + assert total_reparsed == 3 + # Should fetch 2 pages: + # - page 1 returns a full page (2 docs) + # - page 2 returns a partial page (1 doc) and the implementation stops + assert mock_dataset.list_documents.call_count == 2 + # Check pagination parameters + calls = mock_dataset.list_documents.call_args_list + assert calls[0] == call(page=1, page_size=2, orderby="id", desc=False) + assert calls[1] == call(page=2, page_size=2, orderby="id", desc=False) + + def test_reparse_failed_documents_with_dataset_id(self, mock_rag, mock_dataset): + """Test reparse_failed_documents with dataset_id parameter.""" + mock_rag.list_datasets = Mock(return_value=[mock_dataset]) + + # Mock documents with failed ones + mock_doc = Mock() + mock_doc.id = "doc1" + mock_doc.run = "FAIL" + mock_dataset.list_documents = Mock(return_value=[mock_doc]) + + reparser = FailedDocumentReparser(mock_rag) + total_failed, total_reparsed = reparser.reparse_failed_documents(dataset_id="test_dataset_id") + + assert total_failed == 1 + assert total_reparsed == 1 + mock_rag.list_datasets.assert_called_once_with(id="test_dataset_id") + mock_dataset.async_parse_documents.assert_called_once_with(["doc1"]) + + def test_reparse_failed_documents_no_dataset_error(self, mock_rag): + """Test reparse_failed_documents raises error when no dataset is set.""" + reparser = FailedDocumentReparser(mock_rag) + + with pytest.raises(ValueError) as excinfo: + reparser.reparse_failed_documents() + + assert "Either dataset must be set or dataset_id must be provided" in str(excinfo.value) + + def 
test_reparse_failed_documents_invalid_batch_size(self, mock_rag, mock_dataset): + """Test reparse_failed_documents with invalid reparse_batch_size.""" + reparser = FailedDocumentReparser(mock_rag, mock_dataset) + + with pytest.raises(ValueError) as excinfo: + reparser.reparse_failed_documents(reparse_batch_size=0) + + assert "reparse_batch_size must be greater than 0" in str(excinfo.value) + + with pytest.raises(ValueError) as excinfo: + reparser.reparse_failed_documents(reparse_batch_size=-1) + + assert "reparse_batch_size must be greater than 0" in str(excinfo.value) + + def test_reparse_failed_documents_invalid_page_size(self, mock_rag, mock_dataset): + """Test reparse_failed_documents with invalid page_size.""" + reparser = FailedDocumentReparser(mock_rag, mock_dataset) + + with pytest.raises(ValueError) as excinfo: + reparser.reparse_failed_documents(page_size=0) + + assert "page_size must be greater than 0" in str(excinfo.value) + + with pytest.raises(ValueError) as excinfo: + reparser.reparse_failed_documents(page_size=-1) + + assert "page_size must be greater than 0" in str(excinfo.value) + + def test_reparse_failed_documents_case_insensitive_fail(self, mock_rag, mock_dataset): + """Test reparse_failed_documents handles case-insensitive FAIL status.""" + # Test different case variations + test_cases = ["fail", "FAIL", "Fail", "fAiL"] + + for run_status in test_cases: + mock_dataset.async_parse_documents.reset_mock() + + mock_doc = Mock() + mock_doc.id = "doc1" + mock_doc.run = run_status + mock_dataset.list_documents = Mock(return_value=[mock_doc]) + + reparser = FailedDocumentReparser(mock_rag, mock_dataset) + total_failed, total_reparsed = reparser.reparse_failed_documents() + + assert total_failed == 1 + assert total_reparsed == 1 + mock_dataset.async_parse_documents.assert_called_once_with(["doc1"]) + + def test_reparse_failed_documents_retry_on_error(self, mock_rag, mock_dataset): + """Test reparse_failed_documents retries on error.""" + mock_doc = Mock() + mock_doc.id = "doc1" + mock_doc.run = "FAIL" + mock_dataset.list_documents = Mock(return_value=[mock_doc]) + + # First call raises exception, second call succeeds + mock_dataset.async_parse_documents = Mock(side_effect=[Exception("Network error"), None]) + + reparser = FailedDocumentReparser(mock_rag, mock_dataset) + + # Should retry and eventually succeed + total_failed, total_reparsed = reparser.reparse_failed_documents(reparse_batch_size=1) + + assert total_failed == 1 + assert total_reparsed == 1 + # Should be called twice (initial + retry) + assert mock_dataset.async_parse_documents.call_count == 2 + + def test_reparse_failed_documents_mixed_status(self, mock_rag, mock_dataset): + """Test reparse_failed_documents with mixed document statuses.""" + # Create documents with various statuses + mock_doc_fail = Mock() + mock_doc_fail.id = "doc_fail" + mock_doc_fail.run = "FAIL" + + mock_doc_done = Mock() + mock_doc_done.id = "doc_done" + mock_doc_done.run = "DONE" + + mock_doc_cancel = Mock() + mock_doc_cancel.id = "doc_cancel" + mock_doc_cancel.run = "CANCEL" + + mock_doc_fail2 = Mock() + mock_doc_fail2.id = "doc_fail2" + mock_doc_fail2.run = "FAIL" + + mock_dataset.list_documents = Mock(return_value=[ + mock_doc_fail, mock_doc_done, mock_doc_cancel, mock_doc_fail2 + ]) + + reparser = FailedDocumentReparser(mock_rag, mock_dataset) + total_failed, total_reparsed = reparser.reparse_failed_documents() + + assert total_failed == 2 + assert total_reparsed == 2 + # Should only reparse failed documents + 
mock_dataset.async_parse_documents.assert_called_once_with(["doc_fail", "doc_fail2"]) + + def test_retry_with_backoff(self, mock_rag): + """Test retry_with_backoff static method.""" + call_count = [0] + + def failing_func(): + call_count[0] += 1 + if call_count[0] < 3: + raise Exception("Test error") + return "success" + + wrapped_func = FailedDocumentReparser.retry_with_backoff(failing_func, max_retries=5, max_backoff=1) + result = wrapped_func() + + assert result == "success" + assert call_count[0] == 3 + + def test_retry_with_backoff_max_retries(self, mock_rag): + """Test retry_with_backoff exhausts max retries.""" + call_count = [0] + + def always_failing_func(): + call_count[0] += 1 + raise Exception("Test error") + + wrapped_func = FailedDocumentReparser.retry_with_backoff(always_failing_func, max_retries=3, max_backoff=0.01) + + with pytest.raises(Exception) as excinfo: + wrapped_func() + + assert "Test error" in str(excinfo.value) + assert call_count[0] == 3 + diff --git a/sdk/python/uv.lock b/sdk/python/uv.lock index 9712e4292..4ef930544 100644 --- a/sdk/python/uv.lock +++ b/sdk/python/uv.lock @@ -1,6 +1,11 @@ version = 1 revision = 3 requires-python = ">=3.10, <3.13" +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version < '3.11'", +] [[package]] name = "attrs" @@ -95,6 +100,58 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "coverage" +version = "7.13.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/23/f9/e92df5e07f3fc8d4c7f9a0f146ef75446bf870351cd37b788cf5897f8079/coverage-7.13.1.tar.gz", hash = "sha256:b7593fe7eb5feaa3fbb461ac79aac9f9fc0387a5ca8080b0c6fe2ca27b091afd", size = 825862, upload-time = "2025-12-28T15:42:56.969Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2d/9a/3742e58fd04b233df95c012ee9f3dfe04708a5e1d32613bd2d47d4e1be0d/coverage-7.13.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e1fa280b3ad78eea5be86f94f461c04943d942697e0dac889fa18fff8f5f9147", size = 218633, upload-time = "2025-12-28T15:40:10.165Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7e/45/7e6bdc94d89cd7c8017ce735cf50478ddfe765d4fbf0c24d71d30ea33d7a/coverage-7.13.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c3d8c679607220979434f494b139dfb00131ebf70bb406553d69c1ff01a5c33d", size = 219147, upload-time = "2025-12-28T15:40:12.069Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f7/38/0d6a258625fd7f10773fe94097dc16937a5f0e3e0cdf3adef67d3ac6baef/coverage-7.13.1-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:339dc63b3eba969067b00f41f15ad161bf2946613156fb131266d8debc8e44d0", size = 245894, upload-time = "2025-12-28T15:40:13.556Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/27/58/409d15ea487986994cbd4d06376e9860e9b157cfbfd402b1236770ab8dd2/coverage-7.13.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:db622b999ffe49cb891f2fff3b340cdc2f9797d01a0a202a0973ba2562501d90", size = 247721, upload-time = "2025-12-28T15:40:15.37Z" }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/da/bf/6e8056a83fd7a96c93341f1ffe10df636dd89f26d5e7b9ca511ce3bcf0df/coverage-7.13.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1443ba9acbb593fa7c1c29e011d7c9761545fe35e7652e85ce7f51a16f7e08d", size = 249585, upload-time = "2025-12-28T15:40:17.226Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f4/15/e1daff723f9f5959acb63cbe35b11203a9df77ee4b95b45fffd38b318390/coverage-7.13.1-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c832ec92c4499ac463186af72f9ed4d8daec15499b16f0a879b0d1c8e5cf4a3b", size = 246597, upload-time = "2025-12-28T15:40:19.028Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/74/a6/1efd31c5433743a6ddbc9d37ac30c196bb07c7eab3d74fbb99b924c93174/coverage-7.13.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:562ec27dfa3f311e0db1ba243ec6e5f6ab96b1edfcfc6cf86f28038bc4961ce6", size = 247626, upload-time = "2025-12-28T15:40:20.846Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6d/9f/1609267dd3e749f57fdd66ca6752567d1c13b58a20a809dc409b263d0b5f/coverage-7.13.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:4de84e71173d4dada2897e5a0e1b7877e5eefbfe0d6a44edee6ce31d9b8ec09e", size = 245629, upload-time = "2025-12-28T15:40:22.397Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e2/f6/6815a220d5ec2466383d7cc36131b9fa6ecbe95c50ec52a631ba733f306a/coverage-7.13.1-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:a5a68357f686f8c4d527a2dc04f52e669c2fc1cbde38f6f7eb6a0e58cbd17cae", size = 245901, upload-time = "2025-12-28T15:40:23.836Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ac/58/40576554cd12e0872faf6d2c0eb3bc85f71d78427946ddd19ad65201e2c0/coverage-7.13.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:77cc258aeb29a3417062758975521eae60af6f79e930d6993555eeac6a8eac29", size = 246505, upload-time = "2025-12-28T15:40:25.421Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3b/77/9233a90253fba576b0eee81707b5781d0e21d97478e5377b226c5b096c0f/coverage-7.13.1-cp310-cp310-win32.whl", hash = "sha256:bb4f8c3c9a9f34423dba193f241f617b08ffc63e27f67159f60ae6baf2dcfe0f", size = 221257, upload-time = "2025-12-28T15:40:27.217Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e0/43/e842ff30c1a0a623ec80db89befb84a3a7aad7bfe44a6ea77d5a3e61fedd/coverage-7.13.1-cp310-cp310-win_amd64.whl", hash = "sha256:c8e2706ceb622bc63bac98ebb10ef5da80ed70fbd8a7999a5076de3afaef0fb1", size = 222191, upload-time = "2025-12-28T15:40:28.916Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b4/9b/77baf488516e9ced25fc215a6f75d803493fc3f6a1a1227ac35697910c2a/coverage-7.13.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a55d509a1dc5a5b708b5dad3b5334e07a16ad4c2185e27b40e4dba796ab7f88", size = 218755, upload-time = "2025-12-28T15:40:30.812Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d7/cd/7ab01154e6eb79ee2fab76bf4d89e94c6648116557307ee4ebbb85e5c1bf/coverage-7.13.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4d010d080c4888371033baab27e47c9df7d6fb28d0b7b7adf85a4a49be9298b3", size = 219257, upload-time = "2025-12-28T15:40:32.333Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/01/d5/b11ef7863ffbbdb509da0023fad1e9eda1c0eaea61a6d2ea5b17d4ac706e/coverage-7.13.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d938b4a840fb1523b9dfbbb454f652967f18e197569c32266d4d13f37244c3d9", size = 249657, upload-time = "2025-12-28T15:40:34.1Z" }, + { 
url = "https://pypi.tuna.tsinghua.edu.cn/packages/f7/7c/347280982982383621d29b8c544cf497ae07ac41e44b1ca4903024131f55/coverage-7.13.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bf100a3288f9bb7f919b87eb84f87101e197535b9bd0e2c2b5b3179633324fee", size = 251581, upload-time = "2025-12-28T15:40:36.131Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/82/f6/ebcfed11036ade4c0d75fa4453a6282bdd225bc073862766eec184a4c643/coverage-7.13.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ef6688db9bf91ba111ae734ba6ef1a063304a881749726e0d3575f5c10a9facf", size = 253691, upload-time = "2025-12-28T15:40:37.626Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/02/92/af8f5582787f5d1a8b130b2dcba785fa5e9a7a8e121a0bb2220a6fdbdb8a/coverage-7.13.1-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0b609fc9cdbd1f02e51f67f51e5aee60a841ef58a68d00d5ee2c0faf357481a3", size = 249799, upload-time = "2025-12-28T15:40:39.47Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/24/aa/0e39a2a3b16eebf7f193863323edbff38b6daba711abaaf807d4290cf61a/coverage-7.13.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c43257717611ff5e9a1d79dce8e47566235ebda63328718d9b65dd640bc832ef", size = 251389, upload-time = "2025-12-28T15:40:40.954Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/73/46/7f0c13111154dc5b978900c0ccee2e2ca239b910890e674a77f1363d483e/coverage-7.13.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e09fbecc007f7b6afdfb3b07ce5bd9f8494b6856dd4f577d26c66c391b829851", size = 249450, upload-time = "2025-12-28T15:40:42.489Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ac/ca/e80da6769e8b669ec3695598c58eef7ad98b0e26e66333996aee6316db23/coverage-7.13.1-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:a03a4f3a19a189919c7055098790285cc5c5b0b3976f8d227aea39dbf9f8bfdb", size = 249170, upload-time = "2025-12-28T15:40:44.279Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/af/18/9e29baabdec1a8644157f572541079b4658199cfd372a578f84228e860de/coverage-7.13.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3820778ea1387c2b6a818caec01c63adc5b3750211af6447e8dcfb9b6f08dbba", size = 250081, upload-time = "2025-12-28T15:40:45.748Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/00/f8/c3021625a71c3b2f516464d322e41636aea381018319050a8114105872ee/coverage-7.13.1-cp311-cp311-win32.whl", hash = "sha256:ff10896fa55167371960c5908150b434b71c876dfab97b69478f22c8b445ea19", size = 221281, upload-time = "2025-12-28T15:40:47.232Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/27/56/c216625f453df6e0559ed666d246fcbaaa93f3aa99eaa5080cea1229aa3d/coverage-7.13.1-cp311-cp311-win_amd64.whl", hash = "sha256:a998cc0aeeea4c6d5622a3754da5a493055d2d95186bad877b0a34ea6e6dbe0a", size = 222215, upload-time = "2025-12-28T15:40:49.19Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5c/9a/be342e76f6e531cae6406dc46af0d350586f24d9b67fdfa6daee02df71af/coverage-7.13.1-cp311-cp311-win_arm64.whl", hash = "sha256:fea07c1a39a22614acb762e3fbbb4011f65eedafcb2948feeef641ac78b4ee5c", size = 220886, upload-time = "2025-12-28T15:40:51.067Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ce/8a/87af46cccdfa78f53db747b09f5f9a21d5fc38d796834adac09b30a8ce74/coverage-7.13.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6f34591000f06e62085b1865c9bc5f7858df748834662a51edadfd2c3bfe0dd3", size = 218927, upload-time = "2025-12-28T15:40:52.814Z" }, 
+ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/82/a8/6e22fdc67242a4a5a153f9438d05944553121c8f4ba70cb072af4c41362e/coverage-7.13.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b67e47c5595b9224599016e333f5ec25392597a89d5744658f837d204e16c63e", size = 219288, upload-time = "2025-12-28T15:40:54.262Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d0/0a/853a76e03b0f7c4375e2ca025df45c918beb367f3e20a0a8e91967f6e96c/coverage-7.13.1-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:3e7b8bd70c48ffb28461ebe092c2345536fb18bbbf19d287c8913699735f505c", size = 250786, upload-time = "2025-12-28T15:40:56.059Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ea/b4/694159c15c52b9f7ec7adf49d50e5f8ee71d3e9ef38adb4445d13dd56c20/coverage-7.13.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c223d078112e90dc0e5c4e35b98b9584164bea9fbbd221c0b21c5241f6d51b62", size = 253543, upload-time = "2025-12-28T15:40:57.585Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/96/b2/7f1f0437a5c855f87e17cf5d0dc35920b6440ff2b58b1ba9788c059c26c8/coverage-7.13.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:794f7c05af0763b1bbd1b9e6eff0e52ad068be3b12cd96c87de037b01390c968", size = 254635, upload-time = "2025-12-28T15:40:59.443Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e9/d1/73c3fdb8d7d3bddd9473c9c6a2e0682f09fc3dfbcb9c3f36412a7368bcab/coverage-7.13.1-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0642eae483cc8c2902e4af7298bf886d605e80f26382124cddc3967c2a3df09e", size = 251202, upload-time = "2025-12-28T15:41:01.328Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/66/3c/f0edf75dcc152f145d5598329e864bbbe04ab78660fe3e8e395f9fff010f/coverage-7.13.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9f5e772ed5fef25b3de9f2008fe67b92d46831bd2bc5bdc5dd6bfd06b83b316f", size = 252566, upload-time = "2025-12-28T15:41:03.319Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/17/b3/e64206d3c5f7dcbceafd14941345a754d3dbc78a823a6ed526e23b9cdaab/coverage-7.13.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:45980ea19277dc0a579e432aef6a504fe098ef3a9032ead15e446eb0f1191aee", size = 250711, upload-time = "2025-12-28T15:41:06.411Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dc/ad/28a3eb970a8ef5b479ee7f0c484a19c34e277479a5b70269dc652b730733/coverage-7.13.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:e4f18eca6028ffa62adbd185a8f1e1dd242f2e68164dba5c2b74a5204850b4cf", size = 250278, upload-time = "2025-12-28T15:41:08.285Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/54/e3/c8f0f1a93133e3e1291ca76cbb63565bd4b5c5df63b141f539d747fff348/coverage-7.13.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f8dca5590fec7a89ed6826fce625595279e586ead52e9e958d3237821fbc750c", size = 252154, upload-time = "2025-12-28T15:41:09.969Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d0/bf/9939c5d6859c380e405b19e736321f1c7d402728792f4c752ad1adcce005/coverage-7.13.1-cp312-cp312-win32.whl", hash = "sha256:ff86d4e85188bba72cfb876df3e11fa243439882c55957184af44a35bd5880b7", size = 221487, upload-time = "2025-12-28T15:41:11.468Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fa/dc/7282856a407c621c2aad74021680a01b23010bb8ebf427cf5eacda2e876f/coverage-7.13.1-cp312-cp312-win_amd64.whl", hash = "sha256:16cc1da46c04fb0fb128b4dc430b78fa2aba8a6c0c9f8eb391fd5103409a6ac6", size = 
222299, upload-time = "2025-12-28T15:41:13.386Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/10/79/176a11203412c350b3e9578620013af35bcdb79b651eb976f4a4b32044fa/coverage-7.13.1-cp312-cp312-win_arm64.whl", hash = "sha256:8d9bc218650022a768f3775dd7fdac1886437325d8d295d923ebcfef4892ad5c", size = 220941, upload-time = "2025-12-28T15:41:14.975Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/cc/48/d9f421cb8da5afaa1a64570d9989e00fb7955e6acddc5a12979f7666ef60/coverage-7.13.1-py3-none-any.whl", hash = "sha256:2016745cb3ba554469d02819d78958b571792bb68e31302610e898f80dd3a573", size = 210722, upload-time = "2025-12-28T15:42:54.901Z" }, +] + +[package.optional-dependencies] +toml = [ + { name = "tomli", marker = "python_full_version <= '3.11'" }, +] + [[package]] name = "et-xmlfile" version = "2.0.0" @@ -109,13 +166,22 @@ name = "exceptiongroup" version = "1.3.0" source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } dependencies = [ - { name = "typing-extensions" }, + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" }, ] +[[package]] +name = "execnet" +version = "2.1.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bf/89/780e11f9588d9e7128a3f87788354c7946a9cbb1401ad38a48c4db9a4f07/execnet-2.1.2.tar.gz", hash = "sha256:63d83bfdd9a23e35b9c6a3261412324f964c2ec8dcd8d3c6916ee9373e0befcd", size = 166622, upload-time = "2025-11-12T09:56:37.75Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ab/84/02fc1827e8cdded4aa65baef11296a9bbe595c474f0d6d758af082d849fd/execnet-2.1.2-py3-none-any.whl", hash = "sha256:67fba928dd5a544b783f6056f449e5e3931a5c378b128bc18501f7ea79e296ec", size = 40708, upload-time = "2025-11-12T09:56:36.333Z" }, +] + [[package]] name = "hypothesis" version = "6.142.3" @@ -218,6 +284,92 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6c/77/d7f491cbc05303ac6801651aabeb262d43f319288c1ea96c66b1d2692ff3/lxml-6.0.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:27220da5be049e936c3aca06f174e8827ca6445a4353a1995584311487fc4e3e", size = 3518768, upload-time = "2025-09-22T04:04:57.097Z" }, ] +[[package]] +name = "numpy" +version = "2.2.6" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +resolution-markers = [ + "python_full_version < '3.11'", +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9a/3e/ed6db5be21ce87955c0cbd3009f2803f59fa08df21b5df06862e2d8e2bdd/numpy-2.2.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b412caa66f72040e6d268491a59f2c43bf03eb6c96dd8f0307829feb7fa2b6fb", size = 21165245, 
upload-time = "2025-05-17T21:27:58.555Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/22/c2/4b9221495b2a132cc9d2eb862e21d42a009f5a60e45fc44b00118c174bff/numpy-2.2.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e41fd67c52b86603a91c1a505ebaef50b3314de0213461c7a6e99c9a3beff90", size = 14360048, upload-time = "2025-05-17T21:28:21.406Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fd/77/dc2fcfc66943c6410e2bf598062f5959372735ffda175b39906d54f02349/numpy-2.2.6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:37e990a01ae6ec7fe7fa1c26c55ecb672dd98b19c3d0e1d1f326fa13cb38d163", size = 5340542, upload-time = "2025-05-17T21:28:30.931Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7a/4f/1cb5fdc353a5f5cc7feb692db9b8ec2c3d6405453f982435efc52561df58/numpy-2.2.6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:5a6429d4be8ca66d889b7cf70f536a397dc45ba6faeb5f8c5427935d9592e9cf", size = 6878301, upload-time = "2025-05-17T21:28:41.613Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/eb/17/96a3acd228cec142fcb8723bd3cc39c2a474f7dcf0a5d16731980bcafa95/numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efd28d4e9cd7d7a8d39074a4d44c63eda73401580c5c76acda2ce969e0a38e83", size = 14297320, upload-time = "2025-05-17T21:29:02.78Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b4/63/3de6a34ad7ad6646ac7d2f55ebc6ad439dbbf9c4370017c50cf403fb19b5/numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc7b73d02efb0e18c000e9ad8b83480dfcd5dfd11065997ed4c6747470ae8915", size = 16801050, upload-time = "2025-05-17T21:29:27.675Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/07/b6/89d837eddef52b3d0cec5c6ba0456c1bf1b9ef6a6672fc2b7873c3ec4e2e/numpy-2.2.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:74d4531beb257d2c3f4b261bfb0fc09e0f9ebb8842d82a7b4209415896adc680", size = 15807034, upload-time = "2025-05-17T21:29:51.102Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/01/c8/dc6ae86e3c61cfec1f178e5c9f7858584049b6093f843bca541f94120920/numpy-2.2.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8fc377d995680230e83241d8a96def29f204b5782f371c532579b4f20607a289", size = 18614185, upload-time = "2025-05-17T21:30:18.703Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5b/c5/0064b1b7e7c89137b471ccec1fd2282fceaae0ab3a9550f2568782d80357/numpy-2.2.6-cp310-cp310-win32.whl", hash = "sha256:b093dd74e50a8cba3e873868d9e93a85b78e0daf2e98c6797566ad8044e8363d", size = 6527149, upload-time = "2025-05-17T21:30:29.788Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a3/dd/4b822569d6b96c39d1215dbae0582fd99954dcbcf0c1a13c61783feaca3f/numpy-2.2.6-cp310-cp310-win_amd64.whl", hash = "sha256:f0fd6321b839904e15c46e0d257fdd101dd7f530fe03fd6359c1ea63738703f3", size = 12904620, upload-time = "2025-05-17T21:30:48.994Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/da/a8/4f83e2aa666a9fbf56d6118faaaf5f1974d456b1823fda0a176eff722839/numpy-2.2.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f9f1adb22318e121c5c69a09142811a201ef17ab257a1e66ca3025065b7f53ae", size = 21176963, upload-time = "2025-05-17T21:31:19.36Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b3/2b/64e1affc7972decb74c9e29e5649fac940514910960ba25cd9af4488b66c/numpy-2.2.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c820a93b0255bc360f53eca31a0e676fd1101f673dda8da93454a12e23fc5f7a", size = 14406743, upload-time = "2025-05-17T21:31:41.087Z" }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/4a/9f/0121e375000b5e50ffdd8b25bf78d8e1a5aa4cca3f185d41265198c7b834/numpy-2.2.6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3d70692235e759f260c3d837193090014aebdf026dfd167834bcba43e30c2a42", size = 5352616, upload-time = "2025-05-17T21:31:50.072Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/31/0d/b48c405c91693635fbe2dcd7bc84a33a602add5f63286e024d3b6741411c/numpy-2.2.6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:481b49095335f8eed42e39e8041327c05b0f6f4780488f61286ed3c01368d491", size = 6889579, upload-time = "2025-05-17T21:32:01.712Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/52/b8/7f0554d49b565d0171eab6e99001846882000883998e7b7d9f0d98b1f934/numpy-2.2.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b64d8d4d17135e00c8e346e0a738deb17e754230d7e0810ac5012750bbd85a5a", size = 14312005, upload-time = "2025-05-17T21:32:23.332Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b3/dd/2238b898e51bd6d389b7389ffb20d7f4c10066d80351187ec8e303a5a475/numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba10f8411898fc418a521833e014a77d3ca01c15b0c6cdcce6a0d2897e6dbbdf", size = 16821570, upload-time = "2025-05-17T21:32:47.991Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/83/6c/44d0325722cf644f191042bf47eedad61c1e6df2432ed65cbe28509d404e/numpy-2.2.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bd48227a919f1bafbdda0583705e547892342c26fb127219d60a5c36882609d1", size = 15818548, upload-time = "2025-05-17T21:33:11.728Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ae/9d/81e8216030ce66be25279098789b665d49ff19eef08bfa8cb96d4957f422/numpy-2.2.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:9551a499bf125c1d4f9e250377c1ee2eddd02e01eac6644c080162c0c51778ab", size = 18620521, upload-time = "2025-05-17T21:33:39.139Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6a/fd/e19617b9530b031db51b0926eed5345ce8ddc669bb3bc0044b23e275ebe8/numpy-2.2.6-cp311-cp311-win32.whl", hash = "sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47", size = 6525866, upload-time = "2025-05-17T21:33:50.273Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/31/0a/f354fb7176b81747d870f7991dc763e157a934c717b67b58456bc63da3df/numpy-2.2.6-cp311-cp311-win_amd64.whl", hash = "sha256:e8213002e427c69c45a52bbd94163084025f533a55a59d6f9c5b820774ef3303", size = 12907455, upload-time = "2025-05-17T21:34:09.135Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/82/5d/c00588b6cf18e1da539b45d3598d3557084990dcc4331960c15ee776ee41/numpy-2.2.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41c5a21f4a04fa86436124d388f6ed60a9343a6f767fced1a8a71c3fbca038ff", size = 20875348, upload-time = "2025-05-17T21:34:39.648Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/66/ee/560deadcdde6c2f90200450d5938f63a34b37e27ebff162810f716f6a230/numpy-2.2.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:de749064336d37e340f640b05f24e9e3dd678c57318c7289d222a8a2f543e90c", size = 14119362, upload-time = "2025-05-17T21:35:01.241Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3c/65/4baa99f1c53b30adf0acd9a5519078871ddde8d2339dc5a7fde80d9d87da/numpy-2.2.6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:894b3a42502226a1cac872f840030665f33326fc3dac8e57c607905773cdcde3", size = 5084103, upload-time = "2025-05-17T21:35:10.622Z" }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/cc/89/e5a34c071a0570cc40c9a54eb472d113eea6d002e9ae12bb3a8407fb912e/numpy-2.2.6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:71594f7c51a18e728451bb50cc60a3ce4e6538822731b2933209a1f3614e9282", size = 6625382, upload-time = "2025-05-17T21:35:21.414Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f8/35/8c80729f1ff76b3921d5c9487c7ac3de9b2a103b1cd05e905b3090513510/numpy-2.2.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2618db89be1b4e05f7a1a847a9c1c0abd63e63a1607d892dd54668dd92faf87", size = 14018462, upload-time = "2025-05-17T21:35:42.174Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8c/3d/1e1db36cfd41f895d266b103df00ca5b3cbe965184df824dec5c08c6b803/numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd83c01228a688733f1ded5201c678f0c53ecc1006ffbc404db9f7a899ac6249", size = 16527618, upload-time = "2025-05-17T21:36:06.711Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/61/c6/03ed30992602c85aa3cd95b9070a514f8b3c33e31124694438d88809ae36/numpy-2.2.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:37c0ca431f82cd5fa716eca9506aefcabc247fb27ba69c5062a6d3ade8cf8f49", size = 15505511, upload-time = "2025-05-17T21:36:29.965Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b7/25/5761d832a81df431e260719ec45de696414266613c9ee268394dd5ad8236/numpy-2.2.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fe27749d33bb772c80dcd84ae7e8df2adc920ae8297400dabec45f0dedb3f6de", size = 18313783, upload-time = "2025-05-17T21:36:56.883Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/57/0a/72d5a3527c5ebffcd47bde9162c39fae1f90138c961e5296491ce778e682/numpy-2.2.6-cp312-cp312-win32.whl", hash = "sha256:4eeaae00d789f66c7a25ac5f34b71a7035bb474e679f410e5e1a94deb24cf2d4", size = 6246506, upload-time = "2025-05-17T21:37:07.368Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/36/fa/8c9210162ca1b88529ab76b41ba02d433fd54fecaf6feb70ef9f124683f1/numpy-2.2.6-cp312-cp312-win_amd64.whl", hash = "sha256:c1f9540be57940698ed329904db803cf7a402f3fc200bfe599334c9bd84a40b2", size = 12614190, upload-time = "2025-05-17T21:37:26.213Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9e/3b/d94a75f4dbf1ef5d321523ecac21ef23a3cd2ac8b78ae2aac40873590229/numpy-2.2.6-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0b605b275d7bd0c640cad4e5d30fa701a8d59302e127e5f79138ad62762c3e3d", size = 21040391, upload-time = "2025-05-17T21:44:35.948Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/17/f4/09b2fa1b58f0fb4f7c7963a1649c64c4d315752240377ed74d9cd878f7b5/numpy-2.2.6-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:7befc596a7dc9da8a337f79802ee8adb30a552a94f792b9c9d18c840055907db", size = 6786754, upload-time = "2025-05-17T21:44:47.446Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/af/30/feba75f143bdc868a1cc3f44ccfa6c4b9ec522b36458e738cd00f67b573f/numpy-2.2.6-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce47521a4754c8f4593837384bd3424880629f718d87c5d44f8ed763edd63543", size = 16643476, upload-time = "2025-05-17T21:45:11.871Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/37/48/ac2a9584402fb6c0cd5b5d1a91dcf176b15760130dd386bbafdbfe3640bf/numpy-2.2.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d042d24c90c41b54fd506da306759e06e568864df8ec17ccc17e9e884634fd00", size = 12812666, upload-time = "2025-05-17T21:45:31.426Z" }, +] + +[[package]] +name = "numpy" +version = 
"2.4.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a4/7a/6a3d14e205d292b738db449d0de649b373a59edb0d0b4493821d0a3e8718/numpy-2.4.0.tar.gz", hash = "sha256:6e504f7b16118198f138ef31ba24d985b124c2c469fe8467007cf30fd992f934", size = 20685720, upload-time = "2025-12-20T16:18:19.023Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/26/7e/7bae7cbcc2f8132271967aa03e03954fc1e48aa1f3bf32b29ca95fbef352/numpy-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:316b2f2584682318539f0bcaca5a496ce9ca78c88066579ebd11fd06f8e4741e", size = 16940166, upload-time = "2025-12-20T16:15:43.434Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0f/27/6c13f5b46776d6246ec884ac5817452672156a506d08a1f2abb39961930a/numpy-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2718c1de8504121714234b6f8241d0019450353276c88b9453c9c3d92e101db", size = 12641781, upload-time = "2025-12-20T16:15:45.701Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/14/1c/83b4998d4860d15283241d9e5215f28b40ac31f497c04b12fa7f428ff370/numpy-2.4.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:21555da4ec4a0c942520ead42c3b0dc9477441e085c42b0fbdd6a084869a6f6b", size = 5470247, upload-time = "2025-12-20T16:15:47.943Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/54/08/cbce72c835d937795571b0464b52069f869c9e78b0c076d416c5269d2718/numpy-2.4.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:413aa561266a4be2d06cd2b9665e89d9f54c543f418773076a76adcf2af08bc7", size = 6799807, upload-time = "2025-12-20T16:15:49.795Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ff/be/2e647961cd8c980591d75cdcd9e8f647d69fbe05e2a25613dc0a2ea5fb1a/numpy-2.4.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0feafc9e03128074689183031181fac0897ff169692d8492066e949041096548", size = 14701992, upload-time = "2025-12-20T16:15:51.615Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a2/fb/e1652fb8b6fd91ce6ed429143fe2e01ce714711e03e5b762615e7b36172c/numpy-2.4.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a8fdfed3deaf1928fb7667d96e0567cdf58c2b370ea2ee7e586aa383ec2cb346", size = 16646871, upload-time = "2025-12-20T16:15:54.129Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/62/23/d841207e63c4322842f7cd042ae981cffe715c73376dcad8235fb31debf1/numpy-2.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e06a922a469cae9a57100864caf4f8a97a1026513793969f8ba5b63137a35d25", size = 16487190, upload-time = "2025-12-20T16:15:56.147Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bc/a0/6a842c8421ebfdec0a230e65f61e0dabda6edbef443d999d79b87c273965/numpy-2.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:927ccf5cd17c48f801f4ed43a7e5673a2724bd2171460be3e3894e6e332ef83a", size = 18580762, upload-time = "2025-12-20T16:15:58.524Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0a/d1/c79e0046641186f2134dde05e6181825b911f8bdcef31b19ddd16e232847/numpy-2.4.0-cp311-cp311-win32.whl", hash = "sha256:882567b7ae57c1b1a0250208cc21a7976d8cbcc49d5a322e607e6f09c9e0bd53", size = 6233359, upload-time = "2025-12-20T16:16:00.938Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fc/f0/74965001d231f28184d6305b8cdc1b6fcd4bf23033f6cb039cfe76c9fca7/numpy-2.4.0-cp311-cp311-win_amd64.whl", hash = 
"sha256:8b986403023c8f3bf8f487c2e6186afda156174d31c175f747d8934dfddf3479", size = 12601132, upload-time = "2025-12-20T16:16:02.484Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/65/32/55408d0f46dfebce38017f5bd931affa7256ad6beac1a92a012e1fbc67a7/numpy-2.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:3f3096405acc48887458bbf9f6814d43785ac7ba2a57ea6442b581dedbc60ce6", size = 10573977, upload-time = "2025-12-20T16:16:04.77Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8b/ff/f6400ffec95de41c74b8e73df32e3fff1830633193a7b1e409be7fb1bb8c/numpy-2.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2a8b6bb8369abefb8bd1801b054ad50e02b3275c8614dc6e5b0373c305291037", size = 16653117, upload-time = "2025-12-20T16:16:06.709Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fd/28/6c23e97450035072e8d830a3c411bf1abd1f42c611ff9d29e3d8f55c6252/numpy-2.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e284ca13d5a8367e43734148622caf0b261b275673823593e3e3634a6490f83", size = 12369711, upload-time = "2025-12-20T16:16:08.758Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/bc/af/acbef97b630ab1bb45e6a7d01d1452e4251aa88ce680ac36e56c272120ec/numpy-2.4.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:49ff32b09f5aa0cd30a20c2b39db3e669c845589f2b7fc910365210887e39344", size = 5198355, upload-time = "2025-12-20T16:16:10.902Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/c8/4e0d436b66b826f2e53330adaa6311f5cac9871a5b5c31ad773b27f25a74/numpy-2.4.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:36cbfb13c152b1c7c184ddac43765db8ad672567e7bafff2cc755a09917ed2e6", size = 6545298, upload-time = "2025-12-20T16:16:12.607Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ef/27/e1f5d144ab54eac34875e79037011d511ac57b21b220063310cb96c80fbc/numpy-2.4.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:35ddc8f4914466e6fc954c76527aa91aa763682a4f6d73249ef20b418fe6effb", size = 14398387, upload-time = "2025-12-20T16:16:14.257Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/67/64/4cb909dd5ab09a9a5d086eff9586e69e827b88a5585517386879474f4cf7/numpy-2.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc578891de1db95b2a35001b695451767b580bb45753717498213c5ff3c41d63", size = 16363091, upload-time = "2025-12-20T16:16:17.32Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9d/9c/8efe24577523ec6809261859737cf117b0eb6fdb655abdfdc81b2e468ce4/numpy-2.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:98e81648e0b36e325ab67e46b5400a7a6d4a22b8a7c8e8bbfe20e7db7906bf95", size = 16176394, upload-time = "2025-12-20T16:16:19.524Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/61/f0/1687441ece7b47a62e45a1f82015352c240765c707928edd8aef875d5951/numpy-2.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d57b5046c120561ba8fa8e4030fbb8b822f3063910fa901ffadf16e2b7128ad6", size = 18287378, upload-time = "2025-12-20T16:16:22.866Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d3/6f/f868765d44e6fc466467ed810ba9d8d6db1add7d4a748abfa2a4c99a3194/numpy-2.4.0-cp312-cp312-win32.whl", hash = "sha256:92190db305a6f48734d3982f2c60fa30d6b5ee9bff10f2887b930d7b40119f4c", size = 5955432, upload-time = "2025-12-20T16:16:25.06Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d4/b5/94c1e79fcbab38d1ca15e13777477b2914dd2d559b410f96949d6637b085/numpy-2.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:680060061adb2d74ce352628cb798cfdec399068aa7f07ba9fb818b2b3305f98", size = 12306201, 
upload-time = "2025-12-20T16:16:26.979Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/70/09/c39dadf0b13bb0768cd29d6a3aaff1fb7c6905ac40e9aaeca26b1c086e06/numpy-2.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:39699233bc72dd482da1415dcb06076e32f60eddc796a796c5fb6c5efce94667", size = 10308234, upload-time = "2025-12-20T16:16:29.417Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4b/ef/088e7c7342f300aaf3ee5f2c821c4b9996a1bef2aaf6a49cc8ab4883758e/numpy-2.4.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:b54c83f1c0c0f1d748dca0af516062b8829d53d1f0c402be24b4257a9c48ada6", size = 16819003, upload-time = "2025-12-20T16:18:03.41Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ff/ce/a53017b5443b4b84517182d463fc7bcc2adb4faa8b20813f8e5f5aeb5faa/numpy-2.4.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:aabb081ca0ec5d39591fc33018cd4b3f96e1a2dd6756282029986d00a785fba4", size = 12567105, upload-time = "2025-12-20T16:18:05.594Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/77/58/5ff91b161f2ec650c88a626c3905d938c89aaadabd0431e6d9c1330c83e2/numpy-2.4.0-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:8eafe7c36c8430b7794edeab3087dec7bf31d634d92f2af9949434b9d1964cba", size = 5395590, upload-time = "2025-12-20T16:18:08.031Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1d/4e/f1a084106df8c2df8132fc437e56987308e0524836aa7733721c8429d4fe/numpy-2.4.0-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:2f585f52b2baf07ff3356158d9268ea095e221371f1074fadea2f42544d58b4d", size = 6709947, upload-time = "2025-12-20T16:18:09.836Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/63/09/3d8aeb809c0332c3f642da812ac2e3d74fc9252b3021f8c30c82e99e3f3d/numpy-2.4.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:32ed06d0fe9cae27d8fb5f400c63ccee72370599c75e683a6358dd3a4fb50aaf", size = 14535119, upload-time = "2025-12-20T16:18:12.105Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fd/7f/68f0fc43a2cbdc6bb239160c754d87c922f60fbaa0fa3cd3d312b8a7f5ee/numpy-2.4.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:57c540ed8fb1f05cb997c6761cd56db72395b0d6985e90571ff660452ade4f98", size = 16475815, upload-time = "2025-12-20T16:18:14.433Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/11/73/edeacba3167b1ca66d51b1a5a14697c2c40098b5ffa01811c67b1785a5ab/numpy-2.4.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a39fb973a726e63223287adc6dafe444ce75af952d711e400f3bf2b36ef55a7b", size = 12489376, upload-time = "2025-12-20T16:18:16.524Z" }, +] + [[package]] name = "openpyxl" version = "3.1.5" @@ -239,6 +391,42 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, ] +[[package]] +name = "pandas" +version = "2.3.3" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.0", source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "python-dateutil" }, + { name = "pytz" }, + { name = "tzdata" }, +] +sdist = { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3d/f7/f425a00df4fcc22b292c6895c6831c0c8ae1d9fac1e024d16f98a9ce8749/pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c", size = 11555763, upload-time = "2025-09-29T23:16:53.287Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/13/4f/66d99628ff8ce7857aca52fed8f0066ce209f96be2fede6cef9f84e8d04f/pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a", size = 10801217, upload-time = "2025-09-29T23:17:04.522Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1d/03/3fc4a529a7710f890a239cc496fc6d50ad4a0995657dccc1d64695adb9f4/pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1", size = 12148791, upload-time = "2025-09-29T23:17:18.444Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/40/a8/4dac1f8f8235e5d25b9955d02ff6f29396191d4e665d71122c3722ca83c5/pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838", size = 12769373, upload-time = "2025-09-29T23:17:35.846Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/df/91/82cc5169b6b25440a7fc0ef3a694582418d875c8e3ebf796a6d6470aa578/pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250", size = 13200444, upload-time = "2025-09-29T23:17:49.341Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/10/ae/89b3283800ab58f7af2952704078555fa60c807fff764395bb57ea0b0dbd/pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4", size = 13858459, upload-time = "2025-09-29T23:18:03.722Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/85/72/530900610650f54a35a19476eca5104f38555afccda1aa11a92ee14cb21d/pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826", size = 11346086, upload-time = "2025-09-29T23:18:18.505Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" }, + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e5/63/cd7d615331b328e287d8233ba9fdf191a9c2d11b6af0c7a59cfcec23de68/pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89", size = 12362693, upload-time = "2025-09-29T23:20:14.098Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a6/de/8b1895b107277d52f2b42d3a6806e69cfef0d5cf1d0ba343470b9d8e0a04/pandas-2.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98", size = 12771002, upload-time = "2025-09-29T23:20:26.76Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/87/21/84072af3187a677c5893b170ba2c8fbe450a6ff911234916da889b698220/pandas-2.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084", size = 13450971, upload-time = "2025-09-29T23:20:41.344Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b", size = 10992722, upload-time = "2025-09-29T23:20:54.139Z" }, +] + [[package]] name = "pillow" version = 
"12.0.0" @@ -323,6 +511,69 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, ] +[[package]] +name = "pytest-cov" +version = "7.0.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "coverage", extra = ["toml"] }, + { name = "pluggy" }, + { name = "pytest" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5e/f7/c933acc76f5208b3b00089573cf6a2bc26dc80a8aece8f52bb7d6b1855ca/pytest_cov-7.0.0.tar.gz", hash = "sha256:33c97eda2e049a0c5298e91f519302a1334c26ac65c1a483d6206fd458361af1", size = 54328, upload-time = "2025-09-09T10:57:02.113Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ee/49/1377b49de7d0c1ce41292161ea0f721913fa8722c19fb9c1e3aa0367eecb/pytest_cov-7.0.0-py3-none-any.whl", hash = "sha256:3b8e9558b16cc1479da72058bdecf8073661c7f57f7d3c5f22a1c23507f2d861", size = 22424, upload-time = "2025-09-09T10:57:00.695Z" }, +] + +[[package]] +name = "pytest-mock" +version = "3.15.1" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/68/14/eb014d26be205d38ad5ad20d9a80f7d201472e08167f0bb4361e251084a9/pytest_mock-3.15.1.tar.gz", hash = "sha256:1849a238f6f396da19762269de72cb1814ab44416fa73a8686deac10b0d87a0f", size = 34036, upload-time = "2025-09-16T16:37:27.081Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5a/cc/06253936f4a7fa2e0f48dfe6d851d9c56df896a9ab09ac019d70b760619c/pytest_mock-3.15.1-py3-none-any.whl", hash = "sha256:0a25e2eb88fe5168d535041d09a4529a188176ae608a6d249ee65abc0949630d", size = 10095, upload-time = "2025-09-16T16:37:25.734Z" }, +] + +[[package]] +name = "pytest-timeout" +version = "2.4.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ac/82/4c9ecabab13363e72d880f2fb504c5f750433b2b6f16e99f4ec21ada284c/pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a", size = 17973, upload-time = "2025-05-05T19:44:34.99Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fa/b6/3127540ecdf1464a00e5a01ee60a1b09175f6913f0644ac748494d9c4b21/pytest_timeout-2.4.0-py3-none-any.whl", hash = "sha256:c42667e5cdadb151aeb5b26d114aff6bdf5a907f176a007a30b940d3d865b5c2", size = 14382, upload-time = "2025-05-05T19:44:33.502Z" }, +] + +[[package]] +name = "pytest-xdist" +version = "3.8.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "execnet" }, + { name = "pytest" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/b4/439b179d1ff526791eb921115fca8e44e596a13efeda518b9d845a619450/pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1", size = 88069, upload-time = "2025-07-01T13:30:59.346Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ca/31/d4e37e9e550c2b92a9cbc2e4d0b7420a27224968580b5a447f420847c975/pytest_xdist-3.8.0-py3-none-any.whl", hash = "sha256:202ca578cfeb7370784a8c33d6d05bc6e13b4f25b5053c30a152269fd10f0b88", size = 46396, upload-time = "2025-07-01T13:30:56.632Z" }, +] + +[[package]] +name 
= "python-dateutil" +version = "2.9.0.post0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +dependencies = [ + { name = "six" }, +] +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, +] + [[package]] name = "python-docx" version = "1.2.0" @@ -351,6 +602,15 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" }, ] +[[package]] +name = "pytz" +version = "2025.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, +] + [[package]] name = "ragflow-sdk" version = "0.23.1" @@ -360,36 +620,87 @@ dependencies = [ { name = "requests" }, ] +[package.optional-dependencies] +test = [ + { name = "hypothesis" }, + { name = "openpyxl" }, + { name = "pandas" }, + { name = "pillow" }, + { name = "pytest" }, + { name = "pytest-cov" }, + { name = "pytest-mock" }, + { name = "pytest-timeout" }, + { name = "pytest-xdist" }, + { name = "python-docx" }, + { name = "python-pptx" }, + { name = "reportlab" }, + { name = "requests" }, + { name = "requests-toolbelt" }, + { name = "xlrd" }, + { name = "xlwt" }, +] + [package.dev-dependencies] test = [ { name = "hypothesis" }, { name = "openpyxl" }, + { name = "pandas" }, { name = "pillow" }, { name = "pytest" }, + { name = "pytest-cov" }, + { name = "pytest-mock" }, + { name = "pytest-timeout" }, + { name = "pytest-xdist" }, { name = "python-docx" }, { name = "python-pptx" }, { name = "reportlab" }, { name = "requests" }, { name = "requests-toolbelt" }, + { name = "xlrd" }, + { name = "xlwt" }, ] [package.metadata] requires-dist = [ { name = "beartype", specifier = ">=0.20.0,<1.0.0" }, + { name = "hypothesis", marker = "extra == 'test'", specifier = ">=6.131.9" }, + { name = "openpyxl", marker = "extra == 'test'", specifier = ">=3.1.5" }, + { name = "pandas", marker = "extra == 'test'", specifier = ">=2.0.0" }, + { name = "pillow", marker = "extra == 'test'", specifier = ">=11.1.0" }, + { name = "pytest", marker = "extra == 'test'", specifier = ">=8.3.5" }, + { name = "pytest-cov", marker = "extra == 'test'", specifier = ">=4.1.0" }, + { name = "pytest-mock", marker = "extra == 'test'", specifier = ">=3.12.0" }, + { name = 
"pytest-timeout", marker = "extra == 'test'", specifier = ">=2.2.0" }, + { name = "pytest-xdist", marker = "extra == 'test'", specifier = ">=3.5.0" }, + { name = "python-docx", marker = "extra == 'test'", specifier = ">=1.1.2" }, + { name = "python-pptx", marker = "extra == 'test'", specifier = ">=1.0.2" }, + { name = "reportlab", marker = "extra == 'test'", specifier = ">=4.3.1" }, { name = "requests", specifier = ">=2.30.0,<3.0.0" }, + { name = "requests", marker = "extra == 'test'", specifier = ">=2.32.3" }, + { name = "requests-toolbelt", marker = "extra == 'test'", specifier = ">=1.0.0" }, + { name = "xlrd", marker = "extra == 'test'", specifier = ">=2.0.1" }, + { name = "xlwt", marker = "extra == 'test'", specifier = ">=1.3.0" }, ] +provides-extras = ["test"] [package.metadata.requires-dev] test = [ { name = "hypothesis", specifier = ">=6.131.9" }, { name = "openpyxl", specifier = ">=3.1.5" }, + { name = "pandas", specifier = ">=2.0.0" }, { name = "pillow", specifier = ">=11.1.0" }, { name = "pytest", specifier = ">=8.3.5" }, + { name = "pytest-cov", specifier = ">=4.1.0" }, + { name = "pytest-mock", specifier = ">=3.12.0" }, + { name = "pytest-timeout", specifier = ">=2.2.0" }, + { name = "pytest-xdist", specifier = ">=3.5.0" }, { name = "python-docx", specifier = ">=1.1.2" }, { name = "python-pptx", specifier = ">=1.0.2" }, { name = "reportlab", specifier = ">=4.3.1" }, { name = "requests", specifier = ">=2.32.3" }, { name = "requests-toolbelt", specifier = ">=1.0.0" }, + { name = "xlrd", specifier = ">=2.0.1" }, + { name = "xlwt", specifier = ">=1.3.0" }, ] [[package]] @@ -432,6 +743,15 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481, upload-time = "2023-05-01T04:11:28.427Z" }, ] +[[package]] +name = "six" +version = "1.17.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, +] + [[package]] name = "sortedcontainers" version = "2.4.0" @@ -475,6 +795,15 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ] +[[package]] +name = "tzdata" +version = "2025.3" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } +wheels = [ + { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, +] + [[package]] name = "urllib3" version = "2.5.0" @@ -484,6 +813,15 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, ] +[[package]] +name = "xlrd" +version = "2.0.2" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/07/5a/377161c2d3538d1990d7af382c79f3b2372e880b65de21b01b1a2b78691e/xlrd-2.0.2.tar.gz", hash = "sha256:08b5e25de58f21ce71dc7db3b3b8106c1fa776f3024c54e45b45b374e89234c9", size = 100167, upload-time = "2025-06-14T08:46:39.039Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1a/62/c8d562e7766786ba6587d09c5a8ba9f718ed3fa8af7f4553e8f91c36f302/xlrd-2.0.2-py2.py3-none-any.whl", hash = "sha256:ea762c3d29f4cca48d82df517b6d89fbce4db3107f9d78713e48cd321d5c9aa9", size = 96555, upload-time = "2025-06-14T08:46:37.766Z" }, +] + [[package]] name = "xlsxwriter" version = "3.2.9" @@ -492,3 +830,12 @@ sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/46/2c/c06ef49dc36e79 wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3a/0c/3662f4a66880196a590b202f0db82d919dd2f89e99a27fadef91c4a33d41/xlsxwriter-3.2.9-py3-none-any.whl", hash = "sha256:9a5db42bc5dff014806c58a20b9eae7322a134abb6fce3c92c181bfb275ec5b3", size = 175315, upload-time = "2025-09-16T00:16:20.108Z" }, ] + +[[package]] +name = "xlwt" +version = "1.3.0" +source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/06/97/56a6f56ce44578a69343449aa5a0d98eefe04085d69da539f3034e2cd5c1/xlwt-1.3.0.tar.gz", hash = "sha256:c59912717a9b28f1a3c2a98fd60741014b06b043936dcecbc113eaaada156c88", size = 153929, upload-time = "2017-08-22T06:47:16.498Z" } +wheels = [ + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/44/48/def306413b25c3d01753603b1a222a011b8621aed27cd7f89cbc27e6b0f4/xlwt-1.3.0-py2.py3-none-any.whl", hash = "sha256:a082260524678ba48a297d922cc385f58278b8aa68741596a87de01a9c628b2e", size = 99981, upload-time = "2017-08-22T06:47:15.281Z" }, +] diff --git a/web/.env b/web/.env index a6cbd9ccd..4ee4bf577 100644 --- a/web/.env +++ b/web/.env @@ -1,2 +1,3 @@ -PORT=9222 -DID_YOU_KNOW=none \ No newline at end of file +PORT=9223 +DID_YOU_KNOW=none +SERVER_PORT_FOR_WEB=9380 \ No newline at end of file