diff --git a/.env.example b/.env.example index 8877275..111f43b 100644 --- a/.env.example +++ b/.env.example @@ -69,16 +69,21 @@ I2P_LLM_TEMPERATURE_IDEA_PACKAGING_JUDGE=0.0 # ----------------------------- # Embedding (optional overrides) # ----------------------------- -# If not set, Embedding uses: -# - EMBEDDING_API_URL=https://api.openai.com/v1/embeddings -# - EMBEDDING_MODEL=text-embedding-3-large (example) -# - EMBEDDING_API_KEY falls back to LLM_API_KEY -# Tip: For frequent switching, set I2P_INDEX_DIR_MODE=auto_profile to auto-select -# per-embedding index dirs (no manual profile scripts needed). You can still override -# I2P_NOVELTY_INDEX_DIR / I2P_RECALL_INDEX_DIR if you prefer. +# Provider: openai_compatible (default) | gemini +# - openai_compatible: OpenAI / compatible APIs (input as string or list) +# - gemini: Google Gemini native embedContent API +# OpenAI-compatible example: +EMBEDDING_PROVIDER=openai_compatible EMBEDDING_API_URL=https://api.openai.com/v1/embeddings EMBEDDING_MODEL=text-embedding-3-large +# Gemini example (uncomment to use): +# EMBEDDING_PROVIDER=gemini +# EMBEDDING_API_URL=https://generativelanguage.googleapis.com/v1beta +# EMBEDDING_MODEL=gemini-embedding-001 EMBEDDING_API_KEY=your_embedding_key_here +# Tip: EMBEDDING_API_KEY falls back to LLM_API_KEY if not set. +# Tip: For frequent switching, set I2P_INDEX_DIR_MODE=auto_profile to auto-select +# per-embedding index dirs. You can still override I2P_NOVELTY_INDEX_DIR / I2P_RECALL_INDEX_DIR. 
# Optional: auto profile index directories I2P_INDEX_DIR_MODE=auto_profile
diff --git a/Paper-KG-Pipeline/scripts/story_to_latex.py b/Paper-KG-Pipeline/scripts/story_to_latex.py
new file mode 100644
index 0000000..e897531
--- /dev/null
+++ b/Paper-KG-Pipeline/scripts/story_to_latex.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+"""Fill an arXiv-style LaTeX template from final_story.json and write a .tex file.
+
+Usage (run from the Idea2Paper repository root):
+    python Paper-KG-Pipeline/scripts/story_to_latex.py
+    python Paper-KG-Pipeline/scripts/story_to_latex.py "results/Lite Digital Twin"
+    python Paper-KG-Pipeline/scripts/story_to_latex.py Paper-KG-Pipeline/output
+
+    --no-download   do not fetch arxiv.sty; use the plain article preamble (works offline)
+    -o paper.tex    output file name
+
+What it does:
+    1. Reads final_story.json from the given results directory
+       (default: Paper-KG-Pipeline/output, resolved relative to this script).
+    2. Downloads arxiv.sty (kourgeorge/arxiv-style) unless --no-download is given.
+    3. Fills the built-in template with the story fields.
+    4. Writes the .tex file into the directory that contains final_story.json.
+"""
+
+import argparse
+import json
+import re
+import shlex
+import sys
+import urllib.request
+from pathlib import Path
+
+# arXiv style download locations (kourgeorge/arxiv-style).
+# Only arxiv.sty is fetched; the template URL is kept for reference.
+ARXIV_TEMPLATE_URL = "https://raw.githubusercontent.com/kourgeorge/arxiv-style/master/template.tex"
+ARXIV_STY_URL = "https://raw.githubusercontent.com/kourgeorge/arxiv-style/master/arxiv.sty"
+
+# Results directory used when none is given: Paper-KG-Pipeline/output.
+DEFAULT_RESULTS_DIR = Path(__file__).resolve().parents[1] / "output"
+
+# LaTeX special characters and their escaped forms.
+_LATEX_SPECIALS = {
+    "\\": r"\textbackslash{}",
+    "&": r"\&",
+    "%": r"\%",
+    "$": r"\$",
+    "#": r"\#",
+    "_": r"\_",
+    "{": r"\{",
+    "}": r"\}",
+    "~": r"\textasciitilde{}",
+    "^": r"\textasciicircum{}",
+}
+_LATEX_SPECIALS_RE = re.compile(r"[\\&%$#_{}~^]")
+
+# "Step N:" / "Step N." markers inside a method skeleton.
+_STEP_MARKER_RE = re.compile(r"\s*Step\s+\d+\s*[.:]\s*", flags=re.IGNORECASE)
+
+
+def _latex_escape(text: str) -> str:
+    r"""Return *text* with LaTeX special characters escaped.
+
+    All characters are substituted in a single pass, so the braces that are
+    part of a replacement such as \textbackslash{} are not escaped again.
+    Falsy input gives ""; non-string values are converted with str().
+    """
+    if not text:
+        return ""
+    return _LATEX_SPECIALS_RE.sub(lambda m: _LATEX_SPECIALS[m.group(0)], str(text))
+
+
+def _download_url(url: str) -> str:
+    """Fetch *url* and return the response body decoded as UTF-8.
+
+    Raises:
+        OSError: network or HTTP failure (URLError and HTTPError are subclasses).
+        UnicodeDecodeError: the body is not valid UTF-8.
+    """
+    req = urllib.request.Request(url, headers={"User-Agent": "Idea2Paper/1.0"})
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        return resp.read().decode("utf-8")
+
+
+def _parse_method_steps(text: str) -> list[str]:
+    """Split a method_skeleton string into a list of steps.
+
+    Text with "Step N:" / "Step N." markers is split on the markers; text
+    without any marker is split on semicolons. Empty pieces are dropped.
+    """
+    if not text or not text.strip():
+        return []
+    if _STEP_MARKER_RE.search(text):
+        pieces = _STEP_MARKER_RE.split(text)
+    else:
+        pieces = text.split(";")
+    return [p.strip() for p in pieces if p.strip()]
+
+
+def _build_arxiv_template(use_arxiv_sty: bool = True) -> str:
+    """Return the LaTeX template with __PLACEHOLDER__ markers.
+
+    With use_arxiv_sty the preamble loads arxiv.sty (plus natbib and doi);
+    otherwise it is a plain article preamble that compiles without it.
+    """
+    if use_arxiv_sty:
+        preamble = r"""\documentclass{article}
+
+\usepackage{arxiv}
+
+\usepackage[utf8]{inputenc}
+\usepackage[T1]{fontenc}
+\usepackage{hyperref}
+\usepackage{url}
+\usepackage{booktabs}
+\usepackage{amsfonts}
+\usepackage{graphicx}
+\usepackage{natbib}
+\usepackage{doi}
+
+\title{__TITLE__}
+
+\author{Anonymous Author(s)\thanks{Replace with your affiliation and email.}}
+
+\renewcommand{\shorttitle}{\textit{arXiv} Preprint}
+"""
+    else:
+        preamble = r"""\documentclass{article}
+
+\usepackage[utf8]{inputenc}
+\usepackage[T1]{fontenc}
+\usepackage{hyperref}
+\usepackage{url}
+\usepackage{booktabs}
+\usepackage{amsfonts}
+\usepackage{graphicx}
+
+\title{__TITLE__}
+
+\author{Anonymous Author(s)\thanks{Replace with your affiliation and email.}}
+"""
+    return preamble + r"""
+
+\hypersetup{
+pdftitle={__TITLE__},
+pdfauthor={Anonymous},
+pdfkeywords={Research, Paper},
+}
+
+\begin{document}
+\maketitle
+
+\begin{abstract}
+__ABSTRACT__
+\end{abstract}
+
+\section{Introduction}
+\label{sec:intro}
+
+__PROBLEM_FRAMING__
+
+__GAP_PATTERN__
+
+\section{Methodology}
+\label{sec:method}
+
+__SOLUTION__
+
+\subsection{Technical Steps}
+
+__METHOD_SKELETON__
+
+\section{Contributions}
+\label{sec:contributions}
+
+__INNOVATION_CLAIMS__
+
+\section{Experiments}
+\label{sec:experiments}
+
+__EXPERIMENTS_PLAN__
+
+\end{document}
+"""
+
+
+def _itemize(items: list) -> str:
+    r"""Render *items* as a LaTeX itemize list, or "" when there are none.
+
+    An itemize environment without any \item does not compile, so an empty
+    list must not produce one.
+    """
+    if not items:
+        return ""
+    body = "".join(f"  \\item {_latex_escape(str(item))}\n" for item in items)
+    return "\\begin{itemize}\n" + body + "\\end{itemize}"
+
+
+def _fill_template(template: str, story: dict) -> str:
+    """Return *template* with every placeholder replaced by its story field.
+
+    Scalar fields are LaTeX-escaped. method_skeleton becomes an itemize list
+    when it is a list or parses into steps; innovation_claims becomes an
+    itemize list when it is a list. Missing or null fields become "".
+    """
+    def field(key: str) -> str:
+        value = story.get(key)
+        return "" if value is None else _latex_escape(str(value))
+
+    method = story.get("method_skeleton")
+    if isinstance(method, list):
+        method_latex = _itemize(method)
+    else:
+        method_text = "" if method is None else str(method)
+        steps = _parse_method_steps(method_text)
+        method_latex = _itemize(steps) if steps else _latex_escape(method_text)
+
+    claims = story.get("innovation_claims")
+    if claims is None:
+        claims_latex = ""
+    elif isinstance(claims, list):
+        claims_latex = _itemize(claims)
+    else:
+        claims_latex = _latex_escape(str(claims))
+
+    replacements = {
+        "__TITLE__": field("title"),
+        "__ABSTRACT__": field("abstract"),
+        "__PROBLEM_FRAMING__": field("problem_framing"),
+        "__GAP_PATTERN__": field("gap_pattern"),
+        "__SOLUTION__": field("solution"),
+        "__METHOD_SKELETON__": method_latex,
+        "__INNOVATION_CLAIMS__": claims_latex,
+        "__EXPERIMENTS_PLAN__": field("experiments_plan"),
+    }
+    # Escaped values contain no bare "_", so an inserted value can never
+    # introduce a new placeholder for a later replace() to hit.
+    result = template
+    for placeholder, value in replacements.items():
+        result = result.replace(placeholder, value)
+    return result
+
+
+def _download_arxiv_template(output_dir: Path) -> str:
+    """Save arxiv.sty into *output_dir* and return the template that uses it.
+
+    If the download or the write fails, a warning is printed and the built-in
+    template that does not need arxiv.sty is returned instead.
+    """
+    try:
+        sty_content = _download_url(ARXIV_STY_URL)
+        (output_dir / "arxiv.sty").write_text(sty_content, encoding="utf-8")
+    except (OSError, UnicodeDecodeError) as e:
+        print(f"[warn] 无法下载 arXiv 模板: {e}", file=sys.stderr)
+        return _build_arxiv_template(use_arxiv_sty=False)
+
+    print(f"  ✓ 已保存 arxiv.sty 到 {output_dir}")
+    return _build_arxiv_template()
+
+
+def main() -> None:
+    """Parse the command line, read final_story.json and write the .tex file."""
+    parser = argparse.ArgumentParser(
+        description="将 final_story.json 填充到 arXiv LaTeX 模板并输出 .tex 文件"
+    )
+    parser.add_argument(
+        "results_dir",
+        nargs="?",
+        type=str,
+        default=str(DEFAULT_RESULTS_DIR),
+        help="包含 final_story.json 的 results 目录路径 (默认: Paper-KG-Pipeline/output)",
+    )
+    parser.add_argument(
+        "-o", "--output",
+        type=str,
+        default="paper.tex",
+        help="输出 .tex 文件名 (默认: paper.tex)",
+    )
+    parser.add_argument(
+        "--no-download",
+        action="store_true",
+        help="不下载 arXiv 模板,仅使用内置模板",
+    )
+    args = parser.parse_args()
+
+    results_path = Path(args.results_dir).resolve()
+    story_file = results_path / "final_story.json"
+
+    if not story_file.is_file():
+        print(f"错误: 未找到 {story_file}", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"读取: {story_file}")
+    try:
+        with story_file.open("r", encoding="utf-8") as f:
+            story = json.load(f)
+    except (OSError, ValueError) as e:  # JSONDecodeError/UnicodeDecodeError are ValueErrors
+        print(f"错误: 无法读取 {story_file}: {e}", file=sys.stderr)
+        sys.exit(1)
+    if not isinstance(story, dict):
+        print(f"错误: {story_file} 的顶层必须是 JSON 对象", file=sys.stderr)
+        sys.exit(1)
+
+    # Use the downloaded arxiv.sty or the built-in offline template.
+    if args.no_download:
+        template = _build_arxiv_template(use_arxiv_sty=False)
+        print("使用内置模板(无需 arxiv.sty)")
+    else:
+        print("下载 arXiv LaTeX 模板...")
+        template = _download_arxiv_template(results_path)
+
+    filled = _fill_template(template, story)
+    output_path = results_path / args.output
+    output_path.write_text(filled, encoding="utf-8")
+
+    print(f"✓ 已输出 LaTeX 文件: {output_path}")
+    # Quote both parts so the hint works for paths with spaces and custom -o names.
+    print(
+        f"  编译: cd {shlex.quote(str(output_path.parent))}"
+        f" && pdflatex {shlex.quote(output_path.name)}"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/Paper-KG-Pipeline/src/idea2paper/infra/embeddings.py b/Paper-KG-Pipeline/src/idea2paper/infra/embeddings.py index 1720b42..0966856 100644 --- a/Paper-KG-Pipeline/src/idea2paper/infra/embeddings.py +++ b/Paper-KG-Pipeline/src/idea2paper/infra/embeddings.py @@ -1,5 +1,5 @@ import time -from typing import Optional, List +from typing import Dict, List, Optional import requests @@ -7,91 +7,201 @@ EMBEDDING_API_KEY, EMBEDDING_API_URL, EMBEDDING_MODEL, + EMBEDDING_PROVIDER, ) from idea2paper.infra.run_context import get_logger -EMBEDDING_PROVIDER_FOR_LOG = "openai_compatible" +def _effective_provider() -> str: + return (EMBEDDING_PROVIDER or "openai_compatible").strip().lower() -def get_embedding(text: 
str, logger=None, timeout: int = 120) -> Optional[List[float]]: - """Get embedding for text using OpenAI-compatible embeddings API. +def _is_gemini() -> bool: + return _effective_provider() == "gemini" - Returns None on failure (no exception thrown). - """ - if logger is None: - logger = get_logger() - start_ts = time.time() - if not EMBEDDING_API_KEY: +# --------------------------------------------------------------------------- +# Gemini helpers +# --------------------------------------------------------------------------- + +def _gemini_embed_url() -> str: + """Return the embedContent URL for Gemini.""" + url = (EMBEDDING_API_URL or "").strip() + if ":embedContent" in url: + return url + base = url.rstrip("/") + if not base: + base = "https://generativelanguage.googleapis.com/v1beta" + return f"{base}/models/{EMBEDDING_MODEL}:embedContent" + + +def _gemini_batch_url() -> str: + """Return the batchEmbedContents URL for Gemini.""" + return _gemini_embed_url().replace(":embedContent", ":batchEmbedContents") + + +def _gemini_headers() -> Dict[str, str]: + return { + "x-goog-api-key": EMBEDDING_API_KEY, + "Content-Type": "application/json", + } + + +def _gemini_single(text: str, logger, timeout: int) -> Optional[List[float]]: + start_ts = time.time() + url = _gemini_embed_url() + payload = { + "model": f"models/{EMBEDDING_MODEL}", + "content": {"parts": [{"text": text}]}, + } + try: + resp = requests.post(url, headers=_gemini_headers(), json=payload, timeout=timeout) + resp.raise_for_status() + emb = resp.json()["embedding"]["values"] if logger: logger.log_embedding_call( - request={ - "provider": EMBEDDING_PROVIDER_FOR_LOG, - "url": EMBEDDING_API_URL, - "model": EMBEDDING_MODEL, - "input_preview": text, - "timeout": timeout, - "simulated": True - }, - response={ - "ok": False, - "latency_ms": int((time.time() - start_ts) * 1000), - "error": "EMBEDDING_API_KEY not configured" - } + request={"provider": "gemini", "url": url, "model": EMBEDDING_MODEL, + 
"input_preview": text, "timeout": timeout, "simulated": False}, + response={"ok": True, "latency_ms": int((time.time() - start_ts) * 1000)}, + ) + return emb + except Exception as e: + if logger: + logger.log_embedding_call( + request={"provider": "gemini", "url": url, "model": EMBEDDING_MODEL, + "input_preview": text, "timeout": timeout, "simulated": False}, + response={"ok": False, "latency_ms": int((time.time() - start_ts) * 1000), "error": str(e)}, ) return None - headers = { - "Authorization": f"Bearer {EMBEDDING_API_KEY}", - "Content-Type": "application/json" - } + +def _gemini_batch(texts: List[str], logger, timeout: int) -> Optional[List[List[float]]]: + start_ts = time.time() + url = _gemini_batch_url() + model_ref = f"models/{EMBEDDING_MODEL}" payload = { - "model": EMBEDDING_MODEL, - "input": text + "requests": [ + {"model": model_ref, "content": {"parts": [{"text": t}]}} + for t in texts + ] } + try: + resp = requests.post(url, headers=_gemini_headers(), json=payload, timeout=timeout) + resp.raise_for_status() + data = resp.json() + embs = [item["values"] for item in data.get("embeddings", [])] + if len(embs) != len(texts): + raise ValueError(f"embedding batch size mismatch: got {len(embs)} expected {len(texts)}") + if logger: + logger.log_embedding_call( + request={"provider": "gemini", "url": url, "model": EMBEDDING_MODEL, + "input_preview": _preview_texts(texts), "timeout": timeout, + "simulated": False, "batch_size": len(texts)}, + response={"ok": True, "latency_ms": int((time.time() - start_ts) * 1000)}, + ) + return embs + except Exception as e: + if logger: + logger.log_embedding_call( + request={"provider": "gemini", "url": url, "model": EMBEDDING_MODEL, + "input_preview": _preview_texts(texts), "timeout": timeout, + "simulated": False, "batch_size": len(texts)}, + response={"ok": False, "latency_ms": int((time.time() - start_ts) * 1000), "error": str(e)}, + ) + return None + +# 
--------------------------------------------------------------------------- +# OpenAI-compatible helpers +# --------------------------------------------------------------------------- + +def _openai_single(text: str, logger, timeout: int) -> Optional[List[float]]: + start_ts = time.time() + provider_tag = "openai_compatible" + headers = {"Authorization": f"Bearer {EMBEDDING_API_KEY}", "Content-Type": "application/json"} + payload = {"model": EMBEDDING_MODEL, "input": text} try: resp = requests.post(EMBEDDING_API_URL, headers=headers, json=payload, timeout=timeout) resp.raise_for_status() - data = resp.json() - emb = data["data"][0]["embedding"] + emb = resp.json()["data"][0]["embedding"] if logger: logger.log_embedding_call( - request={ - "provider": EMBEDDING_PROVIDER_FOR_LOG, - "url": EMBEDDING_API_URL, - "model": EMBEDDING_MODEL, - "input_preview": text, - "timeout": timeout, - "simulated": False - }, - response={ - "ok": True, - "latency_ms": int((time.time() - start_ts) * 1000) - } + request={"provider": provider_tag, "url": EMBEDDING_API_URL, "model": EMBEDDING_MODEL, + "input_preview": text, "timeout": timeout, "simulated": False}, + response={"ok": True, "latency_ms": int((time.time() - start_ts) * 1000)}, ) return emb except Exception as e: if logger: logger.log_embedding_call( - request={ - "provider": EMBEDDING_PROVIDER_FOR_LOG, - "url": EMBEDDING_API_URL, - "model": EMBEDDING_MODEL, - "input_preview": text, - "timeout": timeout, - "simulated": False - }, - response={ - "ok": False, - "latency_ms": int((time.time() - start_ts) * 1000), - "error": str(e) - } + request={"provider": provider_tag, "url": EMBEDDING_API_URL, "model": EMBEDDING_MODEL, + "input_preview": text, "timeout": timeout, "simulated": False}, + response={"ok": False, "latency_ms": int((time.time() - start_ts) * 1000), "error": str(e)}, + ) + return None + + +def _openai_batch(texts: List[str], logger, timeout: int) -> Optional[List[List[float]]]: + start_ts = time.time() + provider_tag = 
"openai_compatible" + headers = {"Authorization": f"Bearer {EMBEDDING_API_KEY}", "Content-Type": "application/json"} + payload = {"model": EMBEDDING_MODEL, "input": texts} + try: + resp = requests.post(EMBEDDING_API_URL, headers=headers, json=payload, timeout=timeout) + resp.raise_for_status() + data = resp.json() + embs = [item["embedding"] for item in data.get("data", [])] + if len(embs) != len(texts): + raise ValueError(f"embedding batch size mismatch: got {len(embs)} expected {len(texts)}") + if logger: + logger.log_embedding_call( + request={"provider": provider_tag, "url": EMBEDDING_API_URL, "model": EMBEDDING_MODEL, + "input_preview": _preview_texts(texts), "timeout": timeout, + "simulated": False, "batch_size": len(texts)}, + response={"ok": True, "latency_ms": int((time.time() - start_ts) * 1000)}, + ) + return embs + except Exception as e: + if logger: + logger.log_embedding_call( + request={"provider": provider_tag, "url": EMBEDDING_API_URL, "model": EMBEDDING_MODEL, + "input_preview": _preview_texts(texts), "timeout": timeout, + "simulated": False, "batch_size": len(texts)}, + response={"ok": False, "latency_ms": int((time.time() - start_ts) * 1000), "error": str(e)}, ) return None +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def get_embedding(text: str, logger=None, timeout: int = 120) -> Optional[List[float]]: + """Get embedding for text. Dispatches to Gemini or OpenAI-compatible based on EMBEDDING_PROVIDER. + + Returns None on failure (no exception thrown). 
+ """ + if logger is None: + logger = get_logger() + + if not EMBEDDING_API_KEY: + start_ts = time.time() + prov = _effective_provider() + url = _gemini_embed_url() if _is_gemini() else EMBEDDING_API_URL + if logger: + logger.log_embedding_call( + request={"provider": prov, "url": url, "model": EMBEDDING_MODEL, + "input_preview": text, "timeout": timeout, "simulated": True}, + response={"ok": False, "latency_ms": int((time.time() - start_ts) * 1000), + "error": "EMBEDDING_API_KEY not configured"}, + ) + return None + + if _is_gemini(): + return _gemini_single(text, logger, timeout) + return _openai_single(text, logger, timeout) + + def _preview_texts(texts: List[str], max_chars: int = 200) -> List[str]: previews = [] for t in texts: @@ -110,77 +220,21 @@ def get_embeddings_batch(texts: List[str], logger=None, timeout: int = 120) -> O """Get embeddings for a batch of texts. Returns None on failure.""" if logger is None: logger = get_logger() - start_ts = time.time() if not EMBEDDING_API_KEY: + start_ts = time.time() + prov = _effective_provider() + url = _gemini_batch_url() if _is_gemini() else EMBEDDING_API_URL if logger: logger.log_embedding_call( - request={ - "provider": EMBEDDING_PROVIDER_FOR_LOG, - "url": EMBEDDING_API_URL, - "model": EMBEDDING_MODEL, - "input_preview": _preview_texts(texts), - "timeout": timeout, - "simulated": True, - "batch_size": len(texts), - }, - response={ - "ok": False, - "latency_ms": int((time.time() - start_ts) * 1000), - "error": "EMBEDDING_API_KEY not configured" - } + request={"provider": prov, "url": url, "model": EMBEDDING_MODEL, + "input_preview": _preview_texts(texts), "timeout": timeout, + "simulated": True, "batch_size": len(texts)}, + response={"ok": False, "latency_ms": int((time.time() - start_ts) * 1000), + "error": "EMBEDDING_API_KEY not configured"}, ) return None - headers = { - "Authorization": f"Bearer {EMBEDDING_API_KEY}", - "Content-Type": "application/json" - } - payload = { - "model": EMBEDDING_MODEL, - 
"input": texts - } - - try: - resp = requests.post(EMBEDDING_API_URL, headers=headers, json=payload, timeout=timeout) - resp.raise_for_status() - data = resp.json() - embs = [item["embedding"] for item in data.get("data", [])] - if len(embs) != len(texts): - raise ValueError(f"embedding batch size mismatch: got {len(embs)} expected {len(texts)}") - if logger: - logger.log_embedding_call( - request={ - "provider": EMBEDDING_PROVIDER_FOR_LOG, - "url": EMBEDDING_API_URL, - "model": EMBEDDING_MODEL, - "input_preview": _preview_texts(texts), - "timeout": timeout, - "simulated": False, - "batch_size": len(texts), - }, - response={ - "ok": True, - "latency_ms": int((time.time() - start_ts) * 1000) - } - ) - return embs - except Exception as e: - if logger: - logger.log_embedding_call( - request={ - "provider": EMBEDDING_PROVIDER_FOR_LOG, - "url": EMBEDDING_API_URL, - "model": EMBEDDING_MODEL, - "input_preview": _preview_texts(texts), - "timeout": timeout, - "simulated": False, - "batch_size": len(texts), - }, - response={ - "ok": False, - "latency_ms": int((time.time() - start_ts) * 1000), - "error": str(e) - } - ) - return None + if _is_gemini(): + return _gemini_batch(texts, logger, timeout) + return _openai_batch(texts, logger, timeout) diff --git a/Paper-KG-Pipeline/src/idea2paper/infra/startup_preflight.py b/Paper-KG-Pipeline/src/idea2paper/infra/startup_preflight.py index 4c9e4b5..34ec74d 100644 --- a/Paper-KG-Pipeline/src/idea2paper/infra/startup_preflight.py +++ b/Paper-KG-Pipeline/src/idea2paper/infra/startup_preflight.py @@ -1,6 +1,7 @@ import os import time from dataclasses import dataclass +from pathlib import Path from typing import Any, Dict, Optional, Tuple import numpy as np @@ -9,6 +10,7 @@ from idea2paper.config import ( EMBEDDING_API_URL, EMBEDDING_MODEL, + EMBEDDING_PROVIDER, LLM_API_URL, LLM_BASE_URL, LLM_MODEL, @@ -50,7 +52,7 @@ def _int_env(key: str, default: int) -> int: return default -def _sleep_backoff(attempt: int, base: float = 1.0, cap: float 
= 8.0): +def _sleep_backoff(attempt: int, base: float = 1.0, cap: float = 8.0) -> None: time.sleep(min(cap, base * (2 ** attempt))) @@ -144,9 +146,14 @@ def _llm_ping_once(timeout: int) -> Tuple[bool, str]: return False, str(e) +def _is_gemini_embedding() -> bool: + return (EMBEDDING_PROVIDER or "").strip().lower() == "gemini" + + def _embedding_ping_once(timeout: int) -> Tuple[bool, Optional[int], str]: """ Real embedding ping (fail-fast) and infer embedding_dim. + Supports both OpenAI-compatible and Gemini native APIs. """ api_key = os.getenv("EMBEDDING_API_KEY", "") if not api_key: @@ -155,6 +162,38 @@ def _embedding_ping_once(timeout: int) -> Tuple[bool, Optional[int], str]: if not EMBEDDING_API_URL: return False, None, "EMBEDDING_API_URL not configured" + if _is_gemini_embedding(): + return _embedding_ping_gemini(api_key, timeout) + return _embedding_ping_openai(api_key, timeout) + + +def _embedding_ping_gemini(api_key: str, timeout: int) -> Tuple[bool, Optional[int], str]: + url = (EMBEDDING_API_URL or "").strip() + if ":embedContent" not in url: + base = url.rstrip("/") or "https://generativelanguage.googleapis.com/v1beta" + url = f"{base}/models/{EMBEDDING_MODEL}:embedContent" + + headers = {"x-goog-api-key": api_key, "Content-Type": "application/json"} + payload = { + "model": f"models/{EMBEDDING_MODEL}", + "content": {"parts": [{"text": "ping"}]}, + } + try: + resp = requests.post(url, headers=headers, json=payload, timeout=timeout) + resp.raise_for_status() + data = resp.json() + emb = data.get("embedding", {}).get("values") + if not isinstance(emb, list): + return False, None, "invalid Gemini embedding response: 'embedding.values' is not a list" + dim = len(emb) + if dim <= 0: + return False, None, "embedding_dim is 0" + return True, dim, "" + except Exception as e: + return False, None, str(e) + + +def _embedding_ping_openai(api_key: str, timeout: int) -> Tuple[bool, Optional[int], str]: headers = {"Authorization": f"Bearer {api_key}", "Content-Type": 
"application/json"} payload = {"model": EMBEDDING_MODEL, "input": "ping"} try: @@ -172,7 +211,7 @@ def _embedding_ping_once(timeout: int) -> Tuple[bool, Optional[int], str]: return False, None, str(e) -def _read_npy_dim(path) -> Optional[int]: +def _read_npy_dim(path: Path) -> Optional[int]: try: arr = np.load(path, mmap_mode="r") if getattr(arr, "ndim", None) != 2: @@ -182,7 +221,7 @@ def _read_npy_dim(path) -> Optional[int]: return None -def _check_index_dims(online_dim: int): +def _check_index_dims(online_dim: int) -> None: """ Compare online embedding dim with existing local index .npy dims. If index does not exist yet (first run), skip that check. diff --git a/README-zh_CN.md b/README-zh_CN.md index eab33d2..74022c0 100644 --- a/README-zh_CN.md +++ b/README-zh_CN.md @@ -98,6 +98,7 @@ python Paper-KG-Pipeline/scripts/idea2story_pipeline.py "your idea" - 📄 `Paper-KG-Pipeline/output/final_story.json`:最终 Story(结构化字段:标题/摘要/问题/方法/贡献/实验等) - 🔍 `Paper-KG-Pipeline/output/pipeline_result.json`:完整链路结果(包含评审、修正、查重、审计信息) +- 📄 `paper.tex`:LaTeX 论文(通过 `story_to_latex.py` 生成,见 [Story 转 LaTeX](#story-转-latex)) - 📂 `Paper-KG-Pipeline/log/run_.../`:每次运行的结构化运行日志 ## 🚀 快速开始 @@ -140,13 +141,30 @@ paper-KG-Pipeline/ > **建议(温度配置):** 支持通过 `I2P_LLM_TEMPERATURE_*` 或 `llm.temperature.*` 配置各阶段温度,默认保持不变;critic 建议低温更稳,story 生成可中温。 > **建议(Idea Packaging):** 可选的质量增强(默认关闭),开启后会进行 pattern 引导的 idea 包装与二次召回:`I2P_IDEA_PACKAGING_ENABLE=1` 或 `idea.packaging_enable=true`。 > **建议(Subdomain Taxonomy):** 可选质量增强,用于减少 Path2 子领域重复与长尾影响。开启后会自动检测并在 `I2P_INDEX_ALLOW_BUILD=1` 时自动构建 `recall_index_dir/subdomain_taxonomy.json`(推荐:`I2P_SUBDOMAIN_TAXONOMY_PATH` 留空)。首次构建会分 batch 调 embedding;也可手动运行 `Paper-KG-Pipeline/scripts/tools/build_subdomain_taxonomy.py`。 -> **当前可直接适配(无需改代码):** 兼容 OpenAI Embeddings API 的 `/v1/embeddings`(要求 `input` 支持字符串或数组)。 -> **暂不直接支持:** DashScope/百炼原生 embeddings 接口(`/api/v1/services/embeddings/...`),需要额外适配层。 +> **当前可直接适配(无需改代码):** 兼容 OpenAI Embeddings API 的 `/v1/embeddings`(要求 `input` 
支持字符串或数组)。Gemini 原生 embeddings 可通过 `EMBEDDING_PROVIDER=gemini` 使用。 +> **暂不直接支持:** DashScope/百炼原生 embeddings 接口(`/api/v1/services/embeddings/...`),需要额外适配层。 +> **建议(启动预检):** 每次运行前会检查 LLM/embedding 连通性。跳过预检:`I2P_PREFLIGHT_ENABLE=0`。 ### **4. 运行**: ```bash python Paper-KG-Pipeline/scripts/idea2story_pipeline.py "你的研究Idea描述" ``` + +### Story 转 LaTeX + +将 `final_story.json` 转为 arXiv 风格 LaTeX 论文: + +```bash +python Paper-KG-Pipeline/scripts/story_to_latex.py +``` + +示例: + +```bash +python Paper-KG-Pipeline/scripts/story_to_latex.py "results/xxx" -o paper.tex +``` + +选项:`-o paper.tex`(指定输出文件名),`--no-download`(使用内置模板,不下载 arxiv.sty,离线可用)。输出写入 results 目录,可以使用 `pdflatex paper.tex` 直接编译。 ## 🌐 前端(本地 Web UI) diff --git a/README.md b/README.md index f6b0734..ef6fe36 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,7 @@ reliable autonomous scientific discovery. - 📄 `Paper-KG-Pipeline/output/final_story.json`: Final structured Story (title/abstract/problem/method/contribs/experiments). - 🔍 `Paper-KG-Pipeline/output/pipeline_result.json`: Full pipeline trace (reviews, corrections, audits). +- 📄 `paper.tex`: LaTeX paper (via `story_to_latex.py`, see [Story to LaTeX](#story-to-latex)). - 📂 `log/run_.../`: Structured logs for every run. ## 🚀 Getting Started @@ -105,8 +106,9 @@ pip install -r Paper-KG-Pipeline/requirements.txt > **Tip (LLM temperature):** per-stage temperatures are configurable via `I2P_LLM_TEMPERATURE_*` or `llm.temperature.*`; defaults preserve current behavior. Critic is usually low temp for stability, while story generation can be moderate. > **Tip (Idea Packaging):** optional quality boost via pattern-guided idea packaging + double recall (default off). Enable with `I2P_IDEA_PACKAGING_ENABLE=1` or `idea.packaging_enable=true`. > **Tip (Subdomain taxonomy):** optional quality boost for Path2 to reduce duplicated/long-tail subdomains. 
When enabled, the pipeline auto-detects and (if `I2P_INDEX_ALLOW_BUILD=1`) auto-builds `subdomain_taxonomy.json` under `recall_index_dir` (recommended: leave `I2P_SUBDOMAIN_TAXONOMY_PATH` empty). First build uses batched embeddings; you can also build manually via `Paper-KG-Pipeline/scripts/tools/build_subdomain_taxonomy.py`. -> **Supported (no code changes):** OpenAI-compatible Embeddings APIs (`/v1/embeddings`) that accept `input` as a string or a list. -> **Not supported yet:** DashScope “native” embeddings endpoint (`/api/v1/services/embeddings/...`) requires an adapter. +> **Supported (no code changes):** OpenAI-compatible Embeddings APIs (`/v1/embeddings`) that accept `input` as a string or a list. Gemini native embeddings via `EMBEDDING_PROVIDER=gemini`. +> **Not supported yet:** DashScope “native” embeddings endpoint (`/api/v1/services/embeddings/...`) requires an adapter. +> **Tip (startup preflight):** Pipeline runs LLM/embedding connectivity check before each run. To skip: `I2P_PREFLIGHT_ENABLE=0`. ### Dataset @@ -135,6 +137,22 @@ and make sure the embedding model matches the index you downloaded, otherwise er python Paper-KG-Pipeline/scripts/idea2story_pipeline.py "your research idea" ``` +### Story to LaTeX + +Convert `final_story.json` to an arXiv-style LaTeX paper: + +```bash +python Paper-KG-Pipeline/scripts/story_to_latex.py +``` + +Examples: + +```bash +python Paper-KG-Pipeline/scripts/story_to_latex.py "results/xxx" -o paper.tex +``` + +Options: `-o paper.tex` (output filename), `--no-download` (use built-in template without arxiv.sty, for offline use). Output is written to the results directory; can be compiled with `pdflatex paper.tex`. + ## 🌐 Frontend (Local Web UI) > **Status:** The frontend is currently unstable. We recommend running the pipeline from the terminal for now. We will improve the frontend in future updates.