diff --git a/.claude/skills/create-issue-himarket/SKILL.md b/.claude/skills/create-issue-himarket/SKILL.md new file mode 100644 index 000000000..244461d22 --- /dev/null +++ b/.claude/skills/create-issue-himarket/SKILL.md @@ -0,0 +1,163 @@ +--- +name: create-issue-himarket +description: "通过自然语言在 HiMarket 社区创建 Issue。支持 Feature Request(功能请求)和 Bug Report(问题报告)两种类型。当用户想要向 HiMarket 提交功能建议或报告问题时使用此 skill。" +--- + +# 创建 HiMarket Issue + +## 概述 + +帮助用户通过自然语言描述,在 [higress-group/himarket](https://github.com/higress-group/himarket) 仓库创建规范的 Issue。 + +## Issue 类型 + +HiMarket 支持两种 Issue 类型: + +### 1. Feature Request(功能请求) + +用于提交新功能建议或改进想法。 + +**必填信息:** +- **Why do you need it?** - 为什么需要这个功能?描述你遇到的问题或痛点 +- **How could it be?** - 期望的功能是什么样的?描述输入和输出 +- **Other related information** - 其他相关信息、截图或上下文(可选) + +### 2. Bug Report(问题报告) + +用于报告 Bug 或异常行为。 + +**必填信息:** +- **Issue Description** - 问题简述 +- **What happened** - 发生了什么?包括异常堆栈信息 +- **Expected behavior** - 期望的行为是什么 +- **Reproduction steps** - 最小化的复现步骤 +- **Environment** - 环境信息(可选) + +**可选信息:** +- **Anything else** - 其他补充信息 +- **Root Cause** - 根因分析(如已定位) +- **Proposed Solution** - 建议的解决方案 + +## 工作流程 + +### 步骤 1:确定 Issue 类型 + +首先询问用户要创建的 Issue 类型: +- 功能请求(Feature Request) +- 问题报告(Bug Report) + +### 步骤 2:收集必要信息 + +根据 Issue 类型,通过对话逐步收集必要信息: + +**对于功能请求:** +1. 询问为什么需要这个功能(遇到了什么问题) +2. 询问期望的功能实现方式 +3. 询问是否有其他补充信息 + +**对于问题报告:** +1. 询问问题的简要描述 +2. 询问具体发生了什么(包括错误信息) +3. 询问期望的正确行为 +4. 询问复现步骤 +5. 询问是否已定位根因或有解决方案建议 + +### 步骤 3:生成 Issue 内容 + +根据收集的信息,按模板格式生成 Issue 内容,展示给用户确认。 + +### 步骤 4:创建 Issue + +确认后,使用 GitHub CLI 创建 Issue: + +```bash +# Feature Request +gh issue create --repo higress-group/himarket \ + --title "[Feature] <标题>" \ + --body "$(cat <<'EOF' +## Why do you need it? +<用户描述的问题/痛点> + +## How could it be? 
+<期望的功能实现> + +## Other related information +<其他补充信息> + +--- +**Type**: Feature Request +EOF +)" + +# Bug Report +gh issue create --repo higress-group/himarket \ + --title "[Bug] <标题>" \ + --body "$(cat <<'EOF' +## Issue Description +<问题简述> + +## What happened +<发生了什么> + +``` +<异常堆栈或错误信息> +``` + +## Expected behavior +<期望行为> + +## Reproduction steps +1. <步骤1> +2. <步骤2> +3. <步骤3> + +## Root Cause (if known) +<根因分析> + +## Proposed Solution (if any) +<建议的解决方案> + +## Files Changed (if applicable) +- `path/to/file1` +- `path/to/file2` + +--- +**Type**: Bug Report +**Component**: +EOF +)" +``` + +## 关键原则 + +- **一次一个问题** - 逐步收集信息,不要一次性询问太多 +- **提供示例** - 在询问时给出示例帮助用户理解 +- **确认重复** - 提醒用户先搜索是否有类似 Issue +- **语言选择** - 支持用户用中文或英文描述 +- **预览确认** - 创建前展示完整内容让用户确认 +- **返回链接** - 创建成功后返回 Issue URL +- **标题前缀** - Bug 使用 `[Bug]`,功能请求使用 `[Feature]`,也可按组件添加如 `[Frontend]` + +## 示例对话 + +**用户:** 我想给 HiMarket 提一个功能建议 + +**助手:** 好的,我来帮你创建一个 Feature Request。请问你为什么需要这个功能?遇到了什么问题或痛点? + +**用户:** 聊天页面停止按钮点击没反应 + +**助手:** 明白了,这是一个 Bug Report。请问具体是什么情况?在什么操作下发现的? 
+ +**用户:** 模型推理过程中点击停止按钮无法取消 + +**助手:** 好的,我整理了以下 Issue 内容,请确认: + +> **Title:** [Frontend] HiChat 停止按钮无法取消模型推理 +> +> ## Issue Description +> HiChat 聊天页面在模型推理过程中,停止按钮点击无响应。 +> +> ## Expected behavior +> 点击停止按钮后应中断推理过程。 + +确认无误后我将创建这个 Issue。 diff --git a/.claude/skills/create-pr-himarket/SKILL.md b/.claude/skills/create-pr-himarket/SKILL.md new file mode 100644 index 000000000..f57d682f4 --- /dev/null +++ b/.claude/skills/create-pr-himarket/SKILL.md @@ -0,0 +1,278 @@ +--- +name: create-pr-himarket +description: "为 HiMarket 项目创建符合规范的 Pull Request。当用户需要提交代码、推送分支或创建 PR 时使用此 skill,确保 PR 标题和内容符合项目 CI 检查要求。" +--- + +# 创建 HiMarket Pull Request + +## 概述 + +帮助用户在 [higress-group/himarket](https://github.com/higress-group/himarket) 仓库创建符合规范的 Pull Request,确保通过 PR Title Check、PR Content Check 和 PR Size Check。 + +## PR 标题格式 + +### 必需格式 + +``` +type: 简短描述 +``` + +或带范围: + +``` +type(scope): 简短描述 +``` + +### 允许的 Type + +| Type | 说明 | 示例 | +|------|------|------| +| `feat` | 新功能 | `feat: add user authentication` | +| `fix` | Bug 修复 | `fix: resolve memory leak` | +| `docs` | 文档更新 | `docs: update API documentation` | +| `style` | 代码格式 | `style: format with prettier` | +| `refactor` | 重构 | `refactor: simplify service logic` | +| `perf` | 性能优化 | `perf: optimize queries` | +| `test` | 测试 | `test: add unit tests` | +| `build` | 构建系统 | `build: update dependencies` | +| `ci` | CI/CD | `ci: add workflow` | +| `chore` | 其他变更 | `chore: update gitignore` | +| `revert` | 回滚 | `revert: revert commit abc123` | + +### 标题规则 + +1. 必须包含 type 前缀 +2. type 后需要冒号和空格:`feat: ` 而不是 `feat:` +3. 描述必须以**小写字母**开头 +4. 保持简短清晰(建议 < 50 字符) + +## PR 内容格式(必填) + +### 必填部分 + +#### 1. Description(必填) + +必须包含 `## 📝 Description` 部分,且内容至少 10 个字符。 + +```markdown +## 📝 Description + +- 变更点 1 +- 变更点 2 +- 变更点 3 +``` + +#### 2. 
Type of Change(必填) + +必须至少勾选一项变更类型。 + +```markdown +## ✅ Type of Change + +- [x] Bug fix (non-breaking change) +- [ ] New feature (non-breaking change) +- [ ] Breaking change +- [ ] Documentation update +- [ ] Code refactoring +- [ ] Performance improvement +``` + +### 可选部分 + +#### 3. Related Issues(推荐) + +```markdown +## 🔗 Related Issues + +Fix #123 +Close #456 +``` + +#### 4. Testing(推荐) + +```markdown +## 🧪 Testing + +- [x] Unit tests added/updated +- [x] Manual testing completed +- 测试场景描述 +``` + +#### 5. Checklist(推荐) + +```markdown +## 📋 Checklist + +- [x] Code has been formatted (`mvn spotless:apply` for backend, `npm run lint:fix` for frontend) +- [x] Code is self-reviewed +- [x] No breaking changes +- [x] All CI checks pass +``` + +## 工作流程 + +### 步骤 1:检查当前状态 + +```bash +# 并行执行以下命令 +git status +git diff --stat +git log --oneline -5 +git branch -a +git remote -v +``` + +### 步骤 2:确认分支和远程 + +- 确认当前分支名称 +- 确认 origin 指向用户的 fork(如 `lexburner/himarket`) +- 确认 upstream 指向主仓库(`higress-group/himarket`) + +### 步骤 3:推送分支 + +```bash +git push -u origin +``` + +### 步骤 4:创建 PR + +使用 HEREDOC 格式确保内容正确: + +```bash +gh pr create --repo higress-group/himarket \ + --base main \ + --head : \ + --title "feat: 简短描述" \ + --body "$(cat <<'EOF' +## 📝 Description + +- 变更点 1 +- 变更点 2 + +## 🔗 Related Issues + +Fix #123 + +## ✅ Type of Change + +- [ ] Bug fix (non-breaking change) +- [x] New feature (non-breaking change) +- [ ] Breaking change +- [ ] Documentation update +- [ ] Code refactoring +- [ ] Performance improvement + +## 🧪 Testing + +- [x] Unit tests pass locally +- [x] Manual testing completed + +## 📋 Checklist + +- [x] Code has been formatted (`mvn spotless:apply` for backend, `npm run lint:fix` for frontend) +- [x] Code is self-reviewed +- [x] No breaking changes +- [x] All CI checks pass +EOF +)" +``` + +### 步骤 5:验证检查状态 + +```bash +gh pr checks --repo higress-group/himarket +``` + +确保以下检查通过: +- PR Title Check +- PR Content Check +- PR Size Check +- PR Validation Summary + 
+## 关键原则 + +- **标题小写** - 描述部分必须以小写字母开头 +- **内容完整** - 必须包含 `## 📝 Description` 和 `## ✅ Type of Change` +- **勾选类型** - Type of Change 必须至少勾选一项 `[x]` +- **关联 Issue** - 推荐使用 `Fix #123` 格式关联 Issue +- **格式化代码** - 提交前运行 `mvn spotless:apply` 或 `npm run lint:fix` +- **不提交图片** - 避免将截图等二进制文件提交到仓库 + +## 常见错误 + +### 错误 1:标题首字母大写 + +``` +❌ feat: Add new feature +✅ feat: add new feature +``` + +### 错误 2:缺少 Description 标题 + +```markdown +❌ 直接写内容 +✅ ## 📝 Description + 内容 +``` + +### 错误 3:未勾选 Type of Change + +```markdown +❌ - [ ] New feature +✅ - [x] New feature +``` + +### 错误 4:Description 内容太短 + +```markdown +❌ ## 📝 Description + Fix bug + +✅ ## 📝 Description + Fix pagination bug in product list +``` + +## 完整示例 + +**标题:** +``` +feat(chat): add tool call support and stop generation feature +``` + +**内容:** +```markdown +## 📝 Description + +- 添加聊天工具调用(Tool Call)支持,工具执行状态按消息顺序内联展示 +- 添加停止生成过程功能,支持中断正在进行的 AI 回复 +- 优化模型推理时滚动条自由滑动体验 + +## 🔗 Related Issues + +Fix #163 +Fix #164 +Fix #165 + +## ✅ Type of Change + +- [x] Bug fix (non-breaking change) +- [x] New feature (non-breaking change) +- [ ] Breaking change +- [ ] Documentation update +- [ ] Code refactoring +- [ ] Performance improvement + +## 🧪 Testing + +- [x] Unit tests pass locally +- [x] Manual testing completed +- 测试停止按钮能否正常中断 SSE 流式请求 +- 测试模型推理时滚动条是否可自由滑动 + +## 📋 Checklist + +- [x] Code has been formatted (`mvn spotless:apply` for backend, `npm run lint:fix` for frontend) +- [x] Code is self-reviewed +- [x] No breaking changes +- [x] All CI checks pass +``` diff --git a/.claude/skills/java-coding-standards/SKILL.md b/.claude/skills/java-coding-standards/SKILL.md new file mode 100644 index 000000000..af9902559 --- /dev/null +++ b/.claude/skills/java-coding-standards/SKILL.md @@ -0,0 +1,147 @@ +--- +name: java-coding-standards +description: "Java coding standards for Spring Boot services: naming, immutability, Optional usage, streams, exceptions, generics, and project layout." 
+origin: ECC +--- + +# Java Coding Standards + +Standards for readable, maintainable Java (17+) code in Spring Boot services. + +## When to Activate + +- Writing or reviewing Java code in Spring Boot projects +- Enforcing naming, immutability, or exception handling conventions +- Working with records, sealed classes, or pattern matching (Java 17+) +- Reviewing use of Optional, streams, or generics +- Structuring packages and project layout + +## Core Principles + +- Prefer clarity over cleverness +- Immutable by default; minimize shared mutable state +- Fail fast with meaningful exceptions +- Consistent naming and package structure + +## Naming + +```java +// ✅ Classes/Records: PascalCase +public class MarketService {} +public record Money(BigDecimal amount, Currency currency) {} + +// ✅ Methods/fields: camelCase +private final MarketRepository marketRepository; +public Market findBySlug(String slug) {} + +// ✅ Constants: UPPER_SNAKE_CASE +private static final int MAX_PAGE_SIZE = 100; +``` + +## Immutability + +```java +// ✅ Favor records and final fields +public record MarketDto(Long id, String name, MarketStatus status) {} + +public class Market { + private final Long id; + private final String name; + // getters only, no setters +} +``` + +## Optional Usage + +```java +// ✅ Return Optional from find* methods +Optional market = marketRepository.findBySlug(slug); + +// ✅ Map/flatMap instead of get() +return market + .map(MarketResponse::from) + .orElseThrow(() -> new EntityNotFoundException("Market not found")); +``` + +## Streams Best Practices + +```java +// ✅ Use streams for transformations, keep pipelines short +List names = markets.stream() + .map(Market::name) + .filter(Objects::nonNull) + .toList(); + +// ❌ Avoid complex nested streams; prefer loops for clarity +``` + +## Exceptions + +- Use unchecked exceptions for domain errors; wrap technical exceptions with context +- Create domain-specific exceptions (e.g., `MarketNotFoundException`) +- Avoid broad 
`catch (Exception ex)` unless rethrowing/logging centrally + +```java +throw new MarketNotFoundException(slug); +``` + +## Generics and Type Safety + +- Avoid raw types; declare generic parameters +- Prefer bounded generics for reusable utilities + +```java +public Map indexById(Collection items) { ... } +``` + +## Project Structure (Maven/Gradle) + +``` +src/main/java/com/example/app/ + config/ + controller/ + service/ + repository/ + domain/ + dto/ + util/ +src/main/resources/ + application.yml +src/test/java/... (mirrors main) +``` + +## Formatting and Style + +- Use 2 or 4 spaces consistently (project standard) +- One public top-level type per file +- Keep methods short and focused; extract helpers +- Order members: constants, fields, constructors, public methods, protected, private + +## Code Smells to Avoid + +- Long parameter lists → use DTO/builders +- Deep nesting → early returns +- Magic numbers → named constants +- Static mutable state → prefer dependency injection +- Silent catch blocks → log and act or rethrow + +## Logging + +```java +private static final Logger log = LoggerFactory.getLogger(MarketService.class); +log.info("fetch_market slug={}", slug); +log.error("failed_fetch_market slug={}", slug, ex); +``` + +## Null Handling + +- Accept `@Nullable` only when unavoidable; otherwise use `@NonNull` +- Use Bean Validation (`@NotNull`, `@NotBlank`) on inputs + +## Testing Expectations + +- JUnit 5 + AssertJ for fluent assertions +- Mockito for mocking; avoid partial mocks where possible +- Favor deterministic tests; no hidden sleeps + +**Remember**: Keep code intentional, typed, and observable. Optimize for maintainability over micro-optimizations unless proven necessary. 
diff --git a/.claude/skills/springboot-verification/SKILL.md b/.claude/skills/springboot-verification/SKILL.md new file mode 100644 index 000000000..c8f790aac --- /dev/null +++ b/.claude/skills/springboot-verification/SKILL.md @@ -0,0 +1,231 @@ +--- +name: springboot-verification +description: "Verification loop for Spring Boot projects: build, static analysis, tests with coverage, security scans, and diff review before release or PR." +origin: ECC +--- + +# Spring Boot Verification Loop + +Run before PRs, after major changes, and pre-deploy. + +## When to Activate + +- Before opening a pull request for a Spring Boot service +- After major refactoring or dependency upgrades +- Pre-deployment verification for staging or production +- Running full build → lint → test → security scan pipeline +- Validating test coverage meets thresholds + +## Phase 1: Build + +```bash +mvn -T 4 clean verify -DskipTests +# or +./gradlew clean assemble -x test +``` + +If build fails, stop and fix. + +## Phase 2: Static Analysis + +Maven (common plugins): +```bash +mvn -T 4 spotbugs:check pmd:check checkstyle:check +``` + +Gradle (if configured): +```bash +./gradlew checkstyleMain pmdMain spotbugsMain +``` + +## Phase 3: Tests + Coverage + +```bash +mvn -T 4 test +mvn jacoco:report # verify 80%+ coverage +# or +./gradlew test jacocoTestReport +``` + +Report: +- Total tests, passed/failed +- Coverage % (lines/branches) + +### Unit Tests + +Test service logic in isolation with mocked dependencies: + +```java +@ExtendWith(MockitoExtension.class) +class UserServiceTest { + + @Mock private UserRepository userRepository; + @InjectMocks private UserService userService; + + @Test + void createUser_validInput_returnsUser() { + var dto = new CreateUserDto("Alice", "alice@example.com"); + var expected = new User(1L, "Alice", "alice@example.com"); + when(userRepository.save(any(User.class))).thenReturn(expected); + + var result = userService.create(dto); + + 
assertThat(result.name()).isEqualTo("Alice"); + verify(userRepository).save(any(User.class)); + } + + @Test + void createUser_duplicateEmail_throwsException() { + var dto = new CreateUserDto("Alice", "existing@example.com"); + when(userRepository.existsByEmail(dto.email())).thenReturn(true); + + assertThatThrownBy(() -> userService.create(dto)) + .isInstanceOf(DuplicateEmailException.class); + } +} +``` + +### Integration Tests with Testcontainers + +Test against a real database instead of H2: + +```java +@SpringBootTest +@Testcontainers +class UserRepositoryIntegrationTest { + + @Container + static PostgreSQLContainer postgres = new PostgreSQLContainer<>("postgres:16-alpine") + .withDatabaseName("testdb"); + + @DynamicPropertySource + static void configureProperties(DynamicPropertyRegistry registry) { + registry.add("spring.datasource.url", postgres::getJdbcUrl); + registry.add("spring.datasource.username", postgres::getUsername); + registry.add("spring.datasource.password", postgres::getPassword); + } + + @Autowired private UserRepository userRepository; + + @Test + void findByEmail_existingUser_returnsUser() { + userRepository.save(new User("Alice", "alice@example.com")); + + var found = userRepository.findByEmail("alice@example.com"); + + assertThat(found).isPresent(); + assertThat(found.get().getName()).isEqualTo("Alice"); + } +} +``` + +### API Tests with MockMvc + +Test controller layer with full Spring context: + +```java +@WebMvcTest(UserController.class) +class UserControllerTest { + + @Autowired private MockMvc mockMvc; + @MockBean private UserService userService; + + @Test + void createUser_validInput_returns201() throws Exception { + var user = new UserDto(1L, "Alice", "alice@example.com"); + when(userService.create(any())).thenReturn(user); + + mockMvc.perform(post("/api/users") + .contentType(MediaType.APPLICATION_JSON) + .content(""" + {"name": "Alice", "email": "alice@example.com"} + """)) + .andExpect(status().isCreated()) + 
.andExpect(jsonPath("$.name").value("Alice")); + } + + @Test + void createUser_invalidEmail_returns400() throws Exception { + mockMvc.perform(post("/api/users") + .contentType(MediaType.APPLICATION_JSON) + .content(""" + {"name": "Alice", "email": "not-an-email"} + """)) + .andExpect(status().isBadRequest()); + } +} +``` + +## Phase 4: Security Scan + +```bash +# Dependency CVEs +mvn org.owasp:dependency-check-maven:check +# or +./gradlew dependencyCheckAnalyze + +# Secrets in source +grep -rn "password\s*=\s*\"" src/ --include="*.java" --include="*.yml" --include="*.properties" +grep -rn "sk-\|api_key\|secret" src/ --include="*.java" --include="*.yml" + +# Secrets (git history) +git secrets --scan # if configured +``` + +### Common Security Findings + +``` +# Check for System.out.println (use logger instead) +grep -rn "System\.out\.print" src/main/ --include="*.java" + +# Check for raw exception messages in responses +grep -rn "e\.getMessage()" src/main/ --include="*.java" + +# Check for wildcard CORS +grep -rn "allowedOrigins.*\*" src/main/ --include="*.java" +``` + +## Phase 5: Lint/Format (optional gate) + +```bash +mvn spotless:apply # if using Spotless plugin +./gradlew spotlessApply +``` + +## Phase 6: Diff Review + +```bash +git diff --stat +git diff +``` + +Checklist: +- No debugging logs left (`System.out`, `log.debug` without guards) +- Meaningful errors and HTTP statuses +- Transactions and validation present where needed +- Config changes documented + +## Output Template + +``` +VERIFICATION REPORT +=================== +Build: [PASS/FAIL] +Static: [PASS/FAIL] (spotbugs/pmd/checkstyle) +Tests: [PASS/FAIL] (X/Y passed, Z% coverage) +Security: [PASS/FAIL] (CVE findings: N) +Diff: [X files changed] + +Overall: [READY / NOT READY] + +Issues to Fix: +1. ... +2. ... 
+``` + +## Continuous Mode + +- Re-run phases on significant changes or every 30–60 minutes in long sessions +- Keep a short loop: `mvn -T 4 test` + spotbugs for quick feedback + +**Remember**: Fast feedback beats late surprises. Keep the gate strict—treat warnings as defects in production systems. diff --git a/.claude/skills/tmux/SKILL.md b/.claude/skills/tmux/SKILL.md new file mode 100644 index 000000000..fa589c0e2 --- /dev/null +++ b/.claude/skills/tmux/SKILL.md @@ -0,0 +1,153 @@ +--- +name: tmux +description: Remote-control tmux sessions for interactive CLIs by sending keystrokes and scraping pane output. +metadata: + { "openclaw": { "emoji": "🧵", "os": ["darwin", "linux"], "requires": { "bins": ["tmux"] } } } +--- + +# tmux Session Control + +Control tmux sessions by sending keystrokes and reading output. Essential for managing Claude Code sessions. + +## When to Use + +✅ **USE this skill when:** + +- Monitoring Claude/Codex sessions in tmux +- Sending input to interactive terminal applications +- Scraping output from long-running processes in tmux +- Navigating tmux panes/windows programmatically +- Checking on background work in existing sessions + +## When NOT to Use + +❌ **DON'T use this skill when:** + +- Running one-off shell commands → use `exec` tool directly +- Starting new background processes → use `exec` with `background:true` +- Non-interactive scripts → use `exec` tool +- The process isn't in tmux +- You need to create a new tmux session → use `exec` with `tmux new-session` + +## Example Sessions + +| Session | Purpose | +| ----------------------- | --------------------------- | +| `shared` | Primary interactive session | +| `worker-2` - `worker-8` | Parallel worker sessions | + +## Common Commands + +### List Sessions + +```bash +tmux list-sessions +tmux ls +``` + +### Capture Output + +```bash +# Last 20 lines of pane +tmux capture-pane -t shared -p | tail -20 + +# Entire scrollback +tmux capture-pane -t shared -p -S - + +# Specific pane in 
window +tmux capture-pane -t shared:0.0 -p +``` + +### Send Keys + +```bash +# Send text (doesn't press Enter) +tmux send-keys -t shared "hello" + +# Send text + Enter +tmux send-keys -t shared "y" Enter + +# Send special keys +tmux send-keys -t shared Enter +tmux send-keys -t shared Escape +tmux send-keys -t shared C-c # Ctrl+C +tmux send-keys -t shared C-d # Ctrl+D (EOF) +tmux send-keys -t shared C-z # Ctrl+Z (suspend) +``` + +### Window/Pane Navigation + +```bash +# Select window +tmux select-window -t shared:0 + +# Select pane +tmux select-pane -t shared:0.1 + +# List windows +tmux list-windows -t shared +``` + +### Session Management + +```bash +# Create new session +tmux new-session -d -s newsession + +# Kill session +tmux kill-session -t sessionname + +# Rename session +tmux rename-session -t old new +``` + +## Sending Input Safely + +For interactive TUIs (Claude Code, Codex, etc.), split text and Enter into separate sends to avoid paste/multiline edge cases: + +```bash +tmux send-keys -t shared -l -- "Please apply the patch in src/foo.ts" +sleep 0.1 +tmux send-keys -t shared Enter +``` + +## Claude Code Session Patterns + +### Check if Session Needs Input + +```bash +# Look for prompts +tmux capture-pane -t worker-3 -p | tail -10 | grep -E "❯|Yes.*No|proceed|permission" +``` + +### Approve Claude Code Prompt + +```bash +# Send 'y' and Enter +tmux send-keys -t worker-3 'y' Enter + +# Or select numbered option +tmux send-keys -t worker-3 '2' Enter +``` + +### Check All Sessions Status + +```bash +for s in shared worker-2 worker-3 worker-4 worker-5 worker-6 worker-7 worker-8; do + echo "=== $s ===" + tmux capture-pane -t $s -p 2>/dev/null | tail -5 +done +``` + +### Send Task to Session + +```bash +tmux send-keys -t worker-4 "Fix the bug in auth.js" Enter +``` + +## Notes + +- Use `capture-pane -p` to print to stdout (essential for scripting) +- `-S -` captures entire scrollback history +- Target format: `session:window.pane` (e.g., `shared:0.0`) +- Sessions 
persist across SSH disconnects diff --git a/.claude/skills/tmux/scripts/find-sessions.sh b/.claude/skills/tmux/scripts/find-sessions.sh new file mode 100755 index 000000000..8387c1629 --- /dev/null +++ b/.claude/skills/tmux/scripts/find-sessions.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: find-sessions.sh [-L socket-name|-S socket-path|-A] [-q pattern] + +List tmux sessions on a socket (default tmux socket if none provided). + +Options: + -L, --socket tmux socket name (passed to tmux -L) + -S, --socket-path tmux socket path (passed to tmux -S) + -A, --all scan all sockets under OPENCLAW_TMUX_SOCKET_DIR + -q, --query case-insensitive substring to filter session names + -h, --help show this help +USAGE +} + +socket_name="" +socket_path="" +query="" +scan_all=false +socket_dir="${OPENCLAW_TMUX_SOCKET_DIR:-${CLAWDBOT_TMUX_SOCKET_DIR:-${TMPDIR:-/tmp}/openclaw-tmux-sockets}}" + +while [[ $# -gt 0 ]]; do + case "$1" in + -L|--socket) socket_name="${2-}"; shift 2 ;; + -S|--socket-path) socket_path="${2-}"; shift 2 ;; + -A|--all) scan_all=true; shift ;; + -q|--query) query="${2-}"; shift 2 ;; + -h|--help) usage; exit 0 ;; + *) echo "Unknown option: $1" >&2; usage; exit 1 ;; + esac +done + +if [[ "$scan_all" == true && ( -n "$socket_name" || -n "$socket_path" ) ]]; then + echo "Cannot combine --all with -L or -S" >&2 + exit 1 +fi + +if [[ -n "$socket_name" && -n "$socket_path" ]]; then + echo "Use either -L or -S, not both" >&2 + exit 1 +fi + +if ! command -v tmux >/dev/null 2>&1; then + echo "tmux not found in PATH" >&2 + exit 1 +fi + +list_sessions() { + local label="$1"; shift + local tmux_cmd=(tmux "$@") + + if ! 
sessions="$("${tmux_cmd[@]}" list-sessions -F '#{session_name}\t#{session_attached}\t#{session_created_string}' 2>/dev/null)"; then + echo "No tmux server found on $label" >&2 + return 1 + fi + + if [[ -n "$query" ]]; then + sessions="$(printf '%s\n' "$sessions" | grep -i -- "$query" || true)" + fi + + if [[ -z "$sessions" ]]; then + echo "No sessions found on $label" + return 0 + fi + + echo "Sessions on $label:" + printf '%s\n' "$sessions" | while IFS=$'\t' read -r name attached created; do + attached_label=$([[ "$attached" == "1" ]] && echo "attached" || echo "detached") + printf ' - %s (%s, started %s)\n' "$name" "$attached_label" "$created" + done +} + +if [[ "$scan_all" == true ]]; then + if [[ ! -d "$socket_dir" ]]; then + echo "Socket directory not found: $socket_dir" >&2 + exit 1 + fi + + shopt -s nullglob + sockets=("$socket_dir"/*) + shopt -u nullglob + + if [[ "${#sockets[@]}" -eq 0 ]]; then + echo "No sockets found under $socket_dir" >&2 + exit 1 + fi + + exit_code=0 + for sock in "${sockets[@]}"; do + if [[ ! -S "$sock" ]]; then + continue + fi + list_sessions "socket path '$sock'" -S "$sock" || exit_code=$? + done + exit "$exit_code" +fi + +tmux_cmd=(tmux) +socket_label="default socket" + +if [[ -n "$socket_name" ]]; then + tmux_cmd+=(-L "$socket_name") + socket_label="socket name '$socket_name'" +elif [[ -n "$socket_path" ]]; then + tmux_cmd+=(-S "$socket_path") + socket_label="socket path '$socket_path'" +fi + +list_sessions "$socket_label" "${tmux_cmd[@]:1}" diff --git a/.claude/skills/tmux/scripts/wait-for-text.sh b/.claude/skills/tmux/scripts/wait-for-text.sh new file mode 100755 index 000000000..56354be83 --- /dev/null +++ b/.claude/skills/tmux/scripts/wait-for-text.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: wait-for-text.sh -t target -p pattern [options] + +Poll a tmux pane for text and exit when found. 
+ +Options: + -t, --target tmux target (session:window.pane), required + -p, --pattern regex pattern to look for, required + -F, --fixed treat pattern as a fixed string (grep -F) + -T, --timeout seconds to wait (integer, default: 15) + -i, --interval poll interval in seconds (default: 0.5) + -l, --lines number of history lines to inspect (integer, default: 1000) + -h, --help show this help +USAGE +} + +target="" +pattern="" +grep_flag="-E" +timeout=15 +interval=0.5 +lines=1000 + +while [[ $# -gt 0 ]]; do + case "$1" in + -t|--target) target="${2-}"; shift 2 ;; + -p|--pattern) pattern="${2-}"; shift 2 ;; + -F|--fixed) grep_flag="-F"; shift ;; + -T|--timeout) timeout="${2-}"; shift 2 ;; + -i|--interval) interval="${2-}"; shift 2 ;; + -l|--lines) lines="${2-}"; shift 2 ;; + -h|--help) usage; exit 0 ;; + *) echo "Unknown option: $1" >&2; usage; exit 1 ;; + esac +done + +if [[ -z "$target" || -z "$pattern" ]]; then + echo "target and pattern are required" >&2 + usage + exit 1 +fi + +if ! [[ "$timeout" =~ ^[0-9]+$ ]]; then + echo "timeout must be an integer number of seconds" >&2 + exit 1 +fi + +if ! [[ "$lines" =~ ^[0-9]+$ ]]; then + echo "lines must be an integer" >&2 + exit 1 +fi + +if ! 
command -v tmux >/dev/null 2>&1; then + echo "tmux not found in PATH" >&2 + exit 1 +fi + +# End time in epoch seconds (integer, good enough for polling) +start_epoch=$(date +%s) +deadline=$((start_epoch + timeout)) + +while true; do + # -J joins wrapped lines, -S uses negative index to read last N lines + pane_text="$(tmux capture-pane -p -J -t "$target" -S "-${lines}" 2>/dev/null || true)" + + if printf '%s\n' "$pane_text" | grep $grep_flag -- "$pattern" >/dev/null 2>&1; then + exit 0 + fi + + now=$(date +%s) + if (( now >= deadline )); then + echo "Timed out after ${timeout}s waiting for pattern: $pattern" >&2 + echo "Last ${lines} lines from $target:" >&2 + printf '%s\n' "$pane_text" >&2 + exit 1 + fi + + sleep "$interval" +done diff --git a/.github/workflows/backend-ci.yml b/.github/workflows/backend-ci.yml index 3c34cb818..e744784bf 100644 --- a/.github/workflows/backend-ci.yml +++ b/.github/workflows/backend-ci.yml @@ -143,6 +143,7 @@ jobs: - name: Generate test report if: always() && steps.check-reports.outputs.reports_exist == 'true' + continue-on-error: true uses: dorny/test-reporter@v1 with: name: Backend Test Report diff --git a/.gitignore b/.gitignore index ccaa7935a..9ab7160f8 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,9 @@ target/ !**/src/main/**/target/ !**/src/test/**/target/ +### jqwik ### +.jqwik-database + ### IntelliJ IDEA ### .idea *.iws @@ -31,10 +34,6 @@ build/ ### VS Code ### .vscode/ -### AI Coding Assistants ### -.cursor/ -.qoder/ - ### Mac OS ### .DS_Store @@ -42,8 +41,31 @@ build/ package-lock.json +### Node modules ### +node_modules/ + .clinerules/ # Local / deploy scripts (do not commit) .env push-himarket.sh + +/workspace/ +/workspaces/ + +### Local docs ### +/docs/acp +.app.pid + +### OpenSandbox (local clone, not committed) ### +/OpenSandbox/ + +### Nacos (local symlink/clone, not committed) ### +/nacos + +### Kiro (AWS Kira AI coding agent) ### +.kiro/ + +### Docker deploy data (should live outside project) ### 
+deploy/docker/data/ +deploy/docker/standalone-logs/ diff --git a/.husky/pre-commit b/.husky/pre-commit deleted file mode 100755 index c34d5a4e8..000000000 --- a/.husky/pre-commit +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env sh - -# 检查前端子目录的 diff 文件 -echo "🔍 Linting frontend diff files..." -cd himarket-web/himarket-frontend && npx lint-staged - -# 检查管理后台子目录的 diff 文件 -echo "🔍 Linting admin diff files..." -cd ../himarket-admin && npx lint-staged - diff --git a/.mvn/wrapper/maven-wrapper.properties b/.mvn/wrapper/maven-wrapper.properties new file mode 100644 index 000000000..15a6c6e87 --- /dev/null +++ b/.mvn/wrapper/maven-wrapper.properties @@ -0,0 +1,3 @@ +wrapperVersion=3.3.4 +distributionType=only-script +distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.3/apache-maven-3.8.3-bin.zip diff --git a/.qoder/skills/create-issue-himarket/SKILL.md b/.qoder/skills/create-issue-himarket/SKILL.md new file mode 100644 index 000000000..244461d22 --- /dev/null +++ b/.qoder/skills/create-issue-himarket/SKILL.md @@ -0,0 +1,163 @@ +--- +name: create-issue-himarket +description: "通过自然语言在 HiMarket 社区创建 Issue。支持 Feature Request(功能请求)和 Bug Report(问题报告)两种类型。当用户想要向 HiMarket 提交功能建议或报告问题时使用此 skill。" +--- + +# 创建 HiMarket Issue + +## 概述 + +帮助用户通过自然语言描述,在 [higress-group/himarket](https://github.com/higress-group/himarket) 仓库创建规范的 Issue。 + +## Issue 类型 + +HiMarket 支持两种 Issue 类型: + +### 1. Feature Request(功能请求) + +用于提交新功能建议或改进想法。 + +**必填信息:** +- **Why do you need it?** - 为什么需要这个功能?描述你遇到的问题或痛点 +- **How could it be?** - 期望的功能是什么样的?描述输入和输出 +- **Other related information** - 其他相关信息、截图或上下文(可选) + +### 2. 
Bug Report(问题报告) + +用于报告 Bug 或异常行为。 + +**必填信息:** +- **Issue Description** - 问题简述 +- **What happened** - 发生了什么?包括异常堆栈信息 +- **Expected behavior** - 期望的行为是什么 +- **Reproduction steps** - 最小化的复现步骤 +- **Environment** - 环境信息(可选) + +**可选信息:** +- **Anything else** - 其他补充信息 +- **Root Cause** - 根因分析(如已定位) +- **Proposed Solution** - 建议的解决方案 + +## 工作流程 + +### 步骤 1:确定 Issue 类型 + +首先询问用户要创建的 Issue 类型: +- 功能请求(Feature Request) +- 问题报告(Bug Report) + +### 步骤 2:收集必要信息 + +根据 Issue 类型,通过对话逐步收集必要信息: + +**对于功能请求:** +1. 询问为什么需要这个功能(遇到了什么问题) +2. 询问期望的功能实现方式 +3. 询问是否有其他补充信息 + +**对于问题报告:** +1. 询问问题的简要描述 +2. 询问具体发生了什么(包括错误信息) +3. 询问期望的正确行为 +4. 询问复现步骤 +5. 询问是否已定位根因或有解决方案建议 + +### 步骤 3:生成 Issue 内容 + +根据收集的信息,按模板格式生成 Issue 内容,展示给用户确认。 + +### 步骤 4:创建 Issue + +确认后,使用 GitHub CLI 创建 Issue: + +```bash +# Feature Request +gh issue create --repo higress-group/himarket \ + --title "[Feature] <标题>" \ + --body "$(cat <<'EOF' +## Why do you need it? +<用户描述的问题/痛点> + +## How could it be? +<期望的功能实现> + +## Other related information +<其他补充信息> + +--- +**Type**: Feature Request +EOF +)" + +# Bug Report +gh issue create --repo higress-group/himarket \ + --title "[Bug] <标题>" \ + --body "$(cat <<'EOF' +## Issue Description +<问题简述> + +## What happened +<发生了什么> + +``` +<异常堆栈或错误信息> +``` + +## Expected behavior +<期望行为> + +## Reproduction steps +1. <步骤1> +2. <步骤2> +3. <步骤3> + +## Root Cause (if known) +<根因分析> + +## Proposed Solution (if any) +<建议的解决方案> + +## Files Changed (if applicable) +- `path/to/file1` +- `path/to/file2` + +--- +**Type**: Bug Report +**Component**: +EOF +)" +``` + +## 关键原则 + +- **一次一个问题** - 逐步收集信息,不要一次性询问太多 +- **提供示例** - 在询问时给出示例帮助用户理解 +- **确认重复** - 提醒用户先搜索是否有类似 Issue +- **语言选择** - 支持用户用中文或英文描述 +- **预览确认** - 创建前展示完整内容让用户确认 +- **返回链接** - 创建成功后返回 Issue URL +- **标题前缀** - Bug 使用 `[Bug]`,功能请求使用 `[Feature]`,也可按组件添加如 `[Frontend]` + +## 示例对话 + +**用户:** 我想给 HiMarket 提一个功能建议 + +**助手:** 好的,我来帮你创建一个 Feature Request。请问你为什么需要这个功能?遇到了什么问题或痛点? 
+ +**用户:** 聊天页面停止按钮点击没反应 + +**助手:** 明白了,这是一个 Bug Report。请问具体是什么情况?在什么操作下发现的? + +**用户:** 模型推理过程中点击停止按钮无法取消 + +**助手:** 好的,我整理了以下 Issue 内容,请确认: + +> **Title:** [Frontend] HiChat 停止按钮无法取消模型推理 +> +> ## Issue Description +> HiChat 聊天页面在模型推理过程中,停止按钮点击无响应。 +> +> ## Expected behavior +> 点击停止按钮后应中断推理过程。 + +确认无误后我将创建这个 Issue。 diff --git a/.qoder/skills/create-pr-himarket/SKILL.md b/.qoder/skills/create-pr-himarket/SKILL.md new file mode 100644 index 000000000..f57d682f4 --- /dev/null +++ b/.qoder/skills/create-pr-himarket/SKILL.md @@ -0,0 +1,278 @@ +--- +name: create-pr-himarket +description: "为 HiMarket 项目创建符合规范的 Pull Request。当用户需要提交代码、推送分支或创建 PR 时使用此 skill,确保 PR 标题和内容符合项目 CI 检查要求。" +--- + +# 创建 HiMarket Pull Request + +## 概述 + +帮助用户在 [higress-group/himarket](https://github.com/higress-group/himarket) 仓库创建符合规范的 Pull Request,确保通过 PR Title Check、PR Content Check 和 PR Size Check。 + +## PR 标题格式 + +### 必需格式 + +``` +type: 简短描述 +``` + +或带范围: + +``` +type(scope): 简短描述 +``` + +### 允许的 Type + +| Type | 说明 | 示例 | +|------|------|------| +| `feat` | 新功能 | `feat: add user authentication` | +| `fix` | Bug 修复 | `fix: resolve memory leak` | +| `docs` | 文档更新 | `docs: update API documentation` | +| `style` | 代码格式 | `style: format with prettier` | +| `refactor` | 重构 | `refactor: simplify service logic` | +| `perf` | 性能优化 | `perf: optimize queries` | +| `test` | 测试 | `test: add unit tests` | +| `build` | 构建系统 | `build: update dependencies` | +| `ci` | CI/CD | `ci: add workflow` | +| `chore` | 其他变更 | `chore: update gitignore` | +| `revert` | 回滚 | `revert: revert commit abc123` | + +### 标题规则 + +1. 必须包含 type 前缀 +2. type 后需要冒号和空格:`feat: ` 而不是 `feat:` +3. 描述必须以**小写字母**开头 +4. 保持简短清晰(建议 < 50 字符) + +## PR 内容格式(必填) + +### 必填部分 + +#### 1. Description(必填) + +必须包含 `## 📝 Description` 部分,且内容至少 10 个字符。 + +```markdown +## 📝 Description + +- 变更点 1 +- 变更点 2 +- 变更点 3 +``` + +#### 2. 
Type of Change(必填) + +必须至少勾选一项变更类型。 + +```markdown +## ✅ Type of Change + +- [x] Bug fix (non-breaking change) +- [ ] New feature (non-breaking change) +- [ ] Breaking change +- [ ] Documentation update +- [ ] Code refactoring +- [ ] Performance improvement +``` + +### 可选部分 + +#### 3. Related Issues(推荐) + +```markdown +## 🔗 Related Issues + +Fix #123 +Close #456 +``` + +#### 4. Testing(推荐) + +```markdown +## 🧪 Testing + +- [x] Unit tests added/updated +- [x] Manual testing completed +- 测试场景描述 +``` + +#### 5. Checklist(推荐) + +```markdown +## 📋 Checklist + +- [x] Code has been formatted (`mvn spotless:apply` for backend, `npm run lint:fix` for frontend) +- [x] Code is self-reviewed +- [x] No breaking changes +- [x] All CI checks pass +``` + +## 工作流程 + +### 步骤 1:检查当前状态 + +```bash +# 并行执行以下命令 +git status +git diff --stat +git log --oneline -5 +git branch -a +git remote -v +``` + +### 步骤 2:确认分支和远程 + +- 确认当前分支名称 +- 确认 origin 指向用户的 fork(如 `lexburner/himarket`) +- 确认 upstream 指向主仓库(`higress-group/himarket`) + +### 步骤 3:推送分支 + +```bash +git push -u origin +``` + +### 步骤 4:创建 PR + +使用 HEREDOC 格式确保内容正确: + +```bash +gh pr create --repo higress-group/himarket \ + --base main \ + --head : \ + --title "feat: 简短描述" \ + --body "$(cat <<'EOF' +## 📝 Description + +- 变更点 1 +- 变更点 2 + +## 🔗 Related Issues + +Fix #123 + +## ✅ Type of Change + +- [ ] Bug fix (non-breaking change) +- [x] New feature (non-breaking change) +- [ ] Breaking change +- [ ] Documentation update +- [ ] Code refactoring +- [ ] Performance improvement + +## 🧪 Testing + +- [x] Unit tests pass locally +- [x] Manual testing completed + +## 📋 Checklist + +- [x] Code has been formatted (`mvn spotless:apply` for backend, `npm run lint:fix` for frontend) +- [x] Code is self-reviewed +- [x] No breaking changes +- [x] All CI checks pass +EOF +)" +``` + +### 步骤 5:验证检查状态 + +```bash +gh pr checks --repo higress-group/himarket +``` + +确保以下检查通过: +- PR Title Check +- PR Content Check +- PR Size Check +- PR Validation Summary + 
+## 关键原则 + +- **标题小写** - 描述部分必须以小写字母开头 +- **内容完整** - 必须包含 `## 📝 Description` 和 `## ✅ Type of Change` +- **勾选类型** - Type of Change 必须至少勾选一项 `[x]` +- **关联 Issue** - 推荐使用 `Fix #123` 格式关联 Issue +- **格式化代码** - 提交前运行 `mvn spotless:apply` 或 `npm run lint:fix` +- **不提交图片** - 避免将截图等二进制文件提交到仓库 + +## 常见错误 + +### 错误 1:标题首字母大写 + +``` +❌ feat: Add new feature +✅ feat: add new feature +``` + +### 错误 2:缺少 Description 标题 + +```markdown +❌ 直接写内容 +✅ ## 📝 Description + 内容 +``` + +### 错误 3:未勾选 Type of Change + +```markdown +❌ - [ ] New feature +✅ - [x] New feature +``` + +### 错误 4:Description 内容太短 + +```markdown +❌ ## 📝 Description + Fix bug + +✅ ## 📝 Description + Fix pagination bug in product list +``` + +## 完整示例 + +**标题:** +``` +feat(chat): add tool call support and stop generation feature +``` + +**内容:** +```markdown +## 📝 Description + +- 添加聊天工具调用(Tool Call)支持,工具执行状态按消息顺序内联展示 +- 添加停止生成过程功能,支持中断正在进行的 AI 回复 +- 优化模型推理时滚动条自由滑动体验 + +## 🔗 Related Issues + +Fix #163 +Fix #164 +Fix #165 + +## ✅ Type of Change + +- [x] Bug fix (non-breaking change) +- [x] New feature (non-breaking change) +- [ ] Breaking change +- [ ] Documentation update +- [ ] Code refactoring +- [ ] Performance improvement + +## 🧪 Testing + +- [x] Unit tests pass locally +- [x] Manual testing completed +- 测试停止按钮能否正常中断 SSE 流式请求 +- 测试模型推理时滚动条是否可自由滑动 + +## 📋 Checklist + +- [x] Code has been formatted (`mvn spotless:apply` for backend, `npm run lint:fix` for frontend) +- [x] Code is self-reviewed +- [x] No breaking changes +- [x] All CI checks pass +``` diff --git a/.qoder/skills/java-coding-standards/SKILL.md b/.qoder/skills/java-coding-standards/SKILL.md new file mode 100644 index 000000000..af9902559 --- /dev/null +++ b/.qoder/skills/java-coding-standards/SKILL.md @@ -0,0 +1,147 @@ +--- +name: java-coding-standards +description: "Java coding standards for Spring Boot services: naming, immutability, Optional usage, streams, exceptions, generics, and project layout." 
+origin: ECC +--- + +# Java Coding Standards + +Standards for readable, maintainable Java (17+) code in Spring Boot services. + +## When to Activate + +- Writing or reviewing Java code in Spring Boot projects +- Enforcing naming, immutability, or exception handling conventions +- Working with records, sealed classes, or pattern matching (Java 17+) +- Reviewing use of Optional, streams, or generics +- Structuring packages and project layout + +## Core Principles + +- Prefer clarity over cleverness +- Immutable by default; minimize shared mutable state +- Fail fast with meaningful exceptions +- Consistent naming and package structure + +## Naming + +```java +// ✅ Classes/Records: PascalCase +public class MarketService {} +public record Money(BigDecimal amount, Currency currency) {} + +// ✅ Methods/fields: camelCase +private final MarketRepository marketRepository; +public Market findBySlug(String slug) {} + +// ✅ Constants: UPPER_SNAKE_CASE +private static final int MAX_PAGE_SIZE = 100; +``` + +## Immutability + +```java +// ✅ Favor records and final fields +public record MarketDto(Long id, String name, MarketStatus status) {} + +public class Market { + private final Long id; + private final String name; + // getters only, no setters +} +``` + +## Optional Usage + +```java +// ✅ Return Optional from find* methods +Optional market = marketRepository.findBySlug(slug); + +// ✅ Map/flatMap instead of get() +return market + .map(MarketResponse::from) + .orElseThrow(() -> new EntityNotFoundException("Market not found")); +``` + +## Streams Best Practices + +```java +// ✅ Use streams for transformations, keep pipelines short +List names = markets.stream() + .map(Market::name) + .filter(Objects::nonNull) + .toList(); + +// ❌ Avoid complex nested streams; prefer loops for clarity +``` + +## Exceptions + +- Use unchecked exceptions for domain errors; wrap technical exceptions with context +- Create domain-specific exceptions (e.g., `MarketNotFoundException`) +- Avoid broad 
`catch (Exception ex)` unless rethrowing/logging centrally + +```java +throw new MarketNotFoundException(slug); +``` + +## Generics and Type Safety + +- Avoid raw types; declare generic parameters +- Prefer bounded generics for reusable utilities + +```java +public Map indexById(Collection items) { ... } +``` + +## Project Structure (Maven/Gradle) + +``` +src/main/java/com/example/app/ + config/ + controller/ + service/ + repository/ + domain/ + dto/ + util/ +src/main/resources/ + application.yml +src/test/java/... (mirrors main) +``` + +## Formatting and Style + +- Use 2 or 4 spaces consistently (project standard) +- One public top-level type per file +- Keep methods short and focused; extract helpers +- Order members: constants, fields, constructors, public methods, protected, private + +## Code Smells to Avoid + +- Long parameter lists → use DTO/builders +- Deep nesting → early returns +- Magic numbers → named constants +- Static mutable state → prefer dependency injection +- Silent catch blocks → log and act or rethrow + +## Logging + +```java +private static final Logger log = LoggerFactory.getLogger(MarketService.class); +log.info("fetch_market slug={}", slug); +log.error("failed_fetch_market slug={}", slug, ex); +``` + +## Null Handling + +- Accept `@Nullable` only when unavoidable; otherwise use `@NonNull` +- Use Bean Validation (`@NotNull`, `@NotBlank`) on inputs + +## Testing Expectations + +- JUnit 5 + AssertJ for fluent assertions +- Mockito for mocking; avoid partial mocks where possible +- Favor deterministic tests; no hidden sleeps + +**Remember**: Keep code intentional, typed, and observable. Optimize for maintainability over micro-optimizations unless proven necessary. 
diff --git a/.qoder/skills/publish-skills/SKILL.md b/.qoder/skills/publish-skills/SKILL.md new file mode 100644 index 000000000..d1c81f9ce --- /dev/null +++ b/.qoder/skills/publish-skills/SKILL.md @@ -0,0 +1,216 @@ +--- +name: publish-skills +description: Publish local Agent Skills to a HiMarket backend instance. Use when the user wants to upload, publish, deploy, or sync skills to HiMarket. Supports batch publishing all skills in a directory with automatic category selection, tag generation, and conflict avoidance. +--- + +# Publish Skills to HiMarket + +将本地 Skill 目录批量发布到 HiMarket 后台,自动处理分类选择、Tag 生成、冲突检测和门户发布。 + +## Configuration + +从 `~/.env` 读取以下变量(也可通过 shell 环境变量覆盖): + +``` +HIMARKET_PUBLISH_URL=http://localhost:8080 # HiMarket 后台地址 +HIMARKET_PUBLISH_USERNAME=admin # 管理员用户名 +HIMARKET_PUBLISH_PASSWORD=admin # 管理员密码 +``` + +如果 `~/.env` 中没有这些变量,提示用户配置后再执行。 + +## Invocation + +用户调用方式: +- `publish-skills` — 发布默认目录 `~/Downloads/skills` 下所有 skill +- `publish-skills /path/to/skills` — 发布指定目录下所有 skill +- `publish-skills /path/to/skills/pdf` — 仅发布单个 skill + +## Workflow + +### Step 1: Load Config & Authenticate + +```bash +# 加载环境变量 +source ~/.env 2>/dev/null +HM_URL="${HIMARKET_PUBLISH_URL:-http://localhost:8080}" +HM_USER="${HIMARKET_PUBLISH_USERNAME:-admin}" +HM_PASS="${HIMARKET_PUBLISH_PASSWORD:-admin}" + +# 获取 admin token +TOKEN=$(curl -s --connect-timeout 10 --max-time 15 -X POST "$HM_URL/admins/login" \ + -H "Content-Type: application/json" \ + -d "{\"username\":\"$HM_USER\",\"password\":\"$HM_PASS\"}" | jq -r '.data.access_token') +``` + +验证 TOKEN 非空且非 `null`,否则报错退出。 + +### Step 2: Get Portal & Existing Data + +```bash +# 获取默认门户 ID +PORTAL_ID=$(curl -s --max-time 15 -H "Authorization: Bearer $TOKEN" \ + "$HM_URL/portals?size=1" | jq -r '.data.content[0].portalId // empty') + +# 获取已有 AGENT_SKILL 产品列表(避免重复) +curl -s -H "Authorization: Bearer $TOKEN" \ + "$HM_URL/products?type=AGENT_SKILL&size=200" | jq '.data.content[] | {name, productId}' + +# 获取已有分类列表 +curl -s -H 
"Authorization: Bearer $TOKEN" \ + "$HM_URL/product-categories?size=200" | jq '.data.content[] | {categoryId, name}' +``` + +记录已有产品名称和分类信息,用于后续冲突检测和分类匹配。 + +### Step 3: Scan Skills Directory + +遍历目标目录下每个子目录,检查是否包含 `SKILL.md`: +- 有 `SKILL.md` → 有效 skill,继续处理 +- 无 `SKILL.md` → 跳过,记录日志 + +从 `SKILL.md` front matter 中解析: +- `name` — skill 名称(必填) +- `description` — 描述(截断到 256 字符) + +**冲突检测**:如果该 name 已存在于远端产品列表中,记录其 productId,后续走更新流程(仅重新上传 package),不重新创建产品。 + +### Step 4: Determine Category + +根据 skill 的 `name`、`description` 和 `SKILL.md` 内容,为每个 skill 选择最合适的分类。 + +**预定义分类映射**(优先匹配): + +| Skill 关键词 | 分类名称 | 说明 | +|---|---|---| +| pdf, docx, pptx, xlsx | 文档处理 | 文档读写、格式转换 | +| frontend-design, notion-infographic, remotion | 设计创意 | UI 设计、图形生成、视频制作 | +| vite | 开发工具 | 开发框架、构建工具 | +| crawl, extract, search, tavily-best-practices, discord | 自动化 | 爬虫、搜索、集成 | +| find-skill, find-skills | 技能发现 | Skill 搜索和安装 | +| research | 效率提升 | 调研、信息整合 | + +**如果 skill 不在预定义映射中**,阅读其 `SKILL.md` 内容后自行判断最佳分类。 + +**分类匹配逻辑**: +1. 先在已有分类列表中按名称模糊匹配 +2. 匹配到 → 使用已有 categoryId +3. 
未匹配到 → 创建新分类: + +```bash +curl -s -X POST "$HM_URL/product-categories" \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"name":"分类名称","description":"分类描述"}' +``` + +### Step 5: Generate Tags + +为每个 skill 生成 3-6 个有意义的 tag。Tag 要求: +- 英文小写,用连字符分隔(如 `pdf-parsing`、`web-scraping`) +- 反映 skill 的核心能力和使用场景 +- 不要过于宽泛(避免 `tool`、`utility` 这类无意义 tag) + +**Tag 生成参考**: + +| Skill | 推荐 Tags | +|---|---| +| pdf | `pdf-parsing`, `document-generation`, `form-filling`, `text-extraction`, `page-manipulation` | +| docx | `word-document`, `docx-generation`, `content-extraction`, `template-processing` | +| pptx | `presentation`, `slide-generation`, `powerpoint`, `content-modification` | +| xlsx | `spreadsheet`, `excel`, `data-manipulation`, `formula-processing` | +| vite | `web-development`, `react`, `frontend-scaffold`, `build-tool` | +| frontend-design | `ui-design`, `frontend`, `visual-design`, `web-components` | +| notion-infographic | `infographic`, `visual-content`, `notion-style`, `social-media` | +| remotion | `video-creation`, `react-video`, `animation`, `media-production` | +| crawl | `web-crawling`, `content-download`, `site-archival`, `markdown-conversion` | +| search | `web-search`, `tavily`, `content-discovery`, `real-time-data` | +| extract | `content-extraction`, `url-parsing`, `web-content`, `markdown` | +| research | `ai-research`, `topic-synthesis`, `citations`, `knowledge-base` | +| find-skills | `skill-discovery`, `skill-installation`, `capability-search` | +| discord | `discord-bot`, `messaging`, `community-management`, `automation` | +| tavily-best-practices | `tavily-integration`, `search-api`, `rag-pipeline`, `agentic-workflow` | + +**如果 skill 不在上表中**,阅读 `SKILL.md` 内容后自行生成合理的 tag。 + +### Step 6: Create or Update Product + +**新建产品**(远端不存在同名产品时): + +```bash +# document 是 SKILL.md 的全文内容 +PAYLOAD=$(jq -n \ + --arg name "$SKILL_NAME" \ + --arg desc "$DESCRIPTION" \ + --arg doc "$DOCUMENT" \ + --argjson cats '["categoryId1"]' \ + 
--argjson tags '["tag1","tag2","tag3"]' \ + '{ + name: $name, + description: $desc, + type: "AGENT_SKILL", + document: $doc, + autoApprove: true, + categories: $cats, + feature: { skillConfig: { skillTags: $tags } } + }') + +curl -s -X POST "$HM_URL/products" \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d "$PAYLOAD" | jq '.data.productId' +``` + +**已存在产品**:直接使用已有 productId,跳到 Step 7 上传 package。 + +### Step 7: Zip & Upload Package + +```bash +# 打包 skill 目录为 zip(排除 .DS_Store) +TMPZIP=$(mktemp /tmp/skill-XXXXXX.zip) +(cd "$SKILL_DIR" && zip -qry "$TMPZIP" . --exclude "*.DS_Store") + +# 上传到 HiMarket(支持重试,Nacos 写入可能较慢) +curl -s --max-time 120 -X POST "$HM_URL/skills/$PRODUCT_ID/package" \ + -H "Authorization: Bearer $TOKEN" \ + -F "file=@$TMPZIP;type=application/zip" + +rm -f "$TMPZIP" +``` + +上传失败时最多重试 3 次,每次间隔递增(1s、2s、3s)。 + +### Step 8: Publish to Portal + +```bash +curl -s -X POST "$HM_URL/products/$PRODUCT_ID/publications" \ + -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d "{\"portalId\":\"$PORTAL_ID\"}" +``` + +如果产品已发布(返回错误),忽略该错误继续。 + +### Step 9: Summary + +处理完所有 skill 后,输出汇总表格: + +| Skill | 状态 | 分类 | Tags | 备注 | +|---|---|---|---|---| +| pdf | 新建并发布 | 文档处理 | pdf-parsing, ... | | +| vite | 已存在,更新包 | 开发工具 | web-development, ... 
| | + +## Error Handling + +- **Token 获取失败**:检查 URL 和凭据,提示用户检查 `~/.env` 配置 +- **Portal 不存在**:警告用户需先在 HiMarket 后台创建门户 +- **上传超时**:自动重试,最终失败记录到汇总 +- **分类创建失败**:跳过分类关联,继续其他步骤 +- **产品创建失败**:记录错误,继续处理下一个 skill + +## Important Notes + +- 执行前始终先列出将要处理的 skill 清单,等用户确认后再开始 +- 每处理完一个 skill 立即输出进度 +- 使用 TodoWrite 跟踪每个 skill 的处理状态 +- description 截断到 256 字符以符合 API 限制 diff --git a/.qoder/skills/springboot-verification/SKILL.md b/.qoder/skills/springboot-verification/SKILL.md new file mode 100644 index 000000000..c8f790aac --- /dev/null +++ b/.qoder/skills/springboot-verification/SKILL.md @@ -0,0 +1,231 @@ +--- +name: springboot-verification +description: "Verification loop for Spring Boot projects: build, static analysis, tests with coverage, security scans, and diff review before release or PR." +origin: ECC +--- + +# Spring Boot Verification Loop + +Run before PRs, after major changes, and pre-deploy. + +## When to Activate + +- Before opening a pull request for a Spring Boot service +- After major refactoring or dependency upgrades +- Pre-deployment verification for staging or production +- Running full build → lint → test → security scan pipeline +- Validating test coverage meets thresholds + +## Phase 1: Build + +```bash +mvn -T 4 clean verify -DskipTests +# or +./gradlew clean assemble -x test +``` + +If build fails, stop and fix. 
+ +## Phase 2: Static Analysis + +Maven (common plugins): +```bash +mvn -T 4 spotbugs:check pmd:check checkstyle:check +``` + +Gradle (if configured): +```bash +./gradlew checkstyleMain pmdMain spotbugsMain +``` + +## Phase 3: Tests + Coverage + +```bash +mvn -T 4 test +mvn jacoco:report # verify 80%+ coverage +# or +./gradlew test jacocoTestReport +``` + +Report: +- Total tests, passed/failed +- Coverage % (lines/branches) + +### Unit Tests + +Test service logic in isolation with mocked dependencies: + +```java +@ExtendWith(MockitoExtension.class) +class UserServiceTest { + + @Mock private UserRepository userRepository; + @InjectMocks private UserService userService; + + @Test + void createUser_validInput_returnsUser() { + var dto = new CreateUserDto("Alice", "alice@example.com"); + var expected = new User(1L, "Alice", "alice@example.com"); + when(userRepository.save(any(User.class))).thenReturn(expected); + + var result = userService.create(dto); + + assertThat(result.name()).isEqualTo("Alice"); + verify(userRepository).save(any(User.class)); + } + + @Test + void createUser_duplicateEmail_throwsException() { + var dto = new CreateUserDto("Alice", "existing@example.com"); + when(userRepository.existsByEmail(dto.email())).thenReturn(true); + + assertThatThrownBy(() -> userService.create(dto)) + .isInstanceOf(DuplicateEmailException.class); + } +} +``` + +### Integration Tests with Testcontainers + +Test against a real database instead of H2: + +```java +@SpringBootTest +@Testcontainers +class UserRepositoryIntegrationTest { + + @Container + static PostgreSQLContainer postgres = new PostgreSQLContainer<>("postgres:16-alpine") + .withDatabaseName("testdb"); + + @DynamicPropertySource + static void configureProperties(DynamicPropertyRegistry registry) { + registry.add("spring.datasource.url", postgres::getJdbcUrl); + registry.add("spring.datasource.username", postgres::getUsername); + registry.add("spring.datasource.password", postgres::getPassword); + } + + 
@Autowired private UserRepository userRepository; + + @Test + void findByEmail_existingUser_returnsUser() { + userRepository.save(new User("Alice", "alice@example.com")); + + var found = userRepository.findByEmail("alice@example.com"); + + assertThat(found).isPresent(); + assertThat(found.get().getName()).isEqualTo("Alice"); + } +} +``` + +### API Tests with MockMvc + +Test controller layer with full Spring context: + +```java +@WebMvcTest(UserController.class) +class UserControllerTest { + + @Autowired private MockMvc mockMvc; + @MockBean private UserService userService; + + @Test + void createUser_validInput_returns201() throws Exception { + var user = new UserDto(1L, "Alice", "alice@example.com"); + when(userService.create(any())).thenReturn(user); + + mockMvc.perform(post("/api/users") + .contentType(MediaType.APPLICATION_JSON) + .content(""" + {"name": "Alice", "email": "alice@example.com"} + """)) + .andExpect(status().isCreated()) + .andExpect(jsonPath("$.name").value("Alice")); + } + + @Test + void createUser_invalidEmail_returns400() throws Exception { + mockMvc.perform(post("/api/users") + .contentType(MediaType.APPLICATION_JSON) + .content(""" + {"name": "Alice", "email": "not-an-email"} + """)) + .andExpect(status().isBadRequest()); + } +} +``` + +## Phase 4: Security Scan + +```bash +# Dependency CVEs +mvn org.owasp:dependency-check-maven:check +# or +./gradlew dependencyCheckAnalyze + +# Secrets in source +grep -rn "password\s*=\s*\"" src/ --include="*.java" --include="*.yml" --include="*.properties" +grep -rn "sk-\|api_key\|secret" src/ --include="*.java" --include="*.yml" + +# Secrets (git history) +git secrets --scan # if configured +``` + +### Common Security Findings + +``` +# Check for System.out.println (use logger instead) +grep -rn "System\.out\.print" src/main/ --include="*.java" + +# Check for raw exception messages in responses +grep -rn "e\.getMessage()" src/main/ --include="*.java" + +# Check for wildcard CORS +grep -rn 
"allowedOrigins.*\*" src/main/ --include="*.java" +``` + +## Phase 5: Lint/Format (optional gate) + +```bash +mvn spotless:apply # if using Spotless plugin +./gradlew spotlessApply +``` + +## Phase 6: Diff Review + +```bash +git diff --stat +git diff +``` + +Checklist: +- No debugging logs left (`System.out`, `log.debug` without guards) +- Meaningful errors and HTTP statuses +- Transactions and validation present where needed +- Config changes documented + +## Output Template + +``` +VERIFICATION REPORT +=================== +Build: [PASS/FAIL] +Static: [PASS/FAIL] (spotbugs/pmd/checkstyle) +Tests: [PASS/FAIL] (X/Y passed, Z% coverage) +Security: [PASS/FAIL] (CVE findings: N) +Diff: [X files changed] + +Overall: [READY / NOT READY] + +Issues to Fix: +1. ... +2. ... +``` + +## Continuous Mode + +- Re-run phases on significant changes or every 30–60 minutes in long sessions +- Keep a short loop: `mvn -T 4 test` + spotbugs for quick feedback + +**Remember**: Fast feedback beats late surprises. Keep the gate strict—treat warnings as defects in production systems. diff --git a/.qoder/skills/tmux/SKILL.md b/.qoder/skills/tmux/SKILL.md new file mode 100644 index 000000000..fa589c0e2 --- /dev/null +++ b/.qoder/skills/tmux/SKILL.md @@ -0,0 +1,153 @@ +--- +name: tmux +description: Remote-control tmux sessions for interactive CLIs by sending keystrokes and scraping pane output. +metadata: + { "openclaw": { "emoji": "🧵", "os": ["darwin", "linux"], "requires": { "bins": ["tmux"] } } } +--- + +# tmux Session Control + +Control tmux sessions by sending keystrokes and reading output. Essential for managing Claude Code sessions. 
+ +## When to Use + +✅ **USE this skill when:** + +- Monitoring Claude/Codex sessions in tmux +- Sending input to interactive terminal applications +- Scraping output from long-running processes in tmux +- Navigating tmux panes/windows programmatically +- Checking on background work in existing sessions + +## When NOT to Use + +❌ **DON'T use this skill when:** + +- Running one-off shell commands → use `exec` tool directly +- Starting new background processes → use `exec` with `background:true` +- Non-interactive scripts → use `exec` tool +- The process isn't in tmux +- You need to create a new tmux session → use `exec` with `tmux new-session` + +## Example Sessions + +| Session | Purpose | +| ----------------------- | --------------------------- | +| `shared` | Primary interactive session | +| `worker-2` - `worker-8` | Parallel worker sessions | + +## Common Commands + +### List Sessions + +```bash +tmux list-sessions +tmux ls +``` + +### Capture Output + +```bash +# Last 20 lines of pane +tmux capture-pane -t shared -p | tail -20 + +# Entire scrollback +tmux capture-pane -t shared -p -S - + +# Specific pane in window +tmux capture-pane -t shared:0.0 -p +``` + +### Send Keys + +```bash +# Send text (doesn't press Enter) +tmux send-keys -t shared "hello" + +# Send text + Enter +tmux send-keys -t shared "y" Enter + +# Send special keys +tmux send-keys -t shared Enter +tmux send-keys -t shared Escape +tmux send-keys -t shared C-c # Ctrl+C +tmux send-keys -t shared C-d # Ctrl+D (EOF) +tmux send-keys -t shared C-z # Ctrl+Z (suspend) +``` + +### Window/Pane Navigation + +```bash +# Select window +tmux select-window -t shared:0 + +# Select pane +tmux select-pane -t shared:0.1 + +# List windows +tmux list-windows -t shared +``` + +### Session Management + +```bash +# Create new session +tmux new-session -d -s newsession + +# Kill session +tmux kill-session -t sessionname + +# Rename session +tmux rename-session -t old new +``` + +## Sending Input Safely + +For interactive 
TUIs (Claude Code, Codex, etc.), split text and Enter into separate sends to avoid paste/multiline edge cases: + +```bash +tmux send-keys -t shared -l -- "Please apply the patch in src/foo.ts" +sleep 0.1 +tmux send-keys -t shared Enter +``` + +## Claude Code Session Patterns + +### Check if Session Needs Input + +```bash +# Look for prompts +tmux capture-pane -t worker-3 -p | tail -10 | grep -E "❯|Yes.*No|proceed|permission" +``` + +### Approve Claude Code Prompt + +```bash +# Send 'y' and Enter +tmux send-keys -t worker-3 'y' Enter + +# Or select numbered option +tmux send-keys -t worker-3 '2' Enter +``` + +### Check All Sessions Status + +```bash +for s in shared worker-2 worker-3 worker-4 worker-5 worker-6 worker-7 worker-8; do + echo "=== $s ===" + tmux capture-pane -t $s -p 2>/dev/null | tail -5 +done +``` + +### Send Task to Session + +```bash +tmux send-keys -t worker-4 "Fix the bug in auth.js" Enter +``` + +## Notes + +- Use `capture-pane -p` to print to stdout (essential for scripting) +- `-S -` captures entire scrollback history +- Target format: `session:window.pane` (e.g., `shared:0.0`) +- Sessions persist across SSH disconnects diff --git a/.qoder/skills/tmux/scripts/find-sessions.sh b/.qoder/skills/tmux/scripts/find-sessions.sh new file mode 100755 index 000000000..8387c1629 --- /dev/null +++ b/.qoder/skills/tmux/scripts/find-sessions.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: find-sessions.sh [-L socket-name|-S socket-path|-A] [-q pattern] + +List tmux sessions on a socket (default tmux socket if none provided). 
+ +Options: + -L, --socket tmux socket name (passed to tmux -L) + -S, --socket-path tmux socket path (passed to tmux -S) + -A, --all scan all sockets under OPENCLAW_TMUX_SOCKET_DIR + -q, --query case-insensitive substring to filter session names + -h, --help show this help +USAGE +} + +socket_name="" +socket_path="" +query="" +scan_all=false +socket_dir="${OPENCLAW_TMUX_SOCKET_DIR:-${CLAWDBOT_TMUX_SOCKET_DIR:-${TMPDIR:-/tmp}/openclaw-tmux-sockets}}" + +while [[ $# -gt 0 ]]; do + case "$1" in + -L|--socket) socket_name="${2-}"; shift 2 ;; + -S|--socket-path) socket_path="${2-}"; shift 2 ;; + -A|--all) scan_all=true; shift ;; + -q|--query) query="${2-}"; shift 2 ;; + -h|--help) usage; exit 0 ;; + *) echo "Unknown option: $1" >&2; usage; exit 1 ;; + esac +done + +if [[ "$scan_all" == true && ( -n "$socket_name" || -n "$socket_path" ) ]]; then + echo "Cannot combine --all with -L or -S" >&2 + exit 1 +fi + +if [[ -n "$socket_name" && -n "$socket_path" ]]; then + echo "Use either -L or -S, not both" >&2 + exit 1 +fi + +if ! command -v tmux >/dev/null 2>&1; then + echo "tmux not found in PATH" >&2 + exit 1 +fi + +list_sessions() { + local label="$1"; shift + local tmux_cmd=(tmux "$@") + + if ! sessions="$("${tmux_cmd[@]}" list-sessions -F '#{session_name}\t#{session_attached}\t#{session_created_string}' 2>/dev/null)"; then + echo "No tmux server found on $label" >&2 + return 1 + fi + + if [[ -n "$query" ]]; then + sessions="$(printf '%s\n' "$sessions" | grep -i -- "$query" || true)" + fi + + if [[ -z "$sessions" ]]; then + echo "No sessions found on $label" + return 0 + fi + + echo "Sessions on $label:" + printf '%s\n' "$sessions" | while IFS=$'\t' read -r name attached created; do + attached_label=$([[ "$attached" == "1" ]] && echo "attached" || echo "detached") + printf ' - %s (%s, started %s)\n' "$name" "$attached_label" "$created" + done +} + +if [[ "$scan_all" == true ]]; then + if [[ ! 
-d "$socket_dir" ]]; then + echo "Socket directory not found: $socket_dir" >&2 + exit 1 + fi + + shopt -s nullglob + sockets=("$socket_dir"/*) + shopt -u nullglob + + if [[ "${#sockets[@]}" -eq 0 ]]; then + echo "No sockets found under $socket_dir" >&2 + exit 1 + fi + + exit_code=0 + for sock in "${sockets[@]}"; do + if [[ ! -S "$sock" ]]; then + continue + fi + list_sessions "socket path '$sock'" -S "$sock" || exit_code=$? + done + exit "$exit_code" +fi + +tmux_cmd=(tmux) +socket_label="default socket" + +if [[ -n "$socket_name" ]]; then + tmux_cmd+=(-L "$socket_name") + socket_label="socket name '$socket_name'" +elif [[ -n "$socket_path" ]]; then + tmux_cmd+=(-S "$socket_path") + socket_label="socket path '$socket_path'" +fi + +list_sessions "$socket_label" "${tmux_cmd[@]:1}" diff --git a/.qoder/skills/tmux/scripts/wait-for-text.sh b/.qoder/skills/tmux/scripts/wait-for-text.sh new file mode 100755 index 000000000..56354be83 --- /dev/null +++ b/.qoder/skills/tmux/scripts/wait-for-text.sh @@ -0,0 +1,83 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: wait-for-text.sh -t target -p pattern [options] + +Poll a tmux pane for text and exit when found. 
+ +Options: + -t, --target tmux target (session:window.pane), required + -p, --pattern regex pattern to look for, required + -F, --fixed treat pattern as a fixed string (grep -F) + -T, --timeout seconds to wait (integer, default: 15) + -i, --interval poll interval in seconds (default: 0.5) + -l, --lines number of history lines to inspect (integer, default: 1000) + -h, --help show this help +USAGE +} + +target="" +pattern="" +grep_flag="-E" +timeout=15 +interval=0.5 +lines=1000 + +while [[ $# -gt 0 ]]; do + case "$1" in + -t|--target) target="${2-}"; shift 2 ;; + -p|--pattern) pattern="${2-}"; shift 2 ;; + -F|--fixed) grep_flag="-F"; shift ;; + -T|--timeout) timeout="${2-}"; shift 2 ;; + -i|--interval) interval="${2-}"; shift 2 ;; + -l|--lines) lines="${2-}"; shift 2 ;; + -h|--help) usage; exit 0 ;; + *) echo "Unknown option: $1" >&2; usage; exit 1 ;; + esac +done + +if [[ -z "$target" || -z "$pattern" ]]; then + echo "target and pattern are required" >&2 + usage + exit 1 +fi + +if ! [[ "$timeout" =~ ^[0-9]+$ ]]; then + echo "timeout must be an integer number of seconds" >&2 + exit 1 +fi + +if ! [[ "$lines" =~ ^[0-9]+$ ]]; then + echo "lines must be an integer" >&2 + exit 1 +fi + +if ! 
command -v tmux >/dev/null 2>&1; then + echo "tmux not found in PATH" >&2 + exit 1 +fi + +# End time in epoch seconds (integer, good enough for polling) +start_epoch=$(date +%s) +deadline=$((start_epoch + timeout)) + +while true; do + # -J joins wrapped lines, -S uses negative index to read last N lines + pane_text="$(tmux capture-pane -p -J -t "$target" -S "-${lines}" 2>/dev/null || true)" + + if printf '%s\n' "$pane_text" | grep $grep_flag -- "$pattern" >/dev/null 2>&1; then + exit 0 + fi + + now=$(date +%s) + if (( now >= deadline )); then + echo "Timed out after ${timeout}s waiting for pattern: $pattern" >&2 + echo "Last ${lines} lines from $target:" >&2 + printf '%s\n' "$pane_text" >&2 + exit 1 + fi + + sleep "$interval" +done diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..6dbfd409a --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,202 @@ +**ALWAYS RESPOND IN CHINESE-SIMPLIFIED** + +## 本地开发环境 + +### 数据库访问 + +本地开发时,数据库连接信息可以通过以下任意方式提供(优先级从高到低): +- shell 环境变量(直接 export 或写入 `~/.zshrc` / `~/.bashrc`) +- `~/.env` 文件(`scripts/run.sh` 启动时会自动 source) + +需要包含以下变量: +- `DB_HOST`:数据库地址 +- `DB_PORT`:端口(默认 3306) +- `DB_NAME`:数据库名 +- `DB_USERNAME`:用户名 +- `DB_PASSWORD`:密码 + +查询数据库时,使用 mysql CLI(环境变量已在 shell 中或通过 `~/.env` 加载): + +```bash +mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USERNAME" -p"$DB_PASSWORD" "$DB_NAME" -e "YOUR_SQL_HERE" +``` + +注意事项: +- 只执行 SELECT 查询,除非用户明确要求修改数据 +- 不要在回复中展示完整的密码、密钥等敏感字段 +- 数据库 schema 由 Flyway 管理,迁移文件在 `himarket-bootstrap/src/main/resources/db/migration/` + +### 启动后端服务 + +使用 `scripts/run.sh` 脚本编译并启动 Java 后端: + +```bash +./scripts/run.sh +``` + +脚本会自动完成:加载环境变量 → 优雅关闭旧进程 → 编译打包 → 后台启动 jar → 轮询等待就绪。 +脚本退出码为 0 表示启动成功,非 0 表示失败(编译错误或启动超时)。 + +### 修改代码后的验证 + +以下场景建议主动进行"重启 → 接口验证"闭环,而不是只改代码就结束: +- 用户明确要求调试某个 bug 或修复接口问题 +- 新增或修改了 REST/WebSocket 接口 +- 用户要求端到端验证 +- 完成 spec 任务的代码开发后,进行端到端功能验证 + +#### 验证流程 + +1. `./scripts/run.sh` 重启,确认退出码为 0 +2. 用 curl 调用相关接口,检查返回结果 +3. 如果涉及数据变更,用 mysql CLI 查询确认 +4. 
验证失败时读取 `~/himarket.log` 排查,修复后重试 + +### API 接口测试 + +后端运行在 `http://localhost:8080`,接口路径不带 `/portal` 前缀。使用 JWT Bearer Token 认证。 + +接口返回格式为 `{"code":"SUCCESS","data":{...}}`,token 在 `data.access_token` 中。 + +#### 获取管理员 Token(后台管理) + +```bash +TOKEN=$(curl -s -X POST http://localhost:8080/admins/login \ + -H "Content-Type: application/json" \ + -d '{"username":"admin","password":"admin"}' | jq -r '.data.access_token') +``` + +#### 获取开发者 Token(前台门户) + +```bash +TOKEN=$(curl -s -X POST http://localhost:8080/developers/login \ + -H "Content-Type: application/json" \ + -d '{"username":"user","password":"123456"}' | jq -r '.data.access_token') +``` + +#### 带认证请求示例 + +```bash +curl -s -H "Authorization: Bearer $TOKEN" http://localhost:8080/your-endpoint | jq . +``` + +#### WebSocket 接口验证 + +对于 WebSocket 接口,使用 `websocat` 工具: + +```bash +websocat -H "Authorization: Bearer $TOKEN" ws://localhost:8080/your-ws-endpoint +``` + +#### 认证注解说明 + +接口上的注解决定了需要哪种角色的 token: +- `@AdminAuth`:需要管理员 token +- `@DeveloperAuth`:需要开发者 token +- `@AdminOrDeveloperAuth`:两种都可以 +- 无注解:无需认证 + +Token 有效期为 7 天。Swagger 文档:`http://localhost:8080/portal/swagger-ui.html` + +### 应用日志 + +本地运行时日志文件位于 `~/himarket.log`。排查后端问题时应主动读取该日志。 + +## OpenSandbox 集成 + +HiMarket 集成了阿里巴巴开源的 OpenSandbox 项目,用于提供安全的代码执行沙箱环境。 + +### 项目位置 + +OpenSandbox 仓库位于 `OpenSandbox/` 目录(本地 clone,不提交到 git)。 + +**首次设置:** +```bash +cd /Users/xujingfeng/IdeaProjects/himarket +git clone https://github.com/alibaba/OpenSandbox.git +``` + +该目录已在 `.gitignore` 中配置,不会被提交到版本控制,但 AI Agent 可以正常访问和探索其中的源码和文档。 + +### 渐进性探索指南 + +当需要对接或调试 OpenSandbox 相关功能时,按以下顺序探索: + +1. **快速了解**:阅读 `OpenSandbox/README.md` 了解项目概述、核心功能和基本用法 +2. **开发指导**: + - `OpenSandbox/CLAUDE.md` - Claude Code 的开发指导(中文) + - `OpenSandbox/AGENTS.md` - AI Agent 的仓库指南 +3. **架构文档**:`OpenSandbox/docs/architecture.md` - 整体架构和设计理念 +4. 
**关键目录**: + - `OpenSandbox/server/` - Python FastAPI 沙箱生命周期管理服务 + - `OpenSandbox/sdks/` - 多语言 SDK(Python、Java/Kotlin、TypeScript、C#) + - `OpenSandbox/components/execd/` - Go 执行守护进程 + - `OpenSandbox/examples/` - 集成示例(包括 claude-code、kimi-cli 等) + - `OpenSandbox/specs/` - OpenAPI 规范文档 + - `OpenSandbox/kubernetes/` - Kubernetes 部署和 Operator + +### 何时探索 OpenSandbox + +仅在以下场景需要深入探索 OpenSandbox 源码和文档: +- 实现或调试沙箱创建、生命周期管理功能 +- 集成代码执行、命令执行、文件操作等沙箱能力 +- 排查沙箱相关的错误或性能问题 +- 扩展或定制沙箱运行时行为 +- 对接 OpenSandbox 的 API 或 SDK + +对于其他 HiMarket 功能开发,无需关注 OpenSandbox 目录。 + +## Nacos 集成 + +HiMarket 使用阿里巴巴开源的 Nacos 作为服务发现和配置管理基础设施。本地通过符号链接引入了 Nacos 源码仓库,方便 AI Agent 理解 Nacos 内部实现。 + +### 项目位置 + +Nacos 源码位于 `nacos/` 目录(本地符号链接,指向 `/Users/xujingfeng/AIProjects/nacos`,不提交到 git)。 + +**首次设置:** +```bash +cd /Users/xujingfeng/IdeaProjects/himarket +ln -s /Users/xujingfeng/AIProjects/nacos nacos +``` + +该目录已在 `.gitignore` 中配置,不会被提交到版本控制,但 AI Agent 可以正常访问和探索其中的源码和文档。 + +### 渐进性探索指南 + +当需要对接或调试 Nacos 相关功能时,按以下顺序探索: + +1. **快速了解**:阅读 `nacos/README.md` 了解项目概述(动态服务发现、配置管理、DNS 服务) +2. **架构文档**:`nacos/doc/` 目录下的设计文档 +3. **关键模块**: + - `nacos/api/` - Nacos 公共 API 定义(SPI 接口、模型类) + - `nacos/client/` - Java 客户端 SDK(服务注册/发现、配置监听) + - `nacos/naming/` - 服务注册与发现核心实现 + - `nacos/config/` - 配置管理核心实现 + - `nacos/server/` - Nacos Server 启动入口 + - `nacos/console/` - 管理控制台后端 + - `nacos/console-ui/` - 管理控制台前端 + - `nacos/core/` - 核心通用模块(集群、鉴权、分布式协议) + - `nacos/consistency/` - 一致性协议(Raft/Distro) + - `nacos/auth/` - 认证鉴权模块 + - `nacos/plugin/` - 插件体系(鉴权、配置加密、数据源等) + - `nacos/persistence/` - 持久化层 + - `nacos/distribution/` - 打包和发布配置 +4. 
**高级主题**(特定场景): + - `nacos/mcp-registry-adaptor/` - MCP 注册适配器 + - `nacos/istio/` - Istio 集成(MCP/xDS 协议) + - `nacos/k8s-sync/` - Kubernetes 服务同步 + - `nacos/ai/` - AI 相关能力 + - `nacos/skills/` - Skill 市场能力 + +### 何时探索 Nacos + +仅在以下场景需要深入探索 Nacos 源码和文档: +- 实现或调试 HiMarket 与 Nacos 的服务注册/发现集成 +- 对接 Nacos 配置管理能力(动态配置推送、监听) +- 排查 Nacos 客户端连接、心跳、同步等问题 +- 理解 Nacos 的一致性协议(Raft/Distro)实现细节 +- 扩展 Nacos 插件(鉴权、数据源、配置加密等) +- 对接 Nacos 的 Open API 或使用其 Java SDK + +对于其他 HiMarket 功能开发,无需关注 Nacos 目录。 diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..652cc511e --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,366 @@ +# CLAUDE.md + +本文件为 Claude Code (claude.ai/code) 在处理本仓库代码时提供指导。 + +**ALWAYS RESPOND IN CHINESE-SIMPLIFIED** + +## 项目概述 + +HiMarket 是基于 Higress AI 网关构建的企业级 AI 开放平台,帮助企业构建私有 AI 能力市场,管理和分发 AI 资源(LLM、MCP Server、Agent、Agent Skill)。 + +**仓库结构:** +- `himarket-bootstrap/` - Spring Boot 应用入口和配置 +- `himarket-server/` - 业务逻辑、REST 控制器、服务 +- `himarket-dal/` - 数据访问层(JPA 实体、仓储) +- `himarket-web/himarket-admin/` - 管理门户前端(React + Vite) +- `himarket-web/himarket-frontend/` - 开发者门户前端(React + TypeScript) +- `deploy/` - Docker Compose 和 Helm 部署配置 + +## 开发命令 + +### 后端(Java 17 + Maven) + +```bash +# 构建所有模块,跳过测试 +mvn clean package -DskipTests + +# 运行测试(默认排除集成测试) +mvn test + +# 运行所有测试,包括集成测试 +mvn test -Pintegration + +# 检查代码格式(Spotless + Google Java Format) +mvn spotless:check + +# 应用代码格式化 +mvn spotless:apply + +# 本地启动后端(加载 ~/.env,杀掉 8080 端口旧进程,编译,启动 Spring Boot) +./scripts/run.sh +``` + +**后端运行地址:** `http://localhost:8080` +**Swagger UI:** `http://localhost:8080/portal/swagger-ui.html` +**日志文件:** `~/himarket.log` + +### 前端 - 管理门户(`himarket-web/himarket-admin/`) + +```bash +npm run dev # 开发服务器(端口 5174) +npm run build # 生产环境构建 +npm run lint # ESLint 检查 +npm run serve # 构建并启动生产服务器 +``` + +### 前端 - 开发者门户(`himarket-web/himarket-frontend/`) + +```bash +npm run dev # 开发服务器(端口 5173) +npm run build # 类型检查和生产环境构建 +npm run lint # ESLint 检查 +npm run type-check # 仅进行 TypeScript 类型检查 +npm run test # 
运行 Vitest 测试 +npm run preview # 预览生产环境构建 +``` + +## 本地开发环境设置 + +### 数据库配置 + +数据库连接可通过以下方式配置(优先级从高到低): +1. Shell 环境变量(直接 export 或写入 `~/.zshrc` / `~/.bashrc`) +2. `~/.env` 文件(`scripts/run.sh` 自动加载) + +必需的环境变量: +```bash +DB_HOST=your_db_host +DB_PORT=3306 +DB_NAME=himarket +DB_USERNAME=your_username +DB_PASSWORD=your_password +``` + +**查询数据库:** +```bash +mysql -h "$DB_HOST" -P "$DB_PORT" -u "$DB_USERNAME" -p"$DB_PASSWORD" "$DB_NAME" -e "YOUR_SQL_HERE" +``` + +注意事项: +- 只执行 SELECT 查询,除非用户明确要求修改数据 +- 不要在回复中展示完整的密码、密钥等敏感字段 +- 数据库 schema 由 Flyway 管理,迁移文件在 `himarket-bootstrap/src/main/resources/db/migration/` + +### API 认证 + +所有 API 端点都需要 JWT Bearer Token 认证(登录/注册端点除外)。 + +**接口返回格式:** +```json +{"code":"SUCCESS","data":{...}} +``` +Token 在 `data.access_token` 中,有效期为 7 天。 + +**获取管理员 Token:** +```bash +# 基础方式 +curl -X POST http://localhost:8080/admins/login \ + -H "Content-Type: application/json" \ + -d '{"username":"admin","password":"admin"}' + +# 自动提取 token(推荐) +TOKEN=$(curl -s -X POST http://localhost:8080/admins/login \ + -H "Content-Type: application/json" \ + -d '{"username":"admin","password":"admin"}' | jq -r '.data.access_token') +``` + +**获取开发者 Token:** +```bash +# 基础方式 +curl -X POST http://localhost:8080/developers/login \ + -H "Content-Type: application/json" \ + -d '{"username":"user","password":"123456"}' + +# 自动提取 token(推荐) +TOKEN=$(curl -s -X POST http://localhost:8080/developers/login \ + -H "Content-Type: application/json" \ + -d '{"username":"user","password":"123456"}' | jq -r '.data.access_token') +``` + +**使用 Token:** +```bash +curl -s -H "Authorization: Bearer $TOKEN" http://localhost:8080/your-endpoint | jq . 
+``` + +**WebSocket 接口验证:** +```bash +websocat -H "Authorization: Bearer $TOKEN" ws://localhost:8080/your-ws-endpoint +``` + +### 认证注解 + +控制器方法使用注解来强制认证: +- `@AdminAuth` - 需要管理员 token +- `@DeveloperAuth` - 需要开发者 token +- `@AdminOrDeveloperAuth` - 接受任意一种 token +- 无注解 - 公开端点 + +### 启动后端服务 + +使用 `scripts/run.sh` 脚本编译并启动 Java 后端: + +```bash +./scripts/run.sh +``` + +脚本会自动完成:加载环境变量 → 优雅关闭旧进程 → 编译打包 → 后台启动 jar → 轮询等待就绪。 +脚本退出码为 0 表示启动成功,非 0 表示失败(编译错误或启动超时)。 + +### 应用日志 + +本地运行时日志文件位于 `~/himarket.log`。排查后端问题时应主动读取该日志。 + +## 高层架构 + +### 模块依赖 + +``` +himarket-bootstrap (入口点) + ├── himarket-server (业务逻辑) + │ └── himarket-dal (数据访问) + └── himarket-dal +``` + +### 核心领域实体 + +**产品管理:** +- `Product` - 代表 API 产品的核心实体(REST_API、MCP_SERVER、AGENT_API、MODEL_API) +- `ProductPublication` - 产品的已发布版本及路由配置 +- `ProductSubscription` - 开发者对产品的订阅 +- `ProductCategory` / `ProductCategoryRelation` - 产品分类 + +**身份与访问:** +- `Administrator` - 门户管理的管理员用户 +- `Developer` - 订阅 API 的开发者用户 +- `DeveloperExternalIdentity` - OIDC/OAuth 关联身份 +- `Consumer` / `ConsumerCredential` - 应用及其 API 凭证 + +**基础设施:** +- `Portal` / `PortalDomain` - 门户配置和自定义域名 +- `Gateway` - 网关配置(Higress、APIG 等) +- `NacosInstance` - Nacos 服务发现配置 +- `K8sCluster` - Kubernetes 集群配置 + +**对话式 AI:** +- `ChatSession` / `Chat` / `ChatAttachment` - AI 聊天会话和消息 + +### 关键服务模式 + +**网关集成(`service/gateway/`):** +- `AIGWOperator` - AI 网关操作的抽象接口 +- `client/GatewayClient` - 网关客户端实现 +- 不同网关实现:Higress、阿里云 APIG、MSE + +**事件驱动清理:** +- `DeveloperDeletingEvent` / `PortalDeletingEvent` / `ProductDeletingEvent` +- 事件监听器处理级联删除(凭证、订阅等) + +### 产品类型 + +产品有一个 `type` 字段,类型特定的配置以 JSON 格式存储: + +1. **REST_API** - 传统 REST API + - 配置:`apiConfig.spec`(OpenAPI/Swagger 规范) + +2. **MCP_SERVER** - Model Context Protocol 服务器 + - 配置:`mcpConfig`,包含服务器名称、域名、工具 + +3. **AGENT_API** - AI 智能体 API + - 配置:`agentConfig.agentAPIConfig`,包含路由和协议 + +4. 
**MODEL_API** - AI 模型 API + - 配置:`modelConfig.modelAPIConfig`,包含模型类别和路由 + +### 数据库迁移 + +Flyway 管理 `himarket-bootstrap/src/main/resources/db/migration/` 中的模式迁移: +- `V1__init.sql` - 初始模式 +- `V2__*.sql` - 后续迁移 + +## 代码风格 + +**Java:** +- Google Java Format(AOSP 风格)通过 Spotless +- 自动导入排序和删除未使用的导入 +- 在 `mvn compile` 时自动运行 + +**前端:** +- ESLint 配合 React hooks 和 refresh 插件 +- TypeScript 严格模式 +- Prettier 格式化 + +## OpenSandbox 集成 + +HiMarket 集成了阿里巴巴开源的 OpenSandbox 项目,为 AI 应用提供安全的沙箱执行环境。 + +### 项目位置 + +OpenSandbox 仓库位于 `OpenSandbox/` 目录(本地 clone,不提交到 git)。 + +**首次设置:** +```bash +cd /Users/xujingfeng/IdeaProjects/himarket +git clone https://github.com/alibaba/OpenSandbox.git +``` + +该目录已在 `.gitignore` 中配置,不会被提交到版本控制,但 Claude Code 可以正常访问和探索其中的源码和文档。 + +### 渐进性探索指南 + +**仅在需要对接或调试 OpenSandbox 功能时才探索此目录。** 按以下顺序渐进式学习: + +#### 第一层:快速了解 +- `OpenSandbox/README.md` - 项目概述、核心功能、快速开始 + +#### 第二层:开发指导 +- `OpenSandbox/CLAUDE.md` - Claude Code 的详细开发指导(中文) +- `OpenSandbox/AGENTS.md` - AI Agent 的仓库指南 +- `OpenSandbox/docs/architecture.md` - 整体架构和设计理念 + +#### 第三层:核心组件(按需探索) +- **Server**:`OpenSandbox/server/` - Python FastAPI 沙箱生命周期管理服务 + - 配置:`~/.sandbox.toml`(从 `server/example.config.toml` 复制) + - 启动:`opensandbox-server` 或 `cd server && uv run python -m src.main` +- **SDKs**:`OpenSandbox/sdks/` - 多语言客户端库 + - `sdks/sandbox/` - 基础沙箱 SDK(生命周期、命令、文件) + - `sdks/code-interpreter/` - 代码解释器 SDK + - 支持语言:Python、Java/Kotlin、JavaScript/TypeScript、C#/.NET +- **execd**:`OpenSandbox/components/execd/` - Go 执行守护进程 + - 注入到沙箱容器中,提供代码执行、命令和文件操作 +- **Examples**:`OpenSandbox/examples/` - 集成示例 + - `examples/claude-code/` - Claude Code 集成示例 + - `examples/code-interpreter/` - 代码解释器示例 + - `examples/kimi-cli/`、`examples/gemini-cli/` 等 - 其他 AI CLI 集成 + +#### 第四层:高级主题(特定场景) +- **Kubernetes**:`OpenSandbox/kubernetes/` - K8s 部署和自定义 Operator +- **Specs**:`OpenSandbox/specs/` - OpenAPI 规范(沙箱生命周期 API、执行 API) +- **Components**:`OpenSandbox/components/` - Ingress 网关、Egress 控制 +- **OSEPs**:`OpenSandbox/oseps/` - 架构提案和设计文档 + 
+### 何时探索 OpenSandbox + +仅在以下场景需要深入探索: +- 实现沙箱创建、启动、停止等生命周期管理 +- 集成代码执行、命令执行、文件操作等沙箱能力 +- 调试沙箱相关的错误或性能问题 +- 扩展或定制沙箱运行时行为 +- 对接 OpenSandbox 的 REST API 或使用其 SDK + +**对于其他 HiMarket 功能开发(产品管理、用户认证、网关配置等),无需关注 OpenSandbox 目录。** + +## Nacos 集成 + +HiMarket 使用阿里巴巴开源的 Nacos 作为服务发现和配置管理基础设施。本地通过符号链接引入了 Nacos 源码仓库,方便理解 Nacos 内部实现。 + +### 项目位置 + +Nacos 源码位于 `nacos/` 目录(本地符号链接,指向 `/Users/xujingfeng/AIProjects/nacos`,不提交到 git)。 + +**首次设置:** +```bash +cd /Users/xujingfeng/IdeaProjects/himarket +ln -s /Users/xujingfeng/AIProjects/nacos nacos +``` + +该目录已在 `.gitignore` 中配置,不会被提交到版本控制,但 Claude Code 可以正常访问和探索其中的源码和文档。 + +### 渐进性探索指南 + +**仅在需要对接或调试 Nacos 功能时才探索此目录。** 按以下顺序渐进式学习: + +#### 第一层:快速了解 +- `nacos/README.md` - 项目概述、核心功能(动态服务发现、配置管理、DNS 服务) + +#### 第二层:架构与设计 +- `nacos/doc/` - 设计文档和架构说明 + +#### 第三层:核心模块(按需探索) +- **API**:`nacos/api/` - 公共 API 定义(SPI 接口、模型类、异常定义) +- **Client**:`nacos/client/` - Java 客户端 SDK + - 服务注册/发现、配置监听、长连接管理 +- **Naming**:`nacos/naming/` - 服务注册与发现核心实现 + - 服务实例管理、健康检查、路由策略 +- **Config**:`nacos/config/` - 配置管理核心实现 + - 配置发布/订阅、灰度发布、历史版本 +- **Server**:`nacos/server/` - Nacos Server 启动入口 +- **Console**:`nacos/console/` + `nacos/console-ui/` - 管理控制台(后端 + 前端) +- **Core**:`nacos/core/` - 核心通用模块(集群管理、鉴权、分布式协议) +- **Consistency**:`nacos/consistency/` - 一致性协议实现(Raft/Distro) +- **Auth**:`nacos/auth/` - 认证鉴权模块 +- **Plugin**:`nacos/plugin/` - 插件体系(鉴权、配置加密、数据源等) +- **Persistence**:`nacos/persistence/` - 持久化层 + +#### 第四层:高级主题(特定场景) +- **MCP 适配**:`nacos/mcp-registry-adaptor/` - MCP 注册适配器 +- **Istio 集成**:`nacos/istio/` - Istio MCP/xDS 协议对接 +- **K8s 同步**:`nacos/k8s-sync/` - Kubernetes 服务同步 +- **AI 能力**:`nacos/ai/` - AI 相关能力 +- **Skill 市场**:`nacos/skills/` - Skill 市场能力 +- **Distribution**:`nacos/distribution/` - 打包和发布配置 + +### 何时探索 Nacos + +仅在以下场景需要深入探索: +- 实现或调试 HiMarket 与 Nacos 的服务注册/发现集成 +- 对接 Nacos 配置管理能力(动态配置推送、监听) +- 排查 Nacos 客户端连接、心跳、同步等问题 +- 理解 Nacos 的一致性协议(Raft/Distro)实现细节 +- 扩展 Nacos 插件(鉴权、数据源、配置加密等) +- 对接 Nacos 的 Open API 或使用其 Java SDK + +**对于其他 HiMarket 
功能开发(产品管理、用户认证、网关配置等),无需关注 Nacos 目录。** + +## 其他文档 + +- 用户指南:`USER_GUIDE.md` diff --git a/README.md b/README.md index a8fbf8b2a..a06273a3f 100644 --- a/README.md +++ b/README.md @@ -25,26 +25,31 @@

-## Table of Contents
-
-- [What is HiMarket?](#what-is-himarket)
-- [System Architecture](#system-architecture)
-- [Quick Start](#quick-start)
-- [Documentation](#documentation)
-- [Community](#community)
-- [Contributors](#contributors)
-- [Star History](#star-history)
-
 ## What is HiMarket?
 
-HiMarket is an enterprise-grade AI open platform built on Higress AI Gateway, helping enterprises build private AI capability marketplace to uniformly manage and distribute AI resources such as LLM, MCP Server, and Agent. The platform encapsulates distributed AI capabilities into standardized API products, supports multi-version management and gray-scale release, provides self-service developer portal, and features comprehensive enterprise-level operation capabilities including security control, observability analysis, metering and billing, making AI resource sharing and reuse efficient and convenient.
+HiMarket is an enterprise-grade AI open platform built on Higress AI Gateway, helping enterprises build a private AI capability marketplace to uniformly manage and distribute AI resources such as LLM, MCP Server, Agent, and Agent Skill. The platform encapsulates distributed AI capabilities into standardized API products, supports multi-version management and gray-scale release, includes a built-in Skills Marketplace for developers to browse and install Agent Skills, provides HiChat AI conversation and HiCoding online programming for a self-service developer experience, and features comprehensive enterprise-level operation capabilities including security control, observability analysis, metering and billing, making AI resource sharing and reuse efficient and convenient.
- HiMarket 核心能力 -
Capabilities
+| Category | Feature | Description | +|----------|---------|-------------| +| **AI Marketplace** | Model Marketplace | Integrate various models with content safety, token rate limiting, and other protection capabilities | +| | MCP Marketplace | Integrate MCP Servers from various platforms, support converting external APIs to standard MCP Servers | +| | Agent Marketplace | Package and publish Agent applications, integrate with AgentScope and other Agent building platforms | +| | Skills Marketplace | Upload and distribute Agent Skills, developers can browse, subscribe, and install Skill packages | +| **AI Experience Center** | HiChat Conversation | Single-model conversation and multi-model comparison, MCP tool invocation testing, enhanced features like web-connected Q&A | +| | HiCoding Online Programming | Integrated secure sandbox environment, supporting Vibe Coding and human-AI collaborative development with real-time file changes and code preview | +| **Enterprise Management** | Product Management | Authentication, traffic control, call quotas, and other protection capabilities | +| | Observability | Full-chain monitoring, call tracing, heatmaps, anomaly alerts | +| | Metering & Billing | Token-based call counting with automatic cost statistics | +| | Version Management | Multi-version parallel operation, gray-scale release, quick rollback | +| **Customization** | Portal Branding | Custom domain, logo, color scheme, page layout | +| | Identity Authentication | Support third-party OIDC integration with enterprise identity systems | +| | Approval Workflow | Configurable auto/manual approval for subscription and product scenarios | +| | Product Catalog | Custom category tags with browsing, filtering, and search support | + ## System Architecture
@@ -56,8 +61,8 @@ HiMarket is an enterprise-grade AI open platform built on Higress AI Gateway, he HiMarket system architecture consists of three layers: 1. **Infrastructure**: Composed of AI Gateway, API Gateway, Higress and Nacos. HiMarket abstracts and encapsulates underlying AI resources based on these components to form standard API products for external use. -2. **AI Open Platform Admin**: Management platform for administrators to create and customize portals, manage AI resources such as MCP Server, Model, and Agent, including setting authentication policies and subscription approval workflows. The admin portal also provides observability dashboard to help administrators monitor AI resource usage and operational status in real-time. -3. **AI Open Platform Portal**: Developer-facing portal site, also known as AI Marketplace or AI Hub, providing one-stop self-service where developers can complete identity registration, credential application, product browsing and subscription, online debugging, and more. +2. **AI Open Platform Admin**: Management platform for administrators to create and customize portals, manage AI resources such as MCP Server, Model, Agent, and Agent Skill, including setting authentication policies and subscription approval workflows. The admin portal also provides observability dashboard to help administrators monitor AI resource usage and operational status in real-time. +3. **AI Open Platform Portal**: Developer-facing portal site, also known as AI Marketplace or AI Hub, providing one-stop self-service where developers can complete identity registration, credential application, product browsing and subscription, online debugging, and more. Developers can also interact with models and MCP Servers through HiChat, or perform online AI programming in secure sandboxes through HiCoding. @@ -124,29 +129,31 @@ npm run dev
-Use the `deploy.sh` script to deploy HiMarket, Higress, and Nacos with data initialization. +**Requirements:** Docker, Docker Compose + +**Script Deployment:** Use the interactive `install.sh` script to deploy the full stack (HiMarket, Higress, Nacos, MySQL) with guided configuration. ```bash -# Clone repository git clone https://github.com/higress-group/himarket.git -cd himarket/deploy/docker/scripts +cd himarket/deploy/docker +./install.sh +``` -# Deploy full stack and initialize -./deploy.sh install +**AI Deployment (Recommended):** If you're concerned about environment compatibility issues during deployment, we recommend using AI Coding tools such as Cursor, Qoder, or Claude Code, which can automatically detect and resolve environment problems. After cloning the project, simply enter in your AI tool: -# Or deploy HiMarket only (without Nacos/Higress) -./deploy.sh himarket-only +> Read the deployment docs under deploy/docker and help me deploy HiMarket with Docker Compose -# Uninstall all services -./deploy.sh uninstall +See the [Deployment Guide](./deploy/DEPLOYMENT.md) for details. -# Service URLs -# Admin portal: http://localhost:5174 -# Developer portal: http://localhost:5173 -# Backend API: http://localhost:8081 -``` +**Service URLs after deployment:** +- Admin Portal: http://localhost:5174 +- Developer Portal: http://localhost:5173 +- Backend API: http://localhost:8081 -> For detailed Docker deployment instructions, please refer to [Docker Deployment Guide](./deploy/docker/Docker部署脚本说明.md) +**Uninstall:** +```bash +./install.sh --uninstall +``` @@ -155,24 +162,26 @@ cd himarket/deploy/docker/scripts
-Use the `deploy.sh` script to deploy HiMarket to Kubernetes cluster. +**Requirements:** kubectl (connected to a K8s cluster), Helm + +**Script Deployment:** Use the interactive `install.sh` script to deploy HiMarket to a Kubernetes cluster with guided configuration. ```bash -# Clone repository git clone https://github.com/higress-group/himarket.git -cd himarket/deploy/helm/scripts +cd himarket/deploy/helm +./install.sh +``` -# Deploy full stack and initialize -./deploy.sh install +**AI Deployment (Recommended):** If you're concerned about environment compatibility issues during deployment, we recommend using AI Coding tools such as Cursor, Qoder, or Claude Code, which can automatically detect and resolve environment problems. After cloning the project, simply enter in your AI tool: -# Or deploy HiMarket only (without Nacos/Higress) -./deploy.sh himarket-only +> Read the deployment docs under the deploy directory and help me deploy HiMarket to my K8s cluster with Helm -# Uninstall -./deploy.sh uninstall -``` +See the [Deployment Guide](./deploy/DEPLOYMENT.md) for details. -> For detailed Helm deployment instructions, please refer to [Helm Deployment Guide](./deploy/helm/Helm部署脚本说明.md) +**Uninstall:** +```bash +./install.sh --uninstall +``` diff --git a/README_zh.md b/README_zh.md index 78c8b073b..0a52f454f 100644 --- a/README_zh.md +++ b/README_zh.md @@ -25,26 +25,31 @@

-## 目录 - -- [HiMarket 是什么?](#himarket-是什么) -- [系统架构](#系统架构) -- [快速开始](#快速开始) -- [文档](#文档) -- [社区](#社区) -- [贡献者](#贡献者) -- [Star History](#star-history) - ## HiMarket 是什么? -HiMarket 是基于 Higress AI 网关构建的企业级 AI 开放平台,帮助企业构建私有 AI 能力市场,统一管理和分发 LLM、MCP Server、Agent 等 AI 资源。平台将分散的 AI 能力封装为标准化的 API 产品,支持多版本管理和灰度发布,提供自助式开发者门户,并具备安全管控、观测分析、计量计费等完整的企业级运营能力,让 AI 资源的共享和复用变得高效便捷。 +HiMarket 是基于 Higress AI 网关构建的企业级 AI 开放平台,帮助企业构建私有 AI 能力市场,统一管理和分发 LLM、MCP Server、Agent、Agent Skill 等 AI 资源。平台将分散的 AI 能力封装为标准化的 API 产品,支持多版本管理和灰度发布,内置 Skills 市场供开发者浏览和安装 Agent Skill,提供 HiChat AI 对话和 HiCoding 在线编程等自助式开发者体验,并具备安全管控、观测分析、计量计费等完整的企业级运营能力,让 AI 资源的共享和复用变得高效便捷。
- HiMarket 核心能力 -
核心能力
+| 类别 | 功能 | 说明 | +|------|------|------| +| **AI 能力市场** | Model 市场 | 接入各类 Model,提供内容安全、Token 限流等防护能力 | +| | MCP 市场 | 接入各平台 MCP Server,支持外部 API 转换为标准 MCP Server | +| | Agent 市场 | 打包上架 Agent 应用,对接 AgentScope 等 Agent 构建平台 | +| | Skills 市场 | 上传和分发 Agent Skill,开发者可浏览、订阅和安装 Skill 包 | +| **AI 体验中心** | HiChat 对话调试 | 单模型对话与多模型对比,结合 MCP 进行工具调用测试,支持联网问答等增强功能 | +| | HiCoding 在线编程 | 集成安全沙箱环境,支持 Vibe Coding 和人机协作开发,实时查看文件变更与代码预览 | +| **企业级管理** | 产品管理 | 认证鉴权、流量控制、调用配额等防护能力 | +| | 观测分析 | 全链路监控、调用追踪、热力图、异常告警 | +| | 计量计费 | 基于 Token 调用次数、自动统计成本费用 | +| | 版本管理 | 多版本并行、灰度发布、快速回滚 | +| **灵活定制** | 门户品牌 | 自定义域名、Logo、配色、页面布局 | +| | 身份认证 | 支持接入第三方 OIDC,对接企业用户身份体系 | +| | 审批流程 | 按照订阅、产品订阅等场景可配置自动/人工审批 | +| | 产品目录 | 自定义类别标签,支持浏览、筛选、搜索 | + ## 系统架构
@@ -56,8 +61,8 @@ HiMarket 是基于 Higress AI 网关构建的企业级 AI 开放平台,帮助 HiMarket 系统架构分为三层: 1. **基础设施**:由 AI 网关、API 网关、Higress 和 Nacos 组成。HiMarket 基于这些组件对底层 AI 资源进行抽象封装,形成可对外开放的标准 API 产品。 -2. **AI 开放平台后台**:面向管理员的管理平台,管理员可以创建和定制门户,管理 MCP Server、Model、Agent 等 AI 资源,例如设置鉴权策略、订阅审批流程等。后台还提供可观测大盘,帮助管理员实时了解 AI 资源的使用和运行状态。 -3. **AI 开放平台前台**:面向外部开发者的门户站点,也称为 AI 市场或 AI 中台,提供一站式自助服务,开发者可以完成身份注册、凭证申请、浏览订阅产品、在线调试等操作。 +2. **AI 开放平台后台**:面向管理员的管理平台,管理员可以创建和定制门户,管理 MCP Server、Model、Agent、Agent Skill 等 AI 资源,例如设置鉴权策略、订阅审批流程等。后台还提供可观测大盘,帮助管理员实时了解 AI 资源的使用和运行状态。 +3. **AI 开放平台前台**:面向外部开发者的门户站点,也称为 AI 市场或 AI 中台,提供一站式自助服务,开发者可以完成身份注册、凭证申请、浏览订阅产品、在线调试等操作,还可以通过 HiChat 与模型和 MCP Server 交互对话,通过 HiCoding 在安全沙箱中进行在线 AI 编程。
@@ -124,29 +129,31 @@ npm run dev
-使用 `deploy.sh` 脚本完成 HiMarket、Higress、Nacos 全栈部署和数据初始化。 +**环境依赖:** Docker、Docker Compose + +**脚本部署:** 使用交互式 `install.sh` 脚本一键部署全栈服务(HiMarket、Higress、Nacos、MySQL),脚本会引导完成所有配置。 ```bash -# 克隆项目 git clone https://github.com/higress-group/himarket.git -cd himarket/deploy/docker/scripts +cd himarket/deploy/docker +./install.sh +``` -# 部署全栈服务并初始化 -./deploy.sh install +**AI 部署(推荐):** 如果担心部署过程中遇到环境兼容性等问题,推荐使用 Cursor、Qoder、Claude Code 等 AI Coding 工具进行部署,AI 可以自动识别和解决环境问题。clone 项目后在 AI 工具中输入: -# 或仅部署 HiMarket 服务(不含 Nacos/Higress) -./deploy.sh himarket-only +> 阅读 deploy 目录下的部署文档,帮我用 Docker Compose 部署 HiMarket -# 卸载所有服务 -./deploy.sh uninstall +详细文档请参考 [部署文档](./deploy/DEPLOYMENT_zh.md)。 -# 服务地址 -# 管理后台地址:http://localhost:5174 -# 开发者门户地址:http://localhost:5173 -# 后端 API 地址:http://localhost:8081 -``` +**部署完成后的服务地址:** +- 管理后台:http://localhost:5174 +- 开发者门户:http://localhost:5173 +- 后端 API:http://localhost:8081 -> 详细的 Docker 部署说明请参考 [Docker 部署文档](./deploy/docker/Docker部署脚本说明.md) +**卸载:** +```bash +./install.sh --uninstall +``` @@ -155,24 +162,26 @@ cd himarket/deploy/docker/scripts
-使用 `deploy.sh` 脚本将 HiMarket 部署到 Kubernetes 集群。 +**环境依赖:** kubectl(已连接 K8s 集群)、Helm + +**脚本部署:** 使用交互式 `install.sh` 脚本将 HiMarket 部署到 Kubernetes 集群,脚本会引导完成所有配置。 ```bash -# 克隆项目 git clone https://github.com/higress-group/himarket.git -cd himarket/deploy/helm/scripts +cd himarket/deploy/helm +./install.sh +``` -# 部署全栈服务并初始化 -./deploy.sh install +**AI 部署(推荐):** 如果担心部署过程中遇到环境兼容性等问题,推荐使用 Cursor、Qoder、Claude Code 等 AI Coding 工具进行部署,AI 可以自动识别和解决环境问题。clone 项目后在 AI 工具中输入: -# 或仅部署 HiMarket 服务(不含 Nacos/Higress) -./deploy.sh himarket-only +> 阅读 deploy 目录下的部署文档,帮我用 Helm Chart 部署 HiMarket 到 K8s 集群 -# 卸载 -./deploy.sh uninstall -``` +详细文档请参考 [部署文档](./deploy/DEPLOYMENT_zh.md)。 -> 详细的 Helm 部署说明请参考 [Helm 部署文档](./deploy/helm/Helm部署脚本说明.md) +**卸载:** +```bash +./install.sh --uninstall +``` @@ -224,4 +233,3 @@ cd himarket/deploy/helm/scripts [![Star History Chart](https://api.star-history.com/svg?repos=higress-group/himarket&type=Date)](https://star-history.com/#higress-group/himarket&Date) - diff --git a/deploy/DEPLOYMENT.md b/deploy/DEPLOYMENT.md new file mode 100644 index 000000000..9bc10b6a9 --- /dev/null +++ b/deploy/DEPLOYMENT.md @@ -0,0 +1,144 @@ +# HiMarket Deployment Guide + +## Overview + +A complete HiMarket deployment includes the following components: +- **MySQL** - Database service +- **Nacos** - Configuration center +- **Higress** - AI Gateway +- **Redis** - Cache service +- **HiMarket** - Server + Admin + Frontend + Sandbox + +Two deployment methods are available: +- **Docker Compose** - Suitable for local development and single-machine deployment +- **Helm (Kubernetes)** - Suitable for production environments and cluster deployment + +Both methods use the interactive guided script `install.sh`, which automatically completes all configuration and initialization. 
+ +## Docker Compose Deployment + +### Prerequisites + +- `docker` +- `docker compose` +- `curl` +- `jq` + +### Interactive Deployment (Recommended) + +```bash +git clone https://github.com/higress-group/himarket.git +cd himarket/deploy/docker +./install.sh +``` + +The script will guide you step by step through all configurations, including image selection, password setup, AI model configuration, etc. + +### Non-Interactive Mode (CI/CD) + +```bash +cp .env.example ~/himarket-install.env +# Edit ~/himarket-install.env to modify configurations as needed +./install.sh -n +``` + +### Uninstall + +```bash +./install.sh --uninstall +``` + +### Upgrade + +Re-run `./install.sh`, and the script will automatically detect existing deployments and provide upgrade options. + +### Service Ports + +| Service | Host Port | Description | +|---------|-----------|-------------| +| HiMarket Admin | 5174 | Admin console UI | +| HiMarket Frontend | 5173 | Developer portal UI | +| HiMarket Server | 8081 | Backend API service | +| Nacos | 8848 | Nacos console | +| Higress Console | 8001 | Higress console | +| Higress Gateway | 8082 | Gateway HTTP endpoint | +| MySQL | 3306 | Database service | +| Redis | 6379 | Redis service | + +## Helm Deployment (Kubernetes) + +### Prerequisites + +- `kubectl` (connected to a K8s cluster) +- `helm` +- `curl` +- `jq` +- `python3` + +### Interactive Deployment (Recommended) + +```bash +git clone https://github.com/higress-group/himarket.git +cd himarket/deploy/helm +./install.sh +``` + +### Non-Interactive Mode + +```bash +cp .env.example ~/himarket-install.env +# Edit configurations +./install.sh -n +``` + +### Uninstall + +```bash +./install.sh --uninstall +``` + +### Upgrade + +Re-run `./install.sh`, and the script will automatically detect existing deployments and provide upgrade options. 
+
+### Notes
+
+- Deploys to the `himarket` namespace by default
+- Admin and Frontend services use LoadBalancer type by default
+
+## Configuration
+
+- In interactive mode, the script guides you through all configuration items (images, database passwords, service credentials, default users, AI models, etc.)
+- Configurations are automatically saved to `~/himarket-install.env` and reused for subsequent upgrades
+- `.env.example` is the configuration template containing all configurable items with default values, suitable for non-interactive mode
+- Log file is located at `~/himarket-install.log`
+
+## FAQ
+
+### View Service Logs
+
+Docker method:
+```bash
+docker compose logs -f
+```
+
+Helm method:
+```bash
+kubectl logs -n himarket <pod-name>
+```
+
+### How to Troubleshoot Deployment Failures
+
+Check the log file:
+```bash
+cat ~/himarket-install.log
+```
+
+### How to Retry Initialization Hooks
+
+Hook scripts are located in the `hooks/post_ready.d/` directory and can be executed independently. 
+ +### How to Skip Partial Initialization + +- Interactive mode: The script provides skip options +- Non-interactive mode: Control via environment variables, e.g., `SKIP_MCP_INIT=true` diff --git a/deploy/DEPLOYMENT_zh.md b/deploy/DEPLOYMENT_zh.md new file mode 100644 index 000000000..43e2af37e --- /dev/null +++ b/deploy/DEPLOYMENT_zh.md @@ -0,0 +1,144 @@ +# HiMarket 部署指南 + +## 概述 + +HiMarket 完整部署包含以下组件: +- **MySQL** - 数据库服务 +- **Nacos** - 配置中心 +- **Higress** - AI 网关 +- **Redis** - 缓存服务 +- **HiMarket** - Server + Admin + Frontend + Sandbox + +提供两种部署方式: +- **Docker Compose** - 适合本地开发和单机部署 +- **Helm(Kubernetes)** - 适合生产环境和集群部署 + +两种方式均使用交互式引导脚本 `install.sh`,自动完成全部配置和初始化。 + +## Docker Compose 部署 + +### 前置条件 + +- `docker` +- `docker compose` +- `curl` +- `jq` + +### 交互式部署(推荐) + +```bash +git clone https://github.com/higress-group/himarket.git +cd himarket/deploy/docker +./install.sh +``` + +脚本会逐步引导您完成所有配置,包括镜像选择、密码设置、AI 模型配置等。 + +### 非交互模式(CI/CD) + +```bash +cp .env.example ~/himarket-install.env +# 编辑 ~/himarket-install.env 按需修改配置 +./install.sh -n +``` + +### 卸载 + +```bash +./install.sh --uninstall +``` + +### 升级 + +重新运行 `./install.sh`,脚本会自动检测已有部署并提供升级选项。 + +### 服务端口 + +| 服务 | 主机端口 | 说明 | +|------|---------|------| +| HiMarket Admin | 5174 | 管理后台界面 | +| HiMarket Frontend | 5173 | 开发者门户界面 | +| HiMarket Server | 8081 | 后端 API 服务 | +| Nacos | 8848 | Nacos 控制台 | +| Higress Console | 8001 | Higress 控制台 | +| Higress Gateway | 8082 | 网关 HTTP 入口 | +| MySQL | 3306 | 数据库服务 | +| Redis | 6379 | Redis 服务 | + +## Helm 部署(Kubernetes) + +### 前置条件 + +- `kubectl`(已连接 K8s 集群) +- `helm` +- `curl` +- `jq` +- `python3` + +### 交互式部署(推荐) + +```bash +git clone https://github.com/higress-group/himarket.git +cd himarket/deploy/helm +./install.sh +``` + +### 非交互模式 + +```bash +cp .env.example ~/himarket-install.env +# 编辑配置 +./install.sh -n +``` + +### 卸载 + +```bash +./install.sh --uninstall +``` + +### 升级 + +重新运行 `./install.sh`,脚本会自动检测已有部署并提供升级选项。 + +### 说明 + +- 默认部署到 `himarket` 命名空间 +- Admin 和 
Frontend 服务默认使用 LoadBalancer 类型
+
+## 配置说明
+
+- 交互模式下脚本会逐步引导所有配置项(镜像、数据库密码、服务凭证、默认用户、AI 模型等)
+- 配置自动保存到 `~/himarket-install.env`,后续升级时自动复用
+- `.env.example` 是配置模板,包含所有可配置项及默认值,可用于非交互模式
+- 日志文件位于 `~/himarket-install.log`
+
+## 常见问题
+
+### 查看服务日志
+
+Docker 方式:
+```bash
+docker compose logs -f
+```
+
+Helm 方式:
+```bash
+kubectl logs -n himarket <pod 名称>
+```
+
+### 部署失败如何排查
+
+检查日志文件:
+```bash
+cat ~/himarket-install.log
+```
+
+### 如何重试初始化钩子
+
+钩子脚本在 `hooks/post_ready.d/` 目录下,可独立执行。
+
+### 如何跳过部分初始化
+
+- 交互模式:脚本会提供跳过选项
+- 非交互模式:通过环境变量控制,如 `SKIP_MCP_INIT=true` 
diff --git a/deploy/docker/scripts/data/higress-mcp.json b/deploy/data/higress-mcp.json
similarity index 100%
rename from deploy/docker/scripts/data/higress-mcp.json
rename to deploy/data/higress-mcp.json
diff --git a/deploy/docker/scripts/data/nacos-mcp.json b/deploy/data/nacos-mcp.json
similarity index 100%
rename from deploy/docker/scripts/data/nacos-mcp.json
rename to deploy/data/nacos-mcp.json
diff --git a/deploy/data/skills/docx/SKILL.md b/deploy/data/skills/docx/SKILL.md
new file mode 100644
index 000000000..af5f12523
--- /dev/null
+++ b/deploy/data/skills/docx/SKILL.md
@@ -0,0 +1,198 @@
+---
+name: docx
+description: "Advanced Word document toolkit for content extraction, document generation, page manipulation, and interactive form processing. Use when you need to parse DOCX text and tables, create professional documents, combine or split files, or complete fillable forms programmatically."
+description_zh: "高级 Word 文档工具集,支持内容提取、文档生成、页面操作和交互式表单处理。用于解析 DOCX 文本和表格、创建专业文档、合并或拆分文件,或以编程方式填写表单。"
+license: Proprietary. LICENSE.txt has complete terms
+---
+
+# Word Document Processing Toolkit
+
+## Overview
+
+A user may ask you to generate, modify, or extract content from a .docx file. A .docx file is essentially a ZIP archive containing XML files and other resources that you can read or edit. You have different tools and workflows available for different tasks. 
+ +## Workflow Selection Guide + +### Content Extraction & Analysis +Use "Text extraction" or "Raw XML access" sections below + +### Document Generation +Use "Generating a new Word document" workflow + +### Document Modification +- **Your own document + simple changes** + Use "Basic OpenXML modification" workflow + +- **Third-party document** + Use **"Revision tracking workflow"** (recommended default) + +- **Legal, academic, business, or government docs** + Use **"Revision tracking workflow"** (required) + +## Content Extraction and Analysis + +### Text extraction +If you just need to read the text contents of a document, you should convert the document to markdown using pandoc. Pandoc provides excellent support for preserving document structure and can show tracked changes: + +```bash +# Convert document to markdown with tracked changes +pandoc --track-changes=all path-to-file.docx -o output.md +# Options: --track-changes=accept/reject/all +``` + +### Raw XML access +You need raw XML access for: comments, complex formatting, document structure, embedded media, and metadata. For any of these features, you'll need to unpack a document and read its raw XML contents. + +#### Unpacking a file +`python openxml/scripts/extract.py ` + +#### Key file structures +* `word/document.xml` - Main document contents +* `word/comments.xml` - Comments referenced in document.xml +* `word/media/` - Embedded images and media files +* Tracked changes use `` (insertions) and `` (deletions) tags + +## Generating a new Word document + +When generating a new Word document from scratch, use **docx-js**, which allows you to create Word documents using JavaScript/TypeScript. + +### Workflow +1. **MANDATORY - READ ENTIRE FILE**: Read [`word-generator.md`](word-generator.md) (~500 lines) completely from start to finish. 
**NEVER set any range limits when reading this file.** Read the full file content for detailed syntax, critical formatting rules, and best practices before proceeding with document creation. +2. Create a JavaScript/TypeScript file using Document, Paragraph, TextRun components (You can assume all dependencies are installed, but if not, refer to the dependencies section below) +3. Export as .docx using Packer.toBuffer() + +## Modifying an existing Word document + +When modifying an existing Word document, use the **WordFile library** (a Python library for OpenXML manipulation). The library automatically handles infrastructure setup and provides methods for document manipulation. For complex scenarios, you can access the underlying DOM directly through the library. + +### Workflow +1. **MANDATORY - READ ENTIRE FILE**: Read [`office-xml-spec.md`](office-xml-spec.md) (~600 lines) completely from start to finish. **NEVER set any range limits when reading this file.** Read the full file content for the WordFile library API and XML patterns for directly editing document files. +2. Unpack the document: `python openxml/scripts/extract.py ` +3. Create and run a Python script using the WordFile library (see "WordFile Library" section in office-xml-spec.md) +4. Pack the final document: `python openxml/scripts/assemble.py ` + +The WordFile library provides both high-level methods for common operations and direct DOM access for complex scenarios. + +## Revision tracking workflow for document review + +This workflow allows you to plan comprehensive tracked changes using markdown before implementing them in OpenXML. **CRITICAL**: For complete tracked changes, you must implement ALL changes systematically. + +**Batching Strategy**: Group related changes into batches of 3-10 changes. This makes debugging manageable while maintaining efficiency. Test each batch before moving to the next. 
+ +**Principle: Minimal, Precise Edits** +When implementing tracked changes, only mark text that actually changes. Repeating unchanged text makes edits harder to review and appears unprofessional. Break replacements into: [unchanged text] + [deletion] + [insertion] + [unchanged text]. Preserve the original run's RSID for unchanged text by extracting the `` element from the original and reusing it. + +Example - Changing "30 days" to "60 days" in a sentence: +```python +# BAD - Replaces entire sentence +'The term is 30 days.The term is 60 days.' + +# GOOD - Only marks what changed, preserves original for unchanged text +'The term is 3060 days.' +``` + +### Tracked changes workflow + +1. **Get markdown representation**: Convert document to markdown with tracked changes preserved: + ```bash + pandoc --track-changes=all path-to-file.docx -o current.md + ``` + +2. **Identify and group changes**: Review the document and identify ALL changes needed, organizing them into logical batches: + + **Location methods** (for finding changes in XML): + - Section/heading numbers (e.g., "Section 3.2", "Article IV") + - Paragraph identifiers if numbered + - Grep patterns with unique surrounding text + - Document structure (e.g., "first paragraph", "signature block") + - **DO NOT use markdown line numbers** - they don't map to XML structure + + **Batch organization** (group 3-10 related changes per batch): + - By section: "Batch 1: Section 2 amendments", "Batch 2: Section 5 updates" + - By type: "Batch 1: Date corrections", "Batch 2: Party name changes" + - By complexity: Start with simple text replacements, then tackle complex structural changes + - Sequential: "Batch 1: Pages 1-3", "Batch 2: Pages 4-6" + +3. **Read documentation and unpack**: + - **MANDATORY - READ ENTIRE FILE**: Read [`office-xml-spec.md`](office-xml-spec.md) (~600 lines) completely from start to finish. 
**NEVER set any range limits when reading this file.** Pay special attention to the "WordFile Library" and "Tracked Change Patterns" sections. + - **Unpack the document**: `python openxml/scripts/extract.py ` + - **Note the suggested RSID**: The extract script will suggest an RSID to use for your tracked changes. Copy this RSID for use in step 4b. + +4. **Implement changes in batches**: Group changes logically (by section, by type, or by proximity) and implement them together in a single script. This approach: + - Makes debugging easier (smaller batch = easier to isolate errors) + - Allows incremental progress + - Maintains efficiency (batch size of 3-10 changes works well) + + **Suggested batch groupings:** + - By document section (e.g., "Section 3 changes", "Definitions", "Termination clause") + - By change type (e.g., "Date changes", "Party name updates", "Legal term replacements") + - By proximity (e.g., "Changes on pages 1-3", "Changes in first half of document") + + For each batch of related changes: + + **a. Map text to XML**: Grep for text in `word/document.xml` to verify how text is split across `` elements. + + **b. Create and run script**: Use `locate_element` to find nodes, implement changes, then `doc.persist()`. See **"WordFile Library"** section in office-xml-spec.md for patterns. + + **Note**: Always grep `word/document.xml` immediately before writing a script to get current line numbers and verify text content. Line numbers change after each script run. + +5. **Pack the document**: After all batches are complete, convert the unpacked directory back to .docx: + ```bash + python openxml/scripts/assemble.py unpacked reviewed-document.docx + ``` + +6. 
**Final verification**: Do a comprehensive check of the complete document: + - Convert final document to markdown: + ```bash + pandoc --track-changes=all reviewed-document.docx -o verification.md + ``` + - Verify ALL changes were applied correctly: + ```bash + grep "original phrase" verification.md # Should NOT find it + grep "replacement phrase" verification.md # Should find it + ``` + - Check that no unintended changes were introduced + + +## Converting Documents to Images + +To visually analyze Word documents, convert them to images using a two-step process: + +1. **Convert DOCX to PDF**: + ```bash + soffice --headless --convert-to pdf document.docx + ``` + +2. **Convert PDF pages to JPEG images**: + ```bash + pdftoppm -jpeg -r 150 document.pdf page + ``` + This creates files like `page-1.jpg`, `page-2.jpg`, etc. + +Options: +- `-r 150`: Sets resolution to 150 DPI (adjust for quality/size balance) +- `-jpeg`: Output JPEG format (use `-png` for PNG if preferred) +- `-f N`: First page to convert (e.g., `-f 2` starts from page 2) +- `-l N`: Last page to convert (e.g., `-l 5` stops at page 5) +- `page`: Prefix for output files + +Example for specific range: +```bash +pdftoppm -jpeg -r 150 -f 2 -l 5 document.pdf page # Converts only pages 2-5 +``` + +## Code Style Guidelines +**IMPORTANT**: When generating code for DOCX operations: +- Write concise code +- Avoid verbose variable names and redundant operations +- Avoid unnecessary print statements + +## Dependencies + +Required dependencies (install if not available): + +- **pandoc**: `sudo apt-get install pandoc` (for text extraction) +- **docx**: `npm install -g docx` (for creating new documents) +- **LibreOffice**: `sudo apt-get install libreoffice` (for PDF conversion) +- **Poppler**: `sudo apt-get install poppler-utils` (for pdftoppm to convert PDF to images) +- **defusedxml**: `pip install defusedxml` (for secure XML parsing) diff --git a/deploy/data/skills/docx/office-xml-spec.md 
b/deploy/data/skills/docx/office-xml-spec.md new file mode 100644 index 000000000..ae83d6943 --- /dev/null +++ b/deploy/data/skills/docx/office-xml-spec.md @@ -0,0 +1,610 @@ +# Office Open XML Technical Reference + +**Important: Read this entire document before starting.** This document covers: +- [Technical Guidelines](#technical-guidelines) - Schema compliance rules and validation requirements +- [Document Content Patterns](#document-content-patterns) - XML patterns for headings, lists, tables, formatting, etc. +- [WordFile Library (Python)](#wordfile-library-python) - Recommended approach for OpenXML manipulation with automatic infrastructure setup +- [Tracked Changes (Revision Tracking)](#tracked-changes-revision-tracking) - XML patterns for implementing tracked changes + +## Technical Guidelines + +### Schema Compliance +- **Element ordering in ``**: ``, ``, ``, ``, `` +- **Whitespace**: Add `xml:space='preserve'` to `` elements with leading/trailing spaces +- **Unicode**: Escape characters in ASCII content: `"` becomes `“` + - **Character encoding reference**: Curly quotes `""` become `“”`, apostrophe `'` becomes `’`, em-dash `—` becomes `—` +- **Tracked changes**: Use `` and `` tags with `w:author="Claude"` outside `` elements + - **Critical**: `` closes with ``, `` closes with `` - never mix + - **RSIDs must be 8-digit hex**: Use values like `00AB1234` (only 0-9, A-F characters) + - **trackRevisions placement**: Add `` after `` in settings.xml +- **Images**: Add to `word/media/`, reference in `document.xml`, set dimensions to prevent overflow + +## Document Content Patterns + +### Basic Structure +```xml + + Text content + +``` + +### Headings and Styles +```xml + + + + + + Document Title + + + + + Section Heading + +``` + +### Text Formatting +```xml + +Bold + +Italic + +Underlined + +Highlighted +``` + +### Lists +```xml + + + + + + + + First item + + + + + + + + + + New list item 1 + + + + + + + + + + + Bullet item + +``` + +### Tables +```xml + + + + + 
+ + + + + + + Cell 1 + + + + Cell 2 + + + +``` + +### Layout +```xml + + + + + + + + + + + + New Section Title + + + + + + + + + + Centered text + + + + + + + + Monospace text + + + + + + + This text is Courier New + + and this text uses default font + +``` + +## File Updates + +When adding content, update these files: + +**`word/_rels/document.xml.rels`:** +```xml + + +``` + +**`[Content_Types].xml`:** +```xml + + +``` + +### Images +**CRITICAL**: Calculate dimensions to prevent page overflow and maintain aspect ratio. + +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +### Links (Hyperlinks) + +**IMPORTANT**: All hyperlinks (both internal and external) require the Hyperlink style to be defined in styles.xml. Without this style, links will look like regular text instead of blue underlined clickable links. + +**External Links:** +```xml + + + + + Link Text + + + + + +``` + +**Internal Links:** + +```xml + + + + + Link Text + + + + + +Target content + +``` + +**Hyperlink Style (required in styles.xml):** +```xml + + + + + + + + + + +``` + +## WordFile Library (Python) + +Use the WordFile class from `scripts/wordfile.py` for all tracked changes and comments. It automatically handles infrastructure setup (people.xml, RSIDs, settings.xml, comment files, relationships, content types). Only use direct XML manipulation for complex scenarios not supported by the library. 
+ +**Working with Unicode and Entities:** +- **Searching**: Both entity notation and Unicode characters work - `contains="“Company"` and `contains="\u201cCompany"` find the same text +- **Replacing**: Use either entities (`“`) or Unicode (`\u201c`) - both work and will be converted appropriately based on the file's encoding (ascii -> entities, utf-8 -> Unicode) + +### Initialization + +**Find the docx-v2 skill root** (directory containing `scripts/` and `openxml/`): +```bash +# Search for wordfile.py to locate the skill root +# Note: /mnt/skills is used here as an example; check your context for the actual location +find /mnt/skills -name "wordfile.py" -path "*/docx-v2/scripts/*" 2>/dev/null | head -1 +# Example output: /mnt/skills/docx-v2/scripts/wordfile.py +# Skill root is: /mnt/skills/docx-v2 +``` + +**Run your script with PYTHONPATH** set to the docx-v2 skill root: +```bash +PYTHONPATH=/mnt/skills/docx-v2 python your_script.py +``` + +**In your script**, import from the skill root: +```python +from scripts.wordfile import WordFile, WordXMLProcessor + +# Basic initialization (automatically creates temp copy and sets up infrastructure) +doc = WordFile('unpacked') + +# Customize author and initials +doc = WordFile('unpacked', author="John Doe", initials="JD") + +# Enable track revisions mode +doc = WordFile('unpacked', track_revisions=True) + +# Specify custom RSID (auto-generated if not provided) +doc = WordFile('unpacked', rsid="07DC5ECB") +``` + +### Creating Tracked Changes + +**CRITICAL**: Only mark text that actually changes. Keep ALL unchanged text outside ``/`` tags. Marking unchanged text makes edits unprofessional and harder to review. + +**Attribute Handling**: The WordFile class auto-injects attributes (w:id, w:date, w:rsidR, w:rsidDel, w16du:dateUtc, xml:space) into new elements. When preserving unchanged text from the original document, copy the original `` element with its existing attributes to maintain document integrity. 
+ +**Method Selection Guide**: +- **Adding your own changes to regular text**: Use `swap_element()` with ``/`` tags, or `mark_for_deletion()` for removing entire `` or `` elements +- **Partially modifying another author's tracked change**: Use `swap_element()` to nest your changes inside their ``/`` +- **Completely rejecting another author's insertion**: Use `undo_insertion()` on the `` element (NOT `mark_for_deletion()`) +- **Completely rejecting another author's deletion**: Use `undo_deletion()` on the `` element to restore deleted content using tracked changes + +```python +# Minimal edit - change one word: "The report is monthly" -> "The report is quarterly" +# Original: The report is monthly +node = doc["word/document.xml"].locate_element(tag="w:r", contains="The report is monthly") +rpr = tags[0].toxml() if (tags := node.getElementsByTagName("w:rPr")) else "" +replacement = f'{rpr}The report is {rpr}monthly{rpr}quarterly' +doc["word/document.xml"].swap_element(node, replacement) + +# Minimal edit - change number: "within 30 days" -> "within 45 days" +# Original: within 30 days +node = doc["word/document.xml"].locate_element(tag="w:r", contains="within 30 days") +rpr = tags[0].toxml() if (tags := node.getElementsByTagName("w:rPr")) else "" +replacement = f'{rpr}within {rpr}30{rpr}45{rpr} days' +doc["word/document.xml"].swap_element(node, replacement) + +# Complete replacement - preserve formatting even when replacing all text +node = doc["word/document.xml"].locate_element(tag="w:r", contains="apple") +rpr = tags[0].toxml() if (tags := node.getElementsByTagName("w:rPr")) else "" +replacement = f'{rpr}apple{rpr}banana orange' +doc["word/document.xml"].swap_element(node, replacement) + +# Insert new content (no attributes needed - auto-injected) +node = doc["word/document.xml"].locate_element(tag="w:r", contains="existing text") +doc["word/document.xml"].add_after(node, 'new text') + +# Partially delete another author's insertion +# Original: quarterly financial 
report +# Goal: Delete only "financial" to make it "quarterly report" +node = doc["word/document.xml"].locate_element(tag="w:ins", attrs={"w:id": "5"}) +# IMPORTANT: Preserve w:author="Jane Smith" on the outer to maintain authorship +replacement = ''' + quarterly + financial + report +''' +doc["word/document.xml"].swap_element(node, replacement) + +# Change part of another author's insertion +# Original: in silence, safe and sound +# Goal: Change "safe and sound" to "soft and unbound" +node = doc["word/document.xml"].locate_element(tag="w:ins", attrs={"w:id": "8"}) +replacement = f''' + in silence, + + + soft and unbound + + + safe and sound +''' +doc["word/document.xml"].swap_element(node, replacement) + +# Delete entire run (use only when deleting all content; use swap_element for partial deletions) +node = doc["word/document.xml"].locate_element(tag="w:r", contains="text to delete") +doc["word/document.xml"].mark_for_deletion(node) + +# Delete entire paragraph (in-place, handles both regular and numbered list paragraphs) +para = doc["word/document.xml"].locate_element(tag="w:p", contains="paragraph to delete") +doc["word/document.xml"].mark_for_deletion(para) + +# Add new numbered list item +target_para = doc["word/document.xml"].locate_element(tag="w:p", contains="existing list item") +pPr = tags[0].toxml() if (tags := target_para.getElementsByTagName("w:pPr")) else "" +new_item = f'{pPr}New item' +tracked_para = WordXMLProcessor.wrap_paragraph_insertion(new_item) +doc["word/document.xml"].add_after(target_para, tracked_para) +# Optional: add spacing paragraph before content for better visual separation +# spacing = WordXMLProcessor.wrap_paragraph_insertion('') +# doc["word/document.xml"].add_after(target_para, spacing + tracked_para) +``` + +### Adding Comments + +```python +# Add comment spanning two existing tracked changes +# Note: w:id is auto-generated. 
Only search by w:id if you know it from XML inspection +start_node = doc["word/document.xml"].locate_element(tag="w:del", attrs={"w:id": "1"}) +end_node = doc["word/document.xml"].locate_element(tag="w:ins", attrs={"w:id": "2"}) +doc.insert_comment(start=start_node, end=end_node, text="Explanation of this change") + +# Add comment on a paragraph +para = doc["word/document.xml"].locate_element(tag="w:p", contains="paragraph text") +doc.insert_comment(start=para, end=para, text="Comment on this paragraph") + +# Add comment on newly created tracked change +# First create the tracked change +node = doc["word/document.xml"].locate_element(tag="w:r", contains="old") +new_nodes = doc["word/document.xml"].swap_element( + node, + 'oldnew' +) +# Then add comment on the newly created elements +# new_nodes[0] is the , new_nodes[1] is the +doc.insert_comment(start=new_nodes[0], end=new_nodes[1], text="Changed old to new per requirements") + +# Reply to existing comment +doc.respond_to_comment(parent_comment_id=0, text="I agree with this change") +``` + +### Rejecting Tracked Changes + +**IMPORTANT**: Use `undo_insertion()` to reject insertions and `undo_deletion()` to restore deletions using tracked changes. Use `mark_for_deletion()` only for regular unmarked content. 
+ +```python +# Reject insertion (wraps it in deletion) +# Use this when another author inserted text that you want to delete +ins = doc["word/document.xml"].locate_element(tag="w:ins", attrs={"w:id": "5"}) +nodes = doc["word/document.xml"].undo_insertion(ins) # Returns [ins] + +# Reject deletion (creates insertion to restore deleted content) +# Use this when another author deleted text that you want to restore +del_elem = doc["word/document.xml"].locate_element(tag="w:del", attrs={"w:id": "3"}) +nodes = doc["word/document.xml"].undo_deletion(del_elem) # Returns [del_elem, new_ins] + +# Reject all insertions in a paragraph +para = doc["word/document.xml"].locate_element(tag="w:p", contains="paragraph text") +nodes = doc["word/document.xml"].undo_insertion(para) # Returns [para] + +# Reject all deletions in a paragraph +para = doc["word/document.xml"].locate_element(tag="w:p", contains="paragraph text") +nodes = doc["word/document.xml"].undo_deletion(para) # Returns [para] +``` + +### Inserting Images + +**CRITICAL**: The WordFile class works with a temporary copy at `doc.unpacked_path`. Always copy images to this temp directory, not the original unpacked folder. 
+ +```python +from PIL import Image +import shutil, os + +# Initialize document first +doc = WordFile('unpacked') + +# Copy image and calculate full-width dimensions with aspect ratio +media_dir = os.path.join(doc.unpacked_path, 'word/media') +os.makedirs(media_dir, exist_ok=True) +shutil.copy('image.png', os.path.join(media_dir, 'image1.png')) +img = Image.open(os.path.join(media_dir, 'image1.png')) +width_emus = int(6.5 * 914400) # 6.5" usable width, 914400 EMUs/inch +height_emus = int(width_emus * img.size[1] / img.size[0]) + +# Add relationship and content type +rels_editor = doc['word/_rels/document.xml.rels'] +next_rid = rels_editor.get_next_relationship_id() +rels_editor.add_to(rels_editor.dom.documentElement, + f'') +doc['[Content_Types].xml'].add_to(doc['[Content_Types].xml'].dom.documentElement, + '') + +# Insert image +node = doc["word/document.xml"].locate_element(tag="w:p", line_number=100) +doc["word/document.xml"].add_after(node, f''' + + + + + + + + + + + + + + + + + +''') +``` + +### Getting Nodes + +```python +# By text content +node = doc["word/document.xml"].locate_element(tag="w:p", contains="specific text") + +# By line range +para = doc["word/document.xml"].locate_element(tag="w:p", line_number=range(100, 150)) + +# By attributes +node = doc["word/document.xml"].locate_element(tag="w:del", attrs={"w:id": "1"}) + +# By exact line number (must be line number where tag opens) +para = doc["word/document.xml"].locate_element(tag="w:p", line_number=42) + +# Combine filters +node = doc["word/document.xml"].locate_element(tag="w:r", line_number=range(40, 60), contains="text") + +# Disambiguate when text appears multiple times - add line_number range +node = doc["word/document.xml"].locate_element(tag="w:r", contains="Section", line_number=range(2400, 2500)) +``` + +### Saving + +```python +# Save with automatic validation (copies back to original directory) +doc.persist() # Validates by default, raises error if validation fails + +# Save to different 
location +doc.persist('modified-unpacked') + +# Skip validation (debugging only - needing this in production indicates XML issues) +doc.persist(validate=False) +``` + +### Direct DOM Manipulation + +For complex scenarios not covered by the library: + +```python +# Access any XML file +editor = doc["word/document.xml"] +editor = doc["word/comments.xml"] + +# Direct DOM access (defusedxml.minidom.Document) +node = doc["word/document.xml"].locate_element(tag="w:p", line_number=5) +parent = node.parentNode +parent.removeChild(node) +parent.appendChild(node) # Move to end + +# General document manipulation (without tracked changes) +old_node = doc["word/document.xml"].locate_element(tag="w:p", contains="original text") +doc["word/document.xml"].swap_element(old_node, "replacement text") + +# Multiple insertions - use return value to maintain order +node = doc["word/document.xml"].locate_element(tag="w:r", line_number=100) +nodes = doc["word/document.xml"].add_after(node, "A") +nodes = doc["word/document.xml"].add_after(nodes[-1], "B") +nodes = doc["word/document.xml"].add_after(nodes[-1], "C") +# Results in: original_node, A, B, C +``` + +## Tracked Changes (Revision Tracking) + +**Use the WordFile class above for all tracked changes.** The patterns below are for reference when constructing replacement XML strings. + +### Validation Rules +The validator checks that the document text matches the original after reverting Claude's changes. This means: +- **NEVER modify text inside another author's `` or `` tags** +- **ALWAYS use nested deletions** to remove another author's insertions +- **Every edit must be properly tracked** with `` or `` tags + +### Tracked Change Patterns + +**CRITICAL RULES**: +1. Never modify the content inside another author's tracked changes. Always use nested deletions. +2. **XML Structure**: Always place `` and `` at paragraph level containing complete `` elements. 
Never nest inside `` elements - this creates invalid XML that breaks document processing. + +**Text Insertion:** +```xml + + + inserted text + + +``` + +**Text Deletion:** +```xml + + + deleted text + + +``` + +**Deleting Another Author's Insertion (MUST use nested structure):** +```xml + + + + monthly + + + + weekly + +``` + +**Restoring Another Author's Deletion:** +```xml + + + within 30 days + + + within 30 days + +``` diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd new file mode 100644 index 000000000..6454ef9a9 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd @@ -0,0 +1,1499 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd new file mode 100644 index 000000000..afa4f463e --- /dev/null +++ 
b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd @@ -0,0 +1,146 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd new file mode 100644 index 000000000..64e66b8ab --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd @@ -0,0 +1,1085 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd new file mode 100644 index 000000000..687eea829 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd @@ -0,0 +1,11 @@ + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd new file mode 100644 index 000000000..6ac81b06b --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd @@ -0,0 +1,3081 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd new file mode 100644 index 000000000..1dbf05140 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + diff --git 
a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd new file mode 100644 index 000000000..f1af17db4 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd new file mode 100644 index 000000000..0a185ab6e --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd @@ -0,0 +1,287 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/pml.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/pml.xsd new file mode 100644 index 000000000..14ef48886 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/pml.xsd @@ -0,0 +1,1676 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd new file mode 100644 index 000000000..c20f3bf14 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd new file mode 100644 index 000000000..ac6025226 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd @@ -0,0 +1,144 @@ + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd new file mode 100644 index 000000000..424b8ba8d --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd @@ -0,0 +1,174 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd new file mode 100644 index 000000000..2bddce292 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd new file mode 100644 index 000000000..8a8c18ba2 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd 
b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd new file mode 100644 index 000000000..5c42706a0 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd new file mode 100644 index 000000000..853c341c8 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd new file mode 100644 index 000000000..da835ee82 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd @@ -0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd new file mode 100644 index 000000000..87ad2658f --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd @@ -0,0 +1,582 @@ + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd new file mode 100644 index 000000000..9e86f1b2b --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/sml.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/sml.xsd new file mode 100644 index 000000000..d0be42e75 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/sml.xsd @@ -0,0 +1,4439 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd new file mode 100644 index 000000000..8821dd183 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd @@ -0,0 +1,570 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd new file mode 100644 index 000000000..ca2575c75 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd @@ -0,0 +1,509 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd new file mode 100644 index 000000000..dd079e603 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd 
b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd new file mode 100644 index 000000000..3dd6cf625 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd new file mode 100644 index 000000000..f1041e34e --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/wml.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/wml.xsd new file mode 100644 index 000000000..9c5b7a633 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/wml.xsd @@ -0,0 +1,3646 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/xml.xsd b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/xml.xsd new file mode 100644 index 000000000..0f13678d8 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ISO-IEC29500-4_2016/xml.xsd @@ -0,0 +1,116 @@ + + + + + + See http://www.w3.org/XML/1998/namespace.html and + http://www.w3.org/TR/REC-xml for information about this namespace. + + This schema document describes the XML namespace, in a form + suitable for import by other schema documents. + + Note that local names in this namespace are intended to be defined + only by the World Wide Web Consortium or its subgroups. The + following names are currently defined in this namespace and should + not be used with conflicting semantics by any Working Group, + specification, or document instance: + + base (as an attribute name): denotes an attribute whose value + provides a URI to be used as the base for interpreting any + relative URIs in the scope of the element on which it + appears; its value is inherited. This name is reserved + by virtue of its definition in the XML Base specification. + + lang (as an attribute name): denotes an attribute whose value + is a language code for the natural language of the content of + any element; its value is inherited. This name is reserved + by virtue of its definition in the XML specification. 
+ + space (as an attribute name): denotes an attribute whose + value is a keyword indicating what whitespace processing + discipline is intended for the content of the element; its + value is inherited. This name is reserved by virtue of its + definition in the XML specification. + + Father (in any context at all): denotes Jon Bosak, the chair of + the original XML Working Group. This name is reserved by + the following decision of the W3C XML Plenary and + XML Coordination groups: + + In appreciation for his vision, leadership and dedication + the W3C XML Plenary on this 10th day of February, 2000 + reserves for Jon Bosak in perpetuity the XML name + xml:Father + + + + + This schema defines attributes and an attribute group + suitable for use by + schemas wishing to allow xml:base, xml:lang or xml:space attributes + on elements they define. + + To enable this, such a schema must import this schema + for the XML namespace, e.g. as follows: + <schema . . .> + . . . + <import namespace="http://www.w3.org/XML/1998/namespace" + schemaLocation="http://www.w3.org/2001/03/xml.xsd"/> + + Subsequently, qualified reference to any of the attributes + or the group defined below will have the desired effect, e.g. + + <type . . .> + . . . + <attributeGroup ref="xml:specialAttrs"/> + + will define a type which will schema-validate an instance + element with any of those attributes + + + + In keeping with the XML Schema WG's standard versioning + policy, this schema document will persist at + http://www.w3.org/2001/03/xml.xsd. + At the date of issue it can also be found at + http://www.w3.org/2001/xml.xsd. + The schema document at that URI may however change in the future, + in order to remain compatible with the latest version of XML Schema + itself. In other words, if the XML Schema namespace changes, the version + of this document at + http://www.w3.org/2001/xml.xsd will change + accordingly; the version at + http://www.w3.org/2001/03/xml.xsd will not change. 
+ + + + + + In due course, we should install the relevant ISO 2- and 3-letter + codes as the enumerated possible values . . . + + + + + + + + + + + + + + + See http://www.w3.org/TR/xmlbase/ for + information about this attribute. + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd b/deploy/data/skills/docx/openxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd new file mode 100644 index 000000000..a6de9d273 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd b/deploy/data/skills/docx/openxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd new file mode 100644 index 000000000..10e978b66 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ecma/fouth-edition/opc-digSig.xsd b/deploy/data/skills/docx/openxml/schemas/ecma/fouth-edition/opc-digSig.xsd new file mode 100644 index 000000000..4248bf7a3 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ecma/fouth-edition/opc-digSig.xsd @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/ecma/fouth-edition/opc-relationships.xsd b/deploy/data/skills/docx/openxml/schemas/ecma/fouth-edition/opc-relationships.xsd new file mode 100644 index 000000000..564974671 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/ecma/fouth-edition/opc-relationships.xsd @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/mce/mc.xsd 
b/deploy/data/skills/docx/openxml/schemas/mce/mc.xsd new file mode 100644 index 000000000..ef725457c --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/mce/mc.xsd @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/microsoft/wml-2010.xsd b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-2010.xsd new file mode 100644 index 000000000..f65f77773 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-2010.xsd @@ -0,0 +1,560 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/microsoft/wml-2012.xsd b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-2012.xsd new file mode 100644 index 000000000..6b00755a9 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-2012.xsd @@ -0,0 +1,67 @@ + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/microsoft/wml-2018.xsd b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-2018.xsd new file mode 100644 index 000000000..f321d333a --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-2018.xsd @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/microsoft/wml-cex-2018.xsd b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-cex-2018.xsd new file mode 100644 index 000000000..364c6a9b8 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-cex-2018.xsd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/microsoft/wml-cid-2016.xsd b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-cid-2016.xsd new file mode 100644 index 000000000..fed9d15b7 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-cid-2016.xsd @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/microsoft/wml-sdtdatahash-2020.xsd b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-sdtdatahash-2020.xsd new file mode 100644 index 000000000..680cf1540 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-sdtdatahash-2020.xsd @@ -0,0 +1,4 @@ + + + + diff --git a/deploy/data/skills/docx/openxml/schemas/microsoft/wml-symex-2015.xsd b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-symex-2015.xsd new file mode 100644 index 000000000..89ada9083 --- /dev/null +++ b/deploy/data/skills/docx/openxml/schemas/microsoft/wml-symex-2015.xsd @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/deploy/data/skills/docx/openxml/scripts/assemble.py b/deploy/data/skills/docx/openxml/scripts/assemble.py new file mode 100644 index 000000000..e407d8dc6 --- /dev/null +++ b/deploy/data/skills/docx/openxml/scripts/assemble.py @@ -0,0 
+1,159 @@ +#!/usr/bin/env python3 +""" +Tool to assemble a directory into a .docx, .pptx, or .xlsx file with XML formatting undone. + +Example usage: + python assemble.py [--force] +""" + +import argparse +import shutil +import subprocess +import sys +import tempfile +import defusedxml.minidom +import zipfile +from pathlib import Path + + +def main(): + parser = argparse.ArgumentParser(description="Assemble a directory into an Office file") + parser.add_argument("input_directory", help="Unpacked Office document directory") + parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)") + parser.add_argument("--force", action="store_true", help="Skip validation") + args = parser.parse_args() + + try: + success = assemble_document( + args.input_directory, args.output_file, validate=not args.force + ) + + # Show warning if validation was skipped + if args.force: + print("Warning: Skipped validation, file may be corrupt", file=sys.stderr) + # Exit with error if validation failed + elif not success: + print("Contents would produce a corrupt file.", file=sys.stderr) + print("Please validate XML before reassembling.", file=sys.stderr) + print("Use --force to skip validation and assemble anyway.", file=sys.stderr) + sys.exit(1) + + except ValueError as e: + sys.exit(f"Error: {e}") + + +def assemble_document(input_dir, output_file, validate=False): + """Assemble a directory into an Office file (.docx/.pptx/.xlsx). 
+ + Args: + input_dir: Path to unpacked Office document directory + output_file: Path to output Office file + validate: If True, validates with soffice (default: False) + + Returns: + bool: True if successful, False if validation failed + """ + input_dir = Path(input_dir) + output_file = Path(output_file) + + if not input_dir.is_dir(): + raise ValueError(f"{input_dir} is not a directory") + if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}: + raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file") + + # Work in temporary directory to avoid modifying original + with tempfile.TemporaryDirectory() as temp_dir: + temp_content_dir = Path(temp_dir) / "content" + shutil.copytree(input_dir, temp_content_dir) + + # Process XML files to remove pretty-printing whitespace + for pattern in ["*.xml", "*.rels"]: + for xml_file in temp_content_dir.rglob(pattern): + condense_xml(xml_file) + + # Create final Office file as zip archive + output_file.parent.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf: + for f in temp_content_dir.rglob("*"): + if f.is_file(): + zf.write(f, f.relative_to(temp_content_dir)) + + # Validate if requested + if validate: + if not validate_document(output_file): + output_file.unlink() # Delete the corrupt file + return False + + return True + + +def validate_document(doc_path): + """Validate document by converting to HTML with soffice.""" + # Determine the correct filter based on file extension + match doc_path.suffix.lower(): + case ".docx": + filter_name = "html:HTML" + case ".pptx": + filter_name = "html:impress_html_Export" + case ".xlsx": + filter_name = "html:HTML (StarCalc)" + + with tempfile.TemporaryDirectory() as temp_dir: + try: + result = subprocess.run( + [ + "soffice", + "--headless", + "--convert-to", + filter_name, + "--outdir", + temp_dir, + str(doc_path), + ], + capture_output=True, + timeout=10, + text=True, + ) + if not (Path(temp_dir) / 
f"{doc_path.stem}.html").exists(): + error_msg = result.stderr.strip() or "Document validation failed" + print(f"Validation error: {error_msg}", file=sys.stderr) + return False + return True + except FileNotFoundError: + print("Warning: soffice not found. Skipping validation.", file=sys.stderr) + return True + except subprocess.TimeoutExpired: + print("Validation error: Timeout during conversion", file=sys.stderr) + return False + except Exception as e: + print(f"Validation error: {e}", file=sys.stderr) + return False + + +def condense_xml(xml_file): + """Strip unnecessary whitespace and remove comments.""" + with open(xml_file, "r", encoding="utf-8") as f: + dom = defusedxml.minidom.parse(f) + + # Process each element to remove whitespace and comments + for element in dom.getElementsByTagName("*"): + # Skip w:t elements and their processing + if element.tagName.endswith(":t"): + continue + + # Remove whitespace-only text nodes and comment nodes + for child in list(element.childNodes): + if ( + child.nodeType == child.TEXT_NODE + and child.nodeValue + and child.nodeValue.strip() == "" + ) or child.nodeType == child.COMMENT_NODE: + element.removeChild(child) + + # Write back the condensed XML + with open(xml_file, "wb") as f: + f.write(dom.toxml(encoding="UTF-8")) + + +if __name__ == "__main__": + main() diff --git a/deploy/data/skills/docx/openxml/scripts/extract.py b/deploy/data/skills/docx/openxml/scripts/extract.py new file mode 100644 index 000000000..7c172f9bd --- /dev/null +++ b/deploy/data/skills/docx/openxml/scripts/extract.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +"""Extract and format XML contents of Office files (.docx, .pptx, .xlsx)""" + +import random +import sys +import defusedxml.minidom +import zipfile +from pathlib import Path + +# Get command line arguments +assert len(sys.argv) == 3, "Usage: python extract.py " +input_file, output_dir = sys.argv[1], sys.argv[2] + +# Extract and format +output_path = Path(output_dir) 
+output_path.mkdir(parents=True, exist_ok=True) +zipfile.ZipFile(input_file).extractall(output_path) + +# Pretty print all XML files +xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels")) +for xml_file in xml_files: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="ascii")) + +# For .docx files, suggest an RSID for tracked changes +if input_file.endswith(".docx"): + suggested_rsid = "".join(random.choices("0123456789ABCDEF", k=8)) + print(f"Suggested RSID for edit session: {suggested_rsid}") diff --git a/deploy/data/skills/docx/openxml/scripts/validation/__init__.py b/deploy/data/skills/docx/openxml/scripts/validation/__init__.py new file mode 100644 index 000000000..db092ece7 --- /dev/null +++ b/deploy/data/skills/docx/openxml/scripts/validation/__init__.py @@ -0,0 +1,15 @@ +""" +Validation modules for Word document processing. +""" + +from .base import BaseSchemaValidator +from .docx import DOCXSchemaValidator +from .pptx import PPTXSchemaValidator +from .redlining import RedliningValidator + +__all__ = [ + "BaseSchemaValidator", + "DOCXSchemaValidator", + "PPTXSchemaValidator", + "RedliningValidator", +] diff --git a/deploy/data/skills/docx/openxml/scripts/validation/base.py b/deploy/data/skills/docx/openxml/scripts/validation/base.py new file mode 100644 index 000000000..0681b199c --- /dev/null +++ b/deploy/data/skills/docx/openxml/scripts/validation/base.py @@ -0,0 +1,951 @@ +""" +Base validator with common validation logic for document files. 
+""" + +import re +from pathlib import Path + +import lxml.etree + + +class BaseSchemaValidator: + """Base validator with common validation logic for document files.""" + + # Elements whose 'id' attributes must be unique within their file + # Format: element_name -> (attribute_name, scope) + # scope can be 'file' (unique within file) or 'global' (unique across all files) + UNIQUE_ID_REQUIREMENTS = { + # Word elements + "comment": ("id", "file"), # Comment IDs in comments.xml + "commentrangestart": ("id", "file"), # Must match comment IDs + "commentrangeend": ("id", "file"), # Must match comment IDs + "bookmarkstart": ("id", "file"), # Bookmark start IDs + "bookmarkend": ("id", "file"), # Bookmark end IDs + # Note: ins and del (track changes) can share IDs when part of same revision + # PowerPoint elements + "sldid": ("id", "file"), # Slide IDs in presentation.xml + "sldmasterid": ("id", "global"), # Slide master IDs must be globally unique + "sldlayoutid": ("id", "global"), # Slide layout IDs must be globally unique + "cm": ("authorid", "file"), # Comment author IDs + # Excel elements + "sheet": ("sheetid", "file"), # Sheet IDs in workbook.xml + "definedname": ("id", "file"), # Named range IDs + # Drawing/Shape elements (all formats) + "cxnsp": ("id", "file"), # Connection shape IDs + "sp": ("id", "file"), # Shape IDs + "pic": ("id", "file"), # Picture IDs + "grpsp": ("id", "file"), # Group shape IDs + } + + # Mapping of element names to expected relationship types + # Subclasses should override this with format-specific mappings + ELEMENT_RELATIONSHIP_TYPES = {} + + # Unified schema mappings for all Office document types + SCHEMA_MAPPINGS = { + # Document type specific schemas + "word": "ISO-IEC29500-4_2016/wml.xsd", # Word documents + "ppt": "ISO-IEC29500-4_2016/pml.xsd", # PowerPoint presentations + "xl": "ISO-IEC29500-4_2016/sml.xsd", # Excel spreadsheets + # Common file types + "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd", + "app.xml": 
"ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd", + "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + ".rels": "ecma/fouth-edition/opc-relationships.xsd", + # Word-specific files + "people.xml": "microsoft/wml-2012.xsd", + "commentsIds.xml": "microsoft/wml-cid-2016.xsd", + "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd", + "commentsExtended.xml": "microsoft/wml-2012.xsd", + # Chart files (common across document types) + "chart": "ISO-IEC29500-4_2016/dml-chart.xsd", + # Theme files (common across document types) + "theme": "ISO-IEC29500-4_2016/dml-main.xsd", + # Drawing and media files + "drawing": "ISO-IEC29500-4_2016/dml-main.xsd", + } + + # Unified namespace constants + MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006" + XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + + # Common OOXML namespaces used across validators + PACKAGE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/relationships" + ) + OFFICE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships" + ) + CONTENT_TYPES_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/content-types" + ) + + # Folders where we should clean ignorable namespaces + MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"} + + # All allowed OOXML namespaces (superset of all document types) + OOXML_NAMESPACES = { + "http://schemas.openxmlformats.org/officeDocument/2006/math", + "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "http://schemas.openxmlformats.org/schemaLibrary/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/chart", + "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/diagram", + "http://schemas.openxmlformats.org/drawingml/2006/picture", + 
"http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "http://schemas.openxmlformats.org/presentationml/2006/main", + "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes", + "http://www.w3.org/XML/1998/namespace", + } + + def __init__(self, unpacked_dir, original_file, verbose=False): + self.unpacked_dir = Path(unpacked_dir).resolve() + self.original_file = Path(original_file) + self.verbose = verbose + + # Set schemas directory + self.schemas_dir = Path(__file__).parent.parent.parent / "schemas" + + # Get all XML and .rels files + patterns = ["*.xml", "*.rels"] + self.xml_files = [ + f for pattern in patterns for f in self.unpacked_dir.rglob(pattern) + ] + + if not self.xml_files: + print(f"Warning: No XML files found in {self.unpacked_dir}") + + def validate(self): + """Run all validation checks and return True if all pass.""" + raise NotImplementedError("Subclasses must implement the validate method") + + def validate_xml(self): + """Validate that all XML files are well-formed.""" + errors = [] + + for xml_file in self.xml_files: + try: + # Try to parse the XML file + lxml.etree.parse(str(xml_file)) + except lxml.etree.XMLSyntaxError as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {e.lineno}: {e.msg}" + ) + except Exception as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Unexpected error: {str(e)}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} XML violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All XML files are well-formed") + return True + + def validate_namespaces(self): + """Validate that namespace prefixes in Ignorable attributes are declared.""" + errors = [] + + for xml_file in 
self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + declared = set(root.nsmap.keys()) - {None} # Exclude default namespace + + for attr_val in [ + v for k, v in root.attrib.items() if k.endswith("Ignorable") + ]: + undeclared = set(attr_val.split()) - declared + errors.extend( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Namespace '{ns}' in Ignorable but not declared" + for ns in undeclared + ) + except lxml.etree.XMLSyntaxError: + continue + + if errors: + print(f"FAILED - {len(errors)} namespace issues:") + for error in errors: + print(error) + return False + if self.verbose: + print("PASSED - All namespace prefixes properly declared") + return True + + def validate_unique_ids(self): + """Validate that specific IDs are unique according to OOXML requirements.""" + errors = [] + global_ids = {} # Track globally unique IDs across all files + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + file_ids = {} # Track IDs that must be unique within this file + + # Remove all mc:AlternateContent elements from the tree + mc_elements = root.xpath( + ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE} + ) + for elem in mc_elements: + elem.getparent().remove(elem) + + # Now check IDs in the cleaned tree + for elem in root.iter(): + # Get the element name without namespace + tag = ( + elem.tag.split("}")[-1].lower() + if "}" in elem.tag + else elem.tag.lower() + ) + + # Check if this element type has ID uniqueness requirements + if tag in self.UNIQUE_ID_REQUIREMENTS: + attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag] + + # Look for the specified attribute + id_value = None + for attr, value in elem.attrib.items(): + attr_local = ( + attr.split("}")[-1].lower() + if "}" in attr + else attr.lower() + ) + if attr_local == attr_name: + id_value = value + break + + if id_value is not None: + if scope == "global": + # Check global uniqueness + if id_value in global_ids: + prev_file, prev_line, 
prev_tag = global_ids[ + id_value + ] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> " + f"already used in {prev_file} at line {prev_line} in <{prev_tag}>" + ) + else: + global_ids[id_value] = ( + xml_file.relative_to(self.unpacked_dir), + elem.sourceline, + tag, + ) + elif scope == "file": + # Check file-level uniqueness + key = (tag, attr_name) + if key not in file_ids: + file_ids[key] = {} + + if id_value in file_ids[key]: + prev_line = file_ids[key][id_value] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> " + f"(first occurrence at line {prev_line})" + ) + else: + file_ids[key][id_value] = elem.sourceline + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} ID uniqueness violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All required IDs are unique") + return True + + def validate_file_references(self): + """ + Validate that all .rels files properly reference files and that all files are referenced. 
+ """ + errors = [] + + # Find all .rels files + rels_files = list(self.unpacked_dir.rglob("*.rels")) + + if not rels_files: + if self.verbose: + print("PASSED - No .rels files found") + return True + + # Get all files in the unpacked directory (excluding reference files) + all_files = [] + for file_path in self.unpacked_dir.rglob("*"): + if ( + file_path.is_file() + and file_path.name != "[Content_Types].xml" + and not file_path.name.endswith(".rels") + ): # This file is not referenced by .rels + all_files.append(file_path.resolve()) + + # Track all files that are referenced by any .rels file + all_referenced_files = set() + + if self.verbose: + print( + f"Found {len(rels_files)} .rels files and {len(all_files)} target files" + ) + + # Check each .rels file + for rels_file in rels_files: + try: + # Parse relationships file + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + # Get the directory where this .rels file is located + rels_dir = rels_file.parent + + # Find all relationships and their targets + referenced_files = set() + broken_refs = [] + + for rel in rels_root.findall( + ".//ns:Relationship", + namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE}, + ): + target = rel.get("Target") + if target and not target.startswith( + ("http", "mailto:") + ): # Skip external URLs + # Resolve the target path relative to the .rels file location + if rels_file.name == ".rels": + # Root .rels file - targets are relative to unpacked_dir + target_path = self.unpacked_dir / target + else: + # Other .rels files - targets are relative to their parent's parent + # e.g., word/_rels/document.xml.rels -> targets relative to word/ + base_dir = rels_dir.parent + target_path = base_dir / target + + # Normalize the path and check if it exists + try: + target_path = target_path.resolve() + if target_path.exists() and target_path.is_file(): + referenced_files.add(target_path) + all_referenced_files.add(target_path) + else: + broken_refs.append((target, rel.sourceline)) + 
except (OSError, ValueError): + broken_refs.append((target, rel.sourceline)) + + # Report broken references + if broken_refs: + rel_path = rels_file.relative_to(self.unpacked_dir) + for broken_ref, line_num in broken_refs: + errors.append( + f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}" + ) + + except Exception as e: + rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append(f" Error parsing {rel_path}: {e}") + + # Check for unreferenced files (files that exist but are not referenced anywhere) + unreferenced_files = set(all_files) - all_referenced_files + + if unreferenced_files: + for unref_file in sorted(unreferenced_files): + unref_rel_path = unref_file.relative_to(self.unpacked_dir) + errors.append(f" Unreferenced file: {unref_rel_path}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship validation errors:") + for error in errors: + print(error) + print( + "CRITICAL: These errors will cause the document to appear corrupt. " + + "Broken references MUST be fixed, " + + "and unreferenced files MUST be referenced or removed." + ) + return False + else: + if self.verbose: + print( + "PASSED - All references are valid and all files are properly referenced" + ) + return True + + def validate_all_relationship_ids(self): + """ + Validate that all r:id attributes in XML files reference existing IDs + in their corresponding .rels files, and optionally validate relationship types. 
+ """ + import lxml.etree + + errors = [] + + # Process each XML file that might contain r:id references + for xml_file in self.xml_files: + # Skip .rels files themselves + if xml_file.suffix == ".rels": + continue + + # Determine the corresponding .rels file + # For dir/file.xml, it's dir/_rels/file.xml.rels + rels_dir = xml_file.parent / "_rels" + rels_file = rels_dir / f"{xml_file.name}.rels" + + # Skip if there's no corresponding .rels file (that's okay) + if not rels_file.exists(): + continue + + try: + # Parse the .rels file to get valid relationship IDs and their types + rels_root = lxml.etree.parse(str(rels_file)).getroot() + rid_to_type = {} + + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rid = rel.get("Id") + rel_type = rel.get("Type", "") + if rid: + # Check for duplicate rIds + if rid in rid_to_type: + rels_rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append( + f" {rels_rel_path}: Line {rel.sourceline}: " + f"Duplicate relationship ID '{rid}' (IDs must be unique)" + ) + # Extract just the type name from the full URL + type_name = ( + rel_type.split("/")[-1] if "/" in rel_type else rel_type + ) + rid_to_type[rid] = type_name + + # Parse the XML file to find all r:id references + xml_root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all elements with r:id attributes + for elem in xml_root.iter(): + # Check for r:id attribute (relationship ID) + rid_attr = elem.get(f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id") + if rid_attr: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + elem_name = ( + elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + ) + + # Check if the ID exists + if rid_attr not in rid_to_type: + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references non-existent relationship '{rid_attr}' " + f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' 
if len(rid_to_type) > 5 else ''})" + ) + # Check if we have type expectations for this element + elif self.ELEMENT_RELATIONSHIP_TYPES: + expected_type = self._get_expected_relationship_type( + elem_name + ) + if expected_type: + actual_type = rid_to_type[rid_attr] + # Check if the actual type matches or contains the expected type + if expected_type not in actual_type.lower(): + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' " + f"but should point to a '{expected_type}' relationship" + ) + + except Exception as e: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + errors.append(f" Error processing {xml_rel_path}: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship ID reference errors:") + for error in errors: + print(error) + print("\nThese ID mismatches will cause the document to appear corrupt!") + return False + else: + if self.verbose: + print("PASSED - All relationship ID references are valid") + return True + + def _get_expected_relationship_type(self, element_name): + """ + Get the expected relationship type for an element. + First checks the explicit mapping, then tries pattern detection. 
+ """ + # Normalize element name to lowercase + elem_lower = element_name.lower() + + # Check explicit mapping first + if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES: + return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower] + + # Try pattern detection for common patterns + # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type + if elem_lower.endswith("id") and len(elem_lower) > 2: + # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster" + prefix = elem_lower[:-2] # Remove "id" + # Check if this might be a compound like "sldMasterId" + if prefix.endswith("master"): + return prefix.lower() + elif prefix.endswith("layout"): + return prefix.lower() + else: + # Simple case like "sldId" -> "slide" + # Common transformations + if prefix == "sld": + return "slide" + return prefix.lower() + + # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type + if elem_lower.endswith("reference") and len(elem_lower) > 9: + prefix = elem_lower[:-9] # Remove "reference" + return prefix.lower() + + return None + + def validate_content_types(self): + """Validate that all content files are properly declared in [Content_Types].xml.""" + errors = [] + + # Find [Content_Types].xml file + content_types_file = self.unpacked_dir / "[Content_Types].xml" + if not content_types_file.exists(): + print("FAILED - [Content_Types].xml file not found") + return False + + try: + # Parse and get all declared parts and extensions + root = lxml.etree.parse(str(content_types_file)).getroot() + declared_parts = set() + declared_extensions = set() + + # Get Override declarations (specific files) + for override in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override" + ): + part_name = override.get("PartName") + if part_name is not None: + declared_parts.add(part_name.lstrip("/")) + + # Get Default declarations (by extension) + for default in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default" + ): + extension = 
default.get("Extension") + if extension is not None: + declared_extensions.add(extension.lower()) + + # Root elements that require content type declaration + declarable_roots = { + "sld", + "sldLayout", + "sldMaster", + "presentation", # PowerPoint + "document", # Word + "workbook", + "worksheet", # Excel + "theme", # Common + } + + # Common media file extensions that should be declared + media_extensions = { + "png": "image/png", + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "gif": "image/gif", + "bmp": "image/bmp", + "tiff": "image/tiff", + "wmf": "image/x-wmf", + "emf": "image/x-emf", + } + + # Get all files in the unpacked directory + all_files = list(self.unpacked_dir.rglob("*")) + all_files = [f for f in all_files if f.is_file()] + + # Check all XML files for Override declarations + for xml_file in self.xml_files: + path_str = str(xml_file.relative_to(self.unpacked_dir)).replace( + "\\", "/" + ) + + # Skip non-content files + if any( + skip in path_str + for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"] + ): + continue + + try: + root_tag = lxml.etree.parse(str(xml_file)).getroot().tag + root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag + + if root_name in declarable_roots and path_str not in declared_parts: + errors.append( + f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml" + ) + + except Exception: + continue # Skip unparseable files + + # Check all non-XML files for Default extension declarations + for file_path in all_files: + # Skip XML files and metadata files (already checked above) + if file_path.suffix.lower() in {".xml", ".rels"}: + continue + if file_path.name == "[Content_Types].xml": + continue + if "_rels" in file_path.parts or "docProps" in file_path.parts: + continue + + extension = file_path.suffix.lstrip(".").lower() + if extension and extension not in declared_extensions: + # Check if it's a known media extension that should be declared + if extension in media_extensions: + 
relative_path = file_path.relative_to(self.unpacked_dir) + errors.append( + f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: ' + ) + + except Exception as e: + errors.append(f" Error parsing [Content_Types].xml: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} content type declaration errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print( + "PASSED - All content files are properly declared in [Content_Types].xml" + ) + return True + + def validate_file_against_xsd(self, xml_file, verbose=False): + """Validate a single XML file against XSD schema, comparing with original. + + Args: + xml_file: Path to XML file to validate + verbose: Enable verbose output + + Returns: + tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped) + """ + # Resolve both paths to handle symlinks + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + + # Validate current file + is_valid, current_errors = self._validate_single_file_xsd( + xml_file, unpacked_dir + ) + + if is_valid is None: + return None, set() # Skipped + elif is_valid: + return True, set() # Valid, no errors + + # Get errors from original file for this specific file + original_errors = self._get_original_file_errors(xml_file) + + # Compare with original (both are guaranteed to be sets here) + assert current_errors is not None + new_errors = current_errors - original_errors + + if new_errors: + if verbose: + relative_path = xml_file.relative_to(unpacked_dir) + print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)") + for error in list(new_errors)[:3]: + truncated = error[:250] + "..." 
if len(error) > 250 else error + print(f" - {truncated}") + return False, new_errors + else: + # All errors existed in original + if verbose: + print( + f"PASSED - No new errors (original had {len(current_errors)} errors)" + ) + return True, set() + + def validate_against_xsd(self): + """Validate XML files against XSD schemas, showing only new errors compared to original.""" + new_errors = [] + original_error_count = 0 + valid_count = 0 + skipped_count = 0 + + for xml_file in self.xml_files: + relative_path = str(xml_file.relative_to(self.unpacked_dir)) + is_valid, new_file_errors = self.validate_file_against_xsd( + xml_file, verbose=False + ) + + if is_valid is None: + skipped_count += 1 + continue + elif is_valid and not new_file_errors: + valid_count += 1 + continue + elif is_valid: + # Had errors but all existed in original + original_error_count += 1 + valid_count += 1 + continue + + # Has new errors + new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)") + for error in list(new_file_errors)[:3]: # Show first 3 errors + new_errors.append( + f" - {error[:250]}..." 
if len(error) > 250 else f" - {error}" + ) + + # Print summary + if self.verbose: + print(f"Validated {len(self.xml_files)} files:") + print(f" - Valid: {valid_count}") + print(f" - Skipped (no schema): {skipped_count}") + if original_error_count: + print(f" - With original errors (ignored): {original_error_count}") + print( + f" - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith(' ')]) or 0}" + ) + + if new_errors: + print("\nFAILED - Found NEW validation errors:") + for error in new_errors: + print(error) + return False + else: + if self.verbose: + print("\nPASSED - No new XSD validation errors introduced") + return True + + def _get_schema_path(self, xml_file): + """Determine the appropriate schema path for an XML file.""" + # Check exact filename match + if xml_file.name in self.SCHEMA_MAPPINGS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name] + + # Check .rels files + if xml_file.suffix == ".rels": + return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"] + + # Check chart files + if "charts/" in str(xml_file) and xml_file.name.startswith("chart"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"] + + # Check theme files + if "theme/" in str(xml_file) and xml_file.name.startswith("theme"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"] + + # Check if file is in a main content folder and use appropriate schema + if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name] + + return None + + def _clean_ignorable_namespaces(self, xml_doc): + """Remove attributes and elements not in allowed namespaces.""" + # Create a clean copy + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + # Remove attributes not in allowed namespaces + for elem in xml_copy.iter(): + attrs_to_remove = [] + + for attr in elem.attrib: + # Check if attribute is from a namespace other than 
allowed ones + if "{" in attr: + ns = attr.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + attrs_to_remove.append(attr) + + # Remove collected attributes + for attr in attrs_to_remove: + del elem.attrib[attr] + + # Remove elements not in allowed namespaces + self._remove_ignorable_elements(xml_copy) + + return lxml.etree.ElementTree(xml_copy) + + def _remove_ignorable_elements(self, root): + """Recursively remove all elements not in allowed namespaces.""" + elements_to_remove = [] + + # Find elements to remove + for elem in list(root): + # Skip non-element nodes (comments, processing instructions, etc.) + if not hasattr(elem, "tag") or callable(elem.tag): + continue + + tag_str = str(elem.tag) + if tag_str.startswith("{"): + ns = tag_str.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + elements_to_remove.append(elem) + continue + + # Recursively clean child elements + self._remove_ignorable_elements(elem) + + # Remove collected elements + for elem in elements_to_remove: + root.remove(elem) + + def _preprocess_for_mc_ignorable(self, xml_doc): + """Preprocess XML to handle mc:Ignorable attribute properly.""" + # Remove mc:Ignorable attributes before validation + root = xml_doc.getroot() + + # Remove mc:Ignorable attribute from root + if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib: + del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"] + + return xml_doc + + def _validate_single_file_xsd(self, xml_file, base_path): + """Validate a single XML file against XSD schema. 
Returns (is_valid, errors_set).""" + schema_path = self._get_schema_path(xml_file) + if not schema_path: + return None, None # Skip file + + try: + # Load schema + with open(schema_path, "rb") as xsd_file: + parser = lxml.etree.XMLParser() + xsd_doc = lxml.etree.parse( + xsd_file, parser=parser, base_url=str(schema_path) + ) + schema = lxml.etree.XMLSchema(xsd_doc) + + # Load and preprocess XML + with open(xml_file, "r") as f: + xml_doc = lxml.etree.parse(f) + + xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc) + xml_doc = self._preprocess_for_mc_ignorable(xml_doc) + + # Clean ignorable namespaces if needed + relative_path = xml_file.relative_to(base_path) + if ( + relative_path.parts + and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS + ): + xml_doc = self._clean_ignorable_namespaces(xml_doc) + + # Validate + if schema.validate(xml_doc): + return True, set() + else: + errors = set() + for error in schema.error_log: + # Store normalized error message (without line numbers for comparison) + errors.add(error.message) + return False, errors + + except Exception as e: + return False, {str(e)} + + def _get_original_file_errors(self, xml_file): + """Get XSD validation errors from a single file in the original document. 
+ + Args: + xml_file: Path to the XML file in unpacked_dir to check + + Returns: + set: Set of error messages from the original file + """ + import tempfile + import zipfile + + # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS) + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + relative_path = xml_file.relative_to(unpacked_dir) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Extract original file + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_path) + + # Find corresponding file in original + original_xml_file = temp_path / relative_path + + if not original_xml_file.exists(): + # File didn't exist in original, so no original errors + return set() + + # Validate the specific file in original + is_valid, errors = self._validate_single_file_xsd( + original_xml_file, temp_path + ) + return errors if errors else set() + + def _remove_template_tags_from_text_nodes(self, xml_doc): + """Remove template tags from XML text nodes and collect warnings. + + Template tags follow the pattern {{ ... }} and are used as placeholders + for content replacement. They should be removed from text content before + XSD validation while preserving XML structure. 
+ + Returns: + tuple: (cleaned_xml_doc, warnings_list) + """ + warnings = [] + template_pattern = re.compile(r"\{\{[^}]*\}\}") + + # Create a copy of the document to avoid modifying the original + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + def process_text_content(text, content_type): + if not text: + return text + matches = list(template_pattern.finditer(text)) + if matches: + for match in matches: + warnings.append( + f"Found template tag in {content_type}: {match.group()}" + ) + return template_pattern.sub("", text) + return text + + # Process all text nodes in the document + for elem in xml_copy.iter(): + # Skip processing if this is a w:t element + if not hasattr(elem, "tag") or callable(elem.tag): + continue + tag_str = str(elem.tag) + if tag_str.endswith("}t") or tag_str == "t": + continue + + elem.text = process_text_content(elem.text, "text content") + elem.tail = process_text_content(elem.tail, "tail content") + + return lxml.etree.ElementTree(xml_copy), warnings + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/deploy/data/skills/docx/openxml/scripts/validation/docx.py b/deploy/data/skills/docx/openxml/scripts/validation/docx.py new file mode 100644 index 000000000..602c47087 --- /dev/null +++ b/deploy/data/skills/docx/openxml/scripts/validation/docx.py @@ -0,0 +1,274 @@ +""" +Validator for Word document XML files against XSD schemas. 
+""" + +import re +import tempfile +import zipfile + +import lxml.etree + +from .base import BaseSchemaValidator + + +class DOCXSchemaValidator(BaseSchemaValidator): + """Validator for Word document XML files against XSD schemas.""" + + # Word-specific namespace + WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + # Word-specific element to relationship type mappings + # Start with empty mapping - add specific cases as we discover them + ELEMENT_RELATIONSHIP_TYPES = {} + + def validate(self): + """Run all validation checks and return True if all pass.""" + # Test 0: XML well-formedness + if not self.validate_xml(): + return False + + # Test 1: Namespace declarations + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + # Test 2: Unique IDs + if not self.validate_unique_ids(): + all_valid = False + + # Test 3: Relationship and file reference validation + if not self.validate_file_references(): + all_valid = False + + # Test 4: Content type declarations + if not self.validate_content_types(): + all_valid = False + + # Test 5: XSD schema validation + if not self.validate_against_xsd(): + all_valid = False + + # Test 6: Whitespace preservation + if not self.validate_whitespace_preservation(): + all_valid = False + + # Test 7: Deletion validation + if not self.validate_deletions(): + all_valid = False + + # Test 8: Insertion validation + if not self.validate_insertions(): + all_valid = False + + # Test 9: Relationship ID reference validation + if not self.validate_all_relationship_ids(): + all_valid = False + + # Count and compare paragraphs + self.compare_paragraph_counts() + + return all_valid + + def validate_whitespace_preservation(self): + """ + Validate that w:t elements with whitespace have xml:space='preserve'. 
+ """ + errors = [] + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all w:t elements + for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"): + if elem.text: + text = elem.text + # Check if text starts or ends with whitespace + if re.match(r"^\s.*", text) or re.match(r".*\s$", text): + # Check if xml:space="preserve" attribute exists + xml_space_attr = f"{{{self.XML_NAMESPACE}}}space" + if ( + xml_space_attr not in elem.attrib + or elem.attrib[xml_space_attr] != "preserve" + ): + # Show a preview of the text + text_preview = ( + repr(text)[:50] + "..." + if len(repr(text)) > 50 + else repr(text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} whitespace preservation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All whitespace is properly preserved") + return True + + def validate_deletions(self): + """ + Validate that w:t elements are not within w:del elements. + For some reason, XSD validation does not catch this, so we do it manually. 
+ """ + errors = [] + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all w:t elements that are descendants of w:del elements + namespaces = {"w": self.WORD_2006_NAMESPACE} + xpath_expression = ".//w:del//w:t" + problematic_t_elements = root.xpath( + xpath_expression, namespaces=namespaces + ) + for t_elem in problematic_t_elements: + if t_elem.text: + # Show a preview of the text + text_preview = ( + repr(t_elem.text)[:50] + "..." + if len(repr(t_elem.text)) > 50 + else repr(t_elem.text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {t_elem.sourceline}: found within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} deletion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:t elements found within w:del elements") + return True + + def count_paragraphs_in_unpacked(self): + """Count the number of paragraphs in the unpacked document.""" + count = 0 + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + # Count all w:p elements + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + except Exception as e: + print(f"Error counting paragraphs in unpacked document: {e}") + + return count + + def count_paragraphs_in_original(self): + """Count the number of paragraphs in the original docx file.""" + count = 0 + + try: + # Create temporary directory to unpack original + with tempfile.TemporaryDirectory() as temp_dir: + # Unpack original docx + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + 
zip_ref.extractall(temp_dir) + + # Parse document.xml + doc_xml_path = temp_dir + "/word/document.xml" + root = lxml.etree.parse(doc_xml_path).getroot() + + # Count all w:p elements + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + + except Exception as e: + print(f"Error counting paragraphs in original document: {e}") + + return count + + def validate_insertions(self): + """ + Validate that w:delText elements are not within w:ins elements. + w:delText is only allowed in w:ins if nested within a w:del. + """ + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + # Find w:delText in w:ins that are NOT within w:del + invalid_elements = root.xpath( + ".//w:ins//w:delText[not(ancestor::w:del)]", + namespaces=namespaces + ) + + for elem in invalid_elements: + text_preview = ( + repr(elem.text or "")[:50] + "..." 
+ if len(repr(elem.text or "")) > 50 + else repr(elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} insertion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:delText elements within w:ins elements") + return True + + def compare_paragraph_counts(self): + """Compare paragraph counts between original and new document.""" + original_count = self.count_paragraphs_in_original() + new_count = self.count_paragraphs_in_unpacked() + + diff = new_count - original_count + diff_str = f"+{diff}" if diff > 0 else str(diff) + print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})") + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/deploy/data/skills/docx/openxml/scripts/validation/pptx.py b/deploy/data/skills/docx/openxml/scripts/validation/pptx.py new file mode 100644 index 000000000..66d5b1e2d --- /dev/null +++ b/deploy/data/skills/docx/openxml/scripts/validation/pptx.py @@ -0,0 +1,315 @@ +""" +Validator for PowerPoint presentation XML files against XSD schemas. 
+""" + +import re + +from .base import BaseSchemaValidator + + +class PPTXSchemaValidator(BaseSchemaValidator): + """Validator for PowerPoint presentation XML files against XSD schemas.""" + + # PowerPoint presentation namespace + PRESENTATIONML_NAMESPACE = ( + "http://schemas.openxmlformats.org/presentationml/2006/main" + ) + + # PowerPoint-specific element to relationship type mappings + ELEMENT_RELATIONSHIP_TYPES = { + "sldid": "slide", + "sldmasterid": "slidemaster", + "notesmasterid": "notesmaster", + "sldlayoutid": "slidelayout", + "themeid": "theme", + "tablestyleid": "tablestyles", + } + + def validate(self): + """Run all validation checks and return True if all pass.""" + # Test 0: XML well-formedness + if not self.validate_xml(): + return False + + # Test 1: Namespace declarations + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + # Test 2: Unique IDs + if not self.validate_unique_ids(): + all_valid = False + + # Test 3: UUID ID validation + if not self.validate_uuid_ids(): + all_valid = False + + # Test 4: Relationship and file reference validation + if not self.validate_file_references(): + all_valid = False + + # Test 5: Slide layout ID validation + if not self.validate_slide_layout_ids(): + all_valid = False + + # Test 6: Content type declarations + if not self.validate_content_types(): + all_valid = False + + # Test 7: XSD schema validation + if not self.validate_against_xsd(): + all_valid = False + + # Test 8: Notes slide reference validation + if not self.validate_notes_slide_references(): + all_valid = False + + # Test 9: Relationship ID reference validation + if not self.validate_all_relationship_ids(): + all_valid = False + + # Test 10: Duplicate slide layout references validation + if not self.validate_no_duplicate_slide_layouts(): + all_valid = False + + return all_valid + + def validate_uuid_ids(self): + """Validate that ID attributes that look like UUIDs contain only hex values.""" + import lxml.etree + + errors 
= [] + # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens + uuid_pattern = re.compile( + r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$" + ) + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Check all elements for ID attributes + for elem in root.iter(): + for attr, value in elem.attrib.items(): + # Check if this is an ID attribute + attr_name = attr.split("}")[-1].lower() + if attr_name == "id" or attr_name.endswith("id"): + # Check if value looks like a UUID (has the right length and pattern structure) + if self._looks_like_uuid(value): + # Validate that it contains only hex characters in the right positions + if not uuid_pattern.match(value): + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} UUID ID validation errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All UUID-like IDs contain valid hex values") + return True + + def _looks_like_uuid(self, value): + """Check if a value has the general structure of a UUID.""" + # Remove common UUID delimiters + clean_value = value.strip("{}()").replace("-", "") + # Check if it's 32 hex-like characters (could include invalid hex chars) + return len(clean_value) == 32 and all(c.isalnum() for c in clean_value) + + def validate_slide_layout_ids(self): + """Validate that sldLayoutId elements in slide masters reference valid slide layouts.""" + import lxml.etree + + errors = [] + + # Find all slide master files + slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml")) + + if not slide_masters: + if self.verbose: + print("PASSED - No 
slide masters found") + return True + + for slide_master in slide_masters: + try: + # Parse the slide master file + root = lxml.etree.parse(str(slide_master)).getroot() + + # Find the corresponding _rels file for this slide master + rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels" + + if not rels_file.exists(): + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}" + ) + continue + + # Parse the relationships file + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + # Build a set of valid relationship IDs that point to slide layouts + valid_layout_rids = set() + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "slideLayout" in rel_type: + valid_layout_rids.add(rel.get("Id")) + + # Find all sldLayoutId elements in the slide master + for sld_layout_id in root.findall( + f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId" + ): + r_id = sld_layout_id.get( + f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id" + ) + layout_id = sld_layout_id.get("id") + + if r_id and r_id not in valid_layout_rids: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' " + f"references r:id='{r_id}' which is not found in slide layout relationships" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} slide layout ID validation errors:") + for error in errors: + print(error) + print( + "Remove invalid references or add missing slide layouts to the relationships file." 
+ ) + return False + else: + if self.verbose: + print("PASSED - All slide layout IDs reference valid slide layouts") + return True + + def validate_no_duplicate_slide_layouts(self): + """Validate that each slide has exactly one slideLayout reference.""" + import lxml.etree + + errors = [] + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + # Find all slideLayout relationships + layout_rels = [ + rel + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ) + if "slideLayout" in rel.get("Type", "") + ] + + if len(layout_rels) > 1: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references" + ) + + except Exception as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print("FAILED - Found slides with duplicate slideLayout references:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All slides have exactly one slideLayout reference") + return True + + def validate_notes_slide_references(self): + """Validate that each notesSlide file is referenced by only one slide.""" + import lxml.etree + + errors = [] + notes_slide_references = {} # Track which slides reference each notesSlide + + # Find all slide relationship files + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + if not slide_rels_files: + if self.verbose: + print("PASSED - No slide relationship files found") + return True + + for rels_file in slide_rels_files: + try: + # Parse the relationships file + root = lxml.etree.parse(str(rels_file)).getroot() + + # Find all notesSlide relationships + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "notesSlide" in rel_type: + target = 
rel.get("Target", "") + if target: + # Normalize the target path to handle relative paths + normalized_target = target.replace("../", "") + + # Track which slide references this notesSlide + slide_name = rels_file.stem.replace( + ".xml", "" + ) # e.g., "slide1" + + if normalized_target not in notes_slide_references: + notes_slide_references[normalized_target] = [] + notes_slide_references[normalized_target].append( + (slide_name, rels_file) + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + # Check for duplicate references + for target, references in notes_slide_references.items(): + if len(references) > 1: + slide_names = [ref[0] for ref in references] + errors.append( + f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}" + ) + for slide_name, rels_file in references: + errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}") + + if errors: + print( + f"FAILED - Found {len([e for e in errors if not e.startswith(' ')])} notes slide reference validation errors:" + ) + for error in errors: + print(error) + print("Each slide may optionally have its own slide file.") + return False + else: + if self.verbose: + print("PASSED - All notes slide references are unique") + return True + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/deploy/data/skills/docx/openxml/scripts/validation/redlining.py b/deploy/data/skills/docx/openxml/scripts/validation/redlining.py new file mode 100644 index 000000000..7ed425edf --- /dev/null +++ b/deploy/data/skills/docx/openxml/scripts/validation/redlining.py @@ -0,0 +1,279 @@ +""" +Validator for tracked changes in Word documents. 
+""" + +import subprocess +import tempfile +import zipfile +from pathlib import Path + + +class RedliningValidator: + """Validator for tracked changes in Word documents.""" + + def __init__(self, unpacked_dir, original_docx, verbose=False): + self.unpacked_dir = Path(unpacked_dir) + self.original_docx = Path(original_docx) + self.verbose = verbose + self.namespaces = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + } + + def validate(self): + """Main validation method that returns True if valid, False otherwise.""" + # Verify unpacked directory exists and has correct structure + modified_file = self.unpacked_dir / "word" / "document.xml" + if not modified_file.exists(): + print(f"FAILED - Modified document.xml not found at {modified_file}") + return False + + # First, check if there are any tracked changes by Claude to validate + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(modified_file) + root = tree.getroot() + + # Check for w:del or w:ins tags authored by Claude + del_elements = root.findall(".//w:del", self.namespaces) + ins_elements = root.findall(".//w:ins", self.namespaces) + + # Filter to only include changes by Claude + claude_del_elements = [ + elem + for elem in del_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude" + ] + claude_ins_elements = [ + elem + for elem in ins_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude" + ] + + # Redlining validation is only needed if tracked changes by Claude have been used. 
+ if not claude_del_elements and not claude_ins_elements: + if self.verbose: + print("PASSED - No tracked changes by Claude found.") + return True + + except Exception: + # If we can't parse the XML, continue with full validation + pass + + # Create temporary directory for unpacking original docx + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Unpack original docx + try: + with zipfile.ZipFile(self.original_docx, "r") as zip_ref: + zip_ref.extractall(temp_path) + except Exception as e: + print(f"FAILED - Error unpacking original docx: {e}") + return False + + original_file = temp_path / "word" / "document.xml" + if not original_file.exists(): + print( + f"FAILED - Original document.xml not found in {self.original_docx}" + ) + return False + + # Parse both XML files using xml.etree.ElementTree for redlining validation + try: + import xml.etree.ElementTree as ET + + modified_tree = ET.parse(modified_file) + modified_root = modified_tree.getroot() + original_tree = ET.parse(original_file) + original_root = original_tree.getroot() + except ET.ParseError as e: + print(f"FAILED - Error parsing XML files: {e}") + return False + + # Remove Claude's tracked changes from both documents + self._remove_claude_tracked_changes(original_root) + self._remove_claude_tracked_changes(modified_root) + + # Extract and compare text content + modified_text = self._extract_text_content(modified_root) + original_text = self._extract_text_content(original_root) + + if modified_text != original_text: + # Show detailed character-level differences for each paragraph + error_message = self._generate_detailed_diff( + original_text, modified_text + ) + print(error_message) + return False + + if self.verbose: + print("PASSED - All changes by Claude are properly tracked") + return True + + def _generate_detailed_diff(self, original_text, modified_text): + """Generate detailed word-level differences using git word diff.""" + error_parts = [ + "FAILED - Document 
text doesn't match after removing Claude's tracked changes", + "", + "Likely causes:", + " 1. Modified text inside another author's or tags", + " 2. Made edits without proper tracked changes", + " 3. Didn't nest inside when deleting another's insertion", + "", + "For pre-redlined documents, use correct patterns:", + " - To reject another's INSERTION: Nest inside their ", + " - To restore another's DELETION: Add new AFTER their ", + "", + ] + + # Show git word diff + git_diff = self._get_git_word_diff(original_text, modified_text) + if git_diff: + error_parts.extend(["Differences:", "============", git_diff]) + else: + error_parts.append("Unable to generate word diff (git not available)") + + return "\n".join(error_parts) + + def _get_git_word_diff(self, original_text, modified_text): + """Generate word diff using git with character-level precision.""" + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create two files + original_file = temp_path / "original.txt" + modified_file = temp_path / "modified.txt" + + original_file.write_text(original_text, encoding="utf-8") + modified_file.write_text(modified_text, encoding="utf-8") + + # Try character-level diff first for precise differences + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "--word-diff-regex=.", # Character-by-character diff + "-U0", # Zero lines of context - show only changed lines + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + # Clean up the output - remove git diff header lines + lines = result.stdout.split("\n") + # Skip the header lines (diff --git, index, +++, ---, @@) + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + + if content_lines: + return "\n".join(content_lines) + + # Fallback to word-level diff if 
character-level is too verbose + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "-U0", # Zero lines of context + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + return "\n".join(content_lines) + + except (subprocess.CalledProcessError, FileNotFoundError, Exception): + # Git not available or other error, return None to use fallback + pass + + return None + + def _remove_claude_tracked_changes(self, root): + """Remove tracked changes authored by Claude from the XML root.""" + ins_tag = f"{{{self.namespaces['w']}}}ins" + del_tag = f"{{{self.namespaces['w']}}}del" + author_attr = f"{{{self.namespaces['w']}}}author" + + # Remove w:ins elements + for parent in root.iter(): + to_remove = [] + for child in parent: + if child.tag == ins_tag and child.get(author_attr) == "Claude": + to_remove.append(child) + for elem in to_remove: + parent.remove(elem) + + # Unwrap content in w:del elements where author is "Claude" + deltext_tag = f"{{{self.namespaces['w']}}}delText" + t_tag = f"{{{self.namespaces['w']}}}t" + + for parent in root.iter(): + to_process = [] + for child in parent: + if child.tag == del_tag and child.get(author_attr) == "Claude": + to_process.append((child, list(parent).index(child))) + + # Process in reverse order to maintain indices + for del_elem, del_index in reversed(to_process): + # Convert w:delText to w:t before moving + for elem in del_elem.iter(): + if elem.tag == deltext_tag: + elem.tag = t_tag + + # Move all children of w:del to its parent before removing w:del + for child in reversed(list(del_elem)): + parent.insert(del_index, child) + parent.remove(del_elem) + + def _extract_text_content(self, root): + """Extract text 
content from Word XML, preserving paragraph structure. + + Empty paragraphs are skipped to avoid false positives when tracked + insertions add only structural elements without text content. + """ + p_tag = f"{{{self.namespaces['w']}}}p" + t_tag = f"{{{self.namespaces['w']}}}t" + + paragraphs = [] + for p_elem in root.findall(f".//{p_tag}"): + # Get all text elements within this paragraph + text_parts = [] + for t_elem in p_elem.findall(f".//{t_tag}"): + if t_elem.text: + text_parts.append(t_elem.text) + paragraph_text = "".join(text_parts) + # Skip empty paragraphs - they don't affect content validation + if paragraph_text: + paragraphs.append(paragraph_text) + + return "\n".join(paragraphs) + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/deploy/data/skills/docx/openxml/scripts/verify.py b/deploy/data/skills/docx/openxml/scripts/verify.py new file mode 100644 index 000000000..eee7986ea --- /dev/null +++ b/deploy/data/skills/docx/openxml/scripts/verify.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +""" +Command line tool to verify Office document XML files against XSD schemas and tracked changes. 
+ +Usage: + python verify.py --original +""" + +import argparse +import sys +from pathlib import Path + +from validation import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + + +def main(): + parser = argparse.ArgumentParser(description="Verify Office document XML files") + parser.add_argument( + "unpacked_dir", + help="Path to unpacked Office document directory", + ) + parser.add_argument( + "--original", + required=True, + help="Path to original file (.docx/.pptx/.xlsx)", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable verbose output", + ) + args = parser.parse_args() + + # Validate paths + unpacked_dir = Path(args.unpacked_dir) + original_file = Path(args.original) + file_extension = original_file.suffix.lower() + assert unpacked_dir.is_dir(), f"Error: {unpacked_dir} is not a directory" + assert original_file.is_file(), f"Error: {original_file} is not a file" + assert file_extension in [".docx", ".pptx", ".xlsx"], ( + f"Error: {original_file} must be a .docx, .pptx, or .xlsx file" + ) + + # Run validations + match file_extension: + case ".docx": + validators = [DOCXSchemaValidator, RedliningValidator] + case ".pptx": + validators = [PPTXSchemaValidator] + case _: + print(f"Error: Validation not supported for file type {file_extension}") + sys.exit(1) + + # Run validators + success = True + for V in validators: + validator = V(unpacked_dir, original_file, verbose=args.verbose) + if not validator.validate(): + success = False + + if success: + print("All validations PASSED!") + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/deploy/data/skills/docx/scripts/__init__.py b/deploy/data/skills/docx/scripts/__init__.py new file mode 100644 index 000000000..bf9c56272 --- /dev/null +++ b/deploy/data/skills/docx/scripts/__init__.py @@ -0,0 +1 @@ +# Make scripts directory a package for relative imports in tests diff --git a/deploy/data/skills/docx/scripts/tpl/comments.xml 
b/deploy/data/skills/docx/scripts/tpl/comments.xml new file mode 100644 index 000000000..b5dace0ef --- /dev/null +++ b/deploy/data/skills/docx/scripts/tpl/comments.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/deploy/data/skills/docx/scripts/tpl/commentsExtended.xml b/deploy/data/skills/docx/scripts/tpl/commentsExtended.xml new file mode 100644 index 000000000..b4cf23e35 --- /dev/null +++ b/deploy/data/skills/docx/scripts/tpl/commentsExtended.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/deploy/data/skills/docx/scripts/tpl/commentsExtensible.xml b/deploy/data/skills/docx/scripts/tpl/commentsExtensible.xml new file mode 100644 index 000000000..e32a05e0c --- /dev/null +++ b/deploy/data/skills/docx/scripts/tpl/commentsExtensible.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/deploy/data/skills/docx/scripts/tpl/commentsIds.xml b/deploy/data/skills/docx/scripts/tpl/commentsIds.xml new file mode 100644 index 000000000..d04bc8e06 --- /dev/null +++ b/deploy/data/skills/docx/scripts/tpl/commentsIds.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/deploy/data/skills/docx/scripts/tpl/people.xml b/deploy/data/skills/docx/scripts/tpl/people.xml new file mode 100644 index 000000000..a839cafeb --- /dev/null +++ b/deploy/data/skills/docx/scripts/tpl/people.xml @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/deploy/data/skills/docx/scripts/wordfile.py b/deploy/data/skills/docx/scripts/wordfile.py new file mode 100644 index 000000000..be9ba110b --- /dev/null +++ b/deploy/data/skills/docx/scripts/wordfile.py @@ -0,0 +1,1276 @@ +#!/usr/bin/env python3 +""" +Library for working with Word documents: comments, tracked changes, and editing. 
+ +Usage: + from skills.docx-v2.scripts.wordfile import WordFile + + # Initialize + doc = WordFile('workspace/unpacked') + doc = WordFile('workspace/unpacked', author="John Doe", initials="JD") + + # Find nodes + node = doc["word/document.xml"].locate_element(tag="w:del", attrs={"w:id": "1"}) + node = doc["word/document.xml"].locate_element(tag="w:p", line_number=10) + + # Add comments + doc.insert_comment(start=node, end=node, text="Comment text") + doc.respond_to_comment(parent_comment_id=0, text="Reply text") + + # Suggest tracked changes + doc["word/document.xml"].mark_for_deletion(node) # Delete content + doc["word/document.xml"].undo_insertion(ins_node) # Reject insertion + doc["word/document.xml"].undo_deletion(del_node) # Reject deletion + + # Save + doc.persist() +""" + +import html +import random +import shutil +import tempfile +from datetime import datetime, timezone +from pathlib import Path + +from defusedxml import minidom +from openxml.scripts.assemble import assemble_document +from openxml.scripts.validation.docx import DOCXSchemaValidator +from openxml.scripts.validation.redlining import RedliningValidator + +from .xml_helper import XMLProcessor + +# Path to template files +TEMPLATE_DIR = Path(__file__).parent / "tpl" + + +class WordXMLProcessor(XMLProcessor): + """XMLProcessor that automatically applies RSID, author, and date to new elements. + + Automatically adds attributes to elements that support them when inserting new content: + - w:rsidR, w:rsidRDefault, w:rsidP (for w:p and w:r elements) + - w:author and w:date (for w:ins, w:del, w:comment elements) + - w:id (for w:ins and w:del elements) + + Attributes: + dom (defusedxml.minidom.Document): The DOM document for direct manipulation + """ + + def __init__( + self, xml_path, rsid: str, author: str = "Claude", initials: str = "C" + ): + """Initialize with required RSID and optional author. 
+ + Args: + xml_path: Path to XML file to edit + rsid: RSID to automatically apply to new elements + author: Author name for tracked changes and comments (default: "Claude") + initials: Author initials (default: "C") + """ + super().__init__(xml_path) + self.rsid = rsid + self.author = author + self.initials = initials + + def _get_next_change_id(self): + """Get the next available change ID by checking all tracked change elements.""" + max_id = -1 + for tag in ("w:ins", "w:del"): + elements = self.dom.getElementsByTagName(tag) + for elem in elements: + change_id = elem.getAttribute("w:id") + if change_id: + try: + max_id = max(max_id, int(change_id)) + except ValueError: + pass + return max_id + 1 + + def _ensure_w16du_namespace(self): + """Ensure w16du namespace is declared on the root element.""" + root = self.dom.documentElement + if not root.hasAttribute("xmlns:w16du"): # type: ignore + root.setAttribute( # type: ignore + "xmlns:w16du", + "http://schemas.microsoft.com/office/word/2023/wordml/word16du", + ) + + def _ensure_w16cex_namespace(self): + """Ensure w16cex namespace is declared on the root element.""" + root = self.dom.documentElement + if not root.hasAttribute("xmlns:w16cex"): # type: ignore + root.setAttribute( # type: ignore + "xmlns:w16cex", + "http://schemas.microsoft.com/office/word/2018/wordml/cex", + ) + + def _ensure_w14_namespace(self): + """Ensure w14 namespace is declared on the root element.""" + root = self.dom.documentElement + if not root.hasAttribute("xmlns:w14"): # type: ignore + root.setAttribute( # type: ignore + "xmlns:w14", + "http://schemas.microsoft.com/office/word/2010/wordml", + ) + + def _inject_attributes_to_nodes(self, nodes): + """Inject RSID, author, and date attributes into DOM nodes where applicable. 
+ + Adds attributes to elements that support them: + - w:r: gets w:rsidR (or w:rsidDel if inside w:del) + - w:p: gets w:rsidR, w:rsidRDefault, w:rsidP, w14:paraId, w14:textId + - w:t: gets xml:space="preserve" if text has leading/trailing whitespace + - w:ins, w:del: get w:id, w:author, w:date, w16du:dateUtc + - w:comment: gets w:author, w:date, w:initials + - w16cex:commentExtensible: gets w16cex:dateUtc + + Args: + nodes: List of DOM nodes to process + """ + from datetime import datetime, timezone + + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + def is_inside_deletion(elem): + """Check if element is inside a w:del element.""" + parent = elem.parentNode + while parent: + if parent.nodeType == parent.ELEMENT_NODE and parent.tagName == "w:del": + return True + parent = parent.parentNode + return False + + def add_rsid_to_p(elem): + if not elem.hasAttribute("w:rsidR"): + elem.setAttribute("w:rsidR", self.rsid) + if not elem.hasAttribute("w:rsidRDefault"): + elem.setAttribute("w:rsidRDefault", self.rsid) + if not elem.hasAttribute("w:rsidP"): + elem.setAttribute("w:rsidP", self.rsid) + # Add w14:paraId and w14:textId if not present + if not elem.hasAttribute("w14:paraId"): + self._ensure_w14_namespace() + elem.setAttribute("w14:paraId", _generate_hex_id()) + if not elem.hasAttribute("w14:textId"): + self._ensure_w14_namespace() + elem.setAttribute("w14:textId", _generate_hex_id()) + + def add_rsid_to_r(elem): + # Use w:rsidDel for inside , otherwise w:rsidR + if is_inside_deletion(elem): + if not elem.hasAttribute("w:rsidDel"): + elem.setAttribute("w:rsidDel", self.rsid) + else: + if not elem.hasAttribute("w:rsidR"): + elem.setAttribute("w:rsidR", self.rsid) + + def add_tracked_change_attrs(elem): + # Auto-assign w:id if not present + if not elem.hasAttribute("w:id"): + elem.setAttribute("w:id", str(self._get_next_change_id())) + if not elem.hasAttribute("w:author"): + elem.setAttribute("w:author", self.author) + if not 
elem.hasAttribute("w:date"): + elem.setAttribute("w:date", timestamp) + # Add w16du:dateUtc for tracked changes (same as w:date since we generate UTC timestamps) + if elem.tagName in ("w:ins", "w:del") and not elem.hasAttribute( + "w16du:dateUtc" + ): + self._ensure_w16du_namespace() + elem.setAttribute("w16du:dateUtc", timestamp) + + def add_comment_attrs(elem): + if not elem.hasAttribute("w:author"): + elem.setAttribute("w:author", self.author) + if not elem.hasAttribute("w:date"): + elem.setAttribute("w:date", timestamp) + if not elem.hasAttribute("w:initials"): + elem.setAttribute("w:initials", self.initials) + + def add_comment_extensible_date(elem): + # Add w16cex:dateUtc for comment extensible elements + if not elem.hasAttribute("w16cex:dateUtc"): + self._ensure_w16cex_namespace() + elem.setAttribute("w16cex:dateUtc", timestamp) + + def add_xml_space_to_t(elem): + # Add xml:space="preserve" to w:t if text has leading/trailing whitespace + if ( + elem.firstChild + and elem.firstChild.nodeType == elem.firstChild.TEXT_NODE + ): + text = elem.firstChild.data + if text and (text[0].isspace() or text[-1].isspace()): + if not elem.hasAttribute("xml:space"): + elem.setAttribute("xml:space", "preserve") + + for node in nodes: + if node.nodeType != node.ELEMENT_NODE: + continue + + # Handle the node itself + if node.tagName == "w:p": + add_rsid_to_p(node) + elif node.tagName == "w:r": + add_rsid_to_r(node) + elif node.tagName == "w:t": + add_xml_space_to_t(node) + elif node.tagName in ("w:ins", "w:del"): + add_tracked_change_attrs(node) + elif node.tagName == "w:comment": + add_comment_attrs(node) + elif node.tagName == "w16cex:commentExtensible": + add_comment_extensible_date(node) + + # Process descendants (getElementsByTagName doesn't return the element itself) + for elem in node.getElementsByTagName("w:p"): + add_rsid_to_p(elem) + for elem in node.getElementsByTagName("w:r"): + add_rsid_to_r(elem) + for elem in node.getElementsByTagName("w:t"): + 
add_xml_space_to_t(elem) + for tag in ("w:ins", "w:del"): + for elem in node.getElementsByTagName(tag): + add_tracked_change_attrs(elem) + for elem in node.getElementsByTagName("w:comment"): + add_comment_attrs(elem) + for elem in node.getElementsByTagName("w16cex:commentExtensible"): + add_comment_extensible_date(elem) + + def swap_element(self, elem, new_content): + """Replace node with automatic attribute injection.""" + nodes = super().swap_element(elem, new_content) + self._inject_attributes_to_nodes(nodes) + return nodes + + def add_after(self, elem, xml_content): + """Insert after with automatic attribute injection.""" + nodes = super().add_after(elem, xml_content) + self._inject_attributes_to_nodes(nodes) + return nodes + + def add_before(self, elem, xml_content): + """Insert before with automatic attribute injection.""" + nodes = super().add_before(elem, xml_content) + self._inject_attributes_to_nodes(nodes) + return nodes + + def add_to(self, elem, xml_content): + """Append to with automatic attribute injection.""" + nodes = super().add_to(elem, xml_content) + self._inject_attributes_to_nodes(nodes) + return nodes + + def undo_insertion(self, elem): + """Reject an insertion by wrapping its content in a deletion. + + Wraps all runs inside w:ins in w:del, converting w:t to w:delText. + Can process a single w:ins element or a container element with multiple w:ins. + + Args: + elem: Element to process (w:ins, w:p, w:body, etc.) 
+ + Returns: + list: List containing the processed element(s) + + Raises: + ValueError: If the element contains no w:ins elements + + Example: + # Reject a single insertion + ins = doc["word/document.xml"].locate_element(tag="w:ins", attrs={"w:id": "5"}) + doc["word/document.xml"].undo_insertion(ins) + + # Reject all insertions in a paragraph + para = doc["word/document.xml"].locate_element(tag="w:p", line_number=42) + doc["word/document.xml"].undo_insertion(para) + """ + # Collect insertions + ins_elements = [] + if elem.tagName == "w:ins": + ins_elements.append(elem) + else: + ins_elements.extend(elem.getElementsByTagName("w:ins")) + + # Validate that there are insertions to reject + if not ins_elements: + raise ValueError( + f"undo_insertion requires w:ins elements. " + f"The provided element <{elem.tagName}> contains no insertions. " + ) + + # Process all insertions - wrap all children in w:del + for ins_elem in ins_elements: + runs = list(ins_elem.getElementsByTagName("w:r")) + if not runs: + continue + + # Create deletion wrapper + del_wrapper = self.dom.createElement("w:del") + + # Process each run + for run in runs: + # Convert w:t -> w:delText and w:rsidR -> w:rsidDel + if run.hasAttribute("w:rsidR"): + run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR")) + run.removeAttribute("w:rsidR") + elif not run.hasAttribute("w:rsidDel"): + run.setAttribute("w:rsidDel", self.rsid) + + for t_elem in list(run.getElementsByTagName("w:t")): + del_text = self.dom.createElement("w:delText") + # Copy ALL child nodes (not just firstChild) to handle entities + while t_elem.firstChild: + del_text.appendChild(t_elem.firstChild) + for i in range(t_elem.attributes.length): + attr = t_elem.attributes.item(i) + del_text.setAttribute(attr.name, attr.value) + t_elem.parentNode.replaceChild(del_text, t_elem) + + # Move all children from ins to del wrapper + while ins_elem.firstChild: + del_wrapper.appendChild(ins_elem.firstChild) + + # Add del wrapper back to ins + 
ins_elem.appendChild(del_wrapper) + + # Inject attributes to the deletion wrapper + self._inject_attributes_to_nodes([del_wrapper]) + + return [elem] + + def undo_deletion(self, elem): + """Reject a deletion by re-inserting the deleted content. + + Creates w:ins elements after each w:del, copying deleted content and + converting w:delText back to w:t. + Can process a single w:del element or a container element with multiple w:del. + + Args: + elem: Element to process (w:del, w:p, w:body, etc.) + + Returns: + list: If elem is w:del, returns [elem, new_ins]. Otherwise returns [elem]. + + Raises: + ValueError: If the element contains no w:del elements + + Example: + # Reject a single deletion - returns [w:del, w:ins] + del_elem = doc["word/document.xml"].locate_element(tag="w:del", attrs={"w:id": "3"}) + nodes = doc["word/document.xml"].undo_deletion(del_elem) + + # Reject all deletions in a paragraph - returns [para] + para = doc["word/document.xml"].locate_element(tag="w:p", line_number=42) + nodes = doc["word/document.xml"].undo_deletion(para) + """ + # Collect deletions FIRST - before we modify the DOM + del_elements = [] + is_single_del = elem.tagName == "w:del" + + if is_single_del: + del_elements.append(elem) + else: + del_elements.extend(elem.getElementsByTagName("w:del")) + + # Validate that there are deletions to reject + if not del_elements: + raise ValueError( + f"undo_deletion requires w:del elements. " + f"The provided element <{elem.tagName}> contains no deletions. 
" + ) + + # Track created insertion (only relevant if elem is a single w:del) + created_insertion = None + + # Process all deletions - create insertions that copy the deleted content + for del_elem in del_elements: + # Clone the deleted runs and convert them to insertions + runs = list(del_elem.getElementsByTagName("w:r")) + if not runs: + continue + + # Create insertion wrapper + ins_elem = self.dom.createElement("w:ins") + + for run in runs: + # Clone the run + new_run = run.cloneNode(True) + + # Convert w:delText -> w:t + for del_text in list(new_run.getElementsByTagName("w:delText")): + t_elem = self.dom.createElement("w:t") + # Copy ALL child nodes (not just firstChild) to handle entities + while del_text.firstChild: + t_elem.appendChild(del_text.firstChild) + for i in range(del_text.attributes.length): + attr = del_text.attributes.item(i) + t_elem.setAttribute(attr.name, attr.value) + del_text.parentNode.replaceChild(t_elem, del_text) + + # Update run attributes: w:rsidDel -> w:rsidR + if new_run.hasAttribute("w:rsidDel"): + new_run.setAttribute("w:rsidR", new_run.getAttribute("w:rsidDel")) + new_run.removeAttribute("w:rsidDel") + elif not new_run.hasAttribute("w:rsidR"): + new_run.setAttribute("w:rsidR", self.rsid) + + ins_elem.appendChild(new_run) + + # Insert the new insertion after the deletion + nodes = self.add_after(del_elem, ins_elem.toxml()) + + # If processing a single w:del, track the created insertion + if is_single_del and nodes: + created_insertion = nodes[0] + + # Return based on input type + if is_single_del and created_insertion: + return [elem, created_insertion] + else: + return [elem] + + @staticmethod + def wrap_paragraph_insertion(xml_content: str) -> str: + """Transform paragraph XML to add tracked change wrapping for insertion. + + Wraps runs in and adds to w:rPr in w:pPr for numbered lists. 
+ + Args: + xml_content: XML string containing a element + + Returns: + str: Transformed XML with tracked change wrapping + """ + wrapper = f'{xml_content}' + doc = minidom.parseString(wrapper) + para = doc.getElementsByTagName("w:p")[0] + + # Ensure w:pPr exists + pPr_list = para.getElementsByTagName("w:pPr") + if not pPr_list: + pPr = doc.createElement("w:pPr") + para.insertBefore( + pPr, para.firstChild + ) if para.firstChild else para.appendChild(pPr) + else: + pPr = pPr_list[0] + + # Ensure w:rPr exists in w:pPr + rPr_list = pPr.getElementsByTagName("w:rPr") + if not rPr_list: + rPr = doc.createElement("w:rPr") + pPr.appendChild(rPr) + else: + rPr = rPr_list[0] + + # Add to w:rPr + ins_marker = doc.createElement("w:ins") + rPr.insertBefore( + ins_marker, rPr.firstChild + ) if rPr.firstChild else rPr.appendChild(ins_marker) + + # Wrap all non-pPr children in + ins_wrapper = doc.createElement("w:ins") + for child in [c for c in para.childNodes if c.nodeName != "w:pPr"]: + para.removeChild(child) + ins_wrapper.appendChild(child) + para.appendChild(ins_wrapper) + + return para.toxml() + + def mark_for_deletion(self, elem): + """Mark a w:r or w:p element as deleted with tracked changes (in-place DOM manipulation). 
+ + For w:r: wraps in , converts to , preserves w:rPr + For w:p (regular): wraps content in , converts to + For w:p (numbered list): adds to w:rPr in w:pPr, wraps content in + + Args: + elem: A w:r or w:p DOM element without existing tracked changes + + Returns: + Element: The modified element + + Raises: + ValueError: If element has existing tracked changes or invalid structure + """ + if elem.nodeName == "w:r": + # Check for existing w:delText + if elem.getElementsByTagName("w:delText"): + raise ValueError("w:r element already contains w:delText") + + # Convert w:t -> w:delText + for t_elem in list(elem.getElementsByTagName("w:t")): + del_text = self.dom.createElement("w:delText") + # Copy ALL child nodes (not just firstChild) to handle entities + while t_elem.firstChild: + del_text.appendChild(t_elem.firstChild) + # Preserve attributes like xml:space + for i in range(t_elem.attributes.length): + attr = t_elem.attributes.item(i) + del_text.setAttribute(attr.name, attr.value) + t_elem.parentNode.replaceChild(del_text, t_elem) + + # Update run attributes: w:rsidR -> w:rsidDel + if elem.hasAttribute("w:rsidR"): + elem.setAttribute("w:rsidDel", elem.getAttribute("w:rsidR")) + elem.removeAttribute("w:rsidR") + elif not elem.hasAttribute("w:rsidDel"): + elem.setAttribute("w:rsidDel", self.rsid) + + # Wrap in w:del + del_wrapper = self.dom.createElement("w:del") + parent = elem.parentNode + parent.insertBefore(del_wrapper, elem) + parent.removeChild(elem) + del_wrapper.appendChild(elem) + + # Inject attributes to the deletion wrapper + self._inject_attributes_to_nodes([del_wrapper]) + + return del_wrapper + + elif elem.nodeName == "w:p": + # Check for existing tracked changes + if elem.getElementsByTagName("w:ins") or elem.getElementsByTagName("w:del"): + raise ValueError("w:p element already contains tracked changes") + + # Check if it's a numbered list item + pPr_list = elem.getElementsByTagName("w:pPr") + is_numbered = pPr_list and 
pPr_list[0].getElementsByTagName("w:numPr") + + if is_numbered: + # Add to w:rPr in w:pPr + pPr = pPr_list[0] + rPr_list = pPr.getElementsByTagName("w:rPr") + + if not rPr_list: + rPr = self.dom.createElement("w:rPr") + pPr.appendChild(rPr) + else: + rPr = rPr_list[0] + + # Add marker + del_marker = self.dom.createElement("w:del") + rPr.insertBefore( + del_marker, rPr.firstChild + ) if rPr.firstChild else rPr.appendChild(del_marker) + + # Convert w:t -> w:delText in all runs + for t_elem in list(elem.getElementsByTagName("w:t")): + del_text = self.dom.createElement("w:delText") + # Copy ALL child nodes (not just firstChild) to handle entities + while t_elem.firstChild: + del_text.appendChild(t_elem.firstChild) + # Preserve attributes like xml:space + for i in range(t_elem.attributes.length): + attr = t_elem.attributes.item(i) + del_text.setAttribute(attr.name, attr.value) + t_elem.parentNode.replaceChild(del_text, t_elem) + + # Update run attributes: w:rsidR -> w:rsidDel + for run in elem.getElementsByTagName("w:r"): + if run.hasAttribute("w:rsidR"): + run.setAttribute("w:rsidDel", run.getAttribute("w:rsidR")) + run.removeAttribute("w:rsidR") + elif not run.hasAttribute("w:rsidDel"): + run.setAttribute("w:rsidDel", self.rsid) + + # Wrap all non-pPr children in + del_wrapper = self.dom.createElement("w:del") + for child in [c for c in elem.childNodes if c.nodeName != "w:pPr"]: + elem.removeChild(child) + del_wrapper.appendChild(child) + elem.appendChild(del_wrapper) + + # Inject attributes to the deletion wrapper + self._inject_attributes_to_nodes([del_wrapper]) + + return elem + + else: + raise ValueError(f"Element must be w:r or w:p, got {elem.nodeName}") + + +def _generate_hex_id() -> str: + """Generate random 8-character hex ID for para/durable IDs. + + Values are constrained to be less than 0x7FFFFFFF per OOXML spec: + - paraId must be < 0x80000000 + - durableId must be < 0x7FFFFFFF + We use the stricter constraint (0x7FFFFFFF) for both. 
+ """ + return f"{random.randint(1, 0x7FFFFFFE):08X}" + + +def _generate_rsid() -> str: + """Generate random 8-character hex RSID.""" + return "".join(random.choices("0123456789ABCDEF", k=8)) + + +class WordFile: + """Manages comments in unpacked Word documents.""" + + def __init__( + self, + unpacked_dir, + rsid=None, + track_revisions=False, + author="Claude", + initials="C", + ): + """ + Initialize with path to unpacked Word document directory. + Automatically sets up comment infrastructure (people.xml, RSIDs). + + Args: + unpacked_dir: Path to unpacked DOCX directory (must contain word/ subdirectory) + rsid: Optional RSID to use for all comment elements. If not provided, one will be generated. + track_revisions: If True, enables track revisions in settings.xml (default: False) + author: Default author name for comments (default: "Claude") + initials: Default author initials for comments (default: "C") + """ + self.original_path = Path(unpacked_dir) + + if not self.original_path.exists() or not self.original_path.is_dir(): + raise ValueError(f"Directory not found: {unpacked_dir}") + + # Create temporary directory with subdirectories for unpacked content and baseline + self.temp_dir = tempfile.mkdtemp(prefix="wordfile_") + self.unpacked_path = Path(self.temp_dir) / "unpacked" + shutil.copytree(self.original_path, self.unpacked_path) + + # Pack original directory into temporary .docx for validation baseline (outside unpacked dir) + self.original_docx = Path(self.temp_dir) / "original.docx" + assemble_document(self.original_path, self.original_docx, validate=False) + + self.word_path = self.unpacked_path / "word" + + # Generate RSID if not provided + self.rsid = rsid if rsid else _generate_rsid() + print(f"Using RSID: {self.rsid}") + + # Set default author and initials + self.author = author + self.initials = initials + + # Cache for lazy-loaded editors + self._processors = {} + + # Comment file paths + self.comments_path = self.word_path / "comments.xml" + 
self.comments_extended_path = self.word_path / "commentsExtended.xml" + self.comments_ids_path = self.word_path / "commentsIds.xml" + self.comments_extensible_path = self.word_path / "commentsExtensible.xml" + + # Load existing comments and determine next ID (before setup modifies files) + self.existing_comments = self._load_existing_comments() + self.next_comment_id = self._get_next_comment_id() + + # Convenient access to document.xml processor (semi-private) + self._document = self["word/document.xml"] + + # Setup tracked changes infrastructure + self._setup_tracking(track_revisions=track_revisions) + + # Add author to people.xml + self._add_author_to_people(author) + + def __getitem__(self, xml_path: str) -> WordXMLProcessor: + """ + Get or create a WordXMLProcessor for the specified XML file. + + Enables lazy-loaded processors with bracket notation: + node = doc["word/document.xml"].locate_element(tag="w:p", line_number=42) + + Args: + xml_path: Relative path to XML file (e.g., "word/document.xml", "word/comments.xml") + + Returns: + WordXMLProcessor instance for the specified file + + Raises: + ValueError: If the file does not exist + + Example: + # Get node from document.xml + node = doc["word/document.xml"].locate_element(tag="w:del", attrs={"w:id": "1"}) + + # Get node from comments.xml + comment = doc["word/comments.xml"].locate_element(tag="w:comment", attrs={"w:id": "0"}) + """ + if xml_path not in self._processors: + file_path = self.unpacked_path / xml_path + if not file_path.exists(): + raise ValueError(f"XML file not found: {xml_path}") + # Use WordXMLProcessor with RSID, author, and initials for all processors + self._processors[xml_path] = WordXMLProcessor( + file_path, rsid=self.rsid, author=self.author, initials=self.initials + ) + return self._processors[xml_path] + + def insert_comment(self, start, end, text: str) -> int: + """ + Add a comment spanning from one element to another. 
+ + Args: + start: DOM element for the starting point + end: DOM element for the ending point + text: Comment content + + Returns: + The comment ID that was created + + Example: + start_node = cm.get_document_node(tag="w:del", id="1") + end_node = cm.get_document_node(tag="w:ins", id="2") + cm.insert_comment(start=start_node, end=end_node, text="Explanation") + """ + comment_id = self.next_comment_id + para_id = _generate_hex_id() + durable_id = _generate_hex_id() + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + # Add comment ranges to document.xml immediately + self._document.add_before(start, self._comment_range_start_xml(comment_id)) + + # If end node is a paragraph, append comment markup inside it + # Otherwise insert after it (for run-level anchors) + if end.tagName == "w:p": + self._document.add_to(end, self._comment_range_end_xml(comment_id)) + else: + self._document.add_after(end, self._comment_range_end_xml(comment_id)) + + # Add to comments.xml immediately + self._add_to_comments_xml( + comment_id, para_id, text, self.author, self.initials, timestamp + ) + + # Add to commentsExtended.xml immediately + self._add_to_comments_extended_xml(para_id, parent_para_id=None) + + # Add to commentsIds.xml immediately + self._add_to_comments_ids_xml(para_id, durable_id) + + # Add to commentsExtensible.xml immediately + self._add_to_comments_extensible_xml(durable_id) + + # Update existing_comments so replies work + self.existing_comments[comment_id] = {"para_id": para_id} + + self.next_comment_id += 1 + return comment_id + + def respond_to_comment( + self, + parent_comment_id: int, + text: str, + ) -> int: + """ + Add a reply to an existing comment. 
+ + Args: + parent_comment_id: The w:id of the parent comment to reply to + text: Reply text + + Returns: + The comment ID that was created for the reply + + Example: + cm.respond_to_comment(parent_comment_id=0, text="I agree with this change") + """ + if parent_comment_id not in self.existing_comments: + raise ValueError(f"Parent comment with id={parent_comment_id} not found") + + parent_info = self.existing_comments[parent_comment_id] + comment_id = self.next_comment_id + para_id = _generate_hex_id() + durable_id = _generate_hex_id() + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + # Add comment ranges to document.xml immediately + parent_start_elem = self._document.locate_element( + tag="w:commentRangeStart", attrs={"w:id": str(parent_comment_id)} + ) + parent_ref_elem = self._document.locate_element( + tag="w:commentReference", attrs={"w:id": str(parent_comment_id)} + ) + + self._document.add_after( + parent_start_elem, self._comment_range_start_xml(comment_id) + ) + parent_ref_run = parent_ref_elem.parentNode + self._document.add_after( + parent_ref_run, f'' + ) + self._document.add_after( + parent_ref_run, self._comment_ref_run_xml(comment_id) + ) + + # Add to comments.xml immediately + self._add_to_comments_xml( + comment_id, para_id, text, self.author, self.initials, timestamp + ) + + # Add to commentsExtended.xml immediately (with parent) + self._add_to_comments_extended_xml( + para_id, parent_para_id=parent_info["para_id"] + ) + + # Add to commentsIds.xml immediately + self._add_to_comments_ids_xml(para_id, durable_id) + + # Add to commentsExtensible.xml immediately + self._add_to_comments_extensible_xml(durable_id) + + # Update existing_comments so replies work + self.existing_comments[comment_id] = {"para_id": para_id} + + self.next_comment_id += 1 + return comment_id + + def __del__(self): + """Clean up temporary directory on deletion.""" + if hasattr(self, "temp_dir") and Path(self.temp_dir).exists(): + 
shutil.rmtree(self.temp_dir) + + def check_validity(self) -> None: + """ + Validate the document against XSD schema and redlining rules. + + Raises: + ValueError: If validation fails. + """ + # Create validators with current state + schema_validator = DOCXSchemaValidator( + self.unpacked_path, self.original_docx, verbose=False + ) + redlining_validator = RedliningValidator( + self.unpacked_path, self.original_docx, verbose=False + ) + + # Run validations + if not schema_validator.validate(): + raise ValueError("Schema validation failed") + if not redlining_validator.validate(): + raise ValueError("Redlining validation failed") + + def persist(self, destination=None, validate=True) -> None: + """ + Save all modified XML files to disk and copy to destination directory. + + This persists all changes made via insert_comment() and respond_to_comment(). + + Args: + destination: Optional path to save to. If None, saves back to original directory. + validate: If True, validates document before saving (default: True). 
+ """ + # Only ensure comment relationships and content types if comment files exist + if self.comments_path.exists(): + self._ensure_comment_relationships() + self._ensure_comment_content_types() + + # Save all modified XML files in temp directory + for processor in self._processors.values(): + processor.write_back() + + # Validate by default + if validate: + self.check_validity() + + # Copy contents from temp directory to destination (or original directory) + target_path = Path(destination) if destination else self.original_path + shutil.copytree(self.unpacked_path, target_path, dirs_exist_ok=True) + + # ==================== Private: Initialization ==================== + + def _get_next_comment_id(self): + """Get the next available comment ID.""" + if not self.comments_path.exists(): + return 0 + + processor = self["word/comments.xml"] + max_id = -1 + for comment_elem in processor.dom.getElementsByTagName("w:comment"): + comment_id = comment_elem.getAttribute("w:id") + if comment_id: + try: + max_id = max(max_id, int(comment_id)) + except ValueError: + pass + return max_id + 1 + + def _load_existing_comments(self): + """Load existing comments from files to enable replies.""" + if not self.comments_path.exists(): + return {} + + processor = self["word/comments.xml"] + existing = {} + + for comment_elem in processor.dom.getElementsByTagName("w:comment"): + comment_id = comment_elem.getAttribute("w:id") + if not comment_id: + continue + + # Find para_id from the w:p element within the comment + para_id = None + for p_elem in comment_elem.getElementsByTagName("w:p"): + para_id = p_elem.getAttribute("w14:paraId") + if para_id: + break + + if not para_id: + continue + + existing[int(comment_id)] = {"para_id": para_id} + + return existing + + # ==================== Private: Setup Methods ==================== + + def _setup_tracking(self, track_revisions=False): + """Set up comment infrastructure in unpacked directory. 
+ + Args: + track_revisions: If True, enables track revisions in settings.xml + """ + # Create or update word/people.xml + people_file = self.word_path / "people.xml" + self._update_people_xml(people_file) + + # Update XML files + self._add_content_type_for_people(self.unpacked_path / "[Content_Types].xml") + self._add_relationship_for_people( + self.word_path / "_rels" / "document.xml.rels" + ) + + # Always add RSID to settings.xml, optionally enable trackRevisions + self._update_settings( + self.word_path / "settings.xml", track_revisions=track_revisions + ) + + def _update_people_xml(self, path): + """Create people.xml if it doesn't exist.""" + if not path.exists(): + # Copy from template + shutil.copy(TEMPLATE_DIR / "people.xml", path) + + def _add_content_type_for_people(self, path): + """Add people.xml content type to [Content_Types].xml if not already present.""" + processor = self["[Content_Types].xml"] + + if self._has_override(processor, "/word/people.xml"): + return + + # Add Override element + root = processor.dom.documentElement + override_xml = '' + processor.add_to(root, override_xml) + + def _add_relationship_for_people(self, path): + """Add people.xml relationship to document.xml.rels if not already present.""" + processor = self["word/_rels/document.xml.rels"] + + if self._has_relationship(processor, "people.xml"): + return + + root = processor.dom.documentElement + root_tag = root.tagName # type: ignore + prefix = root_tag.split(":")[0] + ":" if ":" in root_tag else "" + next_rid = processor.get_next_relationship_id() + + # Create the relationship entry + rel_xml = f'<{prefix}Relationship Id="{next_rid}" Type="http://schemas.microsoft.com/office/2011/relationships/people" Target="people.xml"/>' + processor.add_to(root, rel_xml) + + def _update_settings(self, path, track_revisions=False): + """Add RSID and optionally enable track revisions in settings.xml. 
+ + Args: + path: Path to settings.xml + track_revisions: If True, adds trackRevisions element + + Places elements per OOXML schema order: + - trackRevisions: early (before defaultTabStop) + - rsids: late (after compat) + """ + processor = self["word/settings.xml"] + root = processor.locate_element(tag="w:settings") + prefix = root.tagName.split(":")[0] if ":" in root.tagName else "w" + + # Conditionally add trackRevisions if requested + if track_revisions: + track_revisions_exists = any( + elem.tagName == f"{prefix}:trackRevisions" + for elem in processor.dom.getElementsByTagName(f"{prefix}:trackRevisions") + ) + + if not track_revisions_exists: + track_rev_xml = f"<{prefix}:trackRevisions/>" + # Try to insert before documentProtection, defaultTabStop, or at start + inserted = False + for tag in [f"{prefix}:documentProtection", f"{prefix}:defaultTabStop"]: + elements = processor.dom.getElementsByTagName(tag) + if elements: + processor.add_before(elements[0], track_rev_xml) + inserted = True + break + if not inserted: + # Insert as first child of settings + if root.firstChild: + processor.add_before(root.firstChild, track_rev_xml) + else: + processor.add_to(root, track_rev_xml) + + # Always check if rsids section exists + rsids_elements = processor.dom.getElementsByTagName(f"{prefix}:rsids") + + if not rsids_elements: + # Add new rsids section + rsids_xml = f'''<{prefix}:rsids> + <{prefix}:rsidRoot {prefix}:val="{self.rsid}"/> + <{prefix}:rsid {prefix}:val="{self.rsid}"/> +''' + + # Try to insert after compat, before clrSchemeMapping, or before closing tag + inserted = False + compat_elements = processor.dom.getElementsByTagName(f"{prefix}:compat") + if compat_elements: + processor.add_after(compat_elements[0], rsids_xml) + inserted = True + + if not inserted: + clr_elements = processor.dom.getElementsByTagName( + f"{prefix}:clrSchemeMapping" + ) + if clr_elements: + processor.add_before(clr_elements[0], rsids_xml) + inserted = True + + if not inserted: + 
processor.add_to(root, rsids_xml) + else: + # Check if this rsid already exists + rsids_elem = rsids_elements[0] + rsid_exists = any( + elem.getAttribute(f"{prefix}:val") == self.rsid + for elem in rsids_elem.getElementsByTagName(f"{prefix}:rsid") + ) + + if not rsid_exists: + rsid_xml = f'<{prefix}:rsid {prefix}:val="{self.rsid}"/>' + processor.add_to(rsids_elem, rsid_xml) + + # ==================== Private: XML File Creation ==================== + + def _add_to_comments_xml( + self, comment_id, para_id, text, author, initials, timestamp + ): + """Add a single comment to comments.xml.""" + if not self.comments_path.exists(): + shutil.copy(TEMPLATE_DIR / "comments.xml", self.comments_path) + + processor = self["word/comments.xml"] + root = processor.locate_element(tag="w:comments") + + escaped_text = ( + text.replace("&", "&").replace("<", "<").replace(">", ">") + ) + # Note: w:rsidR, w:rsidRDefault, w:rsidP on w:p, w:rsidR on w:r, + # and w:author, w:date, w:initials on w:comment are automatically added by WordXMLProcessor + comment_xml = f''' + + + {escaped_text} + +''' + processor.add_to(root, comment_xml) + + def _add_to_comments_extended_xml(self, para_id, parent_para_id): + """Add a single comment to commentsExtended.xml.""" + if not self.comments_extended_path.exists(): + shutil.copy( + TEMPLATE_DIR / "commentsExtended.xml", self.comments_extended_path + ) + + processor = self["word/commentsExtended.xml"] + root = processor.locate_element(tag="w15:commentsEx") + + if parent_para_id: + xml = f'' + else: + xml = f'' + processor.add_to(root, xml) + + def _add_to_comments_ids_xml(self, para_id, durable_id): + """Add a single comment to commentsIds.xml.""" + if not self.comments_ids_path.exists(): + shutil.copy(TEMPLATE_DIR / "commentsIds.xml", self.comments_ids_path) + + processor = self["word/commentsIds.xml"] + root = processor.locate_element(tag="w16cid:commentsIds") + + xml = f'' + processor.add_to(root, xml) + + def _add_to_comments_extensible_xml(self, 
durable_id): + """Add a single comment to commentsExtensible.xml.""" + if not self.comments_extensible_path.exists(): + shutil.copy( + TEMPLATE_DIR / "commentsExtensible.xml", self.comments_extensible_path + ) + + processor = self["word/commentsExtensible.xml"] + root = processor.locate_element(tag="w16cex:commentsExtensible") + + xml = f'' + processor.add_to(root, xml) + + # ==================== Private: XML Fragments ==================== + + def _comment_range_start_xml(self, comment_id): + """Generate XML for comment range start.""" + return f'' + + def _comment_range_end_xml(self, comment_id): + """Generate XML for comment range end with reference run. + + Note: w:rsidR is automatically added by WordXMLProcessor. + """ + return f''' + + + +''' + + def _comment_ref_run_xml(self, comment_id): + """Generate XML for comment reference run. + + Note: w:rsidR is automatically added by WordXMLProcessor. + """ + return f''' + + +''' + + # ==================== Private: Metadata Updates ==================== + + def _has_relationship(self, processor, target): + """Check if a relationship with given target exists.""" + for rel_elem in processor.dom.getElementsByTagName("Relationship"): + if rel_elem.getAttribute("Target") == target: + return True + return False + + def _has_override(self, processor, part_name): + """Check if an override with given part name exists.""" + for override_elem in processor.dom.getElementsByTagName("Override"): + if override_elem.getAttribute("PartName") == part_name: + return True + return False + + def _has_author(self, processor, author): + """Check if an author already exists in people.xml.""" + for person_elem in processor.dom.getElementsByTagName("w15:person"): + if person_elem.getAttribute("w15:author") == author: + return True + return False + + def _add_author_to_people(self, author): + """Add author to people.xml (called during initialization).""" + people_path = self.word_path / "people.xml" + + # people.xml should already exist from 
_setup_tracking + if not people_path.exists(): + raise ValueError("people.xml should exist after _setup_tracking") + + processor = self["word/people.xml"] + root = processor.locate_element(tag="w15:people") + + # Check if author already exists + if self._has_author(processor, author): + return + + # Add author with proper XML escaping to prevent injection + escaped_author = html.escape(author, quote=True) + person_xml = f''' + +''' + processor.add_to(root, person_xml) + + def _ensure_comment_relationships(self): + """Ensure word/_rels/document.xml.rels has comment relationships.""" + processor = self["word/_rels/document.xml.rels"] + + if self._has_relationship(processor, "comments.xml"): + return + + root = processor.dom.documentElement + root_tag = root.tagName # type: ignore + prefix = root_tag.split(":")[0] + ":" if ":" in root_tag else "" + next_rid_num = int(processor.get_next_relationship_id()[3:]) + + # Add relationship elements + rels = [ + ( + next_rid_num, + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments", + "comments.xml", + ), + ( + next_rid_num + 1, + "http://schemas.microsoft.com/office/2011/relationships/commentsExtended", + "commentsExtended.xml", + ), + ( + next_rid_num + 2, + "http://schemas.microsoft.com/office/2016/09/relationships/commentsIds", + "commentsIds.xml", + ), + ( + next_rid_num + 3, + "http://schemas.microsoft.com/office/2018/08/relationships/commentsExtensible", + "commentsExtensible.xml", + ), + ] + + for rel_id, rel_type, target in rels: + rel_xml = f'<{prefix}Relationship Id="rId{rel_id}" Type="{rel_type}" Target="{target}"/>' + processor.add_to(root, rel_xml) + + def _ensure_comment_content_types(self): + """Ensure [Content_Types].xml has comment content types.""" + processor = self["[Content_Types].xml"] + + if self._has_override(processor, "/word/comments.xml"): + return + + root = processor.dom.documentElement + + # Add Override elements + overrides = [ + ( + "/word/comments.xml", + 
"application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml", + ), + ( + "/word/commentsExtended.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtended+xml", + ), + ( + "/word/commentsIds.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsIds+xml", + ), + ( + "/word/commentsExtensible.xml", + "application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtensible+xml", + ), + ] + + for part_name, content_type in overrides: + override_xml = ( + f'' + ) + processor.add_to(root, override_xml) diff --git a/deploy/data/skills/docx/scripts/xml_helper.py b/deploy/data/skills/docx/scripts/xml_helper.py new file mode 100644 index 000000000..9c25ab561 --- /dev/null +++ b/deploy/data/skills/docx/scripts/xml_helper.py @@ -0,0 +1,374 @@ +#!/usr/bin/env python3 +""" +Utilities for editing OpenXML documents. + +This module provides XMLProcessor, a tool for manipulating XML files with support for +line-number-based node finding and DOM manipulation. Each element is automatically +annotated with its original line and column position during parsing. 
+ +Example usage: + processor = XMLProcessor("document.xml") + + # Find node by line number or range + elem = processor.locate_element(tag="w:r", line_number=519) + elem = processor.locate_element(tag="w:p", line_number=range(100, 200)) + + # Find node by text content + elem = processor.locate_element(tag="w:p", contains="specific text") + + # Find node by attributes + elem = processor.locate_element(tag="w:r", attrs={"w:id": "target"}) + + # Combine filters + elem = processor.locate_element(tag="w:p", line_number=range(1, 50), contains="text") + + # Replace, insert, or manipulate + new_elem = processor.swap_element(elem, "new text") + processor.add_after(new_elem, "more") + + # Save changes + processor.write_back() +""" + +import html +from pathlib import Path +from typing import Optional, Union + +import defusedxml.minidom +import defusedxml.sax + + +class XMLProcessor: + """ + Processor for manipulating OpenXML XML files with line-number-based node finding. + + This class parses XML files and tracks the original line and column position + of each element. This enables finding nodes by their line number in the original + file, which is useful when working with Read tool output. + + Attributes: + xml_path: Path to the XML file being edited + encoding: Detected encoding of the XML file ('ascii' or 'utf-8') + dom: Parsed DOM tree with parse_position attributes on elements + """ + + def __init__(self, xml_path): + """ + Initialize with path to XML file and parse with line number tracking. 
+ + Args: + xml_path: Path to XML file to edit (str or Path) + + Raises: + ValueError: If the XML file does not exist + """ + self.xml_path = Path(xml_path) + if not self.xml_path.exists(): + raise ValueError(f"XML file not found: {xml_path}") + + with open(self.xml_path, "rb") as f: + header = f.read(200).decode("utf-8", errors="ignore") + self.encoding = "ascii" if 'encoding="ascii"' in header else "utf-8" + + parser = _create_position_tracking_parser() + self.dom = defusedxml.minidom.parse(str(self.xml_path), parser) + + def locate_element( + self, + tag: str, + attrs: Optional[dict[str, str]] = None, + line_number: Optional[Union[int, range]] = None, + contains: Optional[str] = None, + ): + """ + Get a DOM element by tag and identifier. + + Finds an element by either its line number in the original file or by + matching attribute values. Exactly one match must be found. + + Args: + tag: The XML tag name (e.g., "w:del", "w:ins", "w:r") + attrs: Dictionary of attribute name-value pairs to match (e.g., {"w:id": "1"}) + line_number: Line number (int) or line range (range) in original XML file (1-indexed) + contains: Text string that must appear in any text node within the element. + Supports both entity notation (“) and Unicode characters (\u201c). 
+ + Returns: + defusedxml.minidom.Element: The matching DOM element + + Raises: + ValueError: If node not found or multiple matches found + + Example: + elem = processor.locate_element(tag="w:r", line_number=519) + elem = processor.locate_element(tag="w:r", line_number=range(100, 200)) + elem = processor.locate_element(tag="w:del", attrs={"w:id": "1"}) + elem = processor.locate_element(tag="w:p", attrs={"w14:paraId": "12345678"}) + elem = processor.locate_element(tag="w:commentRangeStart", attrs={"w:id": "0"}) + elem = processor.locate_element(tag="w:p", contains="specific text") + elem = processor.locate_element(tag="w:t", contains="“Agreement") # Entity notation + elem = processor.locate_element(tag="w:t", contains="\u201cAgreement") # Unicode character + """ + matches = [] + for elem in self.dom.getElementsByTagName(tag): + # Check line_number filter + if line_number is not None: + parse_pos = getattr(elem, "parse_position", (None,)) + elem_line = parse_pos[0] + + # Handle both single line number and range + if isinstance(line_number, range): + if elem_line not in line_number: + continue + else: + if elem_line != line_number: + continue + + # Check attrs filter + if attrs is not None: + if not all( + elem.getAttribute(attr_name) == attr_value + for attr_name, attr_value in attrs.items() + ): + continue + + # Check contains filter + if contains is not None: + elem_text = self._extract_text(elem) + # Normalize the search string: convert HTML entities to Unicode characters + # This allows searching for both "“Rowan" and ""Rowan" + normalized_contains = html.unescape(contains) + if normalized_contains not in elem_text: + continue + + # If all applicable filters passed, this is a match + matches.append(elem) + + if not matches: + # Build descriptive error message + filters = [] + if line_number is not None: + line_str = ( + f"lines {line_number.start}-{line_number.stop - 1}" + if isinstance(line_number, range) + else f"line {line_number}" + ) + filters.append(f"at 
{line_str}") + if attrs is not None: + filters.append(f"with attributes {attrs}") + if contains is not None: + filters.append(f"containing '{contains}'") + + filter_desc = " ".join(filters) if filters else "" + base_msg = f"Node not found: <{tag}> {filter_desc}".strip() + + # Add helpful hint based on filters used + if contains: + hint = "Text may be split across elements or use different wording." + elif line_number: + hint = "Line numbers may have changed if document was modified." + elif attrs: + hint = "Verify attribute values are correct." + else: + hint = "Try adding filters (attrs, line_number, or contains)." + + raise ValueError(f"{base_msg}. {hint}") + if len(matches) > 1: + raise ValueError( + f"Multiple nodes found: <{tag}>. " + f"Add more filters (attrs, line_number, or contains) to narrow the search." + ) + return matches[0] + + def _extract_text(self, elem): + """ + Recursively extract all text content from an element. + + Skips text nodes that contain only whitespace (spaces, tabs, newlines), + which typically represent XML formatting rather than document content. + + Args: + elem: defusedxml.minidom.Element to extract text from + + Returns: + str: Concatenated text from all non-whitespace text nodes within the element + """ + text_parts = [] + for node in elem.childNodes: + if node.nodeType == node.TEXT_NODE: + # Skip whitespace-only text nodes (XML formatting) + if node.data.strip(): + text_parts.append(node.data) + elif node.nodeType == node.ELEMENT_NODE: + text_parts.append(self._extract_text(node)) + return "".join(text_parts) + + def swap_element(self, elem, new_content): + """ + Replace a DOM element with new XML content. 
+ + Args: + elem: defusedxml.minidom.Element to replace + new_content: String containing XML to replace the node with + + Returns: + List[defusedxml.minidom.Node]: All inserted nodes + + Example: + new_nodes = processor.swap_element(old_elem, "text") + """ + parent = elem.parentNode + nodes = self._parse_xml_fragment(new_content) + for node in nodes: + parent.insertBefore(node, elem) + parent.removeChild(elem) + return nodes + + def add_after(self, elem, xml_content): + """ + Insert XML content after a DOM element. + + Args: + elem: defusedxml.minidom.Element to insert after + xml_content: String containing XML to insert + + Returns: + List[defusedxml.minidom.Node]: All inserted nodes + + Example: + new_nodes = processor.add_after(elem, "text") + """ + parent = elem.parentNode + next_sibling = elem.nextSibling + nodes = self._parse_xml_fragment(xml_content) + for node in nodes: + if next_sibling: + parent.insertBefore(node, next_sibling) + else: + parent.appendChild(node) + return nodes + + def add_before(self, elem, xml_content): + """ + Insert XML content before a DOM element. + + Args: + elem: defusedxml.minidom.Element to insert before + xml_content: String containing XML to insert + + Returns: + List[defusedxml.minidom.Node]: All inserted nodes + + Example: + new_nodes = processor.add_before(elem, "text") + """ + parent = elem.parentNode + nodes = self._parse_xml_fragment(xml_content) + for node in nodes: + parent.insertBefore(node, elem) + return nodes + + def add_to(self, elem, xml_content): + """ + Append XML content as a child of a DOM element. 
+ + Args: + elem: defusedxml.minidom.Element to append to + xml_content: String containing XML to append + + Returns: + List[defusedxml.minidom.Node]: All inserted nodes + + Example: + new_nodes = processor.add_to(elem, "text") + """ + nodes = self._parse_xml_fragment(xml_content) + for node in nodes: + elem.appendChild(node) + return nodes + + def get_next_relationship_id(self): + """Get the next available rId for relationships files.""" + max_id = 0 + for rel_elem in self.dom.getElementsByTagName("Relationship"): + rel_id = rel_elem.getAttribute("Id") + if rel_id.startswith("rId"): + try: + max_id = max(max_id, int(rel_id[3:])) + except ValueError: + pass + return f"rId{max_id + 1}" + + def write_back(self): + """ + Save the edited XML back to the file. + + Serializes the DOM tree and writes it back to the original file path, + preserving the original encoding (ascii or utf-8). + """ + content = self.dom.toxml(encoding=self.encoding) + self.xml_path.write_bytes(content) + + def _parse_xml_fragment(self, xml_content): + """ + Parse XML fragment and return list of imported nodes. 
+
+        Args:
+            xml_content: String containing XML fragment
+
+        Returns:
+            List of defusedxml.minidom.Node objects imported into this document
+
+        Raises:
+            AssertionError: If fragment contains no element nodes
+        """
+        # Extract namespace declarations from the root document element
+        root_elem = self.dom.documentElement
+        namespaces = []
+        if root_elem and root_elem.attributes:
+            for i in range(root_elem.attributes.length):
+                attr = root_elem.attributes.item(i)
+                if attr.name.startswith("xmlns"):  # type: ignore
+                    namespaces.append(f'{attr.name}="{attr.value}"')  # type: ignore
+
+        ns_decl = " ".join(namespaces)
+        wrapper = f"<root {ns_decl}>{xml_content}</root>"
+        fragment_doc = defusedxml.minidom.parseString(wrapper)
+        nodes = [
+            self.dom.importNode(child, deep=True)
+            for child in fragment_doc.documentElement.childNodes  # type: ignore
+        ]
+        elements = [n for n in nodes if n.nodeType == n.ELEMENT_NODE]
+        assert elements, "Fragment must contain at least one element"
+        return nodes
+
+
+def _create_position_tracking_parser():
+    """
+    Create a SAX parser that tracks line and column numbers for each element.
+
+    Monkey patches the SAX content handler to store the current line and column
+    position from the underlying expat parser onto each element as a parse_position
+    attribute (line, column) tuple.
+ + Returns: + defusedxml.sax.xmlreader.XMLReader: Configured SAX parser + """ + + def set_content_handler(dom_handler): + def startElementNS(name, tagName, attrs): + orig_start_cb(name, tagName, attrs) + cur_elem = dom_handler.elementStack[-1] + cur_elem.parse_position = ( + parser._parser.CurrentLineNumber, # type: ignore + parser._parser.CurrentColumnNumber, # type: ignore + ) + + orig_start_cb = dom_handler.startElementNS + dom_handler.startElementNS = startElementNS + orig_set_content_handler(dom_handler) + + parser = defusedxml.sax.make_parser() + orig_set_content_handler = parser.setContentHandler + parser.setContentHandler = set_content_handler # type: ignore + return parser diff --git a/deploy/data/skills/docx/word-generator.md b/deploy/data/skills/docx/word-generator.md new file mode 100644 index 000000000..de5f33071 --- /dev/null +++ b/deploy/data/skills/docx/word-generator.md @@ -0,0 +1,350 @@ +# Word Document Generator Guide + +Create .docx files programmatically with JavaScript/TypeScript. + +**Important: Read this entire document before starting.** Critical formatting rules and common pitfalls are covered throughout - skipping sections may result in corrupted files or rendering issues. 
+ +## Setup +Assumes docx is already installed globally +If not installed: `npm install -g docx` + +```javascript +const { Document, Packer, Paragraph, TextRun, Table, TableRow, TableCell, ImageRun, Media, + Header, Footer, AlignmentType, PageOrientation, LevelFormat, ExternalHyperlink, + InternalHyperlink, TableOfContents, HeadingLevel, BorderStyle, WidthType, TabStopType, + TabStopPosition, UnderlineType, ShadingType, VerticalAlign, SymbolRun, PageNumber, + FootnoteReferenceRun, Footnote, PageBreak } = require('docx'); + +// Create & Save +const doc = new Document({ sections: [{ children: [/* content */] }] }); +Packer.toBuffer(doc).then(buffer => fs.writeFileSync("output.docx", buffer)); // Node.js +Packer.toBlob(doc).then(blob => { /* download logic */ }); // Browser +``` + +## Text & Formatting +```javascript +// IMPORTANT: Never use \n for line breaks - always use separate Paragraph elements +// BAD: new TextRun("Line 1\nLine 2") +// GOOD: new Paragraph({ children: [new TextRun("Line 1")] }), new Paragraph({ children: [new TextRun("Line 2")] }) + +// Basic text with all formatting options +new Paragraph({ + alignment: AlignmentType.CENTER, + spacing: { before: 200, after: 200 }, + indent: { left: 720, right: 720 }, + children: [ + new TextRun({ text: "Bold", bold: true }), + new TextRun({ text: "Italic", italics: true }), + new TextRun({ text: "Underlined", underline: { type: UnderlineType.DOUBLE, color: "FF0000" } }), + new TextRun({ text: "Colored", color: "FF0000", size: 28, font: "Arial" }), // Arial default + new TextRun({ text: "Highlighted", highlight: "yellow" }), + new TextRun({ text: "Strikethrough", strike: true }), + new TextRun({ text: "x2", superScript: true }), + new TextRun({ text: "H2O", subScript: true }), + new TextRun({ text: "SMALL CAPS", smallCaps: true }), + new SymbolRun({ char: "2022", font: "Symbol" }), // Bullet point + new SymbolRun({ char: "00A9", font: "Arial" }) // Copyright symbol - Arial for symbols + ] +}) +``` + +## Styles & 
Professional Formatting + +```javascript +const doc = new Document({ + styles: { + default: { document: { run: { font: "Arial", size: 24 } } }, // 12pt default + paragraphStyles: [ + // Document title style - override built-in Title style + { id: "Title", name: "Title", basedOn: "Normal", + run: { size: 56, bold: true, color: "000000", font: "Arial" }, + paragraph: { spacing: { before: 240, after: 120 }, alignment: AlignmentType.CENTER } }, + // IMPORTANT: Override built-in heading styles by using their exact IDs + { id: "Heading1", name: "Heading 1", basedOn: "Normal", next: "Normal", quickFormat: true, + run: { size: 32, bold: true, color: "000000", font: "Arial" }, // 16pt + paragraph: { spacing: { before: 240, after: 240 }, outlineLevel: 0 } }, // Required for TOC + { id: "Heading2", name: "Heading 2", basedOn: "Normal", next: "Normal", quickFormat: true, + run: { size: 28, bold: true, color: "000000", font: "Arial" }, // 14pt + paragraph: { spacing: { before: 180, after: 180 }, outlineLevel: 1 } }, + // Custom styles use your own IDs + { id: "customStyle", name: "Custom Style", basedOn: "Normal", + run: { size: 28, bold: true, color: "000000" }, + paragraph: { spacing: { after: 120 }, alignment: AlignmentType.CENTER } } + ], + characterStyles: [{ id: "emphasisStyle", name: "Emphasis Style", + run: { color: "FF0000", bold: true, underline: { type: UnderlineType.SINGLE } } }] + }, + sections: [{ + properties: { page: { margin: { top: 1440, right: 1440, bottom: 1440, left: 1440 } } }, + children: [ + new Paragraph({ heading: HeadingLevel.TITLE, children: [new TextRun("Document Title")] }), // Uses overridden Title style + new Paragraph({ heading: HeadingLevel.HEADING_1, children: [new TextRun("Heading 1")] }), // Uses overridden Heading1 style + new Paragraph({ style: "customStyle", children: [new TextRun("Custom paragraph style")] }), + new Paragraph({ children: [ + new TextRun("Normal with "), + new TextRun({ text: "emphasis style", style: "emphasisStyle" }) + 
]}) + ] + }] +}); +``` + +**Professional Font Combinations:** +- **Arial (Headers) + Arial (Body)** - Most universally supported, clean and professional +- **Times New Roman (Headers) + Arial (Body)** - Classic serif headers with modern sans-serif body +- **Georgia (Headers) + Verdana (Body)** - Optimized for screen reading, elegant contrast + +**Key Styling Principles:** +- **Override built-in styles**: Use exact IDs like "Heading1", "Heading2", "Heading3" to override Word's built-in heading styles +- **HeadingLevel constants**: `HeadingLevel.HEADING_1` uses "Heading1" style, `HeadingLevel.HEADING_2` uses "Heading2" style, etc. +- **Include outlineLevel**: Set `outlineLevel: 0` for H1, `outlineLevel: 1` for H2, etc. to ensure TOC works correctly +- **Use custom styles** instead of inline formatting for consistency +- **Set a default font** using `styles.default.document.run.font` - Arial is universally supported +- **Establish visual hierarchy** with different font sizes (titles > headers > body) +- **Add proper spacing** with `before` and `after` paragraph spacing +- **Use colors sparingly**: Default to black (000000) and shades of gray for titles and headings (heading 1, heading 2, etc.) 
+- **Set consistent margins** (1440 = 1 inch is standard) + + +## Lists (ALWAYS USE PROPER LISTS - NEVER USE UNICODE BULLETS) +```javascript +// Bullets - ALWAYS use the numbering config, NOT unicode symbols +// CRITICAL: Use LevelFormat.BULLET constant, NOT the string "bullet" +const doc = new Document({ + numbering: { + config: [ + { reference: "bullets", + levels: [{ level: 0, format: LevelFormat.BULLET, text: "\u2022", alignment: AlignmentType.LEFT, + style: { paragraph: { indent: { left: 720, hanging: 360 } } } }] }, + { reference: "numbers-a", + levels: [{ level: 0, format: LevelFormat.DECIMAL, text: "%1.", alignment: AlignmentType.LEFT, + style: { paragraph: { indent: { left: 720, hanging: 360 } } } }] }, + { reference: "numbers-b", // Different reference = restarts at 1 + levels: [{ level: 0, format: LevelFormat.DECIMAL, text: "%1.", alignment: AlignmentType.LEFT, + style: { paragraph: { indent: { left: 720, hanging: 360 } } } }] } + ] + }, + sections: [{ + children: [ + // Bullet list items + new Paragraph({ numbering: { reference: "bullets", level: 0 }, + children: [new TextRun("First bullet point")] }), + new Paragraph({ numbering: { reference: "bullets", level: 0 }, + children: [new TextRun("Second bullet point")] }), + // Numbered list items + new Paragraph({ numbering: { reference: "numbers-a", level: 0 }, + children: [new TextRun("First numbered item")] }), + new Paragraph({ numbering: { reference: "numbers-a", level: 0 }, + children: [new TextRun("Second numbered item")] }), + // CRITICAL: Different reference = INDEPENDENT list that restarts at 1 + // Same reference = CONTINUES previous numbering + new Paragraph({ numbering: { reference: "numbers-b", level: 0 }, + children: [new TextRun("Starts at 1 again (because different reference)")] }) + ] + }] +}); + +// CRITICAL NUMBERING RULE: Each reference creates an INDEPENDENT numbered list +// - Same reference = continues numbering (1, 2, 3... then 4, 5, 6...) 
+// - Different reference = restarts at 1 (1, 2, 3... then 1, 2, 3...) +// Use unique reference names for each separate numbered section! + +// CRITICAL: NEVER use unicode bullets - they create fake lists that don't work properly +// new TextRun("\u2022 Item") // WRONG +// new SymbolRun({ char: "2022" }) // WRONG +// ALWAYS use numbering config with LevelFormat.BULLET for real Word lists +``` + +## Tables +```javascript +// Complete table with margins, borders, headers, and bullet points +const border = { style: BorderStyle.SINGLE, size: 1, color: "CCCCCC" }; +const borders = { top: border, bottom: border, left: border, right: border }; + +new Table({ + columnWidths: [4680, 4680], // CRITICAL: Set column widths at table level - values in DXA (twentieths of a point) + margins: { top: 100, bottom: 100, left: 180, right: 180 }, // Set once for all cells + rows: [ + new TableRow({ + tableHeader: true, + children: [ + new TableCell({ + borders: borders, + width: { size: 4680, type: WidthType.DXA }, // ALSO set width on each cell + // CRITICAL: Always use ShadingType.CLEAR to prevent black backgrounds in Word. 
+ shading: { fill: "D5E8F0", type: ShadingType.CLEAR }, + verticalAlign: VerticalAlign.CENTER, + children: [new Paragraph({ + alignment: AlignmentType.CENTER, + children: [new TextRun({ text: "Header", bold: true, size: 22 })] + })] + }), + new TableCell({ + borders: borders, + width: { size: 4680, type: WidthType.DXA }, // ALSO set width on each cell + shading: { fill: "D5E8F0", type: ShadingType.CLEAR }, + children: [new Paragraph({ + alignment: AlignmentType.CENTER, + children: [new TextRun({ text: "Bullet Points", bold: true, size: 22 })] + })] + }) + ] + }), + new TableRow({ + children: [ + new TableCell({ + borders: borders, + width: { size: 4680, type: WidthType.DXA }, // ALSO set width on each cell + children: [new Paragraph({ children: [new TextRun("Regular data")] })] + }), + new TableCell({ + borders: borders, + width: { size: 4680, type: WidthType.DXA }, // ALSO set width on each cell + children: [ + new Paragraph({ + numbering: { reference: "bullets", level: 0 }, + children: [new TextRun("First bullet point")] + }), + new Paragraph({ + numbering: { reference: "bullets", level: 0 }, + children: [new TextRun("Second bullet point")] + }) + ] + }) + ] + }) + ] +}) +``` + +**IMPORTANT: Table Width & Borders** +- Use BOTH `columnWidths: [width1, width2, ...]` array AND `width: { size: X, type: WidthType.DXA }` on each cell +- Values in DXA (twentieths of a point): 1440 = 1 inch, Letter usable width = 9360 DXA (with 1" margins) +- Apply borders to individual `TableCell` elements, NOT the `Table` itself + +**Precomputed Column Widths (Letter size with 1" margins = 9360 DXA total):** +- **2 columns:** `columnWidths: [4680, 4680]` (equal width) +- **3 columns:** `columnWidths: [3120, 3120, 3120]` (equal width) + +## Links & Navigation +```javascript +// TOC (requires headings) - CRITICAL: Use HeadingLevel only, NOT custom styles +// BAD: new Paragraph({ heading: HeadingLevel.HEADING_1, style: "customHeader", children: [new TextRun("Title")] }) +// GOOD: new 
Paragraph({ heading: HeadingLevel.HEADING_1, children: [new TextRun("Title")] }) +new TableOfContents("Table of Contents", { hyperlink: true, headingStyleRange: "1-3" }), + +// External link +new Paragraph({ + children: [new ExternalHyperlink({ + children: [new TextRun({ text: "Google", style: "Hyperlink" })], + link: "https://www.google.com" + })] +}), + +// Internal link & bookmark +new Paragraph({ + children: [new InternalHyperlink({ + children: [new TextRun({ text: "Go to Section", style: "Hyperlink" })], + anchor: "section1" + })] +}), +new Paragraph({ + children: [new TextRun("Section Content")], + bookmark: { id: "section1", name: "section1" } +}), +``` + +## Images & Media +```javascript +// Basic image with sizing & positioning +// CRITICAL: Always specify 'type' parameter - it's REQUIRED for ImageRun +new Paragraph({ + alignment: AlignmentType.CENTER, + children: [new ImageRun({ + type: "png", // NEW REQUIREMENT: Must specify image type (png, jpg, jpeg, gif, bmp, svg) + data: fs.readFileSync("image.png"), + transformation: { width: 200, height: 150, rotation: 0 }, // rotation in degrees + altText: { title: "Logo", description: "Company logo", name: "Name" } // IMPORTANT: All three fields are required + })] +}) +``` + +## Page Breaks +```javascript +// Manual page break +new Paragraph({ children: [new PageBreak()] }), + +// Page break before paragraph +new Paragraph({ + pageBreakBefore: true, + children: [new TextRun("This starts on a new page")] +}) + +// CRITICAL: NEVER use PageBreak standalone - it will create invalid XML that Word cannot open +// BAD: new PageBreak() +// GOOD: new Paragraph({ children: [new PageBreak()] }) +``` + +## Headers/Footers & Page Setup +```javascript +const doc = new Document({ + sections: [{ + properties: { + page: { + margin: { top: 1440, right: 1440, bottom: 1440, left: 1440 }, // 1440 = 1 inch + size: { orientation: PageOrientation.LANDSCAPE }, + pageNumbers: { start: 1, formatType: "decimal" } // "upperRoman", 
"lowerRoman", "upperLetter", "lowerLetter" + } + }, + headers: { + default: new Header({ children: [new Paragraph({ + alignment: AlignmentType.RIGHT, + children: [new TextRun("Header Text")] + })] }) + }, + footers: { + default: new Footer({ children: [new Paragraph({ + alignment: AlignmentType.CENTER, + children: [new TextRun("Page "), new TextRun({ children: [PageNumber.CURRENT] }), new TextRun(" of "), new TextRun({ children: [PageNumber.TOTAL_PAGES] })] + })] }) + }, + children: [/* content */] + }] +}); +``` + +## Tabs +```javascript +new Paragraph({ + tabStops: [ + { type: TabStopType.LEFT, position: TabStopPosition.MAX / 4 }, + { type: TabStopType.CENTER, position: TabStopPosition.MAX / 2 }, + { type: TabStopType.RIGHT, position: TabStopPosition.MAX * 3 / 4 } + ], + children: [new TextRun("Left\tCenter\tRight")] +}) +``` + +## Constants & Quick Reference +- **Underlines:** `SINGLE`, `DOUBLE`, `WAVY`, `DASH` +- **Borders:** `SINGLE`, `DOUBLE`, `DASHED`, `DOTTED` +- **Numbering:** `DECIMAL` (1,2,3), `UPPER_ROMAN` (I,II,III), `LOWER_LETTER` (a,b,c) +- **Tabs:** `LEFT`, `CENTER`, `RIGHT`, `DECIMAL` +- **Symbols:** `"2022"` (bullet), `"00A9"` (copyright), `"00AE"` (registered), `"2122"` (trademark), `"00B0"` (degree), `"F070"` (checkmark), `"F0FC"` (x-mark) + +## Critical Issues & Common Mistakes +- **CRITICAL: PageBreak must ALWAYS be inside a Paragraph** - standalone PageBreak creates invalid XML that Word cannot open +- **ALWAYS use ShadingType.CLEAR for table cell shading** - Never use ShadingType.SOLID (causes black background). 
+- Measurements in DXA (1440 = 1 inch) | Each table cell needs at least 1 Paragraph | TOC requires HeadingLevel styles only +- **ALWAYS use custom styles** with Arial font for professional appearance and proper visual hierarchy +- **ALWAYS set a default font** using `styles.default.document.run.font` - Arial recommended +- **ALWAYS use columnWidths array for tables** + individual cell widths for compatibility +- **NEVER use unicode symbols for bullets** - always use proper numbering configuration with `LevelFormat.BULLET` constant (NOT the string "bullet") +- **NEVER use \n for line breaks anywhere** - always use separate Paragraph elements for each line +- **ALWAYS use TextRun objects within Paragraph children** - never use text property directly on Paragraph +- **CRITICAL for images**: ImageRun REQUIRES `type` parameter - always specify "png", "jpg", "jpeg", "gif", "bmp", or "svg" +- **CRITICAL for bullets**: Must use `LevelFormat.BULLET` constant, not string "bullet", and include `text: "\u2022"` for the bullet character +- **CRITICAL for numbering**: Each numbering reference creates an INDEPENDENT list. Same reference = continues numbering (1,2,3 then 4,5,6). Different reference = restarts at 1 (1,2,3 then 1,2,3). Use unique reference names for each separate numbered section! 
+- **CRITICAL for TOC**: When using TableOfContents, headings must use HeadingLevel ONLY - do NOT add custom styles to heading paragraphs or TOC will break +- **Tables**: Set `columnWidths` array + individual cell widths, apply borders to cells not table +- **Set table margins at TABLE level** for consistent cell padding (avoids repetition per cell) diff --git a/deploy/data/skills/find-skills/SKILL.md b/deploy/data/skills/find-skills/SKILL.md new file mode 100644 index 000000000..9fd5ae464 --- /dev/null +++ b/deploy/data/skills/find-skills/SKILL.md @@ -0,0 +1,134 @@ +--- +name: find-skills +description: Helps users discover and install agent skills when they ask questions like "how do I do X", "find a skill for X", "is there a skill that can...", or express interest in extending capabilities. This skill should be used when the user is looking for functionality that might exist as an installable skill. +description_zh: 当用户询问"如何做X"、"找一个做X的技能"或表达对扩展功能的兴趣时,帮助用户发现和安装代理技能。当用户寻找可能存在的可安装技能时使用此功能。 +--- + +# Find Skills + +This skill helps you discover and install skills from the open agent skills ecosystem. + +## When to Use This Skill + +Use this skill when the user: + +- Asks "how do I do X" where X might be a common task with an existing skill +- Says "find a skill for X" or "is there a skill for X" +- Asks "can you do X" where X is a specialized capability +- Expresses interest in extending agent capabilities +- Wants to search for tools, templates, or workflows +- Mentions they wish they had help with a specific domain (design, testing, deployment, etc.) + +## What is the Skills CLI? + +The Skills CLI (`npx skills`) is the package manager for the open agent skills ecosystem. Skills are modular packages that extend agent capabilities with specialized knowledge, workflows, and tools. 
+
+**Key commands:**
+
+- `npx skills find [query]` - Search for skills interactively or by keyword
+- `npx skills add <skill>` - Install a skill from GitHub or other sources
+- `npx skills check` - Check for skill updates
+- `npx skills update` - Update all installed skills
+
+**Browse skills at:** https://skills.sh/
+
+## How to Help Users Find Skills
+
+### Step 1: Understand What They Need
+
+When a user asks for help with something, identify:
+
+1. The domain (e.g., React, testing, design, deployment)
+2. The specific task (e.g., writing tests, creating animations, reviewing PRs)
+3. Whether this is a common enough task that a skill likely exists
+
+### Step 2: Search for Skills
+
+Run the find command with a relevant query:
+
+```bash
+npx skills find [query]
+```
+
+For example:
+
+- User asks "how do I make my React app faster?" → `npx skills find react performance`
+- User asks "can you help me with PR reviews?" → `npx skills find pr review`
+- User asks "I need to create a changelog" → `npx skills find changelog`
+
+The command will return results like:
+
+```
+Install with npx skills add <skill>
+
+vercel-labs/agent-skills@vercel-react-best-practices
+└ https://skills.sh/vercel-labs/agent-skills/vercel-react-best-practices
+```
+
+### Step 3: Present Options to the User
+
+When you find relevant skills, present them to the user with:
+
+1. The skill name and what it does
+2. The install command they can run
+3. A link to learn more at skills.sh
+
+Example response:
+
+```
+I found a skill that might help! The "vercel-react-best-practices" skill provides
+React and Next.js performance optimization guidelines from Vercel Engineering.
+
+To install it:
+npx skills add vercel-labs/agent-skills@vercel-react-best-practices
+
+Learn more: https://skills.sh/vercel-labs/agent-skills/vercel-react-best-practices
+```
+
+### Step 4: Offer to Install
+
+If the user wants to proceed, you can install the skill for them:
+
+```bash
+npx skills add <skill> --directory ~/.qoder/skills -y
+```
+
+The `--directory ~/.qoder/skills` flag installs to the Qoder skills directory and `-y` skips confirmation prompts.
+
+## Common Skill Categories
+
+When searching, consider these common categories:
+
+| Category        | Example Queries                          |
+| --------------- | ---------------------------------------- |
+| Web Development | react, nextjs, typescript, css, tailwind |
+| Testing         | testing, jest, playwright, e2e           |
+| DevOps          | deploy, docker, kubernetes, ci-cd        |
+| Documentation   | docs, readme, changelog, api-docs        |
+| Code Quality    | review, lint, refactor, best-practices   |
+| Design          | ui, ux, design-system, accessibility     |
+| Productivity    | workflow, automation, git                |
+
+## Tips for Effective Searches
+
+1. **Use specific keywords**: "react testing" is better than just "testing"
+2. **Try alternative terms**: If "deploy" doesn't work, try "deployment" or "ci-cd"
+3. **Check popular sources**: Many skills come from `vercel-labs/agent-skills` or `ComposioHQ/awesome-claude-skills`
+
+## When No Skills Are Found
+
+If no relevant skills exist:
+
+1. Acknowledge that no existing skill was found
+2. Offer to help with the task directly using your general capabilities
+3. Suggest the user could create their own skill with `npx skills init`
+
+Example:
+
+```
+I searched for skills related to "xyz" but didn't find any matches.
+I can still help you with this task directly! Would you like me to proceed?
+ +If this is something you do often, you could create your own skill: +npx skills init my-xyz-skill +``` \ No newline at end of file diff --git a/deploy/data/skills/frontend-design/SKILL.md b/deploy/data/skills/frontend-design/SKILL.md new file mode 100644 index 000000000..862e894fc --- /dev/null +++ b/deploy/data/skills/frontend-design/SKILL.md @@ -0,0 +1,43 @@ +--- +name: frontend-design +description: Create distinctive, production-grade frontend interfaces with high design quality. Use this skill when the user asks to build web components, pages, or applications. Generates creative, polished code that avoids generic AI aesthetics. +description_zh: 创建独特、生产级别的高质量前端界面。当用户要求构建 Web 组件、页面或应用程序时使用此技能。生成富有创意、精致的代码,避免千篇一律的 AI 风格。 +license: Complete terms in LICENSE.txt +--- + +This skill guides creation of distinctive, production-grade frontend interfaces that avoid generic "AI slop" aesthetics. Implement real working code with exceptional attention to aesthetic details and creative choices. + +The user provides frontend requirements: a component, page, application, or interface to build. They may include context about the purpose, audience, or technical constraints. + +## Design Thinking + +Before coding, understand the context and commit to a BOLD aesthetic direction: +- **Purpose**: What problem does this interface solve? Who uses it? +- **Tone**: Pick an extreme: brutally minimal, maximalist chaos, retro-futuristic, organic/natural, luxury/refined, playful/toy-like, editorial/magazine, brutalist/raw, art deco/geometric, soft/pastel, industrial/utilitarian, etc. There are so many flavors to choose from. Use these for inspiration but design one that is true to the aesthetic direction. +- **Constraints**: Technical requirements (framework, performance, accessibility). +- **Differentiation**: What makes this UNFORGETTABLE? What's the one thing someone will remember? + +**CRITICAL**: Choose a clear conceptual direction and execute it with precision. 
Bold maximalism and refined minimalism both work - the key is intentionality, not intensity. + +Then implement working code (HTML/CSS/JS, React, Vue, etc.) that is: +- Production-grade and functional +- Visually striking and memorable +- Cohesive with a clear aesthetic point-of-view +- Meticulously refined in every detail + +## Frontend Aesthetics Guidelines + +Focus on: +- **Typography**: Choose fonts that are beautiful, unique, and interesting. Avoid generic fonts like Arial and Inter; opt instead for distinctive choices that elevate the frontend's aesthetics; unexpected, characterful font choices. Pair a distinctive display font with a refined body font. +- **Color & Theme**: Commit to a cohesive aesthetic. Use CSS variables for consistency. Dominant colors with sharp accents outperform timid, evenly-distributed palettes. +- **Motion**: Use animations for effects and micro-interactions. Prioritize CSS-only solutions for HTML. Use Motion library for React when available. Focus on high-impact moments: one well-orchestrated page load with staggered reveals (animation-delay) creates more delight than scattered micro-interactions. Use scroll-triggering and hover states that surprise. +- **Spatial Composition**: Unexpected layouts. Asymmetry. Overlap. Diagonal flow. Grid-breaking elements. Generous negative space OR controlled density. +- **Backgrounds & Visual Details**: Create atmosphere and depth rather than defaulting to solid colors. Add contextual effects and textures that match the overall aesthetic. Apply creative forms like gradient meshes, noise textures, geometric patterns, layered transparencies, dramatic shadows, decorative borders, custom cursors, and grain overlays. 
+ +NEVER use generic AI-generated aesthetics like overused font families (Inter, Roboto, Arial, system fonts), cliched color schemes (particularly purple gradients on white backgrounds), predictable layouts and component patterns, and cookie-cutter design that lacks context-specific character. + +Interpret creatively and make unexpected choices that feel genuinely designed for the context. No design should be the same. Vary between light and dark themes, different fonts, different aesthetics. NEVER converge on common choices (Space Grotesk, for example) across generations. + +**IMPORTANT**: Match implementation complexity to the aesthetic vision. Maximalist designs need elaborate code with extensive animations and effects. Minimalist or refined designs need restraint, precision, and careful attention to spacing, typography, and subtle details. Elegance comes from executing the vision well. + +Remember: Claude is capable of extraordinary creative work. Don't hold back, show what can truly be created when thinking outside the box and committing fully to a distinctive vision. \ No newline at end of file diff --git a/deploy/data/skills/notion-infographic/SKILL.md b/deploy/data/skills/notion-infographic/SKILL.md new file mode 100644 index 000000000..b0e0e1458 --- /dev/null +++ b/deploy/data/skills/notion-infographic/SKILL.md @@ -0,0 +1,187 @@ +--- +name: notion-infographic +description: "Batch generate Notion-style relaxed hand-drawn infographic series from reference documents. Use this skill when users need to read documents and generate a set of infographics for social media distribution. Ideal for converting articles and notes into visual content series." 
+description_zh: "根据参考文稿批量生成 Notion 风格松弛感手绘信息图组图。当用户需要阅读文档并生成一组社交媒体传播用的信息图时使用此 skill。适用于将文章、笔记转化为视觉化内容系列。" +--- + +# Notion 信息图生成器 + +将参考文稿转化为一组风格统一的 Notion 手绘风格信息图,用于社交媒体传播。 + +## 图片数量规则 + +**优先级:用户指定 > 文章意图分析 > 字数参考** + +### 用户指定 +如果用户明确指定了数量(如"生成 5 张"),则按用户要求生成。 + +### 文章意图分析(核心逻辑) + +基于文章内容提炼核心观点,每个独立观点对应一张图: + +1. **识别文章主旨** - 文章想要传达什么核心信息? +2. **拆解论点结构** - 有哪些支撑主旨的独立观点? +3. **评估可视化价值** - 每个观点是否适合独立成图? +4. **确定最终张数** - 观点数 + 封面 + 总结(如需要) + +**判断标准:** +- 每张图只承载 1 个核心观点 +- 观点不足时不强行凑数 +- 观点过多时合并相近内容 +- **硬上限:不超过 12 张** + +### 字数参考(仅供辅助) + +| 内容长度 | 通常观点数 | 参考范围 | +|----------|-----------|----------| +| 短文(<500字) | 2-4 个 | 3-5 张 | +| 中文(500-1500字) | 4-7 个 | 5-8 张 | +| 长文(>1500字) | 6-10 个 | 8-12 张 | + +**注意:字数仅作为辅助参考,最终张数以文章意图分析结果为准。** + +--- + +## 工作流程 + +### Step 1: 分析文章意图,确定张数 + +1. 阅读用户指定的文稿 +2. 识别文章主旨和核心信息 +3. 拆解独立观点,评估可视化价值 +4. 确定生图张数(上限 12 张) +5. 如用户指定张数,以用户为准 + +### Step 2: 规划组图结构 + +``` +第1张:标题封面图(主题 + 核心价值主张) +第2至N-1张:内容图(每张一个核心观点) +第N张:总结/行动号召图 +``` + +### Step 3: 生成提示词 + +每张图的提示词必须包含以下结构: + +``` +[风格前缀] + [内容描述] + [风格后缀] +``` + +### Step 4: 调用 imageGen + +为每个信息点调用 imageGen 工具生成图片。 + +--- + +## 核心风格提示词模板 + +**每张图的提示词都必须包含完整的风格描述,不可省略或简化任何细节:** + +### 风格前缀(每张图必须完整包含) + +``` +Notion官方插画风格松弛感线稿信息图。 + +线条特征: +- 纯黄色为主,搭配其他适合的颜色 +- 线条粗细不均匀,像马克笔随手画的质感 +- 笔触松弛、略带抖动,不追求工整 +- 所有图标、人物、图表都保持手绘涂鸦感 +- 少部份地方填充黑色让整体压实 + +人物画法(如需要): +- 简笔画风格:女孩,短发,圆润的头、点或线表示五官 +- 肢体夸张、动作松弛自然 +- 可以是完整人物或只露出局部(手、上半身) +``` + +### 风格后缀(每张图必须完整包含) + +``` +禁止事项(严格执行): +- 不要彩色渐变或复杂配色 +- 不要粗黑边框或生硬分隔线 +- 不要3D效果、阴影、立体感 +- 不要密集文字堆砌,文字内容简化 +- 保持大量留白和呼吸感 +- 装饰元素要克制,不要太多 + +输出规范: +- 图片比例16:9 +- 中文标注 +- 直接生成,不需要解释 +``` + +### 完整提示词组装示例 + +``` +Notion官方插画风格松弛感线稿信息图。线条特征:纯黄色为主,搭配其他适合的颜色。线条粗细不均匀,像马克笔随手画的质感。笔触松弛、略带抖动,不追求工整。所有图标、人物、图表都保持手绘涂鸦感。少部份地方填充黑色让整体压实。人物画法:简笔画风格女孩,短发,圆润的头、点或线表示五官,肢体夸张、动作松弛自然,可以是完整人物或只露出局部。 + +[内容描述区域 - 根据具体信息点填写,如:] +一个困惑的人被漂浮的AI热词包围,如"提示词过时了"和"程序员被淘汰",头顶飘着问号。两侧有喇叭在广播相互矛盾的信息,这个人在噪音中试图抓住清晰的方向。 + 
+禁止事项:不要彩色渐变或复杂配色。不要粗黑边框或生硬分隔线。不要3D效果、阴影、立体感。不要密集文字堆砌,文字内容简化。保持大量留白和呼吸感。装饰元素要克制。图片比例16:9,中文标注。 +``` + +--- + +## 多图一致性保障 + +### 硬性约束(不可修改) + +1. **风格前缀和后缀完整保留** - 每张图必须包含完全相同的风格描述,不可省略任何细节 +2. **线条特征锁定** - 纯黄色主色、马克笔质感、略带抖动的松弛笔触 +3. **人物形象统一** - 短发女孩、圆润头部、点/线五官、夸张肢体、松弛动作 +4. **配色方案锁定** - 主色纯黄色 + 黑色压实,不随内容变化 +5. **禁止元素全局生效** - 所有图都不能出现:渐变、粗黑边框、硬分隔线、3D、阴影、密集文字 + +### 内容差异化 + +- 仅通过 `[内容描述]` 部分体现每张图的独特信息 +- 内容描述应简洁、视觉化,适合转化为图像 +- 保持大量留白和呼吸感 + +--- + +## 输出规范 + +- 图片数量:基于文章意图自适应,或用户指定(上限 12 张) +- 语言:中文 +- 比例:16:9 +- 命名建议:`infographic-01.png`, `infographic-02.png`... + +--- + +## 使用示例 + +### 示例1:自适应张数 +**用户输入:** +> 阅读 docs/ai-myths.md,使用 notion-infographic-v2 skill 生成一组信息图 + +**执行:** +1. 读取文稿,分析内容复杂度 +2. 提取 N 个核心观点(根据内容自动决定) +3. 生成 N 张信息图 + +### 示例2:用户指定张数 +**用户输入:** +> 阅读 docs/ai-myths.md,生成 5 张信息图 + +**执行:** +1. 读取文稿 +2. 提取 5 个最核心的观点(用户指定优先) +3. 生成 5 张信息图 + +--- + +## 禁止事项 + +- 不要省略或简化风格前缀/后缀中的任何细节 +- 不要在单张图中堆砌多个信息点 +- 不要使用彩色渐变或复杂配色 +- 不要使用粗黑边框或生硬分隔线 +- 不要生成 3D 效果、阴影或立体感 +- 不要密集堆砌文字,文字内容需简化 +- 不要过度装饰,装饰元素要克制 +- 必须保持大量留白和呼吸感 diff --git a/deploy/data/skills/pdf/SKILL.md b/deploy/data/skills/pdf/SKILL.md new file mode 100644 index 000000000..43993da66 --- /dev/null +++ b/deploy/data/skills/pdf/SKILL.md @@ -0,0 +1,369 @@ +--- +name: pdf +description: Advanced PDF document toolkit for content extraction, document generation, page manipulation, and interactive form processing. Use when you need to parse PDF text and tables, create professional documents, combine or split files, or complete fillable forms programmatically. +description_zh: 高级 PDF 文档工具包,支持内容提取、文档生成、页面操作和交互式表单处理。适用于解析 PDF 文本和表格、创建专业文档、合并或拆分文件,或以编程方式完成可填写表单。 +license: Proprietary. LICENSE.txt has complete terms +--- + +# PDF Document Toolkit + +## Introduction + +This toolkit provides comprehensive PDF document operations using Python libraries and shell utilities. For advanced usage, JavaScript APIs, and detailed code samples, refer to advanced-guide.md. 
For filling PDF forms, consult form-handler.md and follow its workflow. + +## Important: Post-Completion Verification + +**After generating or modifying PDF files, ALWAYS verify the output for CJK text rendering issues:** + +1. **Open the generated PDF** and visually inspect all text content +2. **Check for garbled characters** - Look for: + - Black boxes (■) or rectangles instead of CJK characters + - Question marks (?) or replacement characters (�) + - Missing text where CJK content should appear + - Incorrectly rendered or overlapping characters +3. **If issues are found**, refer to the "CJK (Chinese/Japanese/Korean) Text Support" section below for font configuration solutions + +This verification step is critical when the PDF contains Chinese, Japanese, or Korean text. + +## Getting Started + +```python +from pypdf import PdfReader, PdfWriter + +# Open a PDF document +doc = PdfReader("sample.pdf") +print(f"Total pages: {len(doc.pages)}") + +# Gather text content +content = "" +for pg in doc.pages: + content += pg.extract_text() +``` + +## Python Libraries + +### pypdf - Core Operations + +#### Combine Multiple PDFs +```python +from pypdf import PdfWriter, PdfReader + +output = PdfWriter() +for pdf in ["first.pdf", "second.pdf", "third.pdf"]: + doc = PdfReader(pdf) + for pg in doc.pages: + output.add_page(pg) + +with open("combined.pdf", "wb") as out_file: + output.write(out_file) +``` + +#### Separate PDF Pages +```python +doc = PdfReader("source.pdf") +for idx, pg in enumerate(doc.pages): + output = PdfWriter() + output.add_page(pg) + with open(f"part_{idx+1}.pdf", "wb") as out_file: + output.write(out_file) +``` + +#### Read Document Properties +```python +doc = PdfReader("sample.pdf") +props = doc.metadata +print(f"Title: {props.title}") +print(f"Author: {props.author}") +print(f"Subject: {props.subject}") +print(f"Creator: {props.creator}") +``` + +#### Rotate Document Pages +```python +doc = PdfReader("source.pdf") +output = PdfWriter() + +pg = 
doc.pages[0] +pg.rotate(90) # 90 degrees clockwise +output.add_page(pg) + +with open("turned.pdf", "wb") as out_file: + output.write(out_file) +``` + +### pdfplumber - Content Extraction + +#### Extract Text with Layout +```python +import pdfplumber + +with pdfplumber.open("sample.pdf") as doc: + for pg in doc.pages: + content = pg.extract_text() + print(content) +``` + +#### Extract Tabular Data +```python +with pdfplumber.open("sample.pdf") as doc: + for pg_num, pg in enumerate(doc.pages): + data_tables = pg.extract_tables() + for tbl_num, tbl in enumerate(data_tables): + print(f"Table {tbl_num+1} on page {pg_num+1}:") + for row in tbl: + print(row) +``` + +#### Export Tables to Excel +```python +import pandas as pd + +with pdfplumber.open("sample.pdf") as doc: + collected_tables = [] + for pg in doc.pages: + data_tables = pg.extract_tables() + for tbl in data_tables: + if tbl: # Verify table is not empty + df = pd.DataFrame(tbl[1:], columns=tbl[0]) + collected_tables.append(df) + +# Merge all tables +if collected_tables: + merged_df = pd.concat(collected_tables, ignore_index=True) + merged_df.to_excel("tables_export.xlsx", index=False) +``` + +### reportlab - Document Generation + +#### Create Simple PDF +```python +from reportlab.lib.pagesizes import letter +from reportlab.pdfgen import canvas + +c = canvas.Canvas("greeting.pdf", pagesize=letter) +width, height = letter + +# Insert text +c.drawString(100, height - 100, "Welcome!") +c.drawString(100, height - 120, "Generated using reportlab library") + +# Draw a separator line +c.line(100, height - 140, 400, height - 140) + +# Save document +c.save() +``` + +#### Generate Multi-Page Document +```python +from reportlab.lib.pagesizes import letter +from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak +from reportlab.lib.styles import getSampleStyleSheet + +doc = SimpleDocTemplate("document.pdf", pagesize=letter) +styles = getSampleStyleSheet() +elements = [] + +# Add content +heading = 
Paragraph("Document Title", styles['Title']) +elements.append(heading) +elements.append(Spacer(1, 12)) + +body_text = Paragraph("This is the main content section. " * 20, styles['Normal']) +elements.append(body_text) +elements.append(PageBreak()) + +# Second page +elements.append(Paragraph("Section 2", styles['Heading1'])) +elements.append(Paragraph("Content for the second section", styles['Normal'])) + +# Generate PDF +doc.build(elements) +``` + +## Shell Utilities + +### pdftotext (poppler-utils) +```bash +# Convert to text +pdftotext source.pdf result.txt + +# Preserve layout formatting +pdftotext -layout source.pdf result.txt + +# Convert specific page range +pdftotext -f 1 -l 5 source.pdf result.txt # Pages 1-5 +``` + +### qpdf +```bash +# Merge documents +qpdf --empty --pages doc1.pdf doc2.pdf -- result.pdf + +# Extract page range +qpdf source.pdf --pages . 1-5 -- subset1-5.pdf +qpdf source.pdf --pages . 6-10 -- subset6-10.pdf + +# Rotate specific page +qpdf source.pdf result.pdf --rotate=+90:1 # Rotate page 1 by 90 degrees + +# Decrypt protected PDF +qpdf --password=secret --decrypt protected.pdf unlocked.pdf +``` + +### pdftk (if available) +```bash +# Merge documents +pdftk doc1.pdf doc2.pdf cat output result.pdf + +# Split into individual pages +pdftk source.pdf burst + +# Rotate page +pdftk source.pdf rotate 1east output turned.pdf +``` + +## Common Operations + +### OCR for Scanned Documents +```python +# Requires: pip install pytesseract pdf2image +import pytesseract +from pdf2image import convert_from_path + +# Convert PDF pages to images +pages = convert_from_path('scanned.pdf') + +# Process each page with OCR +content = "" +for idx, img in enumerate(pages): + content += f"Page {idx+1}:\n" + content += pytesseract.image_to_string(img) + content += "\n\n" + +print(content) +``` + +### Apply Watermark +```python +from pypdf import PdfReader, PdfWriter + +# Load watermark (or create one) +watermark_page = PdfReader("stamp.pdf").pages[0] + +# Apply to 
all pages +doc = PdfReader("sample.pdf") +output = PdfWriter() + +for pg in doc.pages: + pg.merge_page(watermark_page) + output.add_page(pg) + +with open("stamped.pdf", "wb") as out_file: + output.write(out_file) +``` + +### Export Embedded Images +```bash +# Using pdfimages (poppler-utils) +pdfimages -j source.pdf img_prefix + +# Outputs: img_prefix-000.jpg, img_prefix-001.jpg, etc. +``` + +### Add Document Password +```python +from pypdf import PdfReader, PdfWriter + +doc = PdfReader("source.pdf") +output = PdfWriter() + +for pg in doc.pages: + output.add_page(pg) + +# Set passwords +output.encrypt("user_pwd", "admin_pwd") + +with open("secured.pdf", "wb") as out_file: + output.write(out_file) +``` + +## Quick Reference Table + +| Operation | Recommended Tool | Example | +|-----------|------------------|---------| +| Merge documents | pypdf | `output.add_page(pg)` | +| Split document | pypdf | One page per output file | +| Extract text | pdfplumber | `pg.extract_text()` | +| Extract tables | pdfplumber | `pg.extract_tables()` | +| Create documents | reportlab | Canvas or Platypus | +| Shell merge | qpdf | `qpdf --empty --pages ...` | +| OCR scanned docs | pytesseract | Convert to image first | +| Fill PDF forms | pdf-lib or pypdf (see form-handler.md) | See form-handler.md | + +## CJK (Chinese/Japanese/Korean) Text Support + +**Important**: Standard PDF fonts (Arial, Helvetica, etc.) do not support CJK characters. If CJK text is used without a proper CJK font, characters will display as black boxes (■). + +### Automatic Font Detection + +The `apply_text_overlays.py` utility automatically: +1. Detects CJK characters in your text content +2. Searches for available CJK fonts on your system +3. 
**Exits with an error if CJK characters are detected but no CJK font is found** + +### Supported System Fonts + +| OS | Font Paths | +|----|------------| +| macOS | `/System/Library/Fonts/PingFang.ttc`, `/System/Library/Fonts/STHeiti Light.ttc` | +| Windows | `C:/Windows/Fonts/msyh.ttc` (Microsoft YaHei), `C:/Windows/Fonts/simsun.ttc` | +| Linux | `/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc`, `/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc` | + +### If You See "No CJK Font Found" Error + +Install a CJK font for your operating system: + +```bash +# Ubuntu/Debian +sudo apt-get install fonts-noto-cjk + +# Fedora/RHEL +sudo dnf install google-noto-sans-cjk-fonts + +# macOS - PingFang is pre-installed +# Windows - Microsoft YaHei is pre-installed +``` + +### Manual Font Registration (for reportlab) + +When using reportlab directly, register a CJK font before drawing text: + +```python +from reportlab.pdfbase import pdfmetrics +from reportlab.pdfbase.ttfonts import TTFont + +# Register CJK font (example for macOS) +# Note: For TTC (TrueType Collection) files, specify subfontIndex parameter +pdfmetrics.registerFont(TTFont('PingFang', '/System/Library/Fonts/PingFang.ttc', subfontIndex=0)) + +# Use the font for CJK text +c.setFont('PingFang', 14) +c.drawString(100, 700, '你好世界') # Chinese +c.drawString(100, 680, 'こんにちは') # Japanese +c.drawString(100, 660, '안녕하세요') # Korean +``` + +**Common subfontIndex values for TTC files:** +- PingFang.ttc: 0 (Regular), 1 (Medium), 2 (Semibold), etc. +- msyh.ttc: 0 (Regular), 1 (Bold) +- NotoSansCJK-Regular.ttc: varies by language variant + +For detailed CJK font configuration, see form-handler.md. 
+ +## Additional Resources + +- For pypdfium2 advanced usage, see advanced-guide.md +- For JavaScript libraries (pdf-lib), see advanced-guide.md +- For filling PDF forms, follow instructions in form-handler.md +- For troubleshooting tips, see advanced-guide.md diff --git a/deploy/data/skills/pdf/advanced-guide.md b/deploy/data/skills/pdf/advanced-guide.md new file mode 100644 index 000000000..123e907c8 --- /dev/null +++ b/deploy/data/skills/pdf/advanced-guide.md @@ -0,0 +1,612 @@ +# PDF Document Toolkit - Advanced Guide + +This document covers advanced PDF operations, detailed examples, and supplementary libraries beyond the main toolkit instructions. + +## pypdfium2 Library (Apache/BSD License) + +### Overview +pypdfium2 provides Python bindings for PDFium (Chromium's PDF engine). It excels at fast rendering, image conversion, and serves as an alternative to PyMuPDF. + +### Render Pages to Images +```python +import pypdfium2 as pdfium +from PIL import Image + +# Load document +doc = pdfium.PdfDocument("sample.pdf") + +# Render first page +pg = doc[0] +bitmap = pg.render( + scale=2.0, # Higher DPI + rotation=0 # No rotation +) + +# Convert to PIL Image +img = bitmap.to_pil() +img.save("pg_1.png", "PNG") + +# Process all pages +for idx, pg in enumerate(doc): + bitmap = pg.render(scale=1.5) + img = bitmap.to_pil() + img.save(f"pg_{idx+1}.jpg", "JPEG", quality=90) +``` + +### Extract Text with pypdfium2 +```python +import pypdfium2 as pdfium + +doc = pdfium.PdfDocument("sample.pdf") +for idx, pg in enumerate(doc): + content = pg.get_text() + print(f"Page {idx+1} content length: {len(content)} chars") +``` + +## JavaScript Libraries + +### pdf-lib (MIT License) + +pdf-lib is a robust JavaScript library for creating and editing PDF documents across JavaScript environments. 
+ +#### Load and Edit Existing PDF +```javascript +import { PDFDocument } from 'pdf-lib'; +import fs from 'fs'; + +async function editDocument() { + // Load existing document + const existingBytes = fs.readFileSync('source.pdf'); + const pdfDoc = await PDFDocument.load(existingBytes); + + // Get page count + const totalPages = pdfDoc.getPageCount(); + console.log(`Document contains ${totalPages} pages`); + + // Append new page + const newPg = pdfDoc.addPage([600, 400]); + newPg.drawText('Added via pdf-lib', { + x: 100, + y: 300, + size: 16 + }); + + // Save changes + const pdfBytes = await pdfDoc.save(); + fs.writeFileSync('edited.pdf', pdfBytes); +} +``` + +#### Generate Professional Documents from Scratch +```javascript +import { PDFDocument, rgb, StandardFonts } from 'pdf-lib'; +import fs from 'fs'; + +async function generateDocument() { + const pdfDoc = await PDFDocument.create(); + + // Embed fonts + const helvetica = await pdfDoc.embedFont(StandardFonts.Helvetica); + const helveticaBold = await pdfDoc.embedFont(StandardFonts.HelveticaBold); + + // Create page + const pg = pdfDoc.addPage([595, 842]); // A4 dimensions + const { width, height } = pg.getSize(); + + // Add styled text + pg.drawText('Invoice #12345', { + x: 50, + y: height - 50, + size: 18, + font: helveticaBold, + color: rgb(0.2, 0.2, 0.8) + }); + + // Add header background + pg.drawRectangle({ + x: 40, + y: height - 100, + width: width - 80, + height: 30, + color: rgb(0.9, 0.9, 0.9) + }); + + // Add tabular data + const rows = [ + ['Item', 'Qty', 'Price', 'Total'], + ['Widget', '2', '$50', '$100'], + ['Gadget', '1', '$75', '$75'] + ]; + + let yPos = height - 150; + rows.forEach(row => { + let xPos = 50; + row.forEach(cell => { + pg.drawText(cell, { + x: xPos, + y: yPos, + size: 12, + font: helvetica + }); + xPos += 120; + }); + yPos -= 25; + }); + + const pdfBytes = await pdfDoc.save(); + fs.writeFileSync('generated.pdf', pdfBytes); +} +``` + +#### Advanced Document Combination +```javascript 
+import { PDFDocument } from 'pdf-lib'; +import fs from 'fs'; + +async function combineDocuments() { + // Create output document + const combinedPdf = await PDFDocument.create(); + + // Load source documents + const doc1Bytes = fs.readFileSync('first.pdf'); + const doc2Bytes = fs.readFileSync('second.pdf'); + + const doc1 = await PDFDocument.load(doc1Bytes); + const doc2 = await PDFDocument.load(doc2Bytes); + + // Copy all pages from first document + const doc1Pages = await combinedPdf.copyPages(doc1, doc1.getPageIndices()); + doc1Pages.forEach(pg => combinedPdf.addPage(pg)); + + // Copy selected pages from second document (pages 0, 2, 4) + const doc2Pages = await combinedPdf.copyPages(doc2, [0, 2, 4]); + doc2Pages.forEach(pg => combinedPdf.addPage(pg)); + + const combinedBytes = await combinedPdf.save(); + fs.writeFileSync('combined.pdf', combinedBytes); +} +``` + +### pdfjs-dist (Apache License) + +PDF.js is Mozilla's JavaScript library for browser-based PDF rendering. + +#### Basic Document Loading and Rendering +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +// Configure worker for performance +pdfjsLib.GlobalWorkerOptions.workerSrc = './pdf.worker.js'; + +async function displayDocument() { + // Load document + const loadingTask = pdfjsLib.getDocument('sample.pdf'); + const doc = await loadingTask.promise; + + console.log(`Loaded document with ${doc.numPages} pages`); + + // Get first page + const pg = await doc.getPage(1); + const viewport = pg.getViewport({ scale: 1.5 }); + + // Render to canvas + const canvas = document.createElement('canvas'); + const ctx = canvas.getContext('2d'); + canvas.height = viewport.height; + canvas.width = viewport.width; + + const renderConfig = { + canvasContext: ctx, + viewport: viewport + }; + + await pg.render(renderConfig).promise; + document.body.appendChild(canvas); +} +``` + +#### Extract Text with Position Data +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +async function extractTextContent() { + 
const loadingTask = pdfjsLib.getDocument('sample.pdf'); + const doc = await loadingTask.promise; + + let fullContent = ''; + + // Extract from all pages + for (let i = 1; i <= doc.numPages; i++) { + const pg = await doc.getPage(i); + const textData = await pg.getTextContent(); + + const pageContent = textData.items + .map(item => item.str) + .join(' '); + + fullContent += `\n--- Page ${i} ---\n${pageContent}`; + + // Get text with coordinates for advanced processing + const textWithPositions = textData.items.map(item => ({ + text: item.str, + x: item.transform[4], + y: item.transform[5], + width: item.width, + height: item.height + })); + } + + console.log(fullContent); + return fullContent; +} +``` + +#### Extract Annotations and Form Elements +```javascript +import * as pdfjsLib from 'pdfjs-dist'; + +async function extractAnnotations() { + const loadingTask = pdfjsLib.getDocument('annotated.pdf'); + const doc = await loadingTask.promise; + + for (let i = 1; i <= doc.numPages; i++) { + const pg = await doc.getPage(i); + const annotations = await pg.getAnnotations(); + + annotations.forEach(ann => { + console.log(`Annotation type: ${ann.subtype}`); + console.log(`Content: ${ann.contents}`); + console.log(`Coordinates: ${JSON.stringify(ann.rect)}`); + }); + } +} +``` + +## Advanced Shell Operations + +### poppler-utils Advanced Features + +#### Extract Text with Bounding Boxes +```bash +# Extract text with coordinate data (essential for structured processing) +pdftotext -bbox-layout sample.pdf result.xml + +# The XML output contains precise coordinates for each text element +``` + +#### Advanced Image Conversion +```bash +# Convert to PNG with specific resolution +pdftoppm -png -r 300 sample.pdf output_prefix + +# Convert page range at high resolution +pdftoppm -png -r 600 -f 1 -l 3 sample.pdf highres_pages + +# Convert to JPEG with quality setting +pdftoppm -jpeg -jpegopt quality=85 -r 200 sample.pdf jpeg_output +``` + +#### Extract Embedded Images +```bash +# 
Extract all embedded images with metadata +pdfimages -j -p sample.pdf page_images + +# List image info without extraction +pdfimages -list sample.pdf + +# Extract images in original format +pdfimages -all sample.pdf images/img +``` + +### qpdf Advanced Features + +#### Complex Page Manipulation +```bash +# Split document into page groups +qpdf --split-pages=3 source.pdf output_group_%02d.pdf + +# Extract pages with complex range specifications +qpdf source.pdf --pages source.pdf 1,3-5,8,10-end -- extracted.pdf + +# Combine specific pages from multiple documents +qpdf --empty --pages doc1.pdf 1-3 doc2.pdf 5-7 doc3.pdf 2,4 -- combined.pdf +``` + +#### Document Optimization and Repair +```bash +# Optimize for web streaming (linearize) +qpdf --linearize source.pdf optimized.pdf + +# Remove unused objects and compress +qpdf --optimize-level=all source.pdf compressed.pdf + +# Attempt to repair corrupted document structure +qpdf --check source.pdf +qpdf --fix-qdf damaged.pdf repaired.pdf + +# Display detailed document structure for debugging +qpdf --show-all-pages source.pdf > structure.txt +``` + +#### Advanced Encryption +```bash +# Add password protection with specific permissions +qpdf --encrypt user_pass admin_pass 256 --print=none --modify=none -- source.pdf secured.pdf + +# Check encryption status +qpdf --show-encryption secured.pdf + +# Remove password protection (requires password) +qpdf --password=secret123 --decrypt secured.pdf unlocked.pdf +``` + +## Advanced Python Techniques + +### pdfplumber Advanced Features + +#### Extract Text with Precise Coordinates +```python +import pdfplumber + +with pdfplumber.open("sample.pdf") as doc: + pg = doc.pages[0] + + # Extract all text with coordinates + chars = pg.chars + for char in chars[:10]: # First 10 characters + print(f"Char: '{char['text']}' at x:{char['x0']:.1f} y:{char['y0']:.1f}") + + # Extract text by bounding box (left, top, right, bottom) + region_text = pg.within_bbox((100, 100, 400, 200)).extract_text() 
+``` + +#### Advanced Table Extraction with Custom Settings +```python +import pdfplumber +import pandas as pd + +with pdfplumber.open("complex_table.pdf") as doc: + pg = doc.pages[0] + + # Extract tables with custom settings for complex layouts + table_config = { + "vertical_strategy": "lines", + "horizontal_strategy": "lines", + "snap_tolerance": 3, + "intersection_tolerance": 15 + } + tables = pg.extract_tables(table_config) + + # Visual debugging for table extraction + img = pg.to_image(resolution=150) + img.save("debug_layout.png") +``` + +### reportlab Advanced Features + +#### Create Professional Reports with Tables +```python +from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph +from reportlab.lib.styles import getSampleStyleSheet +from reportlab.lib import colors + +# Sample data +data = [ + ['Product', 'Q1', 'Q2', 'Q3', 'Q4'], + ['Widgets', '120', '135', '142', '158'], + ['Gadgets', '85', '92', '98', '105'] +] + +# Create document with table +doc = SimpleDocTemplate("report.pdf") +elements = [] + +# Add title +styles = getSampleStyleSheet() +title = Paragraph("Quarterly Sales Report", styles['Title']) +elements.append(title) + +# Add table with advanced styling +table = Table(data) +table.setStyle(TableStyle([ + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), + ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), + ('FONTSIZE', (0, 0), (-1, 0), 14), + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), + ('BACKGROUND', (0, 1), (-1, -1), colors.beige), + ('GRID', (0, 0), (-1, -1), 1, colors.black) +])) +elements.append(table) + +doc.build(elements) +``` + +## Complex Workflows + +### Extract Figures/Images from PDF + +#### Method 1: Using pdfimages (fastest) +```bash +# Extract all images with original quality +pdfimages -all sample.pdf images/img +``` + +#### Method 2: Using pypdfium2 + Image Processing +```python +import pypdfium2 as pdfium +from PIL 
import Image +import numpy as np + +def extract_figures(pdf_path, output_dir): + doc = pdfium.PdfDocument(pdf_path) + + for page_num, pg in enumerate(doc): + # Render high-resolution page + bitmap = pg.render(scale=3.0) + img = bitmap.to_pil() + + # Convert to numpy for processing + img_array = np.array(img) + + # Simple figure detection (non-white regions) + mask = np.any(img_array != [255, 255, 255], axis=2) + + # Find contours and extract bounding boxes + # (This is simplified - real implementation would need more sophisticated detection) + + # Save detected figures + # ... implementation depends on specific needs +``` + +### Batch Document Processing with Error Handling +```python +import os +import glob +from pypdf import PdfReader, PdfWriter +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def batch_process(input_dir, operation='merge'): + pdf_files = glob.glob(os.path.join(input_dir, "*.pdf")) + + if operation == 'merge': + output = PdfWriter() + for pdf_file in pdf_files: + try: + doc = PdfReader(pdf_file) + for pg in doc.pages: + output.add_page(pg) + logger.info(f"Processed: {pdf_file}") + except Exception as e: + logger.error(f"Failed to process {pdf_file}: {e}") + continue + + with open("batch_combined.pdf", "wb") as out_file: + output.write(out_file) + + elif operation == 'extract_text': + for pdf_file in pdf_files: + try: + doc = PdfReader(pdf_file) + content = "" + for pg in doc.pages: + content += pg.extract_text() + + output_file = pdf_file.replace('.pdf', '.txt') + with open(output_file, 'w', encoding='utf-8') as f: + f.write(content) + logger.info(f"Extracted text from: {pdf_file}") + + except Exception as e: + logger.error(f"Failed to extract text from {pdf_file}: {e}") + continue +``` + +### Advanced Page Cropping +```python +from pypdf import PdfWriter, PdfReader + +doc = PdfReader("source.pdf") +output = PdfWriter() + +# Crop page (left, bottom, right, top in points) +pg = doc.pages[0] 
+pg.mediabox.left = 50 +pg.mediabox.bottom = 50 +pg.mediabox.right = 550 +pg.mediabox.top = 750 + +output.add_page(pg) +with open("cropped.pdf", "wb") as out_file: + output.write(out_file) +``` + +## Performance Optimization Tips + +### 1. For Large Documents +- Use streaming approaches instead of loading entire document in memory +- Use `qpdf --split-pages` for splitting large files +- Process pages individually with pypdfium2 + +### 2. For Text Extraction +- `pdftotext -bbox-layout` is fastest for plain text extraction +- Use pdfplumber for structured data and tables +- Avoid `pypdf.extract_text()` for very large documents + +### 3. For Image Extraction +- `pdfimages` is much faster than rendering pages +- Use low resolution for previews, high resolution for final output + +### 4. For Form Filling +- pdf-lib maintains form structure better than most alternatives +- Pre-validate form fields before processing + +### 5. Memory Management +```python +# Process documents in chunks +def process_large_document(pdf_path, chunk_size=10): + doc = PdfReader(pdf_path) + total_pages = len(doc.pages) + + for start_idx in range(0, total_pages, chunk_size): + end_idx = min(start_idx + chunk_size, total_pages) + output = PdfWriter() + + for i in range(start_idx, end_idx): + output.add_page(doc.pages[i]) + + # Process chunk + with open(f"chunk_{start_idx//chunk_size}.pdf", "wb") as out_file: + output.write(out_file) +``` + +## Troubleshooting Common Issues + +### Encrypted Documents +```python +# Handle password-protected documents +from pypdf import PdfReader + +try: + doc = PdfReader("secured.pdf") + if doc.is_encrypted: + doc.decrypt("password") +except Exception as e: + print(f"Failed to decrypt: {e}") +``` + +### Corrupted Documents +```bash +# Use qpdf to repair +qpdf --check corrupted.pdf +qpdf --replace-input corrupted.pdf +``` + +### Text Extraction Issues +```python +# Fallback to OCR for scanned documents +import pytesseract +from pdf2image import convert_from_path + 
+def extract_text_with_ocr(pdf_path):
+    pages = convert_from_path(pdf_path)
+    content = ""
+    for idx, img in enumerate(pages):
+        content += pytesseract.image_to_string(img)
+    return content
+```
+
+## License Information
+
+- **pypdf**: BSD License
+- **pdfplumber**: MIT License
+- **pypdfium2**: Apache/BSD License
+- **reportlab**: BSD License
+- **poppler-utils**: GPL-2 License
+- **qpdf**: Apache License
+- **pdf-lib**: MIT License
+- **pdfjs-dist**: Apache License
diff --git a/deploy/data/skills/pdf/form-handler.md b/deploy/data/skills/pdf/form-handler.md
new file mode 100644
index 000000000..eed049c2e
--- /dev/null
+++ b/deploy/data/skills/pdf/form-handler.md
@@ -0,0 +1,277 @@
+**IMPORTANT: Follow these steps sequentially. Do not proceed to code writing without completing earlier steps.**
+
+When you need to complete a PDF form, first determine whether it contains interactive form fields. Execute this utility from this file's directory:
+ `python utils/detect_interactive_fields.py <pdf_path>`, then proceed to either "Interactive Form Fields" or "Static Form Layout" sections based on the output.
+
+# Interactive Form Fields
+If the document contains interactive form fields:
+- Execute from this file's directory: `python utils/parse_form_structure.py <pdf_path>`. 
This generates a JSON file listing all fields: +``` +[ + { + "element_id": (unique identifier for the field), + "page_num": (page number, 1-indexed), + "bounds": ([left, bottom, right, top] bounding box in PDF coordinates, y=0 is page bottom), + "element_type": ("text_input", "toggle_box", "option_group", or "dropdown"), + }, + // Toggle boxes include "on_value" and "off_value" properties: + { + "element_id": (unique identifier for the field), + "page_num": (page number, 1-indexed), + "element_type": "toggle_box", + "on_value": (Use this value to activate the toggle), + "off_value": (Use this value to deactivate the toggle), + }, + // Option groups contain "available_options" array with selectable choices: + { + "element_id": (unique identifier for the field), + "page_num": (page number, 1-indexed), + "element_type": "option_group", + "available_options": [ + { + "option_value": (set field to this value to select this option), + "bounds": (bounding box for this option's selector) + }, + // Additional options + ] + }, + // Dropdown fields contain "menu_items" array: + { + "element_id": (unique identifier for the field), + "page_num": (page number, 1-indexed), + "element_type": "dropdown", + "menu_items": [ + { + "option_value": (set field to this value to select this item), + "display_text": (visible text of the item) + }, + // Additional menu items + ], + } +] +``` +- Generate page images (one PNG per page) with this utility (run from this file's directory): +`python utils/render_pages_to_png.py ` +Then examine the images to understand each field's purpose (convert bounding box PDF coordinates to image coordinates as needed). 
+- Create a `form_data.json` file with values for each field: +``` +[ + { + "element_id": "surname", // Must match element_id from `parse_form_structure.py` + "description": "Family name of the applicant", + "page_num": 1, // Must match "page_num" from structure_info.json + "fill_value": "Johnson" + }, + { + "element_id": "Checkbox12", + "description": "Toggle to mark if applicant is an adult", + "page_num": 1, + "fill_value": "/On" // For toggles, use "on_value" to activate. For option groups, use "option_value" from available_options. + }, + // additional fields +] +``` +- Execute the `populate_interactive_form.py` utility from this file's directory to generate the completed PDF: +`python utils/populate_interactive_form.py ` +This utility validates field IDs and values; if errors appear, correct them and retry. + +# Static Form Layout +For PDFs without interactive form fields, you must visually identify data entry locations and create text overlays. Execute these steps *precisely*. All steps are mandatory for accurate form completion. Detailed instructions follow. +- Render the PDF as PNG images and determine field positioning. +- Create a JSON configuration with field data and generate overlay preview images. +- Validate the positioning. +- Apply the text overlays to complete the form. + +## Step 1: Visual Inspection (MANDATORY) +- Render the PDF as PNG images. Execute from this file's directory: +`python utils/render_pages_to_png.py ` +This creates one PNG per page. +- Carefully review each image and locate all data entry areas. For each field, determine bounding boxes for both the field label and the data entry area. These boxes MUST NOT OVERLAP; the entry area should only cover the space for data input. Typically, entry areas are positioned adjacent to, above, or below their labels. Entry boxes must accommodate the expected text. 
+ +Common form patterns you may encounter: + +*Label within bordered area* +``` ++------------------------+ +| Full Name: | ++------------------------+ +``` +The data entry area should be to the right of "Full Name" and extend to the border. + +*Label preceding underline* +``` +Email: _______________________ +``` +The data entry area should be above the underline spanning its width. + +*Label below underline* +``` +_________________________ +Signature +``` +The data entry area should be above the underline across its full width. Common for signatures and dates. + +*Label above underline* +``` +Additional comments: +________________________________________________ +``` +The data entry area extends from below the label to the underline, spanning its width. + +*Toggle boxes* +``` +Are you employed? Yes [] No [] +``` +For toggle boxes: +- Identify small square markers ([]) - these are the target elements. They may appear before or after their labels. +- Distinguish between label text ("Yes", "No") and the actual toggle squares. +- The entry bounding box should cover ONLY the square marker, not the label text. + +### Step 2: Create form_config.json and preview images (MANDATORY) +- Create `form_config.json` with field positioning data: +``` +{ + "page_dimensions": [ + { + "page_num": 1, + "img_width": (page 1 image width in pixels), + "img_height": (page 1 image height in pixels), + }, + { + "page_num": 2, + "img_width": (page 2 image width in pixels), + "img_height": (page 2 image height in pixels), + } + // more pages + ], + "field_entries": [ + // Text field example + { + "page_num": 1, + "description": "Enter applicant's surname here", + // Bounding boxes use [left, top, right, bottom] format. Label and entry boxes must not overlap. 
+ "label_text": "Last name", + "label_bounds": [30, 125, 95, 142], + "entry_bounds": [100, 125, 280, 142], + "text_content": { + "content": "Smith", // Text to overlay at entry_bounds location + "text_size": 14, // optional, defaults to 14 + "text_color": "000000", // optional, RRGGBB format, defaults to 000000 (black) + } + }, + // Toggle box example. TARGET THE SQUARE MARKER, NOT THE TEXT + { + "page_num": 2, + "description": "Mark if applicant is over 21", + "entry_bounds": [140, 525, 155, 540], // Small area over toggle square + "label_text": "Yes", + "label_bounds": [100, 525, 132, 540], // Area containing "Yes" label + // Use "X" to mark a toggle box. + "text_content": { + "content": "X", + } + } + // more field entries + ] +} +``` + +Generate preview images by running this utility from this file's directory for each page: +`python utils/generate_preview_overlay.py + +Preview images display red rectangles for data entry areas and blue rectangles for label areas. + +### Step 3: Verify Positioning (MANDATORY) +#### Automated overlap detection +- Verify no bounding boxes overlap and entry boxes have sufficient height using the `verify_form_layout.py` utility (run from this file's directory): +`python utils/verify_form_layout.py ` + +If errors occur, re-examine the affected fields, adjust positioning, and iterate until all errors are resolved. Remember: label (blue) boxes contain text labels; entry (red) boxes should not. + +#### Manual preview inspection +**CRITICAL: Do not continue without visually reviewing preview images** +- Red rectangles must cover ONLY data entry areas +- Red rectangles MUST NOT contain any existing text +- Blue rectangles should encompass label text +- For toggle boxes: + - Red rectangle MUST be centered on the toggle square + - Blue rectangle should cover the toggle's text label + +- If any positioning appears incorrect, update form_config.json, regenerate previews, and verify again. Repeat until all positioning is accurate. 
+
+
+### Step 4: Apply text overlays to the PDF
+Execute this utility from this file's directory to generate the completed PDF using form_config.json:
+`python utils/apply_text_overlays.py [input.pdf] [form_config.json] [output.pdf]`
+
+# CJK (Chinese/Japanese/Korean) Font Support
+
+## Important: CJK Text Display Issues
+
+**Warning**: Standard PDF fonts (Arial, Helvetica, etc.) do not support CJK characters. Without a proper CJK font, Chinese/Japanese/Korean text will display as black boxes (■).
+
+The `apply_text_overlays.py` utility:
+1. Automatically detects CJK characters in your text content
+2. Searches for available CJK fonts on your system
+3. **Exits with an error if CJK characters are detected but no CJK font is found**
+
+## Supported System Fonts
+
+The utility searches for CJK fonts in these locations:
+
+**macOS:**
+- `/System/Library/Fonts/PingFang.ttc` (PingFang) - pre-installed
+- `/System/Library/Fonts/STHeiti Light.ttc` (STHeiti)
+- `/Library/Fonts/Arial Unicode.ttf` (Arial Unicode)
+
+**Windows:**
+- `C:/Windows/Fonts/msyh.ttc` (Microsoft YaHei) - pre-installed
+- `C:/Windows/Fonts/simsun.ttc` (SimSun)
+- `C:/Windows/Fonts/simhei.ttf` (SimHei)
+
+**Linux:**
+- `/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf`
+- `/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc`
+- `/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc`
+
+## If You See "No CJK Font Found" Error
+
+The script will exit with an error if CJK text is detected but no font is available. Install a CJK font:
+
+```bash
+# Ubuntu/Debian
+sudo apt-get install fonts-noto-cjk
+# or
+sudo apt-get install fonts-wqy-zenhei
+
+# Fedora/RHEL
+sudo dnf install google-noto-sans-cjk-fonts
+
+# macOS - PingFang is pre-installed, no action needed
+# Windows - Microsoft YaHei is pre-installed, no action needed
+```
+
+You can also add a custom font path by modifying the `CJK_FONT_PATHS` dictionary in `apply_text_overlays.py`.
+ +## Example with CJK Text + +```json +{ + "page_dimensions": [{"page_num": 1, "img_width": 800, "img_height": 1000}], + "field_entries": [ + { + "page_num": 1, + "description": "Applicant name in Chinese", + "label_text": "姓名", + "label_bounds": [30, 125, 70, 145], + "entry_bounds": [80, 125, 280, 145], + "text_content": { + "content": "张三", + "text_size": 14 + } + } + ] +} +``` + +The utility will automatically detect the Chinese characters and use an appropriate CJK font for rendering. diff --git a/deploy/data/skills/pdf/utils/apply_text_overlays.py b/deploy/data/skills/pdf/utils/apply_text_overlays.py new file mode 100644 index 000000000..a5107886e --- /dev/null +++ b/deploy/data/skills/pdf/utils/apply_text_overlays.py @@ -0,0 +1,280 @@ +import json +import os +import sys +import tempfile + +from pypdf import PdfReader, PdfWriter +from pypdf.annotations import FreeText +from reportlab.pdfgen import canvas +from reportlab.pdfbase import pdfmetrics +from reportlab.pdfbase.ttfonts import TTFont +from reportlab.lib.colors import HexColor + + +# Completes a PDF by adding text overlays defined in `form_config.json`. See form-handler.md. 
+ + +# Common CJK font paths for different operating systems +CJK_FONT_PATHS = { + # macOS + "/System/Library/Fonts/PingFang.ttc": "PingFang", + "/System/Library/Fonts/STHeiti Light.ttc": "STHeiti", + "/Library/Fonts/Arial Unicode.ttf": "ArialUnicode", + # Windows + "C:/Windows/Fonts/msyh.ttc": "MicrosoftYaHei", + "C:/Windows/Fonts/simsun.ttc": "SimSun", + "C:/Windows/Fonts/simhei.ttf": "SimHei", + # Linux + "/usr/share/fonts/truetype/droid/DroidSansFallbackFull.ttf": "DroidSansFallback", + "/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc": "NotoSansCJK", + "/usr/share/fonts/truetype/wqy/wqy-zenhei.ttc": "WenQuanYi", +} + + +def has_cjk_characters(text): + """Check if text contains CJK (Chinese/Japanese/Korean) characters""" + for char in text: + code = ord(char) + # CJK Unified Ideographs and common CJK ranges + if (0x4E00 <= code <= 0x9FFF or # CJK Unified Ideographs + 0x3400 <= code <= 0x4DBF or # CJK Unified Ideographs Extension A + 0x3000 <= code <= 0x303F or # CJK Symbols and Punctuation + 0xFF00 <= code <= 0xFFEF or # Halfwidth and Fullwidth Forms + 0x3040 <= code <= 0x309F or # Hiragana + 0x30A0 <= code <= 0x30FF or # Katakana + 0xAC00 <= code <= 0xD7AF): # Hangul Syllables + return True + return False + + +def find_cjk_font(): + """Find an available CJK font on the system""" + for font_path, font_name in CJK_FONT_PATHS.items(): + if os.path.exists(font_path): + return font_path, font_name + return None, None + + +def register_cjk_font(): + """Register a CJK font with reportlab if available""" + font_path, font_name = find_cjk_font() + if font_path: + try: + pdfmetrics.registerFont(TTFont(font_name, font_path)) + return font_name + except Exception as e: + print(f"警告: 注册 CJK 字体 {font_name} 失败: {e}") + return None + + +def convert_image_to_pdf_coords(bbox, img_width, img_height, pdf_width, pdf_height): + """Transform bounding box from image coordinates to PDF coordinates""" + # Image coordinates: origin at top-left, y increases downward + # PDF 
coordinates: origin at bottom-left, y increases upward + x_scale = pdf_width / img_width + y_scale = pdf_height / img_height + + left = bbox[0] * x_scale + right = bbox[2] * x_scale + + # Flip Y coordinates for PDF + top = pdf_height - (bbox[1] * y_scale) + bottom = pdf_height - (bbox[3] * y_scale) + + return left, bottom, right, top + + +def apply_text_overlays(input_pdf_path, config_json_path, output_pdf_path): + """Apply text overlays to PDF based on form_config.json""" + + # `form_config.json` format described in form-handler.md. + with open(config_json_path, "r") as f: + config = json.load(f) + + # Open the PDF + reader = PdfReader(input_pdf_path) + writer = PdfWriter() + + # Copy all pages to writer + writer.append(reader) + + # Get PDF dimensions for each page + pdf_dims = {} + for idx, pg in enumerate(reader.pages): + mediabox = pg.mediabox + pdf_dims[idx + 1] = [float(mediabox.width), float(mediabox.height)] + + # Check if any text contains CJK characters + has_cjk = False + for entry in config["field_entries"]: + if "text_content" in entry and "content" in entry["text_content"]: + if has_cjk_characters(entry["text_content"]["content"]): + has_cjk = True + break + + # If CJK text detected, use reportlab method for proper font embedding + if has_cjk: + cjk_font_name = register_cjk_font() + if cjk_font_name: + print(f"检测到中日韩文字,使用嵌入字体: {cjk_font_name}") + apply_text_overlays_with_reportlab( + reader, writer, config, pdf_dims, cjk_font_name, output_pdf_path + ) + return + else: + print("错误: 检测到中日韩文字,但系统中未找到 CJK 字体。") + print("中文/日文/韩文将显示为方块(■)。") + print("") + print("请安装 CJK 字体后重试:") + print(" macOS: 系统已预装 PingFang 字体") + print(" Windows: 系统已预装 Microsoft YaHei 字体") + print(" Linux: sudo apt-get install fonts-noto-cjk") + print("") + print("支持的字体路径:") + for path, name in CJK_FONT_PATHS.items(): + print(f" - {path} ({name})") + sys.exit(1) + + # Process each field entry using standard FreeText annotation + overlay_annotations = [] + for entry in 
config["field_entries"]: + page_num = entry["page_num"] + + # Get page dimensions and transform coordinates. + page_info = next(p for p in config["page_dimensions"] if p["page_num"] == page_num) + img_width = page_info["img_width"] + img_height = page_info["img_height"] + pdf_width, pdf_height = pdf_dims[page_num] + + transformed_bounds = convert_image_to_pdf_coords( + entry["entry_bounds"], + img_width, img_height, + pdf_width, pdf_height + ) + + # Skip empty fields + if "text_content" not in entry or "content" not in entry["text_content"]: + continue + text_content = entry["text_content"] + content = text_content["content"] + if not content: + continue + + font_name = text_content.get("font", "Arial") + text_size = str(text_content.get("text_size", 14)) + "pt" + text_color = text_content.get("text_color", "000000") + + # Font size/color may not render consistently across viewers: + # https://github.com/py-pdf/pypdf/issues/2084 + annotation = FreeText( + text=content, + rect=transformed_bounds, + font=font_name, + font_size=text_size, + font_color=text_color, + border_color=None, + background_color=None, + ) + overlay_annotations.append(annotation) + # page_num is 0-based for pypdf + writer.add_annotation(page_number=page_num - 1, annotation=annotation) + + # Save the completed PDF + with open(output_pdf_path, "wb") as out_file: + writer.write(out_file) + + print(f"成功添加文本叠加层并保存到 {output_pdf_path}") + print(f"共添加 {len(overlay_annotations)} 个文本叠加") + + +def apply_text_overlays_with_reportlab(reader, writer, config, pdf_dims, cjk_font_name, output_pdf_path): + """Apply text overlays using reportlab for proper CJK font embedding""" + + # Group entries by page + entries_by_page = {} + for entry in config["field_entries"]: + if "text_content" in entry and "content" in entry["text_content"]: + content = entry["text_content"]["content"] + if content: + page_num = entry["page_num"] + if page_num not in entries_by_page: + entries_by_page[page_num] = [] + 
entries_by_page[page_num].append(entry) + + total_overlays = 0 + + # Create overlay PDF for each page with text + for page_num, entries in entries_by_page.items(): + pdf_width, pdf_height = pdf_dims[page_num] + page_info = next(p for p in config["page_dimensions"] if p["page_num"] == page_num) + img_width = page_info["img_width"] + img_height = page_info["img_height"] + + # Create temporary overlay PDF using reportlab + with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_file: + tmp_path = tmp_file.name + + try: + c = canvas.Canvas(tmp_path, pagesize=(pdf_width, pdf_height)) + + for entry in entries: + text_content = entry["text_content"] + content = text_content["content"] + text_size = text_content.get("text_size", 14) + text_color = text_content.get("text_color", "000000") + + # Transform coordinates + left, bottom, right, top = convert_image_to_pdf_coords( + entry["entry_bounds"], + img_width, img_height, + pdf_width, pdf_height + ) + + # Set font - use CJK font for CJK text, otherwise use specified font + if has_cjk_characters(content): + c.setFont(cjk_font_name, text_size) + else: + try: + c.setFont(text_content.get("font", "Helvetica"), text_size) + except: + c.setFont("Helvetica", text_size) + + # Set color + try: + c.setFillColor(HexColor(f"#{text_color}")) + except: + c.setFillColor(HexColor("#000000")) + + # Draw text at the position (left, bottom) + c.drawString(left, bottom, content) + total_overlays += 1 + + c.save() + + # Merge overlay with the page + overlay_reader = PdfReader(tmp_path) + overlay_page = overlay_reader.pages[0] + writer.pages[page_num - 1].merge_page(overlay_page) + + finally: + # Clean up temp file + if os.path.exists(tmp_path): + os.unlink(tmp_path) + + # Save the completed PDF + with open(output_pdf_path, "wb") as out_file: + writer.write(out_file) + + print(f"成功添加文本叠加层并保存到 {output_pdf_path}") + print(f"共添加 {total_overlays} 个文本叠加(使用 CJK 字体嵌入)") + + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("用法: 
apply_text_overlays.py [输入PDF] [form_config.json] [输出PDF]") + sys.exit(1) + input_pdf = sys.argv[1] + config_json = sys.argv[2] + output_pdf = sys.argv[3] + + apply_text_overlays(input_pdf, config_json, output_pdf) diff --git a/deploy/data/skills/pdf/utils/detect_interactive_fields.py b/deploy/data/skills/pdf/utils/detect_interactive_fields.py new file mode 100644 index 000000000..23ea3e7e4 --- /dev/null +++ b/deploy/data/skills/pdf/utils/detect_interactive_fields.py @@ -0,0 +1,12 @@ +import sys +from pypdf import PdfReader + + +# 检测 PDF 是否包含交互式表单字段的工具。参见 form-handler.md。 + + +doc = PdfReader(sys.argv[1]) +if (doc.get_fields()): + print("此 PDF 包含交互式表单字段") +else: + print("此 PDF 不包含交互式表单字段;需要通过视觉分析确定数据输入位置") diff --git a/deploy/data/skills/pdf/utils/generate_preview_overlay.py b/deploy/data/skills/pdf/utils/generate_preview_overlay.py new file mode 100644 index 000000000..8df2e8d7f --- /dev/null +++ b/deploy/data/skills/pdf/utils/generate_preview_overlay.py @@ -0,0 +1,40 @@ +import json +import sys + +from PIL import Image, ImageDraw + + +# 生成带有边界框矩形的"预览"图片,用于在 PDF 中确定文本叠加位置。参见 form-handler.md。 + + +def generate_preview(page_num, config_json_path, source_path, preview_path): + # 输入文件应采用 form-handler.md 中描述的 `form_config.json` 格式。 + with open(config_json_path, 'r') as f: + config = json.load(f) + + img = Image.open(source_path) + draw = ImageDraw.Draw(img) + box_count = 0 + + for entry in config["field_entries"]: + if entry["page_num"] == page_num: + entry_box = entry['entry_bounds'] + label_box = entry['label_bounds'] + # 在输入区域绘制红色矩形,在标签区域绘制蓝色矩形。 + draw.rectangle(entry_box, outline='red', width=2) + draw.rectangle(label_box, outline='blue', width=2) + box_count += 2 + + img.save(preview_path) + print(f"已生成预览图片 {preview_path},包含 {box_count} 个边界框") + + +if __name__ == "__main__": + if len(sys.argv) != 5: + print("用法: generate_preview_overlay.py [页码] [form_config.json文件] [源图片路径] [预览图片路径]") + sys.exit(1) + page_num = int(sys.argv[1]) + config_json_path = sys.argv[2] + 
source_image_path = sys.argv[3] + preview_image_path = sys.argv[4] + generate_preview(page_num, config_json_path, source_image_path, preview_image_path) diff --git a/deploy/data/skills/pdf/utils/parse_form_structure.py b/deploy/data/skills/pdf/utils/parse_form_structure.py new file mode 100644 index 000000000..344767b97 --- /dev/null +++ b/deploy/data/skills/pdf/utils/parse_form_structure.py @@ -0,0 +1,149 @@ +import json +import sys + +from pypdf import PdfReader + + +# 解析 PDF 中的交互式表单字段数据并输出 JSON 用于表单填写。参见 form-handler.md。 + + +# 匹配 PdfReader `get_fields` 和 `update_page_form_field_values` 方法使用的格式。 +def build_complete_element_id(annotation): + parts = [] + while annotation: + name = annotation.get('/T') + if name: + parts.append(name) + annotation = annotation.get('/Parent') + return ".".join(reversed(parts)) if parts else None + + +def build_element_dict(field, element_id): + element_dict = {"element_id": element_id} + field_type = field.get('/FT') + if field_type == "/Tx": + element_dict["element_type"] = "text_input" + elif field_type == "/Btn": + element_dict["element_type"] = "toggle_box" # 选项组单独处理 + available_states = field.get("/_States_", []) + if len(available_states) == 2: + # "/Off" 通常是未选中值,参见 PDF 规范: + # https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448 + # 它可能出现在 "/_States_" 列表的任一位置。 + if "/Off" in available_states: + element_dict["on_value"] = available_states[0] if available_states[0] != "/Off" else available_states[1] + element_dict["off_value"] = "/Off" + else: + print(f"切换框 `${element_id}` 的状态值异常。其开/关值可能不正确;请通过视觉检查验证结果。") + element_dict["on_value"] = available_states[0] + element_dict["off_value"] = available_states[1] + elif field_type == "/Ch": + element_dict["element_type"] = "dropdown" + available_states = field.get("/_States_", []) + element_dict["menu_items"] = [{ + "option_value": state[0], + "display_text": state[1], + } for state in available_states] + else: + 
element_dict["element_type"] = f"unknown ({field_type})" + return element_dict + + +# Returns a list of interactive PDF form elements: +# [ +# { +# "element_id": "name", +# "page_num": 1, +# "element_type": ("text_input", "toggle_box", "option_group", or "dropdown") +# // Per-type additional properties described in form-handler.md +# }, +# ] +def parse_form_elements(reader: PdfReader): + fields = reader.get_fields() + + elements_by_id = {} + potential_option_groups = set() + + for element_id, field in fields.items(): + # Skip container fields with children, except possible parent groups for radio options. + if field.get("/Kids"): + if field.get("/FT") == "/Btn": + potential_option_groups.add(element_id) + continue + elements_by_id[element_id] = build_element_dict(field, element_id) + + # Bounding rectangles are stored in annotations within page objects. + + # Radio option elements have a separate annotation for each choice; + # all choices share the same element name. + # See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html + option_groups_by_id = {} + + for page_idx, pg in enumerate(reader.pages): + annotations = pg.get('/Annots', []) + for ann in annotations: + element_id = build_complete_element_id(ann) + if element_id in elements_by_id: + elements_by_id[element_id]["page_num"] = page_idx + 1 + elements_by_id[element_id]["bounds"] = ann.get('/Rect') + elif element_id in potential_option_groups: + try: + # ann['/AP']['/N'] should have two items. One is '/Off', + # the other is the active value. 
+ active_values = [v for v in ann["/AP"]["/N"] if v != "/Off"] + except KeyError: + continue + if len(active_values) == 1: + bounds = ann.get("/Rect") + if element_id not in option_groups_by_id: + option_groups_by_id[element_id] = { + "element_id": element_id, + "element_type": "option_group", + "page_num": page_idx + 1, + "available_options": [], + } + # Note: macOS Preview.app may not display selected + # radio options correctly (removing leading slash helps there + # but breaks Chrome/Firefox/Acrobat/etc). + option_groups_by_id[element_id]["available_options"].append({ + "option_value": active_values[0], + "bounds": bounds, + }) + + # Some PDFs have form element definitions without corresponding annotations, + # so we can't determine their location. Exclude these elements. + elements_with_location = [] + for element in elements_by_id.values(): + if "page_num" in element: + elements_with_location.append(element) + else: + print(f"无法确定元素 ID: {element.get('element_id')} 的位置,已排除") + + # Sort by page number, then Y position (flipped in PDF coordinate system), then X. 
+ def sort_key(elem): + if "available_options" in elem: + bounds = elem["available_options"][0]["bounds"] or [0, 0, 0, 0] + else: + bounds = elem.get("bounds") or [0, 0, 0, 0] + adjusted_pos = [-bounds[1], bounds[0]] + return [elem.get("page_num"), adjusted_pos] + + sorted_elements = elements_with_location + list(option_groups_by_id.values()) + sorted_elements.sort(key=sort_key) + + return sorted_elements + + +def export_form_structure(pdf_path: str, json_output_path: str): + reader = PdfReader(pdf_path) + elements = parse_form_elements(reader) + with open(json_output_path, "w") as f: + json.dump(elements, f, indent=2) + print(f"已将 {len(elements)} 个表单元素写入 {json_output_path}") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("用法: parse_form_structure.py [输入PDF] [输出JSON]") + sys.exit(1) + export_form_structure(sys.argv[1], sys.argv[2]) diff --git a/deploy/data/skills/pdf/utils/populate_interactive_form.py b/deploy/data/skills/pdf/utils/populate_interactive_form.py new file mode 100644 index 000000000..5ef1361b8 --- /dev/null +++ b/deploy/data/skills/pdf/utils/populate_interactive_form.py @@ -0,0 +1,114 @@ +import json +import sys + +from pypdf import PdfReader, PdfWriter + +from parse_form_structure import parse_form_elements + + +# 填充 PDF 中的交互式表单字段。参见 form-handler.md。 + + +def populate_form_fields(input_pdf_path: str, form_data_path: str, output_pdf_path: str): + with open(form_data_path) as f: + form_data = json.load(f) + # Group by page number. 
+ data_by_page = {} + for entry in form_data: + if "fill_value" in entry: + element_id = entry["element_id"] + page_num = entry["page_num"] + if page_num not in data_by_page: + data_by_page[page_num] = {} + data_by_page[page_num][element_id] = entry["fill_value"] + + reader = PdfReader(input_pdf_path) + + has_errors = False + elements = parse_form_elements(reader) + elements_by_id = {e["element_id"]: e for e in elements} + for entry in form_data: + existing_element = elements_by_id.get(entry["element_id"]) + if not existing_element: + has_errors = True + print(f"错误: `{entry['element_id']}` 不是有效的元素 ID") + elif entry["page_num"] != existing_element["page_num"]: + has_errors = True + print(f"错误: `{entry['element_id']}` 的页码不正确(得到 {entry['page_num']},期望 {existing_element['page_num']})") + else: + if "fill_value" in entry: + err = validate_element_value(existing_element, entry["fill_value"]) + if err: + print(err) + has_errors = True + if has_errors: + sys.exit(1) + + writer = PdfWriter(clone_from=reader) + for page_num, field_values in data_by_page.items(): + writer.update_page_form_field_values(writer.pages[page_num - 1], field_values, auto_regenerate=False) + + # Required for many PDF viewers to format form values correctly. + # May cause "save changes" dialog even without user modifications. 
+ writer.set_need_appearances_writer(True) + + with open(output_pdf_path, "wb") as f: + writer.write(f) + + +def validate_element_value(element_info, fill_value): + element_type = element_info["element_type"] + element_id = element_info["element_id"] + if element_type == "toggle_box": + on_val = element_info["on_value"] + off_val = element_info["off_value"] + if fill_value != on_val and fill_value != off_val: + return f'错误: 切换元素 "{element_id}" 的值 "{fill_value}" 无效。开启值为 "{on_val}",关闭值为 "{off_val}"' + elif element_type == "option_group": + valid_values = [opt["option_value"] for opt in element_info["available_options"]] + if fill_value not in valid_values: + return f'错误: 选项组 "{element_id}" 的值 "{fill_value}" 无效。有效值为: {valid_values}' + elif element_type == "dropdown": + menu_values = [item["option_value"] for item in element_info["menu_items"]] + if fill_value not in menu_values: + return f'错误: 下拉框 "{element_id}" 的值 "{fill_value}" 无效。有效值为: {menu_values}' + return None + + +# pypdf (at least version 5.7.0) has a bug when setting values for selection list fields. +# In _writer.py around line 966: +# +# if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0: +# txt = "\n".join(annotation.get_inherited(FA.Opt, [])) +# +# The issue is that for selection lists, `get_inherited` returns a list of two-element lists like +# [["value1", "Text 1"], ["value2", "Text 2"], ...] +# This causes `join` to throw a TypeError because it expects an iterable of strings. +# The workaround is to patch `get_inherited` to return a list of value strings. +# We call the original method and adjust the return value only if the argument is +# `FA.Opt` and if the return value is a list of two-element lists. 
+def apply_pypdf_workaround(): + from pypdf.generic import DictionaryObject + from pypdf.constants import FieldDictionaryAttributes + + original_get_inherited = DictionaryObject.get_inherited + + def patched_get_inherited(self, key: str, default = None): + result = original_get_inherited(self, key, default) + if key == FieldDictionaryAttributes.Opt: + if isinstance(result, list) and all(isinstance(v, list) and len(v) == 2 for v in result): + result = [r[0] for r in result] + return result + + DictionaryObject.get_inherited = patched_get_inherited + + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("用法: populate_interactive_form.py [输入PDF] [form_data.json] [输出PDF]") + sys.exit(1) + apply_pypdf_workaround() + input_pdf = sys.argv[1] + form_data_path = sys.argv[2] + output_pdf = sys.argv[3] + populate_form_fields(input_pdf, form_data_path, output_pdf) diff --git a/deploy/data/skills/pdf/utils/render_pages_to_png.py b/deploy/data/skills/pdf/utils/render_pages_to_png.py new file mode 100644 index 000000000..eb25d7851 --- /dev/null +++ b/deploy/data/skills/pdf/utils/render_pages_to_png.py @@ -0,0 +1,35 @@ +import os +import sys + +from pdf2image import convert_from_path + + +# 将 PDF 文档的每一页渲染为 PNG 图片。 + + +def render_document(pdf_path, output_folder, max_dimension=1000): + page_images = convert_from_path(pdf_path, dpi=200) + + for idx, img in enumerate(page_images): + # 如果图片尺寸超过 max_dimension,则进行缩放 + w, h = img.size + if w > max_dimension or h > max_dimension: + scale = min(max_dimension / w, max_dimension / h) + new_w = int(w * scale) + new_h = int(h * scale) + img = img.resize((new_w, new_h)) + + img_path = os.path.join(output_folder, f"page_{idx+1}.png") + img.save(img_path) + print(f"已保存第 {idx+1} 页为 {img_path}(尺寸: {img.size})") + + print(f"共渲染 {len(page_images)} 页为 PNG 图片") + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("用法: render_pages_to_png.py [输入PDF] [输出目录]") + sys.exit(1) + pdf_path = sys.argv[1] + output_folder = sys.argv[2] + 
render_document(pdf_path, output_folder) diff --git a/deploy/data/skills/pdf/utils/verify_form_layout.py b/deploy/data/skills/pdf/utils/verify_form_layout.py new file mode 100644 index 000000000..b93df6d6a --- /dev/null +++ b/deploy/data/skills/pdf/utils/verify_form_layout.py @@ -0,0 +1,69 @@ +from dataclasses import dataclass +import json +import sys + + +# 验证分析 PDF 时创建的 `form_config.json` 文件是否存在重叠的边界框。参见 form-handler.md。 + + +@dataclass +class BoundsAndEntry: + bounds: list[float] + bounds_type: str + entry: dict + + +# 返回打印到标准输出供 Claude 读取的消息列表。 +def validate_form_layout(config_json_stream) -> list[str]: + messages = [] + config = json.load(config_json_stream) + messages.append(f"已读取 {len(config['field_entries'])} 个字段条目") + + def bounds_overlap(b1, b2): + no_horizontal_overlap = b1[0] >= b2[2] or b1[2] <= b2[0] + no_vertical_overlap = b1[1] >= b2[3] or b1[3] <= b2[1] + return not (no_horizontal_overlap or no_vertical_overlap) + + bounds_list = [] + for entry in config["field_entries"]: + bounds_list.append(BoundsAndEntry(entry["label_bounds"], "标签", entry)) + bounds_list.append(BoundsAndEntry(entry["entry_bounds"], "输入", entry)) + + found_error = False + for i, bi in enumerate(bounds_list): + # 时间复杂度 O(N^2);如有需要可优化。 + for j in range(i + 1, len(bounds_list)): + bj = bounds_list[j] + if bi.entry["page_num"] == bj.entry["page_num"] and bounds_overlap(bi.bounds, bj.bounds): + found_error = True + if bi.entry is bj.entry: + messages.append(f"失败: `{bi.entry['description']}` 的标签和输入边界框重叠 ({bi.bounds}, {bj.bounds})") + else: + messages.append(f"失败: `{bi.entry['description']}` 的{bi.bounds_type}边界框 ({bi.bounds}) 与 `{bj.entry['description']}` 的{bj.bounds_type}边界框 ({bj.bounds}) 重叠") + if len(messages) >= 20: + messages.append("中止后续检查;请修正边界框后重试") + return messages + if bi.bounds_type == "输入": + if "text_content" in bi.entry: + text_size = bi.entry["text_content"].get("text_size", 14) + entry_height = bi.bounds[3] - bi.bounds[1] + if entry_height < text_size: + found_error = 
True + messages.append(f"失败: `{bi.entry['description']}` 的输入边界框高度 ({entry_height}) 不足以容纳文本内容(文字大小: {text_size})。请增加边界框高度或减小文字大小。") + if len(messages) >= 20: + messages.append("中止后续检查;请修正边界框后重试") + return messages + + if not found_error: + messages.append("成功: 所有边界框均有效") + return messages + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("用法: verify_form_layout.py [form_config.json]") + sys.exit(1) + # 输入文件应采用 form-handler.md 中描述的 `form_config.json` 格式。 + with open(sys.argv[1]) as f: + messages = validate_form_layout(f) + for msg in messages: + print(msg) diff --git a/deploy/data/skills/pdf/utils/verify_form_layout_test.py b/deploy/data/skills/pdf/utils/verify_form_layout_test.py new file mode 100644 index 000000000..dbef6e23d --- /dev/null +++ b/deploy/data/skills/pdf/utils/verify_form_layout_test.py @@ -0,0 +1,226 @@ +import unittest +import json +import io +from verify_form_layout import validate_form_layout + + +# 此测试目前不在 CI 中自动运行;仅用于文档和手动验证。 +class TestValidateFormLayout(unittest.TestCase): + + def create_json_stream(self, data): + """辅助方法:从数据创建 JSON 流""" + return io.StringIO(json.dumps(data)) + + def test_no_overlaps(self): + """测试无边界框重叠的情况""" + data = { + "field_entries": [ + { + "description": "Name", + "page_num": 1, + "label_bounds": [10, 10, 50, 30], + "entry_bounds": [60, 10, 150, 30] + }, + { + "description": "Email", + "page_num": 1, + "label_bounds": [10, 40, 50, 60], + "entry_bounds": [60, 40, 150, 60] + } + ] + } + + stream = self.create_json_stream(data) + messages = validate_form_layout(stream) + self.assertTrue(any("成功" in msg for msg in messages)) + self.assertFalse(any("失败" in msg for msg in messages)) + + def test_label_entry_overlap_same_field(self): + """测试同一字段的标签和输入框重叠""" + data = { + "field_entries": [ + { + "description": "Name", + "page_num": 1, + "label_bounds": [10, 10, 60, 30], + "entry_bounds": [50, 10, 150, 30] # 与标签重叠 + } + ] + } + + stream = self.create_json_stream(data) + messages = validate_form_layout(stream) + 
self.assertTrue(any("失败" in msg and "重叠" in msg for msg in messages)) + self.assertFalse(any("成功" in msg for msg in messages)) + + def test_overlap_between_different_fields(self): + """测试不同字段边界框之间的重叠""" + data = { + "field_entries": [ + { + "description": "Name", + "page_num": 1, + "label_bounds": [10, 10, 50, 30], + "entry_bounds": [60, 10, 150, 30] + }, + { + "description": "Email", + "page_num": 1, + "label_bounds": [40, 20, 80, 40], # 与 Name 的边界框重叠 + "entry_bounds": [160, 10, 250, 30] + } + ] + } + + stream = self.create_json_stream(data) + messages = validate_form_layout(stream) + self.assertTrue(any("失败" in msg and "重叠" in msg for msg in messages)) + self.assertFalse(any("成功" in msg for msg in messages)) + + def test_different_pages_no_overlap(self): + """测试不同页面的边界框不算重叠""" + data = { + "field_entries": [ + { + "description": "Name", + "page_num": 1, + "label_bounds": [10, 10, 50, 30], + "entry_bounds": [60, 10, 150, 30] + }, + { + "description": "Email", + "page_num": 2, + "label_bounds": [10, 10, 50, 30], # 相同坐标但不同页面 + "entry_bounds": [60, 10, 150, 30] + } + ] + } + + stream = self.create_json_stream(data) + messages = validate_form_layout(stream) + self.assertTrue(any("成功" in msg for msg in messages)) + self.assertFalse(any("失败" in msg for msg in messages)) + + def test_entry_height_too_small(self): + """测试输入框高度是否根据文字大小检查""" + data = { + "field_entries": [ + { + "description": "Name", + "page_num": 1, + "label_bounds": [10, 10, 50, 30], + "entry_bounds": [60, 10, 150, 20], # 高度为 10 + "text_content": { + "text_size": 14 # 文字大小大于高度 + } + } + ] + } + + stream = self.create_json_stream(data) + messages = validate_form_layout(stream) + self.assertTrue(any("失败" in msg and "高度" in msg for msg in messages)) + self.assertFalse(any("成功" in msg for msg in messages)) + + def test_entry_height_adequate(self): + """测试输入框高度足够时通过验证""" + data = { + "field_entries": [ + { + "description": "Name", + "page_num": 1, + "label_bounds": [10, 10, 50, 30], + "entry_bounds": [60, 10, 
150, 30], # 高度为 20 + "text_content": { + "text_size": 14 # 文字大小小于高度 + } + } + ] + } + + stream = self.create_json_stream(data) + messages = validate_form_layout(stream) + self.assertTrue(any("成功" in msg for msg in messages)) + self.assertFalse(any("失败" in msg for msg in messages)) + + def test_default_text_size(self): + """测试未指定时使用默认文字大小""" + data = { + "field_entries": [ + { + "description": "Name", + "page_num": 1, + "label_bounds": [10, 10, 50, 30], + "entry_bounds": [60, 10, 150, 20], # 高度为 10 + "text_content": {} # 未指定 text_size,应使用默认值 14 + } + ] + } + + stream = self.create_json_stream(data) + messages = validate_form_layout(stream) + self.assertTrue(any("失败" in msg and "高度" in msg for msg in messages)) + self.assertFalse(any("成功" in msg for msg in messages)) + + def test_no_text_content(self): + """测试缺少 text_content 时不进行高度检查""" + data = { + "field_entries": [ + { + "description": "Name", + "page_num": 1, + "label_bounds": [10, 10, 50, 30], + "entry_bounds": [60, 10, 150, 20] # 高度较小但无 text_content + } + ] + } + + stream = self.create_json_stream(data) + messages = validate_form_layout(stream) + self.assertTrue(any("成功" in msg for msg in messages)) + self.assertFalse(any("失败" in msg for msg in messages)) + + def test_multiple_errors_limit(self): + """测试错误消息数量限制,防止输出过多""" + entries = [] + # 创建多个重叠字段 + for i in range(25): + entries.append({ + "description": f"Field{i}", + "page_num": 1, + "label_bounds": [10, 10, 50, 30], # 全部重叠 + "entry_bounds": [20, 15, 60, 35] # 全部重叠 + }) + + data = {"field_entries": entries} + + stream = self.create_json_stream(data) + messages = validate_form_layout(stream) + # 应在约 20 条消息后中止 + self.assertTrue(any("中止" in msg for msg in messages)) + # 应有一些失败消息但不应有数百条 + failure_count = sum(1 for msg in messages if "失败" in msg) + self.assertGreater(failure_count, 0) + self.assertLess(len(messages), 30) # 应受限制 + + def test_edge_touching_boxes(self): + """测试边缘相接的边界框不算重叠""" + data = { + "field_entries": [ + { + "description": "Name", + 
"page_num": 1, + "label_bounds": [10, 10, 50, 30], + "entry_bounds": [50, 10, 150, 30] # 在 x=50 处相接 + } + ] + } + + stream = self.create_json_stream(data) + messages = validate_form_layout(stream) + self.assertTrue(any("成功" in msg for msg in messages)) + self.assertFalse(any("失败" in msg for msg in messages)) + + +if __name__ == '__main__': + unittest.main() diff --git a/deploy/data/skills/pptx/SKILL.md b/deploy/data/skills/pptx/SKILL.md new file mode 100644 index 000000000..bbbe7196f --- /dev/null +++ b/deploy/data/skills/pptx/SKILL.md @@ -0,0 +1,576 @@ +--- +name: pptx +description: "PowerPoint document toolkit for slide generation, content modification, and presentation analysis. Ideal for: (1) Building new slide decks from scratch, (2) Editing existing presentation content, (3) Managing slide layouts and templates, (4) Inserting notes and annotations, or handling other presentation-related operations" +description_zh: "PowerPoint 文档工具包,用于幻灯片生成、内容修改和演示文稿分析。适用于:(1) 从头构建新的幻灯片,(2) 编辑现有演示文稿内容,(3) 管理幻灯片布局和模板,(4) 插入备注和批注,或处理其他演示文稿相关操作" +license: Proprietary. LICENSE.txt has complete terms +--- + +# PowerPoint Document Generation and Editing Toolkit + +## Introduction + +Users may request you to generate, modify, or analyze .pptx files. A .pptx file is fundamentally a ZIP container with XML documents and associated resources that can be inspected or altered. Different utilities and processes are available depending on the task requirements. + +## Extracting and Analyzing Content + +### Text Content Extraction +When you only need to retrieve textual content from slides, convert the presentation to markdown format: + +```bash +# Transform presentation to markdown +python -m markitdown path-to-file.pptx +``` + +### Direct XML Inspection +Direct XML inspection is required for: annotations, presenter notes, master layouts, transition effects, visual styling, and advanced formatting. For these capabilities, unpack the presentation and examine its XML structure. 
+ +#### Extracting Package Contents +`python openxml/scripts/extract.py <file.pptx> <output-dir>` + +**Note**: The extract.py script is located at `skills/pptx-v2/openxml/scripts/extract.py` relative to the project root. If unavailable at this path, use `find . -name "extract.py"` to locate it. + +#### Essential File Hierarchy +* `ppt/presentation.xml` - Core presentation metadata and slide references +* `ppt/slides/slide{N}.xml` - Individual slide content (slide1.xml, slide2.xml, etc.) +* `ppt/notesSlides/notesSlide{N}.xml` - Presenter notes per slide +* `ppt/comments/modernComment_*.xml` - Slide-specific annotations +* `ppt/slideLayouts/` - Layout template definitions +* `ppt/slideMasters/` - Master slide configurations +* `ppt/theme/` - Theme and styling definitions +* `ppt/media/` - Embedded images and media assets + +#### Typography and Color Extraction +**When provided with a reference design to replicate**: Analyze the presentation's typography and color scheme first using these approaches: +1. **Examine theme file**: Check `ppt/theme/theme1.xml` for color definitions (`<a:clrScheme>`) and font configurations (`<a:fontScheme>`) +2. **Inspect slide content**: Examine `ppt/slides/slide1.xml` for actual font usage (`<a:latin typeface="..."/>`) and color values +3. **Pattern search**: Use grep to locate color (`<a:srgbClr`, `<a:schemeClr`) and font references across all XML files
**Align palette with content**: Choose colors that complement the subject matter +4. **Plan visual elements**: Determine which slides require images, diagrams, or illustrations for better comprehension +5. **Document your approach**: Explain design decisions before writing code + +**Guidelines**: +- State your content-driven design approach BEFORE writing code +- Use universally available fonts: Arial, Helvetica, Times New Roman, Georgia, Courier New, Verdana, Tahoma, Trebuchet MS, Impact +- Establish visual hierarchy through size, weight, and color variations +- Prioritize readability: strong contrast, appropriately sized text, clean alignment +- Maintain consistency: repeat patterns, spacing, and visual language across slides +- **Incorporate images proactively**: Enhance presentations with relevant visuals (architecture diagrams, flowcharts, icons, illustrations) + +#### Color Palette Design + +**Developing creative color schemes**: +- **Move beyond defaults**: What colors authentically match this specific topic? Avoid automatic choices. +- **Explore multiple dimensions**: Topic, industry, mood, energy level, target audience, brand identity (if applicable) +- **Experiment boldly**: Try unexpected combinations - a healthcare presentation doesn't require green, finance doesn't require navy +- **Construct your palette**: Select 3-5 harmonious colors (dominant colors + supporting tones + accent) +- **Verify contrast**: Text must remain clearly readable against backgrounds + +**Sample color palettes** (use for inspiration - select one, adapt it, or create your own): + +1. **Corporate Navy**: Deep navy (#1C2833), slate gray (#2E4053), silver (#AAB7B8), off-white (#F4F6F6) +2. **Ocean Breeze**: Teal (#5EA8A7), deep teal (#277884), coral (#FE4447), white (#FFFFFF) +3. **Vibrant Sunset**: Red (#C0392B), bright red (#E74C3C), orange (#F39C12), yellow (#F1C40F), green (#2ECC71) +4. **Soft Blush**: Mauve (#A49393), blush (#EED6D3), rose (#E8B4B8), cream (#FAF7F2) +5. 
**Rich Wine**: Burgundy (#5D1D2E), crimson (#951233), rust (#C15937), gold (#997929) +6. **Royal Amethyst**: Purple (#B165FB), dark blue (#181B24), emerald (#40695B), white (#FFFFFF) +7. **Natural Cream**: Cream (#FFE1C7), forest green (#40695B), white (#FCFCFC) +8. **Berry Fusion**: Pink (#F8275B), coral (#FF574A), rose (#FF737D), purple (#3D2F68) +9. **Garden Fresh**: Lime (#C5DE82), plum (#7C3A5F), coral (#FD8C6E), blue-gray (#98ACB5) +10. **Luxe Noir**: Gold (#BF9A4A), black (#000000), cream (#F4F6F6) +11. **Mediterranean**: Sage (#87A96B), terracotta (#E07A5F), cream (#F4F1DE), charcoal (#2C2C2C) +12. **Modern Mono**: Charcoal (#292929), red (#E33737), light gray (#CCCBCB) +13. **Energy Burst**: Orange (#F96D00), light gray (#F2F2F2), charcoal (#222831) +14. **Tropical Forest**: Black (#191A19), green (#4E9F3D), dark green (#1E5128), white (#FFFFFF) +15. **Retro Spectrum**: Purple (#722880), pink (#D72D51), orange (#EB5C18), amber (#F08800), gold (#DEB600) +16. **Autumn Harvest**: Mustard (#E3B448), sage (#CBD18F), forest green (#3A6B35), cream (#F4F1DE) +17. **Seaside Rose**: Old rose (#AD7670), beaver (#B49886), eggshell (#F3ECDC), ash gray (#BFD5BE) +18. 
**Citrus Splash**: Light orange (#FC993E), grayish turquoise (#667C6F), white (#FCFCFC) + +#### Visual Design Elements + +**Geometric Patterns**: +- Diagonal section dividers instead of horizontal +- Asymmetric column widths (30/70, 40/60, 25/75) +- Rotated text headers at 90 or 270 degrees +- Circular/hexagonal frames for images +- Triangular accent shapes in corners +- Overlapping shapes for depth + +**Border and Frame Treatments**: +- Thick single-color borders (10-20pt) on one side only +- Double-line borders with contrasting colors +- Corner brackets instead of full frames +- L-shaped borders (top+left or bottom+right) +- Underline accents beneath headers (3-5pt thick) + +**Typography Treatments**: +- Extreme size contrast (72pt headlines vs 11pt body) +- All-caps headers with wide letter spacing +- Numbered sections in oversized display type +- Monospace (Courier New) for data/stats/technical content +- Condensed fonts (Arial Narrow) for dense information +- Outlined text for emphasis + +**Data Visualization Styling**: +- Monochrome charts with single accent color for key data +- Horizontal bar charts instead of vertical +- Dot plots instead of bar charts +- Minimal gridlines or none at all +- Data labels directly on elements (no legends) +- Oversized numbers for key metrics + +**Layout Innovations**: +- Full-bleed images with text overlays +- Sidebar column (20-30% width) for navigation/context +- Modular grid systems (3x3, 4x4 blocks) +- Z-pattern or F-pattern content flow +- Floating text boxes over colored shapes +- Magazine-style multi-column layouts + +**Background Treatments**: +- Solid color blocks occupying 40-60% of slide +- Gradient fills (vertical or diagonal only) +- Split backgrounds (two colors, diagonal or vertical) +- Edge-to-edge color bands +- Negative space as a design element +- **Background images**: Use subtle, low-contrast images as backgrounds with text overlays +- **Gradient overlays**: Combine background images with semi-transparent 
gradient overlays for readability + +#### Visual Assets and Image Planning + +**CRITICAL**: Proactively enhance presentations with relevant images to improve visual communication and audience engagement. Do NOT rely solely on text. + +**When to Add Images**: +- **Architecture/System slides**: Always include system architecture diagrams, component diagrams, or infrastructure illustrations +- **Process/Workflow slides**: Add flowcharts, process diagrams, or step-by-step illustrations +- **Data flow slides**: Include data pipeline diagrams, ETL flow illustrations +- **Feature/Product slides**: Add UI mockups, screenshots, or product illustrations +- **Concept explanation slides**: Use metaphorical illustrations or conceptual diagrams +- **Team/About slides**: Include relevant icons or illustrations representing team activities +- **Comparison slides**: Use side-by-side visual comparisons or before/after images + +**Image Categories to Generate**: +1. **Architecture Diagrams**: System components, microservices layout, cloud infrastructure +2. **Flowcharts**: Business processes, user journeys, decision trees +3. **Data Visualizations**: Custom infographics, data flow diagrams +4. **Icons and Illustrations**: Conceptual icons, feature illustrations, decorative elements +5. **Backgrounds**: Subtle pattern backgrounds, gradient images, themed backgrounds +6. **UI/UX Elements**: Interface mockups, wireframe illustrations + +**Image Generation Guidelines**: +- Use the `ImageGen` tool to create high-quality images tailored to slide content +- Generate images as PNG format for direct insertion into slides +- **NEVER use code-based diagrams** (like Mermaid) that require rendering - all images must be static PNG/SVG +- Match image style to presentation theme (colors, mood, professionalism level) +- Ensure generated images have sufficient resolution (at least 1920x1080 for full-slide backgrounds) + +**ImageGen Tool Usage**: +``` +When creating images for presentations: +1. 
Analyze the slide content and determine what visual would enhance it +2. Craft a detailed prompt describing the desired image: + - Style: professional, flat design, isometric, minimalist, etc. + - Colors: match the presentation's color palette + - Content: specific elements to include + - Mood: professional, friendly, technical, creative +3. Generate the image and place it in the appropriate slide location +``` + +**Sample ImageGen Prompts for Slides**: +- Architecture diagram: "Professional flat design system architecture diagram showing microservices with API gateway, database, cache layer, using blue and gray color scheme, clean white background, no text labels" +- Process flow: "Minimalist business process flowchart with 5 connected steps, isometric style, using teal and coral colors, professional look" +- Background: "Subtle geometric pattern background in navy blue and silver, low contrast, suitable for text overlay, professional presentation style" +- Icon set: "Set of 4 business icons for innovation, teamwork, growth, and technology, flat design style, matching purple and emerald theme" + +#### Image Layout Patterns + +**Image Placement Approaches**: +1. **Full-bleed background**: Image covers entire slide with text overlay + - Use semi-transparent overlay (rgba) for text readability + - Position text in areas with lower visual complexity + +2. **Two-column (Image + Text)**: Most versatile layout + - Image: 40-60% of slide width + - Text: remaining space with adequate margins + - Variations: image left/right, equal or unequal splits + +3. **Image accent**: Small image as visual anchor + - Corner placement (top-right, bottom-left common) + - Size: 15-25% of slide area + - Use for icons, logos, or supporting graphics + +4. **Image grid**: Multiple images in organized layout + - 2x2 or 3x2 grids for comparison or gallery views + - Equal spacing between images + - Consistent image dimensions within grid + +5. 
**Hero image with caption**: Large central image + - Image: 60-80% of slide height + - Caption below or overlay at bottom + - Ideal for showcasing products, screenshots, diagrams + +**Image Sizing Recommendations**: +- **Full-slide background**: Match slide dimensions (720pt x 405pt for 16:9) +- **Half-slide image**: 360pt x 405pt (portrait) or 720pt x 200pt (landscape banner) +- **Quarter-slide image**: 350pt x 200pt +- **Icon/thumbnail**: 50-100pt x 50-100pt +- Always maintain aspect ratio to avoid distortion +- Leave 20-30pt margins from slide edges + +**Text-Image Coordination**: +- Ensure sufficient contrast between text and image backgrounds +- Use text shadows or backdrop shapes when placing text over images +- Align text blocks to image edges for visual coherence +- Match text color to accent colors in the image + +### Layout Strategies +**When creating slides with charts or tables:** +- **Two-column layout (PREFERRED)**: Use a header spanning the full width, then two columns below - text/bullets in one column and the featured content in the other. This provides better balance and makes charts/tables more readable. Use flexbox with unequal column widths (e.g., 40%/60% split) to optimize space for each content type. +- **Full-slide layout**: Let the featured content (chart/table) take up the entire slide for maximum impact and readability +- **NEVER vertically stack**: Do not place charts/tables below text in a single column - this causes poor readability and layout issues + +### Process +1. **REQUIRED - READ COMPLETE FILE**: Read [`slide-generator.md`](slide-generator.md) entirely from start to finish. **NEVER set any range limits when reading this file.** Read the full file content for detailed syntax, critical formatting rules, and best practices before proceeding with presentation creation. +2. Create an HTML file for each slide with proper dimensions (e.g., 720pt x 405pt for 16:9) + - Use `
<p>`, `<h1>`-`<h6>`, `<ul>`, `<ol>`, `<li>
      ` for all text content + - Use `class="placeholder"` for areas where charts/tables will be added (render with gray background for visibility) + - **CRITICAL**: Rasterize gradients and icons as PNG images FIRST using Sharp, then reference in HTML + - **LAYOUT**: For slides with charts/tables/images, use either full-slide layout or two-column layout for better readability +3. Create and run a JavaScript file using the [`slideConverter.js`](scripts/slideConverter.js) library to convert HTML slides to PowerPoint and save the presentation + - Use the `convertSlide()` function to process each HTML file + - Add charts and tables to placeholder areas using PptxGenJS API + - Save the presentation using `pptx.writeFile()` +4. **Visual validation**: Generate thumbnails and inspect for layout issues + - Create thumbnail grid: `python scripts/slidePreview.py output.pptx workspace/thumbnails --cols 4` + - Read and carefully examine the thumbnail image for: + - **Text cutoff**: Text being cut off by header bars, shapes, or slide edges + - **Text overlap**: Text overlapping with other text or shapes + - **Positioning issues**: Content too close to slide boundaries or other elements + - **Contrast issues**: Insufficient contrast between text and backgrounds + - If issues found, adjust HTML margins/spacing/colors and regenerate the presentation + - Repeat until all slides are visually correct + +## Modifying an Existing Presentation + +When editing slides in an existing PowerPoint presentation, work with the raw Office Open XML (OOXML) format. This involves extracting the .pptx file, modifying the XML content, and repackaging it. + +### Process +1. **REQUIRED - READ COMPLETE FILE**: Read [`openxml.md`](openxml.md) (~500 lines) entirely from start to finish. **NEVER set any range limits when reading this file.** Read the full file content for detailed guidance on OOXML structure and editing workflows before any presentation editing. +2. 
Extract the presentation: `python openxml/scripts/extract.py <file.pptx> <output-dir>` +3. Modify the XML files (primarily `ppt/slides/slide{N}.xml` and related files) +4. **ESSENTIAL**: Validate immediately after each edit and fix any validation errors before proceeding: `python openxml/scripts/check.py <unpacked-dir> --original <original.pptx>` +5. Repackage the final presentation: `python openxml/scripts/bundle.py <unpacked-dir> <output.pptx>` + +## Building a New Presentation **Using a Template** + +When you need to create a presentation that follows an existing template's design, duplicate and re-arrange template slides before replacing placeholder content. + +### Process +1. **Extract template text AND create visual thumbnail grid**: + * Extract text: `python -m markitdown template.pptx > template-content.md` + * Read `template-content.md`: Read the entire file to understand the contents of the template presentation. **NEVER set any range limits when reading this file.** + * Create thumbnail grids: `python scripts/slidePreview.py template.pptx` + * See [Generating Thumbnail Grids](#generating-thumbnail-grids) section for more details + +2. **Analyze template and save inventory to a file**: + * **Visual Analysis**: Review thumbnail grid(s) to understand slide layouts, design patterns, and visual structure + * Create and save a template inventory file at `template-inventory.md` containing: + ```markdown + # Template Inventory Analysis + **Total Slides: [count]** + **IMPORTANT: Slides are 0-indexed (first slide = 0, last slide = count-1)** + + ## [Category Name] + - Slide 0: [Layout code if available] - Description/purpose + - Slide 1: [Layout code] - Description/purpose + - Slide 2: [Layout code] - Description/purpose + [... EVERY slide must be listed individually with its index ...]
+ ``` + * **Using the thumbnail grid**: Reference the visual thumbnails to identify: + - Layout patterns (title slides, content layouts, section dividers) + - Image placeholder locations and counts + - Design consistency across slide groups + - Visual hierarchy and structure + * This inventory file is REQUIRED for selecting appropriate templates in the next step + +3. **Create presentation outline based on template inventory**: + * Review available templates from step 2. + * Choose an intro or title template for the first slide. This should be one of the first templates. + * Choose safe, text-based layouts for the other slides. + * **ESSENTIAL: Match layout structure to actual content**: + - Single-column layouts: Use for unified narrative or single topic + - Two-column layouts: Use ONLY when you have exactly 2 distinct items/concepts + - Three-column layouts: Use ONLY when you have exactly 3 distinct items/concepts + - Image + text layouts: Use ONLY when you have actual images to insert + - Quote layouts: Use ONLY for actual quotes from people (with attribution), never for emphasis + - Never use layouts with more placeholders than you have content + - If you have 2 items, don't force them into a 3-column layout + - If you have 4+ items, consider breaking into multiple slides or using a list format + * Count your actual content pieces BEFORE selecting the layout + * Verify each placeholder in the chosen layout will be filled with meaningful content + * Select one option representing the **best** layout for each content section. + * Save `outline.md` with content AND template mapping that leverages available designs + * Example template mapping: + ``` + # Template slides to use (0-based indexing) + # WARNING: Verify indices are within range! 
Template with 73 slides has indices 0-72 + # Mapping: slide numbers from outline -> template slide indices + template_mapping = [ + 0, # Use slide 0 (Title/Cover) + 34, # Use slide 34 (B1: Title and body) + 34, # Use slide 34 again (duplicate for second B1) + 50, # Use slide 50 (E1: Quote) + 54, # Use slide 54 (F2: Closing + Text) + ] + ``` + +4. **Duplicate, reorder, and delete slides using `reorder.py`**: + * Use the `scripts/reorder.py` script to create a new presentation with slides in the desired order: + ```bash + python scripts/reorder.py template.pptx working.pptx 0,34,34,50,54 + ``` + * The script handles duplicating repeated slides, deleting unused slides, and reordering automatically + * Slide indices are 0-based (first slide is 0, second is 1, etc.) + * The same slide index can appear multiple times to duplicate that slide + +5. **Extract ALL text using the `textExtractor.py` script**: + * **Run inventory extraction**: + ```bash + python scripts/textExtractor.py working.pptx text-inventory.json + ``` + * **Read text-inventory.json**: Read the entire text-inventory.json file to understand all shapes and their properties. 
**NEVER set any range limits when reading this file.** + + * The inventory JSON structure: + ```json + { + "slide-0": { + "shape-0": { + "placeholder_type": "TITLE", // or null for non-placeholders + "left": 1.5, // position in inches + "top": 2.0, + "width": 7.5, + "height": 1.2, + "paragraphs": [ + { + "text": "Paragraph text", + // Optional properties (only included when non-default): + "bullet": true, // explicit bullet detected + "level": 0, // only included when bullet is true + "alignment": "CENTER", // CENTER, RIGHT (not LEFT) + "space_before": 10.0, // space before paragraph in points + "space_after": 6.0, // space after paragraph in points + "line_spacing": 22.4, // line spacing in points + "font_name": "Arial", // from first run + "font_size": 14.0, // in points + "bold": true, + "italic": false, + "underline": false, + "color": "FF0000" // RGB color + } + ] + } + } + } + ``` + + * Key features: + - **Slides**: Named as "slide-0", "slide-1", etc. + - **Shapes**: Ordered by visual position (top-to-bottom, left-to-right) as "shape-0", "shape-1", etc. + - **Placeholder types**: TITLE, CENTER_TITLE, SUBTITLE, BODY, OBJECT, or null + - **Default font size**: `default_font_size` in points extracted from layout placeholders (when available) + - **Slide numbers are filtered**: Shapes with SLIDE_NUMBER placeholder type are automatically excluded from inventory + - **Bullets**: When `bullet: true`, `level` is always included (even if 0) + - **Spacing**: `space_before`, `space_after`, and `line_spacing` in points (only included when set) + - **Colors**: `color` for RGB (e.g., "FF0000"), `theme_color` for theme colors (e.g., "DARK_1") + - **Properties**: Only non-default values are included in the output + +6. 
**Generate replacement text and save the data to a JSON file** + Based on the text inventory from the previous step: + - **ESSENTIAL**: First verify which shapes exist in the inventory - only reference shapes that are actually present + - **VALIDATION**: The textReplacer.py script will validate that all shapes in your replacement JSON exist in the inventory + - If you reference a non-existent shape, you'll get an error showing available shapes + - If you reference a non-existent slide, you'll get an error indicating the slide doesn't exist + - All validation errors are shown at once before the script exits + - **NOTE**: The textReplacer.py script uses textExtractor.py internally to identify ALL text shapes + - **AUTOMATIC CLEARING**: ALL text shapes from the inventory will be cleared unless you provide "paragraphs" for them + - Add a "paragraphs" field to shapes that need content (not "replacement_paragraphs") + - Shapes without "paragraphs" in the replacement JSON will have their text cleared automatically + - Paragraphs with bullets will be automatically left aligned. 
Don't set the `alignment` property when `"bullet": true` + - Generate appropriate replacement content for placeholder text + - Use shape size to determine appropriate content length + - **ESSENTIAL**: Include paragraph properties from the original inventory - don't just provide text + - **NOTE**: When bullet: true, do NOT include bullet symbols in text - they're added automatically + - **FORMATTING GUIDELINES**: + - Headers/titles should typically have `"bold": true` + - List items should have `"bullet": true, "level": 0` (level is required when bullet is true) + - Preserve any alignment properties (e.g., `"alignment": "CENTER"` for centered text) + - Include font properties when different from default (e.g., `"font_size": 14.0`, `"font_name": "Lora"`) + - Colors: Use `"color": "FF0000"` for RGB or `"theme_color": "DARK_1"` for theme colors + - The replacement script expects **properly formatted paragraphs**, not just text strings + - **Overlapping shapes**: Prefer shapes with larger default_font_size or more appropriate placeholder_type + - Save the updated inventory with replacements to `replacement-text.json` + - **CAUTION**: Different template layouts have different shape counts - always check the actual inventory before creating replacements + + Example paragraphs field showing proper formatting: + ```json + "paragraphs": [ + { + "text": "New presentation title text", + "alignment": "CENTER", + "bold": true + }, + { + "text": "Section Header", + "bold": true + }, + { + "text": "First bullet point without bullet symbol", + "bullet": true, + "level": 0 + }, + { + "text": "Red colored text", + "color": "FF0000" + }, + { + "text": "Theme colored text", + "theme_color": "DARK_1" + }, + { + "text": "Regular paragraph text without special formatting" + } + ] + ``` + + **Shapes not listed in the replacement JSON are automatically cleared**: + ```json + { + "slide-0": { + "shape-0": { + "paragraphs": [...] 
// This shape gets new text + } + // shape-1 and shape-2 from inventory will be cleared automatically + } + } + ``` + + **Common formatting patterns for presentations**: + - Title slides: Bold text, sometimes centered + - Section headers within slides: Bold text + - Bullet lists: Each item needs `"bullet": true, "level": 0` + - Body text: Usually no special properties needed + - Quotes: May have special alignment or font properties + +7. **Apply replacements using the `textReplacer.py` script** + ```bash + python scripts/textReplacer.py working.pptx replacement-text.json output.pptx + ``` + + The script will: + - First extract the inventory of ALL text shapes using functions from textExtractor.py + - Validate that all shapes in the replacement JSON exist in the inventory + - Clear text from ALL shapes identified in the inventory + - Apply new text only to shapes with "paragraphs" defined in the replacement JSON + - Preserve formatting by applying paragraph properties from the JSON + - Handle bullets, alignment, font properties, and colors automatically + - Save the updated presentation + + Example validation errors: + ``` + ERROR: Invalid shapes in replacement JSON: + - Shape 'shape-99' not found on 'slide-0'. Available shapes: shape-0, shape-1, shape-4 + - Slide 'slide-999' not found in inventory + ``` + + ``` + ERROR: Replacement text made overflow worse in these shapes: + - slide-0/shape-2: overflow worsened by 1.25" (was 0.00", now 1.25") + ``` + +## Generating Thumbnail Grids + +To create visual thumbnail grids of PowerPoint slides for quick analysis and reference: + +```bash +python scripts/slidePreview.py template.pptx [output_prefix] +``` + +**Capabilities**: +- Creates: `thumbnails.jpg` (or `thumbnails-1.jpg`, `thumbnails-2.jpg`, etc. 
for large decks) +- Default: 5 columns, max 30 slides per grid (5x6) +- Custom prefix: `python scripts/slidePreview.py template.pptx my-grid` + - Note: The output prefix should include the path if you want output in a specific directory (e.g., `workspace/my-grid`) +- Adjust columns: `--cols 4` (range: 3-6, affects slides per grid) +- Grid limits: 3 cols = 12 slides/grid, 4 cols = 20, 5 cols = 30, 6 cols = 42 +- Slides are zero-indexed (Slide 0, Slide 1, etc.) + +**Use cases**: +- Template analysis: Quickly understand slide layouts and design patterns +- Content review: Visual overview of entire presentation +- Navigation reference: Find specific slides by their visual appearance +- Quality check: Verify all slides are properly formatted + +**Examples**: +```bash +# Basic usage +python scripts/slidePreview.py presentation.pptx + +# Combine options: custom name, columns +python scripts/slidePreview.py template.pptx analysis --cols 4 +``` + +## Converting Slides to Images + +To visually analyze PowerPoint slides, convert them to images using a two-step process: + +1. **Convert PPTX to PDF**: + ```bash + soffice --headless --convert-to pdf template.pptx + ``` + +2. **Convert PDF pages to JPEG images**: + ```bash + pdftoppm -jpeg -r 150 template.pdf slide + ``` + This creates files like `slide-1.jpg`, `slide-2.jpg`, etc. 
+ +Options: +- `-r 150`: Sets resolution to 150 DPI (adjust for quality/size balance) +- `-jpeg`: Output JPEG format (use `-png` for PNG if preferred) +- `-f N`: First page to convert (e.g., `-f 2` starts from page 2) +- `-l N`: Last page to convert (e.g., `-l 5` stops at page 5) +- `slide`: Prefix for output files + +Example for specific range: +```bash +pdftoppm -jpeg -r 150 -f 2 -l 5 template.pdf slide # Converts only pages 2-5 +``` + +## Code Style Guidelines +**CRITICAL**: When generating code for PPTX operations: +- Write concise code +- Avoid verbose variable names and redundant operations +- Avoid unnecessary print statements + +## Dependencies + +Required dependencies (should already be installed): + +- **markitdown**: `pip install "markitdown[pptx]"` (for text extraction from presentations) +- **pptxgenjs**: `npm install -g pptxgenjs` (for creating presentations via slideConverter) +- **playwright**: `npm install -g playwright` (for HTML rendering in slideConverter) +- **react-icons**: `npm install -g react-icons react react-dom` (for icons) +- **sharp**: `npm install -g sharp` (for SVG rasterization and image processing) +- **LibreOffice**: `sudo apt-get install libreoffice` (for PDF conversion) +- **Poppler**: `sudo apt-get install poppler-utils` (for pdftoppm to convert PDF to images) +- **defusedxml**: `pip install defusedxml` (for secure XML parsing) diff --git a/deploy/data/skills/pptx/openxml.md b/deploy/data/skills/pptx/openxml.md new file mode 100644 index 000000000..f071876b0 --- /dev/null +++ b/deploy/data/skills/pptx/openxml.md @@ -0,0 +1,427 @@ +# Office Open XML Technical Reference for PowerPoint + +**CRITICAL: Read this entire document before starting.** Important XML schema rules and formatting requirements are covered throughout. Incorrect implementation can create invalid PPTX files that PowerPoint cannot open. 
+ +## Technical Requirements + +### Schema Compliance +- **Element ordering in ``**: ``, ``, `` +- **Whitespace**: Add `xml:space='preserve'` to `` elements with leading/trailing spaces +- **Unicode**: Escape characters in ASCII content: `"` becomes `“` +- **Images**: Add to `ppt/media/`, reference in slide XML, set dimensions to fit slide bounds +- **Relationships**: Update `ppt/slides/_rels/slideN.xml.rels` for each slide's resources +- **Dirty attribute**: Add `dirty="0"` to `` and `` elements to indicate clean state + +## Presentation Architecture + +### Basic Slide Structure +```xml + + + + + ... + ... + + + + +``` + +### Text Box / Shape with Text +```xml + + + + + + + + + + + + + + + + + + + + + + Slide Title + + + + +``` + +### Text Formatting +```xml + + + + Bold Text + + + + + + Italic Text + + + + + + Underlined + + + + + + + + + + Highlighted Text + + + + + + + + + + Colored Arial 24pt + + + + + + + + + + Formatted text + +``` + +### Lists +```xml + + + + + + + First bullet point + + + + + + + + + + First numbered item + + + + + + + + + + Indented bullet + + +``` + +### Shapes +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +### Images +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +### Tables +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + Cell 1 + + + + + + + + + + + Cell 2 + + + + + + + + + +``` + +### Slide Layouts + +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +## File Updates + +When adding content, update these files: + +**`ppt/_rels/presentation.xml.rels`:** +```xml + + +``` + +**`ppt/slides/_rels/slide1.xml.rels`:** +```xml + + +``` + +**`[Content_Types].xml`:** +```xml + + + +``` + +**`ppt/presentation.xml`:** +```xml + + + + +``` + +**`docProps/app.xml`:** Update slide count and statistics +```xml +2 +10 +50 +``` + +## Slide Operations + +### Adding a New Slide +When adding a slide to the end of the presentation: 
+
+1. **Create the slide file** (`ppt/slides/slideN.xml`)
+2. **Update `[Content_Types].xml`**: Add Override for the new slide
+3. **Update `ppt/_rels/presentation.xml.rels`**: Add relationship for the new slide
+4. **Update `ppt/presentation.xml`**: Add slide ID to `<p:sldIdLst>`
+5. **Create slide relationships** (`ppt/slides/_rels/slideN.xml.rels`) if needed
+6. **Update `docProps/app.xml`**: Increment slide count and update statistics (if present)
+
+### Duplicating a Slide
+1. Copy the source slide XML file with a new name
+2. Update all IDs in the new slide to be unique
+3. Follow the "Adding a New Slide" steps above
+4. **ESSENTIAL**: Remove or update any notes slide references in `_rels` files
+5. Remove references to unused media files
+
+### Reordering Slides
+1. **Update `ppt/presentation.xml`**: Reorder `<p:sldId>` elements in `<p:sldIdLst>`
+2. The order of `<p:sldId>` elements determines slide order
+3. Keep slide IDs and relationship IDs unchanged
+
+Example:
+```xml
+<p:sldIdLst>
+  <p:sldId id="256" r:id="rId2"/>
+  <p:sldId id="258" r:id="rId4"/>
+  <p:sldId id="257" r:id="rId3"/>
+</p:sldIdLst>
+```
+
+### Deleting a Slide
+1. **Remove from `ppt/presentation.xml`**: Delete the `<p:sldId>` entry
+2. **Remove from `ppt/_rels/presentation.xml.rels`**: Delete the relationship
+3. **Remove from `[Content_Types].xml`**: Delete the Override entry
+4. **Delete files**: Remove `ppt/slides/slideN.xml` and `ppt/slides/_rels/slideN.xml.rels`
+5. **Update `docProps/app.xml`**: Decrement slide count and update statistics
+6. **Clean up unused media**: Remove orphaned images from `ppt/media/`
+
+Note: Don't renumber remaining slides - keep their original IDs and filenames.
+ + +## Common Mistakes to Avoid + +- **Encodings**: Escape unicode characters in ASCII content: `"` becomes `“` +- **Images**: Add to `ppt/media/` and update relationship files +- **Lists**: Omit bullets from list headers +- **IDs**: Use valid hexadecimal values for UUIDs +- **Themes**: Check all themes in `theme` directory for colors + +## Validation Checklist for Template-Based Presentations + +### Before Repackaging, Always: +- **Clean unused resources**: Remove unreferenced media, fonts, and notes directories +- **Fix Content_Types.xml**: Declare ALL slides, layouts, and themes present in the package +- **Fix relationship IDs**: + - Remove font embed references if not using embedded fonts +- **Remove broken references**: Check all `_rels` files for references to deleted resources + +### Common Template Duplication Pitfalls: +- Multiple slides referencing the same notes slide after duplication +- Image/media references from template slides that no longer exist +- Font embedding references when fonts aren't included +- Missing slideLayout declarations for layouts 12-25 +- docProps directory may not unpack - this is optional diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd new file mode 100644 index 000000000..6454ef9a9 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd @@ -0,0 +1,1499 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd new file mode 100644 index 000000000..afa4f463e --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd @@ -0,0 +1,146 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd new file mode 100644 index 000000000..64e66b8ab --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd @@ -0,0 +1,1085 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd new file mode 100644 index 000000000..687eea829 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd @@ -0,0 +1,11 @@ + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd 
b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd new file mode 100644 index 000000000..6ac81b06b --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd @@ -0,0 +1,3081 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd new file mode 100644 index 000000000..1dbf05140 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd new file mode 100644 index 000000000..f1af17db4 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd new file mode 100644 index 000000000..0a185ab6e --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd @@ -0,0 +1,287 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/pml.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/pml.xsd new file mode 100644 index 000000000..14ef48886 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/pml.xsd @@ -0,0 +1,1676 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd new file mode 100644 index 000000000..c20f3bf14 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd new file mode 100644 index 000000000..ac6025226 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd @@ -0,0 +1,144 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd new file mode 100644 index 000000000..424b8ba8d --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd @@ -0,0 +1,174 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd new file mode 100644 
index 000000000..2bddce292 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd new file mode 100644 index 000000000..8a8c18ba2 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd new file mode 100644 index 000000000..5c42706a0 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd new file mode 100644 index 000000000..853c341c8 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd @@ -0,0 +1,56 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd new file mode 100644 index 000000000..da835ee82 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd @@ 
-0,0 +1,195 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd new file mode 100644 index 000000000..87ad2658f --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd @@ -0,0 +1,582 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd 
b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd new file mode 100644 index 000000000..9e86f1b2b --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/sml.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/sml.xsd new file mode 100644 index 000000000..d0be42e75 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/sml.xsd @@ -0,0 +1,4439 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd new file mode 100644 index 000000000..8821dd183 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd @@ -0,0 +1,570 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd new file mode 100644 index 000000000..ca2575c75 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd @@ -0,0 +1,509 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd new file mode 100644 index 000000000..dd079e603 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd @@ -0,0 +1,12 @@ + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd new file mode 100644 index 000000000..3dd6cf625 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd new file mode 100644 index 000000000..f1041e34e --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd @@ -0,0 +1,96 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/wml.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/wml.xsd new file mode 100644 index 000000000..9c5b7a633 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/wml.xsd @@ -0,0 +1,3646 @@ + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/xml.xsd b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/xml.xsd new file mode 100644 index 000000000..0f13678d8 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ISO-IEC29500-4_2016/xml.xsd @@ -0,0 +1,116 @@ + + + + + + See http://www.w3.org/XML/1998/namespace.html and + http://www.w3.org/TR/REC-xml for information about this namespace. + + This schema document describes the XML namespace, in a form + suitable for import by other schema documents. + + Note that local names in this namespace are intended to be defined + only by the World Wide Web Consortium or its subgroups. 
The + following names are currently defined in this namespace and should + not be used with conflicting semantics by any Working Group, + specification, or document instance: + + base (as an attribute name): denotes an attribute whose value + provides a URI to be used as the base for interpreting any + relative URIs in the scope of the element on which it + appears; its value is inherited. This name is reserved + by virtue of its definition in the XML Base specification. + + lang (as an attribute name): denotes an attribute whose value + is a language code for the natural language of the content of + any element; its value is inherited. This name is reserved + by virtue of its definition in the XML specification. + + space (as an attribute name): denotes an attribute whose + value is a keyword indicating what whitespace processing + discipline is intended for the content of the element; its + value is inherited. This name is reserved by virtue of its + definition in the XML specification. + + Father (in any context at all): denotes Jon Bosak, the chair of + the original XML Working Group. This name is reserved by + the following decision of the W3C XML Plenary and + XML Coordination groups: + + In appreciation for his vision, leadership and dedication + the W3C XML Plenary on this 10th day of February, 2000 + reserves for Jon Bosak in perpetuity the XML name + xml:Father + + + + + This schema defines attributes and an attribute group + suitable for use by + schemas wishing to allow xml:base, xml:lang or xml:space attributes + on elements they define. + + To enable this, such a schema must import this schema + for the XML namespace, e.g. as follows: + <schema . . .> + . . . + <import namespace="http://www.w3.org/XML/1998/namespace" + schemaLocation="http://www.w3.org/2001/03/xml.xsd"/> + + Subsequently, qualified reference to any of the attributes + or the group defined below will have the desired effect, e.g. + + <type . . .> + . . . 
+ <attributeGroup ref="xml:specialAttrs"/> + + will define a type which will schema-validate an instance + element with any of those attributes + + + + In keeping with the XML Schema WG's standard versioning + policy, this schema document will persist at + http://www.w3.org/2001/03/xml.xsd. + At the date of issue it can also be found at + http://www.w3.org/2001/xml.xsd. + The schema document at that URI may however change in the future, + in order to remain compatible with the latest version of XML Schema + itself. In other words, if the XML Schema namespace changes, the version + of this document at + http://www.w3.org/2001/xml.xsd will change + accordingly; the version at + http://www.w3.org/2001/03/xml.xsd will not change. + + + + + + In due course, we should install the relevant ISO 2- and 3-letter + codes as the enumerated possible values . . . + + + + + + + + + + + + + + + See http://www.w3.org/TR/xmlbase/ for + information about this attribute. + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd b/deploy/data/skills/pptx/openxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd new file mode 100644 index 000000000..a6de9d273 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd @@ -0,0 +1,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd b/deploy/data/skills/pptx/openxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd new file mode 100644 index 000000000..10e978b66 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ecma/fouth-edition/opc-digSig.xsd b/deploy/data/skills/pptx/openxml/schemas/ecma/fouth-edition/opc-digSig.xsd 
new file mode 100644 index 000000000..4248bf7a3 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ecma/fouth-edition/opc-digSig.xsd @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/ecma/fouth-edition/opc-relationships.xsd b/deploy/data/skills/pptx/openxml/schemas/ecma/fouth-edition/opc-relationships.xsd new file mode 100644 index 000000000..564974671 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/ecma/fouth-edition/opc-relationships.xsd @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/mce/mc.xsd b/deploy/data/skills/pptx/openxml/schemas/mce/mc.xsd new file mode 100644 index 000000000..ef725457c --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/mce/mc.xsd @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-2010.xsd b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-2010.xsd new file mode 100644 index 000000000..f65f77773 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-2010.xsd @@ -0,0 +1,560 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-2012.xsd b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-2012.xsd new file mode 100644 index 000000000..6b00755a9 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-2012.xsd @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-2018.xsd b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-2018.xsd new file mode 100644 index 000000000..f321d333a --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-2018.xsd @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-cex-2018.xsd b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-cex-2018.xsd new file mode 100644 index 000000000..364c6a9b8 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-cex-2018.xsd @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-cid-2016.xsd b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-cid-2016.xsd new file mode 100644 index 000000000..fed9d15b7 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-cid-2016.xsd @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-sdtdatahash-2020.xsd b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-sdtdatahash-2020.xsd new file mode 100644 
index 000000000..680cf1540 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-sdtdatahash-2020.xsd @@ -0,0 +1,4 @@ + + + + diff --git a/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-symex-2015.xsd b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-symex-2015.xsd new file mode 100644 index 000000000..89ada9083 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/schemas/microsoft/wml-symex-2015.xsd @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/deploy/data/skills/pptx/openxml/scripts/bundle.py b/deploy/data/skills/pptx/openxml/scripts/bundle.py new file mode 100755 index 000000000..c0a04d51e --- /dev/null +++ b/deploy/data/skills/pptx/openxml/scripts/bundle.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +""" +Tool to bundle a directory into a .docx, .pptx, or .xlsx file with XML formatting undone. + +Example usage: + python bundle.py [--force] +""" + +import argparse +import shutil +import subprocess +import sys +import tempfile +import defusedxml.minidom +import zipfile +from pathlib import Path + + +def main(): + parser = argparse.ArgumentParser(description="Bundle a directory into an Office file") + parser.add_argument("input_directory", help="Unpacked Office document directory") + parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)") + parser.add_argument("--force", action="store_true", help="Skip validation") + args = parser.parse_args() + + try: + success = bundle_document( + args.input_directory, args.output_file, validate=not args.force + ) + + # Show warning if validation was skipped + if args.force: + print("Warning: Skipped validation, file may be corrupt", file=sys.stderr) + # Exit with error if validation failed + elif not success: + print("Contents would produce a corrupt file.", file=sys.stderr) + print("Please validate XML before repacking.", file=sys.stderr) + print("Use --force to skip validation and pack anyway.", file=sys.stderr) + sys.exit(1) + + except ValueError as e: + 
sys.exit(f"Error: {e}") + + +def bundle_document(input_dir, output_file, validate=False): + """Bundle a directory into an Office file (.docx/.pptx/.xlsx). + + Args: + input_dir: Path to unpacked Office document directory + output_file: Path to output Office file + validate: If True, validates with soffice (default: False) + + Returns: + bool: True if successful, False if validation failed + """ + input_dir = Path(input_dir) + output_file = Path(output_file) + + if not input_dir.is_dir(): + raise ValueError(f"{input_dir} is not a directory") + if output_file.suffix.lower() not in {".docx", ".pptx", ".xlsx"}: + raise ValueError(f"{output_file} must be a .docx, .pptx, or .xlsx file") + + # Work in temporary directory to avoid modifying original + with tempfile.TemporaryDirectory() as temp_dir: + temp_content_dir = Path(temp_dir) / "content" + shutil.copytree(input_dir, temp_content_dir) + + # Process XML files to remove pretty-printing whitespace + for pattern in ["*.xml", "*.rels"]: + for xml_file in temp_content_dir.rglob(pattern): + condense_xml(xml_file) + + # Create final Office file as zip archive + output_file.parent.mkdir(parents=True, exist_ok=True) + with zipfile.ZipFile(output_file, "w", zipfile.ZIP_DEFLATED) as zf: + for f in temp_content_dir.rglob("*"): + if f.is_file(): + zf.write(f, f.relative_to(temp_content_dir)) + + # Validate if requested + if validate: + if not validate_document(output_file): + output_file.unlink() # Delete the corrupt file + return False + + return True + + +def validate_document(doc_path): + """Validate document by converting to HTML with soffice.""" + # Determine the correct filter based on file extension + match doc_path.suffix.lower(): + case ".docx": + filter_name = "html:HTML" + case ".pptx": + filter_name = "html:impress_html_Export" + case ".xlsx": + filter_name = "html:HTML (StarCalc)" + + with tempfile.TemporaryDirectory() as temp_dir: + try: + result = subprocess.run( + [ + "soffice", + "--headless", + "--convert-to", + 
filter_name, + "--outdir", + temp_dir, + str(doc_path), + ], + capture_output=True, + timeout=10, + text=True, + ) + if not (Path(temp_dir) / f"{doc_path.stem}.html").exists(): + error_msg = result.stderr.strip() or "Document validation failed" + print(f"Validation error: {error_msg}", file=sys.stderr) + return False + return True + except FileNotFoundError: + print("Warning: soffice not found. Skipping validation.", file=sys.stderr) + return True + except subprocess.TimeoutExpired: + print("Validation error: Timeout during conversion", file=sys.stderr) + return False + except Exception as e: + print(f"Validation error: {e}", file=sys.stderr) + return False + + +def condense_xml(xml_file): + """Strip unnecessary whitespace and remove comments.""" + with open(xml_file, "r", encoding="utf-8") as f: + dom = defusedxml.minidom.parse(f) + + # Process each element to remove whitespace and comments + for element in dom.getElementsByTagName("*"): + # Skip w:t elements and their processing + if element.tagName.endswith(":t"): + continue + + # Remove whitespace-only text nodes and comment nodes + for child in list(element.childNodes): + if ( + child.nodeType == child.TEXT_NODE + and child.nodeValue + and child.nodeValue.strip() == "" + ) or child.nodeType == child.COMMENT_NODE: + element.removeChild(child) + + # Write back the condensed XML + with open(xml_file, "wb") as f: + f.write(dom.toxml(encoding="UTF-8")) + + +if __name__ == "__main__": + main() diff --git a/deploy/data/skills/pptx/openxml/scripts/check.py b/deploy/data/skills/pptx/openxml/scripts/check.py new file mode 100755 index 000000000..eabe0f057 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/scripts/check.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +""" +Command line tool to check Office document XML files against XSD schemas and tracked changes. 
+ +Usage: + python check.py --original +""" + +import argparse +import sys +from pathlib import Path + +from validation import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator + + +def main(): + parser = argparse.ArgumentParser(description="Check Office document XML files") + parser.add_argument( + "unpacked_dir", + help="Path to unpacked Office document directory", + ) + parser.add_argument( + "--original", + required=True, + help="Path to original file (.docx/.pptx/.xlsx)", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="Enable verbose output", + ) + args = parser.parse_args() + + # Validate paths + unpacked_dir = Path(args.unpacked_dir) + original_file = Path(args.original) + file_extension = original_file.suffix.lower() + assert unpacked_dir.is_dir(), f"Error: {unpacked_dir} is not a directory" + assert original_file.is_file(), f"Error: {original_file} is not a file" + assert file_extension in [".docx", ".pptx", ".xlsx"], ( + f"Error: {original_file} must be a .docx, .pptx, or .xlsx file" + ) + + # Run validations + match file_extension: + case ".docx": + validators = [DOCXSchemaValidator, RedliningValidator] + case ".pptx": + validators = [PPTXSchemaValidator] + case _: + print(f"Error: Validation not supported for file type {file_extension}") + sys.exit(1) + + # Run validators + success = True + for V in validators: + validator = V(unpacked_dir, original_file, verbose=args.verbose) + if not validator.validate(): + success = False + + if success: + print("All validations PASSED!") + + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/deploy/data/skills/pptx/openxml/scripts/extract.py b/deploy/data/skills/pptx/openxml/scripts/extract.py new file mode 100755 index 000000000..7c172f9bd --- /dev/null +++ b/deploy/data/skills/pptx/openxml/scripts/extract.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +"""Extract and format XML contents of Office files (.docx, .pptx, .xlsx)""" + +import 
random +import sys +import defusedxml.minidom +import zipfile +from pathlib import Path + +# Get command line arguments +assert len(sys.argv) == 3, "Usage: python extract.py " +input_file, output_dir = sys.argv[1], sys.argv[2] + +# Extract and format +output_path = Path(output_dir) +output_path.mkdir(parents=True, exist_ok=True) +zipfile.ZipFile(input_file).extractall(output_path) + +# Pretty print all XML files +xml_files = list(output_path.rglob("*.xml")) + list(output_path.rglob("*.rels")) +for xml_file in xml_files: + content = xml_file.read_text(encoding="utf-8") + dom = defusedxml.minidom.parseString(content) + xml_file.write_bytes(dom.toprettyxml(indent=" ", encoding="ascii")) + +# For .docx files, suggest an RSID for tracked changes +if input_file.endswith(".docx"): + suggested_rsid = "".join(random.choices("0123456789ABCDEF", k=8)) + print(f"Suggested RSID for edit session: {suggested_rsid}") diff --git a/deploy/data/skills/pptx/openxml/scripts/validation/__init__.py b/deploy/data/skills/pptx/openxml/scripts/validation/__init__.py new file mode 100644 index 000000000..db092ece7 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/scripts/validation/__init__.py @@ -0,0 +1,15 @@ +""" +Validation modules for Word document processing. +""" + +from .base import BaseSchemaValidator +from .docx import DOCXSchemaValidator +from .pptx import PPTXSchemaValidator +from .redlining import RedliningValidator + +__all__ = [ + "BaseSchemaValidator", + "DOCXSchemaValidator", + "PPTXSchemaValidator", + "RedliningValidator", +] diff --git a/deploy/data/skills/pptx/openxml/scripts/validation/base.py b/deploy/data/skills/pptx/openxml/scripts/validation/base.py new file mode 100644 index 000000000..0681b199c --- /dev/null +++ b/deploy/data/skills/pptx/openxml/scripts/validation/base.py @@ -0,0 +1,951 @@ +""" +Base validator with common validation logic for document files. 
+""" + +import re +from pathlib import Path + +import lxml.etree + + +class BaseSchemaValidator: + """Base validator with common validation logic for document files.""" + + # Elements whose 'id' attributes must be unique within their file + # Format: element_name -> (attribute_name, scope) + # scope can be 'file' (unique within file) or 'global' (unique across all files) + UNIQUE_ID_REQUIREMENTS = { + # Word elements + "comment": ("id", "file"), # Comment IDs in comments.xml + "commentrangestart": ("id", "file"), # Must match comment IDs + "commentrangeend": ("id", "file"), # Must match comment IDs + "bookmarkstart": ("id", "file"), # Bookmark start IDs + "bookmarkend": ("id", "file"), # Bookmark end IDs + # Note: ins and del (track changes) can share IDs when part of same revision + # PowerPoint elements + "sldid": ("id", "file"), # Slide IDs in presentation.xml + "sldmasterid": ("id", "global"), # Slide master IDs must be globally unique + "sldlayoutid": ("id", "global"), # Slide layout IDs must be globally unique + "cm": ("authorid", "file"), # Comment author IDs + # Excel elements + "sheet": ("sheetid", "file"), # Sheet IDs in workbook.xml + "definedname": ("id", "file"), # Named range IDs + # Drawing/Shape elements (all formats) + "cxnsp": ("id", "file"), # Connection shape IDs + "sp": ("id", "file"), # Shape IDs + "pic": ("id", "file"), # Picture IDs + "grpsp": ("id", "file"), # Group shape IDs + } + + # Mapping of element names to expected relationship types + # Subclasses should override this with format-specific mappings + ELEMENT_RELATIONSHIP_TYPES = {} + + # Unified schema mappings for all Office document types + SCHEMA_MAPPINGS = { + # Document type specific schemas + "word": "ISO-IEC29500-4_2016/wml.xsd", # Word documents + "ppt": "ISO-IEC29500-4_2016/pml.xsd", # PowerPoint presentations + "xl": "ISO-IEC29500-4_2016/sml.xsd", # Excel spreadsheets + # Common file types + "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd", + "app.xml": 
"ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd", + "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd", + "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd", + ".rels": "ecma/fouth-edition/opc-relationships.xsd", + # Word-specific files + "people.xml": "microsoft/wml-2012.xsd", + "commentsIds.xml": "microsoft/wml-cid-2016.xsd", + "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd", + "commentsExtended.xml": "microsoft/wml-2012.xsd", + # Chart files (common across document types) + "chart": "ISO-IEC29500-4_2016/dml-chart.xsd", + # Theme files (common across document types) + "theme": "ISO-IEC29500-4_2016/dml-main.xsd", + # Drawing and media files + "drawing": "ISO-IEC29500-4_2016/dml-main.xsd", + } + + # Unified namespace constants + MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006" + XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + + # Common OOXML namespaces used across validators + PACKAGE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/relationships" + ) + OFFICE_RELATIONSHIPS_NAMESPACE = ( + "http://schemas.openxmlformats.org/officeDocument/2006/relationships" + ) + CONTENT_TYPES_NAMESPACE = ( + "http://schemas.openxmlformats.org/package/2006/content-types" + ) + + # Folders where we should clean ignorable namespaces + MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"} + + # All allowed OOXML namespaces (superset of all document types) + OOXML_NAMESPACES = { + "http://schemas.openxmlformats.org/officeDocument/2006/math", + "http://schemas.openxmlformats.org/officeDocument/2006/relationships", + "http://schemas.openxmlformats.org/schemaLibrary/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/main", + "http://schemas.openxmlformats.org/drawingml/2006/chart", + "http://schemas.openxmlformats.org/drawingml/2006/chartDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/diagram", + "http://schemas.openxmlformats.org/drawingml/2006/picture", + 
"http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing", + "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing", + "http://schemas.openxmlformats.org/wordprocessingml/2006/main", + "http://schemas.openxmlformats.org/presentationml/2006/main", + "http://schemas.openxmlformats.org/spreadsheetml/2006/main", + "http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes", + "http://www.w3.org/XML/1998/namespace", + } + + def __init__(self, unpacked_dir, original_file, verbose=False): + self.unpacked_dir = Path(unpacked_dir).resolve() + self.original_file = Path(original_file) + self.verbose = verbose + + # Set schemas directory + self.schemas_dir = Path(__file__).parent.parent.parent / "schemas" + + # Get all XML and .rels files + patterns = ["*.xml", "*.rels"] + self.xml_files = [ + f for pattern in patterns for f in self.unpacked_dir.rglob(pattern) + ] + + if not self.xml_files: + print(f"Warning: No XML files found in {self.unpacked_dir}") + + def validate(self): + """Run all validation checks and return True if all pass.""" + raise NotImplementedError("Subclasses must implement the validate method") + + def validate_xml(self): + """Validate that all XML files are well-formed.""" + errors = [] + + for xml_file in self.xml_files: + try: + # Try to parse the XML file + lxml.etree.parse(str(xml_file)) + except lxml.etree.XMLSyntaxError as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {e.lineno}: {e.msg}" + ) + except Exception as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Unexpected error: {str(e)}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} XML violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All XML files are well-formed") + return True + + def validate_namespaces(self): + """Validate that namespace prefixes in Ignorable attributes are declared.""" + errors = [] + + for xml_file in 
self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + declared = set(root.nsmap.keys()) - {None} # Exclude default namespace + + for attr_val in [ + v for k, v in root.attrib.items() if k.endswith("Ignorable") + ]: + undeclared = set(attr_val.split()) - declared + errors.extend( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Namespace '{ns}' in Ignorable but not declared" + for ns in undeclared + ) + except lxml.etree.XMLSyntaxError: + continue + + if errors: + print(f"FAILED - {len(errors)} namespace issues:") + for error in errors: + print(error) + return False + if self.verbose: + print("PASSED - All namespace prefixes properly declared") + return True + + def validate_unique_ids(self): + """Validate that specific IDs are unique according to OOXML requirements.""" + errors = [] + global_ids = {} # Track globally unique IDs across all files + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + file_ids = {} # Track IDs that must be unique within this file + + # Remove all mc:AlternateContent elements from the tree + mc_elements = root.xpath( + ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE} + ) + for elem in mc_elements: + elem.getparent().remove(elem) + + # Now check IDs in the cleaned tree + for elem in root.iter(): + # Get the element name without namespace + tag = ( + elem.tag.split("}")[-1].lower() + if "}" in elem.tag + else elem.tag.lower() + ) + + # Check if this element type has ID uniqueness requirements + if tag in self.UNIQUE_ID_REQUIREMENTS: + attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag] + + # Look for the specified attribute + id_value = None + for attr, value in elem.attrib.items(): + attr_local = ( + attr.split("}")[-1].lower() + if "}" in attr + else attr.lower() + ) + if attr_local == attr_name: + id_value = value + break + + if id_value is not None: + if scope == "global": + # Check global uniqueness + if id_value in global_ids: + prev_file, prev_line, 
prev_tag = global_ids[ + id_value + ] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> " + f"already used in {prev_file} at line {prev_line} in <{prev_tag}>" + ) + else: + global_ids[id_value] = ( + xml_file.relative_to(self.unpacked_dir), + elem.sourceline, + tag, + ) + elif scope == "file": + # Check file-level uniqueness + key = (tag, attr_name) + if key not in file_ids: + file_ids[key] = {} + + if id_value in file_ids[key]: + prev_line = file_ids[key][id_value] + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> " + f"(first occurrence at line {prev_line})" + ) + else: + file_ids[key][id_value] = elem.sourceline + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} ID uniqueness violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All required IDs are unique") + return True + + def validate_file_references(self): + """ + Validate that all .rels files properly reference files and that all files are referenced. 
+ """ + errors = [] + + # Find all .rels files + rels_files = list(self.unpacked_dir.rglob("*.rels")) + + if not rels_files: + if self.verbose: + print("PASSED - No .rels files found") + return True + + # Get all files in the unpacked directory (excluding reference files) + all_files = [] + for file_path in self.unpacked_dir.rglob("*"): + if ( + file_path.is_file() + and file_path.name != "[Content_Types].xml" + and not file_path.name.endswith(".rels") + ): # This file is not referenced by .rels + all_files.append(file_path.resolve()) + + # Track all files that are referenced by any .rels file + all_referenced_files = set() + + if self.verbose: + print( + f"Found {len(rels_files)} .rels files and {len(all_files)} target files" + ) + + # Check each .rels file + for rels_file in rels_files: + try: + # Parse relationships file + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + # Get the directory where this .rels file is located + rels_dir = rels_file.parent + + # Find all relationships and their targets + referenced_files = set() + broken_refs = [] + + for rel in rels_root.findall( + ".//ns:Relationship", + namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE}, + ): + target = rel.get("Target") + if target and not target.startswith( + ("http", "mailto:") + ): # Skip external URLs + # Resolve the target path relative to the .rels file location + if rels_file.name == ".rels": + # Root .rels file - targets are relative to unpacked_dir + target_path = self.unpacked_dir / target + else: + # Other .rels files - targets are relative to their parent's parent + # e.g., word/_rels/document.xml.rels -> targets relative to word/ + base_dir = rels_dir.parent + target_path = base_dir / target + + # Normalize the path and check if it exists + try: + target_path = target_path.resolve() + if target_path.exists() and target_path.is_file(): + referenced_files.add(target_path) + all_referenced_files.add(target_path) + else: + broken_refs.append((target, rel.sourceline)) + 
except (OSError, ValueError): + broken_refs.append((target, rel.sourceline)) + + # Report broken references + if broken_refs: + rel_path = rels_file.relative_to(self.unpacked_dir) + for broken_ref, line_num in broken_refs: + errors.append( + f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}" + ) + + except Exception as e: + rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append(f" Error parsing {rel_path}: {e}") + + # Check for unreferenced files (files that exist but are not referenced anywhere) + unreferenced_files = set(all_files) - all_referenced_files + + if unreferenced_files: + for unref_file in sorted(unreferenced_files): + unref_rel_path = unref_file.relative_to(self.unpacked_dir) + errors.append(f" Unreferenced file: {unref_rel_path}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship validation errors:") + for error in errors: + print(error) + print( + "CRITICAL: These errors will cause the document to appear corrupt. " + + "Broken references MUST be fixed, " + + "and unreferenced files MUST be referenced or removed." + ) + return False + else: + if self.verbose: + print( + "PASSED - All references are valid and all files are properly referenced" + ) + return True + + def validate_all_relationship_ids(self): + """ + Validate that all r:id attributes in XML files reference existing IDs + in their corresponding .rels files, and optionally validate relationship types. 
+ """ + import lxml.etree + + errors = [] + + # Process each XML file that might contain r:id references + for xml_file in self.xml_files: + # Skip .rels files themselves + if xml_file.suffix == ".rels": + continue + + # Determine the corresponding .rels file + # For dir/file.xml, it's dir/_rels/file.xml.rels + rels_dir = xml_file.parent / "_rels" + rels_file = rels_dir / f"{xml_file.name}.rels" + + # Skip if there's no corresponding .rels file (that's okay) + if not rels_file.exists(): + continue + + try: + # Parse the .rels file to get valid relationship IDs and their types + rels_root = lxml.etree.parse(str(rels_file)).getroot() + rid_to_type = {} + + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rid = rel.get("Id") + rel_type = rel.get("Type", "") + if rid: + # Check for duplicate rIds + if rid in rid_to_type: + rels_rel_path = rels_file.relative_to(self.unpacked_dir) + errors.append( + f" {rels_rel_path}: Line {rel.sourceline}: " + f"Duplicate relationship ID '{rid}' (IDs must be unique)" + ) + # Extract just the type name from the full URL + type_name = ( + rel_type.split("/")[-1] if "/" in rel_type else rel_type + ) + rid_to_type[rid] = type_name + + # Parse the XML file to find all r:id references + xml_root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all elements with r:id attributes + for elem in xml_root.iter(): + # Check for r:id attribute (relationship ID) + rid_attr = elem.get(f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id") + if rid_attr: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + elem_name = ( + elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag + ) + + # Check if the ID exists + if rid_attr not in rid_to_type: + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references non-existent relationship '{rid_attr}' " + f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' 
if len(rid_to_type) > 5 else ''})" + ) + # Check if we have type expectations for this element + elif self.ELEMENT_RELATIONSHIP_TYPES: + expected_type = self._get_expected_relationship_type( + elem_name + ) + if expected_type: + actual_type = rid_to_type[rid_attr] + # Check if the actual type matches or contains the expected type + if expected_type not in actual_type.lower(): + errors.append( + f" {xml_rel_path}: Line {elem.sourceline}: " + f"<{elem_name}> references '{rid_attr}' which points to '{actual_type}' " + f"but should point to a '{expected_type}' relationship" + ) + + except Exception as e: + xml_rel_path = xml_file.relative_to(self.unpacked_dir) + errors.append(f" Error processing {xml_rel_path}: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} relationship ID reference errors:") + for error in errors: + print(error) + print("\nThese ID mismatches will cause the document to appear corrupt!") + return False + else: + if self.verbose: + print("PASSED - All relationship ID references are valid") + return True + + def _get_expected_relationship_type(self, element_name): + """ + Get the expected relationship type for an element. + First checks the explicit mapping, then tries pattern detection. 
+ """ + # Normalize element name to lowercase + elem_lower = element_name.lower() + + # Check explicit mapping first + if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES: + return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower] + + # Try pattern detection for common patterns + # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type + if elem_lower.endswith("id") and len(elem_lower) > 2: + # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster" + prefix = elem_lower[:-2] # Remove "id" + # Check if this might be a compound like "sldMasterId" + if prefix.endswith("master"): + return prefix.lower() + elif prefix.endswith("layout"): + return prefix.lower() + else: + # Simple case like "sldId" -> "slide" + # Common transformations + if prefix == "sld": + return "slide" + return prefix.lower() + + # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type + if elem_lower.endswith("reference") and len(elem_lower) > 9: + prefix = elem_lower[:-9] # Remove "reference" + return prefix.lower() + + return None + + def validate_content_types(self): + """Validate that all content files are properly declared in [Content_Types].xml.""" + errors = [] + + # Find [Content_Types].xml file + content_types_file = self.unpacked_dir / "[Content_Types].xml" + if not content_types_file.exists(): + print("FAILED - [Content_Types].xml file not found") + return False + + try: + # Parse and get all declared parts and extensions + root = lxml.etree.parse(str(content_types_file)).getroot() + declared_parts = set() + declared_extensions = set() + + # Get Override declarations (specific files) + for override in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override" + ): + part_name = override.get("PartName") + if part_name is not None: + declared_parts.add(part_name.lstrip("/")) + + # Get Default declarations (by extension) + for default in root.findall( + f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default" + ): + extension = 
default.get("Extension") + if extension is not None: + declared_extensions.add(extension.lower()) + + # Root elements that require content type declaration + declarable_roots = { + "sld", + "sldLayout", + "sldMaster", + "presentation", # PowerPoint + "document", # Word + "workbook", + "worksheet", # Excel + "theme", # Common + } + + # Common media file extensions that should be declared + media_extensions = { + "png": "image/png", + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "gif": "image/gif", + "bmp": "image/bmp", + "tiff": "image/tiff", + "wmf": "image/x-wmf", + "emf": "image/x-emf", + } + + # Get all files in the unpacked directory + all_files = list(self.unpacked_dir.rglob("*")) + all_files = [f for f in all_files if f.is_file()] + + # Check all XML files for Override declarations + for xml_file in self.xml_files: + path_str = str(xml_file.relative_to(self.unpacked_dir)).replace( + "\\", "/" + ) + + # Skip non-content files + if any( + skip in path_str + for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"] + ): + continue + + try: + root_tag = lxml.etree.parse(str(xml_file)).getroot().tag + root_name = root_tag.split("}")[-1] if "}" in root_tag else root_tag + + if root_name in declarable_roots and path_str not in declared_parts: + errors.append( + f" {path_str}: File with <{root_name}> root not declared in [Content_Types].xml" + ) + + except Exception: + continue # Skip unparseable files + + # Check all non-XML files for Default extension declarations + for file_path in all_files: + # Skip XML files and metadata files (already checked above) + if file_path.suffix.lower() in {".xml", ".rels"}: + continue + if file_path.name == "[Content_Types].xml": + continue + if "_rels" in file_path.parts or "docProps" in file_path.parts: + continue + + extension = file_path.suffix.lstrip(".").lower() + if extension and extension not in declared_extensions: + # Check if it's a known media extension that should be declared + if extension in media_extensions: + 
relative_path = file_path.relative_to(self.unpacked_dir) + errors.append( + f' {relative_path}: File with extension \'{extension}\' not declared in [Content_Types].xml - should add: ' + ) + + except Exception as e: + errors.append(f" Error parsing [Content_Types].xml: {e}") + + if errors: + print(f"FAILED - Found {len(errors)} content type declaration errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print( + "PASSED - All content files are properly declared in [Content_Types].xml" + ) + return True + + def validate_file_against_xsd(self, xml_file, verbose=False): + """Validate a single XML file against XSD schema, comparing with original. + + Args: + xml_file: Path to XML file to validate + verbose: Enable verbose output + + Returns: + tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped) + """ + # Resolve both paths to handle symlinks + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + + # Validate current file + is_valid, current_errors = self._validate_single_file_xsd( + xml_file, unpacked_dir + ) + + if is_valid is None: + return None, set() # Skipped + elif is_valid: + return True, set() # Valid, no errors + + # Get errors from original file for this specific file + original_errors = self._get_original_file_errors(xml_file) + + # Compare with original (both are guaranteed to be sets here) + assert current_errors is not None + new_errors = current_errors - original_errors + + if new_errors: + if verbose: + relative_path = xml_file.relative_to(unpacked_dir) + print(f"FAILED - {relative_path}: {len(new_errors)} new error(s)") + for error in list(new_errors)[:3]: + truncated = error[:250] + "..." 
if len(error) > 250 else error + print(f" - {truncated}") + return False, new_errors + else: + # All errors existed in original + if verbose: + print( + f"PASSED - No new errors (original had {len(current_errors)} errors)" + ) + return True, set() + + def validate_against_xsd(self): + """Validate XML files against XSD schemas, showing only new errors compared to original.""" + new_errors = [] + original_error_count = 0 + valid_count = 0 + skipped_count = 0 + + for xml_file in self.xml_files: + relative_path = str(xml_file.relative_to(self.unpacked_dir)) + is_valid, new_file_errors = self.validate_file_against_xsd( + xml_file, verbose=False + ) + + if is_valid is None: + skipped_count += 1 + continue + elif is_valid and not new_file_errors: + valid_count += 1 + continue + elif is_valid: + # Had errors but all existed in original + original_error_count += 1 + valid_count += 1 + continue + + # Has new errors + new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)") + for error in list(new_file_errors)[:3]: # Show first 3 errors + new_errors.append( + f" - {error[:250]}..." 
if len(error) > 250 else f" - {error}" + ) + + # Print summary + if self.verbose: + print(f"Validated {len(self.xml_files)} files:") + print(f" - Valid: {valid_count}") + print(f" - Skipped (no schema): {skipped_count}") + if original_error_count: + print(f" - With original errors (ignored): {original_error_count}") + print( + f" - With NEW errors: {len(new_errors) > 0 and len([e for e in new_errors if not e.startswith(' ')]) or 0}" + ) + + if new_errors: + print("\nFAILED - Found NEW validation errors:") + for error in new_errors: + print(error) + return False + else: + if self.verbose: + print("\nPASSED - No new XSD validation errors introduced") + return True + + def _get_schema_path(self, xml_file): + """Determine the appropriate schema path for an XML file.""" + # Check exact filename match + if xml_file.name in self.SCHEMA_MAPPINGS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name] + + # Check .rels files + if xml_file.suffix == ".rels": + return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"] + + # Check chart files + if "charts/" in str(xml_file) and xml_file.name.startswith("chart"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"] + + # Check theme files + if "theme/" in str(xml_file) and xml_file.name.startswith("theme"): + return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"] + + # Check if file is in a main content folder and use appropriate schema + if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS: + return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name] + + return None + + def _clean_ignorable_namespaces(self, xml_doc): + """Remove attributes and elements not in allowed namespaces.""" + # Create a clean copy + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + # Remove attributes not in allowed namespaces + for elem in xml_copy.iter(): + attrs_to_remove = [] + + for attr in elem.attrib: + # Check if attribute is from a namespace other than 
allowed ones + if "{" in attr: + ns = attr.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + attrs_to_remove.append(attr) + + # Remove collected attributes + for attr in attrs_to_remove: + del elem.attrib[attr] + + # Remove elements not in allowed namespaces + self._remove_ignorable_elements(xml_copy) + + return lxml.etree.ElementTree(xml_copy) + + def _remove_ignorable_elements(self, root): + """Recursively remove all elements not in allowed namespaces.""" + elements_to_remove = [] + + # Find elements to remove + for elem in list(root): + # Skip non-element nodes (comments, processing instructions, etc.) + if not hasattr(elem, "tag") or callable(elem.tag): + continue + + tag_str = str(elem.tag) + if tag_str.startswith("{"): + ns = tag_str.split("}")[0][1:] + if ns not in self.OOXML_NAMESPACES: + elements_to_remove.append(elem) + continue + + # Recursively clean child elements + self._remove_ignorable_elements(elem) + + # Remove collected elements + for elem in elements_to_remove: + root.remove(elem) + + def _preprocess_for_mc_ignorable(self, xml_doc): + """Preprocess XML to handle mc:Ignorable attribute properly.""" + # Remove mc:Ignorable attributes before validation + root = xml_doc.getroot() + + # Remove mc:Ignorable attribute from root + if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib: + del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"] + + return xml_doc + + def _validate_single_file_xsd(self, xml_file, base_path): + """Validate a single XML file against XSD schema. 
Returns (is_valid, errors_set).""" + schema_path = self._get_schema_path(xml_file) + if not schema_path: + return None, None # Skip file + + try: + # Load schema + with open(schema_path, "rb") as xsd_file: + parser = lxml.etree.XMLParser() + xsd_doc = lxml.etree.parse( + xsd_file, parser=parser, base_url=str(schema_path) + ) + schema = lxml.etree.XMLSchema(xsd_doc) + + # Load and preprocess XML + with open(xml_file, "r") as f: + xml_doc = lxml.etree.parse(f) + + xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc) + xml_doc = self._preprocess_for_mc_ignorable(xml_doc) + + # Clean ignorable namespaces if needed + relative_path = xml_file.relative_to(base_path) + if ( + relative_path.parts + and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS + ): + xml_doc = self._clean_ignorable_namespaces(xml_doc) + + # Validate + if schema.validate(xml_doc): + return True, set() + else: + errors = set() + for error in schema.error_log: + # Store normalized error message (without line numbers for comparison) + errors.add(error.message) + return False, errors + + except Exception as e: + return False, {str(e)} + + def _get_original_file_errors(self, xml_file): + """Get XSD validation errors from a single file in the original document. 
+ + Args: + xml_file: Path to the XML file in unpacked_dir to check + + Returns: + set: Set of error messages from the original file + """ + import tempfile + import zipfile + + # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS) + xml_file = Path(xml_file).resolve() + unpacked_dir = self.unpacked_dir.resolve() + relative_path = xml_file.relative_to(unpacked_dir) + + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Extract original file + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + zip_ref.extractall(temp_path) + + # Find corresponding file in original + original_xml_file = temp_path / relative_path + + if not original_xml_file.exists(): + # File didn't exist in original, so no original errors + return set() + + # Validate the specific file in original + is_valid, errors = self._validate_single_file_xsd( + original_xml_file, temp_path + ) + return errors if errors else set() + + def _remove_template_tags_from_text_nodes(self, xml_doc): + """Remove template tags from XML text nodes and collect warnings. + + Template tags follow the pattern {{ ... }} and are used as placeholders + for content replacement. They should be removed from text content before + XSD validation while preserving XML structure. 
+ + Returns: + tuple: (cleaned_xml_doc, warnings_list) + """ + warnings = [] + template_pattern = re.compile(r"\{\{[^}]*\}\}") + + # Create a copy of the document to avoid modifying the original + xml_string = lxml.etree.tostring(xml_doc, encoding="unicode") + xml_copy = lxml.etree.fromstring(xml_string) + + def process_text_content(text, content_type): + if not text: + return text + matches = list(template_pattern.finditer(text)) + if matches: + for match in matches: + warnings.append( + f"Found template tag in {content_type}: {match.group()}" + ) + return template_pattern.sub("", text) + return text + + # Process all text nodes in the document + for elem in xml_copy.iter(): + # Skip processing if this is a w:t element + if not hasattr(elem, "tag") or callable(elem.tag): + continue + tag_str = str(elem.tag) + if tag_str.endswith("}t") or tag_str == "t": + continue + + elem.text = process_text_content(elem.text, "text content") + elem.tail = process_text_content(elem.tail, "tail content") + + return lxml.etree.ElementTree(xml_copy), warnings + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/deploy/data/skills/pptx/openxml/scripts/validation/docx.py b/deploy/data/skills/pptx/openxml/scripts/validation/docx.py new file mode 100644 index 000000000..602c47087 --- /dev/null +++ b/deploy/data/skills/pptx/openxml/scripts/validation/docx.py @@ -0,0 +1,274 @@ +""" +Validator for Word document XML files against XSD schemas. 
+""" + +import re +import tempfile +import zipfile + +import lxml.etree + +from .base import BaseSchemaValidator + + +class DOCXSchemaValidator(BaseSchemaValidator): + """Validator for Word document XML files against XSD schemas.""" + + # Word-specific namespace + WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + # Word-specific element to relationship type mappings + # Start with empty mapping - add specific cases as we discover them + ELEMENT_RELATIONSHIP_TYPES = {} + + def validate(self): + """Run all validation checks and return True if all pass.""" + # Test 0: XML well-formedness + if not self.validate_xml(): + return False + + # Test 1: Namespace declarations + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + # Test 2: Unique IDs + if not self.validate_unique_ids(): + all_valid = False + + # Test 3: Relationship and file reference validation + if not self.validate_file_references(): + all_valid = False + + # Test 4: Content type declarations + if not self.validate_content_types(): + all_valid = False + + # Test 5: XSD schema validation + if not self.validate_against_xsd(): + all_valid = False + + # Test 6: Whitespace preservation + if not self.validate_whitespace_preservation(): + all_valid = False + + # Test 7: Deletion validation + if not self.validate_deletions(): + all_valid = False + + # Test 8: Insertion validation + if not self.validate_insertions(): + all_valid = False + + # Test 9: Relationship ID reference validation + if not self.validate_all_relationship_ids(): + all_valid = False + + # Count and compare paragraphs + self.compare_paragraph_counts() + + return all_valid + + def validate_whitespace_preservation(self): + """ + Validate that w:t elements with whitespace have xml:space='preserve'. 
+ """ + errors = [] + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all w:t elements + for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"): + if elem.text: + text = elem.text + # Check if text starts or ends with whitespace + if re.match(r"^\s.*", text) or re.match(r".*\s$", text): + # Check if xml:space="preserve" attribute exists + xml_space_attr = f"{{{self.XML_NAMESPACE}}}space" + if ( + xml_space_attr not in elem.attrib + or elem.attrib[xml_space_attr] != "preserve" + ): + # Show a preview of the text + text_preview = ( + repr(text)[:50] + "..." + if len(repr(text)) > 50 + else repr(text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: w:t element with whitespace missing xml:space='preserve': {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} whitespace preservation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All whitespace is properly preserved") + return True + + def validate_deletions(self): + """ + Validate that w:t elements are not within w:del elements. + For some reason, XSD validation does not catch this, so we do it manually. 
+ """ + errors = [] + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Find all w:t elements that are descendants of w:del elements + namespaces = {"w": self.WORD_2006_NAMESPACE} + xpath_expression = ".//w:del//w:t" + problematic_t_elements = root.xpath( + xpath_expression, namespaces=namespaces + ) + for t_elem in problematic_t_elements: + if t_elem.text: + # Show a preview of the text + text_preview = ( + repr(t_elem.text)[:50] + "..." + if len(repr(t_elem.text)) > 50 + else repr(t_elem.text) + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {t_elem.sourceline}: found within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} deletion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:t elements found within w:del elements") + return True + + def count_paragraphs_in_unpacked(self): + """Count the number of paragraphs in the unpacked document.""" + count = 0 + + for xml_file in self.xml_files: + # Only check document.xml files + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + # Count all w:p elements + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + except Exception as e: + print(f"Error counting paragraphs in unpacked document: {e}") + + return count + + def count_paragraphs_in_original(self): + """Count the number of paragraphs in the original docx file.""" + count = 0 + + try: + # Create temporary directory to unpack original + with tempfile.TemporaryDirectory() as temp_dir: + # Unpack original docx + with zipfile.ZipFile(self.original_file, "r") as zip_ref: + 
zip_ref.extractall(temp_dir) + + # Parse document.xml + doc_xml_path = temp_dir + "/word/document.xml" + root = lxml.etree.parse(doc_xml_path).getroot() + + # Count all w:p elements + paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p") + count = len(paragraphs) + + except Exception as e: + print(f"Error counting paragraphs in original document: {e}") + + return count + + def validate_insertions(self): + """ + Validate that w:delText elements are not within w:ins elements. + w:delText is only allowed in w:ins if nested within a w:del. + """ + errors = [] + + for xml_file in self.xml_files: + if xml_file.name != "document.xml": + continue + + try: + root = lxml.etree.parse(str(xml_file)).getroot() + namespaces = {"w": self.WORD_2006_NAMESPACE} + + # Find w:delText in w:ins that are NOT within w:del + invalid_elements = root.xpath( + ".//w:ins//w:delText[not(ancestor::w:del)]", + namespaces=namespaces + ) + + for elem in invalid_elements: + text_preview = ( + repr(elem.text or "")[:50] + "..." 
+ if len(repr(elem.text or "")) > 50 + else repr(elem.text or "") + ) + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: within : {text_preview}" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} insertion validation violations:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - No w:delText elements within w:ins elements") + return True + + def compare_paragraph_counts(self): + """Compare paragraph counts between original and new document.""" + original_count = self.count_paragraphs_in_original() + new_count = self.count_paragraphs_in_unpacked() + + diff = new_count - original_count + diff_str = f"+{diff}" if diff > 0 else str(diff) + print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})") + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/deploy/data/skills/pptx/openxml/scripts/validation/pptx.py b/deploy/data/skills/pptx/openxml/scripts/validation/pptx.py new file mode 100644 index 000000000..66d5b1e2d --- /dev/null +++ b/deploy/data/skills/pptx/openxml/scripts/validation/pptx.py @@ -0,0 +1,315 @@ +""" +Validator for PowerPoint presentation XML files against XSD schemas. 
+""" + +import re + +from .base import BaseSchemaValidator + + +class PPTXSchemaValidator(BaseSchemaValidator): + """Validator for PowerPoint presentation XML files against XSD schemas.""" + + # PowerPoint presentation namespace + PRESENTATIONML_NAMESPACE = ( + "http://schemas.openxmlformats.org/presentationml/2006/main" + ) + + # PowerPoint-specific element to relationship type mappings + ELEMENT_RELATIONSHIP_TYPES = { + "sldid": "slide", + "sldmasterid": "slidemaster", + "notesmasterid": "notesmaster", + "sldlayoutid": "slidelayout", + "themeid": "theme", + "tablestyleid": "tablestyles", + } + + def validate(self): + """Run all validation checks and return True if all pass.""" + # Test 0: XML well-formedness + if not self.validate_xml(): + return False + + # Test 1: Namespace declarations + all_valid = True + if not self.validate_namespaces(): + all_valid = False + + # Test 2: Unique IDs + if not self.validate_unique_ids(): + all_valid = False + + # Test 3: UUID ID validation + if not self.validate_uuid_ids(): + all_valid = False + + # Test 4: Relationship and file reference validation + if not self.validate_file_references(): + all_valid = False + + # Test 5: Slide layout ID validation + if not self.validate_slide_layout_ids(): + all_valid = False + + # Test 6: Content type declarations + if not self.validate_content_types(): + all_valid = False + + # Test 7: XSD schema validation + if not self.validate_against_xsd(): + all_valid = False + + # Test 8: Notes slide reference validation + if not self.validate_notes_slide_references(): + all_valid = False + + # Test 9: Relationship ID reference validation + if not self.validate_all_relationship_ids(): + all_valid = False + + # Test 10: Duplicate slide layout references validation + if not self.validate_no_duplicate_slide_layouts(): + all_valid = False + + return all_valid + + def validate_uuid_ids(self): + """Validate that ID attributes that look like UUIDs contain only hex values.""" + import lxml.etree + + errors 
= [] + # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens + uuid_pattern = re.compile( + r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$" + ) + + for xml_file in self.xml_files: + try: + root = lxml.etree.parse(str(xml_file)).getroot() + + # Check all elements for ID attributes + for elem in root.iter(): + for attr, value in elem.attrib.items(): + # Check if this is an ID attribute + attr_name = attr.split("}")[-1].lower() + if attr_name == "id" or attr_name.endswith("id"): + # Check if value looks like a UUID (has the right length and pattern structure) + if self._looks_like_uuid(value): + # Validate that it contains only hex characters in the right positions + if not uuid_pattern.match(value): + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: " + f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} UUID ID validation errors:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All UUID-like IDs contain valid hex values") + return True + + def _looks_like_uuid(self, value): + """Check if a value has the general structure of a UUID.""" + # Remove common UUID delimiters + clean_value = value.strip("{}()").replace("-", "") + # Check if it's 32 hex-like characters (could include invalid hex chars) + return len(clean_value) == 32 and all(c.isalnum() for c in clean_value) + + def validate_slide_layout_ids(self): + """Validate that sldLayoutId elements in slide masters reference valid slide layouts.""" + import lxml.etree + + errors = [] + + # Find all slide master files + slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml")) + + if not slide_masters: + if self.verbose: + print("PASSED - No 
slide masters found") + return True + + for slide_master in slide_masters: + try: + # Parse the slide master file + root = lxml.etree.parse(str(slide_master)).getroot() + + # Find the corresponding _rels file for this slide master + rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels" + + if not rels_file.exists(): + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}" + ) + continue + + # Parse the relationships file + rels_root = lxml.etree.parse(str(rels_file)).getroot() + + # Build a set of valid relationship IDs that point to slide layouts + valid_layout_rids = set() + for rel in rels_root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "slideLayout" in rel_type: + valid_layout_rids.add(rel.get("Id")) + + # Find all sldLayoutId elements in the slide master + for sld_layout_id in root.findall( + f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId" + ): + r_id = sld_layout_id.get( + f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id" + ) + layout_id = sld_layout_id.get("id") + + if r_id and r_id not in valid_layout_rids: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: " + f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' " + f"references r:id='{r_id}' which is not found in slide layout relationships" + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print(f"FAILED - Found {len(errors)} slide layout ID validation errors:") + for error in errors: + print(error) + print( + "Remove invalid references or add missing slide layouts to the relationships file." 
+ ) + return False + else: + if self.verbose: + print("PASSED - All slide layout IDs reference valid slide layouts") + return True + + def validate_no_duplicate_slide_layouts(self): + """Validate that each slide has exactly one slideLayout reference.""" + import lxml.etree + + errors = [] + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + for rels_file in slide_rels_files: + try: + root = lxml.etree.parse(str(rels_file)).getroot() + + # Find all slideLayout relationships + layout_rels = [ + rel + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ) + if "slideLayout" in rel.get("Type", "") + ] + + if len(layout_rels) > 1: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references" + ) + + except Exception as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + if errors: + print("FAILED - Found slides with duplicate slideLayout references:") + for error in errors: + print(error) + return False + else: + if self.verbose: + print("PASSED - All slides have exactly one slideLayout reference") + return True + + def validate_notes_slide_references(self): + """Validate that each notesSlide file is referenced by only one slide.""" + import lxml.etree + + errors = [] + notes_slide_references = {} # Track which slides reference each notesSlide + + # Find all slide relationship files + slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels")) + + if not slide_rels_files: + if self.verbose: + print("PASSED - No slide relationship files found") + return True + + for rels_file in slide_rels_files: + try: + # Parse the relationships file + root = lxml.etree.parse(str(rels_file)).getroot() + + # Find all notesSlide relationships + for rel in root.findall( + f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship" + ): + rel_type = rel.get("Type", "") + if "notesSlide" in rel_type: + target = 
rel.get("Target", "") + if target: + # Normalize the target path to handle relative paths + normalized_target = target.replace("../", "") + + # Track which slide references this notesSlide + slide_name = rels_file.stem.replace( + ".xml", "" + ) # e.g., "slide1" + + if normalized_target not in notes_slide_references: + notes_slide_references[normalized_target] = [] + notes_slide_references[normalized_target].append( + (slide_name, rels_file) + ) + + except (lxml.etree.XMLSyntaxError, Exception) as e: + errors.append( + f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}" + ) + + # Check for duplicate references + for target, references in notes_slide_references.items(): + if len(references) > 1: + slide_names = [ref[0] for ref in references] + errors.append( + f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}" + ) + for slide_name, rels_file in references: + errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}") + + if errors: + print( + f"FAILED - Found {len([e for e in errors if not e.startswith(' ')])} notes slide reference validation errors:" + ) + for error in errors: + print(error) + print("Each slide may optionally have its own slide file.") + return False + else: + if self.verbose: + print("PASSED - All notes slide references are unique") + return True + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/deploy/data/skills/pptx/openxml/scripts/validation/redlining.py b/deploy/data/skills/pptx/openxml/scripts/validation/redlining.py new file mode 100644 index 000000000..7ed425edf --- /dev/null +++ b/deploy/data/skills/pptx/openxml/scripts/validation/redlining.py @@ -0,0 +1,279 @@ +""" +Validator for tracked changes in Word documents. 
+""" + +import subprocess +import tempfile +import zipfile +from pathlib import Path + + +class RedliningValidator: + """Validator for tracked changes in Word documents.""" + + def __init__(self, unpacked_dir, original_docx, verbose=False): + self.unpacked_dir = Path(unpacked_dir) + self.original_docx = Path(original_docx) + self.verbose = verbose + self.namespaces = { + "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + } + + def validate(self): + """Main validation method that returns True if valid, False otherwise.""" + # Verify unpacked directory exists and has correct structure + modified_file = self.unpacked_dir / "word" / "document.xml" + if not modified_file.exists(): + print(f"FAILED - Modified document.xml not found at {modified_file}") + return False + + # First, check if there are any tracked changes by Claude to validate + try: + import xml.etree.ElementTree as ET + + tree = ET.parse(modified_file) + root = tree.getroot() + + # Check for w:del or w:ins tags authored by Claude + del_elements = root.findall(".//w:del", self.namespaces) + ins_elements = root.findall(".//w:ins", self.namespaces) + + # Filter to only include changes by Claude + claude_del_elements = [ + elem + for elem in del_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude" + ] + claude_ins_elements = [ + elem + for elem in ins_elements + if elem.get(f"{{{self.namespaces['w']}}}author") == "Claude" + ] + + # Redlining validation is only needed if tracked changes by Claude have been used. 
+ if not claude_del_elements and not claude_ins_elements: + if self.verbose: + print("PASSED - No tracked changes by Claude found.") + return True + + except Exception: + # If we can't parse the XML, continue with full validation + pass + + # Create temporary directory for unpacking original docx + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Unpack original docx + try: + with zipfile.ZipFile(self.original_docx, "r") as zip_ref: + zip_ref.extractall(temp_path) + except Exception as e: + print(f"FAILED - Error unpacking original docx: {e}") + return False + + original_file = temp_path / "word" / "document.xml" + if not original_file.exists(): + print( + f"FAILED - Original document.xml not found in {self.original_docx}" + ) + return False + + # Parse both XML files using xml.etree.ElementTree for redlining validation + try: + import xml.etree.ElementTree as ET + + modified_tree = ET.parse(modified_file) + modified_root = modified_tree.getroot() + original_tree = ET.parse(original_file) + original_root = original_tree.getroot() + except ET.ParseError as e: + print(f"FAILED - Error parsing XML files: {e}") + return False + + # Remove Claude's tracked changes from both documents + self._remove_claude_tracked_changes(original_root) + self._remove_claude_tracked_changes(modified_root) + + # Extract and compare text content + modified_text = self._extract_text_content(modified_root) + original_text = self._extract_text_content(original_root) + + if modified_text != original_text: + # Show detailed character-level differences for each paragraph + error_message = self._generate_detailed_diff( + original_text, modified_text + ) + print(error_message) + return False + + if self.verbose: + print("PASSED - All changes by Claude are properly tracked") + return True + + def _generate_detailed_diff(self, original_text, modified_text): + """Generate detailed word-level differences using git word diff.""" + error_parts = [ + "FAILED - Document 
text doesn't match after removing Claude's tracked changes", + "", + "Likely causes:", + " 1. Modified text inside another author's or tags", + " 2. Made edits without proper tracked changes", + " 3. Didn't nest inside when deleting another's insertion", + "", + "For pre-redlined documents, use correct patterns:", + " - To reject another's INSERTION: Nest inside their ", + " - To restore another's DELETION: Add new AFTER their ", + "", + ] + + # Show git word diff + git_diff = self._get_git_word_diff(original_text, modified_text) + if git_diff: + error_parts.extend(["Differences:", "============", git_diff]) + else: + error_parts.append("Unable to generate word diff (git not available)") + + return "\n".join(error_parts) + + def _get_git_word_diff(self, original_text, modified_text): + """Generate word diff using git with character-level precision.""" + try: + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create two files + original_file = temp_path / "original.txt" + modified_file = temp_path / "modified.txt" + + original_file.write_text(original_text, encoding="utf-8") + modified_file.write_text(modified_text, encoding="utf-8") + + # Try character-level diff first for precise differences + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "--word-diff-regex=.", # Character-by-character diff + "-U0", # Zero lines of context - show only changed lines + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + # Clean up the output - remove git diff header lines + lines = result.stdout.split("\n") + # Skip the header lines (diff --git, index, +++, ---, @@) + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + + if content_lines: + return "\n".join(content_lines) + + # Fallback to word-level diff if 
character-level is too verbose + result = subprocess.run( + [ + "git", + "diff", + "--word-diff=plain", + "-U0", # Zero lines of context + "--no-index", + str(original_file), + str(modified_file), + ], + capture_output=True, + text=True, + ) + + if result.stdout.strip(): + lines = result.stdout.split("\n") + content_lines = [] + in_content = False + for line in lines: + if line.startswith("@@"): + in_content = True + continue + if in_content and line.strip(): + content_lines.append(line) + return "\n".join(content_lines) + + except (subprocess.CalledProcessError, FileNotFoundError, Exception): + # Git not available or other error, return None to use fallback + pass + + return None + + def _remove_claude_tracked_changes(self, root): + """Remove tracked changes authored by Claude from the XML root.""" + ins_tag = f"{{{self.namespaces['w']}}}ins" + del_tag = f"{{{self.namespaces['w']}}}del" + author_attr = f"{{{self.namespaces['w']}}}author" + + # Remove w:ins elements + for parent in root.iter(): + to_remove = [] + for child in parent: + if child.tag == ins_tag and child.get(author_attr) == "Claude": + to_remove.append(child) + for elem in to_remove: + parent.remove(elem) + + # Unwrap content in w:del elements where author is "Claude" + deltext_tag = f"{{{self.namespaces['w']}}}delText" + t_tag = f"{{{self.namespaces['w']}}}t" + + for parent in root.iter(): + to_process = [] + for child in parent: + if child.tag == del_tag and child.get(author_attr) == "Claude": + to_process.append((child, list(parent).index(child))) + + # Process in reverse order to maintain indices + for del_elem, del_index in reversed(to_process): + # Convert w:delText to w:t before moving + for elem in del_elem.iter(): + if elem.tag == deltext_tag: + elem.tag = t_tag + + # Move all children of w:del to its parent before removing w:del + for child in reversed(list(del_elem)): + parent.insert(del_index, child) + parent.remove(del_elem) + + def _extract_text_content(self, root): + """Extract text 
content from Word XML, preserving paragraph structure. + + Empty paragraphs are skipped to avoid false positives when tracked + insertions add only structural elements without text content. + """ + p_tag = f"{{{self.namespaces['w']}}}p" + t_tag = f"{{{self.namespaces['w']}}}t" + + paragraphs = [] + for p_elem in root.findall(f".//{p_tag}"): + # Get all text elements within this paragraph + text_parts = [] + for t_elem in p_elem.findall(f".//{t_tag}"): + if t_elem.text: + text_parts.append(t_elem.text) + paragraph_text = "".join(text_parts) + # Skip empty paragraphs - they don't affect content validation + if paragraph_text: + paragraphs.append(paragraph_text) + + return "\n".join(paragraphs) + + +if __name__ == "__main__": + raise RuntimeError("This module should not be run directly.") diff --git a/deploy/data/skills/pptx/scripts/reorder.py b/deploy/data/skills/pptx/scripts/reorder.py new file mode 100755 index 000000000..e33102c71 --- /dev/null +++ b/deploy/data/skills/pptx/scripts/reorder.py @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 +""" +Reorder PowerPoint slides based on a sequence of indices. + +Usage: + python reorder.py template.pptx output.pptx 0,34,34,50,52 + +This will generate output.pptx using slides from template.pptx in the specified order. +Slides can be repeated (e.g., 34 appears twice). 
+""" + +import argparse +import shutil +import sys +from copy import deepcopy +from pathlib import Path + +import six +from pptx import Presentation + + +def main(): + parser = argparse.ArgumentParser( + description="Rearrange PowerPoint slides based on a sequence of indices.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python rearrange.py template.pptx output.pptx 0,34,34,50,52 + Creates output.pptx using slides 0, 34 (twice), 50, and 52 from template.pptx + + python rearrange.py template.pptx output.pptx 5,3,1,2,4 + Creates output.pptx with slides reordered as specified + +Note: Slide indices are 0-based (first slide is 0, second is 1, etc.) + """, + ) + + parser.add_argument("template", help="Path to template PPTX file") + parser.add_argument("output", help="Path for output PPTX file") + parser.add_argument( + "sequence", help="Comma-separated sequence of slide indices (0-based)" + ) + + args = parser.parse_args() + + # Parse the slide sequence + try: + slide_sequence = [int(x.strip()) for x in args.sequence.split(",")] + except ValueError: + print( + "Error: Invalid sequence format. 
Use comma-separated integers (e.g., 0,34,34,50,52)" + ) + sys.exit(1) + + # Check template exists + template_path = Path(args.template) + if not template_path.exists(): + print(f"Error: Template file not found: {args.template}") + sys.exit(1) + + # Create output directory if needed + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + + try: + rearrange_presentation(template_path, output_path, slide_sequence) + except ValueError as e: + print(f"Error: {e}") + sys.exit(1) + except Exception as e: + print(f"Error processing presentation: {e}") + sys.exit(1) + + +def duplicate_slide(pres, index): + """Duplicate a slide in the presentation.""" + source = pres.slides[index] + + # Use source's layout to preserve formatting + new_slide = pres.slides.add_slide(source.slide_layout) + + # Collect all image and media relationships from the source slide + image_rels = {} + for rel_id, rel in six.iteritems(source.part.rels): + if "image" in rel.reltype or "media" in rel.reltype: + image_rels[rel_id] = rel + + # CRITICAL: Clear placeholder shapes to avoid duplicates + for shape in new_slide.shapes: + sp = shape.element + sp.getparent().remove(sp) + + # Copy all shapes from source + for shape in source.shapes: + el = shape.element + new_el = deepcopy(el) + new_slide.shapes._spTree.insert_element_before(new_el, "p:extLst") + + # Handle picture shapes - need to update the blip reference + # Look for all blip elements (they can be in pic or other contexts) + # Using the element's own xpath method without namespaces argument + blips = new_el.xpath(".//a:blip[@r:embed]") + for blip in blips: + old_rId = blip.get( + "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed" + ) + if old_rId in image_rels: + # Create a new relationship in the destination slide for this image + old_rel = image_rels[old_rId] + # get_or_add returns the rId directly, or adds and returns new rId + new_rId = new_slide.part.rels.get_or_add( + 
old_rel.reltype, old_rel._target + ) + # Update the blip's embed reference to use the new relationship ID + blip.set( + "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed", + new_rId, + ) + + # Copy any additional image/media relationships that might be referenced elsewhere + for rel_id, rel in image_rels.items(): + try: + new_slide.part.rels.get_or_add(rel.reltype, rel._target) + except Exception: + pass # Relationship might already exist + + return new_slide + + +def delete_slide(pres, index): + """Delete a slide from the presentation.""" + rId = pres.slides._sldIdLst[index].rId + pres.part.drop_rel(rId) + del pres.slides._sldIdLst[index] + + +def reorder_slides(pres, slide_index, target_index): + """Move a slide from one position to another.""" + slides = pres.slides._sldIdLst + + # Remove slide element from current position + slide_element = slides[slide_index] + slides.remove(slide_element) + + # Insert at target position + slides.insert(target_index, slide_element) + + +def rearrange_presentation(template_path, output_path, slide_sequence): + """ + Create a new presentation with slides from template in specified order. 
+ + Args: + template_path: Path to template PPTX file + output_path: Path for output PPTX file + slide_sequence: List of slide indices (0-based) to include + """ + # Copy template to preserve dimensions and theme + if template_path != output_path: + shutil.copy2(template_path, output_path) + prs = Presentation(output_path) + else: + prs = Presentation(template_path) + + total_slides = len(prs.slides) + + # Validate indices + for idx in slide_sequence: + if idx < 0 or idx >= total_slides: + raise ValueError(f"Slide index {idx} out of range (0-{total_slides - 1})") + + # Track original slides and their duplicates + slide_map = [] # List of actual slide indices for final presentation + duplicated = {} # Track duplicates: original_idx -> [duplicate_indices] + + # Step 1: DUPLICATE repeated slides + print(f"Processing {len(slide_sequence)} slides from template...") + for i, template_idx in enumerate(slide_sequence): + if template_idx in duplicated and duplicated[template_idx]: + # Already duplicated this slide, use the duplicate + slide_map.append(duplicated[template_idx].pop(0)) + print(f" [{i}] Using duplicate of slide {template_idx}") + elif slide_sequence.count(template_idx) > 1 and template_idx not in duplicated: + # First occurrence of a repeated slide - create duplicates + slide_map.append(template_idx) + duplicates = [] + count = slide_sequence.count(template_idx) - 1 + print( + f" [{i}] Using original slide {template_idx}, creating {count} duplicate(s)" + ) + for _ in range(count): + duplicate_slide(prs, template_idx) + duplicates.append(len(prs.slides) - 1) + duplicated[template_idx] = duplicates + else: + # Unique slide or first occurrence already handled, use original + slide_map.append(template_idx) + print(f" [{i}] Using original slide {template_idx}") + + # Step 2: DELETE unwanted slides (work backwards) + slides_to_keep = set(slide_map) + print(f"\nDeleting {len(prs.slides) - len(slides_to_keep)} unused slides...") + for i in range(len(prs.slides) - 1, 
-1, -1): + if i not in slides_to_keep: + delete_slide(prs, i) + # Update slide_map indices after deletion + slide_map = [idx - 1 if idx > i else idx for idx in slide_map] + + # Step 3: REORDER to final sequence + print(f"Reordering {len(slide_map)} slides to final sequence...") + for target_pos in range(len(slide_map)): + # Find which slide should be at target_pos + current_pos = slide_map[target_pos] + if current_pos != target_pos: + reorder_slides(prs, current_pos, target_pos) + # Update slide_map: the move shifts other slides + for i in range(len(slide_map)): + if slide_map[i] > current_pos and slide_map[i] <= target_pos: + slide_map[i] -= 1 + elif slide_map[i] < current_pos and slide_map[i] >= target_pos: + slide_map[i] += 1 + slide_map[target_pos] = target_pos + + # Save the presentation + prs.save(output_path) + print(f"\nSaved rearranged presentation to: {output_path}") + print(f"Final presentation has {len(prs.slides)} slides") + + +if __name__ == "__main__": + main() diff --git a/deploy/data/skills/pptx/scripts/slideConverter.js b/deploy/data/skills/pptx/scripts/slideConverter.js new file mode 100644 index 000000000..5a845ed7a --- /dev/null +++ b/deploy/data/skills/pptx/scripts/slideConverter.js @@ -0,0 +1,979 @@ +/** + * slideConverter - Transform HTML slide designs into pptxgenjs slides with positioned elements + * + * USAGE: + * const pptx = new pptxgen(); + * pptx.layout = 'LAYOUT_16x9'; // Must match HTML body dimensions + * + * const { slide, placeholders } = await convertSlide('slide.html', pptx); + * slide.addChart(pptx.charts.LINE, data, placeholders[0]); + * + * await pptx.writeFile('output.pptx'); + * + * FEATURES: + * - Transforms HTML to PowerPoint with precise positioning + * - Handles text, images, shapes, and bullet lists + * - Extracts placeholder elements (class="placeholder") with positions + * - Processes CSS gradients, borders, and margins + * + * VALIDATION: + * - Uses body width/height from HTML for viewport sizing + * - Throws error 
if HTML dimensions don't match presentation layout + * - Throws error if content overflows body (with overflow details) + * + * RETURNS: + * { slide, placeholders } where placeholders is an array of { id, x, y, w, h } + */ + +const { chromium } = require('playwright'); +const path = require('path'); +const sharp = require('sharp'); + +const POINTS_PER_PIXEL = 0.75; +const PIXELS_PER_INCH = 96; +const EMU_PER_INCH = 914400; + +// Helper: Get body dimensions and check for overflow +async function getBodyDimensions(page) { + const bodyDimensions = await page.evaluate(() => { + const body = document.body; + const style = window.getComputedStyle(body); + + return { + width: parseFloat(style.width), + height: parseFloat(style.height), + scrollWidth: body.scrollWidth, + scrollHeight: body.scrollHeight + }; + }); + + const errors = []; + const widthOverflowPx = Math.max(0, bodyDimensions.scrollWidth - bodyDimensions.width - 1); + const heightOverflowPx = Math.max(0, bodyDimensions.scrollHeight - bodyDimensions.height - 1); + + const widthOverflowPt = widthOverflowPx * POINTS_PER_PIXEL; + const heightOverflowPt = heightOverflowPx * POINTS_PER_PIXEL; + + if (widthOverflowPt > 0 || heightOverflowPt > 0) { + const directions = []; + if (widthOverflowPt > 0) directions.push(`${widthOverflowPt.toFixed(1)}pt horizontally`); + if (heightOverflowPt > 0) directions.push(`${heightOverflowPt.toFixed(1)}pt vertically`); + const reminder = heightOverflowPt > 0 ? 
' (Remember: leave 0.5" margin at bottom of slide)' : ''; + errors.push(`HTML content overflows body by ${directions.join(' and ')}${reminder}`); + } + + return { ...bodyDimensions, errors }; +} + +// Helper: Validate dimensions match presentation layout +function validateDimensions(bodyDimensions, pres) { + const errors = []; + const widthInches = bodyDimensions.width / PIXELS_PER_INCH; + const heightInches = bodyDimensions.height / PIXELS_PER_INCH; + + if (pres.presLayout) { + const layoutWidth = pres.presLayout.width / EMU_PER_INCH; + const layoutHeight = pres.presLayout.height / EMU_PER_INCH; + + if (Math.abs(layoutWidth - widthInches) > 0.1 || Math.abs(layoutHeight - heightInches) > 0.1) { + errors.push( + `HTML dimensions (${widthInches.toFixed(1)}" × ${heightInches.toFixed(1)}") ` + + `don't match presentation layout (${layoutWidth.toFixed(1)}" × ${layoutHeight.toFixed(1)}")` + ); + } + } + return errors; +} + +function validateTextBoxPosition(slideData, bodyDimensions) { + const errors = []; + const slideHeightInches = bodyDimensions.height / PIXELS_PER_INCH; + const minBottomMargin = 0.5; // 0.5 inches from bottom + + for (const el of slideData.elements) { + // Check text elements (p, h1-h6, list) + if (['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'list'].includes(el.type)) { + const fontSize = el.style?.fontSize || 0; + const bottomEdge = el.position.y + el.position.h; + const distanceFromBottom = slideHeightInches - bottomEdge; + + if (fontSize > 12 && distanceFromBottom < minBottomMargin) { + const getText = () => { + if (typeof el.text === 'string') return el.text; + if (Array.isArray(el.text)) return el.text.find(t => t.text)?.text || ''; + if (Array.isArray(el.items)) return el.items.find(item => item.text)?.text || ''; + return ''; + }; + const textPrefix = getText().substring(0, 50) + (getText().length > 50 ? '...' 
: ''); + + errors.push( + `Text box "${textPrefix}" ends too close to bottom edge ` + + `(${distanceFromBottom.toFixed(2)}" from bottom, minimum ${minBottomMargin}" required)` + ); + } + } + } + + return errors; +} + +// Helper: Add background to slide +async function addBackground(slideData, targetSlide, tmpDir) { + if (slideData.background.type === 'image' && slideData.background.path) { + let imagePath = slideData.background.path.startsWith('file://') + ? slideData.background.path.replace('file://', '') + : slideData.background.path; + targetSlide.background = { path: imagePath }; + } else if (slideData.background.type === 'color' && slideData.background.value) { + targetSlide.background = { color: slideData.background.value }; + } +} + +// Helper: Add elements to slide +function addElements(slideData, targetSlide, pres) { + for (const el of slideData.elements) { + if (el.type === 'image') { + let imagePath = el.src.startsWith('file://') ? el.src.replace('file://', '') : el.src; + targetSlide.addImage({ + path: imagePath, + x: el.position.x, + y: el.position.y, + w: el.position.w, + h: el.position.h + }); + } else if (el.type === 'line') { + targetSlide.addShape(pres.ShapeType.line, { + x: el.x1, + y: el.y1, + w: el.x2 - el.x1, + h: el.y2 - el.y1, + line: { color: el.color, width: el.width } + }); + } else if (el.type === 'shape') { + const shapeOptions = { + x: el.position.x, + y: el.position.y, + w: el.position.w, + h: el.position.h, + shape: el.shape.rectRadius > 0 ? 
pres.ShapeType.roundRect : pres.ShapeType.rect + }; + + if (el.shape.fill) { + shapeOptions.fill = { color: el.shape.fill }; + if (el.shape.transparency != null) shapeOptions.fill.transparency = el.shape.transparency; + } + if (el.shape.line) shapeOptions.line = el.shape.line; + if (el.shape.rectRadius > 0) shapeOptions.rectRadius = el.shape.rectRadius; + if (el.shape.shadow) shapeOptions.shadow = el.shape.shadow; + + targetSlide.addText(el.text || '', shapeOptions); + } else if (el.type === 'list') { + const listOptions = { + x: el.position.x, + y: el.position.y, + w: el.position.w, + h: el.position.h, + fontSize: el.style.fontSize, + fontFace: el.style.fontFace, + color: el.style.color, + align: el.style.align, + valign: 'top', + lineSpacing: el.style.lineSpacing, + paraSpaceBefore: el.style.paraSpaceBefore, + paraSpaceAfter: el.style.paraSpaceAfter, + margin: el.style.margin + }; + if (el.style.margin) listOptions.margin = el.style.margin; + targetSlide.addText(el.items, listOptions); + } else { + // Check if text is single-line (height suggests one line) + const lineHeight = el.style.lineSpacing || el.style.fontSize * 1.2; + const isSingleLine = el.position.h <= lineHeight * 1.5; + + let adjustedX = el.position.x; + let adjustedW = el.position.w; + + // Make single-line text 2% wider to account for underestimate + if (isSingleLine) { + const widthIncrease = el.position.w * 0.02; + const align = el.style.align; + + if (align === 'center') { + // Center: expand both sides + adjustedX = el.position.x - (widthIncrease / 2); + adjustedW = el.position.w + widthIncrease; + } else if (align === 'right') { + // Right: expand to the left + adjustedX = el.position.x - widthIncrease; + adjustedW = el.position.w + widthIncrease; + } else { + // Left (default): expand to the right + adjustedW = el.position.w + widthIncrease; + } + } + + const textOptions = { + x: adjustedX, + y: el.position.y, + w: adjustedW, + h: el.position.h, + fontSize: el.style.fontSize, + fontFace: 
el.style.fontFace, + color: el.style.color, + bold: el.style.bold, + italic: el.style.italic, + underline: el.style.underline, + valign: 'top', + lineSpacing: el.style.lineSpacing, + paraSpaceBefore: el.style.paraSpaceBefore, + paraSpaceAfter: el.style.paraSpaceAfter, + inset: 0 // Remove default PowerPoint internal padding + }; + + if (el.style.align) textOptions.align = el.style.align; + if (el.style.margin) textOptions.margin = el.style.margin; + if (el.style.rotate !== undefined) textOptions.rotate = el.style.rotate; + if (el.style.transparency !== null && el.style.transparency !== undefined) textOptions.transparency = el.style.transparency; + + targetSlide.addText(el.text, textOptions); + } + } +} + +// Helper: Extract slide data from HTML page +async function extractSlideData(page) { + return await page.evaluate(() => { + const POINTS_PER_PIXEL = 0.75; + const PIXELS_PER_INCH = 96; + + // Fonts that are single-weight and should not have bold applied + // (applying bold causes PowerPoint to use faux bold which makes text wider) + const SINGLE_WEIGHT_FONTS = ['impact']; + + // Helper: Check if a font should skip bold formatting + const shouldSkipBold = (fontFamily) => { + if (!fontFamily) return false; + const normalizedFont = fontFamily.toLowerCase().replace(/['"]/g, '').split(',')[0].trim(); + return SINGLE_WEIGHT_FONTS.includes(normalizedFont); + }; + + // Unit conversion helpers + const pxToInch = (px) => px / PIXELS_PER_INCH; + const pxToPoints = (pxStr) => parseFloat(pxStr) * POINTS_PER_PIXEL; + const rgbToHex = (rgbStr) => { + // Handle transparent backgrounds by defaulting to white + if (rgbStr === 'rgba(0, 0, 0, 0)' || rgbStr === 'transparent') return 'FFFFFF'; + + const match = rgbStr.match(/rgba?\((\d+),\s*(\d+),\s*(\d+)/); + if (!match) return 'FFFFFF'; + return match.slice(1).map(n => parseInt(n).toString(16).padStart(2, '0')).join(''); + }; + + const extractAlpha = (rgbStr) => { + const match = 
rgbStr.match(/rgba\((\d+),\s*(\d+),\s*(\d+),\s*([\d.]+)\)/); + if (!match || !match[4]) return null; + const alpha = parseFloat(match[4]); + return Math.round((1 - alpha) * 100); + }; + + const applyTextTransform = (text, textTransform) => { + if (textTransform === 'uppercase') return text.toUpperCase(); + if (textTransform === 'lowercase') return text.toLowerCase(); + if (textTransform === 'capitalize') { + return text.replace(/\b\w/g, c => c.toUpperCase()); + } + return text; + }; + + // Extract rotation angle from CSS transform and writing-mode + const getRotation = (transform, writingMode) => { + let angle = 0; + + // Handle writing-mode first + // PowerPoint: 90° = text rotated 90° clockwise (reads top to bottom, letters upright) + // PowerPoint: 270° = text rotated 270° clockwise (reads bottom to top, letters upright) + if (writingMode === 'vertical-rl') { + // vertical-rl alone = text reads top to bottom = 90° in PowerPoint + angle = 90; + } else if (writingMode === 'vertical-lr') { + // vertical-lr alone = text reads bottom to top = 270° in PowerPoint + angle = 270; + } + + // Then add any transform rotation + if (transform && transform !== 'none') { + // Try to match rotate() function + const rotateMatch = transform.match(/rotate\((-?\d+(?:\.\d+)?)deg\)/); + if (rotateMatch) { + angle += parseFloat(rotateMatch[1]); + } else { + // Browser may compute as matrix - extract rotation from matrix + const matrixMatch = transform.match(/matrix\(([^)]+)\)/); + if (matrixMatch) { + const values = matrixMatch[1].split(',').map(parseFloat); + // matrix(a, b, c, d, e, f) where rotation = atan2(b, a) + const matrixAngle = Math.atan2(values[1], values[0]) * (180 / Math.PI); + angle += Math.round(matrixAngle); + } + } + } + + // Normalize to 0-359 range + angle = angle % 360; + if (angle < 0) angle += 360; + + return angle === 0 ? 
null : angle; + }; + + // Get position/dimensions accounting for rotation + const getPositionAndSize = (el, rect, rotation) => { + if (rotation === null) { + return { x: rect.left, y: rect.top, w: rect.width, h: rect.height }; + } + + // For 90° or 270° rotations, swap width and height + // because PowerPoint applies rotation to the original (unrotated) box + const isVertical = rotation === 90 || rotation === 270; + + if (isVertical) { + // The browser shows us the rotated dimensions (tall box for vertical text) + // But PowerPoint needs the pre-rotation dimensions (wide box that will be rotated) + // So we swap: browser's height becomes PPT's width, browser's width becomes PPT's height + const centerX = rect.left + rect.width / 2; + const centerY = rect.top + rect.height / 2; + + return { + x: centerX - rect.height / 2, + y: centerY - rect.width / 2, + w: rect.height, + h: rect.width + }; + } + + // For other rotations, use element's offset dimensions + const centerX = rect.left + rect.width / 2; + const centerY = rect.top + rect.height / 2; + return { + x: centerX - el.offsetWidth / 2, + y: centerY - el.offsetHeight / 2, + w: el.offsetWidth, + h: el.offsetHeight + }; + }; + + // Parse CSS box-shadow into PptxGenJS shadow properties + const parseBoxShadow = (boxShadow) => { + if (!boxShadow || boxShadow === 'none') return null; + + // Browser computed style format: "rgba(0, 0, 0, 0.3) 2px 2px 8px 0px [inset]" + // CSS format: "[inset] 2px 2px 8px 0px rgba(0, 0, 0, 0.3)" + + const insetMatch = boxShadow.match(/inset/); + + // IMPORTANT: PptxGenJS/PowerPoint doesn't properly support inset shadows + // Only process outer shadows to avoid file corruption + if (insetMatch) return null; + + // Extract color first (rgba or rgb at start) + const colorMatch = boxShadow.match(/rgba?\([^)]+\)/); + + // Extract numeric values (handles both px and pt units) + const parts = boxShadow.match(/([-\d.]+)(px|pt)/g); + + if (!parts || parts.length < 2) return null; + + const offsetX 
= parseFloat(parts[0]); + const offsetY = parseFloat(parts[1]); + const blur = parts.length > 2 ? parseFloat(parts[2]) : 0; + + // Calculate angle from offsets (in degrees, 0 = right, 90 = down) + let angle = 0; + if (offsetX !== 0 || offsetY !== 0) { + angle = Math.atan2(offsetY, offsetX) * (180 / Math.PI); + if (angle < 0) angle += 360; + } + + // Calculate offset distance (hypotenuse) + const offset = Math.sqrt(offsetX * offsetX + offsetY * offsetY) * POINTS_PER_PIXEL; + + // Extract opacity from rgba + let opacity = 0.5; + if (colorMatch) { + const opacityMatch = colorMatch[0].match(/[\d.]+\)$/); + if (opacityMatch) { + opacity = parseFloat(opacityMatch[0].replace(')', '')); + } + } + + return { + type: 'outer', + angle: Math.round(angle), + blur: blur * 0.75, // Convert to points + color: colorMatch ? rgbToHex(colorMatch[0]) : '000000', + offset: offset, + opacity + }; + }; + + // Parse inline formatting tags (, , , , , ) into text runs + const parseInlineFormatting = (element, baseOptions = {}, runs = [], baseTextTransform = (x) => x) => { + let prevNodeIsText = false; + + element.childNodes.forEach((node) => { + let textTransform = baseTextTransform; + + const isText = node.nodeType === Node.TEXT_NODE || node.tagName === 'BR'; + if (isText) { + const text = node.tagName === 'BR' ? 
'\n' : textTransform(node.textContent.replace(/\s+/g, ' ')); + const prevRun = runs[runs.length - 1]; + if (prevNodeIsText && prevRun) { + prevRun.text += text; + } else { + runs.push({ text, options: { ...baseOptions } }); + } + + } else if (node.nodeType === Node.ELEMENT_NODE && node.textContent.trim()) { + const options = { ...baseOptions }; + const computed = window.getComputedStyle(node); + + // Handle inline elements with computed styles + if (node.tagName === 'SPAN' || node.tagName === 'B' || node.tagName === 'STRONG' || node.tagName === 'I' || node.tagName === 'EM' || node.tagName === 'U') { + const isBold = computed.fontWeight === 'bold' || parseInt(computed.fontWeight) >= 600; + if (isBold && !shouldSkipBold(computed.fontFamily)) options.bold = true; + if (computed.fontStyle === 'italic') options.italic = true; + if (computed.textDecoration && computed.textDecoration.includes('underline')) options.underline = true; + if (computed.color && computed.color !== 'rgb(0, 0, 0)') { + options.color = rgbToHex(computed.color); + const transparency = extractAlpha(computed.color); + if (transparency !== null) options.transparency = transparency; + } + if (computed.fontSize) options.fontSize = pxToPoints(computed.fontSize); + + // Apply text-transform on the span element itself + if (computed.textTransform && computed.textTransform !== 'none') { + const transformStr = computed.textTransform; + textTransform = (text) => applyTextTransform(text, transformStr); + } + + // Validate: Check for margins on inline elements + if (computed.marginLeft && parseFloat(computed.marginLeft) > 0) { + errors.push(`Inline element <${node.tagName.toLowerCase()}> has margin-left which is not supported in PowerPoint. Remove margin from inline elements.`); + } + if (computed.marginRight && parseFloat(computed.marginRight) > 0) { + errors.push(`Inline element <${node.tagName.toLowerCase()}> has margin-right which is not supported in PowerPoint. 
Remove margin from inline elements.`); + } + if (computed.marginTop && parseFloat(computed.marginTop) > 0) { + errors.push(`Inline element <${node.tagName.toLowerCase()}> has margin-top which is not supported in PowerPoint. Remove margin from inline elements.`); + } + if (computed.marginBottom && parseFloat(computed.marginBottom) > 0) { + errors.push(`Inline element <${node.tagName.toLowerCase()}> has margin-bottom which is not supported in PowerPoint. Remove margin from inline elements.`); + } + + // Recursively process the child node. This will flatten nested spans into multiple runs. + parseInlineFormatting(node, options, runs, textTransform); + } + } + + prevNodeIsText = isText; + }); + + // Trim leading space from first run and trailing space from last run + if (runs.length > 0) { + runs[0].text = runs[0].text.replace(/^\s+/, ''); + runs[runs.length - 1].text = runs[runs.length - 1].text.replace(/\s+$/, ''); + } + + return runs.filter(r => r.text.length > 0); + }; + + // Extract background from body (image or color) + const body = document.body; + const bodyStyle = window.getComputedStyle(body); + const bgImage = bodyStyle.backgroundImage; + const bgColor = bodyStyle.backgroundColor; + + // Collect validation errors + const errors = []; + + // Validate: Check for CSS gradients + if (bgImage && (bgImage.includes('linear-gradient') || bgImage.includes('radial-gradient'))) { + errors.push( + 'CSS gradients are not supported. Use Sharp to rasterize gradients as PNG images first, ' + + 'then reference with background-image: url(\'gradient.png\')' + ); + } + + let background; + if (bgImage && bgImage !== 'none') { + // Extract URL from url("...") or url(...) 
+ const urlMatch = bgImage.match(/url\(["']?([^"')]+)["']?\)/); + if (urlMatch) { + background = { + type: 'image', + path: urlMatch[1] + }; + } else { + background = { + type: 'color', + value: rgbToHex(bgColor) + }; + } + } else { + background = { + type: 'color', + value: rgbToHex(bgColor) + }; + } + + // Process all elements + const elements = []; + const placeholders = []; + const textTags = ['P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'UL', 'OL', 'LI']; + const processed = new Set(); + + document.querySelectorAll('*').forEach((el) => { + if (processed.has(el)) return; + + // Validate text elements don't have backgrounds, borders, or shadows + if (textTags.includes(el.tagName)) { + const computed = window.getComputedStyle(el); + const hasBg = computed.backgroundColor && computed.backgroundColor !== 'rgba(0, 0, 0, 0)'; + const hasBorder = (computed.borderWidth && parseFloat(computed.borderWidth) > 0) || + (computed.borderTopWidth && parseFloat(computed.borderTopWidth) > 0) || + (computed.borderRightWidth && parseFloat(computed.borderRightWidth) > 0) || + (computed.borderBottomWidth && parseFloat(computed.borderBottomWidth) > 0) || + (computed.borderLeftWidth && parseFloat(computed.borderLeftWidth) > 0); + const hasShadow = computed.boxShadow && computed.boxShadow !== 'none'; + + if (hasBg || hasBorder || hasShadow) { + errors.push( + `Text element <${el.tagName.toLowerCase()}> has ${hasBg ? 'background' : hasBorder ? 'border' : 'shadow'}. ` + + 'Backgrounds, borders, and shadows are only supported on
      elements, not text elements.' + ); + return; + } + } + + // Extract placeholder elements (for charts, etc.) + if (el.className && el.className.includes('placeholder')) { + const rect = el.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) { + errors.push( + `Placeholder "${el.id || 'unnamed'}" has ${rect.width === 0 ? 'width: 0' : 'height: 0'}. Check the layout CSS.` + ); + } else { + placeholders.push({ + id: el.id || `placeholder-${placeholders.length}`, + x: pxToInch(rect.left), + y: pxToInch(rect.top), + w: pxToInch(rect.width), + h: pxToInch(rect.height) + }); + } + processed.add(el); + return; + } + + // Extract images + if (el.tagName === 'IMG') { + const rect = el.getBoundingClientRect(); + if (rect.width > 0 && rect.height > 0) { + elements.push({ + type: 'image', + src: el.src, + position: { + x: pxToInch(rect.left), + y: pxToInch(rect.top), + w: pxToInch(rect.width), + h: pxToInch(rect.height) + } + }); + processed.add(el); + return; + } + } + + // Extract DIVs with backgrounds/borders as shapes + const isContainer = el.tagName === 'DIV' && !textTags.includes(el.tagName); + if (isContainer) { + const computed = window.getComputedStyle(el); + const hasBg = computed.backgroundColor && computed.backgroundColor !== 'rgba(0, 0, 0, 0)'; + + // Validate: Check for unwrapped text content in DIV + for (const node of el.childNodes) { + if (node.nodeType === Node.TEXT_NODE) { + const text = node.textContent.trim(); + if (text) { + errors.push( + `DIV element contains unwrapped text "${text.substring(0, 50)}${text.length > 50 ? '...' : ''}". ` + + 'All text must be wrapped in

      ,

      -

      ,
        , or
          tags to appear in PowerPoint.' + ); + } + } + } + + // Check for background images on shapes + const bgImage = computed.backgroundImage; + if (bgImage && bgImage !== 'none') { + errors.push( + 'Background images on DIV elements are not supported. ' + + 'Use solid colors or borders for shapes, or use slide.addImage() in PptxGenJS to layer images.' + ); + return; + } + + // Check for borders - both uniform and partial + const borderTop = computed.borderTopWidth; + const borderRight = computed.borderRightWidth; + const borderBottom = computed.borderBottomWidth; + const borderLeft = computed.borderLeftWidth; + const borders = [borderTop, borderRight, borderBottom, borderLeft].map(b => parseFloat(b) || 0); + const hasBorder = borders.some(b => b > 0); + const hasUniformBorder = hasBorder && borders.every(b => b === borders[0]); + const borderLines = []; + + if (hasBorder && !hasUniformBorder) { + const rect = el.getBoundingClientRect(); + const x = pxToInch(rect.left); + const y = pxToInch(rect.top); + const w = pxToInch(rect.width); + const h = pxToInch(rect.height); + + // Collect lines to add after shape (inset by half the line width to center on edge) + if (parseFloat(borderTop) > 0) { + const widthPt = pxToPoints(borderTop); + const inset = (widthPt / 72) / 2; // Convert points to inches, then half + borderLines.push({ + type: 'line', + x1: x, y1: y + inset, x2: x + w, y2: y + inset, + width: widthPt, + color: rgbToHex(computed.borderTopColor) + }); + } + if (parseFloat(borderRight) > 0) { + const widthPt = pxToPoints(borderRight); + const inset = (widthPt / 72) / 2; + borderLines.push({ + type: 'line', + x1: x + w - inset, y1: y, x2: x + w - inset, y2: y + h, + width: widthPt, + color: rgbToHex(computed.borderRightColor) + }); + } + if (parseFloat(borderBottom) > 0) { + const widthPt = pxToPoints(borderBottom); + const inset = (widthPt / 72) / 2; + borderLines.push({ + type: 'line', + x1: x, y1: y + h - inset, x2: x + w, y2: y + h - inset, + width: 
widthPt, + color: rgbToHex(computed.borderBottomColor) + }); + } + if (parseFloat(borderLeft) > 0) { + const widthPt = pxToPoints(borderLeft); + const inset = (widthPt / 72) / 2; + borderLines.push({ + type: 'line', + x1: x + inset, y1: y, x2: x + inset, y2: y + h, + width: widthPt, + color: rgbToHex(computed.borderLeftColor) + }); + } + } + + if (hasBg || hasBorder) { + const rect = el.getBoundingClientRect(); + if (rect.width > 0 && rect.height > 0) { + const shadow = parseBoxShadow(computed.boxShadow); + + // Only add shape if there's background or uniform border + if (hasBg || hasUniformBorder) { + elements.push({ + type: 'shape', + text: '', // Shape only - child text elements render on top + position: { + x: pxToInch(rect.left), + y: pxToInch(rect.top), + w: pxToInch(rect.width), + h: pxToInch(rect.height) + }, + shape: { + fill: hasBg ? rgbToHex(computed.backgroundColor) : null, + transparency: hasBg ? extractAlpha(computed.backgroundColor) : null, + line: hasUniformBorder ? { + color: rgbToHex(computed.borderColor), + width: pxToPoints(computed.borderWidth) + } : null, + // Convert border-radius to rectRadius (in inches) + // % values: 50%+ = circle (1), <50% = percentage of min dimension + // pt values: divide by 72 (72pt = 1 inch) + // px values: divide by 96 (96px = 1 inch) + rectRadius: (() => { + const radius = computed.borderRadius; + const radiusValue = parseFloat(radius); + if (radiusValue === 0) return 0; + + if (radius.includes('%')) { + if (radiusValue >= 50) return 1; + // Calculate percentage of smaller dimension + const minDim = Math.min(rect.width, rect.height); + return (radiusValue / 100) * pxToInch(minDim); + } + + if (radius.includes('pt')) return radiusValue / 72; + return radiusValue / PIXELS_PER_INCH; + })(), + shadow: shadow + } + }); + } + + // Add partial border lines + elements.push(...borderLines); + + processed.add(el); + return; + } + } + } + + // Extract bullet lists as single text block + if (el.tagName === 'UL' || el.tagName 
=== 'OL') { + const rect = el.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) return; + + const liElements = Array.from(el.querySelectorAll('li')); + const items = []; + const ulComputed = window.getComputedStyle(el); + const ulPaddingLeftPt = pxToPoints(ulComputed.paddingLeft); + + // Split: margin-left for bullet position, indent for text position + // margin-left + indent = ul padding-left + const marginLeft = ulPaddingLeftPt * 0.5; + const textIndent = ulPaddingLeftPt * 0.5; + + liElements.forEach((li, idx) => { + const isLast = idx === liElements.length - 1; + const runs = parseInlineFormatting(li, { breakLine: false }); + // Clean manual bullets from first run + if (runs.length > 0) { + runs[0].text = runs[0].text.replace(/^[•\-\*▪▸]\s*/, ''); + runs[0].options.bullet = { indent: textIndent }; + } + // Set breakLine on last run + if (runs.length > 0 && !isLast) { + runs[runs.length - 1].options.breakLine = true; + } + items.push(...runs); + }); + + const computed = window.getComputedStyle(liElements[0] || el); + + elements.push({ + type: 'list', + items: items, + position: { + x: pxToInch(rect.left), + y: pxToInch(rect.top), + w: pxToInch(rect.width), + h: pxToInch(rect.height) + }, + style: { + fontSize: pxToPoints(computed.fontSize), + fontFace: computed.fontFamily.split(',')[0].replace(/['"]/g, '').trim(), + color: rgbToHex(computed.color), + transparency: extractAlpha(computed.color), + align: computed.textAlign === 'start' ? 'left' : computed.textAlign, + lineSpacing: computed.lineHeight && computed.lineHeight !== 'normal' ? pxToPoints(computed.lineHeight) : null, + paraSpaceBefore: 0, + paraSpaceAfter: pxToPoints(computed.marginBottom), + // PptxGenJS margin array is [left, right, bottom, top] + margin: [marginLeft, 0, 0, 0] + } + }); + + liElements.forEach(li => processed.add(li)); + processed.add(el); + return; + } + + // Extract text elements (P, H1, H2, etc.) 
+ if (!textTags.includes(el.tagName)) return; + + const rect = el.getBoundingClientRect(); + const text = el.textContent.trim(); + if (rect.width === 0 || rect.height === 0 || !text) return; + + // Validate: Check for manual bullet symbols in text elements (not in lists) + if (el.tagName !== 'LI' && /^[•\-\*▪▸○●◆◇■□]\s/.test(text.trimStart())) { + errors.push( + `Text element <${el.tagName.toLowerCase()}> starts with bullet symbol "${text.substring(0, 20)}...". ` + + 'Use
            <ul> or <ol>
              lists instead of manual bullet symbols.' + ); + return; + } + + const computed = window.getComputedStyle(el); + const rotation = getRotation(computed.transform, computed.writingMode); + const { x, y, w, h } = getPositionAndSize(el, rect, rotation); + + const baseStyle = { + fontSize: pxToPoints(computed.fontSize), + fontFace: computed.fontFamily.split(',')[0].replace(/['"]/g, '').trim(), + color: rgbToHex(computed.color), + align: computed.textAlign === 'start' ? 'left' : computed.textAlign, + lineSpacing: pxToPoints(computed.lineHeight), + paraSpaceBefore: pxToPoints(computed.marginTop), + paraSpaceAfter: pxToPoints(computed.marginBottom), + // PptxGenJS margin array is [left, right, bottom, top] (not [top, right, bottom, left] as documented) + margin: [ + pxToPoints(computed.paddingLeft), + pxToPoints(computed.paddingRight), + pxToPoints(computed.paddingBottom), + pxToPoints(computed.paddingTop) + ] + }; + + const transparency = extractAlpha(computed.color); + if (transparency !== null) baseStyle.transparency = transparency; + + if (rotation !== null) baseStyle.rotate = rotation; + + const hasFormatting = el.querySelector('b, i, u, strong, em, span, br'); + + if (hasFormatting) { + // Text with inline formatting + const transformStr = computed.textTransform; + const runs = parseInlineFormatting(el, {}, [], (str) => applyTextTransform(str, transformStr)); + + // Adjust lineSpacing based on largest fontSize in runs + const adjustedStyle = { ...baseStyle }; + if (adjustedStyle.lineSpacing) { + const maxFontSize = Math.max( + adjustedStyle.fontSize, + ...runs.map(r => r.options?.fontSize || 0) + ); + if (maxFontSize > adjustedStyle.fontSize) { + const lineHeightMultiplier = adjustedStyle.lineSpacing / adjustedStyle.fontSize; + adjustedStyle.lineSpacing = maxFontSize * lineHeightMultiplier; + } + } + + elements.push({ + type: el.tagName.toLowerCase(), + text: runs, + position: { x: pxToInch(x), y: pxToInch(y), w: pxToInch(w), h: pxToInch(h) }, + style: 
adjustedStyle + }); + } else { + // Plain text - inherit CSS formatting + const textTransform = computed.textTransform; + const transformedText = applyTextTransform(text, textTransform); + + const isBold = computed.fontWeight === 'bold' || parseInt(computed.fontWeight) >= 600; + + elements.push({ + type: el.tagName.toLowerCase(), + text: transformedText, + position: { x: pxToInch(x), y: pxToInch(y), w: pxToInch(w), h: pxToInch(h) }, + style: { + ...baseStyle, + bold: isBold && !shouldSkipBold(computed.fontFamily), + italic: computed.fontStyle === 'italic', + underline: computed.textDecoration.includes('underline') + } + }); + } + + processed.add(el); + }); + + return { background, elements, placeholders, errors }; + }); +} + +async function convertSlide(htmlFile, pres, options = {}) { + const { + tmpDir = process.env.TMPDIR || '/tmp', + slide = null + } = options; + + try { + // Use Chrome on macOS, default Chromium on Unix + const launchOptions = { env: { TMPDIR: tmpDir } }; + if (process.platform === 'darwin') { + launchOptions.channel = 'chrome'; + } + + const browser = await chromium.launch(launchOptions); + + let bodyDimensions; + let slideData; + + const filePath = path.isAbsolute(htmlFile) ? 
htmlFile : path.join(process.cwd(), htmlFile); + const validationErrors = []; + + try { + const page = await browser.newPage(); + page.on('console', (msg) => { + // Log the message text to your test runner's console + console.log(`Browser console: ${msg.text()}`); + }); + + await page.goto(`file://${filePath}`); + + bodyDimensions = await getBodyDimensions(page); + + await page.setViewportSize({ + width: Math.round(bodyDimensions.width), + height: Math.round(bodyDimensions.height) + }); + + slideData = await extractSlideData(page); + } finally { + await browser.close(); + } + + // Collect all validation errors + if (bodyDimensions.errors && bodyDimensions.errors.length > 0) { + validationErrors.push(...bodyDimensions.errors); + } + + const dimensionErrors = validateDimensions(bodyDimensions, pres); + if (dimensionErrors.length > 0) { + validationErrors.push(...dimensionErrors); + } + + const textBoxPositionErrors = validateTextBoxPosition(slideData, bodyDimensions); + if (textBoxPositionErrors.length > 0) { + validationErrors.push(...textBoxPositionErrors); + } + + if (slideData.errors && slideData.errors.length > 0) { + validationErrors.push(...slideData.errors); + } + + // Throw all errors at once if any exist + if (validationErrors.length > 0) { + const errorMessage = validationErrors.length === 1 + ? validationErrors[0] + : `Multiple validation errors found:\n${validationErrors.map((e, i) => ` ${i + 1}. 
${e}`).join('\n')}`; + throw new Error(errorMessage); + } + + const targetSlide = slide || pres.addSlide(); + + await addBackground(slideData, targetSlide, tmpDir); + addElements(slideData, targetSlide, pres); + + return { slide: targetSlide, placeholders: slideData.placeholders }; + } catch (error) { + if (!error.message.startsWith(htmlFile)) { + throw new Error(`${htmlFile}: ${error.message}`); + } + throw error; + } +} + +module.exports = convertSlide; diff --git a/deploy/data/skills/pptx/scripts/slidePreview.py b/deploy/data/skills/pptx/scripts/slidePreview.py new file mode 100755 index 000000000..5ca48590c --- /dev/null +++ b/deploy/data/skills/pptx/scripts/slidePreview.py @@ -0,0 +1,450 @@ +#!/usr/bin/env python3 +""" +Generate thumbnail grids from PowerPoint presentation slides. + +Generates a grid layout of slide previews with configurable columns (max 6). +Each grid contains up to cols*(cols+1) images. For presentations with more +slides, multiple numbered grid files are generated automatically. + +The program outputs the names of all files generated. + +Output: +- Single grid: {prefix}.jpg (if slides fit in one grid) +- Multiple grids: {prefix}-1.jpg, {prefix}-2.jpg, etc. 
+ +Grid limits by column count: +- 3 cols: max 12 slides per grid (3*4) +- 4 cols: max 20 slides per grid (4*5) +- 5 cols: max 30 slides per grid (5*6) [default] +- 6 cols: max 42 slides per grid (6*7) + +Usage: + python slidePreview.py input.pptx [output_prefix] [--cols N] [--outline-placeholders] + +Examples: + python slidePreview.py presentation.pptx + # Generates: thumbnails.jpg (using default prefix) + # Outputs: + # Generated 1 grid(s): + # - thumbnails.jpg + + python slidePreview.py large-deck.pptx grid --cols 4 + # Generates: grid-1.jpg, grid-2.jpg, grid-3.jpg + # Outputs: + # Generated 3 grid(s): + # - grid-1.jpg + # - grid-2.jpg + # - grid-3.jpg + + python slidePreview.py template.pptx analysis --outline-placeholders + # Generates thumbnail grids with red outlines around text placeholders +""" + +import argparse +import subprocess +import sys +import tempfile +from pathlib import Path + +from textExtractor import get_text_shapes_inventory as extract_text_inventory +from PIL import Image, ImageDraw, ImageFont +from pptx import Presentation + +# Constants +THUMBNAIL_WIDTH = 300 # Fixed thumbnail width in pixels +CONVERSION_DPI = 100 # DPI for PDF to image conversion +MAX_COLS = 6 # Maximum number of columns +DEFAULT_COLS = 5 # Default number of columns +JPEG_QUALITY = 95 # JPEG compression quality + +# Grid layout constants +GRID_PADDING = 20 # Padding between thumbnails +BORDER_WIDTH = 2 # Border width around thumbnails +FONT_SIZE_RATIO = 0.12 # Font size as fraction of thumbnail width +LABEL_PADDING_RATIO = 0.4 # Label padding as fraction of font size + + +def main(): + parser = argparse.ArgumentParser( + description="Create thumbnail grids from PowerPoint slides." 
+ ) + parser.add_argument("input", help="Input PowerPoint file (.pptx)") + parser.add_argument( + "output_prefix", + nargs="?", + default="thumbnails", + help="Output prefix for image files (default: thumbnails, will create prefix.jpg or prefix-N.jpg)", + ) + parser.add_argument( + "--cols", + type=int, + default=DEFAULT_COLS, + help=f"Number of columns (default: {DEFAULT_COLS}, max: {MAX_COLS})", + ) + parser.add_argument( + "--outline-placeholders", + action="store_true", + help="Outline text placeholders with a colored border", + ) + + args = parser.parse_args() + + # Validate columns + cols = min(args.cols, MAX_COLS) + if args.cols > MAX_COLS: + print(f"Warning: Columns limited to {MAX_COLS} (requested {args.cols})") + + # Validate input + input_path = Path(args.input) + if not input_path.exists() or input_path.suffix.lower() != ".pptx": + print(f"Error: Invalid PowerPoint file: {args.input}") + sys.exit(1) + + # Construct output path (always JPG) + output_path = Path(f"{args.output_prefix}.jpg") + + print(f"Processing: {args.input}") + + try: + with tempfile.TemporaryDirectory() as temp_dir: + # Get placeholder regions if outlining is enabled + placeholder_regions = None + slide_dimensions = None + if args.outline_placeholders: + print("Extracting placeholder regions...") + placeholder_regions, slide_dimensions = get_placeholder_regions( + input_path + ) + if placeholder_regions: + print(f"Found placeholders on {len(placeholder_regions)} slides") + + # Convert slides to images + slide_images = convert_to_images(input_path, Path(temp_dir), CONVERSION_DPI) + if not slide_images: + print("Error: No slides found") + sys.exit(1) + + print(f"Found {len(slide_images)} slides") + + # Create grids (max cols×(cols+1) images per grid) + grid_files = create_grids( + slide_images, + cols, + THUMBNAIL_WIDTH, + output_path, + placeholder_regions, + slide_dimensions, + ) + + # Print saved files + print(f"Created {len(grid_files)} grid(s):") + for grid_file in grid_files: + 
print(f" - {grid_file}") + + except Exception as e: + print(f"Error: {e}") + sys.exit(1) + + +def create_hidden_slide_placeholder(size): + """Create placeholder image for hidden slides.""" + img = Image.new("RGB", size, color="#F0F0F0") + draw = ImageDraw.Draw(img) + line_width = max(5, min(size) // 100) + draw.line([(0, 0), size], fill="#CCCCCC", width=line_width) + draw.line([(size[0], 0), (0, size[1])], fill="#CCCCCC", width=line_width) + return img + + +def get_placeholder_regions(pptx_path): + """Extract ALL text regions from the presentation. + + Returns a tuple of (placeholder_regions, slide_dimensions). + text_regions is a dict mapping slide indices to lists of text regions. + Each region is a dict with 'left', 'top', 'width', 'height' in inches. + slide_dimensions is a tuple of (width_inches, height_inches). + """ + prs = Presentation(str(pptx_path)) + inventory = extract_text_inventory(pptx_path, prs) + placeholder_regions = {} + + # Get actual slide dimensions in inches (EMU to inches conversion) + slide_width_inches = (prs.slide_width or 9144000) / 914400.0 + slide_height_inches = (prs.slide_height or 5143500) / 914400.0 + + for slide_key, shapes in inventory.items(): + # Extract slide index from "slide-N" format + slide_idx = int(slide_key.split("-")[1]) + regions = [] + + for shape_key, shape_data in shapes.items(): + # The inventory only contains shapes with text, so all shapes should be highlighted + regions.append( + { + "left": shape_data.left, + "top": shape_data.top, + "width": shape_data.width, + "height": shape_data.height, + } + ) + + if regions: + placeholder_regions[slide_idx] = regions + + return placeholder_regions, (slide_width_inches, slide_height_inches) + + +def convert_to_images(pptx_path, temp_dir, dpi): + """Convert PowerPoint to images via PDF, handling hidden slides.""" + # Detect hidden slides + print("Analyzing presentation...") + prs = Presentation(str(pptx_path)) + total_slides = len(prs.slides) + + # Find hidden slides 
(1-based indexing for display) + hidden_slides = { + idx + 1 + for idx, slide in enumerate(prs.slides) + if slide.element.get("show") == "0" + } + + print(f"Total slides: {total_slides}") + if hidden_slides: + print(f"Hidden slides: {sorted(hidden_slides)}") + + pdf_path = temp_dir / f"{pptx_path.stem}.pdf" + + # Convert to PDF + print("Converting to PDF...") + result = subprocess.run( + [ + "soffice", + "--headless", + "--convert-to", + "pdf", + "--outdir", + str(temp_dir), + str(pptx_path), + ], + capture_output=True, + text=True, + ) + if result.returncode != 0 or not pdf_path.exists(): + raise RuntimeError("PDF conversion failed") + + # Convert PDF to images + print(f"Converting to images at {dpi} DPI...") + result = subprocess.run( + ["pdftoppm", "-jpeg", "-r", str(dpi), str(pdf_path), str(temp_dir / "slide")], + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError("Image conversion failed") + + visible_images = sorted(temp_dir.glob("slide-*.jpg")) + + # Create full list with placeholders for hidden slides + all_images = [] + visible_idx = 0 + + # Get placeholder dimensions from first visible slide + if visible_images: + with Image.open(visible_images[0]) as img: + placeholder_size = img.size + else: + placeholder_size = (1920, 1080) + + for slide_num in range(1, total_slides + 1): + if slide_num in hidden_slides: + # Create placeholder image for hidden slide + placeholder_path = temp_dir / f"hidden-{slide_num:03d}.jpg" + placeholder_img = create_hidden_slide_placeholder(placeholder_size) + placeholder_img.save(placeholder_path, "JPEG") + all_images.append(placeholder_path) + else: + # Use the actual visible slide image + if visible_idx < len(visible_images): + all_images.append(visible_images[visible_idx]) + visible_idx += 1 + + return all_images + + +def create_grids( + image_paths, + cols, + width, + output_path, + placeholder_regions=None, + slide_dimensions=None, +): + """Create multiple thumbnail grids from slide 
images, max cols×(cols+1) images per grid.""" + # Maximum images per grid is cols × (cols + 1) for better proportions + max_images_per_grid = cols * (cols + 1) + grid_files = [] + + print( + f"Creating grids with {cols} columns (max {max_images_per_grid} images per grid)" + ) + + # Split images into chunks + for chunk_idx, start_idx in enumerate( + range(0, len(image_paths), max_images_per_grid) + ): + end_idx = min(start_idx + max_images_per_grid, len(image_paths)) + chunk_images = image_paths[start_idx:end_idx] + + # Create grid for this chunk + grid = create_grid( + chunk_images, cols, width, start_idx, placeholder_regions, slide_dimensions + ) + + # Generate output filename + if len(image_paths) <= max_images_per_grid: + # Single grid - use base filename without suffix + grid_filename = output_path + else: + # Multiple grids - insert index before extension with dash + stem = output_path.stem + suffix = output_path.suffix + grid_filename = output_path.parent / f"{stem}-{chunk_idx + 1}{suffix}" + + # Save grid + grid_filename.parent.mkdir(parents=True, exist_ok=True) + grid.save(str(grid_filename), quality=JPEG_QUALITY) + grid_files.append(str(grid_filename)) + + return grid_files + + +def create_grid( + image_paths, + cols, + width, + start_slide_num=0, + placeholder_regions=None, + slide_dimensions=None, +): + """Create thumbnail grid from slide images with optional placeholder outlining.""" + font_size = int(width * FONT_SIZE_RATIO) + label_padding = int(font_size * LABEL_PADDING_RATIO) + + # Get dimensions + with Image.open(image_paths[0]) as img: + aspect = img.height / img.width + height = int(width * aspect) + + # Calculate grid size + rows = (len(image_paths) + cols - 1) // cols + grid_w = cols * width + (cols + 1) * GRID_PADDING + grid_h = rows * (height + font_size + label_padding * 2) + (rows + 1) * GRID_PADDING + + # Create grid + grid = Image.new("RGB", (grid_w, grid_h), "white") + draw = ImageDraw.Draw(grid) + + # Load font with size based on 
thumbnail width + try: + # Use Pillow's default font with size + font = ImageFont.load_default(size=font_size) + except Exception: + # Fall back to basic default font if size parameter not supported + font = ImageFont.load_default() + + # Place thumbnails + for i, img_path in enumerate(image_paths): + row, col = i // cols, i % cols + x = col * width + (col + 1) * GRID_PADDING + y_base = ( + row * (height + font_size + label_padding * 2) + (row + 1) * GRID_PADDING + ) + + # Add label with actual slide number + label = f"{start_slide_num + i}" + bbox = draw.textbbox((0, 0), label, font=font) + text_w = bbox[2] - bbox[0] + draw.text( + (x + (width - text_w) // 2, y_base + label_padding), + label, + fill="black", + font=font, + ) + + # Add thumbnail below label with proportional spacing + y_thumbnail = y_base + label_padding + font_size + label_padding + + with Image.open(img_path) as img: + # Get original dimensions before thumbnail + orig_w, orig_h = img.size + + # Apply placeholder outlines if enabled + if placeholder_regions and (start_slide_num + i) in placeholder_regions: + # Convert to RGBA for transparency support + if img.mode != "RGBA": + img = img.convert("RGBA") + + # Get the regions for this slide + regions = placeholder_regions[start_slide_num + i] + + # Calculate scale factors using actual slide dimensions + if slide_dimensions: + slide_width_inches, slide_height_inches = slide_dimensions + else: + # Fallback: estimate from image size at CONVERSION_DPI + slide_width_inches = orig_w / CONVERSION_DPI + slide_height_inches = orig_h / CONVERSION_DPI + + x_scale = orig_w / slide_width_inches + y_scale = orig_h / slide_height_inches + + # Create a highlight overlay + overlay = Image.new("RGBA", img.size, (255, 255, 255, 0)) + overlay_draw = ImageDraw.Draw(overlay) + + # Highlight each placeholder region + for region in regions: + # Convert from inches to pixels in the original image + px_left = int(region["left"] * x_scale) + px_top = int(region["top"] * 
y_scale) + px_width = int(region["width"] * x_scale) + px_height = int(region["height"] * y_scale) + + # Draw highlight outline with red color and thick stroke + # Using a bright red outline instead of fill + stroke_width = max( + 5, min(orig_w, orig_h) // 150 + ) # Thicker proportional stroke width + overlay_draw.rectangle( + [(px_left, px_top), (px_left + px_width, px_top + px_height)], + outline=(255, 0, 0, 255), # Bright red, fully opaque + width=stroke_width, + ) + + # Composite the overlay onto the image using alpha blending + img = Image.alpha_composite(img, overlay) + # Convert back to RGB for JPEG saving + img = img.convert("RGB") + + img.thumbnail((width, height), Image.Resampling.LANCZOS) + w, h = img.size + tx = x + (width - w) // 2 + ty = y_thumbnail + (height - h) // 2 + grid.paste(img, (tx, ty)) + + # Add border + if BORDER_WIDTH > 0: + draw.rectangle( + [ + (tx - BORDER_WIDTH, ty - BORDER_WIDTH), + (tx + w + BORDER_WIDTH - 1, ty + h + BORDER_WIDTH - 1), + ], + outline="gray", + width=BORDER_WIDTH, + ) + + return grid + + +if __name__ == "__main__": + main() diff --git a/deploy/data/skills/pptx/scripts/textExtractor.py b/deploy/data/skills/pptx/scripts/textExtractor.py new file mode 100755 index 000000000..5a177acc6 --- /dev/null +++ b/deploy/data/skills/pptx/scripts/textExtractor.py @@ -0,0 +1,1020 @@ +#!/usr/bin/env python3 +""" +Retrieve structured text content from PowerPoint presentations. 
+ +This module provides functionality to: +- Retrieve all text content from PowerPoint shapes +- Preserve paragraph formatting (alignment, bullets, fonts, spacing) +- Handle nested GroupShapes recursively with correct absolute positions +- Sort shapes by visual position on slides +- Filter out slide numbers and non-content placeholders +- Export to JSON with clean, structured data + +Classes: + ParagraphInfo: Represents a text paragraph with formatting + ShapeInfo: Represents a shape with position and text content + +Main Functions: + get_text_shapes_inventory: Retrieve all text from a presentation + write_inventory: Save retrieved data to JSON + +Usage: + python textExtractor.py input.pptx output.json +""" + +import argparse +import json +import platform +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +from PIL import Image, ImageDraw, ImageFont +from pptx import Presentation +from pptx.enum.text import PP_ALIGN +from pptx.shapes.base import BaseShape + +# Type aliases for cleaner signatures +JsonValue = Union[str, int, float, bool, None] +ParagraphDict = Dict[str, JsonValue] +ShapeDict = Dict[ + str, Union[str, float, bool, List[ParagraphDict], List[str], Dict[str, Any], None] +] +InventoryData = Dict[ + str, Dict[str, "ShapeData"] +] # Dict of slide_id -> {shape_id -> ShapeData} +InventoryDict = Dict[str, Dict[str, ShapeDict]] # JSON-serializable inventory + + +def main(): + """Main entry point for command-line usage.""" + parser = argparse.ArgumentParser( + description="Extract text inventory from PowerPoint with proper GroupShape support.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python inventory.py presentation.pptx inventory.json + Extracts text inventory with correct absolute positions for grouped shapes + + python inventory.py presentation.pptx inventory.json --issues-only + Extracts only text shapes that have overflow or overlap 
issues + +The output JSON includes: + - All text content organized by slide and shape + - Correct absolute positions for shapes in groups + - Visual position and size in inches + - Paragraph properties and formatting + - Issue detection: text overflow and shape overlaps + """, + ) + + parser.add_argument("input", help="Input PowerPoint file (.pptx)") + parser.add_argument("output", help="Output JSON file for inventory") + parser.add_argument( + "--issues-only", + action="store_true", + help="Include only text shapes that have overflow or overlap issues", + ) + + args = parser.parse_args() + + input_path = Path(args.input) + if not input_path.exists(): + print(f"Error: Input file not found: {args.input}") + sys.exit(1) + + if not input_path.suffix.lower() == ".pptx": + print("Error: Input must be a PowerPoint file (.pptx)") + sys.exit(1) + + try: + print(f"Extracting text inventory from: {args.input}") + if args.issues_only: + print( + "Filtering to include only text shapes with issues (overflow/overlap)" + ) + inventory = get_text_shapes_inventory(input_path, issues_only=args.issues_only) + + output_path = Path(args.output) + output_path.parent.mkdir(parents=True, exist_ok=True) + write_inventory(inventory, output_path) + + print(f"Output saved to: {args.output}") + + # Report statistics + total_slides = len(inventory) + total_shapes = sum(len(shapes) for shapes in inventory.values()) + if args.issues_only: + if total_shapes > 0: + print( + f"Found {total_shapes} text elements with issues in {total_slides} slides" + ) + else: + print("No issues discovered") + else: + print( + f"Found text in {total_slides} slides with {total_shapes} text elements" + ) + + except Exception as e: + print(f"Error processing presentation: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +@dataclass +class ShapeWithPosition: + """A shape with its absolute position on the slide.""" + + shape: BaseShape + absolute_left: int # in EMUs + absolute_top: int # in EMUs + + 
+class ParagraphData: + """Data structure for paragraph properties extracted from a PowerPoint paragraph.""" + + def __init__(self, paragraph: Any): + """Initialize from a PowerPoint paragraph object. + + Args: + paragraph: The PowerPoint paragraph object + """ + self.text: str = paragraph.text.strip() + self.bullet: bool = False + self.level: Optional[int] = None + self.alignment: Optional[str] = None + self.space_before: Optional[float] = None + self.space_after: Optional[float] = None + self.font_name: Optional[str] = None + self.font_size: Optional[float] = None + self.bold: Optional[bool] = None + self.italic: Optional[bool] = None + self.underline: Optional[bool] = None + self.color: Optional[str] = None + self.theme_color: Optional[str] = None + self.line_spacing: Optional[float] = None + + # Check for bullet formatting + if ( + hasattr(paragraph, "_p") + and paragraph._p is not None + and paragraph._p.pPr is not None + ): + pPr = paragraph._p.pPr + ns = "{http://schemas.openxmlformats.org/drawingml/2006/main}" + if ( + pPr.find(f"{ns}buChar") is not None + or pPr.find(f"{ns}buAutoNum") is not None + ): + self.bullet = True + if hasattr(paragraph, "level"): + self.level = paragraph.level + + # Add alignment if not LEFT (default) + if hasattr(paragraph, "alignment") and paragraph.alignment is not None: + alignment_map = { + PP_ALIGN.CENTER: "CENTER", + PP_ALIGN.RIGHT: "RIGHT", + PP_ALIGN.JUSTIFY: "JUSTIFY", + } + if paragraph.alignment in alignment_map: + self.alignment = alignment_map[paragraph.alignment] + + # Add spacing properties if set + if hasattr(paragraph, "space_before") and paragraph.space_before: + self.space_before = paragraph.space_before.pt + if hasattr(paragraph, "space_after") and paragraph.space_after: + self.space_after = paragraph.space_after.pt + + # Extract font properties from first run + if paragraph.runs: + first_run = paragraph.runs[0] + if hasattr(first_run, "font"): + font = first_run.font + if font.name: + self.font_name = 
font.name + if font.size: + self.font_size = font.size.pt + if font.bold is not None: + self.bold = font.bold + if font.italic is not None: + self.italic = font.italic + if font.underline is not None: + self.underline = font.underline + + # Handle color - both RGB and theme colors + try: + # Try RGB color first + if font.color.rgb: + self.color = str(font.color.rgb) + except (AttributeError, TypeError): + # Fall back to theme color + try: + if font.color.theme_color: + self.theme_color = font.color.theme_color.name + except (AttributeError, TypeError): + pass + + # Add line spacing if set + if hasattr(paragraph, "line_spacing") and paragraph.line_spacing is not None: + if hasattr(paragraph.line_spacing, "pt"): + self.line_spacing = round(paragraph.line_spacing.pt, 2) + else: + # Multiplier - convert to points + font_size = self.font_size if self.font_size else 12.0 + self.line_spacing = round(paragraph.line_spacing * font_size, 2) + + def to_dict(self) -> ParagraphDict: + """Convert to dictionary for JSON serialization, excluding None values.""" + result: ParagraphDict = {"text": self.text} + + # Add optional fields only if they have values + if self.bullet: + result["bullet"] = self.bullet + if self.level is not None: + result["level"] = self.level + if self.alignment: + result["alignment"] = self.alignment + if self.space_before is not None: + result["space_before"] = self.space_before + if self.space_after is not None: + result["space_after"] = self.space_after + if self.font_name: + result["font_name"] = self.font_name + if self.font_size is not None: + result["font_size"] = self.font_size + if self.bold is not None: + result["bold"] = self.bold + if self.italic is not None: + result["italic"] = self.italic + if self.underline is not None: + result["underline"] = self.underline + if self.color: + result["color"] = self.color + if self.theme_color: + result["theme_color"] = self.theme_color + if self.line_spacing is not None: + result["line_spacing"] = 
self.line_spacing + + return result + + +class ShapeData: + """Data structure for shape properties extracted from a PowerPoint shape.""" + + @staticmethod + def emu_to_inches(emu: int) -> float: + """Convert EMUs (English Metric Units) to inches.""" + return emu / 914400.0 + + @staticmethod + def inches_to_pixels(inches: float, dpi: int = 96) -> int: + """Convert inches to pixels at given DPI.""" + return int(inches * dpi) + + @staticmethod + def get_font_path(font_name: str) -> Optional[str]: + """Get the font file path for a given font name. + + Args: + font_name: Name of the font (e.g., 'Arial', 'Calibri') + + Returns: + Path to the font file, or None if not found + """ + system = platform.system() + + # Common font file variations to try + font_variations = [ + font_name, + font_name.lower(), + font_name.replace(" ", ""), + font_name.replace(" ", "-"), + ] + + # Define font directories and extensions by platform + if system == "Darwin": # macOS + font_dirs = [ + "/System/Library/Fonts/", + "/Library/Fonts/", + "~/Library/Fonts/", + ] + extensions = [".ttf", ".otf", ".ttc", ".dfont"] + else: # Linux + font_dirs = [ + "/usr/share/fonts/truetype/", + "/usr/local/share/fonts/", + "~/.fonts/", + ] + extensions = [".ttf", ".otf"] + + # Try to find the font file + from pathlib import Path + + for font_dir in font_dirs: + font_dir_path = Path(font_dir).expanduser() + if not font_dir_path.exists(): + continue + + # First try exact matches + for variant in font_variations: + for ext in extensions: + font_path = font_dir_path / f"{variant}{ext}" + if font_path.exists(): + return str(font_path) + + # Then try fuzzy matching - find files containing the font name + try: + for file_path in font_dir_path.iterdir(): + if file_path.is_file(): + file_name_lower = file_path.name.lower() + font_name_lower = font_name.lower().replace(" ", "") + if font_name_lower in file_name_lower and any( + file_name_lower.endswith(ext) for ext in extensions + ): + return str(file_path) + except 
(OSError, PermissionError): + continue + + return None + + @staticmethod + def get_slide_dimensions(slide: Any) -> tuple[Optional[int], Optional[int]]: + """Get slide dimensions from slide object. + + Args: + slide: Slide object + + Returns: + Tuple of (width_emu, height_emu) or (None, None) if not found + """ + try: + prs = slide.part.package.presentation_part.presentation + return prs.slide_width, prs.slide_height + except (AttributeError, TypeError): + return None, None + + @staticmethod + def get_default_font_size(shape: BaseShape, slide_layout: Any) -> Optional[float]: + """Extract default font size from slide layout for a placeholder shape. + + Args: + shape: Placeholder shape + slide_layout: Slide layout containing the placeholder definition + + Returns: + Default font size in points, or None if not found + """ + try: + if not hasattr(shape, "placeholder_format"): + return None + + shape_type = shape.placeholder_format.type # type: ignore + for layout_placeholder in slide_layout.placeholders: + if layout_placeholder.placeholder_format.type == shape_type: + # Find first defRPr element with sz (size) attribute + for elem in layout_placeholder.element.iter(): + if "defRPr" in elem.tag and (sz := elem.get("sz")): + return float(sz) / 100.0 # Convert EMUs to points + break + except Exception: + pass + return None + + def __init__( + self, + shape: BaseShape, + absolute_left: Optional[int] = None, + absolute_top: Optional[int] = None, + slide: Optional[Any] = None, + ): + """Initialize from a PowerPoint shape object. 
+ + Args: + shape: The PowerPoint shape object (should be pre-validated) + absolute_left: Absolute left position in EMUs (for shapes in groups) + absolute_top: Absolute top position in EMUs (for shapes in groups) + slide: Optional slide object to get dimensions and layout information + """ + self.shape = shape # Store reference to original shape + self.shape_id: str = "" # Will be set after sorting + + # Get slide dimensions from slide object + self.slide_width_emu, self.slide_height_emu = ( + self.get_slide_dimensions(slide) if slide else (None, None) + ) + + # Get placeholder type if applicable + self.placeholder_type: Optional[str] = None + self.default_font_size: Optional[float] = None + if hasattr(shape, "is_placeholder") and shape.is_placeholder: # type: ignore + if shape.placeholder_format and shape.placeholder_format.type: # type: ignore + self.placeholder_type = ( + str(shape.placeholder_format.type).split(".")[-1].split(" ")[0] # type: ignore + ) + + # Get default font size from layout + if slide and hasattr(slide, "slide_layout"): + self.default_font_size = self.get_default_font_size( + shape, slide.slide_layout + ) + + # Get position information + # Use absolute positions if provided (for shapes in groups), otherwise use shape's position + left_emu = ( + absolute_left + if absolute_left is not None + else (shape.left if hasattr(shape, "left") else 0) + ) + top_emu = ( + absolute_top + if absolute_top is not None + else (shape.top if hasattr(shape, "top") else 0) + ) + + self.left: float = round(self.emu_to_inches(left_emu), 2) # type: ignore + self.top: float = round(self.emu_to_inches(top_emu), 2) # type: ignore + self.width: float = round( + self.emu_to_inches(shape.width if hasattr(shape, "width") else 0), + 2, # type: ignore + ) + self.height: float = round( + self.emu_to_inches(shape.height if hasattr(shape, "height") else 0), + 2, # type: ignore + ) + + # Store EMU positions for overflow calculations + self.left_emu = left_emu + self.top_emu = 
top_emu + self.width_emu = shape.width if hasattr(shape, "width") else 0 + self.height_emu = shape.height if hasattr(shape, "height") else 0 + + # Calculate overflow status + self.frame_overflow_bottom: Optional[float] = None + self.slide_overflow_right: Optional[float] = None + self.slide_overflow_bottom: Optional[float] = None + self.overlapping_shapes: Dict[ + str, float + ] = {} # Dict of shape_id -> overlap area in sq inches + self.warnings: List[str] = [] + self._estimate_frame_overflow() + self._calculate_slide_overflow() + self._detect_bullet_issues() + + @property + def paragraphs(self) -> List[ParagraphData]: + """Calculate paragraphs from the shape's text frame.""" + if not self.shape or not hasattr(self.shape, "text_frame"): + return [] + + paragraphs = [] + for paragraph in self.shape.text_frame.paragraphs: # type: ignore + if paragraph.text.strip(): + paragraphs.append(ParagraphData(paragraph)) + return paragraphs + + def _get_default_font_size(self) -> int: + """Get default font size from theme text styles or use conservative default.""" + try: + if not ( + hasattr(self.shape, "part") and hasattr(self.shape.part, "slide_layout") + ): + return 14 + + slide_master = self.shape.part.slide_layout.slide_master # type: ignore + if not hasattr(slide_master, "element"): + return 14 + + # Determine theme style based on placeholder type + style_name = "bodyStyle" # Default + if self.placeholder_type and "TITLE" in self.placeholder_type: + style_name = "titleStyle" + + # Find font size in theme styles + for child in slide_master.element.iter(): + tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag + if tag == style_name: + for elem in child.iter(): + if "sz" in elem.attrib: + return int(elem.attrib["sz"]) // 100 + except Exception: + pass + + return 14 # Conservative default for body text + + def _get_usable_dimensions(self, text_frame) -> Tuple[int, int]: + """Get usable width and height in pixels after accounting for margins.""" + # Default 
PowerPoint margins in inches + margins = {"top": 0.05, "bottom": 0.05, "left": 0.1, "right": 0.1} + + # Override with actual margins if set + if hasattr(text_frame, "margin_top") and text_frame.margin_top: + margins["top"] = self.emu_to_inches(text_frame.margin_top) + if hasattr(text_frame, "margin_bottom") and text_frame.margin_bottom: + margins["bottom"] = self.emu_to_inches(text_frame.margin_bottom) + if hasattr(text_frame, "margin_left") and text_frame.margin_left: + margins["left"] = self.emu_to_inches(text_frame.margin_left) + if hasattr(text_frame, "margin_right") and text_frame.margin_right: + margins["right"] = self.emu_to_inches(text_frame.margin_right) + + # Calculate usable area + usable_width = self.width - margins["left"] - margins["right"] + usable_height = self.height - margins["top"] - margins["bottom"] + + # Convert to pixels + return ( + self.inches_to_pixels(usable_width), + self.inches_to_pixels(usable_height), + ) + + def _wrap_text_line(self, line: str, max_width_px: int, draw, font) -> List[str]: + """Wrap a single line of text to fit within max_width_px.""" + if not line: + return [""] + + # Use textlength for efficient width calculation + if draw.textlength(line, font=font) <= max_width_px: + return [line] + + # Need to wrap - split into words + wrapped = [] + words = line.split(" ") + current_line = "" + + for word in words: + test_line = current_line + (" " if current_line else "") + word + if draw.textlength(test_line, font=font) <= max_width_px: + current_line = test_line + else: + if current_line: + wrapped.append(current_line) + current_line = word + + if current_line: + wrapped.append(current_line) + + return wrapped + + def _estimate_frame_overflow(self) -> None: + """Estimate if text overflows the shape bounds using PIL text measurement.""" + if not self.shape or not hasattr(self.shape, "text_frame"): + return + + text_frame = self.shape.text_frame # type: ignore + if not text_frame or not text_frame.paragraphs: + return + + # Get 
usable dimensions after accounting for margins + usable_width_px, usable_height_px = self._get_usable_dimensions(text_frame) + if usable_width_px <= 0 or usable_height_px <= 0: + return + + # Set up PIL for text measurement + dummy_img = Image.new("RGB", (1, 1)) + draw = ImageDraw.Draw(dummy_img) + + # Get default font size from placeholder or use conservative estimate + default_font_size = self._get_default_font_size() + + # Calculate total height of all paragraphs + total_height_px = 0 + + for para_idx, paragraph in enumerate(text_frame.paragraphs): + if not paragraph.text.strip(): + continue + + para_data = ParagraphData(paragraph) + + # Load font for this paragraph + font_name = para_data.font_name or "Arial" + font_size = int(para_data.font_size or default_font_size) + + font = None + font_path = self.get_font_path(font_name) + if font_path: + try: + font = ImageFont.truetype(font_path, size=font_size) + except Exception: + font = ImageFont.load_default() + else: + font = ImageFont.load_default() + + # Wrap all lines in this paragraph + all_wrapped_lines = [] + for line in paragraph.text.split("\n"): + wrapped = self._wrap_text_line(line, usable_width_px, draw, font) + all_wrapped_lines.extend(wrapped) + + if all_wrapped_lines: + # Calculate line height + if para_data.line_spacing: + # Custom line spacing explicitly set + line_height_px = para_data.line_spacing * 96 / 72 + else: + # PowerPoint default single spacing (1.0x font size) + line_height_px = font_size * 96 / 72 + + # Add space_before (except first paragraph) + if para_idx > 0 and para_data.space_before: + total_height_px += para_data.space_before * 96 / 72 + + # Add paragraph text height + total_height_px += len(all_wrapped_lines) * line_height_px + + # Add space_after + if para_data.space_after: + total_height_px += para_data.space_after * 96 / 72 + + # Check for overflow (ignore negligible overflows <= 0.05") + if total_height_px > usable_height_px: + overflow_px = total_height_px - 
usable_height_px + overflow_inches = round(overflow_px / 96.0, 2) + if overflow_inches > 0.05: # Only report significant overflows + self.frame_overflow_bottom = overflow_inches + + def _calculate_slide_overflow(self) -> None: + """Calculate if shape overflows the slide boundaries.""" + if self.slide_width_emu is None or self.slide_height_emu is None: + return + + # Check right overflow (ignore negligible overflows <= 0.01") + right_edge_emu = self.left_emu + self.width_emu + if right_edge_emu > self.slide_width_emu: + overflow_emu = right_edge_emu - self.slide_width_emu + overflow_inches = round(self.emu_to_inches(overflow_emu), 2) + if overflow_inches > 0.01: # Only report significant overflows + self.slide_overflow_right = overflow_inches + + # Check bottom overflow (ignore negligible overflows <= 0.01") + bottom_edge_emu = self.top_emu + self.height_emu + if bottom_edge_emu > self.slide_height_emu: + overflow_emu = bottom_edge_emu - self.slide_height_emu + overflow_inches = round(self.emu_to_inches(overflow_emu), 2) + if overflow_inches > 0.01: # Only report significant overflows + self.slide_overflow_bottom = overflow_inches + + def _detect_bullet_issues(self) -> None: + """Detect bullet point formatting issues in paragraphs.""" + if not self.shape or not hasattr(self.shape, "text_frame"): + return + + text_frame = self.shape.text_frame # type: ignore + if not text_frame or not text_frame.paragraphs: + return + + # Common bullet symbols that indicate manual bullets + bullet_symbols = ["•", "●", "○"] + + for paragraph in text_frame.paragraphs: + text = paragraph.text.strip() + # Check for manual bullet symbols + if text and any(text.startswith(symbol + " ") for symbol in bullet_symbols): + self.warnings.append( + "manual_bullet_symbol: use proper bullet formatting" + ) + break + + @property + def has_any_issues(self) -> bool: + """Check if shape has any issues (overflow, overlap, or warnings).""" + return ( + self.frame_overflow_bottom is not None + or 
self.slide_overflow_right is not None + or self.slide_overflow_bottom is not None + or len(self.overlapping_shapes) > 0 + or len(self.warnings) > 0 + ) + + def to_dict(self) -> ShapeDict: + """Convert to dictionary for JSON serialization.""" + result: ShapeDict = { + "left": self.left, + "top": self.top, + "width": self.width, + "height": self.height, + } + + # Add optional fields if present + if self.placeholder_type: + result["placeholder_type"] = self.placeholder_type + + if self.default_font_size: + result["default_font_size"] = self.default_font_size + + # Add overflow information only if there is overflow + overflow_data = {} + + # Add frame overflow if present + if self.frame_overflow_bottom is not None: + overflow_data["frame"] = {"overflow_bottom": self.frame_overflow_bottom} + + # Add slide overflow if present + slide_overflow = {} + if self.slide_overflow_right is not None: + slide_overflow["overflow_right"] = self.slide_overflow_right + if self.slide_overflow_bottom is not None: + slide_overflow["overflow_bottom"] = self.slide_overflow_bottom + if slide_overflow: + overflow_data["slide"] = slide_overflow + + # Only add overflow field if there is overflow + if overflow_data: + result["overflow"] = overflow_data + + # Add overlap field if there are overlapping shapes + if self.overlapping_shapes: + result["overlap"] = {"overlapping_shapes": self.overlapping_shapes} + + # Add warnings field if there are warnings + if self.warnings: + result["warnings"] = self.warnings + + # Add paragraphs after placeholder_type + result["paragraphs"] = [para.to_dict() for para in self.paragraphs] + + return result + + +def is_valid_shape(shape: BaseShape) -> bool: + """Check if a shape contains meaningful text content.""" + # Must have a text frame with content + if not hasattr(shape, "text_frame") or not shape.text_frame: # type: ignore + return False + + text = shape.text_frame.text.strip() # type: ignore + if not text: + return False + + # Skip slide numbers and numeric 
footers + if hasattr(shape, "is_placeholder") and shape.is_placeholder: # type: ignore + if shape.placeholder_format and shape.placeholder_format.type: # type: ignore + placeholder_type = ( + str(shape.placeholder_format.type).split(".")[-1].split(" ")[0] # type: ignore + ) + if placeholder_type == "SLIDE_NUMBER": + return False + if placeholder_type == "FOOTER" and text.isdigit(): + return False + + return True + + +def collect_shapes_with_absolute_positions( + shape: BaseShape, parent_left: int = 0, parent_top: int = 0 +) -> List[ShapeWithPosition]: + """Recursively collect all shapes with valid text, calculating absolute positions. + + For shapes within groups, their positions are relative to the group. + This function calculates the absolute position on the slide by accumulating + parent group offsets. + + Args: + shape: The shape to process + parent_left: Accumulated left offset from parent groups (in EMUs) + parent_top: Accumulated top offset from parent groups (in EMUs) + + Returns: + List of ShapeWithPosition objects with absolute positions + """ + if hasattr(shape, "shapes"): # GroupShape + result = [] + # Get this group's position + group_left = shape.left if hasattr(shape, "left") else 0 + group_top = shape.top if hasattr(shape, "top") else 0 + + # Calculate absolute position for this group + abs_group_left = parent_left + group_left + abs_group_top = parent_top + group_top + + # Process children with accumulated offsets + for child in shape.shapes: # type: ignore + result.extend( + collect_shapes_with_absolute_positions( + child, abs_group_left, abs_group_top + ) + ) + return result + + # Regular shape - check if it has valid text + if is_valid_shape(shape): + # Calculate absolute position + shape_left = shape.left if hasattr(shape, "left") else 0 + shape_top = shape.top if hasattr(shape, "top") else 0 + + return [ + ShapeWithPosition( + shape=shape, + absolute_left=parent_left + shape_left, + absolute_top=parent_top + shape_top, + ) + ] + + return [] + 
+ +def sort_shapes_by_position(shapes: List[ShapeData]) -> List[ShapeData]: + """Sort shapes by visual position (top-to-bottom, left-to-right). + + Shapes within 0.5 inches vertically are considered on the same row. + """ + if not shapes: + return shapes + + # Sort by top position first + shapes = sorted(shapes, key=lambda s: (s.top, s.left)) + + # Group shapes by row (within 0.5 inches vertically) + result = [] + row = [shapes[0]] + row_top = shapes[0].top + + for shape in shapes[1:]: + if abs(shape.top - row_top) <= 0.5: + row.append(shape) + else: + # Sort current row by left position and add to result + result.extend(sorted(row, key=lambda s: s.left)) + row = [shape] + row_top = shape.top + + # Don't forget the last row + result.extend(sorted(row, key=lambda s: s.left)) + return result + + +def calculate_overlap( + rect1: Tuple[float, float, float, float], + rect2: Tuple[float, float, float, float], + tolerance: float = 0.05, +) -> Tuple[bool, float]: + """Calculate if and how much two rectangles overlap. 
+ + Args: + rect1: (left, top, width, height) of first rectangle in inches + rect2: (left, top, width, height) of second rectangle in inches + tolerance: Minimum overlap in inches to consider as overlapping (default: 0.05") + + Returns: + Tuple of (overlaps, overlap_area) where: + - overlaps: True if rectangles overlap by more than tolerance + - overlap_area: Area of overlap in square inches + """ + left1, top1, w1, h1 = rect1 + left2, top2, w2, h2 = rect2 + + # Calculate overlap dimensions + overlap_width = min(left1 + w1, left2 + w2) - max(left1, left2) + overlap_height = min(top1 + h1, top2 + h2) - max(top1, top2) + + # Check if there's meaningful overlap (more than tolerance) + if overlap_width > tolerance and overlap_height > tolerance: + # Calculate overlap area in square inches + overlap_area = overlap_width * overlap_height + return True, round(overlap_area, 2) + + return False, 0 + + +def detect_overlaps(shapes: List[ShapeData]) -> None: + """Detect overlapping shapes and update their overlapping_shapes dictionaries. + + This function requires each ShapeData to have its shape_id already set. + It modifies the shapes in-place, adding shape IDs with overlap areas in square inches. 
+ + Args: + shapes: List of ShapeData objects with shape_id attributes set + """ + n = len(shapes) + + # Compare each pair of shapes + for i in range(n): + for j in range(i + 1, n): + shape1 = shapes[i] + shape2 = shapes[j] + + # Ensure shape IDs are set + assert shape1.shape_id, f"Shape at index {i} has no shape_id" + assert shape2.shape_id, f"Shape at index {j} has no shape_id" + + rect1 = (shape1.left, shape1.top, shape1.width, shape1.height) + rect2 = (shape2.left, shape2.top, shape2.width, shape2.height) + + overlaps, overlap_area = calculate_overlap(rect1, rect2) + + if overlaps: + # Add shape IDs with overlap area in square inches + shape1.overlapping_shapes[shape2.shape_id] = overlap_area + shape2.overlapping_shapes[shape1.shape_id] = overlap_area + + +def get_text_shapes_inventory( + pptx_path: Path, prs: Optional[Any] = None, issues_only: bool = False +) -> InventoryData: + """Retrieve text content from all slides in a PowerPoint presentation. + + Args: + pptx_path: Path to the PowerPoint file + prs: Optional Presentation object to use. If not provided, will load from pptx_path. + issues_only: If True, only include shapes that have overflow or overlap issues + + Returns a nested dictionary: {slide-N: {shape-N: ShapeInfo}} + Shapes are sorted by visual position (top-to-bottom, left-to-right). + The ShapeInfo objects contain the full shape information and can be + converted to dictionaries for JSON serialization using to_dict(). 
+ """ + if prs is None: + prs = Presentation(str(pptx_path)) + inventory: InventoryData = {} + + for slide_idx, slide in enumerate(prs.slides): + # Collect all valid shapes from this slide with absolute positions + shapes_with_positions = [] + for shape in slide.shapes: # type: ignore + shapes_with_positions.extend(collect_shapes_with_absolute_positions(shape)) + + if not shapes_with_positions: + continue + + # Convert to ShapeData with absolute positions and slide reference + shape_data_list = [ + ShapeData( + swp.shape, + swp.absolute_left, + swp.absolute_top, + slide, + ) + for swp in shapes_with_positions + ] + + # Sort by visual position and assign stable IDs in one step + sorted_shapes = sort_shapes_by_position(shape_data_list) + for idx, shape_data in enumerate(sorted_shapes): + shape_data.shape_id = f"shape-{idx}" + + # Detect overlaps using the stable shape IDs + if len(sorted_shapes) > 1: + detect_overlaps(sorted_shapes) + + # Filter for issues only if requested (after overlap detection) + if issues_only: + sorted_shapes = [sd for sd in sorted_shapes if sd.has_any_issues] + + if not sorted_shapes: + continue + + # Create slide inventory using the stable shape IDs + inventory[f"slide-{slide_idx}"] = { + shape_data.shape_id: shape_data for shape_data in sorted_shapes + } + + return inventory + + +def get_inventory_as_dict(pptx_path: Path, issues_only: bool = False) -> InventoryDict: + """Extract text inventory and return as JSON-serializable dictionaries. + + This is a convenience wrapper around extract_text_inventory that returns + dictionaries instead of ShapeData objects, useful for testing and direct + JSON serialization. 
+ + Args: + pptx_path: Path to the PowerPoint file + issues_only: If True, only include shapes that have overflow or overlap issues + + Returns: + Nested dictionary with all data serialized for JSON + """ + inventory = extract_text_inventory(pptx_path, issues_only=issues_only) + + # Convert ShapeData objects to dictionaries + dict_inventory: InventoryDict = {} + for slide_key, shapes in inventory.items(): + dict_inventory[slide_key] = { + shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items() + } + + return dict_inventory + + +def write_inventory(inventory: InventoryData, output_path: Path) -> None: + """Save inventory to JSON file with proper formatting. + + Converts ShapeData objects to dictionaries for JSON serialization. + """ + # Convert ShapeData objects to dictionaries + json_inventory: InventoryDict = {} + for slide_key, shapes in inventory.items(): + json_inventory[slide_key] = { + shape_key: shape_data.to_dict() for shape_key, shape_data in shapes.items() + } + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(json_inventory, f, indent=2, ensure_ascii=False) + + +if __name__ == "__main__": + main() diff --git a/deploy/data/skills/pptx/scripts/textReplacer.py b/deploy/data/skills/pptx/scripts/textReplacer.py new file mode 100755 index 000000000..9c3e71279 --- /dev/null +++ b/deploy/data/skills/pptx/scripts/textReplacer.py @@ -0,0 +1,385 @@ +#!/usr/bin/env python3 +"""Apply text replacements to PowerPoint presentation. + +Usage: + python textReplacer.py + +The replacements JSON should have the structure output by textExtractor.py. +ALL text shapes identified by textExtractor.py will have their text cleared +unless "paragraphs" is specified in the replacements for that shape. 
+""" + +import json +import sys +from pathlib import Path +from typing import Any, Dict, List + +from textExtractor import InventoryData, get_text_shapes_inventory +from pptx import Presentation +from pptx.dml.color import RGBColor +from pptx.enum.dml import MSO_THEME_COLOR +from pptx.enum.text import PP_ALIGN +from pptx.oxml.xmlchemy import OxmlElement +from pptx.util import Pt + + +def clear_paragraph_bullets(paragraph): + """Clear bullet formatting from a paragraph.""" + pPr = paragraph._element.get_or_add_pPr() + + # Remove existing bullet elements + for child in list(pPr): + if ( + child.tag.endswith("buChar") + or child.tag.endswith("buNone") + or child.tag.endswith("buAutoNum") + or child.tag.endswith("buFont") + ): + pPr.remove(child) + + return pPr + + +def apply_paragraph_properties(paragraph, para_data: Dict[str, Any]): + """Apply formatting properties to a paragraph.""" + # Get the text but don't set it on paragraph directly yet + text = para_data.get("text", "") + + # Get or create paragraph properties + pPr = clear_paragraph_bullets(paragraph) + + # Handle bullet formatting + if para_data.get("bullet", False): + level = para_data.get("level", 0) + paragraph.level = level + + # Calculate font-proportional indentation + font_size = para_data.get("font_size", 18.0) + level_indent_emu = int((font_size * (1.6 + level * 1.6)) * 12700) + hanging_indent_emu = int(-font_size * 0.8 * 12700) + + # Set indentation + pPr.attrib["marL"] = str(level_indent_emu) + pPr.attrib["indent"] = str(hanging_indent_emu) + + # Add bullet character + buChar = OxmlElement("a:buChar") + buChar.set("char", "•") + pPr.append(buChar) + + # Default to left alignment for bullets if not specified + if "alignment" not in para_data: + paragraph.alignment = PP_ALIGN.LEFT + else: + # Remove indentation for non-bullet text + pPr.attrib["marL"] = "0" + pPr.attrib["indent"] = "0" + + # Add buNone element + buNone = OxmlElement("a:buNone") + pPr.insert(0, buNone) + + # Apply alignment + if 
"alignment" in para_data: + alignment_map = { + "LEFT": PP_ALIGN.LEFT, + "CENTER": PP_ALIGN.CENTER, + "RIGHT": PP_ALIGN.RIGHT, + "JUSTIFY": PP_ALIGN.JUSTIFY, + } + if para_data["alignment"] in alignment_map: + paragraph.alignment = alignment_map[para_data["alignment"]] + + # Apply spacing + if "space_before" in para_data: + paragraph.space_before = Pt(para_data["space_before"]) + if "space_after" in para_data: + paragraph.space_after = Pt(para_data["space_after"]) + if "line_spacing" in para_data: + paragraph.line_spacing = Pt(para_data["line_spacing"]) + + # Apply run-level formatting + if not paragraph.runs: + run = paragraph.add_run() + run.text = text + else: + run = paragraph.runs[0] + run.text = text + + # Apply font properties + apply_font_properties(run, para_data) + + +def apply_font_properties(run, para_data: Dict[str, Any]): + """Apply font properties to a text run.""" + if "bold" in para_data: + run.font.bold = para_data["bold"] + if "italic" in para_data: + run.font.italic = para_data["italic"] + if "underline" in para_data: + run.font.underline = para_data["underline"] + if "font_size" in para_data: + run.font.size = Pt(para_data["font_size"]) + if "font_name" in para_data: + run.font.name = para_data["font_name"] + + # Apply color - prefer RGB, fall back to theme_color + if "color" in para_data: + color_hex = para_data["color"].lstrip("#") + if len(color_hex) == 6: + r = int(color_hex[0:2], 16) + g = int(color_hex[2:4], 16) + b = int(color_hex[4:6], 16) + run.font.color.rgb = RGBColor(r, g, b) + elif "theme_color" in para_data: + # Get theme color by name (e.g., "DARK_1", "ACCENT_1") + theme_name = para_data["theme_color"] + try: + run.font.color.theme_color = getattr(MSO_THEME_COLOR, theme_name) + except AttributeError: + print(f" WARNING: Unknown theme color name '{theme_name}'") + + +def detect_frame_overflow(inventory: InventoryData) -> Dict[str, Dict[str, float]]: + """Detect text overflow in shapes (text exceeding shape bounds). 
+ + Returns dict of slide_key -> shape_key -> overflow_inches. + Only includes shapes that have text overflow. + """ + overflow_map = {} + + for slide_key, shapes_dict in inventory.items(): + for shape_key, shape_data in shapes_dict.items(): + # Check for frame overflow (text exceeding shape bounds) + if shape_data.frame_overflow_bottom is not None: + if slide_key not in overflow_map: + overflow_map[slide_key] = {} + overflow_map[slide_key][shape_key] = shape_data.frame_overflow_bottom + + return overflow_map + + +def validate_replacements(inventory: InventoryData, replacements: Dict) -> List[str]: + """Validate that all shapes in replacements exist in inventory. + + Returns list of error messages. + """ + errors = [] + + for slide_key, shapes_data in replacements.items(): + if not slide_key.startswith("slide-"): + continue + + # Check if slide exists + if slide_key not in inventory: + errors.append(f"Slide '{slide_key}' not found in inventory") + continue + + # Check each shape + for shape_key in shapes_data.keys(): + if shape_key not in inventory[slide_key]: + # Find shapes without replacements defined and show their content + unused_with_content = [] + for k in inventory[slide_key].keys(): + if k not in shapes_data: + shape_data = inventory[slide_key][k] + # Get text from paragraphs as preview + paragraphs = shape_data.paragraphs + if paragraphs and paragraphs[0].text: + first_text = paragraphs[0].text[:50] + if len(paragraphs[0].text) > 50: + first_text += "..." + unused_with_content.append(f"{k} ('{first_text}')") + else: + unused_with_content.append(k) + + errors.append( + f"Shape '{shape_key}' not found on '{slide_key}'. 
" + f"Shapes without replacements: {', '.join(sorted(unused_with_content)) if unused_with_content else 'none'}" + ) + + return errors + + +def check_duplicate_keys(pairs): + """Check for duplicate keys when loading JSON.""" + result = {} + for key, value in pairs: + if key in result: + raise ValueError(f"Duplicate key found in JSON: '{key}'") + result[key] = value + return result + + +def apply_replacements(pptx_file: str, json_file: str, output_file: str): + """Apply text replacements from JSON to PowerPoint presentation.""" + + # Load presentation + prs = Presentation(pptx_file) + + # Get inventory of all text shapes (returns ShapeData objects) + # Pass prs to use same Presentation instance + inventory = get_text_shapes_inventory(Path(pptx_file), prs) + + # Detect text overflow in original presentation + original_overflow = detect_frame_overflow(inventory) + + # Load replacement data with duplicate key detection + with open(json_file, "r") as f: + replacements = json.load(f, object_pairs_hook=check_duplicate_keys) + + # Validate replacements + errors = validate_replacements(inventory, replacements) + if errors: + print("ERROR: Invalid shapes in replacement JSON:") + for error in errors: + print(f" - {error}") + print("\nPlease check the inventory and update your replacement JSON.") + print( + "You can regenerate the inventory with: python inventory.py " + ) + raise ValueError(f"Found {len(errors)} validation error(s)") + + # Track statistics + shapes_processed = 0 + shapes_cleared = 0 + shapes_replaced = 0 + + # Process each slide from inventory + for slide_key, shapes_dict in inventory.items(): + if not slide_key.startswith("slide-"): + continue + + slide_index = int(slide_key.split("-")[1]) + + if slide_index >= len(prs.slides): + print(f"Warning: Slide {slide_index} not found") + continue + + # Process each shape from inventory + for shape_key, shape_data in shapes_dict.items(): + shapes_processed += 1 + + # Get the shape directly from ShapeData + shape = 
shape_data.shape + if not shape: + print(f"Warning: {shape_key} has no shape reference") + continue + + # ShapeData already validates text_frame in __init__ + text_frame = shape.text_frame # type: ignore + + text_frame.clear() # type: ignore + shapes_cleared += 1 + + # Check for replacement paragraphs + replacement_shape_data = replacements.get(slide_key, {}).get(shape_key, {}) + if "paragraphs" not in replacement_shape_data: + continue + + shapes_replaced += 1 + + # Add replacement paragraphs + for i, para_data in enumerate(replacement_shape_data["paragraphs"]): + if i == 0: + p = text_frame.paragraphs[0] # type: ignore + else: + p = text_frame.add_paragraph() # type: ignore + + apply_paragraph_properties(p, para_data) + + # Check for issues after replacements + # Save to a temporary file and reload to avoid modifying the presentation during inventory + # (get_text_shapes_inventory accesses font.color which adds empty elements) + import tempfile + + with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as tmp: + tmp_path = Path(tmp.name) + prs.save(str(tmp_path)) + + try: + updated_inventory = get_text_shapes_inventory(tmp_path) + updated_overflow = detect_frame_overflow(updated_inventory) + finally: + tmp_path.unlink() # Clean up temp file + + # Check if any text overflow got worse + overflow_errors = [] + for slide_key, shape_overflows in updated_overflow.items(): + for shape_key, new_overflow in shape_overflows.items(): + # Get original overflow (0 if there was no overflow before) + original = original_overflow.get(slide_key, {}).get(shape_key, 0.0) + + # Error if overflow increased + if new_overflow > original + 0.01: # Small tolerance for rounding + increase = new_overflow - original + overflow_errors.append( + f'{slide_key}/{shape_key}: overflow worsened by {increase:.2f}" ' + f'(was {original:.2f}", now {new_overflow:.2f}")' + ) + + # Collect warnings from updated shapes + warnings = [] + for slide_key, shapes_dict in updated_inventory.items(): + 
for shape_key, shape_data in shapes_dict.items(): + if shape_data.warnings: + for warning in shape_data.warnings: + warnings.append(f"{slide_key}/{shape_key}: {warning}") + + # Fail if there are any issues + if overflow_errors or warnings: + print("\nERROR: Issues detected in replacement output:") + if overflow_errors: + print("\nText overflow worsened:") + for error in overflow_errors: + print(f" - {error}") + if warnings: + print("\nFormatting warnings:") + for warning in warnings: + print(f" - {warning}") + print("\nPlease fix these issues before saving.") + raise ValueError( + f"Found {len(overflow_errors)} overflow error(s) and {len(warnings)} warning(s)" + ) + + # Save the presentation + prs.save(output_file) + + # Report results + print(f"Saved updated presentation to: {output_file}") + print(f"Processed {len(prs.slides)} slides") + print(f" - Shapes processed: {shapes_processed}") + print(f" - Shapes cleared: {shapes_cleared}") + print(f" - Shapes replaced: {shapes_replaced}") + + +def main(): + """Main entry point for command-line usage.""" + if len(sys.argv) != 4: + print(__doc__) + sys.exit(1) + + input_pptx = Path(sys.argv[1]) + replacements_json = Path(sys.argv[2]) + output_pptx = Path(sys.argv[3]) + + if not input_pptx.exists(): + print(f"Error: Input file '{input_pptx}' not found") + sys.exit(1) + + if not replacements_json.exists(): + print(f"Error: Replacements JSON file '{replacements_json}' not found") + sys.exit(1) + + try: + apply_replacements(str(input_pptx), str(replacements_json), str(output_pptx)) + except Exception as e: + print(f"Error applying replacements: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/deploy/data/skills/pptx/slide-generator.md b/deploy/data/skills/pptx/slide-generator.md new file mode 100644 index 000000000..0ed1c79d5 --- /dev/null +++ b/deploy/data/skills/pptx/slide-generator.md @@ -0,0 +1,719 @@ +# HTML to PowerPoint Conversion Guide + +Transform 
HTML slide designs into PowerPoint presentations with precise element positioning using the `slideConverter.js` library. + +## Table of Contents + +1. [Designing HTML Slides](#designing-html-slides) +2. [Using the slideConverter Library](#using-the-slideconverter-library) +3. [Working with PptxGenJS](#working-with-pptxgenjs) + +--- + +## Designing HTML Slides + +Each HTML slide requires proper body dimensions: + +### Slide Dimensions + +- **16:9** (default): `width: 720pt; height: 405pt` +- **4:3**: `width: 720pt; height: 540pt` +- **16:10**: `width: 720pt; height: 450pt` + +### Supported HTML Elements + +- `
<p>`, `<h1>`-`<h6>` - Text content with styling
+- `<ul>`, `<ol>` - Lists (avoid manual bullet characters)
+- `<b>`, `<strong>` - Bold text (inline formatting)
+- `<i>`, `<em>` - Italic text (inline formatting)
+- `<u>` - Underlined text (inline formatting)
+- `<span>` - Inline formatting with CSS styles (bold, italic, underline, color)
+- `<br>` - Line breaks
+- `<div>` with bg/border - Converts to shape
+- `<img>` - Images
+- `class="placeholder"` - Reserved space for charts (returns `{ id, x, y, w, h }`)
+
+### Essential Text Formatting Rules
+
+**ALL text MUST be inside `<p>`, `<h1>`-`<h6>`, `<ul>`, or `<ol>` tags:**
+- Correct: `<p>Text here</p>`
+- Incorrect: `<div>Text here</div>` - **Text will NOT appear in PowerPoint**
+- Incorrect: `<span>Text</span>` - **Text will NOT appear in PowerPoint**
+- Text in `<div>` or `<span>` without a text tag will be silently ignored
+
+**AVOID manual bullet symbols** - Use `<ul>` or `<ol>
                          ` lists instead + +**Use only universally available fonts:** +- Safe fonts: `Arial`, `Helvetica`, `Times New Roman`, `Georgia`, `Courier New`, `Verdana`, `Tahoma`, `Trebuchet MS`, `Impact`, `Comic Sans MS` +- Unsafe: `'Segoe UI'`, `'SF Pro'`, `'Roboto'`, custom fonts - **May cause rendering issues** + +### Styling Guidelines + +- Use `display: flex` on body to prevent margin collapse from breaking overflow validation +- Use `margin` for spacing (padding included in size) +- Inline formatting: Use ``, ``, `` tags OR `` with CSS styles + - `` supports: `font-weight: bold`, `font-style: italic`, `text-decoration: underline`, `color: #rrggbb` + - `` does NOT support: `margin`, `padding` (not supported in PowerPoint text runs) + - Example: `Bold blue text` +- Flexbox works - positions calculated from rendered layout +- Use hex colors with `#` prefix in CSS +- **Text alignment**: Use CSS `text-align` (`center`, `right`, etc.) when needed as a hint to PptxGenJS for text formatting if text lengths are slightly off + +### Shape Styling (DIV elements only) + +**NOTE: Backgrounds, borders, and shadows only work on `
                          ` elements, NOT on text elements (`

                          `, `

                          `-`

                          `, `
                            `, `
                              `)** + +- **Backgrounds**: CSS `background` or `background-color` on `
                              ` elements only + - Example: `
                              ` - Creates a shape with background +- **Borders**: CSS `border` on `
                              ` elements converts to PowerPoint shape borders + - Supports uniform borders: `border: 2px solid #333333` + - Supports partial borders: `border-left`, `border-right`, `border-top`, `border-bottom` (rendered as line shapes) + - Example: `
                              ` +- **Border radius**: CSS `border-radius` on `
                              ` elements for rounded corners + - `border-radius: 50%` or higher creates circular shape + - Percentages <50% calculated relative to shape's smaller dimension + - Supports px and pt units (e.g., `border-radius: 8pt;`, `border-radius: 12px;`) + - Example: `
                              ` on 100x200px box = 25% of 100px = 25px radius +- **Box shadows**: CSS `box-shadow` on `
                              ` elements converts to PowerPoint shadows + - Supports outer shadows only (inset shadows are ignored to prevent corruption) + - Example: `
                              ` + - Note: Inset/inner shadows are not supported by PowerPoint and will be skipped + +### Icons and Gradients + +- **ESSENTIAL: Never use CSS gradients (`linear-gradient`, `radial-gradient`)** - They don't convert to PowerPoint +- **ALWAYS create gradient/icon PNGs FIRST using Sharp, then reference in HTML** +- For gradients: Rasterize SVG to PNG background images +- For icons: Rasterize react-icons SVG to PNG images +- All visual effects must be pre-rendered as raster images before HTML rendering + +### Image Assets for Slides + +**NOTE**: Presentations should include relevant images to enhance visual communication. Use the `ImageGen` tool to create custom images before building slides. + +**Image Workflow**: +1. **Before creating HTML slides**, analyze content and determine needed visuals +2. **Generate images** using ImageGen tool with detailed prompts +3. **Reference images** in HTML using `` tags with proper sizing + +**Image Categories to Consider**: +- **Architecture diagrams**: System components, infrastructure layouts +- **Flowcharts**: Process flows, decision trees, user journeys +- **Illustrations**: Conceptual visuals, metaphorical images +- **Backgrounds**: Subtle patterns, gradient images, themed backgrounds +- **Icons**: Feature icons, category markers, decorative elements + +**Image Sizing in HTML**: +```html + + + + +
                              +
                              +

                              Text content here

                              +
                              + + + + + + +``` + +**Image Quality Requirements**: +- **Minimum resolution**: 1920x1080 for full-slide backgrounds +- **Format**: PNG for diagrams/icons (transparency support), JPEG for photos +- **Aspect ratio**: Maintain original ratios; never stretch images + +**Rasterizing Icons with Sharp:** + +```javascript +const React = require('react'); +const ReactDOMServer = require('react-dom/server'); +const sharp = require('sharp'); +const { FaHome } = require('react-icons/fa'); + +async function renderIconToPng(IconComponent, color, size = "256", filename) { + const svgString = ReactDOMServer.renderToStaticMarkup( + React.createElement(IconComponent, { color: `#${color}`, size: size }) + ); + + // Convert SVG to PNG using Sharp + await sharp(Buffer.from(svgString)) + .png() + .toFile(filename); + + return filename; +} + +// Usage: Rasterize icon before using in HTML +const iconPath = await renderIconToPng(FaHome, "4472c4", "256", "home-icon.png"); +// Then reference in HTML: +``` + +**Rasterizing Gradients with Sharp:** + +```javascript +const sharp = require('sharp'); + +async function generateGradientBackground(filename) { + const svg = ` + + + + + + + + `; + + await sharp(Buffer.from(svg)) + .png() + .toFile(filename); + + return filename; +} + +// Usage: Create gradient background before HTML +const bgPath = await generateGradientBackground("gradient-bg.png"); +// Then in HTML: +``` + +### Example + +```html + + + + + + +
                              +

                              Recipe Title

                              +
                                +
                              • Item: Description
                              • +
                              +

                              Text with bold, italic, underline.

                              +
                              + + +
                              +

                              5

                              +
                              +
                              + + +``` + +## Using the slideConverter Library + +### Dependencies + +These libraries have been globally installed and are available to use: +- `pptxgenjs` +- `playwright` +- `sharp` + +### Basic Usage + +```javascript +const pptxgen = require('pptxgenjs'); +const convertSlide = require('./slideConverter'); + +const pptx = new pptxgen(); +pptx.layout = 'LAYOUT_16x9'; // Must match HTML body dimensions + +const { slide, placeholders } = await convertSlide('slide1.html', pptx); + +// Add chart to placeholder area +if (placeholders.length > 0) { + slide.addChart(pptx.charts.LINE, chartData, placeholders[0]); +} + +await pptx.writeFile('output.pptx'); +``` + +### API Reference + +#### Function Signature +```javascript +await convertSlide(htmlFile, pres, options) +``` + +#### Parameters +- `htmlFile` (string): Path to HTML file (absolute or relative) +- `pres` (pptxgen): PptxGenJS presentation instance with layout already set +- `options` (object, optional): + - `tmpDir` (string): Temporary directory for generated files (default: `process.env.TMPDIR || '/tmp'`) + - `slide` (object): Existing slide to reuse (default: creates new slide) + +#### Returns +```javascript +{ + slide: pptxgenSlide, // The created/updated slide + placeholders: [ // Array of placeholder positions + { id: string, x: number, y: number, w: number, h: number }, + ... + ] +} +``` + +### Validation + +The library automatically validates and collects all errors before throwing: + +1. **HTML dimensions must match presentation layout** - Reports dimension mismatches +2. **Content must not overflow body** - Reports overflow with exact measurements +3. **CSS gradients** - Reports unsupported gradient usage +4. **Text element styling** - Reports backgrounds/borders/shadows on text elements (only allowed on divs) + +**All validation errors are collected and reported together** in a single error message, allowing you to fix all issues at once instead of one at a time. 
+ +### Working with Placeholders + +```javascript +const { slide, placeholders } = await convertSlide('slide.html', pptx); + +// Use first placeholder +slide.addChart(pptx.charts.BAR, data, placeholders[0]); + +// Find by ID +const chartArea = placeholders.find(p => p.id === 'chart-area'); +slide.addChart(pptx.charts.LINE, data, chartArea); +``` + +### Complete Example + +```javascript +const pptxgen = require('pptxgenjs'); +const convertSlide = require('./slideConverter'); + +async function buildPresentation() { + const pptx = new pptxgen(); + pptx.layout = 'LAYOUT_16x9'; + pptx.author = 'Your Name'; + pptx.title = 'My Presentation'; + + // Slide 1: Title + const { slide: slide1 } = await convertSlide('slides/title.html', pptx); + + // Slide 2: Content with chart + const { slide: slide2, placeholders } = await convertSlide('slides/data.html', pptx); + + const chartData = [{ + name: 'Sales', + labels: ['Q1', 'Q2', 'Q3', 'Q4'], + values: [4500, 5500, 6200, 7100] + }]; + + slide2.addChart(pptx.charts.BAR, chartData, { + ...placeholders[0], + showTitle: true, + title: 'Quarterly Sales', + showCatAxisTitle: true, + catAxisTitle: 'Quarter', + showValAxisTitle: true, + valAxisTitle: 'Sales ($000s)' + }); + + // Save + await pptx.writeFile({ fileName: 'presentation.pptx' }); + console.log('Presentation created successfully!'); +} + +buildPresentation().catch(console.error); +``` + +## Working with PptxGenJS + +After converting HTML to slides with `convertSlide`, use PptxGenJS to add dynamic content like charts, images, and additional elements. 
+ +### Critical Rules + +#### Colors +- **NEVER use `#` prefix** with hex colors in PptxGenJS - causes file corruption +- Correct: `color: "FF0000"`, `fill: { color: "0066CC" }` +- Incorrect: `color: "#FF0000"` (breaks document) + +### Adding Images + +Always calculate aspect ratios from actual image dimensions: + +```javascript +// Get image dimensions: identify image.png | grep -o '[0-9]* x [0-9]*' +const imgWidth = 1860, imgHeight = 1519; // From actual file +const aspectRatio = imgWidth / imgHeight; + +const h = 3; // Max height +const w = h * aspectRatio; +const x = (10 - w) / 2; // Center on 16:9 slide + +slide.addImage({ path: "chart.png", x, y: 1.5, w, h }); +``` + +**Image Layout Patterns**: + +```javascript +// Full-slide background image +slide.addImage({ + path: "background.png", + x: 0, y: 0, w: 10, h: 5.625, // 16:9 dimensions + sizing: { type: 'cover' } +}); + +// Two-column layout: Image left, text right +slide.addImage({ + path: "diagram.png", + x: 0.5, y: 1, w: 4.5, h: 3.5 +}); +slide.addText("Description text", { + x: 5.5, y: 1, w: 4, h: 3.5 +}); + +// Centered diagram with margins +slide.addImage({ + path: "architecture.png", + x: 1.5, y: 1.5, w: 7, h: 3, + sizing: { type: 'contain' } +}); + +// Image grid (2x2) +const gridImages = ["img1.png", "img2.png", "img3.png", "img4.png"]; +const gridW = 4, gridH = 2.5, gap = 0.2; +gridImages.forEach((img, i) => { + const col = i % 2, row = Math.floor(i / 2); + slide.addImage({ + path: img, + x: 0.5 + col * (gridW + gap), + y: 0.8 + row * (gridH + gap), + w: gridW, h: gridH + }); +}); +``` + +**Image with Text Overlay**: + +```javascript +// Background image with semi-transparent overlay for text +slide.addImage({ path: "hero-image.png", x: 0, y: 0, w: 10, h: 5.625 }); +slide.addShape(pptx.shapes.RECTANGLE, { + x: 0, y: 3.5, w: 10, h: 2.125, + fill: { color: "000000", transparency: 50 } // 50% transparent black +}); +slide.addText("Title Over Image", { + x: 0.5, y: 3.8, w: 9, h: 1, + color: "FFFFFF", 
fontSize: 36, bold: true +}); +``` + +### Adding Text + +```javascript +// Rich text with formatting +slide.addText([ + { text: "Bold ", options: { bold: true } }, + { text: "Italic ", options: { italic: true } }, + { text: "Normal" } +], { + x: 1, y: 2, w: 8, h: 1 +}); +``` + +### Adding Shapes + +```javascript +// Rectangle +slide.addShape(pptx.shapes.RECTANGLE, { + x: 1, y: 1, w: 3, h: 2, + fill: { color: "4472C4" }, + line: { color: "000000", width: 2 } +}); + +// Circle +slide.addShape(pptx.shapes.OVAL, { + x: 5, y: 1, w: 2, h: 2, + fill: { color: "ED7D31" } +}); + +// Rounded rectangle +slide.addShape(pptx.shapes.ROUNDED_RECTANGLE, { + x: 1, y: 4, w: 3, h: 1.5, + fill: { color: "70AD47" }, + rectRadius: 0.2 +}); +``` + +### Adding Charts + +**Required for most charts:** Axis labels using `catAxisTitle` (category) and `valAxisTitle` (value). + +**Chart Data Format:** +- Use **single series with all labels** for simple bar/line charts +- Each series creates a separate legend entry +- Labels array defines X-axis values + +**Time Series Data - Choose Correct Granularity:** +- **< 30 days**: Use daily grouping (e.g., "10-01", "10-02") - avoid monthly aggregation that creates single-point charts +- **30-365 days**: Use monthly grouping (e.g., "2024-01", "2024-02") +- **> 365 days**: Use yearly grouping (e.g., "2023", "2024") +- **Validate**: Charts with only 1 data point likely indicate incorrect aggregation for the time period + +```javascript +const { slide, placeholders } = await convertSlide('slide.html', pptx); + +// CORRECT: Single series with all labels +slide.addChart(pptx.charts.BAR, [{ + name: "Sales 2024", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [4500, 5500, 6200, 7100] +}], { + ...placeholders[0], // Use placeholder position + barDir: 'col', // 'col' = vertical bars, 'bar' = horizontal + showTitle: true, + title: 'Quarterly Sales', + showLegend: false, // No legend needed for single series + // Required axis labels + showCatAxisTitle: true, + 
catAxisTitle: 'Quarter', + showValAxisTitle: true, + valAxisTitle: 'Sales ($000s)', + // Optional: Control scaling (adjust min based on data range for better visualization) + valAxisMaxVal: 8000, + valAxisMinVal: 0, // Use 0 for counts/amounts; for clustered data (e.g., 4500-7100), consider starting closer to min value + valAxisMajorUnit: 2000, // Control y-axis label spacing to prevent crowding + catAxisLabelRotate: 45, // Rotate labels if crowded + dataLabelPosition: 'outEnd', + dataLabelColor: '000000', + // Use single color for single-series charts + chartColors: ["4472C4"] // All bars same color +}); +``` + +#### Scatter Chart + +**NOTE**: Scatter chart data format is unusual - first series contains X-axis values, subsequent series contain Y-values: + +```javascript +// Prepare data +const data1 = [{ x: 10, y: 20 }, { x: 15, y: 25 }, { x: 20, y: 30 }]; +const data2 = [{ x: 12, y: 18 }, { x: 18, y: 22 }]; + +const allXValues = [...data1.map(d => d.x), ...data2.map(d => d.x)]; + +slide.addChart(pptx.charts.SCATTER, [ + { name: 'X-Axis', values: allXValues }, // First series = X values + { name: 'Series 1', values: data1.map(d => d.y) }, // Y values only + { name: 'Series 2', values: data2.map(d => d.y) } // Y values only +], { + x: 1, y: 1, w: 8, h: 4, + lineSize: 0, // 0 = no connecting lines + lineDataSymbol: 'circle', + lineDataSymbolSize: 6, + showCatAxisTitle: true, + catAxisTitle: 'X Axis', + showValAxisTitle: true, + valAxisTitle: 'Y Axis', + chartColors: ["4472C4", "ED7D31"] +}); +``` + +#### Line Chart + +```javascript +slide.addChart(pptx.charts.LINE, [{ + name: "Temperature", + labels: ["Jan", "Feb", "Mar", "Apr"], + values: [32, 35, 42, 55] +}], { + x: 1, y: 1, w: 8, h: 4, + lineSize: 4, + lineSmooth: true, + // Required axis labels + showCatAxisTitle: true, + catAxisTitle: 'Month', + showValAxisTitle: true, + valAxisTitle: 'Temperature (F)', + // Optional: Y-axis range (set min based on data range for better visualization) + valAxisMinVal: 0, // For 
ranges starting at 0 (counts, percentages, etc.) + valAxisMaxVal: 60, + valAxisMajorUnit: 20, // Control y-axis label spacing to prevent crowding (e.g., 10, 20, 25) + // valAxisMinVal: 30, // PREFERRED: For data clustered in a range (e.g., 32-55 or ratings 3-5), start axis closer to min value to show variation + // Optional: Chart colors + chartColors: ["4472C4", "ED7D31", "A5A5A5"] +}); +``` + +#### Pie Chart (No Axis Labels Required) + +**ESSENTIAL**: Pie charts require a **single data series** with all categories in the `labels` array and corresponding values in the `values` array. + +```javascript +slide.addChart(pptx.charts.PIE, [{ + name: "Market Share", + labels: ["Product A", "Product B", "Other"], // All categories in one array + values: [35, 45, 20] // All values in one array +}], { + x: 2, y: 1, w: 6, h: 4, + showPercent: true, + showLegend: true, + legendPos: 'r', // right + chartColors: ["4472C4", "ED7D31", "A5A5A5"] +}); +``` + +#### Multiple Data Series + +```javascript +slide.addChart(pptx.charts.LINE, [ + { + name: "Product A", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [10, 20, 30, 40] + }, + { + name: "Product B", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [15, 25, 20, 35] + } +], { + x: 1, y: 1, w: 8, h: 4, + showCatAxisTitle: true, + catAxisTitle: 'Quarter', + showValAxisTitle: true, + valAxisTitle: 'Revenue ($M)' +}); +``` + +### Chart Colors + +**ESSENTIAL**: Use hex colors **without** the `#` prefix - including `#` causes file corruption. + +**Align chart colors with your chosen design palette**, ensuring sufficient contrast and distinctiveness for data visualization. 
Adjust colors for: +- Strong contrast between adjacent series +- Readability against slide backgrounds +- Accessibility (avoid red-green only combinations) + +```javascript +// Example: Ocean palette-inspired chart colors (adjusted for contrast) +const chartColors = ["16A085", "FF6B9D", "2C3E50", "F39C12", "9B59B6"]; + +// Single-series chart: Use one color for all bars/points +slide.addChart(pptx.charts.BAR, [{ + name: "Sales", + labels: ["Q1", "Q2", "Q3", "Q4"], + values: [4500, 5500, 6200, 7100] +}], { + ...placeholders[0], + chartColors: ["16A085"], // All bars same color + showLegend: false +}); + +// Multi-series chart: Each series gets a different color +slide.addChart(pptx.charts.LINE, [ + { name: "Product A", labels: ["Q1", "Q2", "Q3"], values: [10, 20, 30] }, + { name: "Product B", labels: ["Q1", "Q2", "Q3"], values: [15, 25, 20] } +], { + ...placeholders[0], + chartColors: ["16A085", "FF6B9D"] // One color per series +}); +``` + +### Adding Tables + +Tables can be added with basic or advanced formatting: + +#### Basic Table + +```javascript +slide.addTable([ + ["Header 1", "Header 2", "Header 3"], + ["Row 1, Col 1", "Row 1, Col 2", "Row 1, Col 3"], + ["Row 2, Col 1", "Row 2, Col 2", "Row 2, Col 3"] +], { + x: 0.5, + y: 1, + w: 9, + h: 3, + border: { pt: 1, color: "999999" }, + fill: { color: "F1F1F1" } +}); +``` + +#### Table with Custom Formatting + +```javascript +const tableData = [ + // Header row with custom styling + [ + { text: "Product", options: { fill: { color: "4472C4" }, color: "FFFFFF", bold: true } }, + { text: "Revenue", options: { fill: { color: "4472C4" }, color: "FFFFFF", bold: true } }, + { text: "Growth", options: { fill: { color: "4472C4" }, color: "FFFFFF", bold: true } } + ], + // Data rows + ["Product A", "$50M", "+15%"], + ["Product B", "$35M", "+22%"], + ["Product C", "$28M", "+8%"] +]; + +slide.addTable(tableData, { + x: 1, + y: 1.5, + w: 8, + h: 3, + colW: [3, 2.5, 2.5], // Column widths + rowH: [0.5, 0.6, 0.6, 0.6], // Row 
heights + border: { pt: 1, color: "CCCCCC" }, + align: "center", + valign: "middle", + fontSize: 14 +}); +``` + +#### Table with Merged Cells + +```javascript +const mergedTableData = [ + [ + { text: "Q1 Results", options: { colspan: 3, fill: { color: "4472C4" }, color: "FFFFFF", bold: true } } + ], + ["Product", "Sales", "Market Share"], + ["Product A", "$25M", "35%"], + ["Product B", "$18M", "25%"] +]; + +slide.addTable(mergedTableData, { + x: 1, + y: 1, + w: 8, + h: 2.5, + colW: [3, 2.5, 2.5], + border: { pt: 1, color: "DDDDDD" } +}); +``` + +### Table Options + +Common table options: +- `x, y, w, h` - Position and size +- `colW` - Array of column widths (in inches) +- `rowH` - Array of row heights (in inches) +- `border` - Border style: `{ pt: 1, color: "999999" }` +- `fill` - Background color (no # prefix) +- `align` - Text alignment: "left", "center", "right" +- `valign` - Vertical alignment: "top", "middle", "bottom" +- `fontSize` - Text size +- `autoPage` - Auto-create new slides if content overflows diff --git a/deploy/data/skills/remotion/SKILL.md b/deploy/data/skills/remotion/SKILL.md new file mode 100644 index 000000000..9ee913213 --- /dev/null +++ b/deploy/data/skills/remotion/SKILL.md @@ -0,0 +1,46 @@ +--- +name: remotion-best-practices +description: Best practices for Remotion - Video creation in React +description_zh: Remotion 最佳实践 - 使用 React 进行视频创作 +metadata: + tags: remotion, video, react, animation, composition +--- + +## When to use + +Use this skills whenever you are dealing with Remotion code to obtain the domain-specific knowledge. 
+ +## How to use + +Read individual rule files for detailed explanations and code examples: + +- [rules/3d.md](rules/3d.md) - 3D content in Remotion using Three.js and React Three Fiber +- [rules/animations.md](rules/animations.md) - Fundamental animation skills for Remotion +- [rules/assets.md](rules/assets.md) - Importing images, videos, audio, and fonts into Remotion +- [rules/audio.md](rules/audio.md) - Using audio and sound in Remotion - importing, trimming, volume, speed, pitch +- [rules/calculate-metadata.md](rules/calculate-metadata.md) - Dynamically set composition duration, dimensions, and props +- [rules/can-decode.md](rules/can-decode.md) - Check if a video can be decoded by the browser using Mediabunny +- [rules/charts.md](rules/charts.md) - Chart and data visualization patterns for Remotion +- [rules/compositions.md](rules/compositions.md) - Defining compositions, stills, folders, default props and dynamic metadata +- [rules/display-captions.md](rules/display-captions.md) - Displaying captions in Remotion with TikTok-style pages and word highlighting +- [rules/extract-frames.md](rules/extract-frames.md) - Extract frames from videos at specific timestamps using Mediabunny +- [rules/fonts.md](rules/fonts.md) - Loading Google Fonts and local fonts in Remotion +- [rules/get-audio-duration.md](rules/get-audio-duration.md) - Getting the duration of an audio file in seconds with Mediabunny +- [rules/get-video-dimensions.md](rules/get-video-dimensions.md) - Getting the width and height of a video file with Mediabunny +- [rules/get-video-duration.md](rules/get-video-duration.md) - Getting the duration of a video file in seconds with Mediabunny +- [rules/gifs.md](rules/gifs.md) - Displaying GIFs synchronized with Remotion's timeline +- [rules/images.md](rules/images.md) - Embedding images in Remotion using the Img component +- [rules/import-srt-captions.md](rules/import-srt-captions.md) - Importing .srt subtitle files into Remotion using @remotion/captions +- 
[rules/lottie.md](rules/lottie.md) - Embedding Lottie animations in Remotion +- [rules/measuring-dom-nodes.md](rules/measuring-dom-nodes.md) - Measuring DOM element dimensions in Remotion +- [rules/measuring-text.md](rules/measuring-text.md) - Measuring text dimensions, fitting text to containers, and checking overflow +- [rules/sequencing.md](rules/sequencing.md) - Sequencing patterns for Remotion - delay, trim, limit duration of items +- [rules/tailwind.md](rules/tailwind.md) - Using TailwindCSS in Remotion +- [rules/text-animations.md](rules/text-animations.md) - Typography and text animation patterns for Remotion +- [rules/timing.md](rules/timing.md) - Interpolation curves in Remotion - linear, easing, spring animations +- [rules/transcribe-captions.md](rules/transcribe-captions.md) - Transcribing audio to generate captions in Remotion +- [rules/transitions.md](rules/transitions.md) - Scene transition patterns for Remotion +- [rules/trimming.md](rules/trimming.md) - Trimming patterns for Remotion - cut the beginning or end of animations +- [rules/videos.md](rules/videos.md) - Embedding videos in Remotion - trimming, volume, speed, looping, pitch +- [rules/parameters.md](rules/parameters.md) - Make a video parametrizable by adding a Zod schema +- [rules/maps.md](rules/maps.md) - Add a map using Mapbox and animate it diff --git a/deploy/data/skills/remotion/rules/3d.md b/deploy/data/skills/remotion/rules/3d.md new file mode 100644 index 000000000..31fa5c677 --- /dev/null +++ b/deploy/data/skills/remotion/rules/3d.md @@ -0,0 +1,86 @@ +--- +name: 3d +description: 3D content in Remotion using Three.js and React Three Fiber. +metadata: + tags: 3d, three, threejs +--- + +# Using Three.js and React Three Fiber in Remotion + +Follow React Three Fiber and Three.js best practices. +Only the following Remotion-specific rules need to be followed: + +## Prerequisites + +First, the `@remotion/three` package needs to be installed. 
+If it is not, use the following command: + +```bash +npx remotion add @remotion/three # If project uses npm +bunx remotion add @remotion/three # If project uses bun +yarn remotion add @remotion/three # If project uses yarn +pnpm exec remotion add @remotion/three # If project uses pnpm +``` + +## Using ThreeCanvas + +You MUST wrap 3D content in `` and include proper lighting. +`` MUST have a `width` and `height` prop. + +```tsx +import { ThreeCanvas } from "@remotion/three"; +import { useVideoConfig } from "remotion"; + +const { width, height } = useVideoConfig(); + + + + + + + + + +``` + +## No animations not driven by `useCurrentFrame()` + +Shaders, models etc MUST NOT animate by themselves. +No animations are allowed unless they are driven by `useCurrentFrame()`. +Otherwise, it will cause flickering during rendering. + +Using `useFrame()` from `@react-three/fiber` is forbidden. + +## Animate using `useCurrentFrame()` + +Use `useCurrentFrame()` to perform animations. + +```tsx +const frame = useCurrentFrame(); +const rotationY = frame * 0.02; + + + + + +``` + +## Using `` inside `` + +The `layout` prop of any `` inside a `` must be set to `none`. + +```tsx +import { Sequence } from "remotion"; +import { ThreeCanvas } from "@remotion/three"; + +const { width, height } = useVideoConfig(); + + + + + + + + + +``` \ No newline at end of file diff --git a/deploy/data/skills/remotion/rules/animations.md b/deploy/data/skills/remotion/rules/animations.md new file mode 100644 index 000000000..7e15623f8 --- /dev/null +++ b/deploy/data/skills/remotion/rules/animations.md @@ -0,0 +1,29 @@ +--- +name: animations +description: Fundamental animation skills for Remotion +metadata: + tags: animations, transitions, frames, useCurrentFrame +--- + +All animations MUST be driven by the `useCurrentFrame()` hook. +Write animations in seconds and multiply them by the `fps` value from `useVideoConfig()`. 
+
+```tsx
+import { useCurrentFrame, useVideoConfig, interpolate } from "remotion";
+
+export const FadeIn = () => {
+  const frame = useCurrentFrame();
+  const { fps } = useVideoConfig();
+
+  const opacity = interpolate(frame, [0, 2 * fps], [0, 1], {
+    extrapolateRight: 'clamp',
+  });
+
+  return (
+    
                              Hello World!
                              + ); +}; +``` + +CSS transitions or animations are FORBIDDEN - they will not render correctly. +Tailwind animation class names are FORBIDDEN - they will not render correctly. \ No newline at end of file diff --git a/deploy/data/skills/remotion/rules/assets.md b/deploy/data/skills/remotion/rules/assets.md new file mode 100644 index 000000000..04c8ad590 --- /dev/null +++ b/deploy/data/skills/remotion/rules/assets.md @@ -0,0 +1,78 @@ +--- +name: assets +description: Importing images, videos, audio, and fonts into Remotion +metadata: + tags: assets, staticFile, images, fonts, public +--- + +# Importing assets in Remotion + +## The public folder + +Place assets in the `public/` folder at your project root. + +## Using staticFile() + +You MUST use `staticFile()` to reference files from the `public/` folder: + +```tsx +import {Img, staticFile} from 'remotion'; + +export const MyComposition = () => { + return ; +}; +``` + +The function returns an encoded URL that works correctly when deploying to subdirectories. + +## Using with components + +**Images:** + +```tsx +import {Img, staticFile} from 'remotion'; + +; +``` + +**Videos:** + +```tsx +import {Video} from '@remotion/media'; +import {staticFile} from 'remotion'; + +
- { - navigate(`/api-products/detail?productId=${apiProduct.productId}&tab=portal`) - }} - > - } - valueStyle={{ color: '#1677ff', fontSize: '24px' }} - /> - - - - { - navigate(`/api-products/detail?productId=${apiProduct.productId}&tab=link-api`) - }} - > - } - valueStyle={{ color: '#1677ff', fontSize: '24px' }} - /> - - - - - } - valueStyle={{ color: '#1677ff', fontSize: '24px' }} - /> - - - + {/* 统计数据 - AGENT_SKILL 不展示 */} + {apiProduct.type !== 'AGENT_SKILL' && ( + + + { + navigate(`/api-products/detail?productId=${apiProduct.productId}&tab=portal`) + }} + > + } + valueStyle={{ color: '#1677ff', fontSize: '24px' }} + /> + + + + { + navigate(`/api-products/detail?productId=${apiProduct.productId}&tab=link-api`) + }} + > + } + valueStyle={{ color: '#1677ff', fontSize: '24px' }} + /> + + + + + } + valueStyle={{ color: '#1677ff', fontSize: '24px' }} + /> + + + + )} ) diff --git a/himarket-web/himarket-admin/src/components/api-product/ApiProductSkillPackage.tsx b/himarket-web/himarket-admin/src/components/api-product/ApiProductSkillPackage.tsx new file mode 100644 index 000000000..98b5cdf25 --- /dev/null +++ b/himarket-web/himarket-admin/src/components/api-product/ApiProductSkillPackage.tsx @@ -0,0 +1,299 @@ +import { useState, useEffect, useRef } from 'react' +import { Upload, message, Spin, Tooltip, Alert, Button as AntButton } from 'antd' +import { InboxOutlined, FolderOutlined, FolderOpenOutlined, FileOutlined } from '@ant-design/icons' +import ReactMarkdown from 'react-markdown' +import remarkGfm from 'remark-gfm' +import rehypeHighlight from 'rehype-highlight' +import MonacoEditor from 'react-monaco-editor' +import { skillApi } from '@/lib/api' +import 'github-markdown-css/github-markdown-light.css' +import 'highlight.js/styles/github.css' + +interface SkillFileTreeNode { + name: string + path: string + type: 'file' | 'directory' + encoding?: string + size?: number + children?: SkillFileTreeNode[] +} + +interface FileContent { + path: string + content: string 
+ encoding: string + size: number +} + +interface ApiProductSkillPackageProps { + apiProduct: import('@/types/api-product').ApiProduct + onUploadSuccess?: () => void +} + +// ── 自定义文件树(与前台 SkillFileTree 对齐)───────────────── +interface TreeNodeProps { + node: SkillFileTreeNode + selectedPath?: string + onSelect: (path: string) => void + depth: number +} + +function TreeNode({ node, selectedPath, onSelect, depth }: TreeNodeProps) { + const [expanded, setExpanded] = useState(true) + const isDir = node.type === 'directory' + const isSelected = node.path === selectedPath + + return ( +
+ +
isDir ? setExpanded(v => !v) : onSelect(node.path)} + > + {isDir + ? expanded + ? + : + : + } + {node.name} +
+
+ {isDir && expanded && node.children && node.children.length > 0 && ( +
+ {node.children.map(child => ( + + ))} +
+ )} +
+ ) +} + +function SkillFileTree({ nodes, selectedPath, onSelect }: { nodes: SkillFileTreeNode[]; selectedPath?: string; onSelect: (p: string) => void }) { + return ( +
+ {nodes.map(node => ( + + ))} +
+ ) +} + +function parseFrontMatter(content: string): { entries: [string, string][]; body: string } { + const t = content.trim() + if (!t.startsWith('---')) return { entries: [], body: t } + const end = t.indexOf('---', 3) + if (end === -1) return { entries: [], body: t } + const yamlBlock = t.substring(3, end).trim() + const body = t.substring(end + 3).trim() + const entries: [string, string][] = yamlBlock.split('\n').flatMap((line) => { + const idx = line.indexOf(':') + if (idx <= 0) return [] + const k = line.substring(0, idx).trim() + let v = line.substring(idx + 1).trim() + if ((v.startsWith('"') && v.endsWith('"')) || (v.startsWith("'") && v.endsWith("'"))) v = v.slice(1, -1) + return [[k, v]] as [string, string][] + }) + return { entries, body } +} + +function findNode(nodes: SkillFileTreeNode[], path: string): SkillFileTreeNode | null { + for (const node of nodes) { + if (node.path === path) return node + if (node.children) { const f = findNode(node.children, path); if (f) return f } + } + return null +} + +export function ApiProductSkillPackage({ apiProduct, onUploadSuccess }: ApiProductSkillPackageProps) { + const productId = apiProduct.productId + const hasNacos = !!(apiProduct.skillConfig?.nacosId) + const [fileTree, setFileTree] = useState([]) + const [selectedPath, setSelectedPath] = useState() + const [selectedFile, setSelectedFile] = useState(null) + const [loadingTree, setLoadingTree] = useState(false) + const [loadingFile, setLoadingFile] = useState(false) + const [uploading, setUploading] = useState(false) + const [treeWidth, setTreeWidth] = useState(240) + const isDragging = useRef(false) + + const handleDragStart = (e: React.MouseEvent) => { + e.preventDefault() + isDragging.current = true + const startX = e.clientX + const startWidth = treeWidth + const onMove = (ev: MouseEvent) => { + if (!isDragging.current) return + setTreeWidth(Math.min(520, Math.max(160, startWidth + ev.clientX - startX))) + } + const onUp = () => { + isDragging.current = 
false + window.removeEventListener('mousemove', onMove) + window.removeEventListener('mouseup', onUp) + } + window.addEventListener('mousemove', onMove) + window.addEventListener('mouseup', onUp) + } + + const fetchFileTree = async () => { + setLoadingTree(true) + try { + const res: any = await skillApi.getSkillFiles(productId) + const nodes: SkillFileTreeNode[] = res.data || [] + setFileTree(nodes) + if (findNode(nodes, 'SKILL.md')) loadFileContent('SKILL.md') + } catch { + } finally { + setLoadingTree(false) + } + } + + const loadFileContent = async (path: string) => { + setSelectedPath(path) + setLoadingFile(true) + try { + const res: any = await skillApi.getSkillFileContent(productId, path) + setSelectedFile(res.data) + } catch { + } finally { + setLoadingFile(false) + } + } + + useEffect(() => { fetchFileTree() }, [productId]) + + const customRequest = async (options: any) => { + const { file, onSuccess, onError } = options + setUploading(true) + try { + const res: any = await skillApi.uploadSkillPackage(productId, file) + message.success('上传成功') + onSuccess(res) + await fetchFileTree() + onUploadSuccess?.() + } catch (error: any) { + message.destroy() + message.error(error.response?.data?.message || '上传失败') + onError(error) + } finally { + setUploading(false) + } + } + + const renderPreview = () => { + if (loadingFile) return
+ + if (!selectedFile) return ( +
+
+ +

点击左侧文件查看内容

+
+
+ ) + + if (selectedFile.encoding === 'base64') return ( +
+

二进制文件,不支持预览

+
+ ) + + if (selectedFile.path.endsWith('.md')) { + const { entries, body } = parseFrontMatter(selectedFile.content) + return ( +
+ {entries.length > 0 && ( +
+ + + {entries.map(([k]) => ( + + ))} + + + + + {entries.map(([k, v]) => ( + + ))} + + +
{k}
{v}
+ )} +
+ {body} +
+
+ ) + } + + const lang = (() => { + const ext = selectedFile.path.split('.').pop()?.toLowerCase() ?? '' + const map: Record = { py: 'python', js: 'javascript', ts: 'typescript', tsx: 'typescript', jsx: 'javascript', json: 'json', yaml: 'yaml', yml: 'yaml', sh: 'shell', bash: 'shell', css: 'css', html: 'html', xml: 'xml', sql: 'sql', java: 'java', go: 'go', rs: 'rust', rb: 'ruby', kt: 'kotlin', swift: 'swift', c: 'c', cpp: 'cpp', h: 'c', hpp: 'cpp' } + return map[ext] || 'plaintext' + })() + + return ( + + ) + } + + return ( +
+
+

Skill Package

+

上传并管理技能包文件

+
+ + {!hasNacos && ( + + 前往 Nacos 管理 + + } + /> + )} + + +

+

点击或拖拽上传 Skill 包

+

支持 .zip 和 .tar.gz 格式,最大 50MB

+
+ +
+
+ {loadingTree + ?
+ : fileTree.length === 0 + ?
暂无文件
+ : + } +
+ {/* 拖拽分隔条 */} +
+
{renderPreview()}
+
+
+ ) +} diff --git a/himarket-web/himarket-admin/src/components/api-product/ApiProductUsageGuide.tsx b/himarket-web/himarket-admin/src/components/api-product/ApiProductUsageGuide.tsx index 4c07f05c4..c072b2b3f 100644 --- a/himarket-web/himarket-admin/src/components/api-product/ApiProductUsageGuide.tsx +++ b/himarket-web/himarket-admin/src/components/api-product/ApiProductUsageGuide.tsx @@ -1,6 +1,6 @@ import { Card, Button, Space, message } from 'antd' import { SaveOutlined, UploadOutlined, FileMarkdownOutlined, EditOutlined } from '@ant-design/icons' -import { useEffect, useState, useRef } from 'react' +import { useState, useRef } from 'react' import ReactMarkdown from 'react-markdown' import remarkGfm from 'remark-gfm'; import MdEditor from 'react-markdown-editor-lite' @@ -17,22 +17,17 @@ export function ApiProductUsageGuide({ apiProduct, handleRefresh }: ApiProductUs const [content, setContent] = useState(apiProduct.document || '') const [isEditing, setIsEditing] = useState(false) const [originalContent, setOriginalContent] = useState(apiProduct.document || '') + const [saving, setSaving] = useState(false) const fileInputRef = useRef(null) - useEffect(() => { - const doc = apiProduct.document || '' - setContent(doc) - setOriginalContent(doc) - }, [apiProduct.document]) - const handleEdit = () => { setIsEditing(true) } const handleSave = () => { - // 提取 categoryIds 以保留产品类别信息 const categoryIds = apiProduct.categories?.map(cat => cat.categoryId) || []; - + + setSaving(true) apiProductApi.updateApiProduct(apiProduct.productId, { document: content, categories: categoryIds @@ -41,6 +36,8 @@ export function ApiProductUsageGuide({ apiProduct, handleRefresh }: ApiProductUs setIsEditing(false) setOriginalContent(content) handleRefresh(); + }).finally(() => { + setSaving(false) }) } @@ -70,7 +67,6 @@ export function ApiProductUsageGuide({ apiProduct, handleRefresh }: ApiProductUs } reader.readAsText(file) } - // 清空 input 值,允许重复选择同一文件 if (event.target) { event.target.value = 
'' } @@ -133,7 +129,7 @@ export function ApiProductUsageGuide({ apiProduct, handleRefresh }: ApiProductUs ) : (
{content ? ( -
{content}
@@ -309,4 +188,4 @@ export function ApiProductUsageGuide({ apiProduct, handleRefresh }: ApiProductUs />
) -} \ No newline at end of file +} diff --git a/himarket-web/himarket-admin/src/components/api-product/ModelFeatureForm.tsx b/himarket-web/himarket-admin/src/components/api-product/ModelFeatureForm.tsx index 13a95341b..9145670aa 100644 --- a/himarket-web/himarket-admin/src/components/api-product/ModelFeatureForm.tsx +++ b/himarket-web/himarket-admin/src/components/api-product/ModelFeatureForm.tsx @@ -1,107 +1,94 @@ -import { useState, useEffect } from "react"; -import { Form, Input, InputNumber, Switch, Collapse, Row, Col } from "antd"; +import { Form, Input, InputNumber, Switch, Row, Col, Divider } from "antd"; -const { Panel } = Collapse; - -interface ModelFeatureFormProps { - initialExpanded?: boolean; -} - -export default function ModelFeatureForm({ initialExpanded = false }: ModelFeatureFormProps) { - const [activeKey, setActiveKey] = useState([]); - - const tooltipStyle = { - overlayInnerStyle: { - backgroundColor: '#000', - color: '#fff', - } - }; - - useEffect(() => { - setActiveKey(initialExpanded ? 
['1'] : []); - }, [initialExpanded]); +const tooltipStyle = { + overlayInnerStyle: { + backgroundColor: '#000', + color: '#fff', + } +}; +export default function ModelFeatureForm() { return ( - setActiveKey(keys as string[])} - style={{ marginBottom: 16 }} - > - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + <> + 模型参数 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ); } - diff --git a/himarket-web/himarket-admin/src/components/api-product/SkillConfigForm.tsx b/himarket-web/himarket-admin/src/components/api-product/SkillConfigForm.tsx new file mode 100644 index 000000000..d130ed809 --- /dev/null +++ b/himarket-web/himarket-admin/src/components/api-product/SkillConfigForm.tsx @@ -0,0 +1,32 @@ +import { Form, Select, Tag } from "antd"; + +/** + * 技能配置表单组件 + * 包含技能标签多选输入,用于 AGENT_SKILL 类型产品的配置 + */ +export default function SkillConfigForm() { + return ( + + - {searchTypeOptions.map(option => ( - - {option.label} - - ))} - - - {/* 分隔线 */} -
- - {/* 中间:搜索值输入框或选择框 */} - {searchType === 'type' ? ( - - ) : searchType === 'category' ? ( - - ) : ( + {/* Tabs 按类型分组 */} +
+
+ + {/* 名称搜索框 */} +
setSearchValue(e.target.value)} - style={{ - flex: 1, - }} + placeholder="搜索产品名称" + value={searchInput} + onChange={(e) => setSearchInput(e.target.value)} onPressEnter={handleSearch} allowClear - onClear={() => setSearchValue('')} - size="large" - className="h-10 border-0 rounded-none" + onClear={handleClearSearch} + size="middle" variant="borderless" + className="border-0" /> - )} - - {/* 分隔线 */} -
- - {/* 右侧:搜索按钮 */} -
- {/* 筛选条件标签 */} - {activeFilters.length > 0 && ( -
+ {/* 当前筛选提示 */} + {nameFilter && ( +
筛选条件: - - {activeFilters.map(filter => ( - removeFilter(filter.type)} - style={{ - backgroundColor: '#f5f5f5', - border: '1px solid #d9d9d9', - borderRadius: '16px', - color: '#666', - fontSize: '12px', - padding: '4px 12px', - }} - > - {filter.label} - - ))} - -
- 清除筛选条件 -
+ 产品名称:{nameFilter} +
)} -
- {loading ? ( -
- {Array.from({ length: pagination.pageSize || 12 }).map((_, index) => ( -
-
- -
-
- - -
- - -
- - + {/* 产品列表 */} +
+ {loading ? ( +
+ {Array.from({ length: pagination.pageSize || 12 }).map((_, index) => ( +
+
+ +
+
+ + +
+ + +
-
+ ))}
- ))} -
- ) : ( - <> -
- {apiProducts.map((product) => ( - fetchApiProducts(pagination.current, pagination.pageSize, filters)} - onEdit={handleEdit} - /> - ))} -
- - {pagination.total > 0 && ( -
- `共 ${total} 条`} - pageSizeOptions={['6', '12', '24', '48']} - /> + ) : apiProducts.length === 0 ? ( +
+ +

暂无{activeTab !== 'ALL' ? ` ${getTypeLabel(activeTab)} ` : ''}产品

+ ) : ( + <> +
+ {apiProducts.map((product) => ( + fetchApiProducts(pagination.current, pagination.pageSize)} + onEdit={handleEdit} + /> + ))} +
+ {pagination.total > 0 && ( +
+ `共 ${total} 条`} + pageSizeOptions={['6', '12', '24', '48']} + /> +
+ )} + )} - - )} +
+
- ) + ); } diff --git a/himarket-web/himarket-admin/src/pages/NacosConsoles.tsx b/himarket-web/himarket-admin/src/pages/NacosConsoles.tsx index 548699d34..1c62280df 100644 --- a/himarket-web/himarket-admin/src/pages/NacosConsoles.tsx +++ b/himarket-web/himarket-admin/src/pages/NacosConsoles.tsx @@ -1,7 +1,7 @@ import { useState, useEffect } from 'react' -import { Button, Table, Modal, Form, Input, message, Select } from 'antd' +import { Button, Table, Modal, Form, Input, message, Select, Tag, Popconfirm } from 'antd' import dayjs from 'dayjs' -import { PlusOutlined } from '@ant-design/icons' +import { PlusOutlined, StarOutlined } from '@ant-design/icons' import { nacosApi } from '@/lib/api' import NacosTypeSelector, { NacosImportType } from '@/components/console/NacosTypeSelector' import ImportMseNacosModal from '@/components/console/ImportMseNacosModal' @@ -26,6 +26,13 @@ export default function NacosConsoles() { // 创建来源:OPEN_SOURCE 或 MSE(用于控制是否展示 AK/SK) const [creationMode, setCreationMode] = useState<'OPEN_SOURCE' | 'MSE' | null>(null) // 命名空间字段已移除 + // 设置默认命名空间弹窗 + const [nsModalVisible, setNsModalVisible] = useState(false) + const [nsTargetNacos, setNsTargetNacos] = useState(null) + const [nsNamespaces, setNsNamespaces] = useState([]) + const [nsLoading, setNsLoading] = useState(false) + const [nsSelectedValue, setNsSelectedValue] = useState('public') + const [nsSaving, setNsSaving] = useState(false) // 分页状态 const [currentPage, setCurrentPage] = useState(1) @@ -60,18 +67,58 @@ export default function NacosConsoles() { } } + const handleSetDefault = async (nacosId: string) => { + try { + await nacosApi.setDefaultNacos(nacosId) + message.success('已设为默认 Nacos 实例') + fetchNacosInstances() + } catch (error) { + console.error('设置默认失败:', error) + } + } + + const handleOpenNsModal = async (record: NacosInstance) => { + setNsTargetNacos(record) + setNsSelectedValue(record.defaultNamespace || 'public') + setNsModalVisible(true) + setNsLoading(true) + try { + const res = 
await nacosApi.getNamespaces(record.nacosId, { page: 1, size: 1000 }) + setNsNamespaces(res.data?.content || []) + } catch { + setNsNamespaces([]) + message.error('获取命名空间列表失败,请检查 Nacos 连接信息') + } finally { + setNsLoading(false) + } + } + + const handleSaveDefaultNs = async () => { + if (!nsTargetNacos) return + setNsSaving(true) + try { + await nacosApi.setDefaultNamespace(nsTargetNacos.nacosId, nsSelectedValue) + message.success('默认命名空间设置成功') + setNsModalVisible(false) + fetchNacosInstances() + } catch { + message.error('设置默认命名空间失败') + } finally { + setNsSaving(false) + } + } + const handleEdit = (record: NacosInstance) => { setEditingNacos(record) form.setFieldsValue({ nacosName: record.nacosName, - serverUrl: record.serverUrl, + serverUrl: record.serverUrl, username: record.username, - // 密码/AK/SK 可能不返回,这里仅在存在时回填 - password: record.password, - accessKey: record.accessKey, - secretKey: record.secretKey, - description: record.description + password: record.password, + accessKey: record.accessKey, + secretKey: record.secretKey, + description: record.description, }) setModalVisible(true) } @@ -125,8 +172,8 @@ export default function NacosConsoles() { const handleModalCancel = () => { setModalVisible(false) setEditingNacos(null) - setCreationMode(null) - setImportEndpoints({}) + setCreationMode(null) + setImportEndpoints({}) form.resetFields() } @@ -137,13 +184,28 @@ export default function NacosConsoles() { title: '实例名称', dataIndex: 'nacosName', key: 'nacosName', + render: (name: string, record: NacosInstance) => ( + + {name} + {record.isDefault && 默认} + + ), }, { title: '服务器地址', dataIndex: 'serverUrl', key: 'serverUrl', }, - // 命名空间列已移除 + { + title: '默认命名空间', + dataIndex: 'defaultNamespace', + key: 'defaultNamespace', + render: (ns: string, record: NacosInstance) => ( + + ), + }, { title: '用户名', dataIndex: 'username', @@ -153,9 +215,6 @@ export default function NacosConsoles() { title: '描述', dataIndex: 'description', key: 'description', - // render: (description: 
string) => { - // return {description || '-'} - // }, ellipsis: true, }, { @@ -170,13 +229,31 @@ export default function NacosConsoles() { { title: '操作', key: 'action', - width: 150, + width: 220, render: (_: any, record: NacosInstance) => (
- + + )} +
@@ -336,6 +413,33 @@ export default function NacosConsoles() { setImportNacosId(values.nacosId || null) }} /> + + {/* 设置默认命名空间弹窗 */} + { setNsModalVisible(false); setNsTargetNacos(null); setNsNamespaces([]) }} + okText="确认" + cancelText="取消" + confirmLoading={nsSaving} + width={480} + > +
+ 选择该 Nacos 实例的默认命名空间,新建的 Skill 将自动使用此命名空间。 +
+ { + const providerObj = providers.find(p => p.key === val); + onChange(val, providerObj); + }} + title="切换 CLI Agent" + options={providers.map(p => ({ + value: p.key, + label: p.displayName + (!p.available ? " (不可用)" : ""), + disabled: !p.available, + }))} + /> + ); +} diff --git a/himarket-web/himarket-frontend/src/components/coding/CodingInput.tsx b/himarket-web/himarket-frontend/src/components/coding/CodingInput.tsx new file mode 100644 index 000000000..fb2e66c76 --- /dev/null +++ b/himarket-web/himarket-frontend/src/components/coding/CodingInput.tsx @@ -0,0 +1,583 @@ +import { + useState, + useRef, + useCallback, + type KeyboardEvent, + type ClipboardEvent, + type DragEvent, + type ChangeEvent, +} from "react"; +import { + Send, + Square, + Paperclip, + X, + Image, + FileText, + Loader2, +} from "lucide-react"; +import { useCodingState, useActiveCodingSession } from "../../context/CodingSessionContext"; +import { SlashMenu } from "./SlashMenu"; +import { FileMentionMenu } from "./FileMentionMenu"; +import { + uploadFileToWorkspace, + fetchDirectoryTree, +} from "../../lib/utils/workspaceApi"; +import { + flattenFileTree, + filterFiles, + type FlatFileItem, +} from "../../lib/utils/fileTreeUtils"; +import type { Attachment, FilePathAttachment } from "../../types/coding-protocol"; +import type { QueuedPromptItem } from "../../context/CodingSessionContext"; + +const MAX_ATTACHMENTS = 10; +const MAX_SIZE_BYTES = 5 * 1024 * 1024; // 5MB + +// Browsers often return "" for many text file types; map common extensions explicitly +const EXT_TO_MIME: Record = { + md: "text/markdown", + mdx: "text/markdown", + txt: "text/plain", + csv: "text/csv", + json: "application/json", + yaml: "application/x-yaml", + yml: "application/x-yaml", + toml: "application/toml", + xml: "application/xml", + sql: "application/sql", + graphql: "application/graphql", + sh: "application/x-sh", + bash: "application/x-sh", +}; + +function inferMimeType(file: File): string { + if (file.type) 
return file.type; + const ext = file.name.split(".").pop()?.toLowerCase() ?? ""; + return EXT_TO_MIME[ext] ?? "application/octet-stream"; +} + +let _attId = 0; +function nextAttId(): string { + return `att-${++_attId}-${Date.now()}`; +} + +interface CodingInputProps { + onSend: ( + text: string, + attachments?: Attachment[] + ) => + | { queued: true; queuedPromptId?: string } + | { queued: false; requestId?: string | number }; + onSendQueued?: (queuedPromptId?: string) => void; + onDropQueuedPrompt: (promptId: string) => void; + onCancel: () => void; + isProcessing: boolean; + queueSize: number; + queuedPrompts: QueuedPromptItem[]; + disabled: boolean; + variant?: "default" | "welcome"; + /** Extra elements rendered in the welcome toolbar, after the attachment button */ + toolbarExtra?: React.ReactNode; +} + +export function CodingInput({ + onSend, + onSendQueued, + onDropQueuedPrompt, + onCancel, + isProcessing, + queueSize, + queuedPrompts, + disabled, + variant = "default", + toolbarExtra, +}: CodingInputProps) { + const [text, setText] = useState(""); + const [showSlash, setShowSlash] = useState(false); + const [showMentionMenu, setShowMentionMenu] = useState(false); + const [mentionFilter, setMentionFilter] = useState(""); + const [flatFiles, setFlatFiles] = useState([]); + const [filesLoading, setFilesLoading] = useState(false); + const [attachments, setAttachments] = useState([]); + const [uploading, setUploading] = useState(false); + const [dragOver, setDragOver] = useState(false); + const [mentionedFiles, setMentionedFiles] = useState([]); + const inputRef = useRef(null); + const fileInputRef = useRef(null); + const state = useCodingState(); + const activeQuest = useActiveCodingSession(); + + // Upload files to backend and create FilePathAttachment entries + const addFiles = useCallback( + async (files: FileList | File[]) => { + const fileArray = Array.from(files); + if (fileArray.length === 0) return; + + const remaining = MAX_ATTACHMENTS - 
attachments.length; + if (remaining <= 0) return; + const toProcess = fileArray + .slice(0, remaining) + .filter(f => f.size <= MAX_SIZE_BYTES); + + if (toProcess.length === 0) return; + + setUploading(true); + const newAttachments: FilePathAttachment[] = []; + for (const file of toProcess) { + try { + const serverPath = await uploadFileToWorkspace(file); + const isImage = file.type.startsWith("image/"); + newAttachments.push({ + id: nextAttId(), + kind: "file_path", + name: file.name, + filePath: serverPath, + mimeType: inferMimeType(file), + previewUrl: isImage ? URL.createObjectURL(file) : undefined, + }); + } catch { + // skip failed files + } + } + setUploading(false); + if (newAttachments.length > 0) { + setAttachments(prev => [...prev, ...newAttachments]); + } + }, + [attachments.length] + ); + + const removeAttachment = useCallback((id: string) => { + setAttachments(prev => { + const att = prev.find(a => a.id === id); + if (att && att.previewUrl) { + URL.revokeObjectURL(att.previewUrl); + } + return prev.filter(a => a.id !== id); + }); + }, []); + + // Load file tree on first "@" trigger + const loadFileTree = useCallback(async () => { + if (flatFiles.length > 0 || !activeQuest?.cwd) return; + + setFilesLoading(true); + try { + const tree = await fetchDirectoryTree(activeQuest.cwd, 10); + const flattened = flattenFileTree(tree ?? 
[], activeQuest.cwd); + setFlatFiles(flattened); + } catch { + setFlatFiles([]); + } finally { + setFilesLoading(false); + } + }, [flatFiles.length, activeQuest?.cwd]); + + const removeMention = useCallback((path: string) => { + setMentionedFiles(prev => prev.filter(f => f.path !== path)); + }, []); + + const handleSend = useCallback(() => { + const trimmed = text.trim(); + if (!trimmed && attachments.length === 0 && mentionedFiles.length === 0) return; + + // Convert mentioned files to resource_link attachments + const mentionAttachments: FilePathAttachment[] = mentionedFiles.map(file => ({ + id: nextAttId(), + kind: "file_path" as const, + name: file.name, + filePath: file.path, + mimeType: file.extension ? `text/${file.extension}` : "text/plain", + })); + + const allAttachments = [...mentionAttachments, ...attachments]; + + const result = onSend( + trimmed, + allAttachments.length > 0 ? allAttachments : undefined + ); + if (result.queued) { + onSendQueued?.(result.queuedPromptId); + } + setText(""); + setShowSlash(false); + setShowMentionMenu(false); + setAttachments([]); + setMentionedFiles([]); + }, [text, attachments, mentionedFiles, onSend, onSendQueued]); + + const handleKeyDown = (e: KeyboardEvent) => { + // Let SlashMenu or FileMentionMenu handle navigation when open + if ( + (showSlash || showMentionMenu) && + ["ArrowDown", "ArrowUp", "Enter", "Tab"].includes(e.key) + ) { + return; + } + + if (e.key === "Enter" && !e.shiftKey) { + e.preventDefault(); + handleSend(); + } else if (e.key === "Escape") { + if (showSlash) { + e.preventDefault(); + setShowSlash(false); + } else if (showMentionMenu) { + e.preventDefault(); + setShowMentionMenu(false); + setMentionFilter(""); + } + } + }; + + const handleChange = (value: string) => { + setText(value); + + // Check for slash command (only at start) + const isSlashCommand = value === "/" || (value.startsWith("/") && !value.includes(" ")); + setShowSlash(isSlashCommand); + + // Check for "@" mention (at end of 
input) + const mentionMatch = value.match(/@(\S*)$/); + if (mentionMatch && !isSlashCommand) { + setShowMentionMenu(true); + setMentionFilter(mentionMatch[1]); + loadFileTree(); + } else { + setShowMentionMenu(false); + setMentionFilter(""); + } + }; + + const handleCommandSelect = (name: string) => { + setText("/" + name + " "); + setShowSlash(false); + inputRef.current?.focus(); + }; + + const handleFileSelect = useCallback( + (file: FlatFileItem) => { + // Remove "@query" from text — the file chip provides the visual reference + const newText = text.replace(/@\S*$/, ""); + setText(newText); + setShowMentionMenu(false); + setMentionFilter(""); + + // Add to mentioned files if not already present + setMentionedFiles(prev => { + if (prev.some(f => f.path === file.path)) return prev; + return [...prev, file]; + }); + + inputRef.current?.focus(); + }, + [text] + ); + + const handlePaste = (e: ClipboardEvent) => { + const items = e.clipboardData?.files; + if (items && items.length > 0) { + const hasImage = Array.from(items).some(f => f.type.startsWith("image/")); + if (hasImage) { + e.preventDefault(); + addFiles(items); + } + } + }; + + const handleDragOver = (e: DragEvent) => { + e.preventDefault(); + setDragOver(true); + }; + + const handleDragLeave = (e: DragEvent) => { + e.preventDefault(); + setDragOver(false); + }; + + const handleDrop = (e: DragEvent) => { + e.preventDefault(); + setDragOver(false); + const files = e.dataTransfer?.files; + if (files && files.length > 0) { + addFiles(files); + } + }; + + const handleFileChange = (e: ChangeEvent) => { + const files = e.target.files; + if (files && files.length > 0) { + addFiles(files); + } + // reset so same file can be selected again + e.target.value = ""; + }; + + const canSend = + !disabled && + !uploading && + (text.trim().length > 0 || attachments.length > 0 || mentionedFiles.length > 0); + + return ( +
+ {isProcessing && ( +
+
+
+ )} + {showSlash && state.commands.length > 0 && ( + + )} + {showMentionMenu && ( + + )} + + {/* Mentioned file chips (from @ references) */} + {mentionedFiles.length > 0 && ( +
+ {mentionedFiles.map(file => ( + + + + {file.name} + + + + ))} +
+ )} + + {/* Attachment preview strip */} + {(attachments.length > 0 || uploading) && ( +
+ {attachments.map(att => + att.previewUrl ? ( +
+ {att.name} + +
+ ) : ( +
+ {att.mimeType?.startsWith("image/") ? ( + + ) : ( + + )} + + {att.name} + + +
+ ) + )} + {uploading && ( +
+ + 上传中... +
+ )} +
+ )} + + {queuedPrompts.length > 0 && ( +
+
+ 队列中 {queueSize} 条消息 +
+
+ {queuedPrompts.map(item => ( +
+ + {item.text || "[仅附件]"} + + +
+ ))} +
+
+ )} + + {variant === "welcome" ? ( + /* Welcome 模式布局 */ + <> +