From 6d1509018d59edd40452708271625e211cfed478 Mon Sep 17 00:00:00 2001 From: coji Date: Sun, 11 Jan 2026 20:29:59 +0900 Subject: [PATCH] =?UTF-8?q?feat:=20=E3=83=93=E3=83=AB=E3=83=89=E3=83=91?= =?UTF-8?q?=E3=82=A4=E3=83=97=E3=83=A9=E3=82=A4=E3=83=B3=E3=81=AE=E6=9C=80?= =?UTF-8?q?=E9=81=A9=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ドキュメントJSON生成とOGP画像生成を並列化 (concurrency: 10) - BM25インデックスは生成済みJSONから構築(MD再処理を削除) - 未使用のPagefind生成を削除 - unstable_concurrencyでプリレンダリングを並列化 - 不要なbuild-index.tsを削除し、build-docs.tsに統合 ビルド時間: 50秒 → 46秒 (約8%改善) Co-Authored-By: Claude Opus 4.5 --- apps/react-router/package.json | 2 +- apps/react-router/react-router.config.ts | 33 +++-- apps/react-router/scripts/build-index.ts | 26 ++-- packages/scripts/src/build-bm25-index.ts | 170 +++++++++++++++-------- packages/scripts/src/build-docs.ts | 97 +++++++++++++ packages/scripts/src/build-index.ts | 51 ------- 6 files changed, 247 insertions(+), 132 deletions(-) create mode 100644 packages/scripts/src/build-docs.ts delete mode 100644 packages/scripts/src/build-index.ts diff --git a/apps/react-router/package.json b/apps/react-router/package.json index 2326160..95c5b88 100644 --- a/apps/react-router/package.json +++ b/apps/react-router/package.json @@ -4,7 +4,7 @@ "sideEffects": false, "type": "module", "scripts": { - "build": "run-s build:*", + "build": "run-s build:index build:app", "build:index": "tsx scripts/build-index.ts", "build:app": "react-router build", "dev": "react-router dev", diff --git a/apps/react-router/react-router.config.ts b/apps/react-router/react-router.config.ts index b84bdfd..0fbdfb8 100644 --- a/apps/react-router/react-router.config.ts +++ b/apps/react-router/react-router.config.ts @@ -1,22 +1,27 @@ import type { Config } from '@react-router/dev/config' import { buildMenu } from '@remix-docs-ja/scripts/services/menu' -export default { - ssr: false, - prerender: async () => { - const paths = ['/', '/sitemap.xml', '/resources/search'] - const categories = await buildMenu() - for (const category of categories) { - for (const doc of category.children) { - if (doc.children.length === 0) { - paths.push(doc.slug) - } +const getPrerenderPaths = async () => { + const paths = ['/', '/sitemap.xml', '/resources/search'] + const categories = await buildMenu() + for (const category of categories) { + for (const doc of category.children) { + if (doc.children.length === 0) { + paths.push(doc.slug) + } - for (const subDoc of doc.children) { - paths.push(subDoc.slug) - } + for (const subDoc of doc.children) { + paths.push(subDoc.slug) } } - return paths + } + return paths +} + +export default { + ssr: false, + prerender: { + paths: getPrerenderPaths, + unstable_concurrency: 10, }, } satisfies Config diff --git a/apps/react-router/scripts/build-index.ts b/apps/react-router/scripts/build-index.ts index 3ef47bf..9ad4e94 100644 --- a/apps/react-router/scripts/build-index.ts +++ b/apps/react-router/scripts/build-index.ts @@ -1,16 +1,26 @@ -import { buildBM25Index } from '@remix-docs-ja/scripts/build-bm25-index' -import { buildIndex } from '@remix-docs-ja/scripts/build-index' +import { buildBM25IndexFromDocs } from '@remix-docs-ja/scripts/build-bm25-index' +import { buildDocs } from '@remix-docs-ja/scripts/build-docs' import { buildMenus } from '@remix-docs-ja/scripts/build-menu' import { join } from 'node:path' -// Build Pagefind index (legacy) -await buildIndex('react-router-v7') +const docsPath = join(process.cwd(), 'docs') +const productId = 'react-router-v7' -// Build BM25 index (new) -await buildBM25Index({ - docsPath: join(process.cwd(), '../../docs/react-router-v7'), +// Build docs JSON and OGP images in parallel +const docs = await buildDocs({ + docsPath, + outputPath: join(process.cwd(), 'prebuild/docs'), + ogpOutputPath: join(process.cwd(), 'public/ogp'), + productId, + concurrency: 10, +}) + +// Build BM25 index from the already-processed docs (no MD re-processing) +await buildBM25IndexFromDocs({ + docs, outputPath: join(process.cwd(), 'public/search-index'), - product: 'react-router-v7', + product: productId, }) +// Build menus await buildMenus() diff --git a/packages/scripts/src/build-bm25-index.ts b/packages/scripts/src/build-bm25-index.ts index 6ab431c..0373079 100644 --- a/packages/scripts/src/build-bm25-index.ts +++ b/packages/scripts/src/build-bm25-index.ts @@ -1,15 +1,15 @@ #!/usr/bin/env node /** - * Build BM25 search index from documentation markdown files + * Build BM25 search index from pre-built JSON documents */ import glob from 'fast-glob' import { readFileSync, writeFileSync } from 'node:fs' import { mkdir } from 'node:fs/promises' import { join } from 'node:path' +import type { BuiltDoc } from './build-docs.js' import { BM25SearchEngine, type Document } from './services/bm25.js' -import { processMarkdown } from './services/md.server.js' interface BuildOptions { docsPath: string @@ -17,11 +17,16 @@ interface BuildOptions { product: 'react-router-v7' | 'remix' } +interface BuildFromDocsOptions { + docs: BuiltDoc[] + outputPath: string + product: 'react-router-v7' | 'remix' +} + /** * Extract plain text from HTML content */ function extractTextFromHtml(html: string): string { - // Remove HTML tags and decode entities return html .replace(/)<[^<]*)*<\/script>/gi, '') .replace(/)<[^<]*)*<\/style>/gi, '') @@ -39,57 +44,51 @@ function extractTextFromHtml(html: string): string { /** * Generate document sections for better granular search */ - -// biome-ignore lint/suspicious/noExplicitAny: doc is processed from markdown -function generateDocumentSections(doc: any): Document[] { +function generateDocumentSections( + pathname: string, + title: string, + html: string, +): Document[] { const documents: Document[] = [] - // Remove product prefix from path (e.g., react-router-v7/ or remix/) - const cleanPath = doc.attributes.slug.replace( - /^(react-router-v7|remix)\//, - '', - ) - const baseDoc = { - id: cleanPath, - title: doc.attributes.title, - path: `/${cleanPath}`, + id: pathname, + title, + path: `/${pathname}`, section: undefined, } // Main document - const mainContent = extractTextFromHtml(doc.html) + const mainContent = extractTextFromHtml(html) documents.push({ ...baseDoc, - content: `${doc.attributes.title} ${mainContent}`, + content: `${title} ${mainContent}`, tokens: [], length: 0, }) // If the document has sections (h2, h3), create separate documents for them const sectionRegex = /]*id="([^"]*)"[^>]*>([^<]*)<\/h[23]>/g - // biome-ignore lint/suspicious/noImplicitAnyLet: match is used in a loop - let match + let match: RegExpExecArray | null let lastIndex = 0 // biome-ignore lint/suspicious/noAssignInExpressions: match is used in a loop - while ((match = sectionRegex.exec(doc.html)) !== null) { - const [fullMatch, _level, id, title] = match + while ((match = sectionRegex.exec(html)) !== null) { + const [fullMatch, _level, id, sectionTitle] = match const sectionStart = match.index // Extract content between sections if (lastIndex < sectionStart) { - const sectionHtml = doc.html.slice(lastIndex, sectionStart) + const sectionHtml = html.slice(lastIndex, sectionStart) const sectionContent = extractTextFromHtml(sectionHtml) if (sectionContent.length > 50) { - // Only include substantial content documents.push({ ...baseDoc, - id: `${cleanPath}#${id}`, - path: `/${cleanPath}#${id}`, - section: title, - content: `${title} ${sectionContent}`, + id: `${pathname}#${id}`, + path: `/${pathname}#${id}`, + section: sectionTitle, + content: `${sectionTitle} ${sectionContent}`, tokens: [], length: 0, }) @@ -103,43 +102,102 @@ function generateDocumentSections(doc: any): Document[] { } /** - * Build BM25 index for a specific product + * Build BM25 index from pre-built docs (faster, no MD processing) + */ +export async function buildBM25IndexFromDocs( + options: BuildFromDocsOptions, +): Promise { + const { docs, outputPath, product } = options + + console.log(`🔍 Building BM25 index for ${product}...`) + + const engine = new BM25SearchEngine() + await engine.initialize() + + const allDocuments: Document[] = [] + + for (const { pathname, doc } of docs) { + if (doc.attributes.hidden) continue + + const documents = generateDocumentSections( + pathname, + String(doc.attributes.title), + doc.html, + ) + allDocuments.push(...documents) + } + + console.log(`🏗️ Building index for ${allDocuments.length} documents...`) + + const index = engine.buildIndex(allDocuments) + + await mkdir(join(outputPath, 'bm25'), { recursive: true }) + + const indexData = engine.serializeIndex() + writeFileSync(join(outputPath, 'bm25', 'index.json'), indexData, 'utf-8') + + const metadata = { + totalDocuments: index.totalDocuments, + averageDocumentLength: index.averageDocumentLength, + buildTime: new Date().toISOString(), + product, + version: '1.0.0', + } + + writeFileSync( + join(outputPath, 'bm25', 'metadata.json'), + JSON.stringify(metadata, null, 2), + 'utf-8', + ) + + console.log('✅ BM25 index built successfully!') + console.log(` 📊 Total documents: ${index.totalDocuments}`) + console.log( + ` 📏 Average length: ${Math.round(index.averageDocumentLength)} tokens`, + ) +} + +/** + * Build BM25 index from JSON files (for standalone use) */ export async function buildBM25Index(options: BuildOptions): Promise { - console.log(`🔍 Building BM25 index for ${options.product}...`) + const { docsPath, outputPath, product } = options + + console.log(`🔍 Building BM25 index for ${product} from JSON...`) const engine = new BM25SearchEngine() await engine.initialize() - // Collect all markdown files - const pattern = join(options.docsPath, '**/*.md') + const pattern = join(docsPath, '**/*.json') const files = await glob(pattern) - console.log(`📚 Found ${files.length} documentation files`) + console.log(`📚 Found ${files.length} JSON files`) const allDocuments: Document[] = [] - // Process each file for (const file of files) { try { const content = readFileSync(file, 'utf-8') - const slug = file.replace(/^.*\/docs\//, '').replace(/\.md$/, '') - const doc = await processMarkdown(content) + const doc = JSON.parse(content) - // Add slug to attributes for compatibility - doc.attributes.slug = slug + if (doc.attributes?.hidden) continue - // Skip hidden documents - if (doc.attributes.hidden) { - continue - } + const pathname = file + .replace( + new RegExp(`^${docsPath.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}/`), + '', + ) + .replace(/\.json$/, '') - // Generate document sections - const documents = generateDocumentSections(doc) + const documents = generateDocumentSections( + pathname, + String(doc.attributes?.title || pathname), + doc.html || '', + ) allDocuments.push(...documents) console.log( - `📄 Processed: ${doc.attributes.title} (${documents.length} sections)`, + `📄 Processed: ${doc.attributes?.title || pathname} (${documents.length} sections)`, ) } catch (error) { console.warn(`⚠️ Failed to process ${file}:`, error) @@ -148,31 +206,23 @@ export async function buildBM25Index(options: BuildOptions): Promise { console.log(`🏗️ Building index for ${allDocuments.length} documents...`) - // Build the index const index = engine.buildIndex(allDocuments) - // Create output directory - await mkdir(join(options.outputPath, 'bm25'), { recursive: true }) + await mkdir(join(outputPath, 'bm25'), { recursive: true }) - // Save index const indexData = engine.serializeIndex() - writeFileSync( - join(options.outputPath, 'bm25', 'index.json'), - indexData, - 'utf-8', - ) + writeFileSync(join(outputPath, 'bm25', 'index.json'), indexData, 'utf-8') - // Save metadata separately for faster loading const metadata = { totalDocuments: index.totalDocuments, averageDocumentLength: index.averageDocumentLength, buildTime: new Date().toISOString(), - product: options.product, + product, version: '1.0.0', } writeFileSync( - join(options.outputPath, 'bm25', 'metadata.json'), + join(outputPath, 'bm25', 'metadata.json'), JSON.stringify(metadata, null, 2), 'utf-8', ) @@ -182,7 +232,6 @@ export async function buildBM25Index(options: BuildOptions): Promise { console.log( ` 📏 Average length: ${Math.round(index.averageDocumentLength)} tokens`, ) - console.log(` 💾 Output: ${options.outputPath}/bm25/`) } /** @@ -196,7 +245,12 @@ async function main() { process.exit(1) } - const docsPath = join(process.cwd(), '../../docs', product) + const docsPath = join( + process.cwd(), + '../../apps', + product === 'react-router-v7' ? 'react-router' : 'remix', + 'prebuild/docs', + ) const outputPath = join( process.cwd(), '../../apps', diff --git a/packages/scripts/src/build-docs.ts b/packages/scripts/src/build-docs.ts new file mode 100644 index 0000000..9740935 --- /dev/null +++ b/packages/scripts/src/build-docs.ts @@ -0,0 +1,97 @@ +import glob from 'fast-glob' +import fs from 'node:fs/promises' +import path from 'node:path' +import { getDoc } from './services/document' +import { getOgpImageResponse } from './services/ogp-image.server' + +interface BuildDocsOptions { + docsPath: string + outputPath: string + ogpOutputPath: string + productId: string + concurrency?: number +} + +export interface BuiltDoc { + pathname: string + doc: { + attributes: { title: string; hidden?: boolean; [key: string]: unknown } + raw: string + html: string + headings: { headingLevel: string; html: string; slug: string | undefined }[] + } +} + +/** + * Build docs JSON and OGP images in parallel + * Returns the built docs for use by search index builders + */ +export const buildDocs = async ( + options: BuildDocsOptions, +): Promise => { + const { + docsPath, + outputPath, + ogpOutputPath, + productId, + concurrency = 10, + } = options + + const files = await glob(path.join(docsPath, '/**/*.md')) + console.log(`📚 Building ${files.length} documents...`) + + const results: BuiltDoc[] = [] + const docsPathRegex = new RegExp( + `^${docsPath.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}/`, + ) + + // Process in batches for concurrency + for (let i = 0; i < files.length; i += concurrency) { + const batch = files.slice(i, i + concurrency) + const batchResults = await Promise.all( + batch.map(async (filename) => { + const pathname = filename + .replace(docsPathRegex, '') + .replace(/\.md$/, '') + + try { + const doc = await getDoc(pathname, { productId }) + if (!doc) { + console.log(`⚠️ doc not found: ${pathname}`) + return null + } + + // Write JSON + const jsonFilename = path.join(outputPath, `${pathname}.json`) + await fs.mkdir(path.dirname(jsonFilename), { recursive: true }) + await fs.writeFile( + jsonFilename, + JSON.stringify(doc, null, 2), + 'utf-8', + ) + + // Generate OGP image + const ogpImage = await getOgpImageResponse(productId, pathname) + const ogpFilename = path.join(ogpOutputPath, `${pathname}.png`) + await fs.mkdir(path.dirname(ogpFilename), { recursive: true }) + await fs.writeFile( + ogpFilename, + Buffer.from(await ogpImage.arrayBuffer()), + ) + + console.log(`✓ ${pathname}`) + + return { pathname, doc } as BuiltDoc + } catch (error) { + console.error(`❌ Failed: ${pathname}`, error) + return null + } + }), + ) + + results.push(...batchResults.filter((r): r is BuiltDoc => r !== null)) + } + + console.log(`✅ Built ${results.length} documents`) + return results +} diff --git a/packages/scripts/src/build-index.ts b/packages/scripts/src/build-index.ts deleted file mode 100644 index 36a2e7e..0000000 --- a/packages/scripts/src/build-index.ts +++ /dev/null @@ -1,51 +0,0 @@ -import glob from 'fast-glob' -import fs from 'node:fs/promises' -import path from 'node:path' -import * as pagefind from 'pagefind' -import { getDoc } from './services/document' -import { getOgpImageResponse } from './services/ogp-image.server' - -export const buildIndex = async (productId: string) => { - const { index } = await pagefind.createIndex({}) - if (!index) throw new Error('index is not created') - - const docs = await glob(path.join('docs', '/**/*.md')) - for (const filename of docs) { - const regexp = /^docs\// - const pathname = filename.replace(regexp, '').replace(/\.md$/, '') - const doc = await getDoc(pathname, { productId }) - if (!doc) { - console.log('doc not found:\n\n', pathname) - continue - } - - const jsonFilename = path.join('prebuild/docs', `${pathname}.json`) - const jsonDir = path.dirname(jsonFilename) - await fs.mkdir(jsonDir, { recursive: true }) - await fs.writeFile(jsonFilename, JSON.stringify(doc, null, 2), { - encoding: 'utf-8', - }) - console.log('doc:', jsonFilename) - - // OGP画像生成 - const ogpImage = await getOgpImageResponse(productId, pathname) - await fs.mkdir(path.dirname(`public/ogp/${pathname}`), { - recursive: true, - }) - const ogpFilename = `public/ogp/${pathname}.png` - await fs.writeFile(ogpFilename, Buffer.from(await ogpImage.arrayBuffer())) - console.log('ogp:', ogpFilename) - - // 検索インデックス追加 - if (!doc.attributes.hidden) { - await index.addCustomRecord({ - content: doc.html, - meta: { title: doc.attributes.title?.toString() }, - language: 'ja', - url: pathname, - }) - } - } - - await index.writeFiles({ outputPath: 'public/pagefind' }) -}