Skip to content

Commit 950ec84

Browse files
refactor: improve paragraph chunking logic in content chunker
1 parent 91ae5bb commit 950ec84

File tree

1 file changed

+33
-17
lines changed

1 file changed

+33
-17
lines changed

build/utils/markdown-chunker.ts

Lines changed: 33 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -350,23 +350,39 @@ export function chunkContent(
350350
currentLines = 0;
351351
}
352352
}
353-
} else if (block.type === 'paragraph' && block.estimatedLines > linesPerPage) {
354-
if (currentChunk.trim()) {
355-
chunks.push(currentChunk.trim());
356-
currentChunk = '';
357-
currentLines = 0;
358-
}
359-
360-
const sentences = block.content.match(/[^.!?]+[.!?]+/g) || [block.content];
361-
for (const sentence of sentences) {
362-
const sentenceLines = Math.ceil(sentence.length / charsPerLine);
363-
if (currentLines + sentenceLines > linesPerPage && currentChunk.trim()) {
364-
chunks.push(currentChunk.trim());
365-
currentChunk = sentence;
366-
currentLines = sentenceLines;
367-
} else {
368-
currentChunk += sentence;
369-
currentLines += sentenceLines;
353+
} else if (block.type === 'paragraph' || block.type === 'blockquote') {
354+
const remainingSpace = linesPerPage - currentLines;
355+
const quotePrefix = block.type === 'blockquote' ? '> ' : '';
356+
const contentWithoutPrefix = block.type === 'blockquote'
357+
? block.content.replace(/^>\s?/gm, '')
358+
: block.content;
359+
360+
if (remainingSpace >= block.estimatedLines) {
361+
currentChunk += (currentChunk ? '\n\n' : '') + block.content;
362+
currentLines += block.estimatedLines;
363+
} else {
364+
const sentences = contentWithoutPrefix.match(/[^.!?]+[.!?]+|[^.!?]+$/g) || [contentWithoutPrefix];
365+
let isFirstSentence = true;
366+
for (const sentence of sentences) {
367+
const fullSentence = isFirstSentence && block.type === 'blockquote'
368+
? quotePrefix + sentence.trim()
369+
: sentence.trim();
370+
const sentenceLines = Math.ceil(fullSentence.length / charsPerLine) + 1;
371+
if (currentLines + sentenceLines > linesPerPage && currentChunk.trim()) {
372+
chunks.push(currentChunk.trim());
373+
currentChunk = fullSentence;
374+
currentLines = sentenceLines;
375+
} else {
376+
if (currentChunk && !isFirstSentence) {
377+
currentChunk += ' ' + fullSentence;
378+
} else if (currentChunk) {
379+
currentChunk += '\n\n' + fullSentence;
380+
} else {
381+
currentChunk = fullSentence;
382+
}
383+
currentLines += sentenceLines;
384+
}
385+
isFirstSentence = false;
370386
}
371387
}
372388
} else {

0 commit comments

Comments (0)