Skip to content

Commit a7be152

Browse files
hotfix: a more liberal chunking algorithm that sends split items to the next page
1 parent 994a04a commit a7be152

1 file changed

Lines changed: 39 additions & 45 deletions

File tree

build/utils/markdown-chunker.ts

Lines changed: 39 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -120,24 +120,23 @@ export function splitList(block: Block, remainingBudget: number): [string, strin
120120
items.push(currentItem);
121121
}
122122

123+
const safetyBuffer = 0.8;
124+
const safeBudget = remainingBudget * safetyBuffer;
125+
123126
let usedLength = 0;
124127
let splitIndex = 0;
125128

126129
for (let i = 0; i < items.length; i++) {
127130
const itemContent = items[i].join('\n');
128131
const itemEffectiveLength = Math.ceil(itemContent.length * SCALE_FACTORS.list);
129132

130-
if (usedLength + itemEffectiveLength > remainingBudget && i > 0) {
133+
if (usedLength + itemEffectiveLength > safeBudget) {
131134
break;
132135
}
133136
usedLength += itemEffectiveLength;
134137
splitIndex = i + 1;
135138
}
136139

137-
if (splitIndex === 0) {
138-
splitIndex = 1;
139-
}
140-
141140
const firstPart = items.slice(0, splitIndex).map(item => item.join('\n')).join('\n');
142141
const secondPart = items.slice(splitIndex).map(item => item.join('\n')).join('\n');
143142

@@ -201,50 +200,45 @@ export function chunkContent(content: string, charsPerPage: number): string[] {
201200
continue;
202201
}
203202

204-
const remainingBudget = charsPerPage - currentEffectiveLength;
205-
206-
if (block.type === 'list' && block.effectiveLength > charsPerPage * 0.3) {
207-
const [firstPart, secondPart] = splitList(block, remainingBudget);
208-
209-
if (firstPart && remainingBudget > charsPerPage * 0.2) {
210-
currentChunk += (currentChunk ? '\n\n' : '') + firstPart;
211-
chunks.push(currentChunk.trim());
212-
currentChunk = '';
213-
currentEffectiveLength = 0;
214-
215-
if (secondPart) {
216-
const remainingBlock: Block = {
217-
type: 'list',
218-
content: secondPart,
219-
effectiveLength: Math.ceil(secondPart.length * SCALE_FACTORS.list),
220-
};
221-
blocks.splice(i + 1, 0, remainingBlock);
222-
}
223-
} else {
224-
if (currentChunk.trim()) {
225-
chunks.push(currentChunk.trim());
226-
}
227-
currentChunk = '';
228-
currentEffectiveLength = 0;
229-
i--;
230-
}
231-
} else if (block.type === 'code' && block.effectiveLength > charsPerPage) {
203+
if (block.type === 'list' || block.type === 'blockquote' || block.type === 'code') {
232204
if (currentChunk.trim()) {
233205
chunks.push(currentChunk.trim());
234-
currentChunk = '';
235-
currentEffectiveLength = 0;
236206
}
237207

238-
const [firstPart, secondPart] = splitCodeBlock(block, charsPerPage);
239-
chunks.push(firstPart.trim());
240-
241-
if (secondPart) {
242-
const remainingBlock: Block = {
243-
type: 'code',
244-
content: secondPart,
245-
effectiveLength: Math.ceil(secondPart.length * SCALE_FACTORS.code),
246-
};
247-
blocks.splice(i + 1, 0, remainingBlock);
208+
if (block.effectiveLength <= charsPerPage) {
209+
currentChunk = block.content;
210+
currentEffectiveLength = block.effectiveLength;
211+
} else {
212+
if (block.type === 'list') {
213+
const [firstPart, secondPart] = splitList(block, charsPerPage);
214+
chunks.push(firstPart.trim());
215+
if (secondPart) {
216+
const remainingBlock: Block = {
217+
type: 'list',
218+
content: secondPart,
219+
effectiveLength: Math.ceil(secondPart.length * SCALE_FACTORS.list),
220+
};
221+
blocks.splice(i + 1, 0, remainingBlock);
222+
}
223+
currentChunk = '';
224+
currentEffectiveLength = 0;
225+
} else if (block.type === 'code') {
226+
const [firstPart, secondPart] = splitCodeBlock(block, charsPerPage);
227+
chunks.push(firstPart.trim());
228+
if (secondPart) {
229+
const remainingBlock: Block = {
230+
type: 'code',
231+
content: secondPart,
232+
effectiveLength: Math.ceil(secondPart.length * SCALE_FACTORS.code),
233+
};
234+
blocks.splice(i + 1, 0, remainingBlock);
235+
}
236+
currentChunk = '';
237+
currentEffectiveLength = 0;
238+
} else {
239+
currentChunk = block.content;
240+
currentEffectiveLength = block.effectiveLength;
241+
}
248242
}
249243
} else if (block.type === 'paragraph' && block.effectiveLength > charsPerPage) {
250244
if (currentChunk.trim()) {

0 commit comments

Comments (0)