From edce2a92e57713e8945d2483dbf696331d9ed48a Mon Sep 17 00:00:00 2001 From: Kamil Monicz Date: Mon, 2 Feb 2026 14:46:42 +0100 Subject: [PATCH] Avoid cloneStr for small strings and intern tokenizer identifiers --- packages/pyright-internal/src/common/core.ts | 8 ++++++++ .../pyright-internal/src/parser/tokenizer.ts | 20 ++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/packages/pyright-internal/src/common/core.ts b/packages/pyright-internal/src/common/core.ts index 296f4532fed8..24b70171e0d1 100644 --- a/packages/pyright-internal/src/common/core.ts +++ b/packages/pyright-internal/src/common/core.ts @@ -190,6 +190,14 @@ export function cloneStr(str: string): string { // to ensure we get a copy of the string to prevent the original string from being retained in memory. // For example, the import resolution cache in importResolver might hold onto the full original file content // because seemingly innocent the import name (e.g., `foo` in `import foo`) is in the cache. + + // V8 uses a SlicedString representation for substrings only above a small length threshold (currently 13), + // so short strings can be returned as-is without retaining the original text in memory. + // https://github.com/v8/v8/blob/02558d5a88c8f06ff064e3b6b332f342e1ab6143/src/objects/string.h#L1054 + if (str.length < 13) { + return str; + } + return Buffer.from(str, 'utf8').toString('utf8'); } diff --git a/packages/pyright-internal/src/parser/tokenizer.ts b/packages/pyright-internal/src/parser/tokenizer.ts index beff827182ad..562328586787 100644 --- a/packages/pyright-internal/src/parser/tokenizer.ts +++ b/packages/pyright-internal/src/parser/tokenizer.ts @@ -255,6 +255,11 @@ export class Tokenizer { // Assume Jupyter notebook tokenization rules? private _useNotebookMode = false; + // Intern identifier strings within a single tokenization pass. This reduces + // per-identifier allocations while still ensuring we don't retain substrings + // that reference the original source text. + private readonly _identifierInternedStrings = new Map(); + tokenize( text: string, start?: number, @@ -284,6 +289,7 @@ export class Tokenizer { this._lineRanges = []; this._indentAmounts = []; this._useNotebookMode = useNotebookMode; + this._identifierInternedStrings.clear(); const end = start + length; @@ -905,13 +911,15 @@ export class Tokenizer { if (this._cs.position > start) { const value = this._cs.getText().slice(start, this._cs.position); - if (_keywords.has(value)) { + const keywordType = _keywords.get(value); + if (keywordType !== undefined) { this._tokens.push( - KeywordToken.create(start, this._cs.position - start, _keywords.get(value)!, this._getComments()) + KeywordToken.create(start, this._cs.position - start, keywordType, this._getComments()) ); } else { + const internedValue = this._identifierInternedStrings.get(value) ?? this._internIdentifierString(value); this._tokens.push( - IdentifierToken.create(start, this._cs.position - start, cloneStr(value), this._getComments()) + IdentifierToken.create(start, this._cs.position - start, internedValue, this._getComments()) ); } return true; @@ -919,6 +927,12 @@ export class Tokenizer { return false; } + private _internIdentifierString(value: string) { + const clonedValue = cloneStr(value); + this._identifierInternedStrings.set(clonedValue, clonedValue); + return clonedValue; + } + private _isPossibleNumber(): boolean { if (isDecimal(this._cs.currentChar)) { return true;