Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions packages/pyright-internal/src/common/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,14 @@ export function cloneStr(str: string): string {
// to ensure we get a copy of the string to prevent the original string from being retained in memory.
// For example, the import resolution cache in importResolver might hold onto the full original file content
// because the seemingly innocent import name (e.g., `foo` in `import foo`) is in the cache.

// V8 uses a SlicedString representation only for substrings of at least a small length
// threshold (currently 13 characters), so shorter strings can be returned as-is without
// retaining the original text in memory.
// https://github.com/v8/v8/blob/02558d5a88c8f06ff064e3b6b332f342e1ab6143/src/objects/string.h#L1054
if (str.length < 13) {
return str;
}

return Buffer.from(str, 'utf8').toString('utf8');
}

Expand Down
20 changes: 17 additions & 3 deletions packages/pyright-internal/src/parser/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,11 @@ export class Tokenizer {
// Assume Jupyter notebook tokenization rules?
private _useNotebookMode = false;

// Intern identifier strings within a single tokenization pass. This reduces
// per-identifier allocations while still ensuring we don't retain substrings
// that reference the original source text.
private readonly _identifierInternedStrings = new Map<string, string>();

tokenize(
text: string,
start?: number,
Expand Down Expand Up @@ -284,6 +289,7 @@ export class Tokenizer {
this._lineRanges = [];
this._indentAmounts = [];
this._useNotebookMode = useNotebookMode;
this._identifierInternedStrings.clear();

const end = start + length;

Expand Down Expand Up @@ -905,20 +911,28 @@ export class Tokenizer {

if (this._cs.position > start) {
const value = this._cs.getText().slice(start, this._cs.position);
if (_keywords.has(value)) {
const keywordType = _keywords.get(value);
if (keywordType !== undefined) {
this._tokens.push(
KeywordToken.create(start, this._cs.position - start, _keywords.get(value)!, this._getComments())
KeywordToken.create(start, this._cs.position - start, keywordType, this._getComments())
);
} else {
const internedValue = this._identifierInternedStrings.get(value) ?? this._internIdentifierString(value);
this._tokens.push(
IdentifierToken.create(start, this._cs.position - start, cloneStr(value), this._getComments())
IdentifierToken.create(start, this._cs.position - start, internedValue, this._getComments())
);
}
return true;
}
return false;
}

private _internIdentifierString(value: string) {
const clonedValue = cloneStr(value);
this._identifierInternedStrings.set(clonedValue, clonedValue);
return clonedValue;
}

private _isPossibleNumber(): boolean {
if (isDecimal(this._cs.currentChar)) {
return true;
Expand Down
Loading