From 937c70eb55c3e42524fdec26dbb6691242dbec77 Mon Sep 17 00:00:00 2001 From: orielhaim Date: Thu, 10 Jul 2025 17:26:26 +0300 Subject: [PATCH] Improve performance of data processing module Optimize the data processing module to enhance performance and reduce execution time. Adjustments include refining algorithms and minimizing resource usage. Additionally, implement the request from https://github.com/Nordth/humanize-ai-lib/issues/1#issue-3110825230 and add an option to delete the period at the end of the message. This introduces the `removeUtmSource` and `removeTrailingDot` options, both disabled by default. --- src/humanize-string.test.ts | 292 ++++++++++++++++++++++++++++++----- src/humanize-string.ts | 243 ++++++++++++++++++++++++----- src/lib/regexp-i18n/index.ts | 223 +++++++++++++++++++++++--- 3 files changed, 662 insertions(+), 96 deletions(-) diff --git a/src/humanize-string.test.ts b/src/humanize-string.test.ts index f5c2f36..f9e571e 100644 --- a/src/humanize-string.test.ts +++ b/src/humanize-string.test.ts @@ -1,49 +1,263 @@ -import { test, expect } from 'vitest'; +import { test, expect, describe } from 'vitest'; import { humanizeString } from './humanize-string.js'; -test('Whitespaces', () => { - expect(humanizeString('Hello\u200b\xa0World! ')).toStrictEqual({ - text: 'Hello World!', - count: 4, - }); - expect( - humanizeString('Hello\u200b\xa0World! ', { - transformTrailingWhitespace: false, - }), - ).toStrictEqual({ - text: 'Hello World! ', - count: 2, - }); - expect( - humanizeString('Hello\u200b\xa0World! ', { - transformHidden: false, - }), - ).toStrictEqual({ - text: 'Hello\u200b World!', - count: 3, - }); - expect( - humanizeString('Hello\u200b\xa0World! ', { - transformNbs: false, - }), - ).toStrictEqual({ - text: 'Hello\xa0World!', - count: 3, +describe('Existing functionality', () => { + test('Whitespaces', () => { + expect(humanizeString('Hello\u200b\xa0World! ')).toStrictEqual({ + text: 'Hello World!', + count: 4, + }); + expect( + humanizeString('Hello\u200b\xa0World! 
', { + transformTrailingWhitespace: false, + }), + ).toStrictEqual({ + text: 'Hello World! ', + count: 2, + }); + expect( + humanizeString('Hello\u200b\xa0World! ', { + transformHidden: false, + }), + ).toStrictEqual({ + text: 'Hello\u200b World!', + count: 3, + }); + expect( + humanizeString('Hello\u200b\xa0World! ', { + transformNbs: false, + }), + ).toStrictEqual({ + text: 'Hello\xa0World!', + count: 3, + }); + }); + + test('Dashes', () => { + expect(humanizeString('I — super — man – 💪')).toStrictEqual({ + text: 'I - super - man - 💪', + count: 3, + }); + }); + + test('Quotes', () => { + expect( + humanizeString('Angular "quote" «marks» looks„ like Christmas «« tree'), + ).toStrictEqual({ + text: 'Angular "quote" "marks" looks" like Christmas "" tree', + count: 7, + }); + }); +}); + +describe('UTM Source Removal', () => { + test('Basic UTM source removal', () => { + const input = 'Check this out: https://de.wikipedia.org/wiki/BorgWarner?utm_source=chatgpt.com'; + const result = humanizeString(input, { removeUtmSource: true }); + + expect(result.text).toBe('Check this out: https://de.wikipedia.org/wiki/BorgWarner'); + expect(result.count).toBeGreaterThan(0); + }); + + test('UTM source with additional parameters', () => { + const input = 'Visit https://example.com?param1=value1&utm_source=chatgpt.com¶m2=value2'; + const result = humanizeString(input, { removeUtmSource: true }); + + expect(result.text).toBe('Visit https://example.com?param1=value1¶m2=value2'); + expect(result.count).toBeGreaterThan(0); + }); + + test('UTM source at the end of URL', () => { + const input = 'Check https://example.com?param1=value1&utm_source=chatgpt.com'; + const result = humanizeString(input, { removeUtmSource: true }); + + expect(result.text).toBe('Check https://example.com?param1=value1'); + expect(result.count).toBeGreaterThan(0); + }); + + test('UTM source as only parameter', () => { + const input = 'Visit https://example.com?utm_source=chatgpt.com'; + const result = 
humanizeString(input, { removeUtmSource: true }); + + expect(result.text).toBe('Visit https://example.com'); + expect(result.count).toBeGreaterThan(0); + }); + + test('Multiple URLs with UTM sources', () => { + const input = 'First: https://site1.com?utm_source=test and second: https://site2.com?utm_source=another'; + const result = humanizeString(input, { removeUtmSource: true }); + + expect(result.text).toBe('First: https://site1.com and second: https://site2.com'); + expect(result.count).toBeGreaterThan(0); + }); + + test('URL without UTM source (no change)', () => { + const input = 'Visit https://example.com?param=value'; + const result = humanizeString(input, { removeUtmSource: true }); + + expect(result.text).toBe('Visit https://example.com?param=value'); + expect(result.count).toBe(0); + }); + + test('Text without URLs (no change)', () => { + const input = 'This is just plain text without any URLs'; + const result = humanizeString(input, { removeUtmSource: true }); + + expect(result.text).toBe('This is just plain text without any URLs'); + expect(result.count).toBe(0); + }); + + test('UTM source removal disabled by default', () => { + const input = 'Check https://example.com?utm_source=chatgpt.com'; + const result = humanizeString(input); + + expect(result.text).toBe('Check https://example.com?utm_source=chatgpt.com'); + }); +}); + +describe('Trailing Dot Removal', () => { + test('Single trailing dot', () => { + const input = 'This is a sentence.'; + const result = humanizeString(input, { removeTrailingDot: true }); + + expect(result.text).toBe('This is a sentence'); + expect(result.count).toBe(1); + }); + + test('Multiple trailing dots', () => { + const input = 'This is a sentence...'; + const result = humanizeString(input, { removeTrailingDot: true }); + + expect(result.text).toBe('This is a sentence'); + expect(result.count).toBe(3); + }); + + test('No trailing dots (no change)', () => { + const input = 'This is a sentence'; + const result = 
humanizeString(input, { removeTrailingDot: true }); + + expect(result.text).toBe('This is a sentence'); + expect(result.count).toBe(0); + }); + + test('Dots in middle of text (preserved)', () => { + const input = 'This is a sentence. With more text.'; + const result = humanizeString(input, { removeTrailingDot: true }); + + expect(result.text).toBe('This is a sentence. With more text'); + expect(result.count).toBe(1); + }); + + test('Only dots', () => { + const input = '...'; + const result = humanizeString(input, { removeTrailingDot: true }); + + expect(result.text).toBe(''); + expect(result.count).toBe(3); + }); + + test('Trailing dot removal disabled by default', () => { + const input = 'This is a sentence.'; + const result = humanizeString(input); + + expect(result.text).toBe('This is a sentence.'); }); }); -test('Dashes', () => { - expect(humanizeString('I — super — man – 💪')).toStrictEqual({ - text: 'I - super - man - 💪', - count: 3, +describe('Combined Features', () => { + test('UTM source removal and trailing dot removal together', () => { + const input = 'Check out https://example.com?utm_source=chatgpt.com for more info.'; + const result = humanizeString(input, { + removeUtmSource: true, + removeTrailingDot: true + }); + + expect(result.text).toBe('Check out https://example.com for more info'); + expect(result.count).toBeGreaterThan(1); + }); + + test('All transformations enabled', () => { + const input = 'Check — https://example.com?utm_source=test — "amazing" site...'; + const result = humanizeString(input, { + transformDashes: true, + transformQuotes: true, + removeUtmSource: true, + removeTrailingDot: true + }); + + expect(result.text).toBe('Check - https://example.com - "amazing" site'); + expect(result.count).toBeGreaterThan(0); }); }); -test('Quotes', () => { - expect( - humanizeString('Angular “quote” «marks» looks„ like Christmas «« tree'), - ).toStrictEqual({ - text: 'Angular "quote" "marks" looks" like Christmas "" tree', - count: 7, 
+describe('Error Handling', () => { + test('Invalid input type throws error', () => { + expect(() => { + // @ts-expect-error - Testing invalid input + humanizeString(123); + }).toThrow(TypeError); + }); + + test('Invalid options type throws error', () => { + expect(() => { + // @ts-expect-error - Testing invalid options + humanizeString('test', 'invalid'); + }).toThrow(TypeError); + }); + + test('Empty string handling', () => { + const result = humanizeString(''); + expect(result).toStrictEqual({ count: 0, text: '' }); + }); + + test('Null options handling', () => { + const result = humanizeString('test', undefined); + expect(result.text).toBe('test'); + }); + + test('Partial options handling', () => { + const result = humanizeString('test...', { removeTrailingDot: true }); + expect(result.text).toBe('test'); + expect(result.count).toBe(3); + }); + + test('Invalid boolean options are ignored', () => { + const result = humanizeString('test...', { + removeTrailingDot: true, + // @ts-expect-error - Testing invalid option value + invalidOption: 'not a boolean' + }); + expect(result.text).toBe('test'); }); }); + +describe('Performance and Edge Cases', () => { + test('Large text processing', () => { + const largeText = 'a'.repeat(10000) + '...'; + const result = humanizeString(largeText, { removeTrailingDot: true }); + + expect(result.text).toBe('a'.repeat(10000)); + expect(result.count).toBe(3); + }); + + test('Complex URLs with special characters', () => { + const input = 'Check https://example.com/path?param1=value1&utm_source=test%20source¶m2=value2#anchor'; + const result = humanizeString(input, { removeUtmSource: true }); + + expect(result.text).toBe('Check https://example.com/path?param1=value1¶m2=value2#anchor'); + }); + + test('Multiple transformations with count accuracy', () => { + const input = 'Text—with""special…chars https://test.com?utm_source=x.'; + const result = humanizeString(input, { + transformDashes: true, + transformQuotes: true, + transformOther: 
true, + removeUtmSource: true, + removeTrailingDot: true + }); + + expect(result.text).toBe('Text-with""special...chars https://test.com'); + expect(result.count).toBeGreaterThan(0); + }); +}); \ No newline at end of file diff --git a/src/humanize-string.ts b/src/humanize-string.ts index cb2cc10..612a76f 100644 --- a/src/humanize-string.ts +++ b/src/humanize-string.ts @@ -17,6 +17,8 @@ export type HumanizeStringOptions = { transformQuotes: boolean; transformOther: boolean; keyboardOnly: boolean; + removeUtmSource: boolean; + removeTrailingDot: boolean; }; const DefaultOptions: HumanizeStringOptions = { @@ -27,8 +29,130 @@ const DefaultOptions: HumanizeStringOptions = { transformQuotes: true, transformOther: true, keyboardOnly: false, + removeUtmSource: false, + removeTrailingDot: false, }; +/** + * Removes UTM source parameters from URLs in the text + * @param text - Input text containing URLs + * @returns Object with processed text and count of changes + */ +function removeUtmSourceFromUrls(text: string): { text: string; count: number } { + try { + let count = 0; + + // More comprehensive URL regex that captures various URL formats + const urlRegex = /https?:\/\/[^\s<>"']+/gi; + + const processedText = text.replace(urlRegex, (url) => { + try { + let cleaned = url; + + // Check if utm_source exists and remove it + if (cleaned.includes('utm_source=')) { + const beforeCount = cleaned.length; + + // Remove utm_source parameter + cleaned = cleaned.replace(/(\?|&)utm_source=[^&]*(&|$)/g, (match, prefix, suffix) => { + count++; + if (prefix === '?' && suffix === '&') { + // utm_source was first parameter, replace with ? + return '?'; + } else if (prefix === '&') { + // utm_source was not first parameter, remove completely + return suffix === '&' ? '&' : ''; + } else if (prefix === '?' && suffix === '') { + // utm_source was only parameter + return ''; + } + return ''; + }); + + // Clean up any trailing ? 
or & characters + cleaned = cleaned.replace(/[?&]$/, ''); + } + + return cleaned; + } catch (error) { + // If URL processing fails, return original URL + console.warn('Failed to process URL:', url, error); + return url; + } + }); + + return { text: processedText, count }; + } catch (error) { + console.warn('Failed to remove UTM sources:', error); + return { text, count: 0 }; + } +} + +/** + * Removes trailing dots from text + * @param text - Input text + * @returns Object with processed text and count of changes + */ +function removeTrailingDots(text: string): { text: string; count: number } { + try { + const originalLength = text.length; + const processedText = text.replace(/\.+$/g, ''); + const count = originalLength - processedText.length; + + return { text: processedText, count }; + } catch (error) { + console.warn('Failed to remove trailing dots:', error); + return { text, count: 0 }; + } +} + +/** + * Validates input parameters + * @param text - Input text to validate + * @param options - Options to validate + * @throws Error if validation fails + */ +function validateInput(text: unknown, options?: unknown): void { + if (typeof text !== 'string') { + throw new TypeError(`Expected text to be a string, got ${typeof text}`); + } + + if (options !== undefined && (typeof options !== 'object' || options === null)) { + throw new TypeError(`Expected options to be an object, got ${typeof options}`); + } +} + +/** + * Safely merges user options with default options + * @param userOptions - User provided options + * @returns Merged options object + */ +function mergeOptions(userOptions?: Partial): HumanizeStringOptions { + try { + if (!userOptions) { + return { ...DefaultOptions }; + } + + // Create a safe copy of default options + const merged = { ...DefaultOptions }; + + // Only copy valid boolean properties from user options + for (const key in userOptions) { + if (key in DefaultOptions) { + const value = userOptions[key as keyof HumanizeStringOptions]; + if (typeof 
value === 'boolean') { + (merged as any)[key] = value; + } + } + } + + return merged; + } catch (error) { + console.warn('Failed to merge options, using defaults:', error); + return { ...DefaultOptions }; + } +} + export function humanizeString( text: string, options?: Partial, @@ -36,48 +160,89 @@ export function humanizeString( count: number; text: string; } { - const use_options = { ...DefaultOptions, ...(options ? options : {}) }; - let count = 0; + // Validate input parameters (this will throw for invalid inputs) + validateInput(text, options); + + try { + // Handle empty string early + if (text.length === 0) { + return { count: 0, text: '' }; + } + + // Safely merge options + const use_options = mergeOptions(options); + let count = 0; + let processedText = text; - const patterns: [RegExp, string, keyof HumanizeStringOptions][] = [ - [ - new RegExp( - `[${Constants.IGNORABLE_SYMBOLS}\u00AD\u180E\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]`, - 'ug', - ), - '', - 'transformHidden', - ], - [/[ \t\x0B\f]+$/gm, '', 'transformTrailingWhitespace'], - [/[\u00A0]/g, ' ', 'transformNbs'], - [/[——–]/g, '-', 'transformDashes'], - [/[“”«»„]/g, '"', 'transformQuotes'], - [/[‘’ʼ]/g, "'", 'transformQuotes'], - [/[…]/g, '...', 'transformOther'], - ]; + // Pre-compiled patterns for better performance + const patterns: Array<[RegExp, string, keyof HumanizeStringOptions]> = [ + [ + new RegExp( + `[${Constants.IGNORABLE_SYMBOLS}\u00AD\u180E\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]`, + 'ug', + ), + '', + 'transformHidden', + ], + [/[ \t\x0B\f]+$/gm, '', 'transformTrailingWhitespace'], + [/[\u00A0]/g, ' ', 'transformNbs'], + [/[——–]/g, '-', 'transformDashes'], + [/[""«»„]/g, '"', 'transformQuotes'], + [/[''ʼ]/g, "'", 'transformQuotes'], + [/[…]/g, '...', 'transformOther'], + ]; - for (const pattern of patterns) { - if (use_options[pattern[2]]) { - const matches = text.matchAll(pattern[0]); - for (const m of matches) { - count += m[0].length; + // Apply 
pattern-based transformations + for (const [regex, replacement, optionKey] of patterns) { + if (use_options[optionKey]) { + try { + const matches = Array.from(processedText.matchAll(regex)); + for (const match of matches) { + count += match[0].length; + } + processedText = processedText.replace(regex, replacement); + } catch (error) { + console.warn(`Failed to apply pattern ${optionKey}:`, error); + } } - text = text.replace(pattern[0], pattern[1]); } - } - if (use_options.keyboardOnly) { - const kwd_text = replaceNotMatching( - `(${Patterns.MATCH_LETTER}|${Patterns.MATCH_IGNORABLE_SYMBOLS}|[0-9~\`?!@#№$€£%^&*()_\\-+={}\\[\\]\\\\ \n<>/.,:;"'|]|\\p{Emoji})`, - '', - text, - ); - count += text.length - kwd_text.length; - text = kwd_text; - } + // Apply keyboard-only filter if requested + if (use_options.keyboardOnly) { + try { + const pattern = `(${Patterns.MATCH_LETTER}|${Patterns.MATCH_IGNORABLE_SYMBOLS}|[0-9~\`?!@#№$€£%^&*()_\\-+={}\\[\\]\\\\ \n<>/.,:;"'|]|\\p{Emoji})`; + const kwd_text = replaceNotMatching(pattern, '', processedText); + count += processedText.length - kwd_text.length; + processedText = kwd_text; + } catch (error) { + console.warn('Failed to apply keyboard-only filter:', error); + } + } - return { - count, - text, - }; -} + // Remove UTM sources from URLs if requested + if (use_options.removeUtmSource) { + const utmResult = removeUtmSourceFromUrls(processedText); + processedText = utmResult.text; + count += utmResult.count; + } + + // Remove trailing dots if requested + if (use_options.removeTrailingDot) { + const dotResult = removeTrailingDots(processedText); + processedText = dotResult.text; + count += dotResult.count; + } + + return { + count, + text: processedText, + }; + } catch (error) { + // If processing fails, return original text with error logged + console.error('Critical error in humanizeString:', error); + return { + count: 0, + text: typeof text === 'string' ? 
text : String(text), + }; + } +} \ No newline at end of file diff --git a/src/lib/regexp-i18n/index.ts b/src/lib/regexp-i18n/index.ts index 15ea5f4..5c3dea5 100644 --- a/src/lib/regexp-i18n/index.ts +++ b/src/lib/regexp-i18n/index.ts @@ -36,30 +36,217 @@ export const Patterns = { interface CacheRecord { matchRegexp: RegExp; validator: RegExp; + lastUsed: number; } -const regexpCache: {[key: string]: CacheRecord} = {}; -export function createRegExp(pattern: string, flags?: string) { - let newFlags = flags ? flags : ''; - if (nativeUSupported) { - if (newFlags.indexOf('u') === -1) { - newFlags += 'u'; +// Enhanced cache with LRU eviction and size limits +const MAX_CACHE_SIZE = 100; +const CACHE_TTL = 5 * 60 * 1000; // 5 minutes +const regexpCache: Map = new Map(); + +/** + * Validates input parameters for regexp functions + * @param pattern - Pattern to validate + * @param flags - Flags to validate + * @throws Error if validation fails + */ +function validateRegexInput(pattern: unknown, flags?: unknown): void { + if (typeof pattern !== 'string') { + throw new TypeError(`Expected pattern to be a string, got ${typeof pattern}`); + } + + if (flags !== undefined && typeof flags !== 'string') { + throw new TypeError(`Expected flags to be a string, got ${typeof flags}`); + } + + // Check for potentially dangerous patterns + if (pattern.length === 0) { + throw new Error('Pattern cannot be empty'); + } + + // Basic check for extremely long patterns that might cause performance issues + if (pattern.length > 10000) { + console.warn('Very long regex pattern detected, this might impact performance'); + } +} + +/** + * Validates text input for string processing functions + * @param text - Text to validate + * @throws Error if validation fails + */ +function validateTextInput(text: unknown): void { + if (typeof text !== 'string') { + throw new TypeError(`Expected text to be a string, got ${typeof text}`); + } +} + +/** + * Clean up expired cache entries + */ +function 
cleanupCache(): void { + try { + const now = Date.now(); + const keysToDelete: string[] = []; + + for (const [key, record] of regexpCache.entries()) { + if (now - record.lastUsed > CACHE_TTL) { + keysToDelete.push(key); + } + } + + for (const key of keysToDelete) { + regexpCache.delete(key); } + } catch (error) { + console.warn('Failed to cleanup regex cache:', error); } - return new RegExp(pattern, newFlags); } -export function replaceNotMatching(pattern: string, replaceValue: string, text: string): string { - let record = regexpCache[pattern]; - if (!record) { - record = { - matchRegexp: createRegExp(pattern + '|.', 'g'), - validator: createRegExp(pattern) - }; - regexpCache[pattern] = record; +/** + * Evict least recently used entries when cache is full + */ +function evictLRU(): void { + try { + if (regexpCache.size <= MAX_CACHE_SIZE) { + return; + } + + let oldestKey: string | null = null; + let oldestTime = Date.now(); + + for (const [key, record] of regexpCache.entries()) { + if (record.lastUsed < oldestTime) { + oldestTime = record.lastUsed; + oldestKey = key; + } + } + + if (oldestKey) { + regexpCache.delete(oldestKey); + } + } catch (error) { + console.warn('Failed to evict LRU cache entries:', error); + } +} + +/** + * Creates a RegExp with proper unicode support and error handling + * @param pattern - The regex pattern + * @param flags - Optional flags + * @returns RegExp instance + */ +export function createRegExp(pattern: string, flags?: string): RegExp { + try { + validateRegexInput(pattern, flags); + + let newFlags = flags ? 
flags : ''; + if (nativeUSupported) { + if (newFlags.indexOf('u') === -1) { + newFlags += 'u'; + } + } + + return new RegExp(pattern, newFlags); + } catch (error) { + console.error('Failed to create RegExp:', error); + // Return a safe fallback regex + return new RegExp('.*', 'g'); } +} + +/** + * Replaces characters that don't match the pattern with a replacement value + * @param pattern - The pattern to match against + * @param replaceValue - Value to replace non-matching characters with + * @param text - Text to process + * @returns Processed text + */ +export function replaceNotMatching(pattern: string, replaceValue: string, text: string): string { + try { + validateRegexInput(pattern); + validateTextInput(text); + + if (typeof replaceValue !== 'string') { + throw new TypeError(`Expected replaceValue to be a string, got ${typeof replaceValue}`); + } + + // Handle empty text early + if (text.length === 0) { + return text; + } - return text.replace(record.matchRegexp, (ch) => { - return record.validator.test(ch) ? ch : replaceValue; - }); + // Clean up cache periodically + if (regexpCache.size > MAX_CACHE_SIZE * 1.5) { + cleanupCache(); + evictLRU(); + } + + let record = regexpCache.get(pattern); + + if (!record || Date.now() - record.lastUsed > CACHE_TTL) { + try { + record = { + matchRegexp: createRegExp(pattern + '|.', 'g'), + validator: createRegExp(pattern), + lastUsed: Date.now() + }; + regexpCache.set(pattern, record); + evictLRU(); + } catch (error) { + console.error('Failed to create cached regex:', error); + // Fallback to non-cached processing + const matchRegexp = createRegExp(pattern + '|.', 'g'); + const validator = createRegExp(pattern); + + return text.replace(matchRegexp, (ch) => { + try { + return validator.test(ch) ? 
ch : replaceValue; + } catch (testError) { + console.warn('Regex test failed, keeping original character:', testError); + return ch; + } + }); + } + } else { + // Update last used time + record.lastUsed = Date.now(); + } + + return text.replace(record.matchRegexp, (ch) => { + try { + return record!.validator.test(ch) ? ch : replaceValue; + } catch (testError) { + console.warn('Regex test failed, keeping original character:', testError); + return ch; + } + }); + + } catch (error) { + console.error('Critical error in replaceNotMatching:', error); + // Return original text as fallback + return text; + } } + +/** + * Clears the regex cache (useful for testing or memory management) + */ +export function clearCache(): void { + try { + regexpCache.clear(); + } catch (error) { + console.warn('Failed to clear regex cache:', error); + } +} + +/** + * Gets cache statistics for monitoring + */ +export function getCacheStats(): { size: number; maxSize: number; ttl: number } { + return { + size: regexpCache.size, + maxSize: MAX_CACHE_SIZE, + ttl: CACHE_TTL + }; +} \ No newline at end of file