import { test, expect, describe } from 'vitest';
import { humanizeString } from './humanize-string.js';

// Regression tests for the transformations that predate the UTM / trailing-dot options.
describe('Existing functionality', () => {
  test('Whitespaces', () => {
    // NOTE(review): the expected counts below (4 / 2 / 3 / 3) only add up when the
    // fixture ends with exactly TWO trailing whitespace characters. They are spelled
    // as an explicit space + tab so tooling cannot collapse them — confirm against
    // the original fixture.
    const messy = 'Hello\u200b\xa0World! \t';

    expect(humanizeString(messy)).toStrictEqual({
      text: 'Hello World!',
      count: 4,
    });
    expect(
      humanizeString(messy, {
        transformTrailingWhitespace: false,
      }),
    ).toStrictEqual({
      text: 'Hello World! \t',
      count: 2,
    });
    expect(
      humanizeString(messy, {
        transformHidden: false,
      }),
    ).toStrictEqual({
      text: 'Hello\u200b World!',
      count: 3,
    });
    expect(
      humanizeString(messy, {
        transformNbs: false,
      }),
    ).toStrictEqual({
      text: 'Hello\xa0World!',
      count: 3,
    });
  });

  test('Dashes', () => {
    expect(humanizeString('I — super — man – 💪')).toStrictEqual({
      text: 'I - super - man - 💪',
      count: 3,
    });
  });

  test('Quotes', () => {
    // The typographic quotes are written as escapes so they cannot be silently
    // normalised to ASCII quotes, which would turn this test into a no-op.
    expect(
      humanizeString(
        'Angular \u201Cquote\u201D «marks» looks„ like Christmas «« tree',
      ),
    ).toStrictEqual({
      text: 'Angular "quote" "marks" looks" like Christmas "" tree',
      count: 7,
    });
  });
});

describe('UTM Source Removal', () => {
  test('Basic UTM source removal', () => {
    const input =
      'Check this out: https://de.wikipedia.org/wiki/BorgWarner?utm_source=chatgpt.com';
    const result = humanizeString(input, { removeUtmSource: true });

    expect(result.text).toBe(
      'Check this out: https://de.wikipedia.org/wiki/BorgWarner',
    );
    expect(result.count).toBe(1);
  });

  test('UTM source with additional parameters', () => {
    const input =
      'Visit https://example.com?param1=value1&utm_source=chatgpt.com&param2=value2';
    const result = humanizeString(input, { removeUtmSource: true });

    expect(result.text).toBe(
      'Visit https://example.com?param1=value1&param2=value2',
    );
    expect(result.count).toBe(1);
  });

  test('UTM source at the end of URL', () => {
    const input =
      'Check https://example.com?param1=value1&utm_source=chatgpt.com';
    const result = humanizeString(input, { removeUtmSource: true });

    expect(result.text).toBe('Check https://example.com?param1=value1');
    expect(result.count).toBe(1);
  });

  test('UTM source as only parameter', () => {
    const input = 'Visit https://example.com?utm_source=chatgpt.com';
    const result = humanizeString(input, { removeUtmSource: true });

    expect(result.text).toBe('Visit https://example.com');
    expect(result.count).toBe(1);
  });

  test('Multiple URLs with UTM sources', () => {
    const input =
      'First: https://site1.com?utm_source=test and second: https://site2.com?utm_source=another';
    const result = humanizeString(input, { removeUtmSource: true });

    expect(result.text).toBe(
      'First: https://site1.com and second: https://site2.com',
    );
    expect(result.count).toBe(2);
  });

  test('URL without UTM source (no change)', () => {
    const input = 'Visit https://example.com?param=value';
    const result = humanizeString(input, { removeUtmSource: true });

    expect(result.text).toBe('Visit https://example.com?param=value');
    expect(result.count).toBe(0);
  });

  test('Text without URLs (no change)', () => {
    const input = 'This is just plain text without any URLs';
    const result = humanizeString(input, { removeUtmSource: true });

    expect(result.text).toBe('This is just plain text without any URLs');
    expect(result.count).toBe(0);
  });

  test('UTM source removal disabled by default', () => {
    const input = 'Check https://example.com?utm_source=chatgpt.com';
    const result = humanizeString(input);

    expect(result.text).toBe(
      'Check https://example.com?utm_source=chatgpt.com',
    );
  });
});

describe('Trailing Dot Removal', () => {
  test('Single trailing dot', () => {
    const input = 'This is a sentence.';
    const result = humanizeString(input, { removeTrailingDot: true });

    expect(result.text).toBe('This is a sentence');
    expect(result.count).toBe(1);
  });

  test('Multiple trailing dots', () => {
    const input = 'This is a sentence...';
    const result = humanizeString(input, { removeTrailingDot: true });

    expect(result.text).toBe('This is a sentence');
    expect(result.count).toBe(3);
  });

  test('No trailing dots (no change)', () => {
    const input = 'This is a sentence';
    const result = humanizeString(input, { removeTrailingDot: true });

    expect(result.text).toBe('This is a sentence');
    expect(result.count).toBe(0);
  });

  test('Dots in middle of text (preserved)', () => {
    const input = 'This is a sentence. With more text.';
    const result = humanizeString(input, { removeTrailingDot: true });

    expect(result.text).toBe('This is a sentence. With more text');
    expect(result.count).toBe(1);
  });

  test('Only dots', () => {
    const input = '...';
    const result = humanizeString(input, { removeTrailingDot: true });

    expect(result.text).toBe('');
    expect(result.count).toBe(3);
  });

  test('Trailing dot removal disabled by default', () => {
    const input = 'This is a sentence.';
    const result = humanizeString(input);

    expect(result.text).toBe('This is a sentence.');
  });
});

describe('Combined Features', () => {
  test('UTM source removal and trailing dot removal together', () => {
    const input =
      'Check out https://example.com?utm_source=chatgpt.com for more info.';
    const result = humanizeString(input, {
      removeUtmSource: true,
      removeTrailingDot: true,
    });

    expect(result.text).toBe('Check out https://example.com for more info');
    // One removed parameter plus one removed dot.
    expect(result.count).toBe(2);
  });

  test('All transformations enabled', () => {
    const input =
      'Check — https://example.com?utm_source=test — "amazing" site...';
    const result = humanizeString(input, {
      transformDashes: true,
      transformQuotes: true,
      removeUtmSource: true,
      removeTrailingDot: true,
    });

    expect(result.text).toBe('Check - https://example.com - "amazing" site');
    expect(result.count).toBeGreaterThan(0);
  });
});

describe('Error Handling', () => {
  test('Invalid input type throws error', () => {
    expect(() => {
      // @ts-expect-error - Testing invalid input
      humanizeString(123);
    }).toThrow(TypeError);
  });

  test('Invalid options type throws error', () => {
    expect(() => {
      // @ts-expect-error - Testing invalid options
      humanizeString('test', 'invalid');
    }).toThrow(TypeError);
  });

  test('Empty string handling', () => {
    const result = humanizeString('');
    expect(result).toStrictEqual({ count: 0, text: '' });
  });

  // Renamed: the call passes `undefined`, not `null` (which the validator rejects).
  test('Undefined options handling', () => {
    const result = humanizeString('test', undefined);
    expect(result.text).toBe('test');
  });

  test('Partial options handling', () => {
    const result = humanizeString('test...', { removeTrailingDot: true });
    expect(result.text).toBe('test');
    expect(result.count).toBe(3);
  });

  // Renamed: the fixture passes an unknown key rather than a bad value for a known key.
  test('Unknown options are ignored', () => {
    const result = humanizeString('test...', {
      removeTrailingDot: true,
      // @ts-expect-error - Testing unknown option key
      invalidOption: 'not a boolean',
    });
    expect(result.text).toBe('test');
  });
});

describe('Performance and Edge Cases', () => {
  test('Large text processing', () => {
    const largeText = 'a'.repeat(10000) + '...';
    const result = humanizeString(largeText, { removeTrailingDot: true });

    expect(result.text).toBe('a'.repeat(10000));
    expect(result.count).toBe(3);
  });

  test('Complex URLs with special characters', () => {
    const input =
      'Check https://example.com/path?param1=value1&utm_source=test%20source&param2=value2#anchor';
    const result = humanizeString(input, { removeUtmSource: true });

    expect(result.text).toBe(
      'Check https://example.com/path?param1=value1&param2=value2#anchor',
    );
  });

  test('Multiple transformations with count accuracy', () => {
    // Typographic quotes and the ellipsis are escaped so the fixture really
    // contains the characters the transformations target.
    const input =
      'Text—with\u201C\u201Dspecial\u2026chars https://test.com?utm_source=x.';
    const result = humanizeString(input, {
      transformDashes: true,
      transformQuotes: true,
      transformOther: true,
      removeUtmSource: true,
      removeTrailingDot: true,
    });

    expect(result.text).toBe('Text-with""special...chars https://test.com');
    expect(result.count).toBeGreaterThan(0);
  });
});
// ============================================================================
// src/humanize-string.ts
// ============================================================================

/**
 * Switches that select which clean-ups `humanizeString` applies.
 *
 * NOTE(review): the first four members lie above the visible hunk. They are
 * reconstructed from the option keys used by the pattern table below — confirm
 * against the full file.
 */
export type HumanizeStringOptions = {
  transformHidden: boolean;
  transformTrailingWhitespace: boolean;
  transformNbs: boolean;
  transformDashes: boolean;
  transformQuotes: boolean;
  transformOther: boolean;
  keyboardOnly: boolean;
  removeUtmSource: boolean;
  removeTrailingDot: boolean;
};

// NOTE(review): the defaults of the first four entries are likewise outside the
// visible hunk; `true` is what the existing tests in this change set rely on.
const DefaultOptions: HumanizeStringOptions = {
  transformHidden: true,
  transformTrailingWhitespace: true,
  transformNbs: true,
  transformDashes: true,
  transformQuotes: true,
  transformOther: true,
  keyboardOnly: false,
  removeUtmSource: false,
  removeTrailingDot: false,
};

/**
 * Removes every `utm_source` query parameter from the http(s) URLs found in `text`.
 *
 * Compared with a plain regex replace over the whole URL this keeps:
 *  - the `#fragment` that follows the query string,
 *  - sentence punctuation (`. , ; : !`) that directly follows the URL,
 * and it also removes several consecutive `utm_source` parameters.
 *
 * @param text - Input text that may contain URLs
 * @returns The rewritten text and the number of parameters removed
 */
function removeUtmSourceFromUrls(text: string): { text: string; count: number } {
  let count = 0;

  const processedText = text.replace(/https?:\/\/[^\s<>"']+/gi, (match) => {
    // The URL pattern is greedy, so "…?utm_source=x." captures the full stop of the
    // surrounding sentence. Peel it off and re-attach it after the rewrite.
    const trailing = /[.,;:!]+$/.exec(match)?.[0] ?? '';
    const url = match.slice(0, match.length - trailing.length);

    // Only the part before '#' can hold query parameters.
    const hashAt = url.indexOf('#');
    const base = hashAt === -1 ? url : url.slice(0, hashAt);
    const fragment = hashAt === -1 ? '' : url.slice(hashAt);

    let removed = 0;
    // The separator is kept only when it is the '?' that opens the query. The '&'
    // that FOLLOWS the parameter is deliberately not consumed, so an immediately
    // following `&utm_source=` is matched as well.
    let stripped = base.replace(/([?&])utm_source=[^&]*/g, (_param, separator: string) => {
      removed++;
      return separator === '?' ? '?' : '';
    });

    if (removed === 0) {
      return match;
    }
    count += removed;

    // "?&a=b" -> "?a=b", and drop a query string that became empty.
    stripped = stripped.replace(/\?&+/, '?').replace(/[?&]+$/, '');

    return stripped + fragment + trailing;
  });

  return { text: processedText, count };
}

/**
 * Removes the run of '.' characters at the very end of `text`.
 *
 * Implemented as a backwards scan so the cost is proportional to the number of
 * trailing dots, not to the length of the text.
 *
 * @param text - Input text
 * @returns The trimmed text and the number of dots removed
 */
function removeTrailingDots(text: string): { text: string; count: number } {
  let end = text.length;
  while (end > 0 && text.charAt(end - 1) === '.') {
    end--;
  }

  return { text: text.slice(0, end), count: text.length - end };
}

/**
 * Validates the arguments of `humanizeString`.
 *
 * @param text - Value that must be a string
 * @param options - Value that must be `undefined` or a non-null object
 * @throws TypeError when either argument has the wrong type
 */
function validateInput(text: unknown, options?: unknown): void {
  if (typeof text !== 'string') {
    throw new TypeError(`Expected text to be a string, got ${typeof text}`);
  }

  if (options !== undefined && (typeof options !== 'object' || options === null)) {
    throw new TypeError(`Expected options to be an object, got ${typeof options}`);
  }
}

/**
 * Builds the effective options: defaults overridden by every user value that
 * is a real boolean for a known option key.
 *
 * Iterating the keys of `DefaultOptions` (rather than `for..in` over the user
 * object plus `key in DefaultOptions`) keeps inherited names such as `toString`
 * from leaking into the result and needs no `any` cast.
 *
 * @param userOptions - User provided options
 * @returns A fresh, fully populated options object
 */
function mergeOptions(userOptions?: Partial<HumanizeStringOptions>): HumanizeStringOptions {
  const merged: HumanizeStringOptions = { ...DefaultOptions };
  if (!userOptions) {
    return merged;
  }

  for (const key of Object.keys(DefaultOptions) as (keyof HumanizeStringOptions)[]) {
    const value = userOptions[key];
    if (typeof value === 'boolean') {
      merged[key] = value;
    }
  }

  return merged;
}

/** One replacement rule: what to match, what to insert, which option enables it. */
type PatternRule = [RegExp, string, keyof HumanizeStringOptions];

// Built on first use and then reused. All regexes are global, but they are only
// used through `String.prototype.replace`, which resets `lastIndex` itself.
let patternRules: PatternRule[] | undefined;

/** Returns the (lazily created) table of pattern-based replacements. */
function getPatternRules(): PatternRule[] {
  if (patternRules === undefined) {
    patternRules = [
      [
        new RegExp(
          `[${Constants.IGNORABLE_SYMBOLS}\u00AD\u180E\u200B-\u200F\u202A-\u202E\u2060\u2066-\u2069\uFEFF]`,
          'ug',
        ),
        '',
        'transformHidden',
      ],
      [/[ \t\x0B\f]+$/gm, '', 'transformTrailingWhitespace'],
      [/[\u00A0]/g, ' ', 'transformNbs'],
      [/[——–]/g, '-', 'transformDashes'],
      // Typographic quotes are written as escapes: literal curly quotes are easily
      // "normalised" to ASCII by tooling, which would make these rules match plain
      // quotes (inflating `count`) and stop converting the real targets.
      [/[\u201C\u201D\u00AB\u00BB\u201E]/g, '"', 'transformQuotes'],
      [/[\u2018\u2019\u02BC]/g, "'", 'transformQuotes'],
      [/[…]/g, '...', 'transformOther'],
    ];
  }
  return patternRules;
}

/**
 * Normalises typographic and invisible characters in `text` and, optionally,
 * strips `utm_source` parameters and trailing dots.
 *
 * @param text - Text to clean up
 * @param options - Overrides for the defaults; non-boolean values and unknown keys are ignored
 * @returns `text` after all enabled transformations, and `count`: the number of
 *   characters matched by the pattern rules and the keyboard-only filter, plus
 *   removed `utm_source` parameters, plus removed trailing dots
 * @throws TypeError when `text` is not a string or `options` is neither
 *   `undefined` nor an object
 */
export function humanizeString(
  text: string,
  options?: Partial<HumanizeStringOptions>,
): {
  count: number;
  text: string;
} {
  // Argument errors are programming errors and are reported to the caller.
  validateInput(text, options);

  try {
    if (text.length === 0) {
      return { count: 0, text: '' };
    }

    const use_options = mergeOptions(options);
    let count = 0;
    let processedText = text;

    for (const [regex, replacement, optionKey] of getPatternRules()) {
      if (!use_options[optionKey]) {
        continue;
      }
      // Count and replace in a single pass; a replacer function also guarantees the
      // replacement is inserted literally.
      processedText = processedText.replace(regex, (matched) => {
        count += matched.length;
        return replacement;
      });
    }

    if (use_options.keyboardOnly) {
      const pattern = `(${Patterns.MATCH_LETTER}|${Patterns.MATCH_IGNORABLE_SYMBOLS}|[0-9~\`?!@#№$€£%^&*()_\\-+={}\\[\\]\\\\ \n<>/.,:;"'|]|\\p{Emoji})`;
      const kwd_text = replaceNotMatching(pattern, '', processedText);
      count += processedText.length - kwd_text.length;
      processedText = kwd_text;
    }

    if (use_options.removeUtmSource) {
      const utmResult = removeUtmSourceFromUrls(processedText);
      processedText = utmResult.text;
      count += utmResult.count;
    }

    // Runs last so that dots exposed by the URL clean-up are removed as well.
    if (use_options.removeTrailingDot) {
      const dotResult = removeTrailingDots(processedText);
      processedText = dotResult.text;
      count += dotResult.count;
    }

    return {
      count,
      text: processedText,
    };
  } catch (error) {
    // Best effort: an unexpected failure must not lose the caller's text.
    console.error('Critical error in humanizeString:', error);
    return {
      count: 0,
      text,
    };
  }
}

// ============================================================================
// src/lib/regexp-i18n/index.ts
// ============================================================================

/** Compiled regex pair for one `replaceNotMatching` pattern. */
interface CacheRecord {
  matchRegexp: RegExp;
  validator: RegExp;
  lastUsed: number;
}

// Bounded cache. `Map` iterates in insertion order and a record is re-inserted
// on every hit, so the first key is always the least recently used one.
const MAX_CACHE_SIZE = 100;
const CACHE_TTL = 5 * 60 * 1000; // 5 minutes
const regexpCache = new Map<string, CacheRecord>();

/**
 * Validates the arguments of the regexp helpers.
 *
 * @param pattern - Must be a non-empty string
 * @param flags - Must be `undefined` or a string
 * @throws TypeError when an argument has the wrong type
 * @throws Error when `pattern` is empty
 */
function validateRegexInput(pattern: unknown, flags?: unknown): void {
  if (typeof pattern !== 'string') {
    throw new TypeError(`Expected pattern to be a string, got ${typeof pattern}`);
  }

  if (flags !== undefined && typeof flags !== 'string') {
    throw new TypeError(`Expected flags to be a string, got ${typeof flags}`);
  }

  if (pattern.length === 0) {
    throw new Error('Pattern cannot be empty');
  }

  // Only a warning: long patterns are legal, just potentially slow.
  if (pattern.length > 10000) {
    console.warn('Very long regex pattern detected, this might impact performance');
  }
}

/**
 * Validates the text argument of the string processing helpers.
 *
 * @param text - Must be a string
 * @throws TypeError when `text` is not a string
 */
function validateTextInput(text: unknown): void {
  if (typeof text !== 'string') {
    throw new TypeError(`Expected text to be a string, got ${typeof text}`);
  }
}

/**
 * Returns the cached regex pair for `pattern`, compiling it when it is missing
 * or older than `CACHE_TTL`, and evicts least recently used records so the
 * cache never holds more than `MAX_CACHE_SIZE` entries.
 *
 * @throws SyntaxError when `pattern` is not a valid regular expression
 */
function getCacheRecord(pattern: string): CacheRecord {
  const now = Date.now();
  let record = regexpCache.get(pattern);

  if (record !== undefined && now - record.lastUsed <= CACHE_TTL) {
    record.lastUsed = now;
  } else {
    record = {
      matchRegexp: createRegExp(pattern + '|.', 'g'),
      validator: createRegExp(pattern),
      lastUsed: now,
    };
  }

  // Delete + set moves the key to the "most recently used" end of the Map.
  regexpCache.delete(pattern);
  regexpCache.set(pattern, record);

  while (regexpCache.size > MAX_CACHE_SIZE) {
    const oldest = regexpCache.keys().next();
    if (oldest.done) {
      break;
    }
    regexpCache.delete(oldest.value);
  }

  return record;
}

/**
 * Creates a RegExp, adding the `u` flag when the runtime supports it.
 *
 * Failures are reported to the caller. Returning a substitute regex would
 * silently change what the caller matches.
 *
 * @param pattern - The regex pattern (non-empty)
 * @param flags - Optional flags
 * @returns RegExp instance
 * @throws TypeError when an argument has the wrong type
 * @throws Error when `pattern` is empty
 * @throws SyntaxError when `pattern` or `flags` is invalid
 */
export function createRegExp(pattern: string, flags?: string): RegExp {
  validateRegexInput(pattern, flags);

  let newFlags = flags ?? '';
  if (nativeUSupported && !newFlags.includes('u')) {
    newFlags += 'u';
  }

  return new RegExp(pattern, newFlags);
}

/**
 * Replaces every character of `text` that `pattern` does not match with
 * `replaceValue`; matched parts are kept as they are.
 *
 * This function is best effort: on invalid arguments or an invalid pattern the
 * error is logged and `text` is returned unchanged.
 *
 * @param pattern - The pattern describing what to keep
 * @param replaceValue - Value inserted for each non-matching character
 * @param text - Text to process
 * @returns Processed text
 */
export function replaceNotMatching(pattern: string, replaceValue: string, text: string): string {
  try {
    validateRegexInput(pattern);
    validateTextInput(text);

    if (typeof replaceValue !== 'string') {
      throw new TypeError(`Expected replaceValue to be a string, got ${typeof replaceValue}`);
    }

    if (text.length === 0) {
      return text;
    }

    const { matchRegexp, validator } = getCacheRecord(pattern);

    // `matchRegexp` is "pattern|.": each match is either something the pattern
    // accepts or one stray character. `validator` tells the two apart.
    return text.replace(matchRegexp, (ch) => (validator.test(ch) ? ch : replaceValue));
  } catch (error) {
    console.error('Critical error in replaceNotMatching:', error);
    // Return original text as fallback
    return text;
  }
}

/**
 * Clears the regex cache (useful for testing or memory management)
 */
export function clearCache(): void {
  regexpCache.clear();
}

/**
 * Gets cache statistics for monitoring
 *
 * @returns Current number of cached patterns, the size limit and the entry lifetime in milliseconds
 */
export function getCacheStats(): { size: number; maxSize: number; ttl: number } {
  return {
    size: regexpCache.size,
    maxSize: MAX_CACHE_SIZE,
    ttl: CACHE_TTL,
  };
}