diff --git a/src/core/tumblr/playlist-parser.test.ts b/src/core/tumblr/playlist-parser.test.ts index 2ce95d1..27debb3 100644 --- a/src/core/tumblr/playlist-parser.test.ts +++ b/src/core/tumblr/playlist-parser.test.ts @@ -141,6 +141,60 @@ And this (https://blog.tumblr.com/post/456/another)! // Should NOT contain google.com redirect expect(urls[0]).not.toContain('google.com'); }); + + it('should extract both linked and bare plain-text URLs from HTML', () => { + const html = ` + +

Linked

+

https://www.tumblr.com/user2/222/bare-post

+ + `; + const urls = extractTumblrUrls(html); + expect(urls).toHaveLength(2); + expect(urls[0]).toBe('https://www.tumblr.com/user1/111/linked-post'); + expect(urls[1]).toBe('https://www.tumblr.com/user2/222/bare-post'); + }); + + it('should extract plain-text URLs from HTML even when no hrefs link to tumblr', () => { + const html = ` + + Not tumblr +

https://www.tumblr.com/blogger/333/plain-text-only

+ + `; + const urls = extractTumblrUrls(html); + expect(urls).toHaveLength(1); + expect(urls[0]).toBe('https://www.tumblr.com/blogger/333/plain-text-only'); + }); + + it('should extract URLs split across span tags', () => { + const html = ` + +

https://www.tumblr.com/user/444/split-span-post

+ + `; + const urls = extractTumblrUrls(html); + expect(urls).toHaveLength(1); + expect(urls[0]).toBe('https://www.tumblr.com/user/444/split-span-post'); + }); + + it('should preserve document order when linked and bare URLs are interleaved', () => { + const html = ` + +

First

+

https://www.tumblr.com/user2/222/second

+

Third

+

https://www.tumblr.com/user4/444/fourth

+ + `; + const urls = extractTumblrUrls(html); + expect(urls).toEqual([ + 'https://www.tumblr.com/user1/111/first', + 'https://www.tumblr.com/user2/222/second', + 'https://www.tumblr.com/user3/333/third', + 'https://www.tumblr.com/user4/444/fourth', + ]); + }); }); describe('extractLabelFromUrl', () => { diff --git a/src/core/tumblr/playlist-parser.ts b/src/core/tumblr/playlist-parser.ts index fb95654..32330d5 100644 --- a/src/core/tumblr/playlist-parser.ts +++ b/src/core/tumblr/playlist-parser.ts @@ -34,69 +34,64 @@ export function extractTumblrUrls(text: string): string[] { // Pattern for username.tumblr.com URLs: https://username.tumblr.com/post/123456/slug const subdomainPattern = /https?:\/\/[a-zA-Z0-9_-]+\.tumblr\.com\/post\/\d+[^\s<>"'\]]*/gi; - // Check if this looks like HTML content - const isHtml = /]/i.test(text); if (isHtml) { - // Extract URLs from anchor tags - // This handles Google Docs HTML export where URLs are in href attributes - // Google Docs wraps external links: https://www.google.com/url?q=https://tumblr.com/... - const hrefPattern = /href=["']([^"']*)["']/gi; - let hrefMatch; - while ((hrefMatch = hrefPattern.exec(text)) !== null) { - let url = hrefMatch[1]; - // Decode HTML entities (Google Docs may encode ampersands) - url = url.replace(/&/g, '&'); - - // Unwrap Google redirect URLs - if (url.includes('google.com/url')) { - const qMatch = url.match(/[?&]q=([^&]+)/); - if (qMatch) { - url = decodeURIComponent(qMatch[1]); + // Replace tags whose href resolves to a Tumblr URL with the unwrapped + // URL as plain text (in place). Non-Tumblr anchors are left untouched so + // their inner content is preserved after the subsequent tag strip. + processedText = processedText.replace( + /]*href=["']([^"']*)["'][^>]*>[\s\S]*?<\/a>/gi, + (_match, href: string) => { + let url = href.replace(/&/g, '&'); + + // Unwrap Google redirect URLs + if (url.includes('google.com/url')) { + const qMatch = url.match(/[?&]q=([^&]+)/); + if (qMatch) { + url = decodeURIComponent(qMatch[1]); + } } - } - // Only process Tumblr URLs - if (!url.includes('tumblr.com')) { - continue; - } + if (url.includes('tumblr.com')) { + return ' ' + url + ' '; + } - const cleaned = cleanUrl(url); - if (cleaned && !seen.has(cleaned)) { - seen.add(cleaned); - urls.push(cleaned); + return _match; } - } + ); + + // Strip remaining HTML tags so URLs split across s are reassembled + processedText = processedText.replace(/<[^>]+>/g, ''); } - // For non-HTML content, check for plain text URLs - // Skip this if we already found URLs from HTML to avoid duplicates from href values - if (!isHtml || urls.length === 0) { - for (const line of text.split('\n')) { - const trimmed = line.trim(); - - // Try tumblr.com pattern - const tumblrComMatches = trimmed.match(tumblrComPattern); - if (tumblrComMatches) { - for (const match of tumblrComMatches) { - // Clean up the URL - remove trailing punctuation - const cleaned = cleanUrl(match); - if (cleaned && !seen.has(cleaned)) { - seen.add(cleaned); - urls.push(cleaned); - } + // Single plain-text pass — document order is preserved + for (const line of processedText.split('\n')) { + const trimmed = line.trim(); + + // Try tumblr.com pattern + const tumblrComMatches = trimmed.match(tumblrComPattern); + if (tumblrComMatches) { + for (const match of tumblrComMatches) { + const cleaned = cleanUrl(match); + if (cleaned && !seen.has(cleaned)) { + seen.add(cleaned); + urls.push(cleaned); } } + } - // Try subdomain pattern - const subdomainMatches = trimmed.match(subdomainPattern); - if (subdomainMatches) { - for (const match of subdomainMatches) { - const cleaned = cleanUrl(match); - if (cleaned && !seen.has(cleaned)) { - seen.add(cleaned); - urls.push(cleaned); - } + // Try subdomain pattern + const subdomainMatches = trimmed.match(subdomainPattern); + if (subdomainMatches) { + for (const match of subdomainMatches) { + const cleaned = cleanUrl(match); + if (cleaned && !seen.has(cleaned)) { + seen.add(cleaned); + urls.push(cleaned); } } }