Entrolution · gvonness-apolitical · Feb 3, 2026 · Feb 3, 2026
diff --git a/src/core/tumblr/playlist-parser.test.ts b/src/core/tumblr/playlist-parser.test.ts
@@ -141,6 +141,60 @@ And this (https://blog.tumblr.com/post/456/another)!
     // Should NOT contain google.com redirect
     expect(urls[0]).not.toContain('google.com');
   });
+
+  it('should extract both linked and bare plain-text URLs from HTML', () => { // fixture: one Tumblr URL inside an anchor href, one as bare paragraph text
+    const html = `
+      <html><body>
+        <p><a href="https://www.tumblr.com/user1/111/linked-post">Linked</a></p>
+        <p>https://www.tumblr.com/user2/222/bare-post</p>
+      </body></html>
+    `;
+    const urls = extractTumblrUrls(html); // the raw HTML string is passed in unmodified
+    expect(urls).toHaveLength(2); // both sources must be picked up: index 0 is the href URL, index 1 the bare text URL (markup order)
+    expect(urls[0]).toBe('https://www.tumblr.com/user1/111/linked-post');
+    expect(urls[1]).toBe('https://www.tumblr.com/user2/222/bare-post');
+  });
+
+  it('should extract plain-text URLs from HTML even when no hrefs link to tumblr', () => { // fixture: the only anchor points at example.com; the Tumblr URL exists only as paragraph text
+    const html = `
+      <html><body>
+        <a href="https://example.com">Not tumblr</a>
+        <p>https://www.tumblr.com/blogger/333/plain-text-only</p>
+      </body></html>
+    `;
+    const urls = extractTumblrUrls(html);
+    expect(urls).toHaveLength(1); // the example.com href must not be returned; only the bare Tumblr URL counts
+    expect(urls[0]).toBe('https://www.tumblr.com/blogger/333/plain-text-only');
+  });
+
+  it('should extract URLs split across span tags', () => { // fixture: a single URL whose text is broken between two adjacent span elements
+    const html = `
+      <html><body>
+        <p><span>https://www.</span><span>tumblr.com/user/444/split-span-post</span></p>
+      </body></html>
+    `;
+    const urls = extractTumblrUrls(html);
+    expect(urls).toHaveLength(1); // the two fragments must be joined into one result, not reported separately or dropped
+    expect(urls[0]).toBe('https://www.tumblr.com/user/444/split-span-post');
+  });
+
+  it('should preserve document order when linked and bare URLs are interleaved', () => { // fixture alternates: href, bare text, href, bare text
+    const html = `
+      <html><body>
+        <p><a href="https://www.tumblr.com/user1/111/first">First</a></p>
+        <p>https://www.tumblr.com/user2/222/second</p>
+        <p><a href="https://www.tumblr.com/user3/333/third">Third</a></p>
+        <p>https://www.tumblr.com/user4/444/fourth</p>
+      </body></html>
+    `;
+    const urls = extractTumblrUrls(html);
+    expect(urls).toEqual([ // exact array match: result order must follow markup order, not be grouped by href vs bare text
+      'https://www.tumblr.com/user1/111/first',
+      'https://www.tumblr.com/user2/222/second',
+      'https://www.tumblr.com/user3/333/third',
+      'https://www.tumblr.com/user4/444/fourth',
+    ]);
+  });
 });
 
 describe('extractLabelFromUrl', () => {

diff --git a/src/core/tumblr/playlist-parser.ts b/src/core/tumblr/playlist-parser.ts
@@ -34,69 +34,64 @@ export function extractTumblrUrls(text: string): string[] {
   // Pattern for username.tumblr.com URLs: https://username.tumblr.com/post/123456/slug
   const subdomainPattern = /https?:\/\/[a-zA-Z0-9_-]+\.tumblr\.com\/post\/\d+[^\s<>"'\]]*/gi;
 
-  // Check if this looks like HTML content
-  const isHtml = /<a\s|href=/i.test(text);
+  // For HTML content, resolve anchor hrefs inline so that a single plain-text
+  // pass preserves document order for both linked and bare URLs.
+  let processedText = text;
+  const isHtml = /<[a-z][\s>]/i.test(text);
 
   if (isHtml) {
-    // Extract URLs from anchor tags
-    // This handles Google Docs HTML export where URLs are in href attributes
-    // Google Docs wraps external links: https://www.google.com/url?q=https://tumblr.com/...
-    const hrefPattern = /href=["']([^"']*)["']/gi;
-    let hrefMatch;
-    while ((hrefMatch = hrefPattern.exec(text)) !== null) {
-      let url = hrefMatch[1];
-      // Decode HTML entities (Google Docs may encode ampersands)
-      url = url.replace(/&amp;/g, '&');
-
-      // Unwrap Google redirect URLs
-      if (url.includes('google.com/url')) {
-        const qMatch = url.match(/[?&]q=([^&]+)/);
-        if (qMatch) {
-          url = decodeURIComponent(qMatch[1]);
+    // Replace <a> tags whose href resolves to a Tumblr URL with the unwrapped
+    // URL as plain text (in place). Non-Tumblr anchors are left untouched so
+    // their inner content is preserved after the subsequent tag strip.
+    processedText = processedText.replace(
+      /<a\s[^>]*href=["']([^"']*)["'][^>]*>[\s\S]*?<\/a>/gi,
+      (_match, href: string) => {
+        let url = href.replace(/&amp;/g, '&');
+
+        // Unwrap Google redirect URLs
+        if (url.includes('google.com/url')) {
+          const qMatch = url.match(/[?&]q=([^&]+)/);
+          if (qMatch) {
+            url = decodeURIComponent(qMatch[1]);
+          }
         }
-      }
 
-      // Only process Tumblr URLs
-      if (!url.includes('tumblr.com')) {
-        continue;
-      }
+        if (url.includes('tumblr.com')) {
+          return ' ' + url + ' ';
+        }
 
-      const cleaned = cleanUrl(url);
-      if (cleaned && !seen.has(cleaned)) {
-        seen.add(cleaned);
-        urls.push(cleaned);
+        return _match;
       }
-    }
+    );
+
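+    // Strip remaining HTML tags (each replaced with an empty string, no separator) so URLs split across <span>s are reassembled; text of adjacent elements on the same line is joined as well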
+    processedText = processedText.replace(/<[^>]+>/g, '');
   }
 
-  // For non-HTML content, check for plain text URLs
-  // Skip this if we already found URLs from HTML to avoid duplicates from href values
-  if (!isHtml || urls.length === 0) {
-    for (const line of text.split('\n')) {
-      const trimmed = line.trim();
-
-      // Try tumblr.com pattern
-      const tumblrComMatches = trimmed.match(tumblrComPattern);
-      if (tumblrComMatches) {
-        for (const match of tumblrComMatches) {
-          // Clean up the URL - remove trailing punctuation
-          const cleaned = cleanUrl(match);
-          if (cleaned && !seen.has(cleaned)) {
-            seen.add(cleaned);
-            urls.push(cleaned);
-          }
+  // Single plain-text pass — document order is preserved
+  for (const line of processedText.split('\n')) {
+    const trimmed = line.trim();
+
+    // Try tumblr.com pattern
+    const tumblrComMatches = trimmed.match(tumblrComPattern);
+    if (tumblrComMatches) {
+      for (const match of tumblrComMatches) {
+        const cleaned = cleanUrl(match);
+        if (cleaned && !seen.has(cleaned)) {
+          seen.add(cleaned);
+          urls.push(cleaned);
         }
       }
+    }
 
-      // Try subdomain pattern
-      const subdomainMatches = trimmed.match(subdomainPattern);
-      if (subdomainMatches) {
-        for (const match of subdomainMatches) {
-          const cleaned = cleanUrl(match);
-          if (cleaned && !seen.has(cleaned)) {
-            seen.add(cleaned);
-            urls.push(cleaned);
-          }
+    // Try subdomain pattern
+    const subdomainMatches = trimmed.match(subdomainPattern);
+    if (subdomainMatches) {
+      for (const match of subdomainMatches) {
+        const cleaned = cleanUrl(match);
+        if (cleaned && !seen.has(cleaned)) {
+          seen.add(cleaned);
+          urls.push(cleaned);
         }
       }
     }