Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions src/core/tumblr/playlist-parser.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,60 @@ And this (https://blog.tumblr.com/post/456/another)!
// Should NOT contain google.com redirect
expect(urls[0]).not.toContain('google.com');
});

// An anchor href and a bare URL sitting in paragraph text must both be
// returned, linked one first because it appears first in the markup.
it('should extract both linked and bare plain-text URLs from HTML', () => {
  const markup = `
<html><body>
<p><a href="https://www.tumblr.com/user1/111/linked-post">Linked</a></p>
<p>https://www.tumblr.com/user2/222/bare-post</p>
</body></html>
`;
  const extracted = extractTumblrUrls(markup);

  // Check the count before unpacking so a wrong-sized result fails on length.
  expect(extracted).toHaveLength(2);
  const [fromAnchor, fromText] = extracted;
  expect(fromAnchor).toBe('https://www.tumblr.com/user1/111/linked-post');
  expect(fromText).toBe('https://www.tumblr.com/user2/222/bare-post');
});

// The only anchor points at example.com; the Tumblr URL exists solely as
// paragraph text and must still be the single result.
it('should extract plain-text URLs from HTML even when no hrefs link to tumblr', () => {
  const markup = `
<html><body>
<a href="https://example.com">Not tumblr</a>
<p>https://www.tumblr.com/blogger/333/plain-text-only</p>
</body></html>
`;
  const extracted = extractTumblrUrls(markup);

  expect(extracted).toHaveLength(1);
  const [onlyUrl] = extracted;
  expect(onlyUrl).toBe('https://www.tumblr.com/blogger/333/plain-text-only');
});

// The URL is broken in two by adjacent <span> elements; the expected result
// is the rejoined URL, returned exactly once.
it('should extract URLs split across span tags', () => {
  const markup = `
<html><body>
<p><span>https://www.</span><span>tumblr.com/user/444/split-span-post</span></p>
</body></html>
`;
  const extracted = extractTumblrUrls(markup);

  expect(extracted).toHaveLength(1);
  const [rejoinedUrl] = extracted;
  expect(rejoinedUrl).toBe('https://www.tumblr.com/user/444/split-span-post');
});

// Alternates anchor-linked and bare-text URLs; the result must list all four
// in the same order they occur in the markup.
it('should preserve document order when linked and bare URLs are interleaved', () => {
  const markup = `
<html><body>
<p><a href="https://www.tumblr.com/user1/111/first">First</a></p>
<p>https://www.tumblr.com/user2/222/second</p>
<p><a href="https://www.tumblr.com/user3/333/third">Third</a></p>
<p>https://www.tumblr.com/user4/444/fourth</p>
</body></html>
`;
  const expectedInDocumentOrder = [
    'https://www.tumblr.com/user1/111/first',
    'https://www.tumblr.com/user2/222/second',
    'https://www.tumblr.com/user3/333/third',
    'https://www.tumblr.com/user4/444/fourth',
  ];

  expect(extractTumblrUrls(markup)).toEqual(expectedInDocumentOrder);
});
});

describe('extractLabelFromUrl', () => {
Expand Down
99 changes: 47 additions & 52 deletions src/core/tumblr/playlist-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,69 +34,64 @@ export function extractTumblrUrls(text: string): string[] {
// Pattern for username.tumblr.com URLs: https://username.tumblr.com/post/123456/slug
const subdomainPattern = /https?:\/\/[a-zA-Z0-9_-]+\.tumblr\.com\/post\/\d+[^\s<>"'\]]*/gi;

// Check if this looks like HTML content
const isHtml = /<a\s|href=/i.test(text);
// For HTML content, resolve anchor hrefs inline so that a single plain-text
// pass preserves document order for both linked and bare URLs.
let processedText = text;
const isHtml = /<[a-z][\s>]/i.test(text);

if (isHtml) {
// Extract URLs from anchor tags
// This handles Google Docs HTML export where URLs are in href attributes
// Google Docs wraps external links: https://www.google.com/url?q=https://tumblr.com/...
const hrefPattern = /href=["']([^"']*)["']/gi;
let hrefMatch;
while ((hrefMatch = hrefPattern.exec(text)) !== null) {
let url = hrefMatch[1];
// Decode HTML entities (Google Docs may encode ampersands)
url = url.replace(/&amp;/g, '&');

// Unwrap Google redirect URLs
if (url.includes('google.com/url')) {
const qMatch = url.match(/[?&]q=([^&]+)/);
if (qMatch) {
url = decodeURIComponent(qMatch[1]);
// Replace <a> tags whose href resolves to a Tumblr URL with the unwrapped
// URL as plain text (in place). Non-Tumblr anchors are left untouched so
// their inner content is preserved after the subsequent tag strip.
processedText = processedText.replace(
/<a\s[^>]*href=["']([^"']*)["'][^>]*>[\s\S]*?<\/a>/gi,
(_match, href: string) => {
let url = href.replace(/&amp;/g, '&');

// Unwrap Google redirect URLs
if (url.includes('google.com/url')) {
const qMatch = url.match(/[?&]q=([^&]+)/);
if (qMatch) {
url = decodeURIComponent(qMatch[1]);
}
}
}

// Only process Tumblr URLs
if (!url.includes('tumblr.com')) {
continue;
}
if (url.includes('tumblr.com')) {
return ' ' + url + ' ';
}

const cleaned = cleanUrl(url);
if (cleaned && !seen.has(cleaned)) {
seen.add(cleaned);
urls.push(cleaned);
return _match;
}
}
);

// Strip remaining HTML tags so URLs split across <span>s are reassembled
processedText = processedText.replace(/<[^>]+>/g, '');
}

// For non-HTML content, check for plain text URLs
// Skip this if we already found URLs from HTML to avoid duplicates from href values
if (!isHtml || urls.length === 0) {
for (const line of text.split('\n')) {
const trimmed = line.trim();

// Try tumblr.com pattern
const tumblrComMatches = trimmed.match(tumblrComPattern);
if (tumblrComMatches) {
for (const match of tumblrComMatches) {
// Clean up the URL - remove trailing punctuation
const cleaned = cleanUrl(match);
if (cleaned && !seen.has(cleaned)) {
seen.add(cleaned);
urls.push(cleaned);
}
// Single plain-text pass — document order is preserved
for (const line of processedText.split('\n')) {
const trimmed = line.trim();

// Try tumblr.com pattern
const tumblrComMatches = trimmed.match(tumblrComPattern);
if (tumblrComMatches) {
for (const match of tumblrComMatches) {
const cleaned = cleanUrl(match);
if (cleaned && !seen.has(cleaned)) {
seen.add(cleaned);
urls.push(cleaned);
}
}
}

// Try subdomain pattern
const subdomainMatches = trimmed.match(subdomainPattern);
if (subdomainMatches) {
for (const match of subdomainMatches) {
const cleaned = cleanUrl(match);
if (cleaned && !seen.has(cleaned)) {
seen.add(cleaned);
urls.push(cleaned);
}
// Try subdomain pattern
const subdomainMatches = trimmed.match(subdomainPattern);
if (subdomainMatches) {
for (const match of subdomainMatches) {
const cleaned = cleanUrl(match);
if (cleaned && !seen.has(cleaned)) {
seen.add(cleaned);
urls.push(cleaned);
}
}
}
Expand Down