From 4f984f5c58c7096bd1fd08f25e0e3ce80d35bdc4 Mon Sep 17 00:00:00 2001 From: Sweets Sweetman Date: Tue, 13 Jan 2026 07:48:50 -0500 Subject: [PATCH] feat: extract conversation ID from email HTML for Superhuman replies Superhuman email client inserts <wbr> tags in link text which breaks plain text extraction. Added extractRoomIdFromHtml function as secondary fallback in getEmailRoomId to handle this case. Co-Authored-By: Claude Opus 4.5 --- .../__tests__/extractRoomIdFromHtml.test.ts | 168 ++++++++++++++++++ .../inbound/__tests__/getEmailRoomId.test.ts | 42 ++++- lib/emails/inbound/extractRoomIdFromHtml.ts | 48 +++++ lib/emails/inbound/getEmailRoomId.ts | 14 +- 4 files changed, 269 insertions(+), 3 deletions(-) create mode 100644 lib/emails/inbound/__tests__/extractRoomIdFromHtml.test.ts create mode 100644 lib/emails/inbound/extractRoomIdFromHtml.ts diff --git a/lib/emails/inbound/__tests__/extractRoomIdFromHtml.test.ts b/lib/emails/inbound/__tests__/extractRoomIdFromHtml.test.ts new file mode 100644 index 00000000..5fcf968b --- /dev/null +++ b/lib/emails/inbound/__tests__/extractRoomIdFromHtml.test.ts @@ -0,0 +1,168 @@ +import { describe, it, expect } from "vitest"; +import { extractRoomIdFromHtml } from "../extractRoomIdFromHtml"; + +describe("extractRoomIdFromHtml", () => { + describe("Superhuman reply with conversation link in quoted content", () => { + it("extracts roomId from Superhuman reply with wbr tags in link text", () => { + // This is the actual HTML from a Superhuman reply where the link text + // contains <wbr> tags for word breaking + const html = ` + + + + +
+
+
+
Send a picture of him
+

+
+
+

+
+
Sent via Superhuman

+
+

+
+
On Fri, Jan 09, 2026 at 11:59 AM, Agent by Recoup <agent@recoupable.com> wrote:
+
+
+
+

Short answer: Brian Kernighan.

+

Details: the earliest known use in computing appears in Kernighan's 1972 tutorial for the B language (the "hello, world!" example). It was then popularized by Kernighan & Ritchie's 1978 book The C Programming Language. (There are older claims—BCPL examples from the late 1960s and the exact phrase appeared as a radio catchphrase in the 1950s—but Kernighan is usually credited for putting it into programming tradition.)

+

Want the sources/links?

+ + +
+

+ Note: you can reply directly to this email to continue the conversation. +

+

+ Or continue the conversation on Recoup: + + https://chat.recoupable.com/chat/d5c473ec-04cf-4a23-a577-e0dc71542392 + +

+
+
+
+
+

+
+
+ + +`; + + const result = extractRoomIdFromHtml(html); + + expect(result).toBe("d5c473ec-04cf-4a23-a577-e0dc71542392"); + }); + }); + + describe("Gmail reply with proper threading", () => { + it("extracts roomId from Gmail reply with quoted content", () => { + const html = ` + + +

Thanks for the info!

+
+
+

Original message here

+

Continue the conversation: https://chat.recoupable.com/chat/a1b2c3d4-e5f6-7890-abcd-ef1234567890

+
+
+ + + `; + + const result = extractRoomIdFromHtml(html); + + expect(result).toBe("a1b2c3d4-e5f6-7890-abcd-ef1234567890"); + }); + }); + + describe("no conversation ID", () => { + it("returns undefined for undefined input", () => { + const result = extractRoomIdFromHtml(undefined); + + expect(result).toBeUndefined(); + }); + + it("returns undefined for empty string", () => { + const result = extractRoomIdFromHtml(""); + + expect(result).toBeUndefined(); + }); + + it("returns undefined when no chat link present", () => { + const html = "

This email has no Recoup chat link.

"; + + const result = extractRoomIdFromHtml(html); + + expect(result).toBeUndefined(); + }); + + it("returns undefined for invalid UUID format in link", () => { + const html = + 'link'; + + const result = extractRoomIdFromHtml(html); + + expect(result).toBeUndefined(); + }); + + it("returns undefined for wrong domain", () => { + const html = + 'link'; + + const result = extractRoomIdFromHtml(html); + + expect(result).toBeUndefined(); + }); + }); + + describe("edge cases", () => { + it("handles URL-encoded link in href attribute", () => { + // Resend tracking redirects URL-encode the destination + const html = + 'Click here'; + + const result = extractRoomIdFromHtml(html); + + expect(result).toBe("12345678-1234-1234-1234-123456789abc"); + }); + + it("extracts first roomId when multiple links present", () => { + const html = ` + First + Second + `; + + const result = extractRoomIdFromHtml(html); + + expect(result).toBe("aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"); + }); + + it("handles link text with wbr tags breaking up the URL", () => { + const html = ` + + https://chat.recoupable.com/chat/abcdef12-3456-7890-abcd-ef1234567890 + + `; + + const result = extractRoomIdFromHtml(html); + + expect(result).toBe("abcdef12-3456-7890-abcd-ef1234567890"); + }); + + it("handles mixed case in URL", () => { + const html = + 'link'; + + const result = extractRoomIdFromHtml(html); + + expect(result).toBe("12345678-1234-1234-1234-123456789abc"); + }); + }); +}); diff --git a/lib/emails/inbound/__tests__/getEmailRoomId.test.ts b/lib/emails/inbound/__tests__/getEmailRoomId.test.ts index 690beb59..2850f7c3 100644 --- a/lib/emails/inbound/__tests__/getEmailRoomId.test.ts +++ b/lib/emails/inbound/__tests__/getEmailRoomId.test.ts @@ -45,14 +45,54 @@ describe("getEmailRoomId", () => { }); }); + describe("secondary: extracting from email HTML", () => { + it("returns roomId from HTML when text has no chat link", async () => { + const emailContent = { + text: "No chat link in text", + html: 
'link', + headers: { references: "" }, + } as GetReceivingEmailResponseSuccess; + + const result = await getEmailRoomId(emailContent); + + expect(result).toBe("abcdef12-3456-7890-abcd-ef1234567890"); + expect(mockSelectMemoryEmails).not.toHaveBeenCalled(); + }); + + it("handles Superhuman wbr tags in HTML link text", async () => { + const emailContent = { + text: undefined, + html: 'https://chat.recoupable.com/chat/d5c473ec-04cf-4a23-a577-e0dc71542392', + headers: {}, + } as GetReceivingEmailResponseSuccess; + + const result = await getEmailRoomId(emailContent); + + expect(result).toBe("d5c473ec-04cf-4a23-a577-e0dc71542392"); + }); + + it("prioritizes text over HTML", async () => { + const emailContent = { + text: "https://chat.recoupable.com/chat/11111111-1111-1111-1111-111111111111", + html: 'link', + headers: {}, + } as GetReceivingEmailResponseSuccess; + + const result = await getEmailRoomId(emailContent); + + expect(result).toBe("11111111-1111-1111-1111-111111111111"); + }); + }); + describe("fallback: checking references header", () => { - it("falls back to references header when no chat link in text", async () => { + it("falls back to references header when no chat link in text or html", async () => { mockSelectMemoryEmails.mockResolvedValue([ { memories: { room_id: "22222222-3333-4444-5555-666666666666" } }, ] as Awaited>); const emailContent = { text: "No chat link here", + html: "

No chat link in HTML either

", headers: { references: "" }, } as GetReceivingEmailResponseSuccess; diff --git a/lib/emails/inbound/extractRoomIdFromHtml.ts b/lib/emails/inbound/extractRoomIdFromHtml.ts new file mode 100644 index 00000000..f637b17e --- /dev/null +++ b/lib/emails/inbound/extractRoomIdFromHtml.ts @@ -0,0 +1,48 @@ +const UUID_PATTERN = "[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"; + +// Matches chat.recoupable.com/chat/{uuid} in various formats: +// - Direct URL: https://chat.recoupable.com/chat/uuid +// - URL-encoded (in tracking redirects): chat.recoupable.com%2Fchat%2Fuuid +const CHAT_LINK_PATTERNS = [ + new RegExp(`https?://chat\\.recoupable\\.com/chat/(${UUID_PATTERN})`, "i"), + new RegExp(`chat\\.recoupable\\.com%2Fchat%2F(${UUID_PATTERN})`, "i"), +]; + +// Pattern to find UUID after /chat/ or %2Fchat%2F in link text that may contain <wbr> tags +// The link text version: "https://<wbr />/chat.recoupable.com/chat/uuid" +const WBR_STRIPPED_PATTERN = new RegExp( + `chat\\.recoupable\\.com/chat/(${UUID_PATTERN})`, + "i", +); + +/** + * Extracts the roomId from email HTML by looking for a Recoup chat link. + * Handles various formats including: + * - Direct URLs in href attributes + * - URL-encoded URLs in tracking redirect links + * - Link text with <wbr> tags inserted for word breaking (common in Superhuman) + * + * @param html - The email HTML body + * @returns The roomId if found, undefined otherwise + */ +export function extractRoomIdFromHtml(html: string | undefined): string | undefined { + if (!html) return undefined; + + // Try direct URL patterns first (most common case) + for (const pattern of CHAT_LINK_PATTERNS) { + const match = html.match(pattern); + if (match?.[1]) { + return match[1]; + } + } + + // Fallback: strip <wbr> tags and try again + // This handles Superhuman's link text formatting: "https://<wbr />chat.<wbr />..." 
+ const strippedHtml = html.replace(/<wbr\s*\/?>/gi, ""); + const strippedMatch = strippedHtml.match(WBR_STRIPPED_PATTERN); + if (strippedMatch?.[1]) { + return strippedMatch[1]; + } + + return undefined; +} diff --git a/lib/emails/inbound/getEmailRoomId.ts b/lib/emails/inbound/getEmailRoomId.ts index ef889381..f12db939 100644 --- a/lib/emails/inbound/getEmailRoomId.ts +++ b/lib/emails/inbound/getEmailRoomId.ts @@ -1,10 +1,13 @@ import type { GetReceivingEmailResponseSuccess } from "resend"; import selectMemoryEmails from "@/lib/supabase/memory_emails/selectMemoryEmails"; import { extractRoomIdFromText } from "./extractRoomIdFromText"; +import { extractRoomIdFromHtml } from "./extractRoomIdFromHtml"; /** - * Extracts the roomId from an email. First checks the email text for a Recoup chat link, - * then falls back to looking up existing memory_emails via the references header. + * Extracts the roomId from an email. Checks multiple sources in order: + * 1. Email text body for a Recoup chat link + * 2. Email HTML body for a Recoup chat link (handles Superhuman's wbr tags) + * 3. References header to look up existing memory_emails * * @param emailContent - The email content from Resend's Receiving API * @returns The roomId if found, undefined otherwise @@ -18,6 +21,13 @@ export async function getEmailRoomId( return roomIdFromText; } + // Secondary: check email HTML for Recoup chat link + // This handles clients like Superhuman that insert <wbr> tags in link text + const roomIdFromHtml = extractRoomIdFromHtml(emailContent.html); + if (roomIdFromHtml) { + return roomIdFromHtml; + } + // Fallback: check references header for existing memory_emails const references = emailContent.headers?.references; if (!references) {