From 7267ae4ee9d5f068b94b27120770570271268824 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 3 Dec 2025 14:01:28 -0800 Subject: [PATCH 1/4] Recognize inline replies, even in the presence of unquoted trailers --- test/test_email_import.py | 182 +++++++++++++++++++++++++++++++ typeagent/emails/email_import.py | 176 +++++++++++++++++++++++++++++- 2 files changed, 353 insertions(+), 5 deletions(-) create mode 100644 test/test_email_import.py diff --git a/test/test_email_import.py b/test/test_email_import.py new file mode 100644 index 00000000..efd2bd11 --- /dev/null +++ b/test/test_email_import.py @@ -0,0 +1,182 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +from typeagent.emails.email_import import ( + extract_inline_reply, + get_last_response_in_thread, + is_inline_reply, +) + + +class TestIsInlineReply: + def test_empty_text(self) -> None: + assert is_inline_reply("") is False + + def test_no_header(self) -> None: + text = "Just a regular email with no quoted content." + assert is_inline_reply(text) is False + + def test_bottom_posted_reply(self) -> None: + # This has "On ... wrote:" but all quotes are at the bottom, no interleaving + text = """Thanks for the info! + +On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: + +> Here is some quoted text. +> More quoted text. +> Even more.""" + assert is_inline_reply(text) is False + + def test_inline_reply(self) -> None: + text = """I've given my replies in line with the quoted text. + +On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: +> Quoted blah. + +That is clearly BS. + +> Quoted blah blah. + +Here I must agree. + +> More quoted text. + +-- +Guido van Rossum""" + assert is_inline_reply(text) is True + + def test_inline_reply_no_preamble(self) -> None: + text = """On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: +> First quote. + +My first response. + +> Second quote. + +My second response.""" + assert is_inline_reply(text) is True + + +class TestExtractInlineReply: + def test_empty_text(self) -> None: + assert extract_inline_reply("") == "" + + def test_no_inline_pattern(self) -> None: + text = "Just a regular email." + assert extract_inline_reply(text) == "Just a regular email." + + def test_basic_inline_reply(self) -> None: + text = """I've given my replies in line with the quoted text. + +On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: +> Quoted blah. + +That is clearly BS. + +> Quoted blah blah. + +Here I must agree. + +> More quoted text. + +-- +Guido van Rossum""" + result = extract_inline_reply(text) + assert "I've given my replies in line" in result + assert "That is clearly BS." in result + assert "Here I must agree." in result + # Quoted content should be removed + assert "Quoted blah" not in result + # Signature should be removed + assert "Guido van Rossum" not in result + + def test_inline_reply_with_context(self) -> None: + text = """On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: +> Is Python good? + +Yes, absolutely! + +> What about JavaScript? + +It has its uses.""" + result = extract_inline_reply(text, include_context=True) + assert "Yes, absolutely!" in result + assert "It has its uses." in result + assert "[In reply to:" in result + assert "Python" in result + + def test_preserves_preamble(self) -> None: + text = """Here's my preamble before the inline replies. + +On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: +> Question? + +Answer!""" + result = extract_inline_reply(text) + assert "Here's my preamble" in result + assert "Answer!" in result + + def test_strips_trailing_delimiters(self) -> None: + text = """On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: +> Question? + +Answer! +_______________""" + result = extract_inline_reply(text) + assert result.endswith("Answer!") + + +class TestGetLastResponseInThread: + def test_empty_text(self) -> None: + assert get_last_response_in_thread("") == "" + + def test_simple_text(self) -> None: + assert get_last_response_in_thread("Hello world") == "Hello world" + + def test_bottom_posted_reply(self) -> None: + text = """This is my response. + +On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: + +> Original message here. +> More original.""" + result = get_last_response_in_thread(text) + assert result == "This is my response." + + def test_inline_reply(self) -> None: + text = """Preamble. + +On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: +> Quote 1. + +Reply 1. + +> Quote 2. + +Reply 2. + +-- +Signature""" + result = get_last_response_in_thread(text) + assert "Preamble" in result + assert "Reply 1" in result + assert "Reply 2" in result + assert "Quote" not in result + assert "Signature" not in result + + def test_original_message_delimiter(self) -> None: + text = """My response. + +-----Original Message----- +From: Someone +The original content.""" + result = get_last_response_in_thread(text) + assert result == "My response." + + def test_forwarded_delimiter(self) -> None: + text = """My thoughts on this. + +----- Forwarded by Someone ----- +Original forwarded content.""" + result = get_last_response_in_thread(text) + assert result == "My thoughts on this." diff --git a/typeagent/emails/email_import.py b/typeagent/emails/email_import.py index 5be5baaa..2cf51182 100644 --- a/typeagent/emails/email_import.py +++ b/typeagent/emails/email_import.py @@ -125,21 +125,187 @@ def get_forwarded_email_parts(email_text: str) -> list[str]: # Precompiled regex for trailing line delimiters (underscores, dashes, equals, spaces) _TRAILING_LINE_DELIMITERS = re.compile(r"[\r\n][_\-= ]+\s*$") +# Pattern to detect "On wrote:" header for inline replies +_INLINE_REPLY_HEADER = re.compile( + r"^on\s+.+\s+wrote:\s*$", + re.IGNORECASE | re.MULTILINE, +) + +# Pattern to match quoted lines (starting with > possibly with leading whitespace) +_QUOTED_LINE = re.compile(r"^\s*>") + +# Pattern to detect email signature markers +_SIGNATURE_MARKER = re.compile(r"^--\s*$", re.MULTILINE) + + +def is_inline_reply(email_text: str) -> bool: + """ + Detect if an email contains inline replies (responses interspersed with quotes). + + An inline reply has: + 1. An "On ... wrote:" header + 2. Quoted lines (starting with >) interspersed with non-quoted response lines + """ + if not email_text: + return False + + # Must have the "On ... wrote:" header + header_match = _INLINE_REPLY_HEADER.search(email_text) + if not header_match: + return False + + # Check content after the header for mixed quoted/non-quoted lines + content_after_header = email_text[header_match.end() :] + lines = content_after_header.split("\n") + + has_quoted = False + has_non_quoted_after_quoted = False + + for line in lines: + # Check for signature marker + if _SIGNATURE_MARKER.match(line): + break + + stripped = line.strip() + if not stripped: + continue + + if _QUOTED_LINE.match(line): + has_quoted = True + elif has_quoted: + # Non-quoted line after we've seen quoted lines = inline reply + has_non_quoted_after_quoted = True + break + + return has_quoted and has_non_quoted_after_quoted + + +def extract_inline_reply(email_text: str, include_context: bool = False) -> str: + """ + Extract reply content from an email with inline responses. + + For emails where the author responds inline to quoted text, this extracts + the non-quoted portions (the actual replies). + + Args: + email_text: The full email body text + include_context: If True, include abbreviated quoted context before each reply + + Returns: + The extracted reply text. If include_context is True, quoted lines are + prefixed with "[quoted]" to show what's being replied to. + """ + if not email_text: + return "" + + # Find the "On ... wrote:" header + header_match = _INLINE_REPLY_HEADER.search(email_text) + if not header_match: + # No inline reply pattern, return as-is + return email_text + + # Get preamble (content before the "On ... wrote:" header) + preamble = email_text[: header_match.start()].strip() + + # Process content after header + content_after_header = email_text[header_match.end() :] + lines = content_after_header.split("\n") + + result_parts: list[str] = [] + if preamble: + result_parts.append(preamble) + + current_reply_lines: list[str] = [] + current_quoted_lines: list[str] = [] + in_signature = False + + for line in lines: + # Check for signature marker + if _SIGNATURE_MARKER.match(line): + in_signature = True + # Flush any pending reply + if current_reply_lines: + if include_context and current_quoted_lines: + result_parts.append(_summarize_quoted(current_quoted_lines)) + result_parts.append("\n".join(current_reply_lines)) + current_reply_lines = [] + current_quoted_lines = [] + continue + + if in_signature: + # Skip signature content + continue + + if _QUOTED_LINE.match(line): + # This is a quoted line + if current_reply_lines: + # Flush the current reply block + if include_context and current_quoted_lines: + result_parts.append(_summarize_quoted(current_quoted_lines)) + result_parts.append("\n".join(current_reply_lines)) + current_reply_lines = [] + current_quoted_lines = [] + # Accumulate quoted lines for context (only if needed) + if include_context: + # Strip the leading > and any space after it + unquoted = re.sub(r"^\s*>\s?", "", line) + current_quoted_lines.append(unquoted) + else: + # Non-quoted line - part of the reply + stripped = line.strip() + if stripped or current_reply_lines: + # Include non-empty lines, or preserve blank lines within a reply block + current_reply_lines.append(line.rstrip()) + + # Flush any remaining reply + if current_reply_lines: + if include_context and current_quoted_lines: + result_parts.append(_summarize_quoted(current_quoted_lines)) + result_parts.append("\n".join(current_reply_lines)) + + result = "\n\n".join(part for part in result_parts if part.strip()) + return _strip_trailing_delimiters(result) + + +def _summarize_quoted(quoted_lines: list[str]) -> str: + """Create a brief summary of quoted content for context.""" + # Join and truncate to provide context + text = " ".join(line.strip() for line in quoted_lines if line.strip()) + if len(text) > 100: + text = text[:97] + "..." + return f"[In reply to: {text}]" + + +def _strip_trailing_delimiters(text: str) -> str: + """Remove trailing line delimiters (underscores, dashes, equals, spaces).""" + text = text.strip() + return _TRAILING_LINE_DELIMITERS.sub("", text) + # Simple way to get the last response on an email thread in MIME format def get_last_response_in_thread(email_text: str) -> str: + """ + Extract the latest response from an email thread. + + Handles two patterns: + 1. Bottom-posted replies: New content at top, quoted thread at bottom + 2. Inline replies: Responses interspersed with quoted text + + For inline replies, only the reply portions (non-quoted text) are extracted. + """ if not email_text: return "" + # Check for inline reply pattern first + if is_inline_reply(email_text): + return extract_inline_reply(email_text, include_context=False) + + # Fall back to original behavior for bottom-posted replies match = _THREAD_DELIMITERS.search(email_text) if match: email_text = email_text[: match.start()] - email_text = email_text.strip() - # Remove trailing line delimiters (e.g. underscores, dashes, equals) - _TRAILING_LINE_DELIMITER_REGEX = _TRAILING_LINE_DELIMITERS - email_text = _TRAILING_LINE_DELIMITER_REGEX.sub("", email_text) - return email_text + return _strip_trailing_delimiters(email_text) # Extracts the plain text body from an email.message.Message object. From 37006c12195fae67371c9949ef58155ad1a7d4f3 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 3 Dec 2025 14:07:36 -0800 Subject: [PATCH 2/4] Rename bottom-posting to top-posting --- test/test_email_import.py | 4 ++-- typeagent/emails/email_import.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_email_import.py b/test/test_email_import.py index efd2bd11..7e8d3308 100644 --- a/test/test_email_import.py +++ b/test/test_email_import.py @@ -16,7 +16,7 @@ def test_no_header(self) -> None: text = "Just a regular email with no quoted content." assert is_inline_reply(text) is False - def test_bottom_posted_reply(self) -> None: + def test_top_posted_reply(self) -> None: # This has "On ... wrote:" but all quotes are at the bottom, no interleaving text = """Thanks for the info! @@ -133,7 +133,7 @@ def test_empty_text(self) -> None: def test_simple_text(self) -> None: assert get_last_response_in_thread("Hello world") == "Hello world" - def test_bottom_posted_reply(self) -> None: + def test_top_posted_reply(self) -> None: text = """This is my response. On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: diff --git a/typeagent/emails/email_import.py b/typeagent/emails/email_import.py index 2cf51182..d5db5380 100644 --- a/typeagent/emails/email_import.py +++ b/typeagent/emails/email_import.py @@ -288,7 +288,7 @@ def get_last_response_in_thread(email_text: str) -> str: Extract the latest response from an email thread. Handles two patterns: - 1. Bottom-posted replies: New content at top, quoted thread at bottom + 1. Top-posted replies: New content at top, quoted thread at bottom 2. Inline replies: Responses interspersed with quoted text For inline replies, only the reply portions (non-quoted text) are extracted. From 3c1bc26d78d17c5c2b56d0985401da8cd653f6dd Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 3 Dec 2025 14:48:02 -0800 Subject: [PATCH 3/4] Add chunk_sources to track quoted vs original email content --- test/test_email_import.py | 188 +++++++++++++++--------------- typeagent/emails/email_import.py | 179 ++++++++++++++-------------- typeagent/emails/email_message.py | 6 + 3 files changed, 190 insertions(+), 183 deletions(-) diff --git a/test/test_email_import.py b/test/test_email_import.py index 7e8d3308..18208617 100644 --- a/test/test_email_import.py +++ b/test/test_email_import.py @@ -2,9 +2,8 @@ # Licensed under the MIT License. from typeagent.emails.email_import import ( - extract_inline_reply, - get_last_response_in_thread, is_inline_reply, + parse_email_chunks, ) @@ -18,17 +17,20 @@ def test_no_header(self) -> None: def test_top_posted_reply(self) -> None: # This has "On ... wrote:" but all quotes are at the bottom, no interleaving - text = """Thanks for the info! + text = """\ +Thanks for the info! On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: > Here is some quoted text. > More quoted text. -> Even more.""" +> Even more. +""" assert is_inline_reply(text) is False def test_inline_reply(self) -> None: - text = """I've given my replies in line with the quoted text. + text = """\ +I've given my replies in line with the quoted text. On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: > Quoted blah. @@ -42,31 +44,37 @@ def test_inline_reply(self) -> None: > More quoted text. -- -Guido van Rossum""" +Guido van Rossum +""" assert is_inline_reply(text) is True def test_inline_reply_no_preamble(self) -> None: - text = """On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: + text = """\ +On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: > First quote. My first response. > Second quote. -My second response.""" +My second response. +""" assert is_inline_reply(text) is True -class TestExtractInlineReply: +class TestParseEmailChunks: def test_empty_text(self) -> None: - assert extract_inline_reply("") == "" + assert parse_email_chunks("") == [] def test_no_inline_pattern(self) -> None: text = "Just a regular email." - assert extract_inline_reply(text) == "Just a regular email." + result = parse_email_chunks(text) + assert len(result) == 1 + assert result[0] == ("Just a regular email.", None) def test_basic_inline_reply(self) -> None: - text = """I've given my replies in line with the quoted text. + text = """\ +I've given my replies in line with the quoted text. On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: > Quoted blah. @@ -80,103 +88,95 @@ def test_basic_inline_reply(self) -> None: > More quoted text. -- -Guido van Rossum""" - result = extract_inline_reply(text) - assert "I've given my replies in line" in result - assert "That is clearly BS." in result - assert "Here I must agree." in result - # Quoted content should be removed - assert "Quoted blah" not in result - # Signature should be removed - assert "Guido van Rossum" not in result - - def test_inline_reply_with_context(self) -> None: - text = """On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: +Guido van Rossum +""" + result = parse_email_chunks(text) + # Should have: preamble (original), quoted, reply, quoted, reply, quoted + texts = [chunk[0] for chunk in result] + sources = [chunk[1] for chunk in result] + + # Check we have all the content + assert any("I've given my replies" in t for t in texts) + assert any("That is clearly BS" in t for t in texts) + assert any("Here I must agree" in t for t in texts) + assert any("Quoted blah" in t for t in texts) + + # Original content should have None source + for text, source in result: + if "I've given my replies" in text or "That is clearly BS" in text: + assert source is None + + # Quoted content should have the person's name + for text, source in result: + if "Quoted blah" in text: + assert source == "Someone" + + # Signature should NOT be included + assert not any("Guido van Rossum" in t for t in texts) + + def test_extracts_quoted_person_name(self) -> None: + text = """\ +On Mon, Dec 10, 2020 at 10:30 AM John Doe wrote: > Is Python good? Yes, absolutely! > What about JavaScript? -It has its uses.""" - result = extract_inline_reply(text, include_context=True) - assert "Yes, absolutely!" in result - assert "It has its uses." in result - assert "[In reply to:" in result - assert "Python" in result +It has its uses. +""" + result = parse_email_chunks(text) + + # Find quoted chunks - they should have "John Doe" as source + quoted_chunks = [(t, s) for t, s in result if s is not None] + assert len(quoted_chunks) == 2 + for text, source in quoted_chunks: + assert source == "John Doe" def test_preserves_preamble(self) -> None: - text = """Here's my preamble before the inline replies. + text = """\ +Here's my preamble before the inline replies. On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: > Question? -Answer!""" - result = extract_inline_reply(text) - assert "Here's my preamble" in result - assert "Answer!" in result - - def test_strips_trailing_delimiters(self) -> None: - text = """On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: -> Question? - Answer! -_______________""" - result = extract_inline_reply(text) - assert result.endswith("Answer!") +""" + result = parse_email_chunks(text) + texts = [chunk[0] for chunk in result] + assert any("preamble" in t for t in texts) + assert any("Answer" in t for t in texts) -class TestGetLastResponseInThread: - def test_empty_text(self) -> None: - assert get_last_response_in_thread("") == "" - - def test_simple_text(self) -> None: - assert get_last_response_in_thread("Hello world") == "Hello world" - - def test_top_posted_reply(self) -> None: - text = """This is my response. - + def test_strips_trailing_delimiters(self) -> None: + text = """\ On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: +> Question? -> Original message here. -> More original.""" - result = get_last_response_in_thread(text) - assert result == "This is my response." - - def test_inline_reply(self) -> None: - text = """Preamble. - +Answer! +_______________ +""" + result = parse_email_chunks(text) + # Last non-quoted chunk should not end with underscores + original_chunks = [t for t, s in result if s is None] + assert len(original_chunks) > 0 + assert not original_chunks[-1].endswith("_") + + def test_quoted_content_is_unabbreviated(self) -> None: + text = """\ On Mon, Dec 10, 2020 at 10:30 AM Someone wrote: -> Quote 1. - -Reply 1. - -> Quote 2. - -Reply 2. - --- -Signature""" - result = get_last_response_in_thread(text) - assert "Preamble" in result - assert "Reply 1" in result - assert "Reply 2" in result - assert "Quote" not in result - assert "Signature" not in result - - def test_original_message_delimiter(self) -> None: - text = """My response. - ------Original Message----- -From: Someone -The original content.""" - result = get_last_response_in_thread(text) - assert result == "My response." - - def test_forwarded_delimiter(self) -> None: - text = """My thoughts on this. - ------ Forwarded by Someone ----- -Original forwarded content.""" - result = get_last_response_in_thread(text) - assert result == "My thoughts on this." +> This is a very long quoted line that should be preserved in full. +> And this is another line that continues the quote. +> Even more content here. + +My response. +""" + result = parse_email_chunks(text) + + # Find the quoted chunk + quoted = [t for t, s in result if s is not None] + assert len(quoted) == 1 + # Full content should be preserved + assert "very long quoted line" in quoted[0] + assert "another line" in quoted[0] + assert "Even more content" in quoted[0] diff --git a/typeagent/emails/email_import.py b/typeagent/emails/email_import.py index d5db5380..89bfc92f 100644 --- a/typeagent/emails/email_import.py +++ b/typeagent/emails/email_import.py @@ -54,9 +54,8 @@ def import_forwarded_email_string( # Imports an email.message.Message object and returns an EmailMessage object -# If the message is a reply, returns only the latest response. def import_email_message(msg: Message, max_chunk_length: int) -> EmailMessage: - # Extract metadata from + # Extract metadata email_meta = EmailMessageMeta( sender=msg.get("From", ""), recipients=_import_address_headers(msg.get_all("To", [])), @@ -70,20 +69,32 @@ def import_email_message(msg: Message, max_chunk_length: int) -> EmailMessage: if timestamp_date is not None: timestamp = parsedate_to_datetime(timestamp_date).isoformat() - # Get email body. - # If the email was a reply, then ensure we only pick up the latest response + # Get email body and parse into chunks with source attribution body = _extract_email_body(msg) if body is None: body = "" - elif is_reply(msg): - body = get_last_response_in_thread(body) + # Prepend subject to body if available if email_meta.subject is not None: body = email_meta.subject + "\n\n" + body - body_chunks = _text_to_chunks(body, max_chunk_length) + # Parse into chunks with source attribution (handles inline replies) + parsed_chunks = parse_email_chunks(body) + + # Apply max_chunk_length splitting while preserving source attribution + text_chunks: list[str] = [] + chunk_sources: list[str | None] = [] + for text, source in parsed_chunks: + sub_chunks = _text_to_chunks(text, max_chunk_length) + for sub_chunk in sub_chunks: + text_chunks.append(sub_chunk) + chunk_sources.append(source) + email: EmailMessage = EmailMessage( - metadata=email_meta, text_chunks=body_chunks, timestamp=timestamp + metadata=email_meta, + text_chunks=text_chunks, + chunk_sources=chunk_sources, + timestamp=timestamp, ) return email @@ -126,8 +137,13 @@ def get_forwarded_email_parts(email_text: str) -> list[str]: _TRAILING_LINE_DELIMITERS = re.compile(r"[\r\n][_\-= ]+\s*$") # Pattern to detect "On wrote:" header for inline replies +# Uses alternation to handle different date formats: +# 1. "On Mon, Dec 10, 2020 at 10:30 AM John Doe wrote:" (AM/PM format) +# 2. "On Mon, Dec 10, 2020 Someone wrote:" (year followed by name) +# 3. Fallback: last words before "wrote:" +# Groups 1, 2, or 3 will capture the person's name depending on format _INLINE_REPLY_HEADER = re.compile( - r"^on\s+.+\s+wrote:\s*$", + r"^on\s+(?:.+[AP]M\s+(.+?)|.+,\s*\d{4}\s+(.+?)|.+\s+(.+?))\s+wrote:\s*$", re.IGNORECASE | re.MULTILINE, ) @@ -138,6 +154,10 @@ def get_forwarded_email_parts(email_text: str) -> list[str]: _SIGNATURE_MARKER = re.compile(r"^--\s*$", re.MULTILINE) +# Type alias for chunk with source info +ChunkWithSource = tuple[str, str | None] # (text, source: None=original, str=quoted) + + def is_inline_reply(email_text: str) -> bool: """ Detect if an email contains inline replies (responses interspersed with quotes). @@ -180,29 +200,36 @@ def is_inline_reply(email_text: str) -> bool: return has_quoted and has_non_quoted_after_quoted -def extract_inline_reply(email_text: str, include_context: bool = False) -> str: +def parse_email_chunks(email_text: str) -> list[ChunkWithSource]: """ - Extract reply content from an email with inline responses. - - For emails where the author responds inline to quoted text, this extracts - the non-quoted portions (the actual replies). + Parse email text into chunks with source attribution. - Args: - email_text: The full email body text - include_context: If True, include abbreviated quoted context before each reply + Returns a list of (text, source) tuples where: + - source is None for original (unquoted) content + - source is the quoted person's name for quoted content, or " " if unknown - Returns: - The extracted reply text. If include_context is True, quoted lines are - prefixed with "[quoted]" to show what's being replied to. + This handles inline replies where the sender responds inline to quoted text, + preserving both the quoted and unquoted portions as separate chunks. """ if not email_text: - return "" + return [] # Find the "On ... wrote:" header header_match = _INLINE_REPLY_HEADER.search(email_text) if not header_match: - # No inline reply pattern, return as-is - return email_text + # No inline reply pattern, return as a single original chunk + text = _strip_trailing_delimiters(email_text) + if text: + return [(text, None)] + return [] + + # Extract quoted person from header (first non-None group from groups 1, 2, or 3) + quoted_person = ( + header_match.group(1) or header_match.group(2) or header_match.group(3) or " " + ) + quoted_person = quoted_person.strip() if quoted_person else " " + if not quoted_person: + quoted_person = " " # Get preamble (content before the "On ... wrote:" header) preamble = email_text[: header_match.start()].strip() @@ -211,25 +238,37 @@ def extract_inline_reply(email_text: str, include_context: bool = False) -> str: content_after_header = email_text[header_match.end() :] lines = content_after_header.split("\n") - result_parts: list[str] = [] + result: list[ChunkWithSource] = [] if preamble: - result_parts.append(preamble) + result.append((preamble, None)) current_reply_lines: list[str] = [] current_quoted_lines: list[str] = [] in_signature = False + def flush_reply() -> None: + nonlocal current_reply_lines + if current_reply_lines: + text = "\n".join(current_reply_lines).strip() + if text: + result.append((text, None)) + current_reply_lines = [] + + def flush_quoted() -> None: + nonlocal current_quoted_lines + if current_quoted_lines: + text = "\n".join(current_quoted_lines).strip() + if text: + result.append((text, quoted_person)) + current_quoted_lines = [] + for line in lines: # Check for signature marker if _SIGNATURE_MARKER.match(line): in_signature = True - # Flush any pending reply - if current_reply_lines: - if include_context and current_quoted_lines: - result_parts.append(_summarize_quoted(current_quoted_lines)) - result_parts.append("\n".join(current_reply_lines)) - current_reply_lines = [] - current_quoted_lines = [] + # Flush any pending content + flush_quoted() + flush_reply() continue if in_signature: @@ -237,43 +276,31 @@ def extract_inline_reply(email_text: str, include_context: bool = False) -> str: continue if _QUOTED_LINE.match(line): - # This is a quoted line + # This is a quoted line - flush any pending reply first if current_reply_lines: - # Flush the current reply block - if include_context and current_quoted_lines: - result_parts.append(_summarize_quoted(current_quoted_lines)) - result_parts.append("\n".join(current_reply_lines)) - current_reply_lines = [] - current_quoted_lines = [] - # Accumulate quoted lines for context (only if needed) - if include_context: - # Strip the leading > and any space after it - unquoted = re.sub(r"^\s*>\s?", "", line) - current_quoted_lines.append(unquoted) + flush_reply() + # Strip the leading > and any space after it + unquoted = re.sub(r"^\s*>\s?", "", line) + current_quoted_lines.append(unquoted) else: - # Non-quoted line - part of the reply + # Non-quoted line - flush any pending quoted first + if current_quoted_lines: + flush_quoted() + # Only accumulate non-empty lines or preserve blank lines within a block stripped = line.strip() if stripped or current_reply_lines: - # Include non-empty lines, or preserve blank lines within a reply block current_reply_lines.append(line.rstrip()) - # Flush any remaining reply - if current_reply_lines: - if include_context and current_quoted_lines: - result_parts.append(_summarize_quoted(current_quoted_lines)) - result_parts.append("\n".join(current_reply_lines)) + # Flush any remaining content + flush_quoted() + flush_reply() - result = "\n\n".join(part for part in result_parts if part.strip()) - return _strip_trailing_delimiters(result) + # Strip trailing delimiters from the last chunk + if result: + last_text, last_source = result[-1] + result[-1] = (_strip_trailing_delimiters(last_text), last_source) - -def _summarize_quoted(quoted_lines: list[str]) -> str: - """Create a brief summary of quoted content for context.""" - # Join and truncate to provide context - text = " ".join(line.strip() for line in quoted_lines if line.strip()) - if len(text) > 100: - text = text[:97] + "..." - return f"[In reply to: {text}]" + return result def _strip_trailing_delimiters(text: str) -> str: @@ -282,32 +309,6 @@ def _strip_trailing_delimiters(text: str) -> str: return _TRAILING_LINE_DELIMITERS.sub("", text) -# Simple way to get the last response on an email thread in MIME format -def get_last_response_in_thread(email_text: str) -> str: - """ - Extract the latest response from an email thread. - - Handles two patterns: - 1. Top-posted replies: New content at top, quoted thread at bottom - 2. Inline replies: Responses interspersed with quoted text - - For inline replies, only the reply portions (non-quoted text) are extracted. - """ - if not email_text: - return "" - - # Check for inline reply pattern first - if is_inline_reply(email_text): - return extract_inline_reply(email_text, include_context=False) - - # Fall back to original behavior for bottom-posted replies - match = _THREAD_DELIMITERS.search(email_text) - if match: - email_text = email_text[: match.start()] - - return _strip_trailing_delimiters(email_text) - - # Extracts the plain text body from an email.message.Message object. def _extract_email_body(msg: Message) -> str: """Extracts the plain text body from an email.message.Message object.""" @@ -365,11 +366,11 @@ def _text_to_chunks(text: str, max_chunk_length: int) -> list[str]: if len(text) < max_chunk_length: return [text] - paragraphs = _splitIntoParagraphs(text) + paragraphs = _split_into_paragraphs(text) return list(_merge_chunks(paragraphs, "\n\n", max_chunk_length)) -def _splitIntoParagraphs(text: str) -> list[str]: +def _split_into_paragraphs(text: str) -> list[str]: return _remove_empty_strings(re.split(r"\n{2,}", text)) diff --git a/typeagent/emails/email_message.py b/typeagent/emails/email_message.py index 4b1ec287..69a06eda 100644 --- a/typeagent/emails/email_message.py +++ b/typeagent/emails/email_message.py @@ -160,6 +160,12 @@ def __init__(self, **data: Any) -> None: super().__init__(**data) text_chunks: list[str] = CamelCaseField("The text chunks of the email message") + # For each chunk: None means original content, str means quoted. + # If quoted, the string is the name of the person being quoted, or " " if unknown. + chunk_sources: list[str | None] = CamelCaseField( + "Source attribution for each chunk: None=original, str=quoted person or ' '", + default_factory=list, + ) metadata: EmailMessageMeta = CamelCaseField( "Metadata associated with the email message" ) From 21282ba6e959ff16cc3dd67530f6b24c4a7b1bbf Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Wed, 3 Dec 2025 15:00:29 -0800 Subject: [PATCH 4/4] Add CODEOWNERS --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) create mode 100644 CODEOWNERS diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 00000000..bcff3975 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1 @@ +* @gvanrossum @gvanrossum-ms @umeshma @robgruen