From c6a0211e4303465a29364e91b08f6b2d5a48b030 Mon Sep 17 00:00:00 2001
From: Guido van Rossum <gvanrossum@microsoft.com>
Date: Mon, 20 Oct 2025 13:47:34 -0700
Subject: [PATCH 1/3] Add demo code from docs/getting-started.md

And update .gitignore to ignore demo/env and database files anywhere.
---
 .gitignore          |  5 +++--
 demo/demo.py        | 34 ++++++++++++++++++++++++++++++++++
 demo/query.py       | 16 ++++++++++++++++
 demo/transcript.txt |  3 +++
 4 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 demo/demo.py
 create mode 100644 demo/query.py
 create mode 100644 demo/transcript.txt
diff --git a/.gitignore b/.gitignore
index 1569346..00f697f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,8 +3,8 @@ __pycache__
 /.env
 /.ui_history
 /.venv
-/db
-/*.db
+*.db
+*.db.index_log
 /evals
 /gmail/client_secret.json
 /gmail/token.json
@@ -12,6 +12,7 @@ __pycache__
 /testdata/Episode_53_Answer_results.json
 /testdata/Episode_53_Search_results.json
 /testdata/MP/
+/demo/env
 
 # VSCode
 .vscode
diff --git a/demo/demo.py b/demo/demo.py
new file mode 100644
index 0000000..ac1a15e
--- /dev/null
+++ b/demo/demo.py
@@ -0,0 +1,34 @@
+from typeagent import create_conversation
+from typeagent.transcripts.transcript import (
+    TranscriptMessage,
+    TranscriptMessageMeta,
+)
+
+
+def read_messages(filename) -> list[TranscriptMessage]:
+    messages: list[TranscriptMessage] = []
+    with open(filename, "r") as f:
+        for line in f:
+            # Parse each line into a TranscriptMessage
+            speaker, text_chunk = line.split(None, 1)
+            message = TranscriptMessage(
+                text_chunks=[text_chunk],
+                metadata=TranscriptMessageMeta(speaker=speaker),
+            )
+            messages.append(message)
+    return messages
+
+
+async def main():
+    conversation = await create_conversation("demo.db", TranscriptMessage)
+    messages = read_messages("transcript.txt")
+    print(f"Indexing {len(messages)} messages...")
+    results = await conversation.add_messages_with_indexing(messages)
+    print(f"Indexed {results.messages_added} messages.")
+    print(f"Got {results.semrefs_added} semantic refs.")
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(main())
diff --git a/demo/query.py b/demo/query.py
new file mode 100644
index 0000000..b28a7f0
--- /dev/null
+++ b/demo/query.py
@@ -0,0 +1,16 @@
+from typeagent import create_conversation
+from typeagent.transcripts.transcript import TranscriptMessage
+
+
+async def main():
+    conversation = await create_conversation("demo.db", TranscriptMessage)
+    question = "Who volunteered to do the python library?"
+    print("Q:", question)
+    answer = await conversation.query(question)
+    print("A:", answer)
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(main())
diff --git a/demo/transcript.txt b/demo/transcript.txt
new file mode 100644
index 0000000..e93557b
--- /dev/null
+++ b/demo/transcript.txt
@@ -0,0 +1,3 @@
+STEVE We should really make a Python library for Structured RAG.
+UMESH Who would be a good person to do the Python library?
+GUIDO I volunteer to do the Python library. Give me a few months.

From 584add1ed981fba38a7779e2570e04b97976d7da Mon Sep 17 00:00:00 2001
From: Guido van Rossum <gvanrossum@microsoft.com>
Date: Mon, 20 Oct 2025 13:52:35 -0700
Subject: [PATCH 2/3] Move still-relevant things from TADA.md (crudely) to
 TODO.md

---
 TADA.md | 124 --------------------------------------------------------
 TODO.md |  38 +++++++++++++++++
 2 files changed, 38 insertions(+), 124 deletions(-)
 delete mode 100644 TADA.md

diff --git a/TADA.md b/TADA.md
deleted file mode 100644
index 0d6ca1b..0000000
--- a/TADA.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# Things to do before Oct 18
-
-Talk at PyBay is on Sat, Oct 18 in SF
-
-## My Priorities
-
-- Write docs
-- Polish demos
-- Practice and polish talk
-
-## Software
-
-Minor:
-
-- Distinguish between release deps and build/dev deps?
-- Improve load_dotenv() (don't look for `<repo>/ts/.env`, use one loop)
-
-### Specifically for VTT import:
-
-Minor:
-
-- Reduce duplication between ingest_vtt.py and typeagent/transcripts/
-- `get_transcript_speakers` and `get_transcript_duration` should not
-  re-parse the transcript -- they should just take the parsed vtt object.
-
-### Not doing:
-
-- Fix MCP service (should use host's LLM, not its own)
-- Handle embeddings in MCP, even though MCP doesn't support it yet
-  - GPT5 suggests to run a separate MCP service for this
-  - Batch 128-256 items at a time
-  - Explicitly handle truncation by counting tokens
-- Handle caching using sha256() of text?
-
-## Documentation
-
-- Getting Started
-- Document the high-level API
-- Document what should go in `.env` and where it should live
-  - And alternatively (first?) what to put in shell env directly
-- Document test/build/release process
-- Document how to run evaluations (but don't reveal all the data)
-- Document how to use gmail_dump.py (set up a project etc.)
-
-## Demos
-
-- Adrian Tchaikovsky Podcast: ready
-- Monty Python Episode: ready but need to pick a list of sketches to index
-- Email demo: Umesh has a working prototype
-
-## Talk
-
-- Practice in private, timing, updating slides as needed
-- Practice run for the team?
-- Anticipate questions about (Lazy) GraphRAG?
-  The answer is we give equiv/better results without waiting minutes for
-  reindexing [in theory]
-
-
-# Appendix
-
-## Official abstract: "Structured RAG is better than RAG!"
-
-At Microsoft I've been contributing to an open source project
-demonstrating what we call Structured RAG.
-This is an improvement over the popular AI tactic named RAG (look it up)
-that can answer questions over large collections of text or images
-better and faster than RAG. We use this as the basis for long-term AI
-memory.
-
-I will explain the Structured RAG algorithm and show some demos with
-real-world data. I will also discuss the Python library we are releasing
-this summer and its API.
-
-## Scratch space for talk drafting
-
-1. Explain Structured RAG (SRAG)
-
-   1. Explain RAG
-   2. Explain how SRAG works instead
-   3. Show how SRAG is better (how?)
-
-1a. My process
-  - Over time using more and more AI (mostly Claude)
-  - Latest changes almost entirely written by AI (with my strict supervision :-)
-
-2. Demos
-
-   1. Podcast demo queries (clean up query.py for this?)
-   2. Document demo, show ingest and query (very briefly)
-   3. MP movie? Email?
-
-3. Basics for using the library
-   1. Install:
-      ```sh
-      pip install typeagent-py  # Installs typeagent and dependencies
-      ```
-   2. Create conversation (TENTATIVE):
-      ```py
-      import typeagent
-
-      conv = typeagent.get_conversation(dbfile="mymemory.sqlite")
-      # Could be empty (new) or could contain previously ingested data
-      # You can always ingest additional messages
-      ```
-   3. Ingest messages (TENTATIVE):
-      ```py
-      for message in ...:  # Source of message strings
-          metadata = ...  # Set date/time, speaker(s), listener(s)
-          conv.ingest_message(message, metadata)
-      ```
-   4. Query (TENTATIVE):
-      ```py
-      request = input("> ")
-      answer = conv.query(request)
-      print(request)
-      ```
-   5. Demo using podcast example data
-
-4. Links
-
-- To PyPI project
-- To GitHub (microsoft/typeagent-py)
-- To docs
diff --git a/TODO.md b/TODO.md
index de1ca7d..19b742e 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,5 +1,43 @@
 # TODO for the Python knowpro port
 
+# Leftover TODOs from TADA.md
+
+
+## Software
+
+Minor:
+
+- Distinguish between release deps and build/dev deps?
+- Improve load_dotenv() (don't look for `<repo>/ts/.env`, use one loop)
+
+### Specifically for VTT import (minor)
+
+- Reduce duplication between ingest_vtt.py and typeagent/transcripts/
+- `get_transcript_speakers` and `get_transcript_duration` should not
+  re-parse the transcript -- they should just take the parsed vtt object.
+
+### Later
+
+- Fix MCP service (should use host's LLM, not its own)
+- Handle embeddings in MCP, even though MCP doesn't support it yet
+  - GPT5 suggests to run a separate MCP service for this
+  - Batch 128-256 items at a time
+  - Explicitly handle truncation by counting tokens
+- Handle caching using sha256() of text?
+
+## Documentation
+
+- Document what should go in `.env` and where it should live
+  - And alternatively (first?) what to put in shell env directly
+- Document how to reproduce the demos from the talk (and Kevin/Adrian)
+- Document test/build/release process
+- Document how to use gmail_dump.py (set up a project etc.)
+
+Maybe later:
+
+- Document how to run evaluations (but don't reveal all the data)
+
+
 # TODOs for fully implementing persistence through SQLite
 
 ## Now

From dc2b3635505f80796503399793972228a10935ff Mon Sep 17 00:00:00 2001
From: Guido van Rossum <gvanrossum@microsoft.com>
Date: Mon, 20 Oct 2025 13:53:50 -0700
Subject: [PATCH 3/3] Add 'pull_request' back to ci.yml

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b26d64e..65fed02 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,6 +1,6 @@
 # .github/workflows/ci.yml
 name: CI
-on: [push]
+on: [push, pull_request]
 
 jobs:
   ci: