microsoft · gvanrossum · Oct 20, 2025 · Oct 20, 2025 · Oct 20, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -1,6 +1,6 @@
 # .github/workflows/ci.yml
 name: CI
-on: [push]
+on: [push, pull_request]
 
 jobs:
   ci:

diff --git a/.gitignore b/.gitignore
@@ -3,15 +3,16 @@ __pycache__
 /.env
 /.ui_history
 /.venv
-/db
-/*.db
+*.db
+*.db.index_log
 /evals
 /gmail/client_secret.json
 /gmail/token.json
 *_dump/
 /testdata/Episode_53_Answer_results.json
 /testdata/Episode_53_Search_results.json
 /testdata/MP/
+/demo/env
 
 # VSCode
 .vscode

diff --git a/TADA.md b/TADA.md
diff --git a/TODO.md b/TODO.md
@@ -1,5 +1,43 @@
 # TODO for the Python knowpro port
 
+# Leftover TODOs from TADA.md
+
+
+## Software
+
+Minor:
+
+- Distinguish between release deps and build/dev deps?
+- Improve load_dotenv() (don't look for `<repo>/ts/.env`, use one loop)
+
+### Specifically for VTT import (minor)
+
+- Reduce duplication between ingest_vtt.py and typeagent/transcripts/
+- `get_transcript_speakers` and `get_transcript_duration` should not
+  re-parse the transcript -- they should just take the parsed vtt object.
+
+### Later
+
+- Fix MCP service (should use host's LLM, not its own)
+- Handle embeddings in MCP, even though MCP doesn't support it yet
+  - GPT5 suggests running a separate MCP service for this
+  - Batch 128-256 items at a time
+  - Explicitly handle truncation by counting tokens
+- Handle caching using sha256() of text?
+
+## Documentation
+
+- Document what should go in `.env` and where it should live
+  - And alternatively (first?) what to put in shell env directly
+- Document how to reproduce the demos from the talk (and Kevin/Adrian)
+- Document test/build/release process
+- Document how to use gmail_dump.py (set up a project etc.)
+
+Maybe later:
+
+- Document how to run evaluations (but don't reveal all the data)
+
+
 # TODOs for fully implementing persistence through SQLite
 
 ## Now

diff --git a/demo/demo.py b/demo/demo.py
@@ -0,0 +1,58 @@
+from typeagent import create_conversation
+from typeagent.transcripts.transcript import (
+    TranscriptMessage,
+    TranscriptMessageMeta,
+)
+
+
+def read_messages(filename) -> list[TranscriptMessage]:
+    """Read a plain-text transcript file into a list of messages.
+
+    Each non-blank line must have the form ``SPEAKER text...``: the first
+    whitespace-delimited word is the speaker and the remainder is the
+    message text. Blank lines are skipped, and surrounding whitespace
+    (including the line's trailing newline) is stripped from the text.
+
+    Raises:
+        ValueError: If a non-blank line has a speaker but no text.
+    """
+    messages: list[TranscriptMessage] = []
+    # Explicit encoding so parsing does not depend on the platform default.
+    with open(filename, "r", encoding="utf-8") as f:
+        for lineno, raw_line in enumerate(f, start=1):
+            line = raw_line.strip()
+            if not line:
+                # Blank line (e.g. a stray newline at EOF): nothing to parse.
+                continue
+            parts = line.split(None, 1)
+            if len(parts) != 2:
+                raise ValueError(
+                    f"{filename}:{lineno}: expected 'SPEAKER text', got {line!r}"
+                )
+            speaker, text_chunk = parts
+            message = TranscriptMessage(
+                text_chunks=[text_chunk],
+                metadata=TranscriptMessageMeta(speaker=speaker),
+            )
+            messages.append(message)
+    return messages
+
+
+async def main():
+    """Index the messages of ``transcript.txt`` into the ``demo.db`` conversation.
+
+    Prints how many messages were read, how many were added, and how many
+    semantic refs the indexing step reported.
+    """
+    conversation = await create_conversation("demo.db", TranscriptMessage)
+    messages = read_messages("transcript.txt")
+    print(f"Indexing {len(messages)} messages...")
+    results = await conversation.add_messages_with_indexing(messages)
+    print(f"Indexed {results.messages_added} messages.")
+    print(f"Got {results.semrefs_added} semantic refs.")
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(main())
diff --git a/demo/query.py b/demo/query.py
@@ -0,0 +1,16 @@
+from typeagent import create_conversation
+from typeagent.transcripts.transcript import TranscriptMessage
+
+
+async def main():
+    """Put one fixed question to the ``demo.db`` conversation; print Q and A."""
+    question = "Who volunteered to do the python library?"
+    conversation = await create_conversation("demo.db", TranscriptMessage)
+    print("Q:", question)
+    print("A:", await conversation.query(question))
+
+
+if __name__ == "__main__":
+    import asyncio
+
+    asyncio.run(main())
diff --git a/demo/transcript.txt b/demo/transcript.txt
@@ -0,0 +1,3 @@
+STEVE We should really make a Python library for Structured RAG.
+UMESH Who would be a good person to do the Python library?
+GUIDO I volunteer to do the Python library. Give me a few months.