From c6a0211e4303465a29364e91b08f6b2d5a48b030 Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Mon, 20 Oct 2025 13:47:34 -0700 Subject: [PATCH 1/3] Add demo code from docs/getting-started.md And update .gitignore to ignore demo/env and database files anywhere. --- .gitignore | 5 +++-- demo/demo.py | 34 ++++++++++++++++++++++++++++++++++ demo/query.py | 16 ++++++++++++++++ demo/transcript.txt | 3 +++ 4 files changed, 56 insertions(+), 2 deletions(-) create mode 100644 demo/demo.py create mode 100644 demo/query.py create mode 100644 demo/transcript.txt diff --git a/.gitignore b/.gitignore index 1569346..00f697f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,8 +3,8 @@ __pycache__ /.env /.ui_history /.venv -/db -/*.db +*.db +*.db.index_log /evals /gmail/client_secret.json /gmail/token.json @@ -12,6 +12,7 @@ __pycache__ /testdata/Episode_53_Answer_results.json /testdata/Episode_53_Search_results.json /testdata/MP/ +/demo/env # VSCode .vscode diff --git a/demo/demo.py b/demo/demo.py new file mode 100644 index 0000000..ac1a15e --- /dev/null +++ b/demo/demo.py @@ -0,0 +1,34 @@ +from typeagent import create_conversation +from typeagent.transcripts.transcript import ( + TranscriptMessage, + TranscriptMessageMeta, +) + + +def read_messages(filename) -> list[TranscriptMessage]: + messages: list[TranscriptMessage] = [] + with open(filename, "r") as f: + for line in f: + # Parse each line into a TranscriptMessage + speaker, text_chunk = line.split(None, 1) + message = TranscriptMessage( + text_chunks=[text_chunk], + metadata=TranscriptMessageMeta(speaker=speaker), + ) + messages.append(message) + return messages + + +async def main(): + conversation = await create_conversation("demo.db", TranscriptMessage) + messages = read_messages("transcript.txt") + print(f"Indexing {len(messages)} messages...") + results = await conversation.add_messages_with_indexing(messages) + print(f"Indexed {results.messages_added} messages.") + print(f"Got {results.semrefs_added} semantic refs.") 
+ + +if __name__ == "__main__": + import asyncio + + asyncio.run(main()) diff --git a/demo/query.py b/demo/query.py new file mode 100644 index 0000000..b28a7f0 --- /dev/null +++ b/demo/query.py @@ -0,0 +1,16 @@ +from typeagent import create_conversation +from typeagent.transcripts.transcript import TranscriptMessage + + +async def main(): + conversation = await create_conversation("demo.db", TranscriptMessage) + question = "Who volunteered to do the python library?" + print("Q:", question) + answer = await conversation.query(question) + print("A:", answer) + + +if __name__ == "__main__": + import asyncio + + asyncio.run(main()) diff --git a/demo/transcript.txt b/demo/transcript.txt new file mode 100644 index 0000000..e93557b --- /dev/null +++ b/demo/transcript.txt @@ -0,0 +1,3 @@ +STEVE We should really make a Python library for Structured RAG. +UMESH Who would be a good person to do the Python library? +GUIDO I volunteer to do the Python library. Give me a few months. From 584add1ed981fba38a7779e2570e04b97976d7da Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Mon, 20 Oct 2025 13:52:35 -0700 Subject: [PATCH 2/3] Move still-relevant things from TADA.md (crudely) to TODO.md --- TADA.md | 124 -------------------------------------------------------- TODO.md | 38 +++++++++++++++++ 2 files changed, 38 insertions(+), 124 deletions(-) delete mode 100644 TADA.md diff --git a/TADA.md b/TADA.md deleted file mode 100644 index 0d6ca1b..0000000 --- a/TADA.md +++ /dev/null @@ -1,124 +0,0 @@ -# Things to do before Oct 18 - -Talk at PyBay is on Sat, Oct 18 in SF - -## My Priorities - -- Write docs -- Polish demos -- Practice and polish talk - -## Software - -Minor: - -- Distinguish between release deps and build/dev deps? 
-- Improve load_dotenv() (don't look for `/ts/.env`, use one loop) - -### Specifically for VTT import: - -Minor: - -- Reduce duplication between ingest_vtt.py and typeagent/transcripts/ -- `get_transcript_speakers` and `get_transcript_duration` should not - re-parse the transcript -- they should just take the parsed vtt object. - -### Not doing: - -- Fix MCP service (should use host's LLM, not its own) -- Handle embeddings in MCP, even though MCP doesn't support it yet - - GPT5 suggests to run a separate MCP service for this - - Batch 128-256 items at a time - - Explicitly handle truncation by counting tokens -- Handle caching using sha256() of text? - -## Documentation - -- Getting Started -- Document the high-level API -- Document what should go in `.env` and where it should live - - And alternatively (first?) what to put in shell env directly -- Document test/build/release process -- Document how to run evaluations (but don't reveal all the data) -- Document how to use gmail_dump.py (set up a project etc.) - -## Demos - -- Adrian Tchaikovsky Podcast: ready -- Monty Python Episode: ready but need to pick a list of sketches to index -- Email demo: Umesh has a working prototype - -## Talk - -- Practice in private, timing, updating slides as needed -- Practice run for the team? -- Anticipate questions about (Lazy) GraphRAG? - The answer is we give equiv/better results without waiting minutes for - reindexing [in theory] - - -# Appendix - -## Official abstract: "Structured RAG is better than RAG!" - -At Microsoft I've been contributing to an open source project -demonstrating what we call Structured RAG. -This is an improvement over the popular AI tactic named RAG (look it up) -that can answer questions over large collections of text or images -better and faster than RAG. We use this as the basis for long-term AI -memory. - -I will explain the Structured RAG algorithm and show some demos with -real-world data. 
I will also discuss the Python library we are releasing -this summer and its API. - -## Scratch space for talk drafting - -1. Explain Structured RAG (SRAG) - - 1. Explain RAG - 2. Explain how SRAG works instead - 3. Show how SRAG is better (how?) - -1a. My process - - Over time using more and more AI (mostly Claude) - - Latest changes almost entirely written by AI (with my strict supervision :-) - -2. Demos - - 1. Podcast demo queries (clean up query.py for this?) - 2. Document demo, show ingest and query (very briefly) - 3. MP movie? Email? - -3. Basics for using the library - 1. Install: - ```sh - pip install typeagent-py # Installs typeagent and dependencies - ``` - 2. Create conversation (TENTATIVE): - ```py - import typeagent - - conv = typeagent.get_conversation(dbfile="mymemory.sqlite") - # Could be empty (new) or could contain previously ingested data - # You can always ingest additional messages - ``` - 3. Ingest messages (TENTATIVE): - ```py - for message in ...: # Source of message strings - metadata = ... # Set date/time, speaker(s), listener(s) - conv.ingest_message(message, metadata) - ``` - 4. Query (TENTATIVE): - ```py - request = input("> ") - answer = conv.query(request) - print(request) - ``` - 5. Demo using podcast example data - -4. Links - -- To PyPI project -- To GitHub (microsoft/typeagent-py) -- To docs diff --git a/TODO.md b/TODO.md index de1ca7d..19b742e 100644 --- a/TODO.md +++ b/TODO.md @@ -1,5 +1,43 @@ # TODO for the Python knowpro port +# Leftover TODOs from TADA.md + + +## Software + +Minor: + +- Distinguish between release deps and build/dev deps? +- Improve load_dotenv() (don't look for `/ts/.env`, use one loop) + +### Specifically for VTT import (minor) + +- Reduce duplication between ingest_vtt.py and typeagent/transcripts/ +- `get_transcript_speakers` and `get_transcript_duration` should not + re-parse the transcript -- they should just take the parsed vtt object. 
+ +### Later + +- Fix MCP service (should use host's LLM, not its own) +- Handle embeddings in MCP, even though MCP doesn't support it yet + - GPT5 suggests to run a separate MCP service for this + - Batch 128-256 items at a time + - Explicitly handle truncation by counting tokens +- Handle caching using sha256() of text? + +## Documentation + +- Document what should go in `.env` and where it should live + - And alternatively (first?) what to put in shell env directly +- Document how to reproduce the demos from the talk (and Kevin/Adrian) +- Document test/build/release process +- Document how to use gmail_dump.py (set up a project etc.) + +Maybe later: + +- Document how to run evaluations (but don't reveal all the data) + + # TODOs for fully implementing persistence through SQLite ## Now From dc2b3635505f80796503399793972228a10935ff Mon Sep 17 00:00:00 2001 From: Guido van Rossum Date: Mon, 20 Oct 2025 13:53:50 -0700 Subject: [PATCH 3/3] Add 'pull_request' back to ci.yml --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b26d64e..65fed02 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,6 +1,6 @@ # .github/workflows/ci.yml name: CI -on: [push] +on: [push, pull_request] jobs: ci: