diff --git a/.gitignore b/.gitignore index 9f915715..55acbaab 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,9 @@ __pycache__ /*_data.json /*_embeddings.bin +# pytest configuration +pytest.local.ini + # Coverage .coverage .coverage.* diff --git a/typeagent/knowpro/answer_response_schema.py b/typeagent/knowpro/answer_response_schema.py index 563d9543..95819ff3 100644 --- a/typeagent/knowpro/answer_response_schema.py +++ b/typeagent/knowpro/answer_response_schema.py @@ -3,7 +3,8 @@ from typing import Literal, Annotated from typing_extensions import Doc -from pydantic.dataclasses import dataclass + +from .dataclasses import dataclass AnswerType = Literal[ "NoAnswer", # If question cannot be accurately answered from [ANSWER CONTEXT] diff --git a/typeagent/knowpro/dataclasses.py b/typeagent/knowpro/dataclasses.py new file mode 100644 index 00000000..8eb4482d --- /dev/null +++ b/typeagent/knowpro/dataclasses.py @@ -0,0 +1,35 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +"""Compatibility helpers for pydantic dataclasses.""" + +from collections.abc import Callable +from typing import Any, cast, overload + +from typing_extensions import dataclass_transform + +from pydantic.dataclasses import dataclass as _pydantic_dataclass + +from .field_helpers import CamelCaseField + + +@overload +def dataclass[T](__cls: type[T], /, **kwargs: Any) -> type[T]: ... + + +@overload +def dataclass[T](**kwargs: Any) -> Callable[[type[T]], type[T]]: ... + + +@dataclass_transform(field_specifiers=(CamelCaseField,)) +def dataclass[T]( + __cls: type[T] | None = None, /, **kwargs: Any +) -> Callable[[type[T]], type[T]] | type[T]: + """Wrapper that preserves pydantic behavior while informing type-checkers.""" + + def wrap(cls: type[T]) -> type[T]: + return cast(type[T], _pydantic_dataclass(cls, **kwargs)) + + if __cls is None: + return wrap + + return wrap(__cls) diff --git a/typeagent/knowpro/date_time_schema.py b/typeagent/knowpro/date_time_schema.py index e2c9581f..3aa0f5d1 100644 --- a/typeagent/knowpro/date_time_schema.py +++ b/typeagent/knowpro/date_time_schema.py @@ -1,10 +1,11 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from pydantic.dataclasses import dataclass from typing import Annotated from typing_extensions import Doc +from .dataclasses import dataclass + @dataclass class DateVal: diff --git a/typeagent/knowpro/interfaces.py b/typeagent/knowpro/interfaces.py index 396fbf89..423e8885 100644 --- a/typeagent/knowpro/interfaces.py +++ b/typeagent/knowpro/interfaces.py @@ -1,911 +1,20 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +"""Aggregated knowpro interfaces for backwards compatibility.""" -from abc import ABC, abstractmethod -from collections.abc import AsyncIterable, Iterable, Sequence -from datetime import ( - datetime as Datetime, # For export. - timedelta as Timedelta, # For export. -) -from enum import Enum -from typing import ( - Any, - ClassVar, - Literal, - NotRequired, - Protocol, - Self, - TypedDict, - runtime_checkable, -) +from __future__ import annotations -from pydantic.dataclasses import dataclass -from pydantic import Field, AliasChoices -import typechat +from .interfaces_core import * +from .interfaces_indexes import * +from .interfaces_search import * +from .interfaces_serialization import * +from .interfaces_storage import * -from ..aitools.embeddings import NormalizedEmbeddings -from . 
import kplib -from .field_helpers import CamelCaseField +from .interfaces_core import __all__ as _core_all +from .interfaces_indexes import __all__ as _indexes_all +from .interfaces_search import __all__ as _search_all +from .interfaces_serialization import __all__ as _serialization_all +from .interfaces_storage import __all__ as _storage_all - -class IKnowledgeSource(Protocol): - """A Knowledge Source is any object that returns knowledge.""" - - def get_knowledge(self) -> kplib.KnowledgeResponse: - """Retrieves knowledge from the source.""" - ... - - -class IKnowledgeExtractor(Protocol): - """Interface for extracting knowledge from messages.""" - - async def extract(self, message: str) -> typechat.Result[kplib.KnowledgeResponse]: - """Extract knowledge from a message.""" - ... - - -@dataclass -class DeletionInfo: - timestamp: str - reason: str | None = None - - -@dataclass -class IndexingStartPoints: - """Track collection sizes before adding new items.""" - - message_count: int - semref_count: int - - -@dataclass -class AddMessagesResult: - """Result of add_messages_with_indexing operation.""" - - messages_added: int - semrefs_added: int - - -# Messages are referenced by their sequential ordinal numbers. -type MessageOrdinal = int - - -class IMessageMetadata(Protocol): - """Metadata associated with a message.""" - - # The source ("senders") of the message - source: str | list[str] | None = None - - # The dest ("recipients") of the message - dest: str | list[str] | None = None - - -class IMessage[TMetadata: IMessageMetadata](IKnowledgeSource, Protocol): - """A message in a conversation - - A Message contains one or more text chunks. - """ - - # The text of the message, split into chunks. - text_chunks: list[str] - - # (Optional) tags associated with the message. - tags: list[str] - - # The (optional) timestamp of the message. - timestamp: str | None = None - - # (Future) Information about the deletion of the message. - deletion_info: DeletionInfo | None = None - - # Metadata associated with the message such as its source. - metadata: TMetadata | None = None - - -type SemanticRefOrdinal = int - - -@dataclass -class ScoredSemanticRefOrdinal: - semantic_ref_ordinal: SemanticRefOrdinal = CamelCaseField( - "The ordinal of the semantic reference" - ) - score: float = CamelCaseField("The relevance score") - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self.semantic_ref_ordinal}, {self.score})" - - def serialize(self) -> "ScoredSemanticRefOrdinalData": - return self.__pydantic_serializer__.to_python(self, by_alias=True) # type: ignore - - @staticmethod - def deserialize(data: "ScoredSemanticRefOrdinalData") -> "ScoredSemanticRefOrdinal": - return ScoredSemanticRefOrdinal.__pydantic_validator__.validate_python(data) # type: ignore - - -@dataclass -class ScoredMessageOrdinal: - message_ordinal: MessageOrdinal - score: float - - -class ITermToSemanticRefIndex(Protocol): - async def size(self) -> int: ... - - async def get_terms(self) -> list[str]: ... - - async def add_term( - self, - term: str, - semantic_ref_ordinal: SemanticRefOrdinal | ScoredSemanticRefOrdinal, - ) -> str: ... - - async def remove_term( - self, term: str, semantic_ref_ordinal: SemanticRefOrdinal - ) -> None: ... - - async def lookup_term(self, term: str) -> list[ScoredSemanticRefOrdinal] | None: ... - - async def clear(self) -> None: ... - - async def serialize(self) -> Any: ... - - async def deserialize(self, data: Any) -> None: ... 
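The new `typeagent/knowpro/dataclasses.py` wrapper introduced above accepts both the bare and the parameterized decorator forms while delegating to pydantic, and its `dataclass_transform(field_specifiers=(CamelCaseField,))` is what lets type checkers accept `CamelCaseField(...)` as a field default. A minimal sketch of how call sites can use it (the `Point` and `Label` classes here are hypothetical, not part of this PR):

```python
from typeagent.knowpro.dataclasses import dataclass
from typeagent.knowpro.field_helpers import CamelCaseField


@dataclass  # bare form: the wrapper is handed the class directly
class Point:
    x: int
    y: int


@dataclass(frozen=True)  # parameterized form: kwargs are forwarded to pydantic
class Label:
    text: str = CamelCaseField("Display text")


# Pydantic behavior is preserved, so validation still runs:
p = Point.__pydantic_validator__.validate_python({"x": 1, "y": 2})
assert (p.x, p.y) == (1, 2)
```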
- - -type KnowledgeType = Literal["entity", "action", "topic", "tag"] - - -@dataclass -class Topic: - knowledge_type: ClassVar[Literal["topic"]] = "topic" - text: str - - -@dataclass -class Tag: - knowledge_type: ClassVar[Literal["tag"]] = "tag" - text: str - - -type Knowledge = kplib.ConcreteEntity | kplib.Action | Topic | Tag - - -class TextLocationData(TypedDict): - messageOrdinal: MessageOrdinal - chunkOrdinal: int - - -@dataclass(order=True) -class TextLocation: - # The ordinal of the message. - message_ordinal: MessageOrdinal = CamelCaseField("The ordinal of the message") - # The ordinal of the chunk. - # In the end of a TextRange, 1 + ordinal of the last chunk in the range. - chunk_ordinal: int = CamelCaseField( - "The ordinal of the chunk; in the end of a TextRange, 1 + ordinal of the last chunk in the range", - default=0, - ) - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}({self.message_ordinal}, {self.chunk_ordinal})" - ) - - def serialize(self) -> TextLocationData: - return self.__pydantic_serializer__.to_python(self, by_alias=True) # type: ignore - - @staticmethod - def deserialize(data: TextLocationData) -> "TextLocation": - return TextLocation.__pydantic_validator__.validate_python(data) # type: ignore - - -class TextRangeData(TypedDict): - start: TextLocationData - end: NotRequired[TextLocationData | None] - - -# A text range within a session. -# TODO: Are TextRanges totally ordered? -@dataclass -class TextRange: - # The start of the range. - start: TextLocation - # The end of the range (exclusive). If None, the range is a single point. - end: TextLocation | None = None - - def __repr__(self) -> str: - if self.end is None: - return f"{self.__class__.__name__}({self.start})" - else: - return f"{self.__class__.__name__}({self.start}, {self.end})" - - def __eq__(self, other: object) -> bool: - if not isinstance(other, TextRange): - return NotImplemented - - if self.start != other.start: - return False - - # Get the effective end for both ranges - self_end = self.end or TextLocation( - self.start.message_ordinal, self.start.chunk_ordinal + 1 - ) - other_end = other.end or TextLocation( - other.start.message_ordinal, other.start.chunk_ordinal + 1 - ) - - return self_end == other_end - - def __lt__(self, other: Self) -> bool: - if self.start != other.start: - return self.start < other.start - self_end = self.end or TextLocation( - self.start.message_ordinal, self.start.chunk_ordinal + 1 - ) - other_end = other.end or TextLocation( - other.start.message_ordinal, other.start.chunk_ordinal + 1 - ) - return self_end < other_end - - def __gt__(self, other: Self) -> bool: - return other.__lt__(self) - - def __ge__(self, other: Self) -> bool: - return not self.__lt__(other) - - def __le__(self, other: Self) -> bool: - return not other.__lt__(self) - - def __contains__(self, other: Self) -> bool: - other_end = other.end or TextLocation( - other.start.message_ordinal, other.start.chunk_ordinal + 1 - ) - self_end = self.end or TextLocation( - self.start.message_ordinal, self.start.chunk_ordinal + 1 - ) - return self.start <= other.start and other_end <= self_end - - def serialize(self) -> TextRangeData: - return self.__pydantic_serializer__.to_python(self, by_alias=True, exclude_none=True) # type: ignore - - @staticmethod - def deserialize(data: TextRangeData) -> "TextRange": - return TextRange.__pydantic_validator__.validate_python(data) # type: ignore - - -# TODO: Implement serializing KnowledgeData (or import from kplib). 
-class KnowledgeData(TypedDict): - pass - - -class SemanticRefData(TypedDict): - semanticRefOrdinal: SemanticRefOrdinal - range: TextRangeData - knowledgeType: KnowledgeType - knowledge: KnowledgeData - - -@dataclass -class SemanticRef: - semantic_ref_ordinal: SemanticRefOrdinal = CamelCaseField( - "The ordinal of the semantic reference" - ) - range: TextRange = CamelCaseField("The text range of the semantic reference") - knowledge: Knowledge = CamelCaseField( - "The knowledge associated with this semantic reference" - ) - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self.semantic_ref_ordinal}, {self.range}, {self.knowledge.knowledge_type!r}, {self.knowledge})" - - def serialize(self) -> SemanticRefData: - from . import serialization - - return SemanticRefData( - semanticRefOrdinal=self.semantic_ref_ordinal, - range=self.range.serialize(), - knowledgeType=self.knowledge.knowledge_type, - knowledge=serialization.serialize_object(self.knowledge), - ) - - @staticmethod - def deserialize(data: SemanticRefData) -> "SemanticRef": - from . import serialization - - knowledge = serialization.deserialize_knowledge( - data["knowledgeType"], data["knowledge"] - ) - return SemanticRef( - semantic_ref_ordinal=data["semanticRefOrdinal"], - range=TextRange.deserialize(data["range"]), - knowledge=knowledge, - ) - - -@dataclass -class DateRange: - start: Datetime - # Inclusive. If None, the range is unbounded. - end: Datetime | None = None - - def __repr__(self) -> str: - if self.end is None: - return f"{self.__class__.__name__}({self.start!r})" - else: - return f"{self.__class__.__name__}({self.start!r}, {self.end!r})" - - def __contains__(self, datetime: Datetime) -> bool: - if self.end is None: - return self.start <= datetime - return self.start <= datetime <= self.end - - -# Term must be hashable to allow using it as a dict key or set member. -@dataclass(unsafe_hash=True) -class Term: - text: str - # Optional weighting for these matches. - weight: float | None = None - - def __repr__(self) -> str: - if self.weight is None: - return f"{self.__class__.__name__}({self.text!r})" - else: - return f"{self.__class__.__name__}({self.text!r}, {self.weight:.4g})" - - def serialize(self) -> "TermData": - return self.__pydantic_serializer__.to_python(self, by_alias=True, exclude_none=True) # type: ignore - - -# Allows for faster retrieval of name, value properties -@runtime_checkable -class IPropertyToSemanticRefIndex(Protocol): - async def size(self) -> int: ... - - async def get_values(self) -> list[str]: ... - - async def add_property( - self, - property_name: str, - value: str, - semantic_ref_ordinal: SemanticRefOrdinal | ScoredSemanticRefOrdinal, - ) -> None: ... - - async def lookup_property( - self, property_name: str, value: str - ) -> list[ScoredSemanticRefOrdinal] | None: ... - - async def clear(self) -> None: ... - - async def remove_property(self, prop_name: str, semref_id: int) -> None: ... - - async def remove_all_for_semref(self, semref_id: int) -> None: ... - - -@dataclass -class TimestampedTextRange: - timestamp: str - range: TextRange - - -# Return text ranges in the given date range. -class ITimestampToTextRangeIndex(Protocol): - # Contract (stable across providers): - # - Timestamps must be ISO-8601 strings sortable lexicographically. - # - lookup_range(DateRange) returns items with start <= t < end (end exclusive). - # If end is None, treat as a point query with end = start + epsilon. - async def size(self) -> int: ... 
- - async def add_timestamp( - self, message_ordinal: MessageOrdinal, timestamp: str - ) -> bool: ... - - async def add_timestamps( - self, message_timestamps: list[tuple[MessageOrdinal, str]] - ) -> None: ... - - async def lookup_range( - self, date_range: DateRange - ) -> list[TimestampedTextRange]: ... - - -class ITermToRelatedTerms(Protocol): - async def lookup_term(self, text: str) -> list[Term] | None: ... - - async def size(self) -> int: ... - - async def is_empty(self) -> bool: ... - - async def clear(self) -> None: ... - - async def add_related_term( - self, text: str, related_terms: Term | list[Term] - ) -> None: ... - - async def remove_term(self, text: str) -> None: ... - - async def serialize(self) -> "TermToRelatedTermsData": ... - - async def deserialize(self, data: "TermToRelatedTermsData | None") -> None: ... - - -class ITermToRelatedTermsFuzzy(Protocol): - async def size(self) -> int: ... - - async def add_terms(self, texts: list[str]) -> None: ... - - async def lookup_term( - self, - text: str, - max_hits: int | None = None, - min_score: float | None = None, - ) -> list[Term]: ... - - async def lookup_terms( - self, - texts: list[str], - max_hits: int | None = None, - min_score: float | None = None, - ) -> list[list[Term]]: ... - - -class ITermToRelatedTermsIndex(Protocol): - # Providers may implement aliases and fuzzy via separate tables, but must - # expose them through these properties. - @property - def aliases(self) -> ITermToRelatedTerms: ... - - @property - def fuzzy_index(self) -> ITermToRelatedTermsFuzzy | None: ... - - async def serialize(self) -> "TermsToRelatedTermsIndexData": ... - - async def deserialize(self, data: "TermsToRelatedTermsIndexData") -> None: ... - - -class ThreadData(TypedDict): - description: str - ranges: list[TextRangeData] - - -# A Thread is a set of text ranges in a conversation. -@dataclass -class Thread: - description: str - ranges: Sequence[TextRange] - - def serialize(self) -> ThreadData: - return self.__pydantic_serializer__.to_python(self, by_alias=True) # type: ignore - - @staticmethod - def deserialize(data: ThreadData) -> "Thread": - return Thread.__pydantic_validator__.validate_python(data) # type: ignore - - -type ThreadOrdinal = int - - -@dataclass -class ScoredThreadOrdinal: - thread_ordinal: ThreadOrdinal - score: float - - -class IConversationThreads(Protocol): - threads: list[Thread] - - async def add_thread(self, thread: Thread) -> None: ... - - async def lookup_thread( - self, - thread_description: str, - max_matches: int | None = None, - threshold_score: float | None = None, - ) -> list[ScoredThreadOrdinal] | None: ... - - def serialize(self) -> "ConversationThreadData[ThreadDataItem]": ... - - def deserialize(self, data: "ConversationThreadData[ThreadDataItem]") -> None: ... - - -@runtime_checkable -class IMessageTextIndex[TMessage: IMessage](Protocol): - - async def add_messages( - self, - messages: Iterable[TMessage], - ) -> None: ... - - async def add_messages_starting_at( - self, - start_message_ordinal: int, - messages: list[TMessage], - ) -> None: ... - - async def lookup_messages( - self, - message_text: str, - max_matches: int | None = None, - threshold_score: float | None = None, - ) -> list[ScoredMessageOrdinal]: ... - - async def lookup_messages_in_subset( - self, - message_text: str, - ordinals_to_search: list[MessageOrdinal], - max_matches: int | None = None, - threshold_score: float | None = None, - ) -> list[ScoredMessageOrdinal]: ... 
- - # Async alternatives to __len__ and __bool__ - async def size(self) -> int: ... - - async def is_empty(self) -> bool: ... - - # TODO: Others? - - async def serialize(self) -> "MessageTextIndexData": ... - - async def deserialize(self, data: "MessageTextIndexData") -> None: ... - - -class IConversationSecondaryIndexes[TMessage: IMessage](Protocol): - property_to_semantic_ref_index: IPropertyToSemanticRefIndex | None - timestamp_index: ITimestampToTextRangeIndex | None - term_to_related_terms_index: ITermToRelatedTermsIndex | None - threads: IConversationThreads | None = None - message_index: IMessageTextIndex[TMessage] | None = None - - -class IConversation[ - TMessage: IMessage, - TTermToSemanticRefIndex: ITermToSemanticRefIndex, -](Protocol): - name_tag: str - tags: list[str] - messages: "IMessageCollection[TMessage]" - semantic_refs: "ISemanticRefCollection" - semantic_ref_index: TTermToSemanticRefIndex - secondary_indexes: IConversationSecondaryIndexes[TMessage] | None - - -# ------------- -# Search Types -# ------------- - - -@dataclass -class SearchTerm: - """Represents a term being searched for. - - Attributes: - term: The term being searched for. - related_terms: Additional terms related to the term. These can be supplied - from synonym tables and so on. - - An empty list indicates no related matches for this term. - - `None` indicates that the search processor may try to resolve related - terms from any available secondary indexes (e.g., ITermToRelatedTermsIndex). - """ - - term: Term - related_terms: list[Term] | None = CamelCaseField( - "Additional terms related to the term. These can be supplied from synonym tables and so on", - default=None, - ) - - -# Well-known knowledge properties. -type KnowledgePropertyName = Literal[ - "name", # the name of an entity - "type", # the type of an entity - "verb", # the verb of an action - "subject", # the subject of an action - "object", # the object of an action - "indirectObject", # the indirect object of an action - "tag", # tag - "topic", # topic -] - - -@dataclass -class PropertySearchTerm: - """PropertySearch terms let you match named property values. - - - You can match a well-known property name (e.g., name("Bach"), type("book")). - - Or you can provide a SearchTerm as a propertyName. - For example, to match hue(red): - - propertyName as SearchTerm, set to 'hue' - - propertyValue as SearchTerm, set to 'red' - We also want hue(red) to match any facets called color(red). - - SearchTerms can include related terms: - - For example, you could include "color" as a related term for the - propertyName "hue", or 'crimson' for red. - - The query processor can also resolve related terms using a - related terms secondary index, if one is available. - """ - - property_name: KnowledgePropertyName | SearchTerm = CamelCaseField( - "The property name to search for" - ) - property_value: SearchTerm = CamelCaseField("The property value to search for") - - -@dataclass -class SearchTermGroup: - """A group of search terms.""" - - boolean_op: Literal["and", "or", "or_max"] = CamelCaseField( - "The boolean operation to apply to the terms" - ) - terms: list["SearchTermGroupTypes"] = CamelCaseField( - "The list of search terms in this group", default_factory=list - ) - - -type SearchTermGroupTypes = SearchTerm | PropertySearchTerm | SearchTermGroup - - -@dataclass -class WhenFilter: - """Additional constraints on when a SemanticRef is considered a match. 
- - A SemanticRef matching a term is actually considered a match - when the following optional conditions are met (if present, must match): - knowledgeType matches, e.g. knowledgeType == 'entity' - dateRange matches, e.g. (Jan 3rd to Jan 10th) - Semantic Refs are within supplied SCOPE, - i.e. only Semantic Refs from a 'scoping' set of text ranges will match - """ - - knowledge_type: KnowledgeType | None = None - date_range: DateRange | None = None - thread_description: str | None = None - tags: list[str] | None = None - - # SCOPE DEFINITION - - # Search terms whose matching text ranges supply the scope for this query - scope_defining_terms: SearchTermGroup | None = None - # Additional scoping ranges separately computed by caller - text_ranges_in_scope: list[TextRange] | None = None - - -@dataclass -class SearchSelectExpr: - """An expression used to select structured contents of a conversation.""" - - search_term_group: SearchTermGroup = CamelCaseField( - "Term group that matches information" - ) # Term group that matches information - when: WhenFilter | None = None # Filter that scopes what information to match - - -@dataclass -class SemanticRefSearchResult: - """Result of a semantic reference search.""" - - term_matches: set[str] - semantic_ref_matches: list[ScoredSemanticRefOrdinal] - - -# -------------------------------------------------- -# Serialization formats use TypedDict and camelCase -# -------------------------------------------------- - - -class ThreadDataItem(TypedDict): - thread: ThreadData - embedding: list[float] | None # TODO: Why not NormalizedEmbedding? - - -class ConversationThreadData[TThreadDataItem: ThreadDataItem](TypedDict): - threads: list[TThreadDataItem] | None - - -class TermData(TypedDict): - text: str - weight: NotRequired[float | None] - - -class TermsToRelatedTermsDataItem(TypedDict): - termText: str - relatedTerms: list[TermData] - - -class TermToRelatedTermsData(TypedDict): - relatedTerms: NotRequired[list[TermsToRelatedTermsDataItem] | None] - - -class TextEmbeddingIndexData(TypedDict): - textItems: list[str] - embeddings: NormalizedEmbeddings | None - - -class TermsToRelatedTermsIndexData(TypedDict): - aliasData: NotRequired[TermToRelatedTermsData] - textEmbeddingData: NotRequired[TextEmbeddingIndexData] - - -class ScoredSemanticRefOrdinalData(TypedDict): - semanticRefOrdinal: SemanticRefOrdinal - score: float - - -class TermToSemanticRefIndexItemData(TypedDict): - term: str - semanticRefOrdinals: list[ScoredSemanticRefOrdinalData] - - -# Persistent form of a term index. 
-class TermToSemanticRefIndexData(TypedDict): - items: list[TermToSemanticRefIndexItemData] - - -class ConversationData[TMessageData](TypedDict): - nameTag: str - messages: list[TMessageData] - tags: list[str] - semanticRefs: list[SemanticRefData] | None - semanticIndexData: NotRequired[TermToSemanticRefIndexData | None] - - -class TextToTextLocationIndexData(TypedDict): - textLocations: list[TextLocationData] - embeddings: NormalizedEmbeddings | None - - -class MessageTextIndexData(TypedDict): - indexData: NotRequired[TextToTextLocationIndexData | None] - - -class ConversationDataWithIndexes[TMessageData](ConversationData[TMessageData]): - relatedTermsIndexData: NotRequired[TermsToRelatedTermsIndexData | None] - threadData: NotRequired[ConversationThreadData[ThreadDataItem] | None] - messageIndexData: NotRequired[MessageTextIndexData | None] - - -# -------------------------------- -# Indexing helper data structures -# -------------------------------- - - -# -------- -# Storage -# -------- - - -@dataclass -class ConversationMetadata: - """Storage-provider-agnostic metadata for a conversation. - - This dataclass represents metadata that can be read from and written to - any storage provider (SQLite, in-memory, etc.). Providers may store this - internally in different formats (e.g., key-value pairs), but this provides - a uniform interface for accessing conversation metadata. - - When passed to a storage provider during initialization: - - None values indicate the provider should auto-generate/use defaults - - Non-None values are used as-is - - When returned from get_conversation_metadata(): - - None values indicate the field was not found in the database - - Non-None values are the actual stored values - - If the database has no metadata rows, returns an instance with all fields None - """ - - name_tag: str | None = None - schema_version: int | None = None - created_at: Datetime | None = None - updated_at: Datetime | None = None - embedding_size: int | None = None - embedding_model: str | None = None - tags: list[str] | None = None - extra: dict[str, str] | None = None - - -class IReadonlyCollection[T, TOrdinal](AsyncIterable[T], Protocol): - async def size(self) -> int: ... - - async def get_item(self, arg: TOrdinal) -> T: ... - - async def get_slice(self, start: int, stop: int) -> list[T]: ... - - async def get_multiple(self, arg: list[TOrdinal]) -> list[T]: ... - - -class ICollection[T, TOrdinal](IReadonlyCollection[T, TOrdinal], Protocol): - """An APPEND-ONLY collection.""" - - @property - def is_persistent(self) -> bool: ... - - async def append(self, item: T) -> None: ... - - async def extend(self, items: Iterable[T]) -> None: - """Append multiple items to the collection.""" - # The default implementation just calls append for each item. - for item in items: - await self.append(item) - - -class IMessageCollection[TMessage: IMessage]( - ICollection[TMessage, MessageOrdinal], Protocol -): - """A collection of Messages.""" - - -class ISemanticRefCollection(ICollection[SemanticRef, SemanticRefOrdinal], Protocol): - """A collection of SemanticRefs.""" - - -class IStorageProvider[TMessage: IMessage](Protocol): - """API spec for storage providers -- maybe in-memory or persistent.""" - - async def get_message_collection(self) -> IMessageCollection[TMessage]: ... - - async def get_semantic_ref_collection(self) -> ISemanticRefCollection: ... - - # Index getters - ALL 6 index types for this conversation - async def get_semantic_ref_index(self) -> ITermToSemanticRefIndex: ... 
- - async def get_property_index(self) -> IPropertyToSemanticRefIndex: ... - - async def get_timestamp_index(self) -> ITimestampToTextRangeIndex: ... - - async def get_message_text_index(self) -> IMessageTextIndex[TMessage]: ... - - async def get_related_terms_index(self) -> ITermToRelatedTermsIndex: ... - - async def get_conversation_threads(self) -> IConversationThreads: ... - - # Metadata management - def get_conversation_metadata(self) -> ConversationMetadata: - """Get conversation metadata (missing fields set to None).""" - ... - - def set_conversation_metadata(self, **kwds: str | list[str] | None) -> None: - """Set or update conversation metadata key-value pairs. - - Args: - **kwds: Metadata keys and values where: - - str value: Sets a single key-value pair (replaces existing) - - list[str] value: Sets multiple values for the same key - - None value: Deletes all rows for the given key - """ - ... - - def update_conversation_timestamps( - self, - created_at: Datetime | None = None, - updated_at: Datetime | None = None, - ) -> None: - """Update conversation timestamps.""" - ... - - # Ingested source tracking - def is_source_ingested(self, source_id: str) -> bool: - """Check if a source has already been ingested.""" - ... - - def mark_source_ingested(self, source_id: str) -> None: - """Mark a source as ingested (no commit; call within transaction context).""" - ... - - # Transaction management - async def __aenter__(self) -> Self: - """Enter transaction context. Calls begin_transaction().""" - ... - - async def __aexit__( - self, - exc_type: type[BaseException] | None, - exc_val: BaseException | None, - exc_tb: Any, - ) -> None: - """Exit transaction context. Commits on success, rolls back on exception.""" - ... - - async def close(self) -> None: ... +# pyright: reportUnsupportedDunderAll=false +__all__ = _core_all + _indexes_all + _search_all + _serialization_all + _storage_all diff --git a/typeagent/knowpro/interfaces_core.py b/typeagent/knowpro/interfaces_core.py new file mode 100644 index 00000000..88413bd6 --- /dev/null +++ b/typeagent/knowpro/interfaces_core.py @@ -0,0 +1,422 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +"""Core conversation and knowledge interfaces for knowpro.""" + +from __future__ import annotations + +from datetime import datetime as Datetime +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Literal, + NotRequired, + Protocol, + Self, + TypedDict, +) + +import typechat +from pydantic.dataclasses import dataclass + +from . import kplib +from .field_helpers import CamelCaseField + +__all__ = [ + "AddMessagesResult", + "DateRange", + "DeletionInfo", + "ITermToSemanticRefIndex", + "Datetime", + "IKnowledgeExtractor", + "IKnowledgeSource", + "IMessage", + "IMessageMetadata", + "IndexingStartPoints", + "Knowledge", + "KnowledgeData", + "KnowledgeType", + "MessageOrdinal", + "ScoredMessageOrdinal", + "ScoredSemanticRefOrdinal", + "SemanticRef", + "SemanticRefData", + "SemanticRefOrdinal", + "Tag", + "Term", + "TextLocation", + "TextLocationData", + "TextRange", + "TextRangeData", + "Topic", +] + +if TYPE_CHECKING: + from .interfaces_serialization import ScoredSemanticRefOrdinalData, TermData + + +class IKnowledgeSource(Protocol): + """A Knowledge Source is any object that returns knowledge.""" + + def get_knowledge(self) -> kplib.KnowledgeResponse: + """Retrieves knowledge from the source.""" + ... 
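Since `IKnowledgeSource` is a `Protocol`, conforming classes need no inheritance; anything with a matching `get_knowledge` method type-checks structurally. A small sketch (the `CannedSource` class is illustrative only, not in the codebase):

```python
from typeagent.knowpro import kplib


class CannedSource:
    """Satisfies IKnowledgeSource structurally -- no explicit base class."""

    def __init__(self, response: kplib.KnowledgeResponse) -> None:
        self._response = response

    def get_knowledge(self) -> kplib.KnowledgeResponse:
        return self._response
```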
+ + +class IKnowledgeExtractor(Protocol): + """Interface for extracting knowledge from messages.""" + + async def extract(self, message: str) -> typechat.Result[kplib.KnowledgeResponse]: + """Extract knowledge from a message.""" + ... + + +@dataclass +class DeletionInfo: + timestamp: str + reason: str | None = None + + +@dataclass +class IndexingStartPoints: + """Track collection sizes before adding new items.""" + + message_count: int + semref_count: int + + +@dataclass +class AddMessagesResult: + """Result of add_messages_with_indexing operation.""" + + messages_added: int + semrefs_added: int + + +# Messages are referenced by their sequential ordinal numbers. +type MessageOrdinal = int + + +class IMessageMetadata(Protocol): + """Metadata associated with a message.""" + + # The source ("senders") of the message + source: str | list[str] | None = None + + # The dest ("recipients") of the message + dest: str | list[str] | None = None + + +class IMessage[TMetadata: IMessageMetadata](IKnowledgeSource, Protocol): + """A message in a conversation. + + A Message contains one or more text chunks. + """ + + # The text of the message, split into chunks. + text_chunks: list[str] + + # (Optional) tags associated with the message. + tags: list[str] + + # The (optional) timestamp of the message. + timestamp: str | None = None + + # (Future) Information about the deletion of the message. + deletion_info: DeletionInfo | None = None + + # Metadata associated with the message such as its source. + metadata: TMetadata | None = None + + +# Semantic references are also ordinal. +type SemanticRefOrdinal = int + + +@dataclass +class ScoredSemanticRefOrdinal: + semantic_ref_ordinal: SemanticRefOrdinal = CamelCaseField( + "The ordinal of the semantic reference" + ) + score: float = CamelCaseField("The relevance score") + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.semantic_ref_ordinal}, {self.score})" + + def serialize(self) -> ScoredSemanticRefOrdinalData: + return self.__pydantic_serializer__.to_python(self, by_alias=True) # type: ignore + + @staticmethod + def deserialize(data: ScoredSemanticRefOrdinalData) -> "ScoredSemanticRefOrdinal": + return ScoredSemanticRefOrdinal.__pydantic_validator__.validate_python(data) # type: ignore + + +@dataclass +class ScoredMessageOrdinal: + message_ordinal: MessageOrdinal + score: float + + +class ITermToSemanticRefIndex(Protocol): + async def size(self) -> int: ... + + async def get_terms(self) -> list[str]: ... + + async def add_term( + self, + term: str, + semantic_ref_ordinal: SemanticRefOrdinal | ScoredSemanticRefOrdinal, + ) -> str: ... + + async def remove_term( + self, term: str, semantic_ref_ordinal: SemanticRefOrdinal + ) -> None: ... + + async def lookup_term(self, term: str) -> list[ScoredSemanticRefOrdinal] | None: ... + + async def clear(self) -> None: ... + + async def serialize(self) -> Any: ... + + async def deserialize(self, data: Any) -> None: ... + + +# Knowledge modeling --------------------------------------------------------- + +type KnowledgeType = Literal["entity", "action", "topic", "tag"] + + +@dataclass +class Topic: + knowledge_type: ClassVar[Literal["topic"]] = "topic" + text: str + + +@dataclass +class Tag: + knowledge_type: ClassVar[Literal["tag"]] = "tag" + text: str + + +type Knowledge = kplib.ConcreteEntity | kplib.Action | Topic | Tag + + +class TextLocationData(TypedDict): + messageOrdinal: MessageOrdinal + chunkOrdinal: int + + +@dataclass(order=True) +class TextLocation: + # The ordinal of the message. 
+ message_ordinal: MessageOrdinal = CamelCaseField("The ordinal of the message") + # The ordinal of the chunk. + # In the end of a TextRange, 1 + ordinal of the last chunk in the range. + chunk_ordinal: int = CamelCaseField( + "The ordinal of the chunk; in the end of a TextRange, 1 + ordinal of the last chunk in the range", + default=0, + ) + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}({self.message_ordinal}, {self.chunk_ordinal})" + ) + + def serialize(self) -> TextLocationData: + return self.__pydantic_serializer__.to_python(self, by_alias=True) # type: ignore + + @staticmethod + def deserialize(data: TextLocationData) -> "TextLocation": + return TextLocation.__pydantic_validator__.validate_python(data) # type: ignore + + +class TextRangeData(TypedDict): + start: TextLocationData + end: NotRequired[TextLocationData | None] + + +# TODO: Are TextRanges totally ordered? +@dataclass +class TextRange: + """A text range within a session.""" + + start: TextLocation # The start of the range. + end: TextLocation | None = None # exclusive end; None indicates a single point + + def __repr__(self) -> str: + if self.end is None: + return f"{self.__class__.__name__}({self.start})" + else: + return f"{self.__class__.__name__}({self.start}, {self.end})" + + def __eq__(self, other: object) -> bool: + if not isinstance(other, TextRange): + return NotImplemented + + if self.start != other.start: + return False + + # Get the effective end for both ranges + self_end = self.end or TextLocation( + self.start.message_ordinal, self.start.chunk_ordinal + 1 + ) + other_end = other.end or TextLocation( + other.start.message_ordinal, other.start.chunk_ordinal + 1 + ) + return self_end == other_end + + def __lt__(self, other: Self) -> bool: + if self.start != other.start: + return self.start < other.start + self_end = self.end or TextLocation( + self.start.message_ordinal, self.start.chunk_ordinal + 1 + ) + other_end = other.end or TextLocation( + other.start.message_ordinal, other.start.chunk_ordinal + 1 + ) + return self_end < other_end + + def __gt__(self, other: Self) -> bool: + return other.__lt__(self) + + def __ge__(self, other: Self) -> bool: + return not self.__lt__(other) + + def __le__(self, other: Self) -> bool: + return not other.__lt__(self) + + def __contains__(self, other: Self) -> bool: + other_end = other.end or TextLocation( + other.start.message_ordinal, other.start.chunk_ordinal + 1 + ) + self_end = self.end or TextLocation( + self.start.message_ordinal, self.start.chunk_ordinal + 1 + ) + return self.start <= other.start and other_end <= self_end + + def serialize(self) -> TextRangeData: + return self.__pydantic_serializer__.to_python( # type: ignore + self, by_alias=True, exclude_none=True + ) + + @staticmethod + def deserialize(data: TextRangeData) -> "TextRange": + return TextRange.__pydantic_validator__.validate_python(data) # type: ignore + + +# TODO: Implement serializing KnowledgeData (or import from kplib). 
+class KnowledgeData(TypedDict):
+    pass
+
+
+class SemanticRefData(TypedDict):
+    semanticRefOrdinal: SemanticRefOrdinal
+    range: TextRangeData
+    knowledgeType: KnowledgeType
+    knowledge: KnowledgeData
+
+
+@dataclass
+class SemanticRef:
+    semantic_ref_ordinal: SemanticRefOrdinal = CamelCaseField(
+        "The ordinal of the semantic reference"
+    )
+    range: TextRange = CamelCaseField("The text range of the semantic reference")
+    knowledge: Knowledge = CamelCaseField(
+        "The knowledge associated with this semantic reference"
+    )
+
+    def __repr__(self) -> str:
+        return (
+            f"{self.__class__.__name__}({self.semantic_ref_ordinal}, {self.range}, "
+            f"{self.knowledge.knowledge_type!r}, {self.knowledge})"
+        )
+
+    def serialize(self) -> SemanticRefData:
+        from . import serialization
+
+        return SemanticRefData(
+            semanticRefOrdinal=self.semantic_ref_ordinal,
+            range=self.range.serialize(),
+            knowledgeType=self.knowledge.knowledge_type,
+            knowledge=serialization.serialize_object(self.knowledge),
+        )
+
+    @staticmethod
+    def deserialize(data: SemanticRefData) -> "SemanticRef":
+        from . import serialization
+
+        knowledge = serialization.deserialize_knowledge(
+            data["knowledgeType"], data["knowledge"]
+        )
+        return SemanticRef(
+            semantic_ref_ordinal=data["semanticRefOrdinal"],
+            range=TextRange.deserialize(data["range"]),
+            knowledge=knowledge,
+        )
+
+
+@dataclass
+class DateRange:
+    start: Datetime
+    end: Datetime | None = None  # inclusive; None means unbounded
+
+    def __repr__(self) -> str:
+        if self.end is None:
+            return f"{self.__class__.__name__}({self.start!r})"
+        else:
+            return f"{self.__class__.__name__}({self.start!r}, {self.end!r})"
+
+    def __contains__(self, datetime: Datetime) -> bool:
+        if self.end is None:
+            return self.start <= datetime
+        return self.start <= datetime <= self.end
+
+
+@dataclass(unsafe_hash=True)
+class Term:
+    """A term; must be hashable to allow using it as a dict key or set member."""
+
+    text: str
+    weight: float | None = None  # Optional weighting for these matches.
+
+    def __repr__(self) -> str:
+        if self.weight is None:
+            return f"{self.__class__.__name__}({self.text!r})"
+        else:
+            return f"{self.__class__.__name__}({self.text!r}, {self.weight:.4g})"
+
+    def serialize(self) -> TermData:
+        return self.__pydantic_serializer__.to_python(  # type: ignore
+            self, by_alias=True, exclude_none=True
+        )
diff --git a/typeagent/knowpro/interfaces_indexes.py b/typeagent/knowpro/interfaces_indexes.py
new file mode 100644
index 00000000..941b45b0
--- /dev/null
+++ b/typeagent/knowpro/interfaces_indexes.py
@@ -0,0 +1,258 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""Index-related protocols and helpers for knowpro conversations.""" + +from __future__ import annotations + +from collections.abc import Iterable, Sequence +from typing import TYPE_CHECKING, Protocol, runtime_checkable + +from pydantic.dataclasses import dataclass +from .interfaces_core import ( + DateRange, + IMessage, + MessageOrdinal, + ScoredMessageOrdinal, + ScoredSemanticRefOrdinal, + SemanticRefOrdinal, + Term, + TextRange, +) +from .interfaces_serialization import ( + ConversationThreadData, + MessageTextIndexData, + TermToRelatedTermsData, + TermsToRelatedTermsIndexData, + ThreadData, + ThreadDataItem, +) + +__all__ = [ + "IConversationSecondaryIndexes", + "IConversationThreads", + "IMessageTextIndex", + "IPropertyToSemanticRefIndex", + "ITermToRelatedTerms", + "ITermToRelatedTermsFuzzy", + "ITermToRelatedTermsIndex", + "ITimestampToTextRangeIndex", + "ScoredThreadOrdinal", + "Thread", + "ThreadOrdinal", + "TimestampedTextRange", +] + + +@runtime_checkable +class IPropertyToSemanticRefIndex(Protocol): + """Allows for faster retrieval of name, value properties.""" + + async def size(self) -> int: ... + + async def get_values(self) -> list[str]: ... + + async def add_property( + self, + property_name: str, + value: str, + semantic_ref_ordinal: SemanticRefOrdinal | ScoredSemanticRefOrdinal, + ) -> None: ... + + async def lookup_property( + self, property_name: str, value: str + ) -> list[ScoredSemanticRefOrdinal] | None: ... + + async def clear(self) -> None: ... + + async def remove_property(self, prop_name: str, semref_id: int) -> None: ... + + async def remove_all_for_semref(self, semref_id: int) -> None: ... + + +@dataclass +class TimestampedTextRange: + timestamp: str + range: TextRange + + +class ITimestampToTextRangeIndex(Protocol): + """Return text ranges over a date range.""" + + # Contract (stable across providers): + # - Timestamps must be ISO-8601 strings sortable lexicographically. + # - lookup_range(DateRange) returns items with start <= t < end (end exclusive). + # If end is None, treat as a point query with end = start + epsilon. + + async def size(self) -> int: ... + + async def add_timestamp( + self, message_ordinal: MessageOrdinal, timestamp: str + ) -> bool: ... + + async def add_timestamps( + self, message_timestamps: list[tuple[MessageOrdinal, str]] + ) -> None: ... + + async def lookup_range( + self, date_range: DateRange + ) -> list[TimestampedTextRange]: ... + + +class ITermToRelatedTerms(Protocol): + async def lookup_term(self, text: str) -> list[Term] | None: ... + + async def size(self) -> int: ... + + async def is_empty(self) -> bool: ... + + async def clear(self) -> None: ... + + async def add_related_term( + self, text: str, related_terms: Term | list[Term] + ) -> None: ... + + async def remove_term(self, text: str) -> None: ... + + async def serialize(self) -> TermToRelatedTermsData: ... + + async def deserialize(self, data: TermToRelatedTermsData | None) -> None: ... + + +class ITermToRelatedTermsFuzzy(Protocol): + async def size(self) -> int: ... + + async def add_terms(self, texts: list[str]) -> None: ... + + async def lookup_term( + self, + text: str, + max_hits: int | None = None, + min_score: float | None = None, + ) -> list[Term]: ... + + async def lookup_terms( + self, + texts: list[str], + max_hits: int | None = None, + min_score: float | None = None, + ) -> list[list[Term]]: ... + + +class ITermToRelatedTermsIndex(Protocol): + # Providers may implement aliases and fuzzy via separate tables, but must + # expose them through these properties. 
+    @property
+    def aliases(self) -> ITermToRelatedTerms: ...
+
+    @property
+    def fuzzy_index(self) -> ITermToRelatedTermsFuzzy | None: ...
+
+    async def serialize(self) -> TermsToRelatedTermsIndexData: ...
+
+    async def deserialize(self, data: TermsToRelatedTermsIndexData) -> None: ...
+
+
+@dataclass
+class Thread:
+    """A conversation thread consisting of a description and associated text ranges."""
+
+    description: str
+    ranges: Sequence[TextRange]
+
+    def serialize(self) -> ThreadData:
+        return self.__pydantic_serializer__.to_python(self, by_alias=True)  # type: ignore
+
+    @staticmethod
+    def deserialize(data: ThreadData) -> "Thread":
+        return Thread.__pydantic_validator__.validate_python(data)  # type: ignore
+
+
+type ThreadOrdinal = int
+
+
+@dataclass
+class ScoredThreadOrdinal:
+    thread_ordinal: ThreadOrdinal
+    score: float
+
+
+class IConversationThreads(Protocol):
+    threads: list[Thread]
+
+    async def add_thread(self, thread: Thread) -> None: ...
+
+    async def lookup_thread(
+        self,
+        thread_description: str,
+        max_matches: int | None = None,
+        threshold_score: float | None = None,
+    ) -> list[ScoredThreadOrdinal] | None: ...
+
+    def serialize(self) -> ConversationThreadData[ThreadDataItem]: ...
+
+    def deserialize(self, data: ConversationThreadData[ThreadDataItem]) -> None: ...
+
+
+@runtime_checkable
+class IMessageTextIndex[TMessage: IMessage](Protocol):
+    async def add_messages(
+        self,
+        messages: Iterable[TMessage],
+    ) -> None: ...
+
+    async def add_messages_starting_at(
+        self,
+        start_message_ordinal: int,
+        messages: list[TMessage],
+    ) -> None: ...
+
+    async def lookup_messages(
+        self,
+        message_text: str,
+        max_matches: int | None = None,
+        threshold_score: float | None = None,
+    ) -> list[ScoredMessageOrdinal]: ...
+
+    async def lookup_messages_in_subset(
+        self,
+        message_text: str,
+        ordinals_to_search: list[MessageOrdinal],
+        max_matches: int | None = None,
+        threshold_score: float | None = None,
+    ) -> list[ScoredMessageOrdinal]: ...
+
+    # Async alternatives to __len__ and __bool__
+
+    async def size(self) -> int: ...
+
+    async def is_empty(self) -> bool: ...
+
+    # TODO: Others?
+
+    async def serialize(self) -> MessageTextIndexData: ...
+
+    async def deserialize(self, data: MessageTextIndexData) -> None: ...
+
+
+class IConversationSecondaryIndexes[TMessage: IMessage](Protocol):
+    property_to_semantic_ref_index: IPropertyToSemanticRefIndex | None
+    timestamp_index: ITimestampToTextRangeIndex | None
+    term_to_related_terms_index: ITermToRelatedTermsIndex | None
+    threads: IConversationThreads | None = None
+    message_index: IMessageTextIndex[TMessage] | None = None
diff --git a/typeagent/knowpro/interfaces_search.py b/typeagent/knowpro/interfaces_search.py
new file mode 100644
index 00000000..71bb217e
--- /dev/null
+++ b/typeagent/knowpro/interfaces_search.py
@@ -0,0 +1,155 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""Search-related interfaces for knowpro.""" + +from __future__ import annotations + +from typing import Literal, TYPE_CHECKING + +from pydantic.dataclasses import dataclass +from .field_helpers import CamelCaseField +from .interfaces_core import ( + DateRange, + KnowledgeType, + ScoredSemanticRefOrdinal, + Term, + TextRange, +) + +__all__ = [ + "SearchTerm", + "KnowledgePropertyName", + "PropertySearchTerm", + "SearchTermGroup", + "SearchTermGroupTypes", + "WhenFilter", + "SearchSelectExpr", +] + + +@dataclass +class SearchTerm: + """Represents a term being searched for. + + Attributes: + term: The term being searched for. + related_terms: Additional terms related to the term. These can be supplied + from synonym tables and so on. + - An empty list indicates no related matches for this term. + - `None` indicates that the search processor may try to resolve related + terms from any available secondary indexes (e.g., ITermToRelatedTermsIndex). + """ + + term: Term + related_terms: list[Term] | None = CamelCaseField( + "Additional terms related to the term. These can be supplied from synonym tables and so on", + default=None, + ) + + +# Well-known knowledge properties. +type KnowledgePropertyName = Literal[ + "name", # the name of an entity + "type", # the type of an entity + "verb", # the verb of an action + "subject", # the subject of an action + "object", # the object of an action + "indirectObject", # the indirect object of an action + "tag", # tag + "topic", # topic +] + + +@dataclass +class PropertySearchTerm: + """PropertySearch terms let you match named property values. + + - You can match a well-known property name (e.g., name("Bach"), type("book")). + - Or you can provide a SearchTerm as a propertyName. + For example, to match hue(red): + - propertyName as SearchTerm, set to 'hue' + - propertyValue as SearchTerm, set to 'red' + We also want hue(red) to match any facets called color(red). + + SearchTerms can include related terms: + - For example, you could include "color" as a related term for the + propertyName "hue", or 'crimson' for red. + + The query processor can also resolve related terms using a + related terms secondary index, if one is available. + """ + + property_name: KnowledgePropertyName | SearchTerm = CamelCaseField( + "The property name to search for" + ) + property_value: SearchTerm = CamelCaseField("The property value to search for") + + +@dataclass +class SearchTermGroup: + """A group of search terms.""" + + boolean_op: Literal["and", "or", "or_max"] = CamelCaseField( + "The boolean operation to apply to the terms" + ) + terms: list["SearchTermGroupTypes"] = CamelCaseField( + "The list of search terms in this group", default_factory=list + ) + + +type SearchTermGroupTypes = SearchTerm | PropertySearchTerm | SearchTermGroup + + +@dataclass +class WhenFilter: + """Additional constraints on when a SemanticRef is considered a match. + + A SemanticRef matching a term is actually considered a match + when the following optional conditions are met (if present, must match): + knowledgeType matches, e.g. knowledgeType == 'entity' + dateRange matches, e.g. (Jan 3rd to Jan 10th) + Semantic Refs are within supplied SCOPE, + i.e. 
only Semantic Refs from a 'scoping' set of text ranges will match
+    """
+
+    knowledge_type: KnowledgeType | None = None
+    date_range: DateRange | None = None
+    thread_description: str | None = None
+    tags: list[str] | None = None
+
+    # SCOPE DEFINITION
+
+    # Search terms whose matching text ranges supply the scope for this query
+    scope_defining_terms: SearchTermGroup | None = None
+    # Additional scoping ranges separately computed by caller
+    text_ranges_in_scope: list[TextRange] | None = None
+
+
+@dataclass
+class SearchSelectExpr:
+    """An expression used to select structured contents of a conversation."""
+
+    search_term_group: SearchTermGroup = CamelCaseField(
+        "Term group that matches information"
+    )  # Term group that matches information
+    when: WhenFilter | None = None  # Filter that scopes what information to match
+
+
+@dataclass
+class SemanticRefSearchResult:
+    """Result of a semantic reference search."""
+
+    term_matches: set[str]
+    semantic_ref_matches: list[ScoredSemanticRefOrdinal]
diff --git a/typeagent/knowpro/interfaces_serialization.py b/typeagent/knowpro/interfaces_serialization.py
new file mode 100644
index 00000000..cb7178d1
--- /dev/null
+++ b/typeagent/knowpro/interfaces_serialization.py
@@ -0,0 +1,128 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""TypedDict helpers for serializing knowpro conversations and indexes."""
+
+from __future__ import annotations
+
+from typing import NotRequired, TypedDict
+
+from ..aitools.embeddings import NormalizedEmbeddings
+from .interfaces_core import (
+    SemanticRefData,
+    TextLocationData,
+    TextRangeData,
+)
+
+__all__ = [
+    "ConversationData",
+    "ConversationDataWithIndexes",
+    "ConversationThreadData",
+    "MessageTextIndexData",
+    "ScoredSemanticRefOrdinalData",
+    "TermData",
+    "TermToRelatedTermsData",
+    "TermsToRelatedTermsDataItem",
+    "TermToSemanticRefIndexData",
+    "TermToSemanticRefIndexItemData",
+    "TermsToRelatedTermsIndexData",
+    "TextEmbeddingIndexData",
+    "TextToTextLocationIndexData",
+    "ThreadData",
+    "ThreadDataItem",
+]
+
+
+class ThreadData(TypedDict):
+    description: str
+    ranges: list[TextRangeData]
+
+
+class ThreadDataItem(TypedDict):
+    thread: ThreadData
+    embedding: list[float] | None  # TODO: Why not NormalizedEmbedding?
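These TypedDicts describe the camelCase wire format as plain dicts, so a payload can be written literally. A sketch of the two shapes defined so far (the values are made up):

```python
from typeagent.knowpro.interfaces_serialization import ThreadData, ThreadDataItem

thread: ThreadData = {
    "description": "Planning discussion",
    # TextRangeData: "end" is NotRequired, so omitting it encodes a point range.
    "ranges": [{"start": {"messageOrdinal": 0, "chunkOrdinal": 0}}],
}
item: ThreadDataItem = {"thread": thread, "embedding": None}
```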
+
+
+class ConversationThreadData[TThreadDataItem: ThreadDataItem](TypedDict):
+    threads: list[TThreadDataItem] | None
+
+
+class TermData(TypedDict):
+    text: str
+    weight: NotRequired[float | None]
+
+
+class TermsToRelatedTermsDataItem(TypedDict):
+    termText: str
+    relatedTerms: list[TermData]
+
+
+class TermToRelatedTermsData(TypedDict):
+    relatedTerms: NotRequired[list[TermsToRelatedTermsDataItem] | None]
+
+
+class TextEmbeddingIndexData(TypedDict):
+    textItems: list[str]
+    embeddings: NormalizedEmbeddings | None
+
+
+class TermsToRelatedTermsIndexData(TypedDict):
+    aliasData: NotRequired[TermToRelatedTermsData]
+    textEmbeddingData: NotRequired[TextEmbeddingIndexData]
+
+
+class ScoredSemanticRefOrdinalData(TypedDict):
+    semanticRefOrdinal: int
+    score: float
+
+
+class TermToSemanticRefIndexItemData(TypedDict):
+    term: str
+    semanticRefOrdinals: list[ScoredSemanticRefOrdinalData]
+
+
+class TermToSemanticRefIndexData(TypedDict):
+    """Persistent form of a term index."""
+
+    items: list[TermToSemanticRefIndexItemData]
+
+
+class ConversationData[TMessageData](TypedDict):
+    nameTag: str
+    messages: list[TMessageData]
+    tags: list[str]
+    semanticRefs: list[SemanticRefData] | None
+    semanticIndexData: NotRequired[TermToSemanticRefIndexData | None]
+
+
+class TextToTextLocationIndexData(TypedDict):
+    textLocations: list[TextLocationData]
+    embeddings: NormalizedEmbeddings | None
+
+
+class MessageTextIndexData(TypedDict):
+    indexData: NotRequired[TextToTextLocationIndexData | None]
+
+
+class ConversationDataWithIndexes[TMessageData](ConversationData[TMessageData]):
+    relatedTermsIndexData: NotRequired[TermsToRelatedTermsIndexData | None]
+    threadData: NotRequired[ConversationThreadData[ThreadDataItem] | None]
+    messageIndexData: NotRequired[MessageTextIndexData | None]
diff --git a/typeagent/knowpro/interfaces_storage.py b/typeagent/knowpro/interfaces_storage.py
new file mode 100644
index 00000000..20216eec
--- /dev/null
+++ b/typeagent/knowpro/interfaces_storage.py
@@ -0,0 +1,193 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""Storage provider and collection interfaces for knowpro."""
+
+from __future__ import annotations
+
+from collections.abc import AsyncIterable, Iterable
+from datetime import datetime as Datetime
+from typing import TYPE_CHECKING, Any, Protocol, Self
+
+from pydantic.dataclasses import dataclass
+from .interfaces_core import (
+    IMessage,
+    ITermToSemanticRefIndex,
+    MessageOrdinal,
+    SemanticRef,
+    SemanticRefOrdinal,
+)
+from .interfaces_indexes import (
+    IConversationSecondaryIndexes,
+    IConversationThreads,
+    IMessageTextIndex,
+    IPropertyToSemanticRefIndex,
+    ITermToRelatedTermsIndex,
+    ITimestampToTextRangeIndex,
+)
+
+__all__ = [
+    "ConversationMetadata",
+    "IConversation",
+    "IReadonlyCollection",
+    "ICollection",
+    "IMessageCollection",
+    "ISemanticRefCollection",
+    "IStorageProvider",
+]
+
+
+@dataclass
+class ConversationMetadata:
+    """Storage-provider-agnostic metadata for a conversation.
+
+    This dataclass represents metadata that can be read from and written to
+    any storage provider (SQLite, in-memory, etc.). Providers may store this
+    internally in different formats (e.g., key-value pairs), but this provides
+    a uniform interface for accessing conversation metadata.
+
+    When passed to a storage provider during initialization:
+    - None values indicate the provider should auto-generate/use defaults
+    - Non-None values are used as-is
+
+    When returned from get_conversation_metadata():
+    - None values indicate the field was not found in the database
+    - Non-None values are the actual stored values
+    - If the database has no metadata rows, returns an instance with all fields None
+    """
+
+    name_tag: str | None = None
+    schema_version: int | None = None
+    created_at: Datetime | None = None
+    updated_at: Datetime | None = None
+    embedding_size: int | None = None
+    embedding_model: str | None = None
+    tags: list[str] | None = None
+    extra: dict[str, str] | None = None
+
+
+class IReadonlyCollection[T, TOrdinal](AsyncIterable[T], Protocol):
+    async def size(self) -> int: ...
+
+    async def get_item(self, arg: TOrdinal) -> T: ...
+
+    async def get_slice(self, start: int, stop: int) -> list[T]: ...
+
+    async def get_multiple(self, arg: list[TOrdinal]) -> list[T]: ...
+
+
+class ICollection[T, TOrdinal](IReadonlyCollection[T, TOrdinal], Protocol):
+    """An APPEND-ONLY collection."""
+
+    @property
+    def is_persistent(self) -> bool: ...
+
+    async def append(self, item: T) -> None: ...
+
+    async def extend(self, items: Iterable[T]) -> None:
+        """Append multiple items to the collection."""
+        # The default implementation just calls append for each item.
+        for item in items:
+            await self.append(item)
+
+
+class IMessageCollection[TMessage: IMessage](
+    ICollection[TMessage, MessageOrdinal], Protocol
+):
+    """A collection of Messages."""
+
+
+class ISemanticRefCollection(ICollection[SemanticRef, SemanticRefOrdinal], Protocol):
+    """A collection of SemanticRefs."""
+
+
+class IStorageProvider[TMessage: IMessage](Protocol):
+    """API spec for storage providers -- maybe in-memory or persistent."""
+
+    async def get_message_collection(self) -> IMessageCollection[TMessage]: ...
+
+    async def get_semantic_ref_collection(self) -> ISemanticRefCollection: ...
+
+    # Index getters - ALL 6 index types for this conversation
+
+    async def get_semantic_ref_index(self) -> ITermToSemanticRefIndex: ...
+
+    async def get_property_index(self) -> IPropertyToSemanticRefIndex: ...
+
+    async def get_timestamp_index(self) -> ITimestampToTextRangeIndex: ...
+
+    async def get_message_text_index(self) -> IMessageTextIndex[TMessage]: ...
+
+    async def get_related_terms_index(self) -> ITermToRelatedTermsIndex: ...
+
+    async def get_conversation_threads(self) -> IConversationThreads: ...
+
+    # Metadata management
+
+    def get_conversation_metadata(self) -> ConversationMetadata:
+        """Get conversation metadata (missing fields set to None)."""
+        ...
+
+    def set_conversation_metadata(self, **kwds: str | list[str] | None) -> None:
+        """Set or update conversation metadata key-value pairs.
+
+        Args:
+            **kwds: Metadata keys and values where:
+                - str value: Sets a single key-value pair (replaces existing)
+                - list[str] value: Sets multiple values for the same key
+                - None value: Deletes all rows for the given key
+        """
+        ...
+
+    def update_conversation_timestamps(
+        self,
+        created_at: Datetime | None = None,
+        updated_at: Datetime | None = None,
+    ) -> None:
+        """Update conversation timestamps."""
+        ...
+
+    # Ingested source tracking
+    def is_source_ingested(self, source_id: str) -> bool:
+        """Check if a source has already been ingested."""
+        ...
+
+    def mark_source_ingested(self, source_id: str) -> None:
+        """Mark a source as ingested (no commit; call within transaction context)."""
+        ...
+
+    # Transaction management
+    async def __aenter__(self) -> Self:
+        """Enter transaction context. Calls begin_transaction()."""
+        ...
+
+    async def __aexit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: Any,
+    ) -> None:
+        """Exit transaction context. Commits on success, rolls back on exception."""
+        ...
+
+    async def close(self) -> None: ...
+
+
+class IConversation[
+    TMessage: IMessage,
+    TTermToSemanticRefIndex: ITermToSemanticRefIndex,
+](Protocol):
+    name_tag: str
+    tags: list[str]
+    messages: IMessageCollection[TMessage]
+    semantic_refs: ISemanticRefCollection
+    semantic_ref_index: TTermToSemanticRefIndex
+    secondary_indexes: IConversationSecondaryIndexes[TMessage] | None
diff --git a/typeagent/knowpro/kplib.py b/typeagent/knowpro/kplib.py
index 72ee1a85..f05c3d4b 100644
--- a/typeagent/knowpro/kplib.py
+++ b/typeagent/knowpro/kplib.py
@@ -7,11 +7,11 @@
 Comments that should go into the schema are in docstrings and Doc() annotations.
 """
 
-from pydantic.dataclasses import dataclass
 from pydantic import Field, AliasChoices
 from typing import Annotated, ClassVar, Literal
 from typing_extensions import Doc
 
+from .dataclasses import dataclass
 from .field_helpers import CamelCaseField
diff --git a/typeagent/knowpro/search.py b/typeagent/knowpro/search.py
index d6d4a14c..7121bc55 100644
--- a/typeagent/knowpro/search.py
+++ b/typeagent/knowpro/search.py
@@ -2,11 +2,11 @@
 # Licensed under the MIT License.
 
 from collections.abc import Callable
-from pydantic.dataclasses import dataclass
 from pydantic import Field, AliasChoices
 from typing import TypeGuard, cast, Annotated
 
 from .collections import MessageAccumulator, SemanticRefAccumulator
+from .dataclasses import dataclass
 from .field_helpers import CamelCaseField
 from .interfaces import (
     IConversation,
diff --git a/typeagent/knowpro/search_query_schema.py b/typeagent/knowpro/search_query_schema.py
index 0ad4b423..900ed48e 100644
--- a/typeagent/knowpro/search_query_schema.py
+++ b/typeagent/knowpro/search_query_schema.py
@@ -3,11 +3,11 @@
 
 # TODO: Move this file into knowpro.
 
-from pydantic.dataclasses import dataclass
 from pydantic import Field
 from typing import Annotated, Literal
 from typing_extensions import Doc
 
+from .dataclasses import dataclass
 from .field_helpers import CamelCaseField
 from .date_time_schema import DateTimeRange
diff --git a/typeagent/knowpro/universal_message.py b/typeagent/knowpro/universal_message.py
index a2eb142b..a01f3e28 100644
--- a/typeagent/knowpro/universal_message.py
+++ b/typeagent/knowpro/universal_message.py
@@ -7,9 +7,9 @@
 from typing import TypedDict
 
 from pydantic import Field
-from pydantic.dataclasses import dataclass as pydantic_dataclass
 
 from . 
import kplib +from .dataclasses import dataclass as pydantic_dataclass from .field_helpers import CamelCaseField from .interfaces import IKnowledgeSource, IMessage, IMessageMetadata diff --git a/typeagent/transcripts/transcript.py b/typeagent/transcripts/transcript.py index 0c5eda19..37e624bc 100644 --- a/typeagent/transcripts/transcript.py +++ b/typeagent/transcripts/transcript.py @@ -33,7 +33,6 @@ MessageOrdinal, SemanticRef, Term, - Timedelta, Topic, AddMessagesResult, IndexingStartPoints,
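A quick way to sanity-check the split is that the aggregator re-exports the same objects under the old import path; something like this smoke test sketch (not part of the PR) should pass:

```python
from typeagent.knowpro import interfaces
from typeagent.knowpro.interfaces_core import SemanticRef, Term
from typeagent.knowpro.interfaces_storage import IStorageProvider

# The old flat module and the new split modules expose identical objects.
assert interfaces.SemanticRef is SemanticRef
assert interfaces.Term is Term
assert interfaces.IStorageProvider is IStorageProvider
assert "SemanticRef" in interfaces.__all__
```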