diff --git a/impresso/api_client/models/find_entities_order_by.py b/impresso/api_client/models/find_entities_order_by.py index 419839d..5f24a93 100644 --- a/impresso/api_client/models/find_entities_order_by.py +++ b/impresso/api_client/models/find_entities_order_by.py @@ -26,3 +26,32 @@ def __str__(self) -> str: "-count", "-count-mentions", ] +"""Specifies the sorting order for entity results using string literals. + +This type defines the valid string values that can be used to specify the +field by which entity results should be ordered, and whether the order +should be ascending or descending. + +Possible ordering fields: +- `count`: Order by the total number of documents the entity appears in. +- `count-mentions`: Order by the total number of times the entity is mentioned across all documents. +- `name`: Order alphabetically by entity name. +- `relevance`: Order by relevance score (specific to the query context, often the default). + +Ascending order is the default (e.g., `"name"` sorts A-Z). +Descending order is indicated by a preceding hyphen (e.g., `"-count"` +sorts from the highest count to the lowest). + +Usage Example: + ```python + # Assume 'client' is an initialized API client instance + # Example: Find entities and sort by the number of mentions (descending) + entities_by_mentions = client.find_entities(query="some query", order_by="-count-mentions") + + # Example: Find entities and sort alphabetically by name (ascending) + entities_by_name = client.find_entities(query="another query", order_by="name") + ``` + +See Also: + `FindEntitiesOrderBy`: An enum representation of these literal values. 
+""" diff --git a/impresso/api_client/models/find_media_sources_order_by.py b/impresso/api_client/models/find_media_sources_order_by.py index 3393772..ec3002a 100644 --- a/impresso/api_client/models/find_media_sources_order_by.py +++ b/impresso/api_client/models/find_media_sources_order_by.py @@ -26,3 +26,35 @@ def __str__(self) -> str: "-lastIssue", "-countIssues", ] +"""Specifies the sorting order for media source results using string literals. + +This type defines the valid string values that can be used to specify the +field by which media source results should be ordered, and whether the order +should be ascending or descending. + +Possible ordering fields: +- `countIssues`: Order by the total number of issues available for the media source. +- `firstIssue`: Order by the publication date of the earliest available issue. +- `lastIssue`: Order by the publication date of the latest available issue. +- `name`: Order alphabetically by the media source's name. + +Ascending order is the default (e.g., `"name"` sorts A-Z, `"firstIssue"` sorts oldest to newest). +Descending order is indicated by a preceding hyphen (e.g., `"-countIssues"` +sorts from the highest count to the lowest, `"-lastIssue"` sorts newest to oldest). + +Usage Example: + ```python + # Assume 'client' is an initialized API client instance + # Example: Find media sources and sort by the number of issues (descending) + sources_by_issue_count = client.find_media_sources(order_by="-countIssues") + + # Example: Find media sources and sort alphabetically by name (ascending) + sources_by_name = client.find_media_sources(order_by="name") + + # Example: Find media sources and sort by the date of the last issue (newest first) + sources_by_last_issue = client.find_media_sources(order_by="-lastIssue") + ``` + +See Also: + `FindMediaSourcesOrderBy`: An enum representation of these literal values. 
+""" diff --git a/impresso/api_client/models/search_order_by.py b/impresso/api_client/models/search_order_by.py index 664ce40..ecaa2f5 100644 --- a/impresso/api_client/models/search_order_by.py +++ b/impresso/api_client/models/search_order_by.py @@ -6,9 +6,9 @@ class SearchOrderBy(str, Enum): DATE = "date" ID = "id" RELEVANCE = "relevance" - VALUE_0 = "-date" - VALUE_2 = "-relevance" - VALUE_5 = "-id" + VALUE_0 = "-date" # Descending date + VALUE_2 = "-relevance" # Descending relevance (default search behavior) + VALUE_5 = "-id" # Descending ID def __str__(self) -> str: return str(self.value) @@ -22,3 +22,30 @@ def __str__(self) -> str: "-relevance", "-id", ] +""" +Specifies the sorting order for search results using string literals. + +This type defines the valid string values that can be used to specify the +field by which search results should be ordered, and whether the order +should be ascending or descending. + +Ascending order is the default (e.g., `"date"` sorts from oldest to newest). +Descending order is indicated by a preceding hyphen (e.g., `"-date"` +sorts from newest to oldest). + +Usage Example: + ```python + # Example: Search for articles and sort by relevance (descending) + # Note: "-relevance" is often the default sorting for search APIs + results = client.search(query="example query", order_by="-relevance") + + # Example: Search and sort by date (ascending) + results_by_date = client.search(query="another query", order_by="date") + + # Example: Search and sort by date (descending) + results_by_date_desc = client.search(query="yet another query", order_by="-date") + ``` + +See Also: + `SearchOrderBy`: An enum representation of these literal values. 
+""" diff --git a/impresso/data_container.py b/impresso/data_container.py index b520d77..35bb944 100644 --- a/impresso/data_container.py +++ b/impresso/data_container.py @@ -10,7 +10,12 @@ class DataContainer(Generic[IT, T]): """ Generic container for responses from the Impresso API returned by resource methods (`get`, `find`). - Generally represents a single page of the result. + + Generally represents a single page of the result. To page through the + results, adjust the `offset` and `limit` parameters + in the corresponding resource method call (e.g., `client.newspapers.find`). + The `total`, `limit`, `offset`, and `size` properties provide information + about the current page and the overall result set. """ def __init__( @@ -76,43 +81,47 @@ def _get_preview_image_(self) -> str | None: @property def raw(self) -> dict[str, Any]: - """Returns the response data as a python dictionary.""" + """The response data as a Python dictionary.""" return getattr(self._data, "to_dict")() @property def pydantic(self) -> T: - """Returns the response data as a pydantic model.""" + """The response data as a pydantic model.""" return self._pydantic_model.model_validate(self.raw) @property def df(self) -> DataFrame: - """Returns the response data as a pandas dataframe.""" + """ + The response data for the current page as a pandas DataFrame. + + Note that this DataFrame only contains the items from the current + page of results, not the entire result set across all pages. 
+ """ return DataFrame.from_dict(self._data) # type: ignore @property def total(self) -> int: - """Total number of results.""" + """Total number of results available across all pages.""" return self.raw.get("pagination", {}).get("total", 0) @property def limit(self) -> int: - """Current page size.""" + """Maximum number of items requested for the current page.""" return self.raw.get("pagination", {}).get("limit", 0) @property def offset(self) -> int: - """Current page offset.""" + """The starting index (0-based) of the items on the current page.""" return self.raw.get("pagination", {}).get("offset", 0) @property def size(self) -> int: - """Current page size.""" + """Number of items actually present on the current page.""" return len(self.raw.get("data", [])) @property def url(self) -> str | None: """ - URL of an Impresso web application page - representing the result set from this container. + URL of an Impresso web application page representing the result set. """ return self._web_app_search_result_url diff --git a/impresso/resources/collections.py b/impresso/resources/collections.py index 638a29e..e542228 100644 --- a/impresso/resources/collections.py +++ b/impresso/resources/collections.py @@ -66,6 +66,28 @@ def total(self) -> int: class CollectionsResource(Resource): """ Work with collections. 
+ + Examples: + Find collections containing the term "war": + >>> results = collections.find(term="war") # doctest: +SKIP + >>> print(results.df) # doctest: +SKIP + + Get a specific collection by its ID: + >>> collection_id = "some-collection-id" # Replace with a real ID + >>> collection = collections.get(collection_id) # doctest: +SKIP + >>> print(collection.df) # doctest: +SKIP + + List items in a collection: + >>> items = collections.items(collection_id) # doctest: +SKIP + >>> print(items.df) # doctest: +SKIP + + Add items to a collection: + >>> item_ids_to_add = ["item-id-1", "item-id-2"] # Replace with real item IDs + >>> collections.add_items(collection_id, item_ids_to_add) # doctest: +SKIP + + Remove items from a collection: + >>> item_ids_to_remove = ["item-id-1"] # Replace with real item IDs + >>> collections.remove_items(collection_id, item_ids_to_remove) # doctest: +SKIP """ name = "collections" diff --git a/impresso/resources/search.py b/impresso/resources/search.py index 6b011e0..0bcac8a 100644 --- a/impresso/resources/search.py +++ b/impresso/resources/search.py @@ -86,15 +86,15 @@ def find( title: str | AND[str] | OR[str] | None = None, front_page: bool | None = None, entity_id: str | AND[str] | OR[str] | None = None, - newspaper_id: str | OR[str] | None = None, + newspaper_id: str | AND[str] | OR[str] | None = None, date_range: DateRange | None = None, - language: str | OR[str] | None = None, + language: str | AND[str] | OR[str] | None = None, mention: str | AND[str] | OR[str] | None = None, topic_id: str | AND[str] | OR[str] | None = None, - collection_id: str | OR[str] | None = None, - country: str | OR[str] | None = None, - partner_id: str | OR[str] | None = None, - text_reuse_cluster_id: str | OR[str] | None = None, + collection_id: str | AND[str] | OR[str] | None = None, + country: str | AND[str] | OR[str] | None = None, + partner_id: str | AND[str] | OR[str] | None = None, + text_reuse_cluster_id: str | AND[str] | OR[str] | None = None, ) -> 
SearchDataContainer: """ Search for content items in Impresso. @@ -179,15 +179,15 @@ def facet( title: str | AND[str] | OR[str] | None = None, front_page: bool | None = None, entity_id: str | AND[str] | OR[str] | None = None, - newspaper_id: str | OR[str] | None = None, + newspaper_id: str | AND[str] | OR[str] | None = None, date_range: DateRange | None = None, - language: str | OR[str] | None = None, + language: str | AND[str] | OR[str] | None = None, mention: str | AND[str] | OR[str] | None = None, topic_id: str | AND[str] | OR[str] | None = None, - collection_id: str | OR[str] | None = None, - country: str | OR[str] | None = None, - partner_id: str | OR[str] | None = None, - text_reuse_cluster_id: str | OR[str] | None = None, + collection_id: str | AND[str] | OR[str] | None = None, + country: str | AND[str] | OR[str] | None = None, + partner_id: str | AND[str] | OR[str] | None = None, + text_reuse_cluster_id: str | AND[str] | OR[str] | None = None, ) -> FacetDataContainer: facet_id = get_enum_from_literal(facet, GetSearchFacetId) @@ -258,15 +258,15 @@ def _build_filters( title: str | AND[str] | OR[str] | None = None, front_page: bool | None = None, entity_id: str | AND[str] | OR[str] | None = None, - newspaper_id: str | OR[str] | None = None, + newspaper_id: str | AND[str] | OR[str] | None = None, date_range: DateRange | None = None, - language: str | OR[str] | None = None, + language: str | AND[str] | OR[str] | None = None, mention: str | AND[str] | OR[str] | None = None, topic_id: str | AND[str] | OR[str] | None = None, - collection_id: str | OR[str] | None = None, - country: str | OR[str] | None = None, - partner_id: str | OR[str] | None = None, - text_reuse_cluster_id: str | OR[str] | None = None, + collection_id: str | AND[str] | OR[str] | None = None, + country: str | AND[str] | OR[str] | None = None, + partner_id: str | AND[str] | OR[str] | None = None, + text_reuse_cluster_id: str | AND[str] | OR[str] | None = None, ) -> list[Filter]: filters: 
list[Filter] = [] if string: diff --git a/impresso/resources/text_reuse/clusters.py b/impresso/resources/text_reuse/clusters.py index 97453dd..b91072c 100644 --- a/impresso/resources/text_reuse/clusters.py +++ b/impresso/resources/text_reuse/clusters.py @@ -51,7 +51,21 @@ def df(self) -> DataFrame: class TextReuseClustersResource(Resource): - """Text reuse clusters resource.""" + """ + Interact with the text reuse clusters endpoint of the Impresso API. + + This resource allows searching for text reuse clusters based on various criteria + and retrieving facet information about these clusters. + + Examples: + Find clusters with size between 10 and 20: + >>> results = textReuseClusters.find(cluster_size=(10, 20)) # doctest: +SKIP + >>> print(results.df) # doctest: +SKIP + + Get the distribution of newspapers involved in clusters: + >>> facet_results = textReuseClusters.facet(facet='newspaper', order_by='count') # doctest: +SKIP + >>> print(facet_results.df) # doctest: +SKIP + """ name = "textReuseClusters" @@ -75,6 +89,40 @@ def find( mention: str | AND[str] | OR[str] | None = None, entity_id: str | AND[str] | OR[str] | None = None, ) -> FindTextReuseClustersContainer: + """ + Find text reuse clusters based on various criteria. + + Args: + term: Search for clusters containing specific text. + title: Filter clusters by the title of the articles within them. + order_by: Specify the sorting order for the results. + cluster_size: Filter clusters by the number of items they contain. + lexical_overlap: Filter clusters by the lexical overlap score. + day_delta: Filter clusters by the time span (in days) between the first and last item. + date_range: Filter clusters based on the date range of their items. + newspaper_id: Filter clusters containing items from specific newspapers. + collection_id: Filter clusters containing items from specific collections. + limit: Maximum number of clusters to return. + offset: Number of clusters to skip from the beginning. 
+ front_page: Filter clusters containing items published on the front page. + topic_id: Filter clusters associated with specific topics. + language: Filter clusters by the language of their items. + country: Filter clusters by the country of publication of their items. + mention: Filter clusters containing specific mentions (named entities). + entity_id: Filter clusters associated with specific entity IDs. + + Returns: + FindTextReuseClustersContainer: A container holding the search results. + + Examples: + Find clusters with size between 10 and 20: + >>> results = textReuseClusters.find(cluster_size=(10, 20)) # doctest: +SKIP + >>> print(results.df) # doctest: +SKIP + + Find clusters related to 'politics' in Swiss newspapers: + >>> results = textReuseClusters.find(term='politics', country='CH') # doctest: +SKIP + >>> print(results.df) # doctest: +SKIP + """ filters = _build_filters( text=term, @@ -130,6 +178,37 @@ def facet( lexical_overlap: Range | AND[Range] | OR[Range] | None = None, day_delta: Range | AND[Range] | OR[Range] | None = None, ) -> FacetDataContainer: + """ + Get facet information for text reuse clusters based on specified filters. + + Facets provide aggregated counts for different properties of the clusters, + such as the distribution of cluster sizes or newspapers involved. + + Args: + facet: The specific facet to retrieve (e.g., 'newspaper', 'cluster_size'). + order_by: How to order the facet values (e.g., 'value', 'count'). + limit: Maximum number of facet values to return. + offset: Number of facet values to skip. + cluster_size: Filter clusters by size before calculating facets. + date_range: Filter clusters by date range before calculating facets. + newspaper_id: Filter clusters by newspaper before calculating facets. + lexical_overlap: Filter clusters by lexical overlap before calculating facets. + day_delta: Filter clusters by day delta before calculating facets. + + Returns: + FacetDataContainer: A container holding the facet results. 
+ + Examples: + Get the top 10 newspapers involved in clusters: + >>> facet_results = textReuseClusters.facet(facet='newspaper', limit=10, order_by='count') # doctest: +SKIP + >>> print(facet_results.df) # doctest: +SKIP + + Get the distribution of cluster sizes for clusters within a specific date range: + >>> from impresso.structures import DateRange + >>> date_filter = DateRange(start="1900-01-01", end="1910-12-31") + >>> facet_results = textReuseClusters.facet(facet='cluster_size', date_range=date_filter) # doctest: +SKIP + >>> print(facet_results.df) # doctest: +SKIP + """ facet_id = get_enum_from_literal(facet, GetTrClustersFacetId) if isinstance(facet_id, Unset): raise ValueError(f"{facet} is not a valid value") diff --git a/impresso/resources/text_reuse/passages.py b/impresso/resources/text_reuse/passages.py index 858d086..0f13149 100644 --- a/impresso/resources/text_reuse/passages.py +++ b/impresso/resources/text_reuse/passages.py @@ -73,6 +73,41 @@ def find( mention: str | AND[str] | OR[str] | None = None, entity_id: str | AND[str] | OR[str] | None = None, ) -> FindTextReusePassagesContainer: + """ + Find text reuse passages based on various criteria. + + Args: + term: Search for passages containing specific text. + limit: Maximum number of passages to return. + offset: Number of passages to skip from the beginning. + order_by: Specify the sorting order for the results. + cluster_id: Filter passages belonging to specific text reuse clusters. + cluster_size: Filter passages based on the size of the cluster they belong to. + title: Filter passages by the title of the articles they appear in. + lexical_overlap: Filter passages based on the lexical overlap score within their cluster. + day_delta: Filter passages based on the time span (in days) of their cluster. + date_range: Filter passages based on their publication date. + newspaper_id: Filter passages from specific newspapers. + collection_id: Filter passages from specific collections. 
+ front_page: Filter passages appearing on the front page. + topic_id: Filter passages associated with specific topics. + language: Filter passages by their language. + country: Filter passages by the country of publication. + mention: Filter passages containing specific mentions (named entities). + entity_id: Filter passages associated with specific entity IDs. + + Returns: + FindTextReusePassagesContainer: A container holding the search results. + + Examples: + Find passages containing the term 'revolution' from French newspapers: + >>> results = textReusePassages.find(term='revolution', country='FR') # doctest: +SKIP + >>> print(results.df) # doctest: +SKIP + + Find passages from clusters with a size greater than 50: + >>> results = textReusePassages.find(cluster_size=(51, None)) # doctest: +SKIP + >>> print(results.df) # doctest: +SKIP + """ # reusing build filters from clusters - they are the same filters = _build_filters( cluster_id=cluster_id, @@ -140,6 +175,47 @@ def facet( mention: str | AND[str] | OR[str] | None = None, entity_id: str | AND[str] | OR[str] | None = None, ) -> FacetDataContainer: + """ + Get facet information for text reuse passages based on specified filters. + + Facets provide aggregated counts for different properties of the passages, + such as the distribution of newspapers or languages. + + Args: + facet: The specific facet to retrieve (e.g., 'newspaper', 'language'). + term: Filter passages by text content before calculating facets. + limit: Maximum number of facet values to return. + offset: Number of facet values to skip. + order_by: How to order the facet values (e.g., 'value', 'count'). + cluster_id: Filter passages by cluster ID before calculating facets. + cluster_size: Filter passages by cluster size before calculating facets. + title: Filter passages by article title before calculating facets. + lexical_overlap: Filter passages by lexical overlap before calculating facets. 
+ day_delta: Filter passages by cluster day delta before calculating facets. + date_range: Filter passages by publication date before calculating facets. + newspaper_id: Filter passages by newspaper before calculating facets. + collection_id: Filter passages by collection before calculating facets. + front_page: Filter passages by front page status before calculating facets. + topic_id: Filter passages by topic ID before calculating facets. + language: Filter passages by language before calculating facets. + country: Filter passages by country before calculating facets. + mention: Filter passages by mention before calculating facets. + entity_id: Filter passages by entity ID before calculating facets. + + Returns: + FacetDataContainer: A container holding the facet results. + + Examples: + Get the top 10 newspapers associated with passages containing 'war': + >>> facet_results = textReusePassages.facet(facet='newspaper', term='war', limit=10) # doctest: +SKIP + >>> print(facet_results.df) # doctest: +SKIP + + Get the language distribution for passages published between 1914 and 1918: + >>> from impresso.structures import DateRange + >>> date_filter = DateRange(start="1914-01-01", end="1918-12-31") + >>> facet_results = textReusePassages.facet(facet='language', date_range=date_filter) # doctest: +SKIP + >>> print(facet_results.df) # doctest: +SKIP + """ facet_id = get_enum_from_literal(facet, GetTrPassagesFacetId) if isinstance(facet_id, Unset): raise ValueError(f"{facet} is not a valid value") diff --git a/impresso/structures.py b/impresso/structures.py index e09a2d3..d253c1f 100644 --- a/impresso/structures.py +++ b/impresso/structures.py @@ -180,7 +180,7 @@ def _as_date(value: datetime.date | str) -> datetime.date: class NumericRange: """ - Date range. + Numeric range. Example: