29 changes: 29 additions & 0 deletions impresso/api_client/models/find_entities_order_by.py
@@ -26,3 +26,32 @@ def __str__(self) -> str:
"-count",
"-count-mentions",
]
"""Specifies the sorting order for entity results using string literals.

This type defines the valid string values that can be used to specify the
field by which entity results should be ordered, and whether the order
should be ascending or descending.

Possible ordering fields:
- `count`: Order by the total number of documents the entity appears in.
- `count-mentions`: Order by the total number of times the entity is mentioned across all documents.
- `name`: Order alphabetically by entity name.
- `relevance`: Order by relevance score (specific to the query context, often the default).

Ascending order is the default (e.g., `"name"` sorts A-Z).
Descending order is indicated by a preceding hyphen (e.g., `"-count"`
sorts from the highest count to the lowest).

Usage Example:
```python
# Assume 'client' is an initialized API client instance
# Example: Find entities and sort by the number of mentions (descending)
entities_by_mentions = client.find_entities(query="some query", order_by="-count-mentions")

# Example: Find entities and sort alphabetically by name (ascending)
entities_by_name = client.find_entities(query="another query", order_by="name")
```

See Also:
`FindEntitiesOrderBy`: An enum representation of these literal values.
"""
32 changes: 32 additions & 0 deletions impresso/api_client/models/find_media_sources_order_by.py
@@ -26,3 +26,35 @@ def __str__(self) -> str:
"-lastIssue",
"-countIssues",
]
"""Specifies the sorting order for media source results using string literals.

This type defines the valid string values that can be used to specify the
field by which media source results should be ordered, and whether the order
should be ascending or descending.

Possible ordering fields:
- `countIssues`: Order by the total number of issues available for the media source.
- `firstIssue`: Order by the publication date of the earliest available issue.
- `lastIssue`: Order by the publication date of the latest available issue.
- `name`: Order alphabetically by the media source's name.

Ascending order is the default (e.g., `"name"` sorts A-Z, `"firstIssue"` sorts oldest to newest).
Descending order is indicated by a preceding hyphen (e.g., `"-countIssues"`
sorts from the highest count to the lowest, `"-lastIssue"` sorts newest to oldest).

Usage Example:
```python
# Assume 'client' is an initialized API client instance
# Example: Find media sources and sort by the number of issues (descending)
sources_by_issue_count = client.find_media_sources(order_by="-countIssues")

# Example: Find media sources and sort alphabetically by name (ascending)
sources_by_name = client.find_media_sources(order_by="name")

# Example: Find media sources and sort by the date of the last issue (newest first)
sources_by_last_issue = client.find_media_sources(order_by="-lastIssue")
```

See Also:
`FindMediaSourcesOrderBy`: An enum representation of these literal values.
"""
33 changes: 30 additions & 3 deletions impresso/api_client/models/search_order_by.py
@@ -6,9 +6,9 @@ class SearchOrderBy(str, Enum):
DATE = "date"
ID = "id"
RELEVANCE = "relevance"
VALUE_0 = "-date"
VALUE_2 = "-relevance"
VALUE_5 = "-id"
VALUE_0 = "-date" # Descending date
VALUE_2 = "-relevance" # Descending relevance (default search behavior)
VALUE_5 = "-id" # Descending ID

def __str__(self) -> str:
return str(self.value)
@@ -22,3 +22,30 @@ def __str__(self) -> str:
"-relevance",
"-id",
]
"""
Specifies the sorting order for search results using string literals.

This type defines the valid string values that can be used to specify the
field by which search results should be ordered, and whether the order
should be ascending or descending.

Ascending order is the default (e.g., `"date"` sorts from oldest to newest).
Descending order is indicated by a preceding hyphen (e.g., `"-date"`
sorts from newest to oldest).

Usage Example:
```python
# Example: Search for articles and sort by relevance (descending)
# Note: "-relevance" is often the default sorting for search APIs
results = client.search(query="example query", order_by="-relevance")

# Example: Search and sort by date (ascending)
results_by_date = client.search(query="another query", order_by="date")

# Example: Search and sort by date (descending)
results_by_date_desc = client.search(query="yet another query", order_by="-date")
```

See Also:
`SearchOrderBy`: An enum representation of these literal values.
"""
29 changes: 19 additions & 10 deletions impresso/data_container.py
@@ -10,7 +10,12 @@ class DataContainer(Generic[IT, T]):
"""
Generic container for responses from the Impresso API
returned by resource methods (`get`, `find`).
Generally represents a single page of the result.

Generally represents a single page of the result set. Results can be
paged through by adjusting the `offset` and `limit` parameters
of the corresponding resource method call (e.g., `client.newspapers.find`).
The `total`, `limit`, `offset`, and `size` properties describe
the current page and the overall result set.
"""

def __init__(
@@ -76,43 +76,47 @@ def _get_preview_image_(self) -> str | None:

@property
def raw(self) -> dict[str, Any]:
"""Returns the response data as a python dictionary."""
"""The response data as a python dictionary."""
return getattr(self._data, "to_dict")()

@property
def pydantic(self) -> T:
"""Returns the response data as a pydantic model."""
"""The response data as a pydantic model."""
return self._pydantic_model.model_validate(self.raw)

@property
def df(self) -> DataFrame:
"""Returns the response data as a pandas dataframe."""
"""
The response data for the current page as a pandas dataframe.

Note that this DataFrame only contains the items from the current
page of results, not the entire result set across all pages.
"""
return DataFrame.from_dict(self._data) # type: ignore

@property
def total(self) -> int:
"""Total number of results."""
"""Total number of results available across all pages."""
return self.raw.get("pagination", {}).get("total", 0)

@property
def limit(self) -> int:
"""Current page size."""
"""Maximum number of items requested for the current page."""
return self.raw.get("pagination", {}).get("limit", 0)

@property
def offset(self) -> int:
"""Current page offset."""
"""The starting index (0-based) of the items on the current page."""
return self.raw.get("pagination", {}).get("offset", 0)

@property
def size(self) -> int:
"""Current page size."""
"""Number of items actually present on the current page."""
return len(self.raw.get("data", []))

@property
def url(self) -> str | None:
"""
URL of an Impresso web application page
representing the result set from this container.
URL of an Impresso web application page representing the result set.
"""
return self._web_app_search_result_url
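
The pagination properties documented above (`total`, `limit`, `offset`, `size`) can drive a simple paging loop. A minimal sketch, assuming an initialized `client` and that `client.newspapers.find` accepts `limit` and `offset` as described in the class docstring:

```python
import pandas as pd

# Assumption: `client` is an initialized Impresso client and
# `client.newspapers.find` accepts `limit` and `offset` parameters.
pages = []
offset = 0
limit = 100
while True:
    page = client.newspapers.find(limit=limit, offset=offset)
    pages.append(page.df)       # DataFrame for the current page only
    offset += page.size         # advance by the number of items received
    if page.size == 0 or offset >= page.total:
        break

all_results = pd.concat(pages)  # all pages combined into one DataFrame
```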
22 changes: 22 additions & 0 deletions impresso/resources/collections.py
@@ -66,6 +66,28 @@ def total(self) -> int:
class CollectionsResource(Resource):
"""
Work with collections.

Examples:
Find collections containing the term "war":
>>> results = collections.find(term="war") # doctest: +SKIP
>>> print(results.df) # doctest: +SKIP

Get a specific collection by its ID:
>>> collection_id = "some-collection-id" # Replace with a real ID
>>> collection = collections.get(collection_id) # doctest: +SKIP
>>> print(collection.df) # doctest: +SKIP

List items in a collection:
>>> items = collections.items(collection_id) # doctest: +SKIP
>>> print(items.df) # doctest: +SKIP

Add items to a collection:
>>> item_ids_to_add = ["item-id-1", "item-id-2"] # Replace with real item IDs
>>> collections.add_items(collection_id, item_ids_to_add) # doctest: +SKIP

Remove items from a collection:
>>> item_ids_to_remove = ["item-id-1"] # Replace with real item IDs
>>> collections.remove_items(collection_id, item_ids_to_remove) # doctest: +SKIP
"""

name = "collections"
36 changes: 18 additions & 18 deletions impresso/resources/search.py
@@ -86,15 +86,15 @@ def find(
title: str | AND[str] | OR[str] | None = None,
front_page: bool | None = None,
entity_id: str | AND[str] | OR[str] | None = None,
newspaper_id: str | OR[str] | None = None,
newspaper_id: str | AND[str] | OR[str] | None = None,
date_range: DateRange | None = None,
language: str | OR[str] | None = None,
language: str | AND[str] | OR[str] | None = None,
mention: str | AND[str] | OR[str] | None = None,
topic_id: str | AND[str] | OR[str] | None = None,
collection_id: str | OR[str] | None = None,
country: str | OR[str] | None = None,
partner_id: str | OR[str] | None = None,
text_reuse_cluster_id: str | OR[str] | None = None,
collection_id: str | AND[str] | OR[str] | None = None,
country: str | AND[str] | OR[str] | None = None,
partner_id: str | AND[str] | OR[str] | None = None,
text_reuse_cluster_id: str | AND[str] | OR[str] | None = None,
) -> SearchDataContainer:
"""
Search for content items in Impresso.
@@ -179,15 +179,15 @@ def facet(
title: str | AND[str] | OR[str] | None = None,
front_page: bool | None = None,
entity_id: str | AND[str] | OR[str] | None = None,
newspaper_id: str | OR[str] | None = None,
newspaper_id: str | AND[str] | OR[str] | None = None,
date_range: DateRange | None = None,
language: str | OR[str] | None = None,
language: str | AND[str] | OR[str] | None = None,
mention: str | AND[str] | OR[str] | None = None,
topic_id: str | AND[str] | OR[str] | None = None,
collection_id: str | OR[str] | None = None,
country: str | OR[str] | None = None,
partner_id: str | OR[str] | None = None,
text_reuse_cluster_id: str | OR[str] | None = None,
collection_id: str | AND[str] | OR[str] | None = None,
country: str | AND[str] | OR[str] | None = None,
partner_id: str | AND[str] | OR[str] | None = None,
text_reuse_cluster_id: str | AND[str] | OR[str] | None = None,
) -> FacetDataContainer:

facet_id = get_enum_from_literal(facet, GetSearchFacetId)
@@ -258,15 +258,15 @@ def _build_filters(
title: str | AND[str] | OR[str] | None = None,
front_page: bool | None = None,
entity_id: str | AND[str] | OR[str] | None = None,
newspaper_id: str | OR[str] | None = None,
newspaper_id: str | AND[str] | OR[str] | None = None,
date_range: DateRange | None = None,
language: str | OR[str] | None = None,
language: str | AND[str] | OR[str] | None = None,
mention: str | AND[str] | OR[str] | None = None,
topic_id: str | AND[str] | OR[str] | None = None,
collection_id: str | OR[str] | None = None,
country: str | OR[str] | None = None,
partner_id: str | OR[str] | None = None,
text_reuse_cluster_id: str | OR[str] | None = None,
collection_id: str | AND[str] | OR[str] | None = None,
country: str | AND[str] | OR[str] | None = None,
partner_id: str | AND[str] | OR[str] | None = None,
text_reuse_cluster_id: str | AND[str] | OR[str] | None = None,
) -> list[Filter]:
filters: list[Filter] = []
if string:
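
The widened signatures above let these filter parameters take `AND` and `OR` combinations in addition to a single string. A minimal sketch of passing combined filters, assuming `search` is an initialized search resource and that `AND`/`OR` are importable from the `impresso` package (the import path and the example identifiers are assumptions, not confirmed by this diff):

```python
from impresso import AND, OR  # assumed export location

# Hypothetical newspaper and collection identifiers, for illustration only.
results = search.find(
    newspaper_id=OR("GDL", "JDG"),          # items from either newspaper
    collection_id=AND("coll-a", "coll-b"),  # items present in both collections
    language=OR("fr", "de"),                # items in French or German
)
print(results.df)
```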
81 changes: 80 additions & 1 deletion impresso/resources/text_reuse/clusters.py
@@ -51,7 +51,21 @@ def df(self) -> DataFrame:


class TextReuseClustersResource(Resource):
"""Text reuse clusters resource."""
"""
Interact with the text reuse clusters endpoint of the Impresso API.

This resource allows searching for text reuse clusters based on various criteria
and retrieving facet information about these clusters.

Examples:
Find clusters with size between 10 and 20:
>>> results = textReuseClusters.find(cluster_size=(10, 20)) # doctest: +SKIP
>>> print(results.df) # doctest: +SKIP

Get the distribution of newspapers involved in clusters:
>>> facet_results = textReuseClusters.facet(facet='newspaper', order_by='count') # doctest: +SKIP
>>> print(facet_results.df) # doctest: +SKIP
"""

name = "textReuseClusters"

@@ -75,6 +89,40 @@ def find(
mention: str | AND[str] | OR[str] | None = None,
entity_id: str | AND[str] | OR[str] | None = None,
) -> FindTextReuseClustersContainer:
"""
Find text reuse clusters based on various criteria.

Args:
term: Search for clusters containing specific text.
title: Filter clusters by the title of the articles within them.
order_by: Specify the sorting order for the results.
cluster_size: Filter clusters by the number of items they contain.
lexical_overlap: Filter clusters by the lexical overlap score.
day_delta: Filter clusters by the time span (in days) between the first and last item.
date_range: Filter clusters based on the date range of their items.
newspaper_id: Filter clusters containing items from specific newspapers.
collection_id: Filter clusters containing items from specific collections.
limit: Maximum number of clusters to return.
offset: Number of clusters to skip from the beginning.
front_page: Filter clusters containing items published on the front page.
topic_id: Filter clusters associated with specific topics.
language: Filter clusters by the language of their items.
country: Filter clusters by the country of publication of their items.
mention: Filter clusters containing specific mentions (named entities).
entity_id: Filter clusters associated with specific entity IDs.

Returns:
FindTextReuseClustersContainer: A container holding the search results.

Examples:
Find clusters with size between 10 and 20:
>>> results = textReuseClusters.find(cluster_size=(10, 20)) # doctest: +SKIP
>>> print(results.df) # doctest: +SKIP

Find clusters related to 'politics' in Swiss newspapers:
>>> results = textReuseClusters.find(term='politics', country='CH') # doctest: +SKIP
>>> print(results.df) # doctest: +SKIP
"""

filters = _build_filters(
text=term,
@@ -130,6 +178,37 @@ def facet(
lexical_overlap: Range | AND[Range] | OR[Range] | None = None,
day_delta: Range | AND[Range] | OR[Range] | None = None,
) -> FacetDataContainer:
"""
Get facet information for text reuse clusters based on specified filters.

Facets provide aggregated counts for different properties of the clusters,
such as the distribution of cluster sizes or newspapers involved.

Args:
facet: The specific facet to retrieve (e.g., 'newspaper', 'cluster_size').
order_by: How to order the facet values (e.g., 'value', 'count').
limit: Maximum number of facet values to return.
offset: Number of facet values to skip.
cluster_size: Filter clusters by size before calculating facets.
date_range: Filter clusters by date range before calculating facets.
newspaper_id: Filter clusters by newspaper before calculating facets.
lexical_overlap: Filter clusters by lexical overlap before calculating facets.
day_delta: Filter clusters by day delta before calculating facets.

Returns:
FacetDataContainer: A container holding the facet results.

Examples:
Get the top 10 newspapers involved in clusters:
>>> facet_results = textReuseClusters.facet(facet='newspaper', limit=10, order_by='count') # doctest: +SKIP
>>> print(facet_results.df) # doctest: +SKIP

Get the distribution of cluster sizes for clusters within a specific date range:
>>> from impresso.structures import DateRange
>>> date_filter = DateRange(start="1900-01-01", end="1910-12-31")
>>> facet_results = textReuseClusters.facet(facet='cluster_size', date_range=date_filter) # doctest: +SKIP
>>> print(facet_results.df) # doctest: +SKIP
"""
facet_id = get_enum_from_literal(facet, GetTrClustersFacetId)
if isinstance(facet_id, Unset):
raise ValueError(f"{facet} is not a valid value")