Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 50 additions & 11 deletions docs/user-guide/exporting-data/dataframes.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,24 +86,63 @@ print(f"""

### Tracking data

For a [`TrackingDataset`][kloppy.domain.TrackingDataset], the output columns include:
The [`TrackingDataset`][kloppy.domain.TrackingDataset] supports two different layouts: **Wide** (default) and **Long**.

| Column | Description |
|-------------------------------------------------|---------------------------------------|
| frame_id | Frame number |
| period_id | Match period |
| timestamp | Frame timestamp |
| ball_x, ball_y, ball_z, ball_speed | Ball position and speed |
| <player_id\>_x, <player_id\>_y, <player_id\>_d, <player_id\>_s | Player coordinates, distance (since previous frame), and speed |
| ball_state | Current state of the ball |
| ball_owning_team | Which team owns the ball |
#### Wide layout

In the wide layout, each row represents a single frame. Player data is flattened into columns: every player gets dedicated columns for their x/y coordinates, distance covered, and speed.

**Common Columns:**

| Column | Description |
| :--- | :--- |
| `frame_id` | Frame number |
| `period_id` | Match period |
| `timestamp` | Frame timestamp |
| `ball_state` | Current state of the ball |
| `ball_owning_team` | Which team owns the ball |
| `ball_x`, `ball_y`, `ball_z` | Ball coordinates |
| `ball_speed` | Ball speed |
| `<player_id>_x`, `<player_id>_y` | Player coordinates |
| `<player_id>_d` | Player distance covered (since previous frame) |
| `<player_id>_s` | Player speed |

**Example:**

```python exec="true" html="true" session="export-df"
# Default is layout="wide"
print(f"""
<div class="md-typeset__scrollwrap"><div class="md-typeset__table">
{tracking_dataset.to_df(layout="wide").head(n=3).to_html(index=False, border="0")}
</div></div>
""")
```

#### Long layout

In the long layout, each frame is "melted" into multiple rows: one for the ball and one for each player present in that frame.

**Common Columns:**

| Column | Description |
| :--- | :--- |
| `frame_id` | Frame number |
| `period_id` | Match period |
| `timestamp` | Frame timestamp |
| `team_id` | Team identifier (or "ball") |
| `player_id` | Player identifier (or "ball") |
| `x`, `y`, `z` | Entity coordinates |
| `d` | Entity distance covered (since previous frame) |
| `s` | Entity speed |
| `ball_state` | Current state of the ball (repeated for all rows in frame) |
| `ball_owning_team` | Which team owns the ball (repeated for all rows in frame) |

**Example:**

```python exec="true" html="true" session="export-df"
print(f"""
<div class="md-typeset__scrollwrap"><div class="md-typeset__table">
{tracking_dataset.to_df().head(n=3).to_html(index=False, border="0")}
{tracking_dataset.to_df(layout="long").head(n=6).to_html(index=False, border="0")}
</div></div>
""")
```
Expand Down
115 changes: 89 additions & 26 deletions kloppy/domain/models/common.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from abc import ABC, abstractmethod
from collections import defaultdict
from collections.abc import Iterable
from dataclasses import dataclass, field, replace
from datetime import datetime, timedelta
from enum import Enum, Flag
from itertools import chain
import sys
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -679,12 +679,16 @@ def to_mplsoccer(self):
dim = BaseDims(
left=self.pitch_dimensions.x_dim.min,
right=self.pitch_dimensions.x_dim.max,
bottom=self.pitch_dimensions.y_dim.min
if not invert_y
else self.pitch_dimensions.y_dim.max,
top=self.pitch_dimensions.y_dim.max
if not invert_y
else self.pitch_dimensions.y_dim.min,
bottom=(
self.pitch_dimensions.y_dim.min
if not invert_y
else self.pitch_dimensions.y_dim.max
),
top=(
self.pitch_dimensions.y_dim.max
if not invert_y
else self.pitch_dimensions.y_dim.min
),
width=self.pitch_dimensions.x_dim.max
- self.pitch_dimensions.x_dim.min,
length=self.pitch_dimensions.y_dim.max
Expand Down Expand Up @@ -733,14 +737,16 @@ def to_mplsoccer(self):
- self.pitch_dimensions.x_dim.min
),
pad_multiplier=1,
aspect_equal=False
if self.pitch_dimensions.unit == Unit.NORMED
else True,
aspect_equal=(
False if self.pitch_dimensions.unit == Unit.NORMED else True
),
pitch_width=pitch_width,
pitch_length=pitch_length,
aspect=pitch_width / pitch_length
if self.pitch_dimensions.unit == Unit.NORMED
else 1.0,
aspect=(
pitch_width / pitch_length
if self.pitch_dimensions.unit == Unit.NORMED
else 1.0
),
)
return dim

Expand Down Expand Up @@ -1823,29 +1829,32 @@ def to_records(
self,
*columns: Unpack[tuple["Column"]],
as_list: Literal[True] = True,
layout: Optional[str] = None,
**named_columns: "NamedColumns",
) -> list[dict[str, Any]]: ...

@overload
def to_records(
self,
*columns: Unpack[tuple["Column"]],
layout: Optional[str] = None,
as_list: Literal[False] = False,
**named_columns: "NamedColumns",
) -> Iterable[dict[str, Any]]: ...

def to_records(
self,
*columns: Unpack[tuple["Column"]],
layout: Optional[str] = None,
as_list: bool = True,
**named_columns: "NamedColumns",
) -> Union[list[dict[str, Any]], Iterable[dict[str, Any]]]:
from ..services.transformers.data_record import get_transformer_cls

transformer = get_transformer_cls(self.dataset_type)(
transformer = get_transformer_cls(self.dataset_type, layout=layout)(
*columns, **named_columns
)
iterator = map(transformer, self.records)
iterator = chain.from_iterable(map(transformer, self.records))
if as_list:
return list(iterator)
else:
Expand All @@ -1855,23 +1864,36 @@ def to_dict(
self,
*columns: Unpack[tuple["Column"]],
orient: Literal["list"] = "list",
layout: Optional[str] = None,
**named_columns: "NamedColumns",
) -> dict[str, list[Any]]:
if orient == "list":
from ..services.transformers.data_record import get_transformer_cls

transformer = get_transformer_cls(self.dataset_type)(
transformer = get_transformer_cls(self.dataset_type, layout=layout)(
*columns, **named_columns
)

c = len(self.records)
items = defaultdict(lambda: [None] * c)
for i, record in enumerate(self.records):
item = transformer(record)
for k, v in item.items():
items[k][i] = v
result = {}
for record_idx, record in enumerate(self.records):
items = transformer(record)

for item_idx, item in enumerate(items):
seen_keys = set()
for k, v in item.items():
# If this is a new key, backfill previous records
if k not in result:
result[k] = [None] * (record_idx + item_idx)

return items
result[k].append(v)
seen_keys.add(k)

# Pad keys that were not seen in this record
for k in result:
if k not in seen_keys:
result[k].append(None)

return dict(result)
else:
raise KloppyParameterError(
f"Orient {orient} is not supported. Only orient='list' is supported"
Expand All @@ -1881,8 +1903,43 @@ def to_df(
self,
*columns: Unpack[tuple["Column"]],
engine: Optional[Literal["polars", "pandas", "pandas[pyarrow]"]] = None,
layout: Optional[str] = None,
**named_columns: "NamedColumns",
):
"""Converts the dataset's records into a DataFrame.

This method extracts data from the internal records and formats them into
a tabular structure using the specified dataframe engine (Pandas or Polars).

Args:
*columns: Column names to include in the output.
- If not provided, a default set of columns is returned.
- Supports wildcards (e.g., "*coordinates*").
- Supports callables for custom extraction logic.
engine: The dataframe engine to use.
- 'pandas': Returns a standard pandas DataFrame.
- 'pandas[pyarrow]': Returns a pandas DataFrame backed by PyArrow.
- 'polars': Returns a Polars DataFrame.
- None: Defaults to the `dataframe.engine` configuration value.
layout: The layout structure of the output.
- For Event data: Default is a flat list of events.
- For Tracking data:
- 'wide' (default): One row per frame, with players as columns.
- 'long': One row per entity (player/ball) per frame ("tidy" data).
**named_columns: Additional columns to create, where the key is the
column name and the value is a literal or a callable applied to
each record.

Examples:
Basic conversion to Pandas:
>>> df = dataset.to_df()

Using Polars and selecting specific columns:
>>> df = dataset.to_df("period_id", "timestamp", "player_id", "coordinates_*", engine="polars")

Tracking data in long format:
>>> df = tracking_dataset.to_df(layout="long")
"""
from kloppy.config import get_config

if not engine:
Expand Down Expand Up @@ -1913,7 +1970,9 @@ def to_df(
)

table = pa.Table.from_pydict(
self.to_dict(*columns, orient="list", **named_columns)
self.to_dict(
*columns, orient="list", layout=layout, **named_columns
)
)
return table.to_pandas(types_mapper=types_mapper)

Expand All @@ -1927,7 +1986,9 @@ def to_df(
)

return DataFrame.from_dict(
self.to_dict(*columns, orient="list", **named_columns)
self.to_dict(
*columns, orient="list", layout=layout, **named_columns
)
)
elif engine == "polars":
try:
Expand All @@ -1939,7 +2000,9 @@ def to_df(
)

return from_dict(
self.to_dict(*columns, orient="list", **named_columns)
self.to_dict(
*columns, orient="list", layout=layout, **named_columns
)
)
else:
raise KloppyParameterError(f"Engine {engine} is not valid")
Expand Down
1 change: 1 addition & 0 deletions kloppy/domain/models/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from ..services.transformers.data_record import NamedColumns
from .tracking import Frame


QualifierValueType = TypeVar("QualifierValueType")
EnumQualifierType = TypeVar("EnumQualifierType", bound=Enum)

Expand Down
2 changes: 1 addition & 1 deletion kloppy/domain/services/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from kloppy.domain import AttackingDirection, Frame, Ground, Period

from .event_factory import EventFactory, create_event
from .transformers import DatasetTransformer, DatasetTransformerBuilder
from .transformers.dataset import DatasetTransformer, DatasetTransformerBuilder

# NOT YET: from .enrichers import TrackingPossessionEnricher

Expand Down
4 changes: 2 additions & 2 deletions kloppy/domain/services/transformers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .dataset import DatasetTransformer, DatasetTransformerBuilder
from . import attribute, data_record, dataset

__all__ = ["DatasetTransformer", "DatasetTransformerBuilder"]
__all__ = ["dataset", "data_record", "attribute"]
Loading
Loading