Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion preprocessing/sports/event_data/event_class.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
class Event_data:
soccer_data_provider = ['datafactory', 'metrica', 'opta', 'robocup_2d', 'sportec', 'statsbomb', 'statsbomb_skillcorner',
'wyscout', 'datastadium','bepro']
'wyscout', 'datastadium', 'bepro', 'pff_fc']
handball_data_provider = []
rocket_league_data_provider = ['carball']

Expand Down
9 changes: 8 additions & 1 deletion preprocessing/sports/event_data/soccer/soccer_event_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ def load_data_single_file(self):
#based on the data provider, load the dataloading function from load_data.py (single file)
if self.data_provider == 'datafactory':
df=soccer_load_data.load_datafactory(self.event_path)
elif self.data_provider == 'pff_fc':
df=soccer_load_data.load_pff2metrica(self.event_path, match_id=self.match_id)
elif self.data_provider == 'metrica':
df=soccer_load_data.load_metrica(self.event_path,match_id=self.match_id,tracking_home_path=self.tracking_home_path,tracking_away_path=self.tracking_away_path)
elif self.data_provider == 'opta':
Expand Down Expand Up @@ -115,10 +117,13 @@ def load_data(self):
out_df_list.append(future.result())
df = pd.concat(out_df_list)
#other data providers
elif self.data_provider in ['datafactory','opta','wyscout']:
elif self.data_provider in ['datafactory','opta','wyscout','pff_fc']:
event_path = self.event_path
files = sorted(os.listdir(self.event_path))
files = [f for f in files if not f.startswith('.')]
if self.data_provider == "pff_fc":
#only json files
files = [f for f in files if f.endswith('.json')]
out_df_list = []
if self.data_provider == "opta":
if self.match_id is None:
Expand All @@ -135,6 +140,8 @@ def load_data(self):
count+=1
elif self.data_provider == "wyscout":
self.wyscout_matches_path=os.path.join(matches_path, f.replace("events_","matches_"))
elif self.data_provider == "pff_fc":
self.match_id = f.split(".")[0]
self.event_path = os.path.join(event_path, f)
df = self.load_data_single_file()
out_df_list.append(df)
Expand Down
216 changes: 216 additions & 0 deletions preprocessing/sports/event_data/soccer/soccer_load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1537,6 +1537,222 @@ def get_tracking_features(event_df, tracking_data, meta_data, verbose=True):

return event_df

# PFF possession-event type codes -> event names used in the output 'Type' column.
_PFF_EVENT_TYPE_NAMES = {
    'PA': "pass",
    'CR': "cross",
    'FO': "foul",
    'CH': "tackle",
    'SH': "shot",
    'CL': "clearance",
    'BC': "dribble",
    'IT': "other",
    'RE': "other",
    'TC': "other",
}

# Output column order of the Metrica-style frame.
_PFF_METRICA_COLUMNS = [
    'Team',
    'Type',
    'Subtype',
    'Period',
    'Start Frame',
    'Start Time [s]',
    'End Frame',
    'End Time [s]',
    'From',
    'To',
    'Start X',
    'Start Y',
    'End X',
    'End Y',
]

# (outcome column, outcome codes, True if the codes mean success / False if
# they mean failure). Rules are applied in this order, so when several outcome
# columns are filled on the same row the last one listed decides the Subtype.
_PFF_OUTCOME_RULES = (
    ('possessionEvents_passOutcomeType', ('C',), True),
    ('possessionEvents_crossOutcomeType', ('C',), True),
    ('possessionEvents_shotOutcomeType', ('G',), True),
    ('possessionEvents_clearanceOutcomeType', ('B', 'D'), False),
    ('possessionEvents_challengeOutcomeType', ('B', 'C', 'M'), True),
    ('possessionEvents_ballCarryOutcome', ('R',), True),
    ('possessionEvents_touchOutcomeType', ('R',), True),
)


def _pff_event_type_name(code):
    """Map a PFF event type code (e.g. 'PA') to its event name.

    Missing values (None / NaN) map to None. Unknown codes are reported on
    stdout and returned unchanged.
    """
    # NaN is the only float that is not equal to itself.
    if code is None or (isinstance(code, float) and code != code):
        return None
    name = _PFF_EVENT_TYPE_NAMES.get(code)
    if name is None:
        print(f"Unmapped event type: {code}")
        return code
    return name


def _pff_player_xy(is_home, home_players, away_players, player_id):
    """Return the (x, y) of the acting player, or (None, None) if unavailable.

    ``is_home`` selects which player list is searched; anything other than a
    boolean (e.g. NaN) yields no coordinates. Player lists may be real lists
    of dicts or their string representation.
    """
    if not isinstance(is_home, (bool, np.bool_)) or player_id is None:
        return None, None
    players = home_players if is_home else away_players
    if isinstance(players, str):
        # literal_eval only parses Python literals; it does not execute code.
        players = ast.literal_eval(players)
    if not isinstance(players, (list, tuple)):
        # NaN / None / missing column: no tracking snapshot for this event.
        return None, None
    for player in players:
        if isinstance(player, dict) and player.get("playerId") == player_id:
            return player.get("x"), player.get("y")
    return None, None


def load_pff2metrica(event_path: str, match_id: str = None, fps: float = 29.97) -> pd.DataFrame:
    """
    Load a PFF-style event JSON file and convert it to Metrica format.

    Parameters
    ----------
    event_path : str
        Path to a JSON file of PFF events. After flattening with
        ``pd.json_normalize(..., sep='_')`` the following columns are read
        (any column other than the game clock may be absent and is then
        treated as all-missing):
            - gameEvents_startGameClock, gameEvents_period
            - gameEvents_playerName, gameEvents_playerId, gameEvents_homeTeam
            - possessionEvents_receiverPlayerName
            - possessionEvents_possessionEventType
            - possessionEvents_*Outcome* columns (success / failure)
            - startTime, endTime, duration
            - homePlayers, awayPlayers (lists of dicts with playerId, x, y)
    match_id : str, optional
        Match identifier; when given it is added as the first column
        ``match_id``. By default None.
    fps : float, optional
        Frame rate used to turn ``startTime`` / ``endTime`` offsets into
        frame numbers, by default 29.97.

    Returns
    -------
    pd.DataFrame
        DataFrame in Metrica format with columns:
        ['Team', 'Type', 'Subtype', 'Period', 'Start Frame', 'Start Time [s]',
        'End Frame', 'End Time [s]', 'From', 'To', 'Start X', 'Start Y', 'End X', 'End Y']
        (preceded by 'match_id' when ``match_id`` is given). Events without a
        game clock or without a locatable acting player are dropped. An input
        with no usable events yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If ``startTime`` is missing for the first kept event or for any kept
        event, since frame numbers cannot be computed.
    """
    # Explicit encoding: JSON is UTF-8 and the platform default may differ.
    with open(event_path, 'r', encoding='utf-8') as f:
        event_data = json.load(f)
    event_df = pd.json_normalize(event_data, sep='_')

    out_columns = list(_PFF_METRICA_COLUMNS)
    if match_id is not None:
        out_columns = ['match_id'] + out_columns

    # Keep only events that carry a game clock.
    clock_col = 'gameEvents_startGameClock'
    if clock_col in event_df.columns:
        event_df = event_df.dropna(subset=[clock_col]).reset_index(drop=True)
    if event_df.empty or clock_col not in event_df.columns:
        return pd.DataFrame(columns=out_columns)

    idx = event_df.index

    def col(name):
        """Safe getter: the column, or an all-NaN Series aligned to event_df."""
        if name in event_df.columns:
            return event_df[name]
        return pd.Series(np.nan, index=idx, dtype=object)

    def num(name):
        """Column as floats; missing / non-numeric entries become NaN."""
        return pd.to_numeric(col(name), errors='coerce')

    # Position of the acting player, looked up in the per-event player lists.
    xy = [
        _pff_player_xy(is_home, home_players, away_players, player_id)
        for is_home, home_players, away_players, player_id in zip(
            col('gameEvents_homeTeam').tolist(),
            col('homePlayers').tolist(),
            col('awayPlayers').tolist(),
            col('gameEvents_playerId').tolist(),
        )
    ]
    pos_x = pd.Series([p[0] for p in xy], index=idx, dtype=float)
    pos_y = pd.Series([p[1] for p in xy], index=idx, dtype=float)

    # Subtype: 'success' / 'fail' wherever an outcome column is filled.
    # Object dtype up front so string assignment never has to upcast a float column.
    subtype = pd.Series(np.nan, index=idx, dtype=object)
    for outcome_col, codes, codes_mean_success in _PFF_OUTCOME_RULES:
        outcome = col(outcome_col)
        present = outcome.notna()
        in_codes = outcome.isin(list(codes))
        succeeded = in_codes if codes_mean_success else ~in_codes
        subtype.loc[present & succeeded] = 'success'
        subtype.loc[present & ~succeeded] = 'fail'

    # Times in whole seconds; an event without duration ends when it starts.
    clock = num(clock_col)
    start_time_s = clock.round().astype(int)
    end_time_s = (num('duration') + clock).round().fillna(start_time_s).astype(int)

    # Frames are counted from the startTime of the first kept event.
    start_time = num('startTime')
    origin = start_time.iloc[0]
    start_frame = ((start_time - origin) * fps).round().astype(int)
    end_frame = ((num('endTime') - origin) * fps).round().fillna(start_frame).astype(int)

    # Element-wise comparison on purpose: the column may hold True/False/NaN.
    is_home = col('gameEvents_homeTeam')
    team = np.where(is_home == True, 'Home',  # noqa: E712
                    np.where(is_home == False, 'Away', None))  # noqa: E712

    metrica_df = pd.DataFrame(
        {
            'Team': team,
            'Type': col('possessionEvents_possessionEventType').apply(_pff_event_type_name),
            'Subtype': subtype,
            'Period': col('gameEvents_period'),
            'Start Frame': start_frame,
            'Start Time [s]': start_time_s,
            'End Frame': end_frame,
            'End Time [s]': end_time_s,
            'From': col('gameEvents_playerName'),
            'To': col('possessionEvents_receiverPlayerName'),
            # Only the acting player's position is known, so start == end.
            'Start X': pos_x,
            'Start Y': pos_y,
            'End X': pos_x,
            'End Y': pos_y,
        },
        index=idx,
    )[_PFF_METRICA_COLUMNS]

    # Drop events whose acting player could not be located.
    metrica_df = metrica_df.dropna(subset=['Start X', 'Start Y']).reset_index(drop=True)

    if match_id is not None:
        metrica_df = metrica_df.assign(match_id=match_id)[out_columns]

    return metrica_df

if __name__ == "__main__":
import pdb
import os
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "openstarlab_preprocessing"
version = "0.1.39"
version = "0.1.40"
description = "openstarlab preprocessing package"
readme = "README.md"
requires-python = ">=3.8"
Expand Down