diff --git a/README.md b/README.md index 3ffa3ae..e6e128b 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,9 @@ pip install -e . - [Space data in Football/Soccer ⚽](https://github.com/open-starlab/PreProcessing/blob/master/preprocessing/sports/space_data/soccer/README.md) - [Space data in Ultimate 🥏](https://github.com/open-starlab/PreProcessing/blob/master/preprocessing/sports/space_data/ultimate/README.md) +#### Phase Data +- [Phase data in Football/Soccer ⚽](https://github.com/open-starlab/PreProcessing/blob/master/preprocessing/sports/phase_data/soccer/README.md) + ## RoadMap - [x] Release the package - [ ] Incorporate more functions @@ -55,6 +58,7 @@ pip install -e . Calvin Yeung
Calvin Yeung

💻 Kenjiro Ide
Kenjiro Ide

💻 Zheng Chen
Zheng Chen

💻 + Kento Kuroda
Kento Kuroda

💻 Shunsuke Iwashita
Shunsuke Iwashita

💻 Keisuke Fujii
Keisuke Fujii

🧑‍💻 diff --git a/preprocessing/sports/phase_data/__init__.py b/preprocessing/sports/phase_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/preprocessing/sports/phase_data/phase_class.py b/preprocessing/sports/phase_data/phase_class.py new file mode 100644 index 0000000..c41d4f6 --- /dev/null +++ b/preprocessing/sports/phase_data/phase_class.py @@ -0,0 +1,69 @@ +class Phase_data: + soccer_data_provider = ['bepro', 'statsbomb_skillcorner'] + other_soccer_data_provider = ['pff_fc', 'robocup_2d', 'datastadium'] + handball_data_provider = [] + rocket_league_data_provider = ['carball'] + + def __new__(cls, data_provider, *args, **kwargs): + if data_provider in cls.soccer_data_provider: + from preprocessing.sports.phase_data.soccer.soccer_phase_class import Soccer_phase_data + return Soccer_phase_data(data_provider, *args, **kwargs) + elif data_provider in cls.other_soccer_data_provider: + raise NotImplementedError('other soccer data provider not implemented yet') + elif data_provider in cls.handball_data_provider: + raise NotImplementedError('Handball phase data not implemented yet') + elif data_provider in cls.rocket_league_data_provider: + raise NotImplementedError('rocket_league phase data not implemented yet') + else: + raise ValueError(f'Unknown data provider: {data_provider}') + + +# def main(): +# import os +# import argparse +# import glob +# args = argparse.ArgumentParser() +# args.add_argument('--data_provider', required=True, choices=['bepro', 'statsbomb_skillcorner', 'pff_fc'], help='kind of data provider') +# args.add_argument('--match_id', required=False, help='ID of match data') +# args = args.parse_args() +# data_provider = args.data_provider +# base_dir = os.getcwd() + f"path/to" +# if data_provider == 'bepro': +# match_ids = [str(match_id) for match_id in args.match_id.split(",")] +# for match_id in match_ids: +# # The format for bepro has changed from Match ID: 130000(?). +# if int(match_id) >= 130000: +# file_pattern = os.path.join(base_dir, 'tracking_data', data_provider, match_id, f"{match_id}_*_frame_data.json") +# tracking_json_paths = sorted(glob.glob(file_pattern)) +# meta_data = os.path.join(base_dir, 'tracking_data', data_provider, match_id, f"{match_id}_metadata.json") +# event_csv_path = glob.glob(os.path.join(os.path.join(base_dir, 'event_data', data_provider, match_id), '*.csv')) +# preprocessing_df=Phase_data(data_provider=data_provider, bp_tracking_json_paths=tracking_json_paths, event_path=event_csv_path[0], meta_data=meta_data).load_data() +# else: +# tracking_path=os.getcwd()+f"path/to/tracking_data/{data_provider}/{match_id}/{match_id}_tracker_box_data.xml" +# meta_data = os.path.join(base_dir, 'tracking_data', data_provider, match_id, f"{match_id}_tracker_box_metadata.xml") +# event_csv_path = glob.glob(os.path.join(os.path.join(base_dir, 'event_data', data_provider, match_id), '*.csv')) +# preprocessing_df=Phase_data(data_provider=data_provider, bp_tracking_xml_path=tracking_path, event_path=event_csv_path[0], meta_data=meta_data).load_data() +# output_file_path = os.path.join(base_dir, 'phase_data', data_provider, match_id, f"{match_id}_main_data.csv") +# preprocessing_df.to_csv(output_file_path,index=False) +# print(f"✅ All period tracking data saved successfully at {output_file_path}.") +# elif data_provider == 'statsbomb_skillcorner': +# sb_match_id = 3894537 # 843, 537 +# sc_match_id = 1018887 # 1498966, 1018887 +# sb_event_path=f'path/to/event_data/statsbomb/{sb_match_id}_events.pkl' +# sc_tracking_path=f'path/to/tracking_data/skillcorner/LaLiga-2023-2024/tracking/{sc_match_id}.json' +# sc_match_path=f'path/to/tracking_data/skillcorner/LaLiga-2023-2024/match/{sc_match_id}.json' +# sc_players_path='path/to/tracking_data/skillcorner/LaLiga-2023-2024/players/players.json' +# preprocessing_df=Phase_data(data_provider=data_provider, sb_event_path=sb_event_path, sc_tracking_path=sc_tracking_path, sc_match_path=sc_match_path, sc_players_path=sc_players_path).load_data() +# output_file_dir = os.path.join(base_dir, 'phase_data', data_provider, f'{sb_match_id}_{sc_match_id}') +# os.makedirs(output_file_dir, exist_ok=True) +# output_file_path = os.path.join(output_file_dir, f"{sb_match_id}_{sc_match_id}_main_data.csv") +# preprocessing_df.to_csv(output_file_path,index=False) +# elif data_provider == 'pff_fc': +# print('not yet') +# output_file_path = os.path.join(base_dir, 'phase_data', data_provider, match_id, f"{match_id}_main_data.csv") +# preprocessing_df.to_csv(output_file_path,index=False) +# print(f"✅ All period tracking data saved successfully at {output_file_path}.") + + +# if __name__ == '__main__': +# main() \ No newline at end of file diff --git a/preprocessing/sports/phase_data/soccer/README.md b/preprocessing/sports/phase_data/soccer/README.md new file mode 100644 index 0000000..6c08873 --- /dev/null +++ b/preprocessing/sports/phase_data/soccer/README.md @@ -0,0 +1,24 @@ +# Phase Data in Football/Soccer ⚽ +[![Documentation Status](https://readthedocs.org/projects/openstarlab/badge/?version=latest)](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/index.html) +## Introduction +This package offers functions to load and preprocess phase data from various sources in football/soccer. + +## Supported Data Providers +You can find detailed documentation on supported data providers [here](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Phase_data/Data_Provider/index.html). The supported providers include: + +- Bepro +- Statsbomb and Skillcorner + +For data format examples, visit [Kloppy](https://github.com/PySport/kloppy/tree/master/kloppy/tests/files) + +## Supported Preprocessing Methods +For information on supported preprocessing methods, visit [this documentation](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Phase_data/Data_Format/index.html). + +## Examples +Here are some examples of how to download and preprocess data: + +- **Bepro Data:** + - [Read the Docs Example](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Phase_data/Data_Provider/Football/Bepro/contents.html) + +- **StatsBomb and SkillCorner Data:** + - [Read the Docs Example](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Phase_data/Data_Provider/Football/Statsbomb_Skillcorner/contents.html) \ No newline at end of file diff --git a/preprocessing/sports/phase_data/soccer/__init__.py b/preprocessing/sports/phase_data/soccer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/preprocessing/sports/phase_data/soccer/soccer_load_data.py b/preprocessing/sports/phase_data/soccer/soccer_load_data.py new file mode 100644 index 0000000..21811c2 --- /dev/null +++ b/preprocessing/sports/phase_data/soccer/soccer_load_data.py @@ -0,0 +1,763 @@ +import json +import pandas as pd +import numpy as np +import xml.etree.ElementTree as ET +import os +import pickle +from typing import List, Dict, Any + +def load_bepro(tracking_xml_path: str, tracking_json_paths: list, event_path: str, meta_data_path: str) -> pd.DataFrame: + """ + Loads and processes event and tracking data from soccer match recordings. + + This function combines event data with tracking data by merging based on event time. It also adds + additional features extracted from metadata, such as player information, and converts position + coordinates to the correct scale for analysis. + + Args: + tracking_path (str): Path to the XML or JSON file containing tracking data. + event_path (str): Path to the CSV file containing event data. + meta_path (str): Path to the XML file containing match metadata (pitch, teams, players, etc.). + + Returns: + pd.DataFrame: A DataFrame containing the merged and processed event and tracking data, + with additional features including player positions, speeds, ball position, + and metadata (e.g., player names, shirt numbers, positions). + """ + + def extract_tracking_data_from_xml(xml_path: str) -> List[Dict[str, Any]]: + """ + Parse the XML file and extract tracking data for players and the ball. + + Args: + xml_path (str): Path to the XML file. + Returns: + list of dict: A list containing tracking information for each player and the ball in each frame. + """ + tree = ET.parse(xml_path) + root = tree.getroot() + tracking_data = [] + + for frame in root.findall("frame"): + frame_number = int(frame.get("frameNumber")) + match_time = int(frame.get("matchTime")) + + for element in frame.findall("*"): + if element.tag == "player": + player_id = element.get("playerId") + loc = element.get("loc") + elif element.tag == "ball": + player_id = "ball" + loc = element.get("loc") + else: + continue + if loc is None: + continue + + try: + x, y = map(float, loc.strip("[]").split(",")) + + tracking_data.append({ + "frame": frame_number, + "match_time": match_time, + "player_id": player_id, + "x": "{:.2f}".format(x * 105 - 52.5), + "y": "{:.2f}".format(y * 68 - 34.0) + }) + except ValueError: + raise ValueError(f"Invalid location format for player {player_id} in frame {frame_number}") + + return tracking_data + + def extract_tracking_data_from_json(json_path: str, period: str) -> List[Dict[str, Any]]: + """ + Parse the JSON file and extract tracking data. + + Args: + json_path (str): Path to the JSON file. + Returns: + list of dict: A list containing tracking information for each player in each frame. + """ + with open(json_path, "r") as f: + data = json.load(f) + + tracking_data = [] + for frame_number, players in data.items(): + for player in players: + try: + tracking_data.append({ + "period": period, + "frame": int(frame_number), + "match_time": int(player.get("match_time", 0)), + "player_id": "ball" if player.get("player_id") == None else player.get("player_id"), + "x": "{:.2f}".format(float(player.get("x", 0) - 52.5)), + "y": "{:.2f}".format(float(player.get("y", 0) - 34.0)) + }) + except ValueError: + raise ValueError(f"Invalid data format in frame {frame_number}") + + return tracking_data + + def devide_by_period(tracking_data_list: List[dict]) -> List[pd.DataFrame]: + """Splits tracking data into multiple DataFrames based on period resets detected via frame number decreases. + + Args: + tracking_data_list (List[dict]): A list of dictionaries containing raw tracking data. + + Returns: + List[pd.DataFrame]: A list of DataFrames, each representing a single period with an added 'period' column. + """ + if not tracking_data_list: + return [] + + df = pd.DataFrame(tracking_data_list) + + first_occurrence_of_frame = df.drop_duplicates(subset=['frame', 'match_time'], keep='first') + frame_diff = first_occurrence_of_frame['frame'].diff().fillna(0) + period_reset_indices = frame_diff[frame_diff < 0].index + + split_indices = [0] + + for reset_idx in period_reset_indices: + if reset_idx > 0: + split_indices.append(reset_idx) + + split_indices.append(len(df)) + split_indices = sorted(list(set(split_indices))) + period_df_list = [] + + for i in range(len(split_indices) - 1): + start_idx = split_indices[i] + end_idx = split_indices[i+1] + current_period = i + 1 + period_df = df.iloc[start_idx:end_idx].copy() + period_df.loc[:, 'period'] = current_period + period_df_list.append(period_df.reset_index(drop=True)) + + return period_df_list + + def extract_meta_info_from_xml(xml_path: str) -> dict: + """ + Extract team information (ID, name, side) from an XML metadata file. + + Args: + xml_path (str): Path to the XML metadata file. + Returns: + dict: Dictionary in the format: {player_id: {'position': str, 'team_id': str, 'side': str}}. + """ + tree = ET.parse(xml_path) + root = tree.getroot() + + team_info = {} + player_info = {} + + teams_element = root.find("teams") + if teams_element is not None: + for team in teams_element.findall("team"): + team_id = team.get("id") + team_name = team.get("name") + side = team.get("side") + + if team_id: + team_info[team_id] = { + "team_name": team_name, + "side": side + } + players_element = root.find("players") + if players_element is not None: + for player in players_element.findall("player"): + player_id = player.get("id") + player_name = player.get("name") + team_id = player.get("teamId") + position = player.get("position") + + if player_id: + side = team_info.get(team_id, {}).get("side") + team_name = team_info.get(team_id, {}).get("team_name") + + player_info[player_id] = { + "team_id": team_id, + "team_name": team_name, + "side": side, + "player_name": player_name, + "position": position, + } + return player_info + + def extract_meta_info_from_json(json_path: str) -> dict: + """ + Extract team information (ID, name, side) from an JSON metadata file. + + Args: + xml_path (str): Path to the XML metadata file. + Returns: + dict: Dictionary in the format: {player_id: {'position': str, 'team_id': str, 'side': str}}. + """ + with open(json_path, "r", encoding="utf-8") as f: + data = json.load(f) + + player_info = {} + + teams = { + "home": data.get("home_team", {}), + "away": data.get("away_team", {}) + } + + for side, team_data in teams.items(): + if team_data: + team_id = str(team_data.get("team_id")) + team_name = str(team_data.get("team_name")) + + if "players" in team_data: + for player in team_data["players"]: + player_id = str(player.get("player_id")) + player_name = str(player.get("full_name")) + position = player.get("initial_position_name") + + if player_id: + player_info[player_id] = { + "team_id": team_id, + "team_name": team_name, + "side": side, + "player_name": player_name, + "position": position, + } + + return player_info + + def get_inplay_start_time(event_df: pd.DataFrame) -> pd.DataFrame: + """ + Add 'inplay_num' column to event_df. + If the first word in filtered_event_types matches the specified event type, + it is considered the start of a new in-play event, and inplay_num is incremented. + + Args: + event_df (pd.DataFrame): A DataFrame containing soccer event data. + Returns: + pd.DataFrame: A DataFrame with the 'inplay_num' column added. + """ + + event_df = event_df.copy() + START_EVENT_STRINGS = ['goalKick', 'throwIn', 'cornerKick', 'freeKick', 'goalAgainst'] + + event_df.loc[:, 'first_event_type'] = event_df['filtered_event_types'].fillna('').str.split(' ').str[0] + + is_start_frame = pd.Series(False, index=event_df.index) + is_start_frame.iloc[0] = True + is_restart_event = event_df['first_event_type'].isin(START_EVENT_STRINGS) + + is_normal_restart = is_restart_event & (event_df['first_event_type'] != 'goalAgainst') + is_start_frame = is_start_frame | is_normal_restart + is_goal_against = event_df['first_event_type'] == 'goalAgainst' + shifted_goal_against = is_goal_against.shift(periods=-1) + filled_shifted = shifted_goal_against.fillna(False).astype(bool) + is_start_frame = is_start_frame.astype(bool) + is_start_frame = is_start_frame | filled_shifted + + event_df.loc[:, 'inplay_num'] = is_start_frame.cumsum().astype(int) + + event_df = event_df.drop(columns=['first_event_type'], errors='ignore') + + return event_df + + def get_tracking(tracking_df: pd.DataFrame, event_df: pd.DataFrame, player_info_df: Dict[str, Dict[str, str]]) -> pd.DataFrame: + """ + Aggregates tracking data per frame into a wide-format DataFrame sorted by team side and position, and assigns in-play IDs. + + Args: + tracking_df (pd.DataFrame): Processed tracking data containing frame, period, coordinates, and player IDs. + event_df (pd.DataFrame): Processed event data containing match_time, period, and inplay_num. + player_info_df (Dict[str, Dict[str, str]]): Metadata mapping player IDs to positions, team IDs, and team sides. + + Returns: + pd.DataFrame: A wide-format tracking DataFrame structured frame-by-frame with normalized team orientations. + """ + POSITION_ORDER = ['GK', 'CB', 'RWB', 'RB', 'LWB', 'LB', 'CDM', 'RM', 'CM', 'LM', 'CAM', 'RW', 'LW', 'CF'] + + # ----------------------------------------------- + # 0. Player Information Merging and Preprocessing + # ----------------------------------------------- + event_df = event_df.copy() + player_map_df = pd.DataFrame.from_dict(player_info_df, orient='index').reset_index().rename( + columns={'index': 'player_id', 'side': 'team_side', 'team_name': 'team_name'} + ) + + tracking_df['player_id'] = tracking_df['player_id'].astype(str) + + tracking_df = pd.merge(tracking_df, player_map_df, on='player_id', how='left') + + tracking_df.loc[tracking_df['player_id'] == 'ball', ['team_id', 'team_name', 'team_side', 'position', 'player_name']] = \ + ['ball', 'ball', 'ball', 'ball', 'ball'] + + # ----------------------------------------------- + # 1. Determine Team Side (left/right) based on initial frame + # ----------------------------------------------- + target_frame = tracking_df['frame'].min() + 10 + gk_data_initial = tracking_df[(tracking_df['position'] == 'GK') & (tracking_df['frame'] == target_frame)] + + left_team_id = gk_data_initial.loc[gk_data_initial['x'].idxmin(), 'team_id'] + + team_meta = {} + unique_teams = tracking_df[tracking_df['team_id'] != 'ball'][['team_id', 'team_name', 'team_side']].drop_duplicates() + + for _, row in unique_teams.iterrows(): + current_side = 'left' if row['team_id'] == left_team_id else 'right' + + team_meta[f'{current_side}_team_id'] = row['team_id'] + team_meta[f'{current_side}_team_name'] = row['team_name'] + team_meta[f'{current_side}_team_side'] = row['team_side'] + + # ----------------------------------------------- + # 2. Assign In-Play Numbers (inplay_num) + # ----------------------------------------------- + tracking_df['inplay_num'] = np.nan + + inplay_times = event_df[['inplay_num', 'event_time']].drop_duplicates().sort_values('inplay_num') + inplay_periods = inplay_times.groupby('inplay_num')['event_time'].agg(['min', 'max']).reset_index() + inplay_periods.columns = ['inplay_num', 'start_time', 'end_time'] + + for period in tracking_df['period'].unique(): + p_inplay_periods = inplay_periods.copy() + + for _, row in p_inplay_periods.iterrows(): + current_inplay_num = row['inplay_num'] + start_time = row['start_time'] + end_time = row['end_time'] + + mask_index = tracking_df[ + (tracking_df['period'] == period) & + (tracking_df['match_time'] >= start_time) & + (tracking_df['match_time'] <= end_time) + ].index + + tracking_df.loc[mask_index, 'inplay_num'] = current_inplay_num + + final_tracking_df = tracking_df.copy() + + # ----------------------------------------------- + # 3. Determine Player Ordering and Join Keys + # ----------------------------------------------- + is_player = (final_tracking_df['player_id'] != 'ball') + side_calculated = np.where( + final_tracking_df['team_id'] == left_team_id, + 'left', + 'right' + ) + side_series = pd.Series(side_calculated, index=final_tracking_df.index) + if 'side' not in final_tracking_df.columns: + final_tracking_df.loc[:, 'side'] = np.nan + final_tracking_df['side'] = final_tracking_df['side'].astype(object) + final_tracking_df.loc[is_player, 'side'] = side_series.loc[is_player] + final_tracking_df.loc[final_tracking_df['player_id'] == 'ball', 'side'] = 'ball' + + pos_map = {pos: order for order, pos in enumerate(POSITION_ORDER, 1)} + player_df = final_tracking_df[final_tracking_df['player_id'] != 'ball'].copy() + player_df.loc[:, 'pos_order'] = player_df['position'].map(pos_map) + player_df.loc[:, 'pos_rank'] = player_df.groupby(['frame', 'side'])['pos_order'].rank(method='first').astype(int) + player_df.loc[:, 'variable'] = player_df['side'] + '_' + player_df['pos_rank'].astype(str) + + # ----------------------------------------------- + # 4. Transform Player Data to Wide Format (Pivot) + # ----------------------------------------------- + value_cols = ['x', 'y', 'player_id', 'player_name', 'position'] + + wide_data_list = [] + + for col in value_cols: + pivot_df = player_df.pivot_table( + index=['frame', 'match_time', 'period', 'inplay_num'], + columns='variable', + values=col, + aggfunc='first' + ).add_suffix(f'_{col.replace("player_id", "id").replace("player_name", "name")}') + + wide_data_list.append(pivot_df) + + wide_player_df = wide_data_list[0].join(wide_data_list[1:]) + + # ----------------------------------------------- + # 5. Extract and Merge Ball Data and Team Metadata + # ----------------------------------------------- + ball_df = final_tracking_df[final_tracking_df['player_id'] == 'ball'][['frame', 'x', 'y', 'match_time', 'period', 'inplay_num']].rename( + columns={'x': 'ball_x', 'y': 'ball_y'} + ).set_index(['frame', 'match_time', 'period', 'inplay_num']) + final_tracking_df = wide_player_df.join(ball_df).reset_index() + + for col, value in team_meta.items(): + final_tracking_df[col] = value + + # ----------------------------------------------- + # 6. Final Column Formatting and Reordering + # ----------------------------------------------- + ordered_player_cols = [] + for side in ['left', 'right']: + for i in range(1, 12): + prefix = f'{side}_{i}_' + + ordered_player_cols.append(prefix + 'id') + ordered_player_cols.append(prefix + 'name') + ordered_player_cols.append(prefix + 'position') + ordered_player_cols.append(prefix + 'x') + ordered_player_cols.append(prefix + 'y') + + base_cols = ['period', 'inplay_num', 'frame', 'match_time', 'ball_x', 'ball_y'] + + team_cols = [] + for side in ['left', 'right']: + team_cols.extend([f'{side}_team_id', f'{side}_team_name', f'{side}_team_side']) + + final_cols = base_cols + team_cols + ordered_player_cols + final_tracking_df = final_tracking_df.reindex(columns=final_cols) + + return final_tracking_df + + # Load the event data + event_df = pd.read_csv(event_path) + # devide by period + grouped_events = event_df.groupby('event_period') + PERIOD_ORDER = ['FIRST_HALF', 'SECOND_HALF', 'EXTRA_FIRST_HALF', 'EXTRA_SECOND_HALF'] + # check if the format is the latest version + if tracking_xml_path is None: + list_of_tracking_data = [] + for i in range(len(tracking_json_paths)): + tracking_data = extract_tracking_data_from_json(tracking_json_paths[i], period=str(i+1)) + list_of_tracking_data.append(tracking_data) + player_info_df = extract_meta_info_from_json(meta_data_path) + else: + tracking_data = extract_tracking_data_from_xml(tracking_xml_path) + # add period + list_of_tracking_data = devide_by_period(tracking_data) + player_info_df = extract_meta_info_from_xml(meta_data_path) + + final_tracking_df_list = [] + for i in range(len(list_of_tracking_data)): + event_df = grouped_events.get_group(PERIOD_ORDER[i]) + tracking_df = pd.DataFrame(list_of_tracking_data[i]) + # Get additional features + event_df = get_inplay_start_time(event_df) + # Get tracking features + processed_tracking_df = get_tracking(tracking_df, event_df, player_info_df) + final_tracking_df_list.append(processed_tracking_df) + + final_tracking_df = pd.concat(final_tracking_df_list, ignore_index=True) + return final_tracking_df + +def load_statsbomb_skillcorner(sb_event_path: str, sc_tracking_path: str, sc_match_path: str, sc_players_path: str) -> pd.DataFrame: + """ + Load and merge StatsBomb event data with SkillCorner tracking data. + + Args: + statsbomb_event_path (str): File path for StatsBomb event data. + skillcorner_tracking_path (str): File path for SkillCorner tracking data. + skillcorner_match_path (str): File path for SkillCorner match data. + skillcorner_players_path (str): File path for SkillCorner players data. + + Returns: + pd.DataFrame: Combined DataFrame with event and tracking data. + """ + + def extract_meta_info_from_match(sc_match: dict, sc_players: list) -> dict: + """ + Extract team and player information (ID, name, side) from a json match data file. + + Args: + sc_match (dict): Dataframe of match data file. + sc_players (dict): List of players data file. + Returns: + dict: Dictionary in the format: {team_id: {'team_name': str, 'team_side': str}}, {player_id: {'position': str, 'team_id': str, 'side': str}}. + """ + team_meta_df = {} + player_meta_df = {} + + player_trackable_map = {p['id']: p.get('trackable_object') for p in sc_players} + + home_id = sc_match['home_team']['id'] + team_meta_df[home_id] = { + 'team_name': sc_match['home_team']['name'], + 'team_side': 'home' + } + + away_id = sc_match['away_team']['id'] + team_meta_df[away_id] = { + 'team_name': sc_match['away_team']['name'], + 'team_side': 'away' + } + + for p in sc_match['players']: + player_id = p['id'] + trackable_id = player_trackable_map.get(player_id) + player_meta_df[trackable_id] = { + 'team_id': p['team_id'], + 'player_name': p['short_name'], + 'position_name': p['player_role']['name'], + 'position_acronym': p['player_role']['acronym'] + } + + return team_meta_df, player_meta_df + + def get_left_team_id(sc_tracking, team_meta_df, player_meta_df): + """ + Identifies which team ID is attacking the left side of the pitch based on the goalkeeper's position. + + Args: + sc_tracking (list): Raw tracking data containing frame-by-frame object positions. + team_meta_df (dict/pd.DataFrame): Metadata for teams including names and IDs. + player_meta_df (dict/pd.DataFrame): Metadata for players including their team IDs and positions. + + Returns: + int or None: The team ID assigned to the left side (x < 0), or None if not found. + """ + all_team_ids = list(team_meta_df.keys()) + for frame_data in sc_tracking: + if frame_data['data']==None: + continue + for obj in frame_data['data']: + if 'z' in obj: + continue + p_id = obj['trackable_object'] + p_info = player_meta_df[p_id] + if p_info['position_acronym'] == 'GK': + if obj['x'] < 0.0: + left_team_id = p_info['team_id'] + else: + left_team_id = [tid for tid in all_team_ids if tid != p_info['team_id']][0] + return left_team_id + return None + + def process_all_tracking(sc_tracking, team_meta_df, player_meta_df, left_team_id): + """ + Iterates through all frames to return a flattened DataFrame sorted by tactical positions. + + Args: + sc_tracking (list): Raw tracking data list. + team_meta_df (dict): Metadata containing team details. + player_meta_df (dict): Metadata containing player details. + left_team_id (int): The ID of the team currently on the left side. + + Returns: + pd.DataFrame: Processed tracking data with fixed columns for ball and 22 players (sorted by position). + """ + POSITION_ORDER = ['GK', 'CB', 'RCB', 'LCB', 'RWB', 'RB', 'LWB', 'LB', 'CDM', 'RDM', 'LDM', 'RM', 'CM', 'LM', 'CAM', 'RW', 'LW', 'CF'] + pos_priority = {pos: i for i, pos in enumerate(POSITION_ORDER)} + + all_team_ids = list(team_meta_df.keys()) + right_team_id = [tid for tid in all_team_ids if tid != left_team_id][0] + + all_frames_processed = [] + + for frame_data in sc_tracking: + res = { + 'period': int(frame_data['period']) if pd.notna(frame_data['period']) else None, + 'inplay_num': None, + 'frame': frame_data['frame'], + 'match_time': frame_data['timestamp'], + 'ball_x': None, + 'ball_y': None, + 'left_team_id': left_team_id, + 'left_team_name': team_meta_df[left_team_id]['team_name'], + 'left_team_side': team_meta_df[left_team_id]['team_side'], + 'right_team_id': right_team_id, + 'right_team_name': team_meta_df[right_team_id]['team_name'], + 'right_team_side': team_meta_df[right_team_id]['team_side'] + } + + left_players_in_frame = [] + right_players_in_frame = [] + + for obj in frame_data['data']: + if 'z' in obj: + res['ball_x'] = obj['x'] + res['ball_y'] = obj['y'] + continue + + p_id = obj['track_id'] + if p_id in player_meta_df: + p_info = player_meta_df[p_id] + player_data = { + 'id': p_id, + 'name': p_info['player_name'], + 'pos': p_info['position_acronym'], + 'x': obj['x'], + 'y': obj['y'], + 'priority': pos_priority.get(p_info['position_acronym'], 99) + } + + if p_info['team_id'] == left_team_id: + left_players_in_frame.append(player_data) + else: + right_players_in_frame.append(player_data) + + left_players_sorted = sorted(left_players_in_frame, key=lambda x: (x['priority'], x['id'])) + right_players_sorted = sorted(right_players_in_frame, key=lambda x: (x['priority'], x['id'])) + + for i in range(11): + idx = i + 1 + if i < len(left_players_sorted): + p = left_players_sorted[i] + res[f"left_{idx}_id"] = p['id'] + res[f"left_{idx}_name"] = p['name'] + res[f"left_{idx}_position"] = p['pos'] + res[f"left_{idx}_x"] = p['x'] + res[f"left_{idx}_y"] = p['y'] + else: + res[f"left_{idx}_id"] = None + res[f"left_{idx}_name"] = None + res[f"left_{idx}_position"] = None + res[f"left_{idx}_x"] = None + res[f"left_{idx}_y"] = None + + for i in range(11): + idx = i + 1 + if i < len(right_players_sorted): + p = right_players_sorted[i] + res[f"right_{idx}_id"] = p['id'] + res[f"right_{idx}_name"] = p['name'] + res[f"right_{idx}_position"] = p['pos'] + res[f"right_{idx}_x"] = p['x'] + res[f"right_{idx}_y"] = p['y'] + else: + res[f"right_{idx}_id"] = None + res[f"right_{idx}_name"] = None + res[f"right_{idx}_position"] = None + res[f"right_{idx}_x"] = None + res[f"right_{idx}_y"] = None + + all_frames_processed.append(res) + + return pd.DataFrame(all_frames_processed) + + def get_inplay_start_time(event_df: pd.DataFrame): + """ + Assigns in-play sequence numbers and identifies start times for each sequence from event data. + + Args: + event_df (pd.DataFrame): Dataframe of match events (passes, play patterns, etc.). + + Returns: + list: A list of dictionaries, each containing 'inplay_num', 'period', and 'timestamp' for sequence starts. + """ + df = event_df.copy() + inplay_info_list = [] + current_inplay = 0 + + continuing_patterns = ['Regular Play', 'From Counter', 'From Keeper'] + restart_types = ['Throw-in', 'Corner', 'Goal Kick', 'Free Kick'] + + for i in range(len(df) - 1): + curr_ev = df.iloc[i] + next_ev = df.iloc[i + 1] + + if pd.isna(next_ev['pass_type']): + continue + + is_new_inplay = False + + next_ts = pd.Timestamp(next_ev['timestamp']).round('100ms') + curr_ts = pd.Timestamp(curr_ev['timestamp']).round('100ms') + if next_ts < curr_ts: + is_new_inplay = True + + elif curr_ev['play_pattern'] != next_ev['play_pattern']: + if next_ev['play_pattern'] not in continuing_patterns: + is_new_inplay = True + + elif next_ev['pass_type'] in restart_types: + is_new_inplay = True + + if is_new_inplay: + current_inplay += 1 + inplay_info_list.append({ + 'inplay_num': current_inplay, + 'period': int(next_ev['period']), + 'timestamp': next_ts + }) + + return inplay_info_list + + def get_inplay_tracking(tracking_df: pd.DataFrame, inplay_info_list: List) -> pd.DataFrame: + """ + Filters tracking data to include only in-play periods and assigns sequence numbers. + + Args: + tracking_df (pd.DataFrame): Processed tracking data. + inplay_info_list (list): List of dictionaries defining start times of in-play sequences. + + Returns: + pd.DataFrame: Tracking data filtered for in-play time, sampled at 5fps (200ms intervals). + """ + df = tracking_df.copy() + + df['tmp_timestamp'] = pd.to_datetime( + df['match_time'], format='%H:%M:%S.%f', errors='coerce' + ).map(lambda x: x.replace(year=1900, month=1, day=1) if pd.notna(x) else x) + + def normalize_period_time(group): + period_start = group['tmp_timestamp'].min() + base = pd.Timestamp('1900-01-01 00:00:00') + group['tmp_timestamp'] = base + (group['tmp_timestamp'] - period_start) + return group + + def normalize_time(ts): + if isinstance(ts, pd.Timestamp): + return ts.replace(year=1900, month=1, day=1) + return ts + + df = df.groupby('period', group_keys=False).apply(normalize_period_time) + + for i in range(len(inplay_info_list)): + current_info = inplay_info_list[i] + + start_time = normalize_time(current_info['timestamp']) + period = current_info['period'] + num = current_info['inplay_num'] + + period_mask = (df['period'] == period) + + next_event_in_same_period = None + for j in range(i + 1, len(inplay_info_list)): + if int(inplay_info_list[j]['period']) == period: + next_event_in_same_period = normalize_time(inplay_info_list[j]['timestamp']) + break + + if next_event_in_same_period is not None: + time_mask = (df['tmp_timestamp'] >= start_time) & (df['tmp_timestamp'] < next_event_in_same_period) + else: + time_mask = (df['tmp_timestamp'] >= start_time) + + final_mask = period_mask & time_mask + df.loc[final_mask, 'inplay_num'] = num + + df = df.dropna(subset=['inplay_num']) + + base_time = pd.Timestamp('1900-01-01 00:00:00') + df['match_time'] = (df['tmp_timestamp'] - base_time).dt.total_seconds() * 1000 + df = df[df['match_time'] % 200 == 0] + df['match_time'] = df['match_time'].astype(int) + df = df.drop(columns=['tmp_timestamp']) + + df['period'] = df['period'].astype(int) + df['inplay_num'] = df['inplay_num'].astype(int) + + return df.reset_index(drop=True) + + # Load the event data + with open(sb_event_path, 'rb') as f: + sb_event = pickle.load(f) + with open(sc_tracking_path, 'r', encoding='utf-8') as f: + sc_tracking = json.load(f) + with open(sc_match_path, 'r', encoding='utf-8') as f: + sc_match = json.load(f) + with open(sc_players_path, 'r', encoding='utf-8') as f: + sc_players = json.load(f) + + team_meta_df, player_meta_df = extract_meta_info_from_match(sc_match, sc_players) + + left_team_id = get_left_team_id(sc_tracking, team_meta_df, player_meta_df) + + tracking_df = process_all_tracking(sc_tracking, team_meta_df, player_meta_df, left_team_id) + + inplay_info_list = get_inplay_start_time(sb_event) + + processed_tracking_df = get_inplay_tracking(tracking_df, inplay_info_list) + + return processed_tracking_df \ No newline at end of file diff --git a/preprocessing/sports/phase_data/soccer/soccer_phase_class.py b/preprocessing/sports/phase_data/soccer/soccer_phase_class.py new file mode 100644 index 0000000..3917c1d --- /dev/null +++ b/preprocessing/sports/phase_data/soccer/soccer_phase_class.py @@ -0,0 +1,36 @@ +#Target data provider [Metrica,Robocup 2D simulation,Statsbomb,Wyscout,Opta data,DataFactory,sportec] + +if __name__ == '__main__': + import soccer_load_data +else: + from . import soccer_load_data + +#create a class to wrap the data source +class Soccer_phase_data: + def __init__(self,data_provider,bp_tracking_xml_path=None,bp_tracking_json_paths=None,bp_event_path=None,bp_meta_data=None, + sb_event_path=None, sc_tracking_path=None, sc_match_path=None, sc_players_path=None): + self.data_provider = data_provider + self.bp_tracking_xml_path = bp_tracking_xml_path + self.bp_tracking_json_paths = bp_tracking_json_paths + self.bp_event_path = bp_event_path + self.bp_meta_data = bp_meta_data + self.sb_event_path = sb_event_path + self.sc_tracking_path = sc_tracking_path + self.sc_match_path = sc_match_path + self.sc_players_path=sc_players_path + + def load_data(self): + #based on the data provider, load the dataloading function from load_data.py (single file) + if self.data_provider == 'bepro': + df=soccer_load_data.load_bepro(self.bp_tracking_xml_path, self.bp_tracking_json_paths, self.bp_event_path, self.bp_meta_data) + elif self.data_provider == 'statsbomb_skillcorner': + df=soccer_load_data.load_statsbomb_skillcorner(sb_event_path=self.sb_event_path, sc_tracking_path=self.sc_tracking_path, sc_match_path=self.sc_match_path, sc_players_path=self.sc_players_path) + # elif self.data_provider == 'pff_fc': + # df=soccer_load_data.load_pff2metrica(self.bp_event_path) + # elif self.data_provider == 'robocup_2d': + # df=soccer_load_data.load_robocup_2d(self.event_path,match_id=self.match_id,tracking_path=self.tracking_path) + # elif self.data_provider == 'datastadium': + # df=soccer_load_data.load_datastadium(self.event_path,self.tracking_home_path,self.tracking_away_path) + else: + raise ValueError('Data provider not supported or not found') + return df diff --git a/pyproject.toml b/pyproject.toml index 9b1f134..9d3840c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "openstarlab_preprocessing" -version = "0.1.41" +version = "0.1.42" description = "openstarlab preprocessing package" readme = "README.md" requires-python = ">=3.8"