From fd07dcc59d0bb50cffff0cde6fcfa51c7808eb2c Mon Sep 17 00:00:00 2001 From: KurodaKento0505 Date: Tue, 2 Dec 2025 17:16:55 +0900 Subject: [PATCH 1/7] generated phase_data folder and updated phase_class.py, soccer_phase_class.py and soccer_load_data.py for bepro. --- preprocessing/sports/phase_data/__init__.py | 0 .../sports/phase_data/phase_class.py | 45 + .../sports/phase_data/soccer/README.md | 39 + .../sports/phase_data/soccer/__init__.py | 0 .../sports/phase_data/soccer/constant.py | 215 +++ .../phase_data/soccer/soccer_load_data.py | 1165 ++++++++++++ .../phase_data/soccer/soccer_phase_class.py | 578 ++++++ .../phase_data/soccer/soccer_plot_row.py | 175 ++ .../phase_data/soccer/soccer_processing.py | 1554 +++++++++++++++++ .../phase_data/soccer/soccer_tracking_data.py | 115 ++ 10 files changed, 3886 insertions(+) create mode 100644 preprocessing/sports/phase_data/__init__.py create mode 100644 preprocessing/sports/phase_data/phase_class.py create mode 100644 preprocessing/sports/phase_data/soccer/README.md create mode 100644 preprocessing/sports/phase_data/soccer/__init__.py create mode 100644 preprocessing/sports/phase_data/soccer/constant.py create mode 100644 preprocessing/sports/phase_data/soccer/soccer_load_data.py create mode 100644 preprocessing/sports/phase_data/soccer/soccer_phase_class.py create mode 100644 preprocessing/sports/phase_data/soccer/soccer_plot_row.py create mode 100644 preprocessing/sports/phase_data/soccer/soccer_processing.py create mode 100644 preprocessing/sports/phase_data/soccer/soccer_tracking_data.py diff --git a/preprocessing/sports/phase_data/__init__.py b/preprocessing/sports/phase_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/preprocessing/sports/phase_data/phase_class.py b/preprocessing/sports/phase_data/phase_class.py new file mode 100644 index 0000000..ea258bd --- /dev/null +++ b/preprocessing/sports/phase_data/phase_class.py @@ -0,0 +1,45 @@ +class Phase_data: + soccer_data_provider = 
['bepro', 'skillcorner', 'pff_fc'] # 'robocup_2d', 'datastadium', + handball_data_provider = [] + rocket_league_data_provider = [] # 'carball' + + def __new__(cls, data_provider, *args, **kwargs): + if data_provider in cls.soccer_data_provider: + from .soccer.soccer_phase_class import Soccer_phase_data + return Soccer_phase_data(data_provider, *args, **kwargs) + elif data_provider in cls.handball_data_provider: + raise NotImplementedError('Handball phase data not implemented yet') + elif data_provider in cls.rocket_league_data_provider: + raise NotImplementedError('rocket_league phase data not implemented yet') + # from .rocket_league.rocket_league_phase_class import Rocket_league_phase_data + # return Rocket_league_phase_data(data_provider, *args, **kwargs) + else: + raise ValueError(f'Unknown data provider: {data_provider}') + + +if __name__ == '__main__': + #check if the Soccer_tracking_data class is correctly implemented + import os + import argparse + import glob + args = argparse.ArgumentParser() + args.add_argument('--data_provider', required=True, choices=['bepro', 'skillcorner', 'pff_fc'], help='kind of data provider') + args.add_argument('--match_id', required=True, help='ID of match data') + data_provider = args.data_provider + match_ids = [str(match_id) for match_id in args.match_id.split(",")] + base_dir = os.getcwd() + f"/test/sports/tracking_data/{data_provider}/" + if data_provider == 'bepro': + for match_id in match_ids: + # The format for bepro has changed from Match ID: 130000(?). 
+ if int(match_id) >= 130000: + file_pattern = os.path.join(base_dir, match_id, f"{match_id}_*_frame_data.json") + tracking_json_paths = sorted(glob.glob(file_pattern)) + preprocessing_df=Phase_data(data_provider=data_provider, bp_tracking_json_paths=tracking_json_paths).load_data() + else: + tracking_path=os.getcwd()+f"/test/sports/tracking_data/{data_provider}/{match_id}/{match_id}_tracker_box_data.xml" + preprocessing_df=Phase_data(data_provider=data_provider, bp_tracking_xml_path=tracking_path).load_data() + elif data_provider == 'skillcorner': + print('not yet') + elif data_provider == 'pff_fc': + print('not yet') + preprocessing_df.to_csv(os.getcwd()+f"/test/sports/tracking_data/{data_provider}/{match_id}/test_data_main.csv",index=False) \ No newline at end of file diff --git a/preprocessing/sports/phase_data/soccer/README.md b/preprocessing/sports/phase_data/soccer/README.md new file mode 100644 index 0000000..3e28411 --- /dev/null +++ b/preprocessing/sports/phase_data/soccer/README.md @@ -0,0 +1,39 @@ +# Event Data in Football/Soccer ⚽ +[![Documentation Status](https://readthedocs.org/projects/openstarlab/badge/?version=latest)](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/index.html) +## Introduction +This package offers functions to load and preprocess event data from various sources in football/soccer. + +## Supported Data Providers +You can find detailed documentation on supported data providers [here](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Event_data/Data_Provider/index.html). 
The supported providers include: + +- DataFactory +- DataStadium +- Metrica +- Opta +- Robocup 2D Simulation +- SoccerTrackv2 (BePro) +- Sportec +- Statsbomb +- Statsbomb with Skillcorner Tracking Data +- Wyscout + +For data format examples, visit [Kloppy](https://github.com/PySport/kloppy/tree/master/kloppy/tests/files) + +## Supported Preprocessing Methods +For information on supported preprocessing methods, visit [this documentation](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Event_data/Data_Format/index.html). The available preprocessing methods are: + +- Unified and Integrated Event Data (UIED) +- NMSTPP (same format required for [Football Match Event Forecast](https://github.com/calvinyeungck/Football-Match-Event-Forecast)) +- Other Event Data Formats + +## Examples +Here are some examples of how to download and preprocess data: + +- **Wyscout Data (NMSTPP format):** + - [Read the Docs Example](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Event_data/Example/Football/Example_1/contents.html) + - [GitHub Example](https://github.com/open-starlab/PreProcessing/blob/master/example/NMSTPP_data.py) + - [Colab Example](https://colab.research.google.com/drive/1c7pAwXDVaT_XTYNHvgvxgmxj0E-6IEKH?authuser=1#scrollTo=p9AZJWlYfJYs) + +- **StatsBomb and SkillCorner Data:** + - [Read the Docs Example](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Event_data/Example/Football/Example_2/contents.html) + - [GitHub Example](https://github.com/open-starlab/PreProcessing/blob/master/example/statsbomb_skillcorner.py) diff --git a/preprocessing/sports/phase_data/soccer/__init__.py b/preprocessing/sports/phase_data/soccer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/preprocessing/sports/phase_data/soccer/constant.py b/preprocessing/sports/phase_data/soccer/constant.py new file mode 100644 index 0000000..7f30b59 --- /dev/null +++ b/preprocessing/sports/phase_data/soccer/constant.py @@ -0,0 +1,215 
@@ +from bidict import bidict + +""" +共通 +""" + +FREQUENCY_HZ_ORIGINAL = 25 +FREQUENCY_HZ_CONVERTED = 10 + + +""" +データスタジアムのカラム +""" +# データスタジアムのカラム"攻撃方向"とトラッキングデータの座標正負との関係 +ATTACKING_DIRECTION_PLUS = 1 # 1の場合、正方向に攻撃 +ATTACKING_DIRECTION_MINUS = 2 # 2の場合、負方向に攻撃 + +# データスタジアムのカラム"ホームアウェイF"とチームの関係 +F_HOME_AWAY_BALL = 0 +F_HOME_AWAY_HOME = 1 +F_HOME_AWAY_AWAY = 2 + +# play.csv(イベントデータ)から抽出するイベント +ACTION_NAME_FOR_ATTACK_ANALYSIS_LIST = [ + # 自チームイベント + 'シュート' + ,'ホームパス' + ,'アウェイパス' + ,'スルーパス' + ,'フィード' + ,'クロス' + ,'GK' + ,'CK' + ,'キックオフ' + ,'スローイン' + ,'ドリブル' + # ,'直接FK' + # ,'間接FK' + + # 敵チームイベント + , 'トラップ' + , 'クリア' + , 'ブロック' +] + + +# アクション一覧 +# usage +# print(ACTION_BIDICT['CK']) # 1 +# print(ACTION_BIDICT.inverse[1]) # CK + +ACTION_BIDICT = bidict({ + 'CK': 1 + ,'シュート': 2 + ,'キックオフ': 3 + ,'クロス': 4 + ,'ハンドクリア': 5 + ,'タッチ': 6 + ,'ボールアウト': 7 + ,'間接FK': 8 + ,'PK': 9 + ,'タックル': 10 + ,'試合中断(試合中)': 11 + ,'フリック>オン': 12 + ,'退場(レッド)': 13 + ,'オフサイド': 14 + ,'ポスト/バー': 15 + ,'ファウル受ける': 16 + ,'前半開始': 17 + ,'ホームパス': 18 + ,'トラップ': 19 + ,'クリア': 20 + ,'直接FK': 21 + ,'前半終了': 22 + ,'後半終了': 23 + ,'オウンゴール': 24 + ,'警告(イエロー)': 25 + ,'ドリブル': 26 + ,'ファウルする': 27 + ,'スルーパス': 28 + ,'キャッチ': 29 + ,'フィード': 30 + ,'アウェイパス': 31 + ,'交代': 32 + ,'後半開始': 33 + ,'インターセプト': 34 + ,'ドロップボール': 35 + ,'GK': 36 + ,'ブロック': 37 + ,'スローイン': 38 +}) + +# アクション優先度(大きいほど高い) +# 同フレームに複数のアクションが紐付いている際、どのアクションを残すかを決定 +ACTION_PRIORITY = { + # 自チームイベント + 'シュート': 10 + ,'ホームパス': 10 + ,'アウェイパス': 10 + ,'スルーパス': 10 + ,'フィード': 10 + ,'クロス': 10 + ,'GK': 8 + ,'CK': 8 + ,'キックオフ': 8 + ,'スローイン': 8 + ,'ドリブル': 8 + # ,'直接FK' + # ,'間接FK' + + # 敵チームイベント + , 'トラップ': 6 + , 'クリア': 9 # シュート、クリア重複あり + + # 共通イベント + , 'ブロック': 4 # トラップ、ブロック重複あり +} + +TEAM_BIDICT = bidict({ + 122: '浦和レッズ' + , 128: 'ガンバ大阪' + , 124: '横浜F・マリノス' + , 127: '名古屋グランパス' + , 126: '清水エスパルス' + , 133: 'セレッソ大阪' + , 136: 'ヴィッセル神戸' + , 30528: '松本山雅FC' + , 120: '鹿島アントラーズ' + , 238: 'ベガルタ仙台' + , 129: 'サンフレッチェ広島' + , 131: 'ジュビロ磐田' + , 207: '大分トリニータ' + , 86: '川崎フロンタ>ーレ' + , 86: 
'川崎フロンターレ' + , 276: '北海道コンサドーレ札幌' + , 270: 'FC東京' + , 130: '湘南ベルマーレ' + , 269: 'サガン鳥栖' +}) + + +N_AGENTS = 22 +EXTRA_FRAME = 4 + +FIELD_LENGTH = 105.0 # unit: meters +FIELD_WIDTH = 68.0 # unit: meters +GOAL_WIDTH = 7.32 # unit: meters +PENALTY_X = 105.0/2-16.5 # left point (unit: meters) +PENALTY_Y = 40.32 # upper point (unit: meters) + +# for gfootball +FIELD_LENGTH_GRF = 1*2 +FIELD_WIDTH_GRF = 0.42*2 +GOAL_WIDTH_GRF = 0.044*2 + +STOP_THRESHOLD = 0.1 # unit: m/s +SPRINT_THRESHOLD = 24000/3600 # unit: m/s (24 km/h) +LONGPASS_THRESHOLD = 30 # unit: meters +HIGHPASS_AGENT_THRESHOLD = 1 # unit: meters +BALL_KEEP_THRESHOLD = 1 # unit: m +SEED = 42 + +# super mini map for gfootball +SMM_WIDTH = 96 +SMM_HEIGHT = 72 + +SMM_LAYERS = ['left_team', 'right_team', 'ball', 'active'] + +# Normalized minimap coordinates +MINIMAP_NORM_X_MIN = -1.0 +MINIMAP_NORM_X_MAX = 1.0 +MINIMAP_NORM_Y_MIN = -1.0 / 2.25 +MINIMAP_NORM_Y_MAX = 1.0 / 2.25 + +MARKER_VALUE = 255 + +# GFootbal actions +ACTION_GRF_19 = bidict({ + 'idle': 0 + ,'left': 1 + ,'top_left': 2 + ,'top': 3 + ,'top_right': 4 + ,'right': 5 + ,'bottom_right': 6 + ,'bottom': 7 + ,'bottom_left': 8 + ,'long_pass': 9 + ,'high_pass': 10 + ,'short_pass': 11 + ,'shot': 12 + ,'sprint': 13 + ,'release_direction': 14 + ,'release_sprint': 15 + ,'sliding': 16 + ,'dribble': 17 + ,'release_dribble': 18 # ,'builtin_ai ': 19 +}) + +ACTION_GRF_14 = bidict({ + 'idle': 0 + ,'left': 1 + ,'top_left': 2 + ,'top': 3 + ,'top_right': 4 + ,'right': 5 + ,'bottom_right': 6 + ,'bottom': 7 + ,'bottom_left': 8 + ,'pass': 9 + ,'shot': 10 + ,'sprint': 11 + ,'release_direction': 12 + ,'release_sprint': 13 +}) \ No newline at end of file diff --git a/preprocessing/sports/phase_data/soccer/soccer_load_data.py b/preprocessing/sports/phase_data/soccer/soccer_load_data.py new file mode 100644 index 0000000..1763139 --- /dev/null +++ b/preprocessing/sports/phase_data/soccer/soccer_load_data.py @@ -0,0 +1,1165 @@ +#Target data provider [Metrica,Robocup 2D 
simulation,Statsbomb,Wyscout,Opta data,DataFactory,sportec] + +import json +import pandas as pd +import numpy as np +import xml.etree.ElementTree as ET +from statsbombpy import sb +from tqdm import tqdm +from datetime import datetime +import os +import pdb +import csv + +def load_bepro(tracking_xml_path: str, tracking_json_paths: list, event_path: str, verbose: bool = False) -> pd.DataFrame: + """ + Loads and processes event and tracking data from soccer match recordings. + + This function combines event data with tracking data by merging based on event time. It also adds + additional features extracted from metadata, such as player information, and converts position + coordinates to the correct scale for analysis. + + Args: + event_path (str): Path to the CSV file containing event data. + tracking_path (str): Path to the XML file containing tracking data. + meta_path (str): Path to the XML file containing match metadata (pitch, teams, players, etc.). + verbose (bool, optional): If True, prints additional information about the merging process and + feature extraction. Default is False. + + Returns: + pd.DataFrame: A DataFrame containing the merged and processed event and tracking data, + with additional features including player positions, speeds, ball position, + and metadata (e.g., player names, shirt numbers, positions). + """ + def extract_tracking_data_from_xml(xml_path): + """ + Parse the XML file and extract tracking data. + + Args: + xml_path (str): Path to the XML file. + Returns: + list of dict: A list containing tracking information for each player in each frame. 
+ """ + tree = ET.parse(xml_path) + root = tree.getroot() + tracking_data = [] + + for frame in root.findall("frame"): + frame_number = int(frame.get("frameNumber")) + match_time = int(frame.get("matchTime")) + + for player in frame: + player_id = player.get("playerId") + loc = player.get("loc") + # Convert loc string to float coordinates + try: + x, y = map(float, loc.strip("[]").split(",")) + tracking_data.append({ + "frame": frame_number, + "match_time": match_time, + "player_id": player_id, + "x": "{:.2f}".format(x * 105 - 52.5), + "y": "{:.2f}".format(y * 68 - 34.0) + }) + except ValueError: + raise ValueError(f"Invalid location format for player {player_id} in frame {frame_number}") + tracking_df = add_period_column(tracking_data) + + return tracking_df + + def extract_tracking_data_from_json(json_path): + """ + Parse the JSON file and extract tracking data. + + Args: + json_path (str): Path to the JSON file. + Returns: + list of dict: A list containing tracking information for each player in each frame. + """ + with open(json_path, "r") as f: + data = json.load(f) + + tracking_data = [] + for frame_number, players in data.items(): + for player in players: + try: + tracking_data.append({ + "frame": int(frame_number), + "match_time": int(player.get("match_time", 0)), + "player_id": "ball" if player.get("player_id") == None else player.get("player_id"), + "x": "{:.2f}".format(float(player.get("x", 0) - 52.5)), + "y": "{:.2f}".format(float(player.get("y", 0) - 34.0)) + }) + except ValueError: + raise ValueError(f"Invalid data format in frame {frame_number}") + tracking_df = add_period_column(tracking_data) + + return tracking_df + + def add_period_column(tracking_data_list): + """ + Add a 'period' column to the tracking_data list. + + Increment the period each time the frame number significantly decreases (resets). + Args: + tracking_data_list (list of dict): A list containing tracking_data. 
+ Returns: + pandas.DataFrame: A DataFrame with the 'period' column added. + """ + + df = pd.DataFrame(tracking_data_list) + first_occurrence_of_frame = df.drop_duplicates(subset=['frame'], keep='first') + frame_diff = first_occurrence_of_frame['frame'].diff().fillna(0) + period_reset = (frame_diff < 0) + period_values = period_reset.cumsum() + 1 + period_map = pd.Series(period_values.values, index=first_occurrence_of_frame['frame']).to_dict() + df['period'] = df['frame'].map(period_map) + cols = ['period'] + [col for col in df.columns if col != 'period'] + df = df[cols] + + return df + + def get_additional_features(event_df, meta_data): + #player info: id name nameEN shirtNumber position + # create features period, seconds, event_type, event_type_2, outcome, home_team, x_unscaled, y_unscaled, + period_dict = {"FIRST_HALF": 1, "SECOND_HALF": 2, "EXTRA_FIRST_HALF": 3, "EXTRA_SECOND_HALF": 4} + event_df["period"] = event_df["event_period"].map(period_dict) + event_df["seconds"] = event_df["event_time"]/1000 + + event_type_list = [] + for i in range(len(event_df)): + event_i = event_df.iloc[i].event_types + # print(event_i) + if not isinstance(event_i, str): + event_type_list.append(None) + else: + event_i = event_i.split(" ")[0] + event_type_list.append(event_i) + event_df["event_type"] = event_type_list + + home_team_dict = {int(team_info["id"]):team_info["side"] for team_info in meta_data["team_info"]} + event_df["home_team"] = event_df["team_id"].map(home_team_dict) + #convert "home" to 1 and "away" to 0 for home_team + event_df["home_team"] = event_df["home_team"].map({"home":1,"away":0}) + + #x and y coordinates of the field (height,width) for the event data (inverse of the tracking data) + event_df["x_unscaled"] = event_df["y"]*int(meta_data["pitch_info"]["width"]) + event_df["y_unscaled"] = event_df["x"]*int(meta_data["pitch_info"]["height"]) + return event_df + + def calculate_sync_bias(event_df, tracking_data, period=1, verbose=False): + # 'FIRST_HALF' 
"SECOND_HALF" + # Calculate the bias between event time and tracking time + limit = 5.0 #seconds + time_list = [key for key in tracking_data.keys()] + #split the time_list into two halves + if period == 1: + time_list = [time for time in time_list if tracking_data[time]['eventPeriod'] == 'FIRST_HALF'] + first_event_time = event_df[event_df["event_period"]=="FIRST_HALF"].iloc[0].event_time if "FIRST_HALF" in event_df["event_period"].values else 0 + elif period == 2: + time_list = [time for time in time_list if tracking_data[time]['eventPeriod'] == 'SECOND_HALF'] + first_event_time = event_df[event_df["event_period"]=="SECOND_HALF"].iloc[0].event_time if "SECOND_HALF" in event_df["event_period"].values else 0 + elif period == 3: + time_list = [time for time in time_list if tracking_data[time]['eventPeriod'] == 'EXTRA_FIRST_HALF'] + first_event_time = event_df[event_df["event_period"]=="EXTRA_FIRST_HALF"].iloc[0].event_time if "EXTRA_FIRST_HALF" in event_df["event_period"].values else 0 + + if time_list == []: + return 0 + + time_list.sort() + start_time = max(time_list[0],0) + #round to the nearest 1000 + start_time = round(start_time/1000)*1000 + print("start_time:",start_time) if verbose else None + #drop the time that exceeds the limit of the event time + time_list = [time for time in time_list if time <= start_time+limit*1000] + #order the time_list in ascending order + time_list.sort() + + ball_coordinates = [] + for time_i in time_list: + tracking_data_i = tracking_data[time_i] + ball_data_i = tracking_data_i['ball']['loc'] + ball_coordinates.append(ball_data_i) + #find the time with the highest acceleration + ball_coordinates = np.array(ball_coordinates) + ball_speed = np.linalg.norm(np.diff(ball_coordinates,axis=0),axis=1) + max_speed_index = np.argmax(ball_speed) + max_speed_time = time_list[max_speed_index] + bias = max_speed_time - first_event_time + + return bias + + def get_tracking_features(event_df, tracking_data, meta_data, verbose=True): + # combine 
the event data with the tracking data via event_time and matchTime + #get the player info + time_list = [key for key in tracking_data.keys()] + time_diff_list = [] + player_dict = {} + home_team_player_count = 0 + away_team_player_count = 0 + home_team_dict = {int(team_info["id"]):team_info["side"] for team_info in meta_data["team_info"]} + for player_i in meta_data["player_info"]: + player_dict[player_i["id"]] = player_i + team_id = int(player_dict[player_i["id"]]['teamId']) + if home_team_dict[team_id] == 'home': + player_dict[player_i["id"]]["player_num"] = home_team_player_count+1 + home_team_player_count += 1 + elif home_team_dict[team_id] == 'away': + player_dict[player_i["id"]]["player_num"] = away_team_player_count+1 + away_team_player_count += 1 + else: + print("team_id not found") + pdb.set_trace() + + #create the additional features + tracking_features=["player_id","x","y","speed"] + meta_features=["name","nameEn","shirtNumber","position"] + ball_features = ["ball_x","ball_y","ball_speed"] + additional_features = tracking_features+meta_features + additional_featurs_dict = {} + for i in range(home_team_player_count): + for j in range(len(additional_features)): + additional_featurs_dict[f"home_{additional_features[j]}_{i+1}"] = [] + for i in range(away_team_player_count): + for j in range(len(additional_features)): + additional_featurs_dict[f"away_{additional_features[j]}_{i+1}"] = [] + for j in range(len(ball_features)): + additional_featurs_dict[ball_features[j]] = [] + + additional_featurs_dict["tracking_time"] = [] + additaional_features_dict_key_list = [key for key in additional_featurs_dict.keys()] + + #get the sync bias for the event and tracking data + bias_1 = calculate_sync_bias(event_df, tracking_data, period=1, verbose=verbose) #FIRST_HALF + bias_2 = calculate_sync_bias(event_df, tracking_data, period=2, verbose=verbose) #SECOND_HALF + bias_3 = calculate_sync_bias(event_df, tracking_data, period=3, verbose=verbose) #EXTRA_FIRST_HALF + + 
print("bias_1:",bias_1,"bias_2:",bias_2,"bias_3:",bias_3) if verbose else None + + if verbose: + iterable = tqdm(range(len(event_df))) + else: + iterable = range(len(event_df)) + for i in iterable: + updated_features = [] + event_time = event_df.iloc[i].event_time + period = event_df.iloc[i].event_period + if period == 'FIRST_HALF': + event_time += bias_1 + elif period == 'SECOND_HALF': + event_time += bias_2 + elif period == 'EXTRA_FIRST_HALF': + event_time += bias_3 + else: + print("period not included") + #find the nearest time in the tracking data + nearest_time = min(time_list, key=lambda x:abs(x-event_time)) + try: + additional_featurs_dict["tracking_time"].append(nearest_time) + updated_features+=["tracking_time"] + except: + pass + time_diff_list.append(nearest_time-event_time) + #get the tracking data + tracking_data_i = tracking_data[nearest_time] + for player_track_j in tracking_data_i['players']: + player_j_id = player_track_j['playerId'] + player_j_num = player_dict[player_j_id]["player_num"] + player_j_team = player_dict[player_j_id]["teamId"] + player_j_home = home_team_dict[int(player_j_team)] + # append the tracking data and meta data to the additional features + additional_featurs_dict[f"{player_j_home}_player_id_{player_j_num}"].append(player_track_j['playerId']) + additional_featurs_dict[f"{player_j_home}_x_{player_j_num}"].append(round(player_track_j['loc'][0]*int(meta_data["pitch_info"]["width"]),2)) + additional_featurs_dict[f"{player_j_home}_y_{player_j_num}"].append(round(player_track_j['loc'][1]*int(meta_data["pitch_info"]["height"]),2)) + additional_featurs_dict[f"{player_j_home}_speed_{player_j_num}"].append(player_track_j['speed']) + additional_featurs_dict[f"{player_j_home}_name_{player_j_num}"].append(player_dict[player_j_id]["name"]) + additional_featurs_dict[f"{player_j_home}_nameEn_{player_j_num}"].append(player_dict[player_j_id]["nameEn"]) + 
additional_featurs_dict[f"{player_j_home}_shirtNumber_{player_j_num}"].append(player_dict[player_j_id]["shirtNumber"]) + additional_featurs_dict[f"{player_j_home}_position_{player_j_num}"].append(player_dict[player_j_id]["position"]) + updated_features+=[f"{player_j_home}_player_id_{player_j_num}",f"{player_j_home}_x_{player_j_num}",f"{player_j_home}_y_{player_j_num}",f"{player_j_home}_speed_{player_j_num}",f"{player_j_home}_name_{player_j_num}",f"{player_j_home}_nameEn_{player_j_num}",f"{player_j_home}_shirtNumber_{player_j_num}",f"{player_j_home}_position_{player_j_num}"] + ball_track = tracking_data_i['ball'] + additional_featurs_dict[f"ball_x"].append(round(ball_track['loc'][0]*int(meta_data["pitch_info"]["width"]),2)) + additional_featurs_dict[f"ball_y"].append(round(ball_track['loc'][1]*int(meta_data["pitch_info"]["height"]),2)) + if ball_track['speed'] == 'NA': + additional_featurs_dict[f"ball_speed"].append(None) + else: + additional_featurs_dict[f"ball_speed"].append(ball_track['speed']) + updated_features+=["ball_x","ball_y","ball_speed"] + # for features in additaional_features_dict_key_list but not in updated_features, append None + for key in additaional_features_dict_key_list: + if key not in updated_features: + additional_featurs_dict[key].append(None) + + #add the additional features to the event_df + out_event_df = event_df.copy() + if verbose: + for key in additional_featurs_dict.keys(): + print(key,len(additional_featurs_dict[key])) + + # Create a DataFrame from the additional features dictionary + additional_features_df = pd.DataFrame(additional_featurs_dict) + + # Concatenate the original event_df with the additional features DataFrame + out_event_df = pd.concat([event_df, additional_features_df], axis=1) + + #print the mean and std of the time_diff_list + if verbose: + print("mean time difference:",round(np.mean(time_diff_list),4)) + print("std time difference:",round(np.std(time_diff_list),4)) + print("max time 
difference:",round(np.max(time_diff_list),4)) + print("min time difference:",round(np.min(time_diff_list),4)) + return out_event_df + + # check if the format is the latest version + if tracking_xml_path is None: + list_of_tracking_dfs = [] + for i in range(len(tracking_json_paths)): + input_json = tracking_json_paths[i] + tracking_df = extract_tracking_data_from_json(input_json) + list_of_tracking_dfs.append(tracking_df) + tracking_df = pd.concat(list_of_tracking_dfs, ignore_index=True) + else: + input_json = tracking_xml_path + tracking_df = extract_tracking_data_from_xml(input_json) + # Load the event data + event_df = pd.read_csv(event_path) + # Get additional features + event_df = get_additional_features(event_df) + # Get tracking features + event_df = get_tracking_features(event_df, tracking_df, verbose=verbose) + + return event_df + +def load_statsbomb_skillcorner(statsbomb_event_dir: str, skillcorner_tracking_dir: str, skillcorner_match_dir: str, statsbomb_match_id: str, skillcorner_match_id: str) -> pd.DataFrame: + """ + Load and merge StatsBomb event data with SkillCorner tracking data. + + Args: + statsbomb_event_dir (str): Directory path for StatsBomb event data. + skillcorner_tracking_dir (str): Directory path for SkillCorner tracking data. + skillcorner_match_dir (str): Directory path for SkillCorner match data. + statsbomb_match_id (str): Match ID for StatsBomb data. + skillcorner_match_id (str): Match ID for SkillCorner data. + + Returns: + pd.DataFrame: Combined DataFrame with event and tracking data. 
+ """ + + # File paths + statsbomb_event_path = f"{statsbomb_event_dir}/{statsbomb_match_id}.csv" + skillcorner_tracking_path = f"{skillcorner_tracking_dir}/{skillcorner_match_id}.json" + skillcorner_match_path = f"{skillcorner_match_dir}/{skillcorner_match_id}.json" + + # Load StatsBomb events + events = pd.read_csv(statsbomb_event_path) + + # Load SkillCorner tracking and match data + with open(skillcorner_tracking_path) as f: + tracking = json.load(f) + + with open(skillcorner_match_path) as f: + match = json.load(f) + + #check if the file exists + if not os.path.exists(statsbomb_event_path): + print(f"Statsbomb event file not found: {statsbomb_event_path}") + return None + if not os.path.exists(skillcorner_tracking_path): + print(f"Skillcorner tracking file not found: {skillcorner_tracking_path}") + return None + if not os.path.exists(skillcorner_match_path): + print(f"Skillcorner match file not found: {skillcorner_match_path}") + return None + + # Team name mapping + team_name_dict = { + 'UD Almería': 'Almería', 'Real Sociedad': 'Real Sociedad', 'Athletic Club de Bilbao': 'Athletic Club', + 'Villarreal CF': 'Villarreal', 'RC Celta de Vigo': 'Celta Vigo', 'Getafe CF': 'Getafe', + 'UD Las Palmas': 'Las Palmas', 'Sevilla FC': 'Sevilla', 'Cadiz CF': 'Cádiz', + 'Atlético Madrid': 'Atlético Madrid', 'RCD Mallorca': 'Mallorca', 'Valencia CF': 'Valencia', + 'CA Osasuna': 'Osasuna', 'Girona FC': 'Girona', 'Real Betis Balompié': 'Real Betis', + 'FC Barcelona': 'Barcelona', 'Deportivo Alavés': 'Deportivo Alavés', 'Granada CF': 'Granada', + 'Rayo Vallecano': 'Rayo Vallecano', 'Real Madrid CF': 'Real Madrid' + } + + home_team_name = team_name_dict[match['home_team']['name']] + away_team_name = team_name_dict[match['away_team']['name']] + + team_dict = { + match['home_team']['id']: {'role': 'home', 'name': home_team_name}, + match['away_team']['id']: {'role': 'away', 'name': away_team_name} + } + + # Convert the trackable object dict + trackable_objects = {} + home_count = 
away_count = 0 + + for player in match['players']: + role = team_dict[player['team_id']]['role'] + position = player['player_role']['name'] + if role == 'home': + trackable_objects[player['trackable_object']] = { + 'name': f"{player['first_name']} {player['last_name']}", + 'team': team_dict[player['team_id']]['name'], + 'role': role, + 'id': home_count, + 'position': position + } + home_count += 1 + elif role == 'away': + trackable_objects[player['trackable_object']] = { + 'name': f"{player['first_name']} {player['last_name']}", + 'team': team_dict[player['team_id']]['name'], + 'role': role, + 'id': away_count, + 'position': position + } + away_count += 1 + + trackable_objects[match['ball']['trackable_object']] = {'name': 'ball', 'team': 'ball', 'role': 'ball', 'position': 'ball'} + ball_id = match['ball']['trackable_object'] + + ##sync the tracking data with the events based on the ball velocity + #get the first 5s of the match + ball_velocity_period_1 = [] + ball_velocity_period_2 = [] + + for frame in tracking: + time = frame['timestamp'] + period = frame['period'] + data = frame['data'] + time_components = time.split(':') if time else None + seconds = float(time_components[0]) * 3600 + float(time_components[1]) * 60 + float(time_components[2]) if time else 0 + if time and period==1 and seconds<=5: + for obj in data: + if obj['trackable_object']==ball_id: + ball_velocity_period_1.append([time, obj['x'], obj['y'],obj['z']]) + + if time and period==2 and seconds <= 45*60+5: + for obj in data: + if obj['trackable_object']==ball_id: + ball_velocity_period_2.append([time, obj['x'], obj['y'],obj['z']]) + + if not ball_velocity_period_1 == [] or not ball_velocity_period_2 == []: + try: + max_velocity_timestamp1, max_velocity1 = calculate_velocity_and_max_timestamp(ball_velocity_period_1) + max_velocity_seconds1 = max_velocity_timestamp1.split(':') + max_velocity_seconds1 = float(max_velocity_seconds1[0]) * 3600 + float(max_velocity_seconds1[1]) * 60 + 
float(max_velocity_seconds1[2]) + except: + max_velocity_seconds1 = -1 + + try: + max_velocity_timestamp2, max_velocity2 = calculate_velocity_and_max_timestamp(ball_velocity_period_2) + max_velocity_seconds2 = max_velocity_timestamp2.split(':') + max_velocity_seconds2 = float(max_velocity_seconds2[0]) * 3600 + float(max_velocity_seconds2[1]) * 60 + float(max_velocity_seconds2[2]) + max_velocity_seconds2 = max_velocity_seconds2 - 45*60 + except: + max_velocity_seconds2 = -1 + + if max_velocity_seconds1 == -1 and max_velocity_seconds2 != -1: + max_velocity_seconds1 = max_velocity_seconds2 + elif max_velocity_seconds1 != -1 and max_velocity_seconds2 == -1: + max_velocity_seconds2 = max_velocity_seconds1 + elif max_velocity_seconds1 == -1 and max_velocity_seconds2 == -1: + max_velocity_seconds1 = max_velocity_seconds2 = 0 + + # Process tracking data + tracking_dict = {} + for frame in tracking: + time = frame['timestamp'] + if time: + time_components = time.split(':') + seconds = float(time_components[0]) * 3600 + float(time_components[1]) * 60 + float(time_components[2]) + period = frame['period'] + if period == 1: + seconds = seconds - max_velocity_seconds1 + elif period == 2: + seconds = seconds - max_velocity_seconds2 + seconds = round(seconds, 1) + uid = f"{period}_{seconds}" + tracking_dict[uid] = frame['data'] + + # Prepare data for DataFrame + df_list = [] + for _, event in events.iterrows(): + event_id = event['id'] + match_id = statsbomb_match_id + period = event['period'] + time = event['timestamp'] + minute = event['minute'] + second = event['second'] + event_type = event['type'] + event_type_2 = None + end_x = end_y = None + if event_type == "Pass": + end_location=event.get('pass_end_location') + #check if end_location is a string + if isinstance(end_location, (str)): + end_location = [float(x) for x in end_location[1:-1].split(",")] + end_x = end_location[0] + end_y = end_location[1] + cross=event.get('pass_cross') + pass_height=event.get('pass_height') + 
pass_type=event.get('pass_type') + if pass_type=="Corner": + event_type_2="Corner" + elif cross and not np.isnan(cross): + event_type_2="Cross" + elif pass_height: + event_type_2=pass_height + elif event_type=="Shot": + event_type_2=event.get('shot_outcome') + + team = event['team'] + home_team = 1 if team == home_team_name else 0 + player = event['player'] + location = event['location'] + + if isinstance(location, str): + location = [float(x) for x in location[1:-1].split(",")] + start_x, start_y = location[0], location[1] + else: + start_x = start_y = None + + time_components = time.split(':') + seconds = round(float(time_components[0]) * 3600 + float(time_components[1]) * 60 + float(time_components[2]), 4) + if period == 2: + seconds += 45 * 60 + elif period == 3: + seconds += 90 * 60 + elif period == 4: + seconds += (90 + 15) * 60 + + seconds_rounded = round(seconds, 1) + uid = f"{period}_{seconds_rounded}" + tracking_data = tracking_dict.get(uid) + home_tracking = [None] * 2 * 23 + away_tracking = [None] * 2 * 23 + home_side = [None] + + if tracking_data: + for obj in tracking_data: + track_obj = trackable_objects[obj['trackable_object']] + if track_obj['role'] == 'home': + home_tracking[2 * track_obj['id']] = obj['x'] + home_tracking[2 * track_obj['id'] + 1] = obj['y'] + elif track_obj['role'] == 'away': + away_tracking[2 * track_obj['id']] = obj['x'] + away_tracking[2 * track_obj['id'] + 1] = obj['y'] + + if track_obj['position'] == "Goalkeeper": + if track_obj['role'] == 'home': + home_gk_x = obj['x'] + elif track_obj['role'] == 'away': + away_gk_x = obj['x'] + + + # Determine the side of the home team based on the goalkeeper's position + if home_gk_x < away_gk_x: + home_side = 'left' + else: + home_side = 'right' + + home_side = [home_side] + + df_list.append([match_id, period, time, minute, second, seconds, event_type, event_type_2, team, home_team, player, start_x, start_y, end_x, end_y, *home_tracking, *away_tracking, *home_side]) + + # Define DataFrame 
def calculate_velocity_and_max_timestamp(data):
    """
    Find the sampling interval with the highest 3-D velocity.

    Parameters
    ----------
    data : list
        List of [timestamp, x, y, z] samples, timestamps formatted
        "%H:%M:%S.%f".

    Returns
    -------
    tuple
        (max_velocity_timestamp, max_velocity): the timestamp that *ends*
        the fastest interval, and the speed over that interval.
    """
    stamps = [sample[0] for sample in data]
    origin = datetime.strptime(stamps[0], "%H:%M:%S.%f")
    elapsed = np.array([
        (datetime.strptime(stamp, "%H:%M:%S.%f") - origin).total_seconds()
        for stamp in stamps
    ])

    # Positions as an (n, 3) array so all per-axis differences come from one diff call.
    positions = np.array([sample[1:4] for sample in data], dtype=float)
    step = np.diff(positions, axis=0)
    dt = np.diff(elapsed)

    # Per-interval velocity components, then their Euclidean magnitude.
    components = step / dt[:, None]
    speeds = np.sqrt((components ** 2).sum(axis=1))

    fastest = np.argmax(speeds)
    # +1 because interval i ends at sample i+1.
    return stamps[fastest + 1], speeds[fastest]
def load_pff2metrica(event_path: str, match_id: str = None, fps: float = 29.97) -> pd.DataFrame:
    """
    Convert PFF FC event data to the Metrica event format.

    Parameters
    ----------
    event_path : str
        Path to a PFF event JSON file. The file holds a list of event dicts;
        nested keys are flattened with '_' (e.g. ``gameEvents_period``).
    match_id : str, optional
        Match identifier to add as the first column, by default None.
    fps : float, optional
        Tracking frame rate used to convert event timestamps into frame
        indices, by default 29.97.

    Returns
    -------
    pd.DataFrame
        DataFrame in Metrica format with columns:
        ['Team', 'Type', 'Subtype', 'Period', 'Start Frame', 'Start Time [s]',
        'End Frame', 'End Time [s]', 'From', 'To', 'Start X', 'Start Y',
        'End X', 'End Y'] (prefixed by 'match_id' when one is given).
        Rows without start coordinates are dropped.
    """
    import math

    with open(event_path, 'r') as f:
        event_data = json.load(f)
    event_df = pd.json_normalize(event_data, sep='_')

    # PFF possession-event code -> descriptive Metrica event name.
    _code_map = {
        'PA': "pass", 'CR': "cross", 'FO': "foul", 'CH': "tackle",
        'SH': "shot", 'CL': "clearance", 'BC': "dribble",
        'IT': "other", 'RE': "other", 'TC': "other",
    }

    def type_id2name(x):
        """Map an event type code to a descriptive name (None for missing)."""
        if x is None or (isinstance(x, (float, int)) and math.isnan(x)):
            return None
        if x in _code_map:
            return _code_map[x]
        print(f"Unmapped event type: {x}")
        return x

    def extract_player_xy(row):
        """
        Extract the (x, y) coordinates of the player involved in an event.

        Looks the player up by ``gameEvents_playerId`` in the home or away
        player list (chosen via ``gameEvents_homeTeam``) and returns a Series
        with 'start_x', 'start_y', 'end_x', 'end_y' (start == end, since only
        one position per player is available here). All None when the team
        flag is missing or the player is not found.
        """
        if row["gameEvents_homeTeam"] is True:
            player_dict = row["homePlayers"]
        elif row["gameEvents_homeTeam"] is False:
            player_dict = row["awayPlayers"]
        else:
            return pd.Series([None, None, None, None], index=["start_x", "start_y", "end_x", "end_y"])

        # Player lists may arrive as stringified Python literals.
        player_dict = ast.literal_eval(player_dict) if isinstance(player_dict, str) else player_dict
        target_player = next((d for d in player_dict if d["playerId"] == row["gameEvents_playerId"]), None)

        if target_player:
            return pd.Series(
                [target_player["x"], target_player["y"], target_player["x"], target_player["y"]],
                index=["start_x", "start_y", "end_x", "end_y"]
            )
        return pd.Series([None, None, None, None], index=["start_x", "start_y", "end_x", "end_y"])

    # Events without a game clock cannot be placed on the timeline.
    event_df = event_df.dropna(subset=['gameEvents_startGameClock']).reset_index(drop=True)

    column_name = ['Team', 'Type', 'Subtype', 'Period',
                   'Start Frame', 'Start Time [s]', 'End Frame', 'End Time [s]',
                   'From', 'To', 'Start X', 'Start Y', 'End X', 'End Y']
    Metrica_df = pd.DataFrame(columns=column_name)
    Metrica_df['Period'] = event_df['gameEvents_period']
    event_df[["start_x", "start_y", "end_x", "end_y"]] = event_df.apply(extract_player_xy, axis=1)
    Metrica_df['Start X'] = event_df['start_x']
    Metrica_df['Start Y'] = event_df['start_y']
    Metrica_df['End X'] = event_df['end_x']
    Metrica_df['End Y'] = event_df['end_y']
    Metrica_df['From'] = event_df['gameEvents_playerName']
    Metrica_df['To'] = event_df['possessionEvents_receiverPlayerName']
    Metrica_df['Type'] = event_df['possessionEvents_possessionEventType'].apply(type_id2name)

    idx = event_df.index

    def col(name):
        """Safe getter: returns the column, or an all-NA Series if missing."""
        return event_df[name] if name in event_df.columns else pd.Series(pd.NA, index=idx)

    # Raw outcome columns (one per possession-event kind).
    pass_out = col('possessionEvents_passOutcomeType')
    cross_out = col('possessionEvents_crossOutcomeType')
    shot_out = col('possessionEvents_shotOutcomeType')
    clr_out = col('possessionEvents_clearanceOutcomeType')
    tkl_out = col('possessionEvents_challengeOutcomeType')
    carry_out = col('possessionEvents_ballCarryOutcome')
    touch_out = col('possessionEvents_touchOutcomeType')

    # Per-action success masks.
    # NOTE(review): the outcome-code semantics below ('C' complete, 'G' goal,
    # 'B'/'D' failed clearance, ...) follow the original mapping — confirm
    # against the PFF outcome-code specification.
    event_df['pass_success'] = pass_out.isin(['C'])
    event_df['cross_success'] = cross_out.isin(['C'])
    event_df['shot_success'] = shot_out.isin(['G'])
    event_df['clearance_success'] = ~clr_out.isin(['B', 'D']) & clr_out.notna()
    event_df['tackle_success'] = tkl_out.isin(['B', 'C', 'M'])
    event_df['dribble_success'] = carry_out.isin(['R'])
    event_df['touch_success'] = touch_out.isin(['R'])

    # Where each action is present (outcome not NaN), set Subtype from success.
    event_df['Subtype'] = np.nan

    def apply_subtype(success_col, present_series):
        """Set Subtype ('success'/'fail') for rows where this action occurs."""
        is_present = present_series.notna()
        success = event_df[success_col] == True
        fail = event_df[success_col] == False
        event_df.loc[is_present & success, 'Subtype'] = 'success'
        event_df.loc[is_present & fail, 'Subtype'] = 'fail'

    apply_subtype('pass_success', pass_out)
    apply_subtype('cross_success', cross_out)
    apply_subtype('shot_success', shot_out)
    apply_subtype('clearance_success', clr_out)
    apply_subtype('tackle_success', tkl_out)
    apply_subtype('dribble_success', carry_out)
    apply_subtype('touch_success', touch_out)
    Metrica_df['Subtype'] = event_df['Subtype']

    # Times come from the in-game clock; frames from wall-clock timestamps.
    Metrica_df['Start Time [s]'] = (event_df['gameEvents_startGameClock']).round().astype(int)
    Metrica_df['End Time [s]'] = (event_df['duration'] + event_df['gameEvents_startGameClock']).round().astype(int)

    # Positional access (was a label lookup, which is fragile after reindexing).
    first_start = event_df['startTime'].iloc[0]
    Metrica_df['Start Frame'] = ((event_df['startTime'] - first_start) * fps).round().astype(int)
    end_frame = ((event_df['endTime'] - first_start) * fps).round()
    Metrica_df['End Frame'] = end_frame.fillna(Metrica_df['Start Frame']).astype(int)
    Metrica_df['Team'] = np.where(event_df['gameEvents_homeTeam'] == True, 'Home',
                                  np.where(event_df['gameEvents_homeTeam'] == False, 'Away', None))

    # Drop rows without start coordinates (player not found in the frame).
    Metrica_df = Metrica_df.dropna(subset=['Start X', 'Start Y'])
    Metrica_df = Metrica_df.reset_index(drop=True)

    if match_id is not None:
        Metrica_df['match_id'] = match_id
        cols = Metrica_df.columns.tolist()
        Metrica_df = Metrica_df[cols[-1:] + cols[:-1]]

    return Metrica_df
+ """ + # Load data + event = pd.read_csv(datastadium_event_path, encoding='shift_jis') + home_tracking = pd.read_csv(datastadium_home_tracking_path) + away_tracking = pd.read_csv(datastadium_away_tracking_path) + + # Define required columns and flags + required_columns = [ + "試合ID", "ホームアウェイF", "チーム名", "選手名", "アクション名", "F_成功", + "位置座標X", "位置座標Y", "敵陣F", "点差", "自スコア", "相手スコア", + "F_ゴール", "F_セーブ", "F_シュートGK以外", "F_ミスヒット", "ゴール角度", + "ゴール距離", "F_パス", "F_クロス", "F_ドリブル", "F_クリア", + "F_ハンドクリア", "F_ゴールキック", "F_コーナーキック", "F_直接フリーキック", + "F_間接フリーキック", "絶対時間秒数", "フレーム番号","距離" + ] + flags = [ + "F_ゴール", "F_セーブ", "F_シュートGK以外", "F_ミスヒット", "F_パス", + "F_クロス", "F_ドリブル", "F_クリア", "F_ハンドクリア", "F_ゴールキック", + "F_コーナーキック", "F_直接フリーキック", "F_間接フリーキック" + ] + event_type_dict = { + "前半開始": "First Half Start", "前半終了": "First Half End", "後半開始": "Second Half Start", + "後半終了": "Second Half End", "延長前半開始": "Overtime First Half Start", + "延長前半終了": "Overtime First Half End", "延長後半開始": "Overtime Second Half Start", + "延長後半終了": "Overtime Second Half End", "再延長前半開始": "Second Overtime First Half Start", + "再延長前半終了": "Second Overtime First Half End", "再延長後半開始": "Second Overtime Second Start", + "再延長後半終了": "Second Overtime Second End", "PK戦開始": "PK Start", "PK戦終了": "PK End", + "シュート": "Shoot", "GK": "GK", "直接FK": "Direct FK", "キャッチ": "Catch", + "警告(イエロー)": "YellowCard", "PK": "PK", "CK": "CK", "間接FK": "Indirect FK", + "オフサイド": "Offside", "退場(レッド)": "RedCard", "交代": "Change", "キックオフ": "KickOff", + "ファウルする": "Foul", "オウンゴール": "OwnGoal", "ホームパス": "HomePass", + "アウェイパス": "AwayPass", "PKパス": "PKPass", "ポジション変更": "Position Change", + "中断": "Suspension", "ドリブル": "Dribble", "スルーパス": "Through Pass", + "ハンドクリア": "Hand Clear", "ファウル受ける": "Foul", "ドロップボール": "Drop Ball", + "ボールアウト": "Ball Out", "インターセプト": "Intercept", "クリア": "Clear", + "ブロック": "Block", "スローイン": "ThrowIn", "クロス": "Cross", "トラップ": "Trap", + "PK合戦": "PK Battle", "試合再開": "Resume", "フィード": "Feed", "タッチ": "Touch", + "タックル": "Tackle", "フリックオン": "FrickOn", 
"試合中断": "Suspension", + "ポスト/バー": "Post Bar", "試合中断(試合中)": "Suspension(InGame)", + "試合再開(試合中)": "Resume(InGame)" + } + flag_dict = { + "F_ゴール": "Goal", "F_セーブ": "Save", "F_シュートGK以外": "Shot(not_GK)", + "F_ミスヒット": "MissHit", "F_パス": "Pass", "F_クロス": "Cross", "F_ドリブル": "Dribble", + "F_クリア": "Clear", "F_ハンドクリア": "HandClear", "F_ゴールキック": "GoalKick", + "F_コーナーキック": "CornerKick", "F_直接フリーキック": "DirectFreeKick", + "F_間接フリーキック": "IndirectFreeKick" + } + + # Filter columns and preprocess data + event = event[required_columns].copy() + event["絶対時間秒数"] = event["絶対時間秒数"].astype(float) + event = event.sort_values(by="絶対時間秒数") + + # Create event_type_2 column based on flags + def get_event_type_2(row): + event_types = [flag_dict[f] for f in flags if row[f] == 1] + return "/".join(event_types) if event_types else None + + event["event_type_2"] = event.apply(get_event_type_2, axis=1) + event = event.drop(columns=flags) + + # Rename columns + event.columns = [ + "match_id", "home", "team", "player", "event_type", "success", + "start_x", "start_y", "opp_field", "point_diff", "self_score", + "opp_score", "angle2goal", "dist2goal", "absolute_time", + "frame", "dist", "event_type_2" + ] + + # Reorder columns + event = event[[ + "match_id", "team", "home", "player", "frame", "absolute_time", + "event_type", "event_type_2", "success", "start_x", "start_y","dist", + "opp_field", "point_diff", "self_score", "opp_score", "angle2goal", + "dist2goal" + ]] + + # Convert event_type to English + event["event_type"] = event["event_type"].map(event_type_dict).fillna(event["event_type"]) + + # Calculate period, minute, and second + def calculate_time(row, half_start, period_flag): + time_elapsed = float(row["absolute_time"]) - half_start + return int(time_elapsed / 60), round(time_elapsed % 60, 4) + + period, minute, second = [], [], [] + half_start = float(event.iloc[0]["absolute_time"]) + period_flag = 1 + + for _, row in event.iterrows(): + if row["event_type"] == "Second Half Start": + 
period_flag = 2 + half_start = float(row["absolute_time"]) + + period.append(period_flag) + m, s = calculate_time(row, half_start, period_flag) + minute.append(m) + second.append(s) + + event["Period"] = period + event["Minute"] = minute + event["Second"] = second + + # Reorder columns + event = event[[ + "match_id", "Period", "Minute", "Second", "frame", "absolute_time", + "team", "home", "player", "event_type", "event_type_2", "success", + "start_x", "start_y", "dist", "opp_field", "point_diff", "self_score", + "opp_score", "angle2goal", "dist2goal" + ]] + + #reset the index + event.reset_index(drop=True, inplace=True) + + # get the tracking start time for 2nd half + tracking_start_time_2 = home_tracking[home_tracking["Period"] == 2].iloc[0]["Time [s]"] + + #sort both tracking data + home_tracking = home_tracking.sort_values(by="Time [s]").reset_index(drop=True) + away_tracking = away_tracking.sort_values(by="Time [s]").reset_index(drop=True) + + home_tracking_time = home_tracking["Time [s]"].round(2).values + tracking_col_home = [f"Home_{i}_x" for i in range(1, 15)] + [f"Home_{i}_y" for i in range(1, 15)] + tracking_col_away = [f"Away_{i}_x" for i in range(1, 15)] + [f"Away_{i}_y" for i in range(1, 15)] + + # Calculate event times vectorized + event_time = event["Minute"] * 60 + event["Second"] + tracking_start_time_2 * (event["Period"] == 2) + + # Find nearest indices using numpy + nearest_indices = np.searchsorted(home_tracking_time, event_time,side='left') + nearest_indices = np.clip(nearest_indices, 0, len(home_tracking_time) - 1) + + # Get the corresponding tracking data + home_tracking_data = home_tracking.iloc[nearest_indices][tracking_col_home].values + away_tracking_data = away_tracking.iloc[nearest_indices][tracking_col_away].values + + # pdb.set_trace() + + # Combine the results + new_df = pd.concat([event, pd.DataFrame(home_tracking_data, columns=tracking_col_home), + pd.DataFrame(away_tracking_data, columns=tracking_col_away)], axis=1) + + + # 
def load_robocup_2d(event_path: str, match_id: str = None, tracking_path: str = None) -> pd.DataFrame:
    """
    Load RoboCup 2D event data and, when available, attach tracking rows.

    Args:
        event_path (str): Path to the CSV file containing event data.
        match_id (str, optional): Identifier stored in every output row.
        tracking_path (str, optional): Path to the CSV file with
            cycle-indexed tracking data.

    Returns:
        pd.DataFrame: Events sorted by 'seconds', extended with score, ball
        and player coordinates when tracking data was supplied.
    """
    events = pd.read_csv(event_path)
    tracking = pd.read_csv(tracking_path) if tracking_path else None

    # Output schema: event fields first, then (optionally) tracking fields.
    columns = ["match_id", "seconds", "event_type", "outcome", "team",
               "player", "start_x", "start_y", "end_x", "end_y"]
    if tracking_path:
        columns += [" l_score", " r_score", " b_x", " b_y"]
        columns += [c for i in range(1, 12) for c in (f" l{i}_x", f" l{i}_y")]
        columns += [c for i in range(1, 12) for c in (f" r{i}_x", f" r{i}_y")]

    rows = []
    for idx, rec in events.iterrows():
        base = [match_id, rec.get('Time1', None), rec.get('Type', None),
                rec.get('Success', None), rec.get('Side1', None),
                rec.get('Unum1', None), rec.get('X1', None), rec.get('Y1', None),
                rec.get('X2', None), rec.get('Y2', None)]

        seconds = base[1]
        if tracking_path and seconds in tracking[' cycle'].values:
            matched = tracking[tracking[' cycle'] == seconds]
            if matched.shape[0] != 1:
                # Ambiguous cycle match: keep the original behaviour of
                # reporting and skipping the event.
                print(f"Error: Tracking record {idx} has more than one row")
                continue
            # Keep only tracking fields that belong to the output schema.
            track_vals = {k: v for k, v in matched.iloc[0].to_dict().items() if k in columns}
            rows.append(base + list(track_vals.values()))
        else:
            rows.append(base)

    df = pd.DataFrame(rows, columns=columns)
    return df.sort_values(by="seconds").reset_index(drop=True)
    # --- Manual smoke-test fixtures: paths into the repo's test data tree ---
    robocup_2d_tracking_path=os.getcwd()+"/test/sports/event_data/data/robocup_2d/202307091024-HELIOS2023_1-vs-CYRUS_0.csv"
    sportec_event_path=os.getcwd()+"/test/sports/event_data/data/sportec/sportec_events.xml"
    sportec_tracking_path=os.getcwd()+"/test/sports/event_data/data/sportec/sportec_positional.xml"
    sportec_meta_path=os.getcwd()+"/test/sports/event_data/data/sportec/sportec_meta.xml"
    statsbomb_event_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/events/3805010.json"
    statsbomb_360_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/three-sixty/3805010.json"
    statsbomb_api_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/api.json"
    # NOTE(review): the following absolute paths only exist on the lab server.
    statsbomb_skillcorner_event_path="/data_pool_1/laliga_23/statsbomb/events"
    statsbomb_skillcorner_tracking_path="/data_pool_1/laliga_23/skillcorner/tracking"
    statsbomb_skillcorner_match_path="/data_pool_1/laliga_23/skillcorner/match"
    wyscout_event_path=os.getcwd()+"/test/sports/event_data/data/wyscout/events_England.json"
    wyscout_matches_path=os.getcwd()+"/test/sports/event_data/data/wyscout/matches_England.json"
    datastadium_event_path=os.getcwd()+"/test/sports/event_data/data/datastadium/2019022307/play.csv"
    datastadium_home_tracking_path=os.getcwd()+"/test/sports/event_data/data/datastadium/2019022307/home_tracking.csv"
    datastadium_away_tracking_path=os.getcwd()+"/test/sports/event_data/data/datastadium/2019022307/away_tracking.csv"

    # Per-loader smoke tests; uncomment individually to run against the
    # fixtures above and dump the result next to the input data.

    #test load_datafactory
    # datafactory_df=load_datafactory(datafactory_path)
    # datafactory_df.to_csv(os.getcwd()+"/test/sports/event_data/data/datafactory/test_data.csv",index=False)

    #test load_metrica
    # metrica_df=load_metrica(metrica_event_json_path,1,metrica_tracking_home_path,metrica_tracking_away_path)
    # metrica_df.to_csv(os.getcwd()+"/test/sports/event_data/data/metrica/test_data_json.csv",index=False)
    # metrica_df=load_metrica(metrica_event_csv_path,1,metrica_tracking_home_path,metrica_tracking_away_path)
    # metrica_df.to_csv(os.getcwd()+"/test/sports/event_data/data/metrica/test_data_csv.csv",index=False)

    #test load_opta_xml
    # opta_df=load_opta_xml(opta_f24_path,1)
    # opta_df.to_csv(os.getcwd()+"/test/sports/event_data/data/opta/test_data.csv",index=False)

    #test load_robocup_2d
    # robocup_2d_df=load_robocup_2d(robocup_2d_event_path,1,robocup_2d_tracking_path)
    # robocup_2d_df.to_csv(os.getcwd()+"/test/sports/event_data/data/robocup_2d/test_data.csv",index=False)

    #test load_sportec
    # sportec_df=load_sportec(sportec_event_path,sportec_tracking_path,sportec_meta_path)
    # sportec_df.to_csv(os.getcwd()+"/test/sports/event_data/data/sportec/test_data.csv",index=False)

    #test load_statsbomb with json file
    # statsbomb_df=load_statsbomb(statsbomb_event_path,statsbomb_360_path)
    # statsbomb_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_data.csv",index=False)

    # test load_statsbomb with api data
    # statsbomb_df=load_statsbomb(match_id=3795108)
    # statsbomb_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_api_data.csv",index=False)

    #test load_statsbomb_skillcorner
    # statsbomb_skillcorner_df=load_statsbomb_skillcorner(statsbomb_skillcorner_event_path,statsbomb_skillcorner_tracking_path,
    #                                statsbomb_skillcorner_match_path,3894907,1553748)
    # statsbomb_skillcorner_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_data.csv",index=False)

    #test load_wyscout
    # wyscout_df=load_wyscout(wyscout_event_path,wyscout_matches_path)
    # wyscout_df.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_data.csv",index=False)


    #test load_datastadium
    # event=load_datastadium(datastadium_event_path,datastadium_home_tracking_path,datastadium_away_tracking_path)
    # event.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/load.csv",index=False)

    #test load_soccertrack
    soccer_track_event_path="/data_pool_1/soccertrackv2/2023-11-18/Event/event.csv"
soccer_track_tracking_path="/data_pool_1/soccertrackv2/2023-11-18/Tracking/tracking.xml" + soccer_track_meta_path="/data_pool_1/soccertrackv2/2023-11-18/Tracking/meta.xml" + df_soccertrack=load_bepro(soccer_track_event_path,soccer_track_tracking_path,soccer_track_meta_path,True) + df_soccertrack.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/soccertrack/test_load_function_sync.csv",index=False) + + print("----------------done-----------------") + # pdb.set_trace() + diff --git a/preprocessing/sports/phase_data/soccer/soccer_phase_class.py b/preprocessing/sports/phase_data/soccer/soccer_phase_class.py new file mode 100644 index 0000000..3468c3d --- /dev/null +++ b/preprocessing/sports/phase_data/soccer/soccer_phase_class.py @@ -0,0 +1,578 @@ +#Target data provider [Metrica,Robocup 2D simulation,Statsbomb,Wyscout,Opta data,DataFactory,sportec] + +''' +format of the data source +Metrica:csv and json (tracking data will be included in the future due to lack of matching data) +Robocup 2D simulation:csv and gz +Statsbomb: json +Wyscout: json +Opta data:xml +DataFactory:json +sportec:xml +DataStadium:csv +soccertrack:csv and xml +''' + +import os +import pandas as pd +from tqdm import tqdm +from concurrent.futures import ThreadPoolExecutor, as_completed + +if __name__ == '__main__': + import soccer_load_data + import soccer_processing + import soccer_tracking_data +else: + from . import soccer_load_data + from . import soccer_processing + from . 
import soccer_tracking_data +import pdb + +#create a class to wrap the data source +class Soccer_phase_data: + def __init__(self,data_provider,bp_tracking_xml_path=None,bp_tracking_json_paths=None, + event_path=None,match_id=None,tracking_home_path=None,tracking_away_path=None, + tracking_path=None,meta_data=None,statsbomb_api_args=[], + statsbomb_match_id=None,skillcorner_match_id=None,max_workers=1,match_id_df=None, + statsbomb_event_dir=None, skillcorner_tracking_dir=None, skillcorner_match_dir=None, + preprocess_method=None,sb360_path=None,wyscout_matches_path=None, + st_track_path=None, st_meta_path=None,verbose=False, + preprocess_tracking=False): + self.data_provider = data_provider + self.bp_tracking_xml_path = bp_tracking_xml_path + self.bp_tracking_json_paths = bp_tracking_json_paths + self.event_path = event_path + self.match_id = match_id + self.tracking_home_path = tracking_home_path + self.tracking_away_path = tracking_away_path + self.tracking_path = tracking_path + self.meta_data = meta_data + self.statsbomb_api_args = statsbomb_api_args + self.statsbomb_match_id = statsbomb_match_id + self.sb360_path = sb360_path + self.skillcorner_match_id = skillcorner_match_id + self.max_workers = max_workers + self.match_id_df = match_id_df + self.statsbomb_event_dir = statsbomb_event_dir + self.skillcorner_tracking_dir = skillcorner_tracking_dir + self.skillcorner_match_dir = skillcorner_match_dir + self.preprocess_method = preprocess_method + self.wyscout_matches_path=wyscout_matches_path + self.st_track_path = st_track_path + self.st_meta_path = st_meta_path + self.preprocess_tracking = preprocess_tracking + self.verbose = verbose + self.call_preprocess = False + + def load_data_single_file(self): + #based on the data provider, load the dataloading function from load_data.py (single file) + if self.data_provider == 'bepro': + df=soccer_load_data.load_bepro(self.bp_tracking_xml_path, self.bp_tracking_json_paths, self.event_path) + elif self.data_provider == 
    def load_data_single_file(self):
        """
        Load one match worth of data by dispatching on ``self.data_provider``.

        Returns:
            pd.DataFrame: The loaded event/tracking data for a single match.

        Raises:
            ValueError: If ``self.data_provider`` is not supported.
        """
        #based on the data provider, load the dataloading function from load_data.py (single file)
        if self.data_provider == 'bepro':
            df=soccer_load_data.load_bepro(self.bp_tracking_xml_path, self.bp_tracking_json_paths, self.event_path)
        elif self.data_provider == 'pff_fc':
            df=soccer_load_data.load_pff2metrica(self.event_path, match_id=self.match_id)
        elif self.data_provider == 'robocup_2d':
            df=soccer_load_data.load_robocup_2d(self.event_path,match_id=self.match_id,tracking_path=self.tracking_path)
        elif self.data_provider == 'statsbomb_skillcorner':
            df=soccer_load_data.load_statsbomb_skillcorner(statsbomb_event_dir=self.statsbomb_event_dir, skillcorner_tracking_dir=self.skillcorner_tracking_dir, skillcorner_match_dir=self.skillcorner_match_dir, statsbomb_match_id=self.statsbomb_match_id, skillcorner_match_id=self.skillcorner_match_id)
            # Optional preprocessing; skipped when call_preprocess is set
            # (presumably by a higher-level pipeline step — confirm).
            if self.preprocess_tracking and not self.call_preprocess:
                df=soccer_tracking_data.statsbomb_skillcorner_tracking_data_preprocessing(df)
            if self.preprocess_method is not None and not self.call_preprocess:
                df=soccer_tracking_data.statsbomb_skillcorner_event_data_preprocessing(df,process_event_coord=False)
        elif self.data_provider == 'datastadium':
            df=soccer_load_data.load_datastadium(self.event_path,self.tracking_home_path,self.tracking_away_path)
        else:
            raise ValueError('Data provider not supported or not found')
        return df
self.match_id_df = pd.read_csv(self.match_id_df) + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + # Submit tasks to the executor + futures = [executor.submit(self.load_match_statsbomb_skillcorner, i, self.match_id_df, + self.statsbomb_event_dir,self.skillcorner_tracking_dir,self.skillcorner_match_dir) + for i in range(len(self.match_id_df))] + # Collect the results as they complete + for future in tqdm(as_completed(futures), total=len(futures)): + out_df_list.append(future.result()) + df = pd.concat(out_df_list) + #other data providers + elif self.data_provider in ['datafactory','opta','wyscout','pff_fc']: + event_path = self.event_path + files = sorted(os.listdir(self.event_path)) + files = [f for f in files if not f.startswith('.')] + if self.data_provider == "pff_fc": + #only json files + files = [f for f in files if f.endswith('.json')] + out_df_list = [] + if self.data_provider == "opta": + if self.match_id is None: + match_id=self.match_id + elif self.data_provider == "wyscout": + matches_path=self.wyscout_matches_path + count=0 + for f in tqdm(files, total=len(files)): + if self.data_provider == "opta": + if self.match_id is None: + self.match_id = match_id[count] + else: + self.match_id = count + count+=1 + elif self.data_provider == "wyscout": + self.wyscout_matches_path=os.path.join(matches_path, f.replace("events_","matches_")) + elif self.data_provider == "pff_fc": + self.match_id = f.split(".")[0] + self.event_path = os.path.join(event_path, f) + df = self.load_data_single_file() + out_df_list.append(df) + df = pd.concat(out_df_list) + self.event_path = event_path + if self.data_provider == "opta": + self.match_id = match_id + elif self.data_provider == "wyscout": + self.wyscout_matches_path=matches_path + # other data providers + elif self.data_provider in ['metrica','robocup_2d','sportec']: + #warnging that the event data and tracking data will be matched via the file name + print('Warning: Event data and tracking data will be 
matched via the file name') + event_path = self.event_path + files = sorted(os.listdir(self.event_path)) + files = [f for f in files if not f.startswith('.')] + out_df_list = [] + if self.data_provider in ['metrica']: + tracking_home_path = self.tracking_home_path + tracking_away_path = self.tracking_away_path + for f in files: + self.event_path = os.path.join(event_path, f) + self.tracking_home_path = os.path.join(tracking_home_path,f.replace("RawEventsData","RawTrackingData_Home_Team")) + self.tracking_away_path = os.path.join(tracking_away_path,f.replace("RawEventsData","RawTrackingData_Away_Team")) + #check if the tracking data exists + if os.path.isfile(self.tracking_home_path) and os.path.isfile(self.tracking_away_path): + df = self.load_data_single_file() + out_df_list.append(df) + else: + print(f'Tracking data not found for {f}') + df = pd.concat(out_df_list) + self.event_path = event_path + self.tracking_home_path = tracking_home_path + self.tracking_away_path = tracking_away_path + elif self.data_provider == 'robocup_2d': + tracking_path = self.tracking_path + for f in files: + self.event_path = os.path.join(event_path, f) + self.tracking_path = os.path.join(tracking_path,f.replace("pass","")) + self.match_id = f.replace("pass","").replace(".csv","") + if os.path.isfile(self.tracking_path): + df = self.load_data_single_file() + out_df_list.append(df) + else: + print(f'Tracking data not found for {f}') + df = pd.concat(out_df_list) + self.event_path = event_path + self.tracking_path = tracking_path + self.match_id = None + elif self.data_provider == 'sportec': + tracking_path = self.tracking_path + meta_path = self.meta_data + for f in files: + self.event_path = os.path.join(event_path, f) + self.tracking_path = os.path.join(tracking_path,f.replace("events","positional")) + self.meta_path = os.path.join(meta_path,f.replace("events","meta")) + if os.path.isfile(self.tracking_path) and os.path.isfile(self.meta_path): + df = self.load_data_single_file() + 
out_df_list.append(df) + else: + print(f'Tracking data or Meta data not found for {f}') + df = pd.concat(out_df_list) + self.event_path = event_path + self.tracking_path = tracking_path + self.meta_path = meta_path + # statsbomb + elif self.data_provider == 'statsbomb': + print('Warning: Event data and 360 data will be matched via the file name') + out_df_list = [] + if self.statsbomb_match_id is None: + files = sorted(os.listdir(self.event_path)) + files = [f for f in files if not f.startswith('.')] + event_path = self.event_path + sb360_path = self.sb360_path + def process_file(f): + event_path_local = os.path.join(event_path, f) + sb360_path_local = os.path.join(sb360_path, f) if sb360_path is not None else None + self.event_path = event_path_local + self.sb360_path = sb360_path_local + return self.load_data_single_file() + + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + futures = {executor.submit(process_file, f): f for f in files} + for future in tqdm(as_completed(futures), total=len(futures)): + result = future.result() + if result is not None: + out_df_list.append(result) + + df = pd.concat(out_df_list) + self.event_path = event_path + self.sb360_path = sb360_path + else: + if isinstance(self.statsbomb_match_id, list): + files = self.statsbomb_match_id + else: + files = [self.statsbomb_match_id] + + def process_id(f): + self.statsbomb_match_id = str(f) + return self.load_data_single_file() + + for f in tqdm(files, total=len(files)): + out_df_list.append(process_id(f)) + + df = pd.concat(out_df_list) + self.statsbomb_match_id = files + # datastadium + elif self.data_provider == "datastadium": + out_df_list = [] + + event_dir = self.event_path + + def process_event_folder(f): + # Define file paths for the current event folder + self.event_path = os.path.join(event_dir, f, 'play.csv') + self.tracking_home_path = os.path.join(event_dir, f, 'home_tracking.csv') + self.tracking_away_path = os.path.join(event_dir, f, 'away_tracking.csv') + + 
# Load data + df = self.load_data_single_file() + return df + + # Initialize ThreadPoolExecutor + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + # Get list of event folders + event_folders = sorted(f for f in os.listdir(self.event_path) if not (f.startswith('.') or f.startswith('@'))) + # Submit tasks to the executor + future_to_event = {executor.submit(process_event_folder, folder): folder for folder in event_folders} + # Collect results + out_df_list = [] + for future in tqdm(as_completed(future_to_event), total=len(future_to_event)): + try: + df = future.result() + out_df_list.append(df) + except Exception as e: + print(f'Error processing folder {future_to_event[future]}: {e}') + self.event_path = event_dir + df = pd.concat(out_df_list) + + else: + raise ValueError('Event path is not a valid file or directory') + print(f'Loaded data from {self.data_provider}') + return df + + def load_match_statsbomb_skillcorner(self,i, match_id_df, statsbomb_skillcorner_event_path, + statsbomb_skillcorner_tracking_path, statsbomb_skillcorner_match_path): + statsbomb_match_id = match_id_df.loc[i, "match_id_statsbomb"] + skillcorner_match_id = match_id_df.loc[i, "match_id_skillcorner"] + try: + statsbomb_skillcorner_df = soccer_load_data.load_statsbomb_skillcorner( + statsbomb_skillcorner_event_path, + statsbomb_skillcorner_tracking_path, + statsbomb_skillcorner_match_path, + statsbomb_match_id, + skillcorner_match_id + ) + except: #Exception as e: + # print("An error occurred:", e) + print(f"Skipped match statsbomb match_id: {statsbomb_match_id}") + statsbomb_skillcorner_df=None + return statsbomb_skillcorner_df + + def preprocessing_single_df(self,df): + df_out=None + if self.data_provider in ["statsbomb", "wyscout","statsbomb_skillcorner","datastadium"]: + if self.data_provider in ["statsbomb","statsbomb_skillcorner"]: + df = df.reset_index(drop=True) + df_out=soccer_processing.UIED_statsbomb(df) + elif self.data_provider == "datastadium": + 
df_out=soccer_processing.UIED_datastadium(df) + elif self.data_provider == "wyscout": + if self.preprocess_method == "UIED": + df_out=soccer_processing.UIED_wyscout(df) + elif self.preprocess_method == "LEM": + df_out=soccer_processing.lem(df) + elif self.preprocess_method == "NMSTPP": + df_out=soccer_processing.nmstpp(df) + elif self.preprocess_method == "SEQ2EVENT": + df_out=soccer_processing.seq2event(df) + else: + raise ValueError(f'Preprocessing method {self.preprocess_method} not found') + else: + raise ValueError(f'Preprocessing method not supported for {self.data_provider}') + return df_out + + def preprocessing(self): + self.call_preprocess = True + print(f'Preprocessing data from {self.data_provider} with method {self.preprocess_method}') + if self.preprocess_method is not None: + df = self.load_data() + out_df_list = [] + + # df_out=self.preprocessing_single_df(df) + # return df_out + + def process_single_match(match_id): + df_single = df[df.match_id == match_id] + return self.preprocessing_single_df(df_single) + + unique_match_ids = df.match_id.unique() + # unique_match_ids = [df.match_id.unique()[0]] + + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + future_to_match_id = {executor.submit(process_single_match, match_id): match_id for match_id in unique_match_ids} + + for future in tqdm(as_completed(future_to_match_id), total=len(future_to_match_id)): + match_id = future_to_match_id[future] + try: + df_single = future.result() + out_df_list.append(df_single) + except Exception as e: + print(f'Exception for match_id {match_id}: {e}') + + df = pd.concat(out_df_list) if len(out_df_list) > 1 else out_df_list[0] + df = df.reset_index(drop=True) + df['index_column'] = df.index + df = df.sort_values(by=['match_id', "index_column"]) + df = df.drop(columns=['index_column']) + else: + raise ValueError('Preprocessing method not found') + print(f'Preprocessed data from {self.data_provider} with method {self.preprocess_method}') + 
self.call_preprocess = False + return df + +if __name__ == '__main__': + datafactory_path=os.getcwd()+"/test/sports/event_data/data/datafactory/datafactory_events.json" + metrica_event_json_path=os.getcwd()+"/test/sports/event_data/data/metrica/metrica_events.json" + metrica_event_csv_path=os.getcwd()+"/test/sports/event_data/data/metrica/Sample_Game_1/Sample_Game_1_RawEventsData.csv" + metrica_tracking_home_path=os.getcwd()+"/test/sports/event_data/data/metrica/Sample_Game_1/Sample_Game_1_RawTrackingData_Home_Team.csv" + metrica_tracking_away_path=os.getcwd()+"/test/sports/event_data/data/metrica/Sample_Game_1/Sample_Game_1_RawTrackingData_Away_Team.csv" + opta_f7_path=os.getcwd()+"/test/sports/event_data/data/opta/opta_f7.xml" + opta_f24_path=os.getcwd()+"/test/sports/event_data/data/opta/opta_f24.xml" + robocup_2d_event_path=os.getcwd()+"/test/sports/event_data/data/robocup_2d/202307091024-HELIOS2023_1-vs-CYRUS_0-pass.csv" + robocup_2d_tracking_path=os.getcwd()+"/test/sports/event_data/data/robocup_2d/202307091024-HELIOS2023_1-vs-CYRUS_0.csv" + sportec_event_path=os.getcwd()+"/test/sports/event_data/data/sportec/sportec_events.xml" + sportec_tracking_path=os.getcwd()+"/test/sports/event_data/data/sportec/sportec_positional.xml" + sportec_meta_path=os.getcwd()+"/test/sports/event_data/data/sportec/sportec_meta.xml" + statsbomb_event_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/events/3805010.json" + statsbomb_360_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/three-sixty/3805010.json" + statsbomb_api_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/api.json" + statsbomb_skillcorner_event_path="/data_pool_1/laliga_23/statsbomb/events" + statsbomb_skillcorner_tracking_path="/data_pool_1/laliga_23/skillcorner/tracking" + statsbomb_skillcorner_match_path="/data_pool_1/laliga_23/skillcorner/match" + wyscout_event_path=os.getcwd()+"/test/sports/event_data/data/wyscout/events_England.json" + 
datastadium_event_path=os.getcwd()+"/test/sports/event_data/data/datastadium/2019022307/play.csv" + datastadium_tracking_home_path=os.getcwd()+"/test/sports/event_data/data/datastadium/2019022307/home_tracking.csv" + datastadium_tracking_away_path=os.getcwd()+"/test/sports/event_data/data/datastadium/2019022307/away_tracking.csv" + + #test single file + + #test load_datafactory + # datafactory_df=Event_data(data_provider='datafactory',event_path=datafactory_path).load_data() + # datafactory_df.to_csv(os.getcwd()+"/test/sports/event_data/data/datafactory/test_data_main.csv",index=False) + + #test load_metrica + # metrica_df=Event_data(data_provider='metrica',event_path=metrica_event_csv_path,match_id=1, + # tracking_home_path=metrica_tracking_home_path,tracking_away_path=metrica_tracking_away_path).load_data() + # metrica_df.to_csv(os.getcwd()+"/test/sports/event_data/data/metrica/test_data_csv_main.csv",index=False) + # metrica_df=Event_data(data_provider='metrica',event_path=metrica_event_json_path,match_id=1).load_data() + # metrica_df.to_csv(os.getcwd()+"/test/sports/event_data/data/metrica/test_data_json_main.csv",index=False) + + #test load_opta_xml + # opta_df=Event_data(data_provider='opta',event_path=opta_f24_path,match_id=1).load_data() + # opta_df.to_csv(os.getcwd()+"/test/sports/event_data/data/opta/test_data_main.csv",index=False) + + #test load_robocup_2d + # robocup_2d_df=Event_data(data_provider='robocup_2d',event_path=robocup_2d_event_path,match_id=1,tracking_path=robocup_2d_tracking_path).load_data() + # robocup_2d_df.to_csv(os.getcwd()+"/test/sports/event_data/data/robocup_2d/test_data_main.csv",index=False) + + #test load_sportec + # sportec_df = Event_data(data_provider='sportec', event_path=sportec_event_path, tracking_path=sportec_tracking_path, meta_data=sportec_meta_path).load_data() + # sportec_df.to_csv(os.getcwd()+"/test/sports/event_data/data/sportec/test_data_main.csv",index=False) + + #test load_statsbomb with json file + # 
statsbomb_df=Event_data(data_provider='statsbomb',event_path=statsbomb_event_path,sb360_path=statsbomb_360_path).load_data() + # statsbomb_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_data_main.csv",index=False) + + # test load_statsbomb with api data + # statsbomb_df=Event_data(data_provider='statsbomb',statsbomb_match_id=3795108).load_data() + # statsbomb_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_api_data_main.csv",index=False) + + #test load_statsbomb_skillcorner + # statsbomb_skillcorner_df=Event_data(data_provider='statsbomb_skillcorner', + # statsbomb_event_dir=statsbomb_skillcorner_event_path, + # skillcorner_tracking_dir=statsbomb_skillcorner_tracking_path, + # skillcorner_match_dir=statsbomb_skillcorner_match_path, + # statsbomb_match_id=3894907, + # skillcorner_match_id=1553748 + # ).load_data() + # statsbomb_skillcorner_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_data_main.csv",index=False) + + #test load_wyscout + # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path).load_data() + # wyscout_df.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_data_main.csv",index=False) + + # test load_datastadium + # datastadium_df=Event_data(data_provider='datastadium',event_path=datastadium_event_path, + # tracking_home_path=datastadium_tracking_home_path,tracking_away_path=datastadium_tracking_away_path).load_data() + # datastadium_df.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/load_class_single.csv",index=False) + + + + #test preprocessing + # seq2event + # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,preprocess_method="SEQ2EVENT",max_workers=10).preprocessing() + # wyscout_df.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_seq2event_main.csv",index=False) + + #test nmstpp + # 
wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,preprocess_method="NMSTPP",max_workers=10).preprocessing() + # wyscout_df.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_nmstpp_main.csv",index=False) + + #test lem + # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,preprocess_method="LEM",max_workers=10).preprocessing() + # wyscout_df.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_lem_main.csv",index=False) + + #test UIED wyscout + # df_wyscout=Event_data(data_provider='wyscout',event_path=wyscout_event_path,preprocess_method="UIED",max_workers=10).preprocessing() + # df_wyscout.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_wyscout_UIED_main.csv",index=False) + + #test UIED statsbomb_skillcorner + # df_statsbomb_skillcorner=Event_data(data_provider='statsbomb_skillcorner', + # statsbomb_event_dir=statsbomb_skillcorner_event_path, + # skillcorner_tracking_dir=statsbomb_skillcorner_tracking_path, + # skillcorner_match_dir=statsbomb_skillcorner_match_path, + # statsbomb_match_id=3894907, + # skillcorner_match_id=1553748, + # preprocess_method="UIED", + # max_workers=10).preprocessing() + # df_statsbomb_skillcorner.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_preprocess_statsbomb_skillcorner_UIED_main.csv",index=False) + + #test UIED statsbomb_json + # df_statsbomb_json=Event_data(data_provider='statsbomb',event_path=statsbomb_event_path,sb360_path=statsbomb_360_path,preprocess_method="UIED").preprocessing() + # df_statsbomb_json.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_preprocess_statsbomb_json_UIED_main.csv",index=False) + + #test UIED statsbomb_api + # df_statsbomb_api=Event_data(data_provider='statsbomb',statsbomb_match_id=3795108,preprocess_method="UIED").preprocessing() + # 
df_statsbomb_api.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_preprocess_statsbomb_api_UIED_main.csv",index=False) + + #test UIED datastadium + # df_datastadium=Event_data(data_provider='datastadium',event_path=datastadium_event_path, + # tracking_home_path=datastadium_tracking_home_path,tracking_away_path=datastadium_tracking_away_path, + # preprocess_method="UIED").preprocessing() + # df_datastadium.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/preprocess_UIED_class_single.csv",index=False) + + + + + + + + + + # multiple files + # statsbomb_df=Event_data(data_provider='statsbomb',statsbomb_match_id=[3788742,3788741]).load_data() + # statsbomb_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_api_data_main_multi.csv",index=False) + + #test load_statsbomb_skillcorner + # statsbomb_skillcorner_df=Event_data(data_provider='statsbomb_skillcorner', + # statsbomb_event_dir=statsbomb_skillcorner_event_path, + # skillcorner_tracking_dir=statsbomb_skillcorner_tracking_path, + # skillcorner_match_dir=statsbomb_skillcorner_match_path, + # match_id_df=os.getcwd()+'/preprocessing/example/id_matching.csv', + # max_workers=10).load_data() + # statsbomb_skillcorner_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_data_main_multi.csv",index=False) + + #test load_statsbomb_json + # multi_event_path="/data_pool_1/statsbomb_2023/events_and_frames/data/events" + # multi_360_path="/data_pool_1/statsbomb_2023/events_and_frames/data/360-frames" + + # statsbomb_df=Event_data(data_provider='statsbomb',event_path=multi_event_path,sb360_path=multi_360_path,max_workers=10).load_data() + # statsbomb_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_data_main_multi.csv",index=False) + + #test load_wyscout + # wyscout_event_path="/home/c_yeung/workspace6/python/openstarlab/PreProcessing/test/sports/event_data/data/wyscout/event" + # 
wyscout_matches_path="/home/c_yeung/workspace6/python/openstarlab/PreProcessing/test/sports/event_data/data/wyscout/matches" + # wyscout_df=Event_data(data_provider='wyscout', + # event_path=wyscout_event_path, + # wyscout_matches_path=wyscout_matches_path, + # max_workers=10).load_data() + # wyscout_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_data_main_multi.csv",index=False) + + #test load_datastadium multiple files + # datastadium_df=Event_data(data_provider='datastadium',event_path=datastadium_dir,max_workers=10).load_data() + # datastadium_df.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/load_class_multi.csv",index=False) + + #test preprocessing multi files + # wyscout_event_path="/home/c_yeung/workspace6/python/openstarlab/PreProcessing/test/sports/event_data/data/wyscout/event" + # wyscout_matches_path="/home/c_yeung/workspace6/python/openstarlab/PreProcessing/test/sports/event_data/data/wyscout/matches" + # statsbomb_df=Event_data(data_provider='statsbomb',statsbomb_match_id=[3788742,3788741]).load_data() + # statsbomb_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_api_data_main_multi.csv",index=False) + #seq2event + # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,wyscout_matches_path=wyscout_matches_path, + # preprocess_method="SEQ2EVENT",max_workers=10).preprocessing() + # wyscout_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_seq2event_main_multi.csv",index=False) + + #nmstpp + # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,wyscout_matches_path=wyscout_matches_path, + # preprocess_method="NMSTPP",max_workers=10).preprocessing() + # wyscout_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_seq2event_main_multi.csv",index=False) + + #lem + # 
wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,wyscout_matches_path=wyscout_matches_path, + # preprocess_method="LEM",max_workers=10).preprocessing() + # wyscout_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_seq2event_main_multi.csv",index=False) + + #UIED wyscout + # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,wyscout_matches_path=wyscout_matches_path, + # preprocess_method="UIED",max_workers=10).preprocessing() + # wyscout_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_seq2event_main_multi.csv",index=False) + + #UIED statsbomb_skillcorner + # statsbomb_skillcorner_df=Event_data(data_provider='statsbomb_skillcorner', + # statsbomb_event_dir=statsbomb_skillcorner_event_path, + # skillcorner_tracking_dir=statsbomb_skillcorner_tracking_path, + # skillcorner_match_dir=statsbomb_skillcorner_match_path, + # match_id_df=os.getcwd()+'/preprocessing/example/id_matching.csv', + # preprocess_method="UIED", + # ).preprocessing() + # statsbomb_skillcorner_df.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_preprocess_statsbomb_skillcorner_UIED_main_multi.csv",index=False) + + #UIED statsbomb_json + # multi_event_path="/data_pool_1/statsbomb_2023/events_and_frames/data/events" + # multi_360_path="/data_pool_1/statsbomb_2023/events_and_frames/data/360-frames" + + # statsbomb_df=Event_data(data_provider='statsbomb',event_path=multi_event_path,sb360_path=multi_360_path,preprocess_method="UIED",max_workers=10).preprocessing() + # statsbomb_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_preprocess_statsbomb_json_UIED_main_multi.csv",index=False) + + #UIED statsbomb_api (could not test due to Max retries exceeded) + + #test UIED datastadium multiple files + # 
df_datastadium=Event_data(data_provider='datastadium',event_path=datastadium_dir,preprocess_method="UIED",max_workers=10).preprocessing() + # df_datastadium.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/preprocess_UIED_class_multi.csv",index=False) + + #test soccertrack + soccer_track_event_path="/data_pool_1/soccertrackv2/2024-03-18/Event/event.csv" + soccer_track_tracking_path="/data_pool_1/soccertrackv2/2024-03-18/Tracking/tracking.xml" + soccer_track_meta_path="/data_pool_1/soccertrackv2/2024-03-18/Tracking/meta.xml" + df_soccertrack=Soccer_phase_data('soccertrack',soccer_track_event_path, + st_track_path = soccer_track_tracking_path, + st_meta_path = soccer_track_meta_path, + verbose = True).load_data() + df_soccertrack.to_csv(os.getcwd()+"/test/sports/event_data/data/soccertrack/test_load_soccer_event_class.csv",index=False) + print("-----------done-----------") diff --git a/preprocessing/sports/phase_data/soccer/soccer_plot_row.py b/preprocessing/sports/phase_data/soccer/soccer_plot_row.py new file mode 100644 index 0000000..f1024a8 --- /dev/null +++ b/preprocessing/sports/phase_data/soccer/soccer_plot_row.py @@ -0,0 +1,175 @@ +import matplotlib.pyplot as plt +import pandas as pd +import matplotlib.patches as patches +import os +import pdb + +FIELD_LENGTH = 105.0 # unit: meters +FIELD_WIDTH = 68.0 # unit: meters +GOAL_WIDTH = 7.32 # unit: meters +PENALTY_X = 105.0/2-16.5 # left point (unit: meters) +PENALTY_Y = 40.32 # upper point (unit: meters) + + +def plot_row_soccer(df, row, save_path): + if not isinstance(df, pd.DataFrame): + if isinstance(df, str): + df = pd.read_csv(df) + else: + raise ValueError("The input is not a dataframe or a path to a csv file") + + fig, ax = plt.subplots(figsize=(8, 6)) + fig.subplots_adjust(bottom=0.2) + + # Flip the y-axis + ax.invert_yaxis() + + # Plot the pitch + + # Center line + ax.plot([FIELD_LENGTH/2, FIELD_LENGTH/2], [0, FIELD_WIDTH], color="black", linewidth=0.7) + + # Penalty areas + # pdb.set_trace() 
+ ax.plot([PENALTY_X+FIELD_LENGTH/2, FIELD_LENGTH], [(FIELD_WIDTH-PENALTY_Y)/2, (FIELD_WIDTH-PENALTY_Y)/2], color="black", linewidth=0.7) + ax.plot([PENALTY_X+FIELD_LENGTH/2, FIELD_LENGTH], [(FIELD_WIDTH+PENALTY_Y)/2, (FIELD_WIDTH+PENALTY_Y)/2], color="black", linewidth=0.7) + ax.plot([PENALTY_X+FIELD_LENGTH/2, PENALTY_X+FIELD_LENGTH/2,], [(FIELD_WIDTH-PENALTY_Y)/2, (FIELD_WIDTH+PENALTY_Y)/2], color="black", linewidth=0.7) + + ax.plot([FIELD_LENGTH/2-PENALTY_X, 0], [(FIELD_WIDTH-PENALTY_Y)/2, (FIELD_WIDTH-PENALTY_Y)/2], color="black", linewidth=0.7) + ax.plot([FIELD_LENGTH/2-PENALTY_X, 0], [(FIELD_WIDTH+PENALTY_Y)/2, (FIELD_WIDTH+PENALTY_Y)/2], color="black", linewidth=0.7) + ax.plot([FIELD_LENGTH/2-PENALTY_X, FIELD_LENGTH/2-PENALTY_X], [(FIELD_WIDTH-PENALTY_Y)/2, (FIELD_WIDTH+PENALTY_Y)/2], color="black", linewidth=0.7) + + # Goal areas + ax.plot([5.5, 0], [(FIELD_WIDTH-18.32)/2, (FIELD_WIDTH-18.32)/2], color="black", linewidth=0.7) + ax.plot([5.5, 0], [(FIELD_WIDTH+18.32)/2, (FIELD_WIDTH+18.32)/2], color="black", linewidth=0.7) + ax.plot([5.5, 5.5], [(FIELD_WIDTH-18.32)/2, (FIELD_WIDTH+18.32)/2], color="black", linewidth=0.7) + + ax.plot([FIELD_LENGTH-5.5, FIELD_LENGTH], [(FIELD_WIDTH-18.32)/2, (FIELD_WIDTH-18.32)/2], color="black", linewidth=0.7) + ax.plot([FIELD_LENGTH-5.5, FIELD_LENGTH], [(FIELD_WIDTH+18.32)/2, (FIELD_WIDTH+18.32)/2], color="black", linewidth=0.7) + ax.plot([FIELD_LENGTH-5.5, FIELD_LENGTH-5.5], [(FIELD_WIDTH-18.32)/2, (FIELD_WIDTH+18.32)/2], color="black", linewidth=0.7) + + # # Goals + # ax.plot([-2, -2], [(FIELD_WIDTH-GOAL_WIDTH)/2, (FIELD_WIDTH+GOAL_WIDTH)/2], color="black", linewidth=10) + # ax.plot([FIELD_LENGTH+2, FIELD_LENGTH+2], [(FIELD_WIDTH-GOAL_WIDTH)/2, (FIELD_WIDTH+GOAL_WIDTH)/2], color="black", linewidth=10) + + # Field outline + ax.plot([0, FIELD_LENGTH], [0, 0], color="black", linewidth=2) + ax.plot([0, FIELD_LENGTH], [FIELD_WIDTH, FIELD_WIDTH], color="black", linewidth=2) + ax.plot([0, 0], [0, FIELD_WIDTH], color="black", 
linewidth=2) + ax.plot([FIELD_LENGTH, FIELD_LENGTH], [0, FIELD_WIDTH], color="black", linewidth=2) + + # Center circle + c = patches.Circle(xy=(FIELD_LENGTH/2, FIELD_WIDTH/2), radius=9.15, fill=False, ec='black', linewidth=0.7) + ax.add_patch(c) + + # Penalty arcs + a = patches.Arc((11, FIELD_WIDTH/2), 9.15*2, 9.15*2, theta1=270+37, theta2=90-37, linewidth=0.7) + ax.add_patch(a) + a = patches.Arc((FIELD_LENGTH-11, FIELD_WIDTH/2), 9.15*2, 9.15*2, theta1=90+36, theta2=270-36, linewidth=0.7) + # a = patches.Arc((-FIELD_LENGTH / 2 + 11, 0), 9.15*2, 9.15*2, theta1=270+34, theta2=90-34, linewidth=0.7) + ax.add_patch(a) + + # Set axis limits + ax.set_xlim(-5, FIELD_LENGTH+5) + ax.set_ylim(FIELD_WIDTH+5, -5) + + # Plot the player positions + df = df.reset_index(drop=True) + + row_df = df.iloc[row] + + # Define possession team actions + team_actions =[ 'Pass_Ground Pass', 'Pass_Long_HighPass', + 'Carry_nan', 'Pass_High Pass', 'Pass_Low Pass', + 'Miscontrol_nan', + 'Dribble_nan', 'Clearance_nan', 'Pass_Cross', 'Ball Recovery_nan', + 'Offside_nan', 'Goal Keeper_nan', + 'Dribbled Past_nan', 'Pass_Corner', + 'Shot_Saved', 'Shot_Blocked', 'Shot_Wayward', 'Shot_Off T', 'Shot_Goal', 'Shot_Post', + 'Tactical Shift_nan', 'Shield_nan', + 'Own Goal Against_Own goal', 'Error_nan', + 'Shot_Saved Off Target', 'Ball Receipt*_nan', 'Pressure_nan', 'Interception_nan' + ] + + def plot_player(row_df, ax, switch=False): + if not switch: + for i in range(1, 24): + x = row_df[f"h{i}_x"]+FIELD_LENGTH/2 + y = -(row_df[f"h{i}_y"])+FIELD_WIDTH/2 + if x == 0 and y == 0: + continue + ax.plot(x, y, 'o', color='red') + for i in range(1, 24): + x = row_df[f"a{i}_x"]+FIELD_LENGTH/2 + y = -(row_df[f"a{i}_y"])+FIELD_WIDTH/2 + if x == 0 and y == 0: + continue + ax.plot(x, y, 'o', color='blue') + else: + for i in range(1, 24): + x = -(row_df[f"h{i}_x"])+FIELD_LENGTH/2 + y = (row_df[f"h{i}_y"])+FIELD_WIDTH/2 + if x == 0 and y == 0: + continue + ax.plot(x, y, 'o', color='red') + for i in range(1, 24): + x = 
-(row_df[f"a{i}_x"])+FIELD_LENGTH/2 + y = (row_df[f"a{i}_y"])+FIELD_WIDTH/2 + if x == 0 and y == 0: + continue + ax.plot(x, y, 'o', color='blue') + + #check if col 'action' exists + switch_flag = False + if 'action' in df.columns: + x = row_df["start_x"] + y = row_df["start_y"] + home_team = row_df['home_team'] + home_side = row_df['home_side'] + if home_team == 1 and home_side == 'right': + plot_player(row_df, ax, switch=True) + switch_flag = True + elif home_team == 0 and home_side == 'left': + plot_player(row_df, ax, switch=True) + switch_flag = True + else: + plot_player(row_df, ax, switch=False) + switch_flag = False + elif 'event_type' in df.columns: + x = row_df["start_x"]*(1.05/1.2) + y = row_df["start_y"]*(0.68/0.8) + home_team = row_df['home_team'] + home_side = row_df['home_side'] + action = str(row_df["event_type"])+ "_" + str(row_df["event_type_2"]).replace("None","nan") + poss_team_action = True if action in team_actions else False + if poss_team_action: + if home_team == 1 and home_side == 'right': + plot_player(row_df, ax, switch=True) + switch_flag = True + elif home_team == 0 and home_side == 'left': + plot_player(row_df, ax, switch=True) + switch_flag = True + else: + plot_player(row_df, ax, switch=False) + switch_flag = False + else: + if home_team == 1 and home_side == 'right': + plot_player(row_df, ax, switch=False) + switch_flag = False + elif home_team == 0 and home_side == 'left': + plot_player(row_df, ax, switch=False) + switch_flag = False + else: + plot_player(row_df, ax, switch=True) + switch_flag = True + + + #plot the event location + ax.plot(x, y, 'o', color='black', markersize=3) + + # Set the figure title + ax.set_title(f"Row {row}, action: {action}, seconds: {row_df['seconds']}, home : {row_df.home_team}, switch: {switch_flag}\n red: home team, blue: away team, black: event location") + + # Save the plot + plt.savefig(save_path + f"/row_{row}.png") + plt.close(fig) diff --git 
a/preprocessing/sports/phase_data/soccer/soccer_processing.py b/preprocessing/sports/phase_data/soccer/soccer_processing.py new file mode 100644 index 0000000..3e9a665 --- /dev/null +++ b/preprocessing/sports/phase_data/soccer/soccer_processing.py @@ -0,0 +1,1554 @@ +import os +import pandas as pd +import numpy as np +import pdb + +def seq2event(data): + """ + Processes soccer match event data to determine possession, filter actions, + compute additional metrics, and normalize data. + + Parameters: + data (pd.DataFrame or str): A pandas DataFrame containing event data or a file path to a CSV file. + + Returns: + pd.DataFrame: A processed DataFrame with simplified and normalized event actions. + """ + + # Load data from DataFrame or file path + if isinstance(data, pd.DataFrame): + df = data + elif isinstance(data, str): + if os.path.exists(data): + df = pd.read_csv(data) + else: + raise FileNotFoundError("The file path does not exist") + else: + raise ValueError("The data must be a pandas DataFrame or a file path") + df = df.copy() + # Create 'action' column by concatenating 'event_type' and 'event_type_2' + df.loc[:, "action"] = df["event_type"].astype(str) + "_" + df["event_type_2"].astype(str) + + # Define possession team actions + possession_team_actions = [ + 'Free Kick_Goal kick', 'Free Kick_Throw in', 'Free Kick_Corner', 'Free Kick_Free Kick', + 'Free Kick_Free kick cross', 'Free Kick_Free kick shot', 'Free Kick_Penalty', 'Pass_Cross', + 'Pass_Hand pass', 'Pass_Head pass', 'Pass_High pass', 'Pass_Launch', 'Pass_Simple pass', + 'Pass_Smart pass', 'Shot_Shot', 'Shot_Goal', 'Free Kick_goal', 'Duel_Ground attacking duel_off dribble', + 'Others on the ball_Acceleration', 'Others on the ball_Clearance', 'Others on the ball_Touch_good', + 'Shot_Own_goal', 'Pass_Own_goal', 'Others on the ball_Own_goal' + ] + + possession = [] + seconds = [] + + # Determine possession and adjust seconds for second half + for i in range(len(df)): + if i == 0: + 
possession.append(df["team"].iloc[i]) + else: + if df["team"].iloc[i] == df["team"].iloc[i - 1]: + possession.append(df["team"].iloc[i]) + else: + if df["action"].iloc[i] in possession_team_actions: + possession.append(df["team"].iloc[i]) + else: + possession.append(df["team"].iloc[i - 1]) + + if df["period"].iloc[i] == "2H": + seconds.append(df["seconds"].iloc[i] + 60 * 60) + elif df["period"].iloc[i] == "E1": + seconds.append(df["seconds"].iloc[i] + 120 * 60) + elif df["period"].iloc[i] == "E2": + seconds.append(df["seconds"].iloc[i] + 150 * 60) + elif df["period"].iloc[i] == "P": + seconds.append(df["seconds"].iloc[i] + 180 * 60) + else: + seconds.append(df["seconds"].iloc[i]) + + df.loc[:, "possession_team"] = possession + df.loc[:, "seconds"] = seconds + + # Normalize time + df.loc[:, "seconds"] = df["seconds"] / df["seconds"].max() + #round numerical columns + df = df.round({"seconds": 4}) + + # Filter actions not by team in possession + df = df[df["team"] == df["possession_team"]].reset_index(drop=True) + + # Define simple actions + simple_actions = [ + 'Foul_Foul', 'Foul_Hand foul', 'Foul_Late card foul', 'Foul_Out of game foul', 'Foul_Protest', + 'Foul_Simulation', 'Foul_Time lost foul', 'Foul_Violent Foul', 'Offside_', 'Free Kick_Corner', + 'Free Kick_Free Kick', 'Free Kick_Free kick cross', 'Free Kick_Free kick shot', 'Free Kick_Goal kick', + 'Free Kick_Penalty', 'Free Kick_Throw in', 'Pass_Cross', 'Pass_Hand pass', 'Pass_Head pass', 'Pass_High pass', + 'Pass_Launch', 'Pass_Simple pass', 'Pass_Smart pass', 'Shot_Shot', 'Shot_Goal', 'Shot_Own_goal', 'Free Kick_goal', + 'Others on the ball_Own_goal', 'Pass_Own_goal', 'Duel_Ground attacking duel', 'Others on the ball_Acceleration', + 'Others on the ball_Clearance', 'Others on the ball_Touch', 'Others on the ball_Touch_good', + 'Duel_Ground attacking duel_off dribble' + ] + + # Filter out non-simple actions + df = df[df["action"].isin(simple_actions)].reset_index(drop=True) + + # Calculate match score + def 
calculate_match_score(df): + home_team_score_list = [] + away_team_score_list = [] + score_diff_list = [] + + for match_id in df.match_id.unique(): + home_team_score = 0 + away_team_score = 0 + #check if column home_team only have one unique value + if len(df[df["match_id"] == match_id].home_team.unique())>1: + home_team_id = df[df["match_id"] == match_id][df["home_team"]==1].team.unique()[0] + else: + home_team_id = df.team.unique()[0] + match_df = df[df["match_id"] == match_id].reset_index(drop=True) + + for i in range(len(match_df)): + if match_df.iloc[i].event_type_2 == "Goal": + if match_df["team"].iloc[i] == home_team_id: + home_team_score += 1 + else: + away_team_score += 1 + elif match_df.iloc[i].event_type_2 == "Own_goal": + if match_df["team"].iloc[i] == home_team_id: + away_team_score += 1 + else: + home_team_score += 1 + score_diff = home_team_score - away_team_score + home_team_score_list.append(home_team_score) + away_team_score_list.append(away_team_score) + score_diff_list.append(score_diff) + + return home_team_score_list, away_team_score_list, score_diff_list + + home_team_score_list, away_team_score_list, score_diff_list = calculate_match_score(df) + df["home_team_score"] = home_team_score_list + df["away_team_score"] = away_team_score_list + df["score_diff"] = score_diff_list + + # Set possession id + poss_id_list = [] + poss_id = 0 + for i in range(len(df)): + if i == 0: + poss_id_list.append(0) + else: + if df["possession_team"].iloc[i] == df["possession_team"].iloc[i - 1] and df["period"].iloc[i] == df["period"].iloc[i - 1]: + poss_id_list.append(poss_id) + else: + poss_id += 1 + poss_id_list.append(poss_id) + df["poss_id"] = poss_id_list + + + # Add a row in between the first and last row of each possession + new_df = [] + for poss_id in df.poss_id.unique(): + temp_df = df[df["poss_id"] == poss_id].reset_index(drop=True) + for j in range(len(temp_df)): + new_df.append(temp_df.iloc[j]) + new_row = temp_df.iloc[-1].copy() + new_row["action"] = 
"_" + new_df.append(new_row) + + # Concatenate all rows in new_df + new_df = pd.concat(new_df, axis=1).T.reset_index(drop=True) + + # Simplify actions + drop_list = [ + 'Foul_Foul', 'Foul_Hand foul', 'Foul_Late card foul', 'Foul_Out of game foul', + 'Foul_Protest', 'Foul_Simulation', 'Foul_Time lost foul', 'Foul_Violent Foul', 'Offside_', + 'Others on the ball_Own_goal', 'Pass_Own_goal' + ] + p_list = [ + "Free Kick_Goal kick", 'Free Kick_Throw in', 'Free Kick_Free Kick', 'Pass_Hand pass', + 'Pass_Head pass', 'Pass_High pass', 'Pass_Launch', 'Pass_Simple pass', 'Pass_Smart pass', + 'Others on the ball_Clearance' + ] + d_list = [ + 'Duel_Ground attacking duel_off dribble', 'Others on the ball_Acceleration', 'Others on the ball_Touch_good' + ] + x_list = [ + 'Free Kick_Corner', 'Free Kick_Free kick cross', 'Pass_Cross' + ] + s_list = [ + 'Free Kick_Free kick shot', 'Free Kick_Penalty', 'Shot_Shot', 'Shot_Goal', 'Shot_Own_goal' + ] + + new_df = new_df[~new_df["action"].isin(drop_list)].reset_index(drop=True) + action_list = [] + for action in new_df["action"]: + if action in p_list: + action_list.append("p") + elif action in d_list: + action_list.append("d") + elif action in x_list: + action_list.append("x") + elif action in s_list: + action_list.append("s") + elif action == "_": + action_list.append("_") + else: + action_list.append(action) + + new_df["action"] = action_list + + df = new_df.copy() + + # Calculate additional metrics + def calculate_additional_metrics(df): + time_diff_list = [] + distance_list = [] + distance2goal_list = [] + angle_list = [] + x_diff_list = [] + y_diff_list = [] + + for match_id in df.match_id.unique(): + match_df = df[df["match_id"] == match_id].reset_index(drop=True) + for i in range(len(match_df)): + if i == 0: + time_diff = 0 + distance = 0 + distance2goal = 0 + angle = 0.5 + x_diff = 0 + y_diff = 0 + elif match_df.iloc[i].action == "_": + time_diff = 0 + distance = 0 + distance2goal = 0 + angle = 0.5 + x_diff = 0 + y_diff = 0 + 
else: + time_diff = match_df["seconds"].iloc[i] - match_df["seconds"].iloc[i - 1] + distance = ((match_df["start_x"].iloc[i] * 1.05 - match_df["start_x"].iloc[i-1] * 1.05) ** 2 + + (match_df["start_y"].iloc[i] * 0.68 - match_df["start_y"].iloc[i-1] * 0.68) ** 2) ** 0.5 + distance2goal = (((match_df["start_x"].iloc[i] - 100/100) * 1.05) ** 2 + + ((match_df["start_y"].iloc[i] - 50/100) * 0.68) ** 2) ** 0.5 + angle = np.abs(np.arctan2((match_df["start_y"].iloc[i] - 50/100) * 0.68, + (match_df["start_x"].iloc[i] - 100/100) * 1.05)) + x_diff = match_df["start_x"].iloc[i] * 1.05 - match_df["start_x"].iloc[i-1] * 1.05 + y_diff = match_df["start_y"].iloc[i] * 0.68 - match_df["start_y"].iloc[i-1] * 0.68 + + time_diff_list.append(time_diff) + distance_list.append(distance) + distance2goal_list.append(distance2goal) + angle_list.append(angle) + x_diff_list.append(x_diff) + y_diff_list.append(y_diff) + + return time_diff_list, distance_list, distance2goal_list, angle_list, x_diff_list, y_diff_list + + # Scale and normalize columns + df["start_x"] = df["start_x"] / 100 + df["start_y"] = df["start_y"] / 100 + df["end_x"] = df["end_x"] / 100 + df["end_y"] = df["end_y"] / 100 + + (time_diff_list, distance_list, distance2goal_list, angle_list, + x_diff_list, y_diff_list) = calculate_additional_metrics(df) + + df["time_diff"] = time_diff_list + df["distance"] = distance_list + df["distance2goal"] = distance2goal_list + df["angle2goal"] = angle_list + df["x_diff"] = x_diff_list + df["y_diff"] = y_diff_list + + # Scale and normalize columns + # df["distance"] = df["distance"] / df["distance"].max() + # df["distance2goal"] = df["distance2goal"] / df["distance2goal"].max() + # df["angle2goal"] = df["angle2goal"] / df["angle2goal"].max() + # df["x_diff"] = df["x_diff"] / df["x_diff"].max() + # df["y_diff"] = df["y_diff"] / df["y_diff"].max() + + # Clip time differences to a maximum of 0.01 seconds + df["time_diff"] = np.clip(df["time_diff"], 0, 0.01) + + # Round numerical columns + df = 
df.round({"seconds": 4, "time_diff": 4, "distance": 4, "distance2goal": 4, "angle2goal": 4, + "start_x": 4, "start_y": 4, "end_x": 4, "end_y": 4, "x_diff": 4, "y_diff": 4}) + + # Reorder columns + df = df[[ + "comp", "match_id", "poss_id", "team", "action", "start_x", "start_y", "x_diff", "y_diff", + "distance", "distance2goal", "angle2goal", "seconds", "time_diff", "score_diff" + ]] + + return df + +def nmstpp(data): + """ + Processes soccer match event data to determine possession, filter actions, + compute additional metrics, and normalize data. + + Parameters: + data (pd.DataFrame or str): A pandas DataFrame containing event data or a file path to a CSV file. + + Returns: + pd.DataFrame: A processed DataFrame with simplified and normalized event actions. + """ + + # Load data from DataFrame or file path + if isinstance(data, pd.DataFrame): + df = data + elif isinstance(data, str): + if os.path.exists(data): + df = pd.read_csv(data) + else: + raise FileNotFoundError("The file path does not exist") + else: + raise ValueError("The data must be a pandas DataFrame or a file path") + + df=seq2event(df) + #define the zone clusters for Juego de Posición + centroid_x=[ 8.5 , 25.25, 41.75, 58.25, 74.75, 91.5,8.5 , 25.25, 41.75, 58.25, 74.75, + 91.5,33.5, 66.5,33.5, 66.5,33.5, 66.5,8.5,91.5] + centroid_y=[89.45, 89.45, 89.45, 89.45, 89.45, 89.45,10.55, 10.55, 10.55, 10.55, 10.55, 10.55, + 71.05, 71.05,50., 50.,28.95, 28.95, 50.,50.] 
+ + #scale start_x and start_y by 100 + df["start_x"]=df["start_x"]*100 + df["start_y"]=df["start_y"]*100 + + #calculate the zone of the start_x and start_y + zone_list=[] + #get closest zone for each start_x and start_y + for i in range(len(df)): + min_dist=1000 + zone=-1 + for j in range(len(centroid_x)): + dist=np.sqrt((df["start_x"].iloc[i]-centroid_x[j])**2+(df["start_y"].iloc[i]-centroid_y[j])**2) + if dist1: + home_team=match_df[match_df["home_team"]==1].team.unique()[0] + else: + home_team=team_list[0] + home_score=0 + away_score=0 + is_goal=0 + for i in range(len(match_df)): + if match_df["team"].iloc[i]==home_team: + is_home_list.append(1) + if match_df["event_type_2"].iloc[i]=="Goal": + home_score+=1 + is_goal=1 + elif match_df["event_type_2"].iloc[i]=="Own_goal": + away_score+=1 + is_goal=1 + else: + is_home_list.append(0) + if match_df["event_type_2"].iloc[i]=="Goal": + away_score+=1 + is_goal=1 + elif match_df["event_type_2"].iloc[i]=="Own_goal": + home_score+=1 + is_goal=1 + home_score_list.append(home_score) + away_score_list.append(away_score) + is_goal_list.append(is_goal) + df["HomeScore"]=home_score_list + df["AwayScore"]=away_score_list + df["IsHome"]=is_home_list + df["IsGoal"]=is_goal_list + + #convert col accurate from TF to 1 and 0 + df['IsAccurate']=df['accurate'].astype(int) + + #create the EventType + event_type_list=[] + for i in range(len(df)): + event_type=df["event_type_2"].iloc[i] + if event_type=="Goal": + event_type_list.append("Shot") + elif event_type=="own-goal": + event_type_list.append("Shot") + else: + event_type_list.append(event_type) + + df["EventType"]=event_type_list + + #add row period_over and game_over + new_df=[] + for match in df.match_id.unique(): + match_df=df[df["match_id"]==match] + for period in match_df.period.unique(): + period_df=match_df[match_df["period"]==period] + for i in range(len(period_df)): + new_df.append(period_df.iloc[i]) + last_row=period_df.iloc[-1].copy() + #set the IsHome, IsGoal, 
def UIED_wyscout(data):
    """
    Process Wyscout-style soccer match event data into the UIED format.

    Pipeline:
      1. Determine the team in possession for every event and keep only
         events performed by the possession team.
      2. Derive event features (success, home_team, goal, running scores,
         goal difference).
      3. Simplify raw "event_type_event_type_2" labels into a small action
         vocabulary (short_pass / long_pass / high_pass / shot / carry /
         dribble / cross; everything else is dropped).
      4. Derive time features (Period, Minute, Second, delta_T) and
         location features (deltaX, deltaY, distance, dist2goal, angle2goal).
      5. Insert possession-end ("_"), "period_over" and "game_over" marker
         rows, reorder columns, offset `seconds` per period, reset the
         first row of each period, and round the output.

    Parameters:
        data (pd.DataFrame or str): Event DataFrame or path to a CSV file.
            Coordinates are assumed to be 0-100 pitch percentages (implied
            by the *1.05/*0.68 metre scaling below) -- TODO confirm with
            the loader.

    Returns:
        pd.DataFrame: Processed, simplified and normalized event data.

    Raises:
        FileNotFoundError: If a file path is given but does not exist.
        ValueError: If `data` is neither a DataFrame nor a file path.
    """
    # Load data from DataFrame or file path
    if isinstance(data, pd.DataFrame):
        df = data
    elif isinstance(data, str):
        if os.path.exists(data):
            df = pd.read_csv(data)
        else:
            raise FileNotFoundError("The file path does not exist")
    else:
        raise ValueError("The data must be a pandas DataFrame or a file path")

    df = df.copy()
    # Create 'action' column by concatenating 'event_type' and 'event_type_2'
    df["action"] = df["event_type"].astype(str) + "_" + df["event_type_2"].astype(str)

    # Actions that imply the acting team has (or takes over) possession
    possession_team_actions = [
        'Free Kick_Goal kick', 'Free Kick_Throw in', 'Free Kick_Corner', 'Free Kick_Free Kick',
        'Free Kick_Free kick cross', 'Free Kick_Free kick shot', 'Free Kick_Penalty', 'Pass_Cross',
        'Pass_Hand pass', 'Pass_Head pass', 'Pass_High pass', 'Pass_Launch', 'Pass_Simple pass',
        'Pass_Smart pass', 'Shot_Shot', 'Shot_Goal', 'Free Kick_goal', 'Duel_Ground attacking duel_off dribble',
        'Others on the ball_Acceleration', 'Others on the ball_Clearance', 'Others on the ball_Touch_good',
        'Shot_Own_goal', 'Pass_Own_goal', 'Others on the ball_Own_goal'
    ]

    # Determine possession: it only changes hands when the acting team
    # differs from the previous event AND performs a possession action.
    possession = []
    for i in range(len(df)):
        if i == 0:
            possession.append(df["team"].iloc[i])
        elif df["team"].iloc[i] == df["team"].iloc[i - 1]:
            possession.append(df["team"].iloc[i])
        elif df["action"].iloc[i] in possession_team_actions:
            possession.append(df["team"].iloc[i])
        else:
            possession.append(df["team"].iloc[i - 1])

    df["possession_team"] = possession
    # Keep possession-team events only
    df = df[df["team"] == df["possession_team"]].reset_index(drop=True)

    # Event features (success, home_team, goal, home_score, away_score)
    df["success"] = df["accurate"].astype(int)
    home_team_list = []
    goal_list = []
    home_score_list = []
    away_score_list = []
    goal_diff_list = []
    for match in df.match_id.unique():
        match_df = df[df["match_id"] == match]
        team_list = match_df["team"].unique()
        # If 'home_team' has more than one unique value it is a 0/1 flag
        # per row; otherwise fall back to the first team seen as home.
        if len(match_df.home_team.unique()) > 1:
            home_team = match_df[match_df["home_team"] == 1].team.unique()[0]
        else:
            home_team = team_list[0]
        home_score = 0
        away_score = 0
        goal_diff = 0
        for i in range(len(match_df)):
            if match_df["team"].iloc[i] == home_team:
                home_team_list.append(1)
                if match_df["event_type_2"].iloc[i] == "Goal":
                    home_score += 1
                elif match_df["event_type_2"].iloc[i] == "Own_goal":
                    away_score += 1
            else:
                home_team_list.append(0)
                if match_df["event_type_2"].iloc[i] == "Goal":
                    away_score += 1
                elif match_df["event_type_2"].iloc[i] == "Own_goal":
                    home_score += 1
            goal_diff = home_score - away_score
            goal_list.append(1 if match_df["event_type_2"].iloc[i] == "Goal" else 0)
            home_score_list.append(home_score)
            away_score_list.append(away_score)
            goal_diff_list.append(goal_diff)

    df["home_team"] = home_team_list
    df["goal"] = goal_list
    df["home_score"] = home_score_list
    df["away_score"] = away_score_list
    df["goal_diff"] = goal_diff_list

    # Group the raw events into simplified actions.
    # NOTE(review): 'Pass_Cross' is grouped with passes while only the
    # free-kick cross variants land in cross_actions -- confirm intended.
    pass_actions = ['Free Kick_Goal kick', 'Free Kick_Throw in', 'Free Kick_Free Kick', 'Pass_Cross',
                    'Pass_Hand pass', 'Pass_Simple pass', 'Pass_Smart pass', 'Pass_Head pass']
    high_pass_actions = ['Pass_High pass']
    shot_actions = ['Free Kick_Free kick shot', 'Free Kick_Penalty', 'Shot_Shot', 'Shot_Goal', 'Shot_Own_goal']
    carray_actions = ['Others on the ball_Acceleration']
    dribble_actions = ['Duel_Ground attacking duel_off dribble', 'Others on the ball_Touch_good', 'Duel_Air duel']
    cross_actions = ['Free Kick_Corner', 'Free Kick_Free kick cross']
    drop_actions = ['Pass_Launch', 'Free Kick_goal', 'Others on the ball_Clearance', 'Pass_Own_goal',
                    'Others on the ball_Own_goal', 'Foul_Foul', 'Foul_Hand foul', 'Foul_Late card foul',
                    'Foul_Out of game foul', 'Foul_Protest', 'Foul_Simulation', 'Foul_Time lost foul',
                    'Foul_Violent Foul', 'Offside_', 'Duel_Ground loose ball duel', 'Others on the ball_Touch',
                    'Offside_nan', 'Interruption_Ball out of the field', 'Duel_Ground defending duel',
                    'Duel_Ground attacking duel', 'Goalkeeper leaving line_Goalkeeper leaving line',
                    'Interruption_Whistle', 'Save attempt_Reflexes', 'Save attempt_Save attempt'
                    ]
    action_list = []
    for i in range(len(df)):
        if df["action"].iloc[i] in pass_actions:
            # Divide short pass and long pass based on the distance in
            # metres (threshold 45), scaling 0-100 coords by 1.05/0.68.
            distance = np.sqrt(((df["start_x"].iloc[i] - df["end_x"].iloc[i]) * 1.05) ** 2
                               + ((df["start_y"].iloc[i] - df["end_y"].iloc[i]) * 0.68) ** 2)
            if distance >= 45:
                action_list.append("long_pass")
            else:
                action_list.append("short_pass")
        elif df["action"].iloc[i] in high_pass_actions:
            action_list.append("high_pass")
        elif df["action"].iloc[i] in shot_actions:
            action_list.append("shot")
        elif df["action"].iloc[i] in carray_actions:
            action_list.append("carry")
        elif df["action"].iloc[i] in dribble_actions:
            action_list.append("dribble")
        elif df["action"].iloc[i] in cross_actions:
            action_list.append("cross")
        elif df["action"].iloc[i] in drop_actions:
            action_list.append("drop")
        else:
            action = df["action"].iloc[i]
            print(f"Warning: action {action} was not found in the action list, it will be dropped")
            action_list.append("drop")

    df["action"] = action_list
    # Drop the non-kept actions
    df = df[df["action"] != "drop"].reset_index(drop=True)

    # Time features (Period, Minute, Second, delta_T).
    # NOTE(review): an unexpected `period` value appends nothing to
    # period_list and will surface as a length mismatch on assignment
    # below -- confirm the loader only emits 1H/2H/E1/E2/P.
    period_list = []
    minute_list = []
    second_list = []
    delta_t_list = []
    for i in range(len(df)):
        if df["period"].iloc[i] == "1H":
            period_list.append(1)
        elif df["period"].iloc[i] == "2H":
            period_list.append(2)
        elif df["period"].iloc[i] == "E1":
            period_list.append(3)
        elif df["period"].iloc[i] == "E2":
            period_list.append(4)
        elif df["period"].iloc[i] == "P":
            period_list.append(5)
        minute_list.append(df["seconds"].iloc[i] // 60)
        second_list.append((df["seconds"].iloc[i] % 60).round(4))
        if i == 0:
            delta_t_list.append(0)
        else:
            # NOTE(review): the period_over/game_over marker rows are only
            # inserted later, so this guard never fires here -- kept as-is.
            if df.action.iloc[i - 1] == "period_over" or df.action.iloc[i - 1] == "game_over":
                delta_t_list.append(0)
            else:
                delta_t_list.append((df["seconds"].iloc[i] - df["seconds"].iloc[i - 1]).round(4))
    df["Period"] = period_list
    df["Minute"] = minute_list
    df["Second"] = second_list
    df["delta_T"] = delta_t_list

    # Location features (deltaX, deltaY, distance, dist2goal, angle2goal).
    # At i == 0 the iloc[i-1] lookups wrap to the last row; those values
    # are overwritten by the per-period first-row reset further below.
    delta_x_list = []
    delta_y_list = []
    dist_list = []
    dist2goal_list = []
    angle2goal_list = []
    for i in range(len(df)):
        delta_x = df["start_x"].iloc[i] - df["start_x"].iloc[i - 1]
        delta_y = df["start_y"].iloc[i] - df["start_y"].iloc[i - 1]
        distance = ((df["start_x"].iloc[i] * 1.05 - df["start_x"].iloc[i - 1] * 1.05) ** 2
                    + (df["start_y"].iloc[i] * 0.68 - df["start_y"].iloc[i - 1] * 0.68) ** 2) ** 0.5
        # Goal assumed at (100, 50) in the 0-100 coordinate frame.
        dist2goal = (((df["start_x"].iloc[i] - 100) * 1.05) ** 2
                     + ((df["start_y"].iloc[i] - 50) * 0.68) ** 2) ** 0.5
        angle2goal = np.abs(np.arctan2((df["start_y"].iloc[i] - 50) * 0.68,
                                       (df["start_x"].iloc[i] - 100) * 1.05))

        delta_x_list.append(delta_x)
        delta_y_list.append(delta_y)
        dist_list.append(distance)
        dist2goal_list.append(dist2goal)
        angle2goal_list.append(angle2goal)
    df["deltaX"] = delta_x_list
    df["deltaY"] = delta_y_list
    df["distance"] = dist_list
    df["dist2goal"] = dist2goal_list
    df["angle2goal"] = angle2goal_list

    # Scale start_x and start_y to the real pitch size (105m x 68m).
    # FIX: the original multiplied start_x by 0.68 and start_y by 1.05,
    # swapping the pitch axes; every formula above (distance, dist2goal,
    # angle2goal) and the parallel UIED_statsbomb treat x as the 105m
    # axis and y as the 68m axis.
    df["start_x"] = df["start_x"] * 1.05
    df["start_y"] = df["start_y"] * 0.68

    # Possession id: increments whenever the possession team changes,
    # and always at a match boundary.
    poss_id_list = []
    poss_id = 0
    for match in df.match_id.unique():
        match_df = df[df["match_id"] == match]
        for i in range(len(match_df)):
            if i == 0:
                poss_id_list.append(poss_id)
            elif match_df["possession_team"].iloc[i] == match_df["possession_team"].iloc[i - 1]:
                poss_id_list.append(poss_id)
            else:
                poss_id += 1
                poss_id_list.append(poss_id)
        poss_id += 1
    df["poss_id"] = poss_id_list

    # Insert "_" (end of possession) and period_over/game_over marker
    # rows; marker rows carry zeroed features (angle2goal reset to 0.5).
    new_df = []
    for match in df.match_id.unique():
        match_df = df[df["match_id"] == match]
        for period in match_df.Period.unique():
            period_df = match_df[match_df["Period"] == period]
            for poss_id in period_df.poss_id.unique():
                poss_df = period_df[period_df["poss_id"] == poss_id]
                for i in range(len(poss_df)):
                    new_df.append(poss_df.iloc[i])
                last_row = poss_df.iloc[-1].copy()
                last_row["action"] = "_"
                last_row['goal'] = 0
                last_row["success"] = 0
                last_row["deltaX"] = 0
                last_row["deltaY"] = 0
                last_row["distance"] = 0
                last_row["dist2goal"] = 0
                last_row["angle2goal"] = 0.5
                last_row["delta_T"] = 0
                new_df.append(last_row)
            last_row = period_df.iloc[-1].copy()
            last_row['goal'] = 0
            last_row["success"] = 0
            last_row["deltaX"] = 0
            last_row["deltaY"] = 0
            last_row["distance"] = 0
            last_row["dist2goal"] = 0
            last_row["angle2goal"] = 0.5
            last_row["delta_T"] = 0
            if period == df.Period.unique()[-1]:
                last_row["action"] = "game_over"
            else:
                last_row["action"] = "period_over"
            new_df.append(last_row)
    df = pd.concat(new_df, axis=1).T.reset_index(drop=True)

    # Reorder columns
    df = df[['match_id', 'poss_id', 'team', 'home_team', 'action', 'success', 'goal', 'home_score',
             'away_score', 'goal_diff', 'Period', 'Minute', 'Second', 'seconds', "delta_T",
             'start_x', 'start_y', 'deltaX', 'deltaY', 'distance', 'dist2goal', 'angle2goal']]

    # Offset `seconds` per period so it is monotone over the whole match.
    # NOTE(review): offsets are fixed period start times (60/120/150/180
    # minutes) -- presumably because Wyscout clocks restart per period;
    # confirm against the loader.
    seconds_list = []
    for i in range(len(df)):
        if df["Period"].iloc[i] == 1:
            seconds_list.append(df["seconds"].iloc[i])
        elif df["Period"].iloc[i] == 2:
            seconds_list.append(df["seconds"].iloc[i] + 60 * 60)
        elif df["Period"].iloc[i] == 3:
            seconds_list.append(df["seconds"].iloc[i] + 120 * 60)
        elif df["Period"].iloc[i] == 4:
            seconds_list.append(df["seconds"].iloc[i] + 150 * 60)
        elif df["Period"].iloc[i] == 5:
            seconds_list.append(df["seconds"].iloc[i] + 180 * 60)
    df["seconds"] = seconds_list

    # Reset the feature values of the first row of each period to 0
    # (angle2goal to 0.5): the preceding-row deltas are meaningless there.
    new_df = []
    for match in df.match_id.unique():
        match_df = df[df["match_id"] == match]
        for period in match_df.Period.unique():
            period_df = match_df[match_df["Period"] == period].copy()
            for i in range(len(period_df)):
                if i == 0:
                    first_row = period_df.iloc[i].copy()
                    first_row["deltaX"] = 0
                    first_row["deltaY"] = 0
                    first_row["distance"] = 0
                    first_row["dist2goal"] = 0
                    first_row["angle2goal"] = 0.5
                    first_row["delta_T"] = 0
                    new_df.append(first_row)
                else:
                    new_df.append(period_df.iloc[i])
    df = pd.concat(new_df, axis=1).T.reset_index(drop=True)

    # The concat/transpose above leaves object dtypes; restore floats.
    df["seconds"] = df["seconds"].astype(float)
    df["distance"] = df["distance"].astype(float)
    df["dist2goal"] = df["dist2goal"].astype(float)
    df["angle2goal"] = df["angle2goal"].astype(float)
    df["start_x"] = df["start_x"].astype(float)
    df["start_y"] = df["start_y"].astype(float)

    # Round numerical columns to 4 decimal places
    df = df.round({"Period": 4, "Minute": 4, "Second": 4, "seconds": 4, "start_x": 4, "start_y": 4,
                   "deltaX": 4, "deltaY": 4, "distance": 4, "dist2goal": 4, "angle2goal": 4})

    return df
def UIED_statsbomb(data):
    """
    Process StatsBomb-style soccer match event data into the UIED format.

    Pipeline:
      1. Determine the team in possession for every event and keep only
         events performed by the possession team.
      2. Derive event features (success, home_team, goal, running scores,
         goal difference).
      3. Simplify raw "event_type_event_type_2" labels into a small action
         vocabulary (short_pass / long_pass / high_pass / shot / carry /
         dribble / cross; everything else is dropped).
      4. Derive time features (seconds, delta_T) and location features
         (deltaX, deltaY, distance, dist2goal, angle2goal).
      5. Insert possession-end ("_"), "period_over" and "game_over" marker
         rows, de-duplicate/merge redundant carry events, offset `seconds`
         per period, reset the first row of each period, then reorder
         (optionally keeping StatsBomb-360 or tracking columns) and round.

    Parameters:
        data (pd.DataFrame or str): Event DataFrame or path to a CSV file.
            Coordinates are assumed to be on the 120x80 StatsBomb pitch
            (implied by the (1.05/1.2) and (0.68/0.8) metre scaling and
            the goal at (120, 40)) -- TODO confirm with the loader.

    Returns:
        pd.DataFrame: Processed, simplified and normalized event data.

    Raises:
        FileNotFoundError: If a file path is given but does not exist.
        ValueError: If `data` is neither a DataFrame nor a file path.
    """
    # Load data from DataFrame or file path
    if isinstance(data, pd.DataFrame):
        df = data
    elif isinstance(data, str):
        if os.path.exists(data):
            df = pd.read_csv(data)
        else:
            raise FileNotFoundError("The file path does not exist")
    else:
        raise ValueError("The data must be a pandas DataFrame or a file path")

    df = df.copy()

    # Create 'action' labels; normalize "None" sub-types to "nan" so both
    # CSV round-trips and in-memory frames match the lists below.
    df["action"] = df["event_type"].astype(str) + "_" + df["event_type_2"].astype(str).replace("None", "nan")

    # Actions that imply the acting team has (or takes over) possession
    possession_team_actions = ['Pass_Ground Pass', 'Pass_Long_HighPass',
                               'Carry_nan', 'Pass_High Pass', 'Pass_Low Pass',
                               'Miscontrol_nan',
                               'Dribble_nan', 'Clearance_nan', 'Pass_Cross', 'Ball Recovery_nan',
                               'Offside_nan', 'Goal Keeper_nan',
                               'Dribbled Past_nan', 'Pass_Corner',
                               'Shot_Saved', 'Shot_Blocked', 'Shot_Wayward', 'Shot_Off T', 'Shot_Goal', 'Shot_Post',
                               'Tactical Shift_nan', 'Shield_nan',
                               'Own Goal Against_Own goal', 'Error_nan',
                               'Shot_Saved Off Target']

    # Determine possession: it only changes hands when the acting team
    # differs from the previous event AND performs a possession action.
    possession = []
    for i in range(len(df)):
        if i == 0:
            possession.append(df["team"].iloc[i])
        elif df["team"].iloc[i] == df["team"].iloc[i - 1]:
            possession.append(df["team"].iloc[i])
        elif df["action"].iloc[i] in possession_team_actions:
            possession.append(df["team"].iloc[i])
        else:
            possession.append(df["team"].iloc[i - 1])

    df["possession_team"] = possession
    # Keep possession-team events only
    df = df[df["team"] == df["possession_team"]].reset_index(drop=True)

    # Event features (success, home_team, goal, home_score, away_score)
    sucess_list = []
    home_team_list = []
    goal_list = []
    goal_diff_list = []
    home_score_list = []
    away_score_list = []
    for match in df.match_id.unique():
        match_df = df[df["match_id"] == match]
        team_list = match_df["team"].unique()
        if "home_team" in df.columns:
            if df.home_team.unique().shape[0] != 1:
                # 'home_team' is a 0/1 flag: the team name on flagged rows
                # is the home team.
                home_team = df[df["home_team"] == 1]["team"].iloc[0]
            else:
                home_team = team_list[0]
        else:
            home_team = team_list[0]
        home_score = 0
        away_score = 0
        for i in range(len(match_df)):
            if match_df["team"].iloc[i] == home_team:
                home_team_list.append(1)
                if match_df["event_type_2"].iloc[i] == "Goal":
                    home_score += 1
                elif match_df["event_type_2"].iloc[i] == "Own_goal":
                    away_score += 1
            else:
                home_team_list.append(0)
                if match_df["event_type_2"].iloc[i] == "Goal":
                    away_score += 1
                elif match_df["event_type_2"].iloc[i] == "Own_goal":
                    home_score += 1
            # Success: possession retained (non-shots), or a goal (shots).
            # NOTE(review): at i == 0 the iloc[i-1] wraps to the last row.
            if match_df["possession_team"].iloc[i] == match_df["possession_team"].iloc[i - 1] and match_df["event_type"].iloc[i] != 'Shot':
                sucess_list.append(1)
            elif match_df["possession_team"].iloc[i] == match_df["possession_team"].iloc[i - 1] and match_df["event_type"].iloc[i] == 'Shot':
                if match_df["event_type_2"].iloc[i] == "Goal":
                    sucess_list.append(1)
                else:
                    sucess_list.append(0)
            else:
                sucess_list.append(0)
            goal_list.append(1 if match_df["event_type_2"].iloc[i] == "Goal" else 0)
            home_score_list.append(home_score)
            away_score_list.append(away_score)
            goal_diff = home_score - away_score
            goal_diff_list.append(goal_diff)

    df["success"] = sucess_list
    # Only (re)write 'home_team' when it was absent or uninformative.
    if "home_team" not in df.columns:
        df["home_team"] = home_team_list
    elif "home_team" in df.columns and df.home_team.unique().shape[0] == 1:
        df["home_team"] = home_team_list
    df["goal"] = goal_list
    df["home_score"] = home_score_list
    df["away_score"] = away_score_list
    df["goal_diff"] = goal_diff_list

    # Group the events into simplified actions.
    '''
    all action
    ['Starting XI_nan', 'Half Start_nan', 'Pass_Ground Pass', 'Ball Receipt*_nan',
    'Carry_nan', 'Pass_High Pass', 'Pass_Low Pass', 'Duel_nan', 'Pressure_nan',
    'Foul Committed_nan', 'Foul Won_nan', 'Miscontrol_nan', 'Block_nan',
    'Dribble_nan', 'Clearance_nan', 'Pass_Cross', 'Ball Recovery_nan',
    'Dispossessed_nan', 'Interception_nan', 'Offside_nan', 'Goal Keeper_nan',
    'Injury Stoppage_nan', 'Player Off_nan', 'Referee Ball-Drop_nan',
    'Player On_nan', 'Dribbled Past_nan', 'Shot_Saved to Post', 'Pass_Corner',
    'Shot_Saved', 'Shot_Blocked', 'Shot_Wayward', 'Shot_Off T', 'Half End_nan',
    'Substitution_nan', '50/50_nan', 'Shot_Goal', 'Shot_Post',
    'Tactical Shift_nan', 'Bad Behaviour_nan', 'Shield_nan',
    'Own Goal Against_Own goal', 'Own Goal For_nan', 'Error_nan',
    'Shot_Saved Off Target']
    '''

    pass_actions = ['Pass_Ground Pass', 'Pass_Low Pass',]
    high_pass_actions = ['Pass_High Pass',]
    shot_actions = ['Shot_Saved to Post', 'Shot_Saved', 'Shot_Blocked', 'Shot_Wayward', 'Shot_Saved Off Target', 'Shot_Off T', 'Shot_Goal', 'Shot_Post',]
    carray_actions = ['Carry_nan', 'Carry_None']
    # NOTE(review): 'Shot_Off T' also appears in shot_actions above; the
    # shot branch wins because it is tested first -- confirm intended.
    dribble_actions = ['Dribble_nan', 'Shot_Off T', "Dribble_None"]
    cross_actions = ['Pass_Cross', 'Pass_Corner']
    drop_actions = ['Starting XI_nan', 'Half Start_nan', 'Ball Receipt*_nan', 'Pressure_nan', 'Foul Committed_nan', 'Foul Won_nan', 'Miscontrol_nan', 'Block_nan',
                    'Clearance_nan', 'Ball Recovery_nan', 'Dispossessed_nan', 'Interception_nan', 'Offside_nan', 'Goal Keeper_nan', 'Injury Stoppage_nan', 'Player Off_nan', 'Referee Ball-Drop_nan', 'Player On_nan',
                    'Dribbled Past_nan', 'Half End_nan', 'Substitution_nan', '50/50_nan', 'Tactical Shift_nan', 'Bad Behaviour_nan', 'Shield_nan', 'Own Goal Against_Own goal', 'Own Goal For_nan', 'Error_nan', 'Duel_nan',
                    'Ball Receipt*_None', 'Miscontrol_None', 'Duel_None', 'Pressure_None', "Ball Recovery_None", "Substitution_None",
                    '50/50_None', 'Foul Committed_None', 'Error_None', 'Block_None', 'Bad Behaviour_None', 'Goal Keeper_None', 'Interception_None',
                    'Half Start_None', 'Starting XI_None', 'Clearance_None', 'Interception_None', 'Tactical Shift_None', 'Dribbled Past_None', "Injury Stoppage_None", 'Referee Ball-Drop_None', 'Dispossessed_None',
                    "Half End_None", "Own Goal Against_None", "Own Goal Against_nan"]

    action_list = []
    for i in range(len(df)):
        if df["action"].iloc[i] in pass_actions:
            # Divide short pass and long pass based on the distance in
            # metres (threshold 45); 120x80 coords scaled to 105x68.
            distance = np.sqrt(((df["start_x"].iloc[i] - df["end_x"].iloc[i]) * (1.05 / 1.2)) ** 2
                               + ((df["start_y"].iloc[i] - df["end_y"].iloc[i]) * (0.68 / 0.8)) ** 2)
            if distance >= 45:
                action_list.append("long_pass")
            else:
                action_list.append("short_pass")
        elif df["action"].iloc[i] in high_pass_actions:
            action_list.append("high_pass")
        elif df["action"].iloc[i] in shot_actions:
            action_list.append("shot")
        elif df["action"].iloc[i] in carray_actions:
            action_list.append("carry")
        elif df["action"].iloc[i] in dribble_actions:
            action_list.append("dribble")
        elif df["action"].iloc[i] in cross_actions:
            action_list.append("cross")
        elif df["action"].iloc[i] in drop_actions:
            action_list.append("drop")
        else:
            action = df["action"].iloc[i]
            print(f"Warning: action {action} was not found in the action list, it will be dropped")
            action_list.append("drop")

    df["action"] = action_list
    # Drop the non-kept actions
    df = df[df["action"] != "drop"].reset_index(drop=True)

    # Time features: build 'seconds' if absent, then per-event delta_T.
    if "seconds" not in df.columns:
        df["seconds"] = df["minute"] * 60 + df["second"]
    delta_t_list = []
    for i in range(len(df)):
        if i == 0:
            delta_t_list.append(0)
        else:
            # NOTE(review): period_over/game_over rows are only inserted
            # later, so this guard never fires here -- kept as-is.
            if df.action.iloc[i - 1] == "period_over" or df.action.iloc[i - 1] == "game_over":
                delta_t_list.append(0)
            else:
                delta_t_list.append(df["seconds"].iloc[i] - df["seconds"].iloc[i - 1])
    df["delta_T"] = delta_t_list

    # Location features (deltaX, deltaY, distance, dist2goal, angle2goal).
    # At i == 0 the iloc[i-1] lookups wrap to the last row; those values
    # are overwritten by the per-period first-row reset further below.
    delta_x_list = []
    delta_y_list = []
    dist_list = []
    dist2goal_list = []
    angle2goal_list = []
    for i in range(len(df)):
        delta_x = df["start_x"].iloc[i] - df["start_x"].iloc[i - 1]
        delta_y = df["start_y"].iloc[i] - df["start_y"].iloc[i - 1]
        distance = ((df["start_x"].iloc[i] * (1.05 / 1.2) - df["start_x"].iloc[i - 1] * (1.05 / 1.2)) ** 2
                    + (df["start_y"].iloc[i] * (0.68 / 0.8) - df["start_y"].iloc[i - 1] * (0.68 / 0.8)) ** 2) ** 0.5
        # Goal assumed at (120, 40) on the StatsBomb pitch.
        dist2goal = (((df["start_x"].iloc[i] - 120) * (1.05 / 1.2)) ** 2
                     + ((df["start_y"].iloc[i] - 40) * (0.68 / 0.8)) ** 2) ** 0.5
        angle2goal = np.abs(np.arctan2((df["start_y"].iloc[i] - 40) * (0.68 / 0.8),
                                       (df["start_x"].iloc[i] - 120) * (1.05 / 1.2)))

        delta_x_list.append(delta_x)
        delta_y_list.append(delta_y)
        dist_list.append(distance)
        dist2goal_list.append(dist2goal)
        angle2goal_list.append(angle2goal)
    df["deltaX"] = delta_x_list
    df["deltaY"] = delta_y_list
    df["distance"] = dist_list
    df["dist2goal"] = dist2goal_list
    df["angle2goal"] = angle2goal_list

    # Scale the start_x and start_y to the real pitch size (105m x 68m)
    df["start_x"] = df["start_x"] * (1.05 / 1.2)
    df["start_y"] = df["start_y"] * (0.68 / 0.8)

    # Possession id: increments whenever the possession team or the
    # period changes (uses the raw 'period' column, renamed just below).
    poss_id_list = []
    poss_id = 0
    for i in range(len(df)):
        if i == 0:
            poss_id_list.append(0)
        elif df["possession_team"].iloc[i] == df["possession_team"].iloc[i - 1] and df["period"].iloc[i] == df["period"].iloc[i - 1]:
            poss_id_list.append(poss_id)
        else:
            poss_id += 1
            poss_id_list.append(poss_id)
    df["poss_id"] = poss_id_list

    # Rename columns period to Period, minute to Minute, second to Second
    df.rename(columns={"period": "Period", "minute": "Minute", "second": "Second"}, inplace=True)

    # Insert "_" (end of possession) and period_over/game_over marker
    # rows; also reset the delta features on the first row of the first
    # possession of each period.
    new_df = []
    for match in df.match_id.unique():
        match_df = df[df["match_id"] == match]
        for period in match_df.Period.unique():
            period_df = match_df[match_df["Period"] == period]
            for poss_id in period_df.poss_id.unique():
                poss_df = period_df[period_df["poss_id"] == poss_id]
                for i in range(len(poss_df)):
                    if poss_id == period_df.poss_id.unique()[0] and i == 0:
                        first_row = poss_df.iloc[i].copy()
                        first_row["deltaX"] = 0
                        first_row["deltaY"] = 0
                        first_row["distance"] = 0
                        first_row["delta_T"] = 0
                        new_df.append(first_row)
                    else:
                        new_df.append(poss_df.iloc[i])
                last_row = poss_df.iloc[-1].copy()
                last_row["action"] = "_"
                last_row['goal'] = 0
                last_row["success"] = 0
                last_row["deltaX"] = 0
                last_row["deltaY"] = 0
                last_row["distance"] = 0
                last_row["dist2goal"] = 0
                last_row["angle2goal"] = 0.5
                last_row["delta_T"] = 0
                new_df.append(last_row)
            last_row = period_df.iloc[-1].copy()
            last_row['goal'] = 0
            last_row["success"] = 0
            last_row["deltaX"] = 0
            last_row["deltaY"] = 0
            last_row["distance"] = 0
            last_row["dist2goal"] = 0
            last_row["angle2goal"] = 0.5
            last_row["delta_T"] = 0
            if period == df.Period.unique()[-1]:
                last_row["action"] = "game_over"
            else:
                last_row["action"] = "period_over"
            new_df.append(last_row)
    df = pd.concat(new_df, axis=1).T.reset_index(drop=True)

    # Remove carry actions that have the same start location as the
    # previous action (markers excluded).
    droplist = []
    for i in range(len(df)):
        if df.start_x.iloc[i] == df.start_x.iloc[i - 1] and df.start_y.iloc[i] == df.start_y.iloc[i - 1]:
            if df.action.iloc[i] == "carry" and df.action.iloc[i - 1] not in ["_", "period_over", "game_over"]:
                droplist.append(i)

    df.drop(droplist, inplace=True)

    # Merge a carry with the following action when both share the same
    # start location: keep the carry row but relabel it with the next
    # action, and skip the next row.
    new_df = []
    flag = False
    for i in range(len(df)):
        if i == len(df) - 1:
            new_df.append(df.iloc[i])
            break
        if flag:
            flag = False
            new_df.append(row)
            continue
        if df.start_x.iloc[i] == df.start_x.iloc[i + 1] and df.start_y.iloc[i] == df.start_y.iloc[i + 1]:
            if df.action.iloc[i] == "carry" and df.action.iloc[i + 1] in ["short_pass", "long_pass", "high_pass", "shot", "dribble", "cross"]:
                row = df.iloc[i].copy()
                row["action"] = df.action.iloc[i + 1]
                flag = True
            else:
                new_df.append(df.iloc[i])
        else:
            new_df.append(df.iloc[i])

    df = pd.concat(new_df, axis=1).T.reset_index(drop=True)

    # Offset `seconds` per period so it is monotone over the whole match
    # (45/90/105/120-minute period starts).
    seconds_list = []
    for i in range(len(df)):
        if df["Period"].iloc[i] == 1:
            seconds_list.append(df["seconds"].iloc[i])
        elif df["Period"].iloc[i] == 2:
            seconds_list.append(df["seconds"].iloc[i] + 60 * 45)
        elif df["Period"].iloc[i] == 3:
            seconds_list.append(df["seconds"].iloc[i] + 60 * 90)
        elif df["Period"].iloc[i] == 4:
            seconds_list.append(df["seconds"].iloc[i] + 60 * 105)
        elif df["Period"].iloc[i] == 5:
            seconds_list.append(df["seconds"].iloc[i] + 60 * 120)
    # FIX: the original built seconds_list but never wrote it back, so the
    # whole adjustment loop was dead code; the parallel UIED_wyscout
    # implementation does assign it.
    df["seconds"] = seconds_list

    # Reset the feature values of the first row of each period to 0
    # (angle2goal to 0.5): the preceding-row deltas are meaningless there.
    new_df = []
    for match in df.match_id.unique():
        match_df = df[df["match_id"] == match]
        for period in match_df.Period.unique():
            period_df = match_df[match_df["Period"] == period].copy()
            for i in range(len(period_df)):
                if i == 0:
                    first_row = period_df.iloc[i].copy()
                    first_row["deltaX"] = 0
                    first_row["deltaY"] = 0
                    first_row["distance"] = 0
                    first_row["dist2goal"] = 0
                    first_row["angle2goal"] = 0.5
                    first_row["delta_T"] = 0
                    new_df.append(first_row)
                else:
                    new_df.append(period_df.iloc[i])
    df = pd.concat(new_df, axis=1).T.reset_index(drop=True)

    # Reorder columns, keeping StatsBomb-360 columns if present, else
    # tracking columns if present, else the base columns only.
    try:
        sb360_columns = ["h" + str(i) + "_" + j for i in range(1, 12) for j in ["teammate", "actor", "keeper", "x", "y"]] + ["a" + str(i) + "_" + j for i in range(1, 12) for j in ["teammate", "actor", "keeper", "x", "y"]]
        df = df[['match_id', 'poss_id', 'team', 'home_team', 'action', 'success', 'goal', 'home_score', 'away_score', 'goal_diff', 'Period', 'Minute', 'Second', 'seconds', "delta_T", 'start_x', 'start_y', 'deltaX', 'deltaY', 'distance', 'dist2goal', 'angle2goal'] + sb360_columns]
        # Set the sb360 coordinate columns to 4 decimal places
        for col in ["h" + str(i) + "_" + j for i in range(1, 12) for j in ["x", "y"]] + ["a" + str(i) + "_" + j for i in range(1, 12) for j in ["x", "y"]]:
            df[col] = df[col].astype(float)
            df[col] = df[col].round(4)
    except Exception:
        try:
            home_tracking_columns = []
            away_tracking_columns = []
            for i in range(1, 24):
                home_tracking_columns.extend([f"h{i}_x", f"h{i}_y"])
                away_tracking_columns.extend([f"a{i}_x", f"a{i}_y"])
            df = df[['match_id', 'poss_id', 'team', 'home_team', 'action', 'success', 'goal', 'home_score', 'away_score', 'goal_diff', 'Period', 'Minute', 'Second', 'seconds', "delta_T", 'start_x', 'start_y', 'deltaX', 'deltaY', 'distance', 'dist2goal', 'angle2goal'] + home_tracking_columns + away_tracking_columns]
        except Exception:
            df = df[['match_id', 'poss_id', 'team', 'home_team', 'action', 'success', 'goal', 'home_score', 'away_score', 'goal_diff', 'Period', 'Minute', 'Second', 'seconds', "delta_T", 'start_x', 'start_y', 'deltaX', 'deltaY', 'distance', 'dist2goal', 'angle2goal']]

    # The concat/transpose above leaves object dtypes; restore floats.
    df["seconds"] = df["seconds"].astype(float)
    df["distance"] = df["distance"].astype(float)
    df["dist2goal"] = df["dist2goal"].astype(float)
    df["angle2goal"] = df["angle2goal"].astype(float)
    df["deltaX"] = df["deltaX"].astype(float)
    df["deltaY"] = df["deltaY"].astype(float)
    df["delta_T"] = df["delta_T"].astype(float)
    df["start_x"] = df["start_x"].astype(float)
    df["start_y"] = df["start_y"].astype(float)

    # Round numerical columns to 4 decimal places
    df = df.round({"Period": 4, "Minute": 4, "Second": 4, "seconds": 4, "start_x": 4, "start_y": 4, "deltaX": 4, "deltaY": 4, "distance": 4, "dist2goal": 4, "angle2goal": 4, "delta_T": 4})

    return df
+ """ + # Load data from DataFrame or file path + if isinstance(data, pd.DataFrame): + df = data + elif isinstance(data, str): + if os.path.exists(data): + df = pd.read_csv(data) + else: + raise FileNotFoundError("The file path does not exist") + else: + raise ValueError("The data must be a pandas DataFrame or a file path") + + df = df.copy() + + # Create 'action' column by concatenating 'event_type' and 'event_type_2' + df["action"] = df["event_type"].astype(str) + "_" + df["event_type_2"].astype(str) + #rename "_None" to "_nan" + df["action"]=df["action"].str.replace("_None","_nan") + + + # Define possession team actions + + all_cation=['First Half Start_nan', 'KickOff_Pass', 'Trap_nan', + 'AwayPass_Pass', 'Block_nan', 'Intercept_nan', 'Shoot_nan', + 'Post Bar_nan', 'Shoot_Goal', 'Ball Out_nan', 'Clear_Clear', + 'Through Pass_Pass', 'Cross_Pass/Cross', 'Touch_nan', + 'HomePass_Pass', 'Dribble_Dribble', 'ThrowIn_Pass', 'Offside_nan', + 'Indirect FK_Pass/IndirectFreeKick', 'GK_Pass/GoalKick', + 'CK_Pass/CornerKick', 'Foul_nan', 'Direct FK_Pass/DirectFreeKick', + 'Tackle_nan', 'Shoot_Save', 'Shoot_Shot(not_GK)', 'Catch_nan', + 'CK_Pass/Cross/CornerKick', 'Feed_Pass', 'Hand Clear_HandClear', + 'Shoot_Shot(not_GK)/MissHit', 'Direct FK_Save/DirectFreeKick', + 'Direct FK_Shot(not_GK)/DirectFreeKick', + 'Direct FK_Pass/Cross/DirectFreeKick', 'First Half End_nan', + 'Second Half Start_nan', 'Change_nan', 'Second Half End_nan',"YellowCard_nan", + "RedCard_nan","Suspension(InGame)_nan","Shoot_Save/MissHit","PK_Goal","FrickOn_Pass", + "Direct FK_DirectFreeKick","Drop Ball_nan","Direct FK_Goal/DirectFreeKick","Shoot_MissHit", + "ThrowIn_nan","OwnGoal_Goal","CK_Save/CornerKick","Indirect FK_Pass/Cross/IndirectFreeKick" + ] + + possession_team_actions = [ + 'KickOff_Pass', 'Trap_nan', + 'AwayPass_Pass','Shoot_nan','Post Bar_nan', 'Shoot_Goal','Clear_Clear', + 'Through Pass_Pass', 'Cross_Pass/Cross', 'Touch_nan','HomePass_Pass', 'Dribble_Dribble', 'ThrowIn_Pass', + 'Indirect 
FK_Pass/IndirectFreeKick', 'GK_Pass/GoalKick','CK_Pass/CornerKick','Direct FK_Pass/DirectFreeKick', + 'Shoot_Shot(not_GK)','Shoot_Save','CK_Pass/Cross/CornerKick', 'Feed_Pass', 'Hand Clear_HandClear','Shoot_Shot(not_GK)/MissHit', + 'Direct FK_Save/DirectFreeKick','Direct FK_Shot(not_GK)/DirectFreeKick', 'Direct FK_Pass/Cross/DirectFreeKick',"FrickOn_Pass", + "Direct FK_DirectFreeKick","Shoot_Save/MissHit","Indirect FK_Pass/Cross/IndirectFreeKick","Shoot_MissHit", + "Direct FK_Goal/DirectFreeKick","ThrowIn_nan","CK_Save/CornerKick"] + + possession = [] + # Determine possession + for i in range(len(df)): + if i == 0: + possession.append(df["team"].iloc[i]) + else: + if df.action.iloc[i] not in all_cation: + print(f"Warning: action {df.action.iloc[i]} was not found in the all action list") + if df["team"].iloc[i] == df["team"].iloc[i - 1]: + possession.append(df["team"].iloc[i]) + else: + if df["action"].iloc[i] in possession_team_actions: + possession.append(df["team"].iloc[i]) + else: + possession.append(df["team"].iloc[i - 1]) + + df["possession_team"] = possession + + #create the event related features (sucess, home_team, goal_diff, home_score, away_score) + #success is provided in the data + #drop all row with col home equal 0 then subtract 1 from home + df = df[df["home"] != 0].reset_index(drop=True) + + home_score = [] + away_score = [] + goal_diff = [] + home_team = [] + goal= [] + for i in range(len(df)): + if df["home"].iloc[i] == 1: + home_team.append(1) + home_score.append(df["self_score"].iloc[i]) + away_score.append(df["opp_score"].iloc[i]) + goal_diff.append(df["self_score"].iloc[i] - df["opp_score"].iloc[i]) + elif df["home"].iloc[i] == 2: + home_team.append(0) + home_score.append(df["opp_score"].iloc[i]) + away_score.append(df["self_score"].iloc[i]) + goal_diff.append(df["opp_score"].iloc[i] - df["self_score"].iloc[i]) + #check if Goal but not GoalKick is in the str of df["event_type_2"].iloc[i] + if "Goal" in str(df["event_type_2"].iloc[i]) and 
"GoalKick" not in str(df["event_type_2"].iloc[i]): + goal.append(1) + else: + goal.append(0) + + df["home_score"] = home_score + df["away_score"] = away_score + df["goal_diff"] = goal_diff + df["home_team"] = home_team + df["goal"] = goal + + #group the event into simpliefied actions + pass_actions=['KickOff_Pass','AwayPass_Pass','Through Pass_Pass', 'HomePass_Pass','ThrowIn_Pass', + 'Indirect FK_Pass/IndirectFreeKick', 'GK_Pass/GoalKick','Direct FK_Pass/DirectFreeKick', + "FrickOn_Pass","Direct FK_DirectFreeKick","Indirect FK_Pass/Cross/IndirectFreeKick", + "ThrowIn_nan" + ] + high_pass_actions=[] + shot_actions=['Shoot_nan','Shoot_Goal','Shoot_Save', 'Shoot_Shot(not_GK)','Shoot_Shot(not_GK)/MissHit','Direct FK_Save/DirectFreeKick', + 'Direct FK_Shot(not_GK)/DirectFreeKick', "Shoot_Save/MissHit","Shoot_MissHit","Direct FK_Goal/DirectFreeKick" + ] + carray_actions=[] + dribble_actions=['Dribble_Dribble'] + cross_actions=['Cross_Pass/Cross','CK_Pass/CornerKick','CK_Pass/Cross/CornerKick','Feed_Pass','Direct FK_Pass/Cross/DirectFreeKick', "CK_Save/CornerKick"] + drop_actions=['First Half Start_nan','Trap_nan','Block_nan', 'Intercept_nan','Post Bar_nan','Ball Out_nan','Clear_Clear','Touch_nan', + 'Offside_nan','Foul_nan','Tackle_nan','Catch_nan','Hand Clear_HandClear','First Half End_nan','Second Half Start_nan', + 'Change_nan', 'Second Half End_nan',"YellowCard_nan","RedCard_nan","Suspension(InGame)_nan","Drop Ball_nan","PK_Goal", + "OwnGoal_Goal" + ] + + + action_list=[] + for i in range(len(df)): + if df["action"].iloc[i] in pass_actions: + #devide short pass and long pass based on the distance (45) + distance=df.dist.iloc[i] + if distance>=45: + action_list.append("long_pass") + else: + action_list.append("short_pass") + elif df["action"].iloc[i] in high_pass_actions: + action_list.append("high_pass") + elif df["action"].iloc[i] in shot_actions: + action_list.append("shot") + elif df["action"].iloc[i] in carray_actions: + action_list.append("carry") + elif 
df["action"].iloc[i] in dribble_actions: + action_list.append("dribble") + elif df["action"].iloc[i] in cross_actions: + action_list.append("cross") + elif df["action"].iloc[i] in drop_actions: + action_list.append("drop") + else: + action= df["action"].iloc[i] + print(f"Warning: action {action} was not found in the action list, it will be dropped") + action_list.append("drop") + + df["action"]=action_list + #drop the drop actions + df=df[df["action"]!="drop"].reset_index(drop=True) + + #create the time related features (delta_T) + delta_t_list=[] + for i in range(len(df)): + if i==0: + delta_t_list.append(0) + else: + delta_t_list.append(df["absolute_time"].iloc[i]-df["absolute_time"].iloc[i-1]) + df["delta_T"]=delta_t_list + + #create the location related features (deltaX, deltaY, distance) + delta_x_list=[] + delta_y_list=[] + dist_list=[] + + for i in range(len(df)): + if i==0: + delta_x=0 + delta_y=0 + distance=0 + else: + delta_x=df["start_x"].iloc[i]-df["start_x"].iloc[i-1] + delta_y=df["start_y"].iloc[i]-df["start_y"].iloc[i-1] + distance = np.sqrt(delta_x**2+delta_y**2) + delta_x_list.append(delta_x) + delta_y_list.append(delta_y) + dist_list.append(distance) + df["deltaX"]=delta_x_list + df["deltaY"]=delta_y_list + df["distance"]=dist_list + + #create the possession id, end of possession, end of period, end of game + poss_id_list = [] + poss_id = 0 + for match in df.match_id.unique(): + match_df = df[df["match_id"] == match] + for i in range(len(match_df)): + if i == 0: + poss_id_list.append(poss_id) + else: + if match_df["possession_team"].iloc[i] == match_df["possession_team"].iloc[i - 1]: + poss_id_list.append(poss_id) + else: + poss_id += 1 + poss_id_list.append(poss_id) + poss_id+=1 + df["poss_id"] = poss_id_list + + new_df = [] + for match in df.match_id.unique(): + match_df = df[df["match_id"] == match] + for period in match_df.Period.unique(): + period_df = match_df[match_df["Period"] == period] + for poss_id in period_df.poss_id.unique(): + 
poss_df = period_df[period_df["poss_id"] == poss_id] + for i in range(len(poss_df)): + new_df.append(poss_df.iloc[i]) + last_row = poss_df.iloc[-1].copy() + last_row["action"] = "_" + #change the value of the features to 0 + last_row['goal']=0 + last_row["success"]=0 + last_row["deltaX"]=0 + last_row["deltaY"]=0 + last_row["distance"]=0 + last_row["dist2goal"]=0 + last_row["angle2goal"]=0.5 + last_row["delta_T"]=0 + new_df.append(last_row) + last_row = period_df.iloc[-1].copy() + #change the value of the features to 0 + last_row['goal']=0 + last_row["success"]=0 + last_row["deltaX"]=0 + last_row["deltaY"]=0 + last_row["distance"]=0 + last_row["dist2goal"]=0 + last_row["angle2goal"]=0.5 + last_row["delta_T"]=0 + if period == df.Period.unique()[-1]: + last_row["action"] = "game_over" + new_df.append(last_row) + else: + last_row["action"] = "period_over" + new_df.append(last_row) + df = pd.concat(new_df, axis=1).T.reset_index(drop=True) + + #create the seconds column + seconds_list=[] + for i in range(len(df)): + if df["Period"].iloc[i]==1: + seconds_list.append(df.Minute.iloc[i]*60+df.Second.iloc[i]) + elif df["Period"].iloc[i]==2: + seconds_list.append(df.Minute.iloc[i]*60+df.Second.iloc[i]+60*45) + + df["seconds"]=seconds_list + + #reset the features value to 0 (angle2goal to 0.5)for beginning of each period + new_df=[] + for match in df.match_id.unique(): + match_df=df[df["match_id"]==match] + for period in match_df.Period.unique(): + period_df=match_df[match_df["Period"]==period].copy() + for i in range(len(period_df)): + if i==0: + first_row=period_df.iloc[i].copy() + first_row["deltaX"]=0 + first_row["deltaY"]=0 + first_row["distance"]=0 + first_row["dist2goal"]=0 + first_row["angle2goal"]=0.5 + first_row["delta_T"]=0 + new_df.append(first_row) + else: + new_df.append(period_df.iloc[i]) + df=pd.concat(new_df,axis=1).T.reset_index(drop=True) + + #convert seconds, distance, dist2goal, angle2goal, start_x, start_y into type float + 
df["seconds"]=df["seconds"].astype(float) + df["distance"]=df["distance"].astype(float) + df["dist2goal"]=df["dist2goal"].astype(float) + df["angle2goal"]=df["angle2goal"].astype(float) + df["start_x"]=df["start_x"].astype(float) + df["start_y"]=df["start_y"].astype(float) + + #round numerical columns to 4 decimal places (period, minute, second, X, Y) + df = df.round({"Period": 4, "Minute": 4, "Second": 4, "seconds": 4, "start_x": 4, "start_y": 4, "deltaX": 4, "deltaY": 4, "distance": 4, "dist2goal": 4, "angle2goal": 4}) + + #reorder columns + tracking_col_home = [f"Home_{i}_x" for i in range(1, 15)] + [f"Home_{i}_y" for i in range(1, 15)] + tracking_col_away = [f"Away_{i}_x" for i in range(1, 15)] + [f"Away_{i}_y" for i in range(1, 15)] + df = df[['match_id', 'poss_id', 'team', 'home_team', 'action', 'success', 'goal', 'home_score', + 'away_score', 'goal_diff', 'Period', 'Minute', 'Second', 'seconds', "delta_T", 'start_x', + 'start_y', 'deltaX', 'deltaY', 'distance', 'dist2goal', 'angle2goal']+tracking_col_home+tracking_col_away] + + return df + + +if __name__ == '__main__': + import pdb + + # seq2event + # df_path=os.getcwd()+"/test/sports/event_data/data/wyscout/test_data.csv" + # df=seq2event(df_path) + # df.to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_seq2event.csv",index=False) + + # nmstpp + # df_path=os.getcwd()+"/test/sports/event_data/data/wyscout/test_data.csv" + # df=nmstpp(df_path) + # df.to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_nmstpp.csv",index=False) + + # lem + # df_path=os.getcwd()+"/test/sports/event_data/data/wyscout/test_data.csv" + # df=lem(df_path) + # df.to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_lem.csv",index=False) + + # UIED + # df_wyscout_path=os.getcwd()+"/test/sports/event_data/data/wyscout/test_data.csv" + # df_wyscout=UIED_wyscout(df_wyscout_path) + # 
df_wyscout.to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_wyscout_UIED.csv",index=False) + + # df_statsbomb_skillcorner_path=os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_data.csv" + # df_statsbomb_skillcorner=UIED_statsbomb(df_statsbomb_skillcorner_path) + # df_statsbomb_skillcorner.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_preprocess_statsbomb_skillcorner_UIED.csv",index=False) + + # df_statsbomb_json_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/test_data.csv" + # df_statsbomb_json=UIED_statsbomb(df_statsbomb_json_path) + # df_statsbomb_json.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_preprocess_statsbomb_json_UIED.csv",index=False) + + # df_statsbomb_api_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/test_api_data.csv" + # df_statsbomb_api=UIED_statsbomb(df_statsbomb_api_path) + # df_statsbomb_api.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_preprocess_statsbomb_api_UIED.csv",index=False) + + # df_datastadium_path=os.getcwd()+"/test/sports/event_data/data/datastadium/load.csv" + # df_datastadium=UIED_datastadium(df_datastadium_path) + # df_datastadium.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/preprocess_UIED.csv",index=False) + + print('-----------------end-----------------') + # pdb.set_trace() diff --git a/preprocessing/sports/phase_data/soccer/soccer_tracking_data.py b/preprocessing/sports/phase_data/soccer/soccer_tracking_data.py new file mode 100644 index 0000000..12e1f96 --- /dev/null +++ b/preprocessing/sports/phase_data/soccer/soccer_tracking_data.py @@ -0,0 +1,115 @@ +import pandas as pd +import os +import pdb +import pandas as pd + +def statsbomb_skillcorner_tracking_data_preprocessing(df_raw, save_path=None, process_event_coord=True): + """ + Preprocess tracking data for StatsBomb and SkillCorner data formats to standardize the coordinates + + Parameters: + - df (pd.DataFrame or str): 
DataFrame containing tracking data or a path to a CSV file. + Expected columns include 'home_team', 'home_side', and optional columns like 'action' or 'event_type'. + - save_path (str): Path to save the preprocessed data as a CSV file. + - process_event_coord (bool): Flag to scale event data coordinates to field dimensions. + + Steps: + 1. Load CSV if `df` is a file path; validate the input to ensure it is a DataFrame. + 2. Define possession team actions to categorize certain events as possession-related. + 3. Adjust player coordinates by shifting the origin to the center and flipping coordinates + if the home team plays on the right side (field normalization). + 4. Process each row based on the action or event type to determine whether switching + the field orientation is necessary. + 5. Save the modified DataFrame to the specified path. + + Notes: + - Assumes field dimensions of 105 x 68 meters. + - Applies scaling for event data start_x and start_y to adjust coordinates to the field dimensions. 
+ """ + FIELD_LENGTH = 105.0 # Field length in meters + FIELD_WIDTH = 68.0 # Field width in meters + + # Load data if `df_raw` is a file path; validate input + if not isinstance(df_raw, pd.DataFrame): + if isinstance(df_raw, str): + df_raw = pd.read_csv(df_raw) + else: + raise ValueError("Input should be a DataFrame or a CSV file path") + + # Define list of team actions that imply possession + team_actions = [ + 'Pass_Ground Pass', 'Pass_Long_HighPass', 'Carry_nan', 'Pass_High Pass', 'Pass_Low Pass', + 'Miscontrol_nan', 'Dribble_nan', 'Clearance_nan', 'Pass_Cross', 'Ball Recovery_nan', + 'Offside_nan', 'Goal Keeper_nan', 'Dribbled Past_nan', 'Pass_Corner', 'Shot_Saved', + 'Shot_Blocked', 'Shot_Wayward', 'Shot_Off T', 'Shot_Goal', 'Shot_Post', + 'Tactical Shift_nan', 'Shield_nan', 'Own Goal Against_Own goal', 'Error_nan', + 'Shot_Saved Off Target', 'Ball Receipt*_nan', 'Pressure_nan', 'Interception_nan' + ] + + # Function to adjust coordinates based on field orientation + def adjust_coordinates(idx, switch_sides): + """ + Adjusts the x and y coordinates for players on the field based on field orientation. + + Parameters: + - idx (int): The index of the row to modify in df. + - switch_sides (bool): Flag indicating if coordinates should be flipped. 
+ """ + for prefix in ['h', 'a']: # 'h' for home, 'a' for away + for i in range(1, 24): + x_col, y_col = f"{prefix}{i}_x", f"{prefix}{i}_y" + x, y = df.at[idx, x_col], df.at[idx, y_col] + + # Skip if x and y are zero (indicating missing data) + if x == 0 and y == 0: + continue + + # Adjust coordinates based on `switch_sides` flag + df.at[idx, x_col] = (-x if switch_sides else x) + FIELD_LENGTH / 2 + df.at[idx, y_col] = (y if switch_sides else -y) + FIELD_WIDTH / 2 + #round to 2 decimal places + df.at[idx, x_col] = round(df.at[idx, x_col], 2) + df.at[idx, y_col] = round(df.at[idx, y_col], 2) + + # Process each row + df = df_raw.copy() + for idx in range(len(df)): + home_team, home_side = df.at[idx, 'home_team'], df.at[idx, 'home_side'] + switch_sides = False # Default: no switch + + if 'action' in df.columns: + # Use switch condition based on the home team's side in possession + if (home_team == 1 and home_side == 'right') or (home_team == 0 and home_side == 'left'): + switch_sides = True + elif 'event_type' in df.columns: + if process_event_coord: + # Scale start_x and start_y for event data + df.at[idx, "start_x"] *= (1.05 / 1.2) + df.at[idx, "start_y"] *= (0.68 / 0.8) + #round to 2 decimal places + df.at[idx, "start_x"] = round(df.at[idx, "start_x"], 2) + df.at[idx, "start_y"] = round(df.at[idx, "start_y"], 2) + + action_type = f"{df.at[idx, 'event_type']}_{str(df.at[idx, 'event_type_2']).replace('None', 'nan')}" + is_possession_action = action_type in team_actions + + # Determine if coordinates should be switched based on possession action and home side + if is_possession_action: + switch_sides = (home_team == 1 and home_side == 'right') or (home_team == 0 and home_side == 'left') + else: + switch_sides = not ((home_team == 1 and home_side == 'right') or (home_team == 0 and home_side == 'left')) + + # Apply coordinate adjustment for each row by index + adjust_coordinates(idx, switch_sides) + + # Save the processed DataFrame to a CSV file + if save_path is not 
None: + df.to_csv(save_path, index=False) + + return df + +if __name__=="__main__": + df_path = os.getcwd() + "/test/sports/event_data/data/statsbomb_skillcorner/test_data.csv" + save_path = os.getcwd() + "/test/sports/event_data/data/statsbomb_skillcorner/track_data_preprocessed.csv" + statsbomb_skillcorner_tracking_data_preprocessing(df_path, save_path) + print("done") \ No newline at end of file From ab73946e65d93bfb42fa208d2bd177211d3fb4cf Mon Sep 17 00:00:00 2001 From: KurodaKento0505 Date: Fri, 26 Dec 2025 15:57:10 +0900 Subject: [PATCH 2/7] first commit --- .../sports/phase_data/phase_class.py | 58 +- .../sports/phase_data/soccer/README.md | 25 +- .../sports/phase_data/soccer/constant.py | 215 --- .../phase_data/soccer/soccer_load_data.py | 1497 ++++++++-------- .../phase_data/soccer/soccer_phase_class.py | 557 +----- .../phase_data/soccer/soccer_plot_row.py | 175 -- .../phase_data/soccer/soccer_processing.py | 1554 ----------------- .../phase_data/soccer/soccer_tracking_data.py | 115 -- 8 files changed, 819 insertions(+), 3377 deletions(-) delete mode 100644 preprocessing/sports/phase_data/soccer/constant.py delete mode 100644 preprocessing/sports/phase_data/soccer/soccer_plot_row.py delete mode 100644 preprocessing/sports/phase_data/soccer/soccer_processing.py delete mode 100644 preprocessing/sports/phase_data/soccer/soccer_tracking_data.py diff --git a/preprocessing/sports/phase_data/phase_class.py b/preprocessing/sports/phase_data/phase_class.py index ea258bd..e745ba0 100644 --- a/preprocessing/sports/phase_data/phase_class.py +++ b/preprocessing/sports/phase_data/phase_class.py @@ -1,45 +1,69 @@ class Phase_data: - soccer_data_provider = ['bepro', 'skillcorner', 'pff_fc'] # 'robocup_2d', 'datastadium', + soccer_data_provider = ['bepro', 'statsbomb_skillcorner', 'pff_fc'] + other_soccer_data_provider = ['robocup_2d', 'datastadium'] handball_data_provider = [] - rocket_league_data_provider = [] # 'carball' + rocket_league_data_provider = ['carball'] def 
__new__(cls, data_provider, *args, **kwargs): if data_provider in cls.soccer_data_provider: - from .soccer.soccer_phase_class import Soccer_phase_data + from preprocessing.sports.phase_data.soccer.soccer_phase_class import Soccer_phase_data return Soccer_phase_data(data_provider, *args, **kwargs) + elif data_provider in cls.other_soccer_data_provider: + raise NotImplementedError('other soccer data provider not implemented yet') elif data_provider in cls.handball_data_provider: raise NotImplementedError('Handball phase data not implemented yet') elif data_provider in cls.rocket_league_data_provider: raise NotImplementedError('rocket_league phase data not implemented yet') - # from .rocket_league.rocket_league_phase_class import Rocket_league_phase_data - # return Rocket_league_phase_data(data_provider, *args, **kwargs) else: raise ValueError(f'Unknown data provider: {data_provider}') -if __name__ == '__main__': - #check if the Soccer_tracking_data class is correctly implemented +def main(): import os import argparse import glob args = argparse.ArgumentParser() - args.add_argument('--data_provider', required=True, choices=['bepro', 'skillcorner', 'pff_fc'], help='kind of data provider') - args.add_argument('--match_id', required=True, help='ID of match data') + args.add_argument('--data_provider', required=True, choices=['bepro', 'statsbomb_skillcorner', 'pff_fc'], help='kind of data provider') + args.add_argument('--match_id', required=False, help='ID of match data') + args = args.parse_args() data_provider = args.data_provider - match_ids = [str(match_id) for match_id in args.match_id.split(",")] - base_dir = os.getcwd() + f"/test/sports/tracking_data/{data_provider}/" + base_dir = os.getcwd() + f"/test/sports/" if data_provider == 'bepro': + match_ids = [str(match_id) for match_id in args.match_id.split(",")] for match_id in match_ids: # The format for bepro has changed from Match ID: 130000(?). 
if int(match_id) >= 130000: - file_pattern = os.path.join(base_dir, match_id, f"{match_id}_*_frame_data.json") + file_pattern = os.path.join(base_dir, 'tracking_data', data_provider, match_id, f"{match_id}_*_frame_data.json") tracking_json_paths = sorted(glob.glob(file_pattern)) - preprocessing_df=Phase_data(data_provider=data_provider, bp_tracking_json_paths=tracking_json_paths).load_data() + meta_data = os.path.join(base_dir, 'tracking_data', data_provider, match_id, f"{match_id}_metadata.json") + event_csv_path = glob.glob(os.path.join(os.path.join(base_dir, 'event_data', data_provider, match_id), '*.csv')) + preprocessing_df=Phase_data(data_provider=data_provider, bp_tracking_json_paths=tracking_json_paths, event_path=event_csv_path[0], meta_data=meta_data).load_data() else: tracking_path=os.getcwd()+f"/test/sports/tracking_data/{data_provider}/{match_id}/{match_id}_tracker_box_data.xml" - preprocessing_df=Phase_data(data_provider=data_provider, bp_tracking_xml_path=tracking_path).load_data() - elif data_provider == 'skillcorner': - print('not yet') + meta_data = os.path.join(base_dir, 'tracking_data', data_provider, match_id, f"{match_id}_tracker_box_metadata.xml") + event_csv_path = glob.glob(os.path.join(os.path.join(base_dir, 'event_data', data_provider, match_id), '*.csv')) + preprocessing_df=Phase_data(data_provider=data_provider, bp_tracking_xml_path=tracking_path, event_path=event_csv_path[0], meta_data=meta_data).load_data() + output_file_path = os.path.join(base_dir, 'phase_data', data_provider, match_id, f"{match_id}_main_data.csv") + preprocessing_df.to_csv(output_file_path,index=False) + print(f"✅ All period tracking data saved successfully at {output_file_path}.") + elif data_provider == 'statsbomb_skillcorner': + sb_match_id = 3894537 # 843, 537 + sc_match_id = 1018887 # 1498966, 1018887 + sb_event_path=f'D:/lab/My_Research/Github/OpenSTARLab/PreProcessing/test/sports/event_data/statsbomb/{sb_match_id}_events.pkl' + 
sc_tracking_path=f'D:/lab/My_Research/Github/OpenSTARLab/PreProcessing/test/sports/tracking_data/skillcorner/LaLiga-2023-2024/tracking/{sc_match_id}.json' + sc_match_path=f'D:/lab/My_Research/Github/OpenSTARLab/PreProcessing/test/sports/tracking_data/skillcorner/LaLiga-2023-2024/match/{sc_match_id}.json' + sc_players_path='D:/lab/My_Research/Github/OpenSTARLab/PreProcessing/test/sports/tracking_data/skillcorner/LaLiga-2023-2024/players/players.json' + preprocessing_df=Phase_data(data_provider=data_provider, sb_event_path=sb_event_path, sc_tracking_path=sc_tracking_path, sc_match_path=sc_match_path, sc_players_path=sc_players_path).load_data() + output_file_dir = os.path.join(base_dir, 'phase_data', data_provider, f'{sb_match_id}_{sc_match_id}') + os.makedirs(output_file_dir, exist_ok=True) + output_file_path = os.path.join(output_file_dir, f"{sb_match_id}_{sc_match_id}_main_data.csv") + preprocessing_df.to_csv(output_file_path,index=False) elif data_provider == 'pff_fc': print('not yet') - preprocessing_df.to_csv(os.getcwd()+f"/test/sports/tracking_data/{data_provider}/{match_id}/test_data_main.csv",index=False) \ No newline at end of file + output_file_path = os.path.join(base_dir, 'phase_data', data_provider, match_id, f"{match_id}_main_data.csv") + preprocessing_df.to_csv(output_file_path,index=False) + print(f"✅ All period tracking data saved successfully at {output_file_path}.") + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/preprocessing/sports/phase_data/soccer/README.md b/preprocessing/sports/phase_data/soccer/README.md index 3e28411..39a2f2a 100644 --- a/preprocessing/sports/phase_data/soccer/README.md +++ b/preprocessing/sports/phase_data/soccer/README.md @@ -1,30 +1,19 @@ -# Event Data in Football/Soccer ⚽ +# Phase Data in Football/Soccer ⚽ [![Documentation Status](https://readthedocs.org/projects/openstarlab/badge/?version=latest)](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/index.html) ## 
Introduction -This package offers functions to load and preprocess event data from various sources in football/soccer. +This package offers functions to load and preprocess phase data from various sources in football/soccer. ## Supported Data Providers -You can find detailed documentation on supported data providers [here](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Event_data/Data_Provider/index.html). The supported providers include: +You can find detailed documentation on supported data providers [here](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Phase_data/Data_Provider/index.html). The supported providers include: -- DataFactory -- DataStadium -- Metrica -- Opta -- Robocup 2D Simulation -- SoccerTrackv2 (BePro) -- Sportec -- Statsbomb -- Statsbomb with Skillcorner Tracking Data -- Wyscout +- Bepro +- Statsbomb and Skillcorner +- (PFF FC) For data format examples, visit [Kloppy](https://github.com/PySport/kloppy/tree/master/kloppy/tests/files) ## Supported Preprocessing Methods -For information on supported preprocessing methods, visit [this documentation](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Event_data/Data_Format/index.html). The available preprocessing methods are: - -- Unified and Integrated Event Data (UIED) -- NMSTPP (same format required for [Football Match Event Forecast](https://github.com/calvinyeungck/Football-Match-Event-Forecast)) -- Other Event Data Formats +For information on supported preprocessing methods, visit [this documentation](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Phase_data/Data_Format/index.html). 
## Examples Here are some examples of how to download and preprocess data: diff --git a/preprocessing/sports/phase_data/soccer/constant.py b/preprocessing/sports/phase_data/soccer/constant.py deleted file mode 100644 index 7f30b59..0000000 --- a/preprocessing/sports/phase_data/soccer/constant.py +++ /dev/null @@ -1,215 +0,0 @@ -from bidict import bidict - -""" -共通 -""" - -FREQUENCY_HZ_ORIGINAL = 25 -FREQUENCY_HZ_CONVERTED = 10 - - -""" -データスタジアムのカラム -""" -# データスタジアムのカラム"攻撃方向"とトラッキングデータの座標正負との関係 -ATTACKING_DIRECTION_PLUS = 1 # 1の場合、正方向に攻撃 -ATTACKING_DIRECTION_MINUS = 2 # 2の場合、負方向に攻撃 - -# データスタジアムのカラム"ホームアウェイF"とチームの関係 -F_HOME_AWAY_BALL = 0 -F_HOME_AWAY_HOME = 1 -F_HOME_AWAY_AWAY = 2 - -# play.csv(イベントデータ)から抽出するイベント -ACTION_NAME_FOR_ATTACK_ANALYSIS_LIST = [ - # 自チームイベント - 'シュート' - ,'ホームパス' - ,'アウェイパス' - ,'スルーパス' - ,'フィード' - ,'クロス' - ,'GK' - ,'CK' - ,'キックオフ' - ,'スローイン' - ,'ドリブル' - # ,'直接FK' - # ,'間接FK' - - # 敵チームイベント - , 'トラップ' - , 'クリア' - , 'ブロック' -] - - -# アクション一覧 -# usage -# print(ACTION_BIDICT['CK']) # 1 -# print(ACTION_BIDICT.inverse[1]) # CK - -ACTION_BIDICT = bidict({ - 'CK': 1 - ,'シュート': 2 - ,'キックオフ': 3 - ,'クロス': 4 - ,'ハンドクリア': 5 - ,'タッチ': 6 - ,'ボールアウト': 7 - ,'間接FK': 8 - ,'PK': 9 - ,'タックル': 10 - ,'試合中断(試合中)': 11 - ,'フリック>オン': 12 - ,'退場(レッド)': 13 - ,'オフサイド': 14 - ,'ポスト/バー': 15 - ,'ファウル受ける': 16 - ,'前半開始': 17 - ,'ホームパス': 18 - ,'トラップ': 19 - ,'クリア': 20 - ,'直接FK': 21 - ,'前半終了': 22 - ,'後半終了': 23 - ,'オウンゴール': 24 - ,'警告(イエロー)': 25 - ,'ドリブル': 26 - ,'ファウルする': 27 - ,'スルーパス': 28 - ,'キャッチ': 29 - ,'フィード': 30 - ,'アウェイパス': 31 - ,'交代': 32 - ,'後半開始': 33 - ,'インターセプト': 34 - ,'ドロップボール': 35 - ,'GK': 36 - ,'ブロック': 37 - ,'スローイン': 38 -}) - -# アクション優先度(大きいほど高い) -# 同フレームに複数のアクションが紐付いている際、どのアクションを残すかを決定 -ACTION_PRIORITY = { - # 自チームイベント - 'シュート': 10 - ,'ホームパス': 10 - ,'アウェイパス': 10 - ,'スルーパス': 10 - ,'フィード': 10 - ,'クロス': 10 - ,'GK': 8 - ,'CK': 8 - ,'キックオフ': 8 - ,'スローイン': 8 - ,'ドリブル': 8 - # ,'直接FK' - # ,'間接FK' - - # 敵チームイベント - , 'トラップ': 6 - , 'クリア': 9 # シュート、クリア重複あり - - # 共通イベント - , 'ブロック': 4 
# トラップ、ブロック重複あり -} - -TEAM_BIDICT = bidict({ - 122: '浦和レッズ' - , 128: 'ガンバ大阪' - , 124: '横浜F・マリノス' - , 127: '名古屋グランパス' - , 126: '清水エスパルス' - , 133: 'セレッソ大阪' - , 136: 'ヴィッセル神戸' - , 30528: '松本山雅FC' - , 120: '鹿島アントラーズ' - , 238: 'ベガルタ仙台' - , 129: 'サンフレッチェ広島' - , 131: 'ジュビロ磐田' - , 207: '大分トリニータ' - , 86: '川崎フロンタ>ーレ' - , 86: '川崎フロンターレ' - , 276: '北海道コンサドーレ札幌' - , 270: 'FC東京' - , 130: '湘南ベルマーレ' - , 269: 'サガン鳥栖' -}) - - -N_AGENTS = 22 -EXTRA_FRAME = 4 - -FIELD_LENGTH = 105.0 # unit: meters -FIELD_WIDTH = 68.0 # unit: meters -GOAL_WIDTH = 7.32 # unit: meters -PENALTY_X = 105.0/2-16.5 # left point (unit: meters) -PENALTY_Y = 40.32 # upper point (unit: meters) - -# for gfootball -FIELD_LENGTH_GRF = 1*2 -FIELD_WIDTH_GRF = 0.42*2 -GOAL_WIDTH_GRF = 0.044*2 - -STOP_THRESHOLD = 0.1 # unit: m/s -SPRINT_THRESHOLD = 24000/3600 # unit: m/s (24 km/h) -LONGPASS_THRESHOLD = 30 # unit: meters -HIGHPASS_AGENT_THRESHOLD = 1 # unit: meters -BALL_KEEP_THRESHOLD = 1 # unit: m -SEED = 42 - -# super mini map for gfootball -SMM_WIDTH = 96 -SMM_HEIGHT = 72 - -SMM_LAYERS = ['left_team', 'right_team', 'ball', 'active'] - -# Normalized minimap coordinates -MINIMAP_NORM_X_MIN = -1.0 -MINIMAP_NORM_X_MAX = 1.0 -MINIMAP_NORM_Y_MIN = -1.0 / 2.25 -MINIMAP_NORM_Y_MAX = 1.0 / 2.25 - -MARKER_VALUE = 255 - -# GFootbal actions -ACTION_GRF_19 = bidict({ - 'idle': 0 - ,'left': 1 - ,'top_left': 2 - ,'top': 3 - ,'top_right': 4 - ,'right': 5 - ,'bottom_right': 6 - ,'bottom': 7 - ,'bottom_left': 8 - ,'long_pass': 9 - ,'high_pass': 10 - ,'short_pass': 11 - ,'shot': 12 - ,'sprint': 13 - ,'release_direction': 14 - ,'release_sprint': 15 - ,'sliding': 16 - ,'dribble': 17 - ,'release_dribble': 18 # ,'builtin_ai ': 19 -}) - -ACTION_GRF_14 = bidict({ - 'idle': 0 - ,'left': 1 - ,'top_left': 2 - ,'top': 3 - ,'top_right': 4 - ,'right': 5 - ,'bottom_right': 6 - ,'bottom': 7 - ,'bottom_left': 8 - ,'pass': 9 - ,'shot': 10 - ,'sprint': 11 - ,'release_direction': 12 - ,'release_sprint': 13 -}) \ No newline at end of file diff --git 
a/preprocessing/sports/phase_data/soccer/soccer_load_data.py b/preprocessing/sports/phase_data/soccer/soccer_load_data.py index 1763139..16cf4f3 100644 --- a/preprocessing/sports/phase_data/soccer/soccer_load_data.py +++ b/preprocessing/sports/phase_data/soccer/soccer_load_data.py @@ -2,16 +2,15 @@ import json import pandas as pd +pd.set_option('future.no_silent_downcasting', True) import numpy as np import xml.etree.ElementTree as ET -from statsbombpy import sb -from tqdm import tqdm -from datetime import datetime +# from statsbombpy import sb import os -import pdb -import csv +import pickle +from typing import List, Dict, Any -def load_bepro(tracking_xml_path: str, tracking_json_paths: list, event_path: str, verbose: bool = False) -> pd.DataFrame: +def load_bepro(tracking_xml_path: str, tracking_json_paths: list, event_path: str, meta_data_path: str) -> pd.DataFrame: """ Loads and processes event and tracking data from soccer match recordings. @@ -31,14 +30,15 @@ def load_bepro(tracking_xml_path: str, tracking_json_paths: list, event_path: st with additional features including player positions, speeds, ball position, and metadata (e.g., player names, shirt numbers, positions). """ - def extract_tracking_data_from_xml(xml_path): + + def extract_tracking_data_from_xml(xml_path: str) -> List[Dict[str, Any]]: """ - Parse the XML file and extract tracking data. + Parse the XML file and extract tracking data for players and the ball. Args: xml_path (str): Path to the XML file. Returns: - list of dict: A list containing tracking information for each player in each frame. + list of dict: A list containing tracking information for each player and the ball in each frame. 
""" tree = ET.parse(xml_path) root = tree.getroot() @@ -48,26 +48,47 @@ def extract_tracking_data_from_xml(xml_path): frame_number = int(frame.get("frameNumber")) match_time = int(frame.get("matchTime")) - for player in frame: - player_id = player.get("playerId") - loc = player.get("loc") + # 処理対象の要素を の両方に拡張 + # findall("*") を使用することで、 の直下にある全ての要素(player, ballなど)を取得 + for element in frame.findall("*"): + + # タグ名に基づいて player_id と loc の属性名を設定 + if element.tag == "player": + player_id = element.get("playerId") + loc = element.get("loc") + elif element.tag == "ball": + # ⭐ 変更点: タグの場合、player_id を "ball" とし、属性を取得 + player_id = "ball" + loc = element.get("loc") + else: + # 予期しないタグはスキップ + continue + + # loc 情報が存在しない場合はスキップ + if loc is None: + continue + # Convert loc string to float coordinates try: + # loc の形式は "[x, y]" を想定 x, y = map(float, loc.strip("[]").split(",")) + + # 座標変換とデータ追加 tracking_data.append({ "frame": frame_number, "match_time": match_time, "player_id": player_id, - "x": "{:.2f}".format(x * 105 - 52.5), + # 座標の正規化解除とフォーマット適用 (元のコードのロジックを維持) + "x": "{:.2f}".format(x * 105 - 52.5), "y": "{:.2f}".format(y * 68 - 34.0) }) except ValueError: + # loc の形式が不正な場合 raise ValueError(f"Invalid location format for player {player_id} in frame {frame_number}") - tracking_df = add_period_column(tracking_data) - return tracking_df + return tracking_data - def extract_tracking_data_from_json(json_path): + def extract_tracking_data_from_json(json_path: str, period: str) -> List[Dict[str, Any]]: """ Parse the JSON file and extract tracking data. 
@@ -84,6 +105,7 @@ def extract_tracking_data_from_json(json_path): for player in players: try: tracking_data.append({ + "period": period, "frame": int(frame_number), "match_time": int(player.get("match_time", 0)), "player_id": "ball" if player.get("player_id") == None else player.get("player_id"), @@ -92,246 +114,451 @@ def extract_tracking_data_from_json(json_path): }) except ValueError: raise ValueError(f"Invalid data format in frame {frame_number}") - tracking_df = add_period_column(tracking_data) - return tracking_df + return tracking_data - def add_period_column(tracking_data_list): + def devide_by_period(tracking_data_list: List[dict]) -> List[pd.DataFrame]: """ - Add a 'period' column to the tracking_data list. + トラッキングデータのリストに 'period' 列を追加し、periodごとに分割した + DataFrameのリストを返す。 + + frame番号が大きく減少する(リセットされる)ごとにperiodをインクリメントし、 + その直前の行で期間を終了する。 - Increment the period each time the frame number significantly decreases (resets). Args: - tracking_data_list (list of dict): A list containing tracking_data. + tracking_data_list (list of dict): tracking_dataを格納したリスト。 Returns: - pandas.DataFrame: A DataFrame with the 'period' column added. + List[pd.DataFrame]: 'period' 列が追加され、期間ごとに正確に分割されたDataFrameのリスト。 """ + if not tracking_data_list: + return [] + # 1. リストをPandas DataFrameに変換し、オリジナルのインデックスを保持 df = pd.DataFrame(tracking_data_list) - first_occurrence_of_frame = df.drop_duplicates(subset=['frame'], keep='first') + + # 2. 
periodの境界となるインデックス(frame番号がリセットされる行)を特定 + # 各フレームの最初の行のみを取得 + first_occurrence_of_frame = df.drop_duplicates(subset=['frame', 'match_time'], keep='first') + # frame番号の差分を計算し、負になる箇所(リセット)を検出 + # .diff() は Series を返すため、インデックスは first_occurrence_of_frame のインデックスと一致する frame_diff = first_occurrence_of_frame['frame'].diff().fillna(0) - period_reset = (frame_diff < 0) - period_values = period_reset.cumsum() + 1 - period_map = pd.Series(period_values.values, index=first_occurrence_of_frame['frame']).to_dict() - df['period'] = df['frame'].map(period_map) - cols = ['period'] + [col for col in df.columns if col != 'period'] - df = df[cols] - - return df - - def get_additional_features(event_df, meta_data): - #player info: id name nameEN shirtNumber position - # create features period, seconds, event_type, event_type_2, outcome, home_team, x_unscaled, y_unscaled, - period_dict = {"FIRST_HALF": 1, "SECOND_HALF": 2, "EXTRA_FIRST_HALF": 3, "EXTRA_SECOND_HALF": 4} - event_df["period"] = event_df["event_period"].map(period_dict) - event_df["seconds"] = event_df["event_time"]/1000 - - event_type_list = [] - for i in range(len(event_df)): - event_i = event_df.iloc[i].event_types - # print(event_i) - if not isinstance(event_i, str): - event_type_list.append(None) - else: - event_i = event_i.split(" ")[0] - event_type_list.append(event_i) - event_df["event_type"] = event_type_list - - home_team_dict = {int(team_info["id"]):team_info["side"] for team_info in meta_data["team_info"]} - event_df["home_team"] = event_df["team_id"].map(home_team_dict) - #convert "home" to 1 and "away" to 0 for home_team - event_df["home_team"] = event_df["home_team"].map({"home":1,"away":0}) - - #x and y coordinates of the field (height,width) for the event data (inverse of the tracking data) - event_df["x_unscaled"] = event_df["y"]*int(meta_data["pitch_info"]["width"]) - event_df["y_unscaled"] = event_df["x"]*int(meta_data["pitch_info"]["height"]) + period_reset_indices = frame_diff[frame_diff < 0].index + + 
# 3. 分割点のインデックスリストを作成 + # リストの先頭 (0) を開始点として追加 + split_indices = [0] + # リセットされたフレームのインデックスを取得 + # df.index.get_loc() を使わずに、直接 df のインデックスで操作する + for reset_idx in period_reset_indices: + # リセットが行われるフレームの直前のインデックスを分割点に追加 + # reset_idx は first_occurrence_of_frame のインデックスであり、df のインデックスと一致する + if reset_idx > 0: + split_indices.append(reset_idx) + + # リストの末尾(データの最終インデックス+1)を終了点として追加 + split_indices.append(len(df)) + # 重複を削除し、ソート + split_indices = sorted(list(set(split_indices))) + period_df_list = [] + + # 4. 分割とperiod番号の割り当て + for i in range(len(split_indices) - 1): + start_idx = split_indices[i] + end_idx = split_indices[i+1] + current_period = i + 1 + # DataFrameをスライス + period_df = df.iloc[start_idx:end_idx].copy() + # 'period' 列を割り当て + period_df.loc[:, 'period'] = current_period + # 不要な一時列をクリーンアップ(ここでは既に df に period がマッピングされていないので不要だが、念のため) + period_df_list.append(period_df.reset_index(drop=True)) + + return period_df_list + + def extract_meta_info_from_xml(xml_path: str) -> dict: + """ + Extract team information (ID, name, side) from an XML metadata file. + + Args: + xml_path (str): Path to the XML metadata file. + Returns: + dict: Dictionary in the format: {player_id: {'position': str, 'team_id': str, 'side': str}}. 
+ """ + tree = ET.parse(xml_path) + root = tree.getroot() + + team_info = {} + player_info = {} + + teams_element = root.find("teams") + if teams_element is not None: + for team in teams_element.findall("team"): + team_id = team.get("id") + team_name = team.get("name") + side = team.get("side") + + if team_id: + team_info[team_id] = { + "team_name": team_name, + "side": side + } + players_element = root.find("players") + if players_element is not None: + for player in players_element.findall("player"): + player_id = player.get("id") + player_name = player.get("name") + team_id = player.get("teamId") + position = player.get("position") + + if player_id: + side = team_info.get(team_id, {}).get("side") + team_name = team_info.get(team_id, {}).get("team_name") + + player_info[player_id] = { + "team_id": team_id, + "team_name": team_name, + "side": side, + "player_name": player_name, + "position": position, + } + return player_info + + def extract_meta_info_from_json(json_path: str) -> dict: + """ + Extract team information (ID, name, side) from an JSON metadata file. + + Args: + xml_path (str): Path to the XML metadata file. + Returns: + dict: Dictionary in the format: {player_id: {'position': str, 'team_id': str, 'side': str}}. 
+ """ + with open(json_path, "r", encoding="utf-8") as f: + data = json.load(f) + + player_info = {} + + teams = { + "home": data.get("home_team", {}), + "away": data.get("away_team", {}) + } + + for side, team_data in teams.items(): + if team_data: + team_id = str(team_data.get("team_id")) + team_name = str(team_data.get("team_name")) + + # プレイヤー情報を保存 + if "players" in team_data: + for player in team_data["players"]: + player_id = str(player.get("player_id")) + player_name = str(player.get("full_name")) + position = player.get("initial_position_name") + + if player_id: + player_info[player_id] = { + "team_id": team_id, + "team_name": team_name, + "side": side, + "player_name": player_name, + "position": position, + } + + return player_info + + def get_inplay_start_time(event_df: pd.DataFrame) -> pd.DataFrame: + """ + Add 'inplay_num' column to event_df. + If the first word in filtered_event_types matches the specified event type, + it is considered the start of a new in-play event, and inplay_num is incremented. + + Args: + event_df (pd.DataFrame): A DataFrame containing soccer event data. + Returns: + pd.DataFrame: A DataFrame with the 'inplay_num' column added. + """ + + event_df = event_df.copy() + # List of strings specified as in-play start events + START_EVENT_STRINGS = ['goalKick', 'throwIn', 'cornerKick', 'freeKick', 'goalAgainst'] + + # 1. Extract the string up to the first space in 'filtered_event_types' + # Since NaN values may be present, replace them with an empty string ('') before str.split(). + event_df.loc[:, 'first_event_type'] = event_df['filtered_event_types'].fillna('').str.split(' ').str[0] + + # 2. Create a flag column to detect the start frame + # The first row is always considered the start of an in-play sequence + is_start_frame = pd.Series(False, index=event_df.index) + is_start_frame.iloc[0] = True + # Detect events containing the specified strings + is_restart_event = event_df['first_event_type'].isin(START_EVENT_STRINGS) + + # 3. 
Apply the restart logic + # Restart events other than 'goalAgainst': The current row marks the start of a new in-play sequence + is_normal_restart = is_restart_event & (event_df['first_event_type'] != 'goalAgainst') + is_start_frame = is_start_frame | is_normal_restart + # 'goalAgainst' restart event: The **next frame** marks the start of a new in-play sequence + is_goal_against = event_df['first_event_type'] == 'goalAgainst' + # Set True for the row immediately following 'goalAgainst' (using shift(-1), the last row is ignored) + # This is OR combined with is_start_frame + shifted_goal_against = is_goal_against.shift(periods=-1) + filled_shifted = shifted_goal_against.fillna(False).astype(bool) + is_start_frame = is_start_frame.astype(bool) + is_start_frame = is_start_frame | filled_shifted + + # 4. Calculate the in-play number + # Calculate the cumulative sum, which increments at every True (start frame) instance + # Since True is treated as 1 and False as 0, cumsum() yields the in-play number + event_df.loc[:, 'inplay_num'] = is_start_frame.cumsum().astype(int) + + # 5. 
Post-processing + # Delete the helper column created during intermediate processing and return the result + event_df = event_df.drop(columns=['first_event_type'], errors='ignore') + return event_df - def calculate_sync_bias(event_df, tracking_data, period=1, verbose=False): - # 'FIRST_HALF' "SECOND_HALF" - # Calculate the bias between event time and tracking time - limit = 5.0 #seconds - time_list = [key for key in tracking_data.keys()] - #split the time_list into two halves - if period == 1: - time_list = [time for time in time_list if tracking_data[time]['eventPeriod'] == 'FIRST_HALF'] - first_event_time = event_df[event_df["event_period"]=="FIRST_HALF"].iloc[0].event_time if "FIRST_HALF" in event_df["event_period"].values else 0 - elif period == 2: - time_list = [time for time in time_list if tracking_data[time]['eventPeriod'] == 'SECOND_HALF'] - first_event_time = event_df[event_df["event_period"]=="SECOND_HALF"].iloc[0].event_time if "SECOND_HALF" in event_df["event_period"].values else 0 - elif period == 3: - time_list = [time for time in time_list if tracking_data[time]['eventPeriod'] == 'EXTRA_FIRST_HALF'] - first_event_time = event_df[event_df["event_period"]=="EXTRA_FIRST_HALF"].iloc[0].event_time if "EXTRA_FIRST_HALF" in event_df["event_period"].values else 0 + def get_tracking(tracking_df: pd.DataFrame, event_df: pd.DataFrame, player_info_df: Dict[str, Dict[str, str]]) -> pd.DataFrame: + """ + トラッキングデータをフレームごとに集約し、チームサイドとポジション順に並べ替えた + ワイドフォーマットのDataFrameを作成し、インプレー番号を割り当てる。 + + Args: + tracking_df (pd.DataFrame): 処理されたトラッキングデータ (frame, period, x, y, player_idなどを含む)。 + event_df (pd.DataFrame): 処理されたイベントデータ (match_time, period, inplay_numなどを含む)。 + player_info_df (Dict[str, Dict[str, str]]): player_idに対するポジション、チームID、サイド情報を持つ辞書。 + + Returns: + pd.DataFrame: フレームごとのワイドフォーマットトラッキングデータ。 + """ - if time_list == []: - return 0 - - time_list.sort() - start_time = max(time_list[0],0) - #round to the nearest 1000 - start_time = round(start_time/1000)*1000 - 
print("start_time:",start_time) if verbose else None - #drop the time that exceeds the limit of the event time - time_list = [time for time in time_list if time <= start_time+limit*1000] - #order the time_list in ascending order - time_list.sort() + # 標準的なポジション順序 (1から11の番号付けに使用) + POSITION_ORDER = ['GK', 'CB', 'RWB', 'RB', 'LWB', 'LB', 'CDM', 'RM', 'CM', 'LM', 'CAM', 'RW', 'LW', 'CF'] + FPS = 25 # トラッキングデータのフレームレート - ball_coordinates = [] - for time_i in time_list: - tracking_data_i = tracking_data[time_i] - ball_data_i = tracking_data_i['ball']['loc'] - ball_coordinates.append(ball_data_i) - #find the time with the highest acceleration - ball_coordinates = np.array(ball_coordinates) - ball_speed = np.linalg.norm(np.diff(ball_coordinates,axis=0),axis=1) - max_speed_index = np.argmax(ball_speed) - max_speed_time = time_list[max_speed_index] - bias = max_speed_time - first_event_time - - return bias - - def get_tracking_features(event_df, tracking_data, meta_data, verbose=True): - # combine the event data with the tracking data via event_time and matchTime - #get the player info - time_list = [key for key in tracking_data.keys()] - time_diff_list = [] - player_dict = {} - home_team_player_count = 0 - away_team_player_count = 0 - home_team_dict = {int(team_info["id"]):team_info["side"] for team_info in meta_data["team_info"]} - for player_i in meta_data["player_info"]: - player_dict[player_i["id"]] = player_i - team_id = int(player_dict[player_i["id"]]['teamId']) - if home_team_dict[team_id] == 'home': - player_dict[player_i["id"]]["player_num"] = home_team_player_count+1 - home_team_player_count += 1 - elif home_team_dict[team_id] == 'away': - player_dict[player_i["id"]]["player_num"] = away_team_player_count+1 - away_team_player_count += 1 - else: - print("team_id not found") - pdb.set_trace() + # ----------------------------------------------- + # 0. 
プレイヤー情報の結合と前処理 + # ----------------------------------------------- + event_df = event_df.copy() + # player_info_dfをDataFrameに変換し、トラッキングデータにマージ + player_map_df = pd.DataFrame.from_dict(player_info_df, orient='index').reset_index().rename( + columns={'index': 'player_id', 'side': 'team_side', 'team_name': 'team_name'} + ) - #create the additional features - tracking_features=["player_id","x","y","speed"] - meta_features=["name","nameEn","shirtNumber","position"] - ball_features = ["ball_x","ball_y","ball_speed"] - additional_features = tracking_features+meta_features - additional_featurs_dict = {} - for i in range(home_team_player_count): - for j in range(len(additional_features)): - additional_featurs_dict[f"home_{additional_features[j]}_{i+1}"] = [] - for i in range(away_team_player_count): - for j in range(len(additional_features)): - additional_featurs_dict[f"away_{additional_features[j]}_{i+1}"] = [] - for j in range(len(ball_features)): - additional_featurs_dict[ball_features[j]] = [] + # player_idの型を揃える + tracking_df['player_id'] = tracking_df['player_id'].astype(str) - additional_featurs_dict["tracking_time"] = [] - additaional_features_dict_key_list = [key for key in additional_featurs_dict.keys()] + # プレイヤーのメタデータをトラッキングデータに結合 + tracking_df = pd.merge(tracking_df, player_map_df, on='player_id', how='left') - #get the sync bias for the event and tracking data - bias_1 = calculate_sync_bias(event_df, tracking_data, period=1, verbose=verbose) #FIRST_HALF - bias_2 = calculate_sync_bias(event_df, tracking_data, period=2, verbose=verbose) #SECOND_HALF - bias_3 = calculate_sync_bias(event_df, tracking_data, period=3, verbose=verbose) #EXTRA_FIRST_HALF + # ボールの行のメタデータ ('player_id'='ball') を補完 + tracking_df.loc[tracking_df['player_id'] == 'ball', ['team_id', 'team_name', 'team_side', 'position', 'player_name']] = \ + ['ball', 'ball', 'ball', 'ball', 'ball'] - print("bias_1:",bias_1,"bias_2:",bias_2,"bias_3:",bias_3) if verbose else None + # 
----------------------------------------------- + # 1. チームサイド (left/right) の決定 (最初のフレームで固定) + # ----------------------------------------------- + + # 最初のフレームのGKデータのみを抽出 + target_frame = tracking_df['frame'].min() + 10 + gk_data_initial = tracking_df[(tracking_df['position'] == 'GK') & (tracking_df['frame'] == target_frame)] + + # x座標が最小(マイナス側)のチームを 'left' チームとする + left_team_id = gk_data_initial.loc[gk_data_initial['x'].idxmin(), 'team_id'] + + # チームのメタデータを格納する辞書を作成(ワイドフォーマットの列作成に使用) + team_meta = {} + unique_teams = tracking_df[tracking_df['team_id'] != 'ball'][['team_id', 'team_name', 'team_side']].drop_duplicates() + + for _, row in unique_teams.iterrows(): + current_side = 'left' if row['team_id'] == left_team_id else 'right' + + team_meta[f'{current_side}_team_id'] = row['team_id'] + team_meta[f'{current_side}_team_name'] = row['team_name'] + team_meta[f'{current_side}_team_side'] = row['team_side'] # home/away - if verbose: - iterable = tqdm(range(len(event_df))) - else: - iterable = range(len(event_df)) - for i in iterable: - updated_features = [] - event_time = event_df.iloc[i].event_time - period = event_df.iloc[i].event_period - if period == 'FIRST_HALF': - event_time += bias_1 - elif period == 'SECOND_HALF': - event_time += bias_2 - elif period == 'EXTRA_FIRST_HALF': - event_time += bias_3 - else: - print("period not included") - #find the nearest time in the tracking data - nearest_time = min(time_list, key=lambda x:abs(x-event_time)) - try: - additional_featurs_dict["tracking_time"].append(nearest_time) - updated_features+=["tracking_time"] - except: - pass - time_diff_list.append(nearest_time-event_time) - #get the tracking data - tracking_data_i = tracking_data[nearest_time] - for player_track_j in tracking_data_i['players']: - player_j_id = player_track_j['playerId'] - player_j_num = player_dict[player_j_id]["player_num"] - player_j_team = player_dict[player_j_id]["teamId"] - player_j_home = home_team_dict[int(player_j_team)] - # append the tracking 
data and meta data to the additional features - additional_featurs_dict[f"{player_j_home}_player_id_{player_j_num}"].append(player_track_j['playerId']) - additional_featurs_dict[f"{player_j_home}_x_{player_j_num}"].append(round(player_track_j['loc'][0]*int(meta_data["pitch_info"]["width"]),2)) - additional_featurs_dict[f"{player_j_home}_y_{player_j_num}"].append(round(player_track_j['loc'][1]*int(meta_data["pitch_info"]["height"]),2)) - additional_featurs_dict[f"{player_j_home}_speed_{player_j_num}"].append(player_track_j['speed']) - additional_featurs_dict[f"{player_j_home}_name_{player_j_num}"].append(player_dict[player_j_id]["name"]) - additional_featurs_dict[f"{player_j_home}_nameEn_{player_j_num}"].append(player_dict[player_j_id]["nameEn"]) - additional_featurs_dict[f"{player_j_home}_shirtNumber_{player_j_num}"].append(player_dict[player_j_id]["shirtNumber"]) - additional_featurs_dict[f"{player_j_home}_position_{player_j_num}"].append(player_dict[player_j_id]["position"]) - updated_features+=[f"{player_j_home}_player_id_{player_j_num}",f"{player_j_home}_x_{player_j_num}",f"{player_j_home}_y_{player_j_num}",f"{player_j_home}_speed_{player_j_num}",f"{player_j_home}_name_{player_j_num}",f"{player_j_home}_nameEn_{player_j_num}",f"{player_j_home}_shirtNumber_{player_j_num}",f"{player_j_home}_position_{player_j_num}"] - ball_track = tracking_data_i['ball'] - additional_featurs_dict[f"ball_x"].append(round(ball_track['loc'][0]*int(meta_data["pitch_info"]["width"]),2)) - additional_featurs_dict[f"ball_y"].append(round(ball_track['loc'][1]*int(meta_data["pitch_info"]["height"]),2)) - if ball_track['speed'] == 'NA': - additional_featurs_dict[f"ball_speed"].append(None) - else: - additional_featurs_dict[f"ball_speed"].append(ball_track['speed']) - updated_features+=["ball_x","ball_y","ball_speed"] - # for features in additaional_features_dict_key_list but not in updated_features, append None - for key in additaional_features_dict_key_list: - if key not in 
updated_features: - additional_featurs_dict[key].append(None) + # ----------------------------------------------- + # 2. インプレー番号 (inplay_num) の割り当てロジック + # ----------------------------------------------- + + # tracking_df に inplay_num 列を追加し、全て NaN で初期化 + # このコードは関数内での処理を想定しているため、DataFrameのコピーを直接修正します。 + tracking_df['inplay_num'] = np.nan + + # 1. event_dfから各インプレーの開始/終了時刻を決定 + + # 'inplay_num' と 'match_time' の組み合わせを取得し、インプレー番号でソート + inplay_times = event_df[['inplay_num', 'event_time']].drop_duplicates().sort_values('inplay_num') + # 各インプレー番号の開始時刻と終了時刻を計算 + inplay_periods = inplay_times.groupby('inplay_num')['event_time'].agg(['min', 'max']).reset_index() + inplay_periods.columns = ['inplay_num', 'start_time', 'end_time'] + + # 2. tracking_df に inplay_num を割り当て + + # Period ごとに処理を行い、割り当てを確実にする + for period in tracking_df['period'].unique(): + + # 当該ピリオドの tracking_df を抽出 + p_tracking = tracking_df[tracking_df['period'] == period].copy() + + # 当該ピリオドのインプレー期間を抽出 + p_inplay_periods = inplay_periods.copy() + + # 各インプレー期間に対して tracking_df に inplay_num を割り当て + for _, row in p_inplay_periods.iterrows(): + current_inplay_num = row['inplay_num'] + start_time = row['start_time'] + end_time = row['end_time'] + + # 'match_time' が 'start_time' 以上かつ 'true_end_time' 以下のフレームに 'inplay_num' を設定 + # NumPyのwhere条件を使用して高速に処理 + + # グローバルな tracking_df のインデックスを取得 + mask_index = tracking_df[ + (tracking_df['period'] == period) & + (tracking_df['match_time'] >= start_time) & + (tracking_df['match_time'] <= end_time) + ].index + + # マスクされた行に inplay_num を割り当てる + tracking_df.loc[mask_index, 'inplay_num'] = current_inplay_num + + # 割り当てられなかった NaN の inplay_num はインプレー間の中断フレームと見なされます。 + # 最終的な final_tracking_df は tracking_df そのものです。 + final_tracking_df = tracking_df.copy() + + # ----------------------------------------------- + # 3. 
プレイヤー順序の決定と結合キーの作成 + # ----------------------------------------------- + + is_player = (final_tracking_df['player_id'] != 'ball') + side_calculated = np.where( + final_tracking_df['team_id'] == left_team_id, + 'left', + 'right' + ) + side_series = pd.Series(side_calculated, index=final_tracking_df.index) + if 'side' not in final_tracking_df.columns: + final_tracking_df.loc[:, 'side'] = np.nan + final_tracking_df['side'] = final_tracking_df['side'].astype(object) + final_tracking_df.loc[is_player, 'side'] = side_series.loc[is_player] + final_tracking_df.loc[final_tracking_df['player_id'] == 'ball', 'side'] = 'ball' + + # ポジションの順序をマッピング + pos_map = {pos: order for order, pos in enumerate(POSITION_ORDER, 1)} + + # プレイヤーのみをフィルタリング + player_df = final_tracking_df[final_tracking_df['player_id'] != 'ball'].copy() + + # ポジションの順序番号をDataFrameに追加 + player_df.loc[:, 'pos_order'] = player_df['position'].map(pos_map) + + # 各チーム・各フレーム内でポジション順に連番 (1から11) を作成 + player_df.loc[:, 'pos_rank'] = player_df.groupby(['frame', 'side'])['pos_order'].rank(method='first').astype(int) + + # ワイドフォーマットの列名を作成: 例: 'left_1_x', 'right_11_y' + player_df.loc[:, 'variable'] = player_df['side'] + '_' + player_df['pos_rank'].astype(str) + + # ----------------------------------------------- + # 4. プレイヤーデータのワイドフォーマット化 (Pivot) + # ----------------------------------------------- + + # ワイド化する値列をリスト化 + value_cols = ['x', 'y', 'player_id', 'player_name', 'position'] + + wide_data_list = [] + + for col in value_cols: + pivot_df = player_df.pivot_table( + index=['frame', 'match_time', 'period', 'inplay_num'], + columns='variable', + values=col, + aggfunc='first' + ).add_suffix(f'_{col.replace("player_id", "id").replace("player_name", "name")}') # player_id -> left_1_id + + wide_data_list.append(pivot_df) + + # 全てのピボットテーブルを結合 + wide_player_df = wide_data_list[0].join(wide_data_list[1:]) + + # ----------------------------------------------- + # 5. 
ボールデータとチームメタデータの抽出・結合 + # ----------------------------------------------- + + # ボールデータを抽出 + ball_df = final_tracking_df[final_tracking_df['player_id'] == 'ball'][['frame', 'x', 'y', 'match_time', 'period', 'inplay_num']].rename( + columns={'x': 'ball_x', 'y': 'ball_y'} + ).set_index(['frame', 'match_time', 'period', 'inplay_num']) + + # プレイヤーデータにボールデータを結合 + final_tracking_df = wide_player_df.join(ball_df).reset_index() + + # チームメタデータを追加 + for col, value in team_meta.items(): + final_tracking_df[col] = value + + # ----------------------------------------------- + # 6. 最終的な列の整形と順序調整 + # ----------------------------------------------- + + # プレイヤー列を ID, Name, Position, x, y の順で生成 + ordered_player_cols = [] + for side in ['left', 'right']: + for i in range(1, 12): # 1番から11番まで + prefix = f'{side}_{i}_' + + # ID, Name, Positionはデータに存在しない可能性もあるため、チェックしてから追加 + ordered_player_cols.append(prefix + 'id') + ordered_player_cols.append(prefix + 'name') + ordered_player_cols.append(prefix + 'position') + ordered_player_cols.append(prefix + 'x') + ordered_player_cols.append(prefix + 'y') + + # 最終的な列順序 (要望の形式に合わせる) + base_cols = ['period', 'inplay_num', 'frame', 'match_time', 'ball_x', 'ball_y'] + + # チームメタデータ列 + team_cols = [] + for side in ['left', 'right']: + team_cols.extend([f'{side}_team_id', f'{side}_team_name', f'{side}_team_side']) + + final_cols = base_cols + team_cols + ordered_player_cols - #add the additional features to the event_df - out_event_df = event_df.copy() - if verbose: - for key in additional_featurs_dict.keys(): - print(key,len(additional_featurs_dict[key])) - - # Create a DataFrame from the additional features dictionary - additional_features_df = pd.DataFrame(additional_featurs_dict) - - # Concatenate the original event_df with the additional features DataFrame - out_event_df = pd.concat([event_df, additional_features_df], axis=1) - - #print the mean and std of the time_diff_list - if verbose: - print("mean time 
difference:",round(np.mean(time_diff_list),4)) - print("std time difference:",round(np.std(time_diff_list),4)) - print("max time difference:",round(np.max(time_diff_list),4)) - print("min time difference:",round(np.min(time_diff_list),4)) - return out_event_df + # 必要な列のみを選択し、順序を調整 (存在しない列は無視される) + final_tracking_df = final_tracking_df.reindex(columns=final_cols) + + return final_tracking_df + # Load the event data + event_df = pd.read_csv(event_path) + # devide by period + grouped_events = event_df.groupby('event_period') + PERIOD_ORDER = ['FIRST_HALF', 'SECOND_HALF', 'EXTRA_FIRST_HALF', 'EXTRA_SECOND_HALF'] # check if the format is the latest version if tracking_xml_path is None: - list_of_tracking_dfs = [] + list_of_tracking_data = [] for i in range(len(tracking_json_paths)): - input_json = tracking_json_paths[i] - tracking_df = extract_tracking_data_from_json(input_json) - list_of_tracking_dfs.append(tracking_df) - tracking_df = pd.concat(list_of_tracking_dfs, ignore_index=True) + tracking_data = extract_tracking_data_from_json(tracking_json_paths[i], period=str(i+1)) + list_of_tracking_data.append(tracking_data) + player_info_df = extract_meta_info_from_json(meta_data_path) else: - input_json = tracking_xml_path - tracking_df = extract_tracking_data_from_xml(input_json) - # Load the event data - event_df = pd.read_csv(event_path) - # Get additional features - event_df = get_additional_features(event_df) - # Get tracking features - event_df = get_tracking_features(event_df, tracking_df, verbose=verbose) - - return event_df + tracking_data = extract_tracking_data_from_xml(tracking_xml_path) + # add period + list_of_tracking_data = devide_by_period(tracking_data) + player_info_df = extract_meta_info_from_xml(meta_data_path) + + final_tracking_df_list = [] + for i in range(len(list_of_tracking_data)): + event_df = grouped_events.get_group(PERIOD_ORDER[i]) + tracking_df = pd.DataFrame(list_of_tracking_data[i]) + # Get additional features + event_df = 
get_inplay_start_time(event_df) + # Get tracking features + processed_tracking_df = get_tracking(tracking_df, event_df, player_info_df) + final_tracking_df_list.append(processed_tracking_df) + + final_tracking_df = pd.concat(final_tracking_df_list, ignore_index=True) + return final_tracking_df -def load_statsbomb_skillcorner(statsbomb_event_dir: str, skillcorner_tracking_dir: str, skillcorner_match_dir: str, statsbomb_match_id: str, skillcorner_match_id: str) -> pd.DataFrame: +def load_statsbomb_skillcorner(sb_event_path: str, sc_tracking_path: str, sc_match_path: str, sc_players_path: str) -> pd.DataFrame: """ Load and merge StatsBomb event data with SkillCorner tracking data. @@ -346,283 +573,322 @@ def load_statsbomb_skillcorner(statsbomb_event_dir: str, skillcorner_tracking_di pd.DataFrame: Combined DataFrame with event and tracking data. """ - # File paths - statsbomb_event_path = f"{statsbomb_event_dir}/{statsbomb_match_id}.csv" - skillcorner_tracking_path = f"{skillcorner_tracking_dir}/{skillcorner_match_id}.json" - skillcorner_match_path = f"{skillcorner_match_dir}/{skillcorner_match_id}.json" + def extract_meta_info_from_match(sc_match: dict, sc_players: list) -> dict: + """ + Extract team and player information (ID, name, side) from a json match data file. - # Load StatsBomb events - events = pd.read_csv(statsbomb_event_path) - - # Load SkillCorner tracking and match data - with open(skillcorner_tracking_path) as f: - tracking = json.load(f) - - with open(skillcorner_match_path) as f: - match = json.load(f) + Args: + sc_match (dict): Dataframe of match data file. + Returns: + dict: Dictionary in the format: {team_id: {'team_name': str, 'team_side': str}}, {player_id: {'position': str, 'team_id': str, 'side': str}}. + """ + # 結果を格納する辞書の初期化 + team_meta_df = {} + player_meta_df = {} + + player_trackable_map = {p['id']: p.get('trackable_object') for p in sc_players} + + # 1. 
チーム情報の作成 + # Home Team + home_id = sc_match['home_team']['id'] + team_meta_df[home_id] = { + 'team_name': sc_match['home_team']['name'], + 'team_side': 'home' + } + + # Away Team + away_id = sc_match['away_team']['id'] + team_meta_df[away_id] = { + 'team_name': sc_match['away_team']['name'], + 'team_side': 'away' + } + + # 2. 選手情報の作成 + for p in sc_match['players']: + player_id = p['id'] + trackable_id = player_trackable_map.get(player_id) + player_meta_df[trackable_id] = { + 'team_id': p['team_id'], + 'player_name': p['short_name'], + 'position_name': p['player_role']['name'], + 'position_acronym': p['player_role']['acronym'] + } - #check if the file exists - if not os.path.exists(statsbomb_event_path): - print(f"Statsbomb event file not found: {statsbomb_event_path}") - return None - if not os.path.exists(skillcorner_tracking_path): - print(f"Skillcorner tracking file not found: {skillcorner_tracking_path}") - return None - if not os.path.exists(skillcorner_match_path): - print(f"Skillcorner match file not found: {skillcorner_match_path}") + return team_meta_df, player_meta_df + + def get_left_team_id(sc_tracking, team_meta_df, player_meta_df): + all_team_ids = list(team_meta_df.keys()) + for frame_data in sc_tracking: + if frame_data['data']==None: + continue + for obj in frame_data['data']: + if 'z' in obj: + continue + p_id = obj['trackable_object'] + p_info = player_meta_df[p_id] + if p_info['position_acronym'] == 'GK': + if obj['x'] < 0.0: + left_team_id = p_info['team_id'] + else: + left_team_id = [tid for tid in all_team_ids if tid != p_info['team_id']][0] + return left_team_id return None - # Team name mapping - team_name_dict = { - 'UD Almería': 'Almería', 'Real Sociedad': 'Real Sociedad', 'Athletic Club de Bilbao': 'Athletic Club', - 'Villarreal CF': 'Villarreal', 'RC Celta de Vigo': 'Celta Vigo', 'Getafe CF': 'Getafe', - 'UD Las Palmas': 'Las Palmas', 'Sevilla FC': 'Sevilla', 'Cadiz CF': 'Cádiz', - 'Atlético Madrid': 'Atlético Madrid', 'RCD Mallorca': 
'Mallorca', 'Valencia CF': 'Valencia', - 'CA Osasuna': 'Osasuna', 'Girona FC': 'Girona', 'Real Betis Balompié': 'Real Betis', - 'FC Barcelona': 'Barcelona', 'Deportivo Alavés': 'Deportivo Alavés', 'Granada CF': 'Granada', - 'Rayo Vallecano': 'Rayo Vallecano', 'Real Madrid CF': 'Real Madrid' - } - - home_team_name = team_name_dict[match['home_team']['name']] - away_team_name = team_name_dict[match['away_team']['name']] - - team_dict = { - match['home_team']['id']: {'role': 'home', 'name': home_team_name}, - match['away_team']['id']: {'role': 'away', 'name': away_team_name} - } - - # Convert the trackable object dict - trackable_objects = {} - home_count = away_count = 0 - - for player in match['players']: - role = team_dict[player['team_id']]['role'] - position = player['player_role']['name'] - if role == 'home': - trackable_objects[player['trackable_object']] = { - 'name': f"{player['first_name']} {player['last_name']}", - 'team': team_dict[player['team_id']]['name'], - 'role': role, - 'id': home_count, - 'position': position - } - home_count += 1 - elif role == 'away': - trackable_objects[player['trackable_object']] = { - 'name': f"{player['first_name']} {player['last_name']}", - 'team': team_dict[player['team_id']]['name'], - 'role': role, - 'id': away_count, - 'position': position + def process_all_tracking(sc_tracking, team_meta_df, player_meta_df, left_team_id): + """ + 全フレームをループし、ポジション順にソートされたフラットなリストを返す。 + """ + + # ポジションの優先順位を辞書化(スコアが低いほど若い番号に割り当てられる) + POSITION_ORDER = ['GK', 'CB', 'RCB', 'LCB', 'RWB', 'RB', 'LWB', 'LB', 'CDM', 'RDM', 'LDM', 'RM', 'CM', 'LM', 'CAM', 'RW', 'LW', 'CF'] + pos_priority = {pos: i for i, pos in enumerate(POSITION_ORDER)} + + # 左右のチームIDを特定 + all_team_ids = list(team_meta_df.keys()) + right_team_id = [tid for tid in all_team_ids if tid != left_team_id][0] + + all_frames_processed = [] + + for frame_data in sc_tracking: + # 基本情報の構築 + res = { + 'period': int(frame_data['period']) if pd.notna(frame_data['period']) else None, + 
'inplay_num': None, # 予約列 + 'frame': frame_data['frame'], + 'match_time': frame_data['timestamp'], + 'ball_x': None, # 後で更新 + 'ball_y': None, # 後で更新 + 'left_team_id': left_team_id, + 'left_team_name': team_meta_df[left_team_id]['team_name'], + 'left_team_side': team_meta_df[left_team_id]['team_side'], + 'right_team_id': right_team_id, + 'right_team_name': team_meta_df[right_team_id]['team_name'], + 'right_team_side': team_meta_df[right_team_id]['team_side'] } - away_count += 1 - - trackable_objects[match['ball']['trackable_object']] = {'name': 'ball', 'team': 'ball', 'role': 'ball', 'position': 'ball'} - ball_id = match['ball']['trackable_object'] - - ##sync the tracking data with the events based on the ball velocity - #get the first 5s of the match - ball_velocity_period_1 = [] - ball_velocity_period_2 = [] - - for frame in tracking: - time = frame['timestamp'] - period = frame['period'] - data = frame['data'] - time_components = time.split(':') if time else None - seconds = float(time_components[0]) * 3600 + float(time_components[1]) * 60 + float(time_components[2]) if time else 0 - if time and period==1 and seconds<=5: - for obj in data: - if obj['trackable_object']==ball_id: - ball_velocity_period_1.append([time, obj['x'], obj['y'],obj['z']]) - - if time and period==2 and seconds <= 45*60+5: - for obj in data: - if obj['trackable_object']==ball_id: - ball_velocity_period_2.append([time, obj['x'], obj['y'],obj['z']]) + + # フレーム内のデータを「ボール」と「左右の選手リスト」に分ける + left_players_in_frame = [] + right_players_in_frame = [] - if not ball_velocity_period_1 == [] or not ball_velocity_period_2 == []: - try: - max_velocity_timestamp1, max_velocity1 = calculate_velocity_and_max_timestamp(ball_velocity_period_1) - max_velocity_seconds1 = max_velocity_timestamp1.split(':') - max_velocity_seconds1 = float(max_velocity_seconds1[0]) * 3600 + float(max_velocity_seconds1[1]) * 60 + float(max_velocity_seconds1[2]) - except: - max_velocity_seconds1 = -1 + for obj in frame_data['data']: + 
# ボールの処理 + if 'z' in obj: + res['ball_x'] = obj['x'] + res['ball_y'] = obj['y'] + continue + + # 選手の処理 + p_id = obj['track_id'] + if p_id in player_meta_df: + p_info = player_meta_df[p_id] + player_data = { + 'id': p_id, + 'name': p_info['player_name'], + 'pos': p_info['position_acronym'], + 'x': obj['x'], + 'y': obj['y'], + 'priority': pos_priority.get(p_info['position_acronym'], 99) # 未定義は最後尾 + } + + if p_info['team_id'] == left_team_id: + left_players_in_frame.append(player_data) + else: + right_players_in_frame.append(player_data) + + # ------------------------------------------------------- + # ⭐ ポジション順(同ポジションならID順)でソート + # ------------------------------------------------------- + left_players_sorted = sorted(left_players_in_frame, key=lambda x: (x['priority'], x['id'])) + right_players_sorted = sorted(right_players_in_frame, key=lambda x: (x['priority'], x['id'])) + + # ソートされた順に left_1, left_2 ... と格納 (最大11人) + for i in range(11): + idx = i + 1 + # Left Team + if i < len(left_players_sorted): + p = left_players_sorted[i] + res[f"left_{idx}_id"] = p['id'] + res[f"left_{idx}_name"] = p['name'] + res[f"left_{idx}_position"] = p['pos'] + res[f"left_{idx}_x"] = p['x'] + res[f"left_{idx}_y"] = p['y'] + else: + # 11人に満たない場合はNaNで埋める(列順を維持するため重要) + res[f"left_{idx}_id"] = None + res[f"left_{idx}_name"] = None + res[f"left_{idx}_position"] = None + res[f"left_{idx}_x"] = None + res[f"left_{idx}_y"] = None + + for i in range(11): + idx = i + 1 + # Right Team + if i < len(right_players_sorted): + p = right_players_sorted[i] + res[f"right_{idx}_id"] = p['id'] + res[f"right_{idx}_name"] = p['name'] + res[f"right_{idx}_position"] = p['pos'] + res[f"right_{idx}_x"] = p['x'] + res[f"right_{idx}_y"] = p['y'] + else: + res[f"right_{idx}_id"] = None + res[f"right_{idx}_name"] = None + res[f"right_{idx}_position"] = None + res[f"right_{idx}_x"] = None + res[f"right_{idx}_y"] = None + + all_frames_processed.append(res) + + return pd.DataFrame(all_frames_processed) + + def 
get_inplay_start_time(event_df: pd.DataFrame): + """ + event_dfにinplay_numを追加し、各インプレーの開始情報を辞書のリストで返す。 + """ + # データのコピーを作成 + df = event_df.copy() - try: - max_velocity_timestamp2, max_velocity2 = calculate_velocity_and_max_timestamp(ball_velocity_period_2) - max_velocity_seconds2 = max_velocity_timestamp2.split(':') - max_velocity_seconds2 = float(max_velocity_seconds2[0]) * 3600 + float(max_velocity_seconds2[1]) * 60 + float(max_velocity_seconds2[2]) - max_velocity_seconds2 = max_velocity_seconds2 - 45*60 - except: - max_velocity_seconds2 = -1 + # 開始情報を保持するリスト(辞書を格納) + inplay_info_list = [] - if max_velocity_seconds1 == -1 and max_velocity_seconds2 != -1: - max_velocity_seconds1 = max_velocity_seconds2 - elif max_velocity_seconds1 != -1 and max_velocity_seconds2 == -1: - max_velocity_seconds2 = max_velocity_seconds1 - elif max_velocity_seconds1 == -1 and max_velocity_seconds2 == -1: - max_velocity_seconds1 = max_velocity_seconds2 = 0 - - # Process tracking data - tracking_dict = {} - for frame in tracking: - time = frame['timestamp'] - if time: - time_components = time.split(':') - seconds = float(time_components[0]) * 3600 + float(time_components[1]) * 60 + float(time_components[2]) - period = frame['period'] - if period == 1: - seconds = seconds - max_velocity_seconds1 - elif period == 2: - seconds = seconds - max_velocity_seconds2 - seconds = round(seconds, 1) - uid = f"{period}_{seconds}" - tracking_dict[uid] = frame['data'] + # インプレー番号を初期化 + current_inplay = 0 + + continuing_patterns = ['Regular Play', 'From Counter', 'From Keeper'] + restart_types = ['Throw-in', 'Corner', 'Goal Kick', 'Free Kick'] + + for i in range(len(df) - 1): + curr_ev = df.iloc[i] + next_ev = df.iloc[i + 1] + + # pass_type が None の場合は判定をスキップ(元のロジックを維持) + if pd.isna(next_ev['pass_type']): + continue + + # --- インプレーの切り替わり条件判定 --- + is_new_inplay = False + + # 1. 
試合終了後のデータ(時間が戻る場合)対策 + next_ts = pd.Timestamp(next_ev['timestamp']).round('100ms') + curr_ts = pd.Timestamp(curr_ev['timestamp']).round('100ms') + if next_ts < curr_ts: + is_new_inplay = True + + # 条件A: play_patternの変化 + elif curr_ev['play_pattern'] != next_ev['play_pattern']: + if next_ev['play_pattern'] not in continuing_patterns: + is_new_inplay = True + + # 条件B: 特定の再開イベント + elif next_ev['pass_type'] in restart_types: + is_new_inplay = True + + # --- インプレー番号の更新と情報の記録 --- + if is_new_inplay: + current_inplay += 1 + # 必要な情報を辞書形式で保存 + inplay_info_list.append({ + 'inplay_num': current_inplay, + 'period': int(next_ev['period']), + 'timestamp': next_ts + }) + + return inplay_info_list - # Prepare data for DataFrame - df_list = [] - for _, event in events.iterrows(): - event_id = event['id'] - match_id = statsbomb_match_id - period = event['period'] - time = event['timestamp'] - minute = event['minute'] - second = event['second'] - event_type = event['type'] - event_type_2 = None - end_x = end_y = None - if event_type == "Pass": - end_location=event.get('pass_end_location') - #check if end_location is a string - if isinstance(end_location, (str)): - end_location = [float(x) for x in end_location[1:-1].split(",")] - end_x = end_location[0] - end_y = end_location[1] - cross=event.get('pass_cross') - pass_height=event.get('pass_height') - pass_type=event.get('pass_type') - if pass_type=="Corner": - event_type_2="Corner" - elif cross and not np.isnan(cross): - event_type_2="Cross" - elif pass_height: - event_type_2=pass_height - elif event_type=="Shot": - event_type_2=event.get('shot_outcome') - - team = event['team'] - home_team = 1 if team == home_team_name else 0 - player = event['player'] - location = event['location'] - - if isinstance(location, str): - location = [float(x) for x in location[1:-1].split(",")] - start_x, start_y = location[0], location[1] - else: - start_x = start_y = None - - time_components = time.split(':') - seconds = 
round(float(time_components[0]) * 3600 + float(time_components[1]) * 60 + float(time_components[2]), 4) - if period == 2: - seconds += 45 * 60 - elif period == 3: - seconds += 90 * 60 - elif period == 4: - seconds += (90 + 15) * 60 - - seconds_rounded = round(seconds, 1) - uid = f"{period}_{seconds_rounded}" - tracking_data = tracking_dict.get(uid) - home_tracking = [None] * 2 * 23 - away_tracking = [None] * 2 * 23 - home_side = [None] + def get_inplay_tracking(tracking_df: pd.DataFrame, inplay_info_list: List) -> pd.DataFrame: + """ + inplay_info_listを元に、トラッキングデータにinplay_numを付与し、 + インプレー外(区間外)のデータを削除する。 + """ + df = tracking_df.copy() + + # 1. トラッキングデータの時間を統一された日付(1900-01-01)のTimestampに変換 + # これにより「時間・分・秒」のみの純粋な比較が可能になります + df['tmp_timestamp'] = pd.to_datetime( + df['match_time'], format='%H:%M:%S.%f', errors='coerce' + ).map(lambda x: x.replace(year=1900, month=1, day=1) if pd.notna(x) else x) + + # 2. インプレー情報の時間も同じ日付(1900-01-01)に統一 + def normalize_period_time(group): + period_start = group['tmp_timestamp'].min() + # 経過時間を計算し、1900-01-01 00:00:00 からの経過に変換し直す + base = pd.Timestamp('1900-01-01 00:00:00') + group['tmp_timestamp'] = base + (group['tmp_timestamp'] - period_start) + return group + + def normalize_time(ts): + if isinstance(ts, pd.Timestamp): + return ts.replace(year=1900, month=1, day=1) + return ts - if tracking_data: - for obj in tracking_data: - track_obj = trackable_objects[obj['trackable_object']] - if track_obj['role'] == 'home': - home_tracking[2 * track_obj['id']] = obj['x'] - home_tracking[2 * track_obj['id'] + 1] = obj['y'] - elif track_obj['role'] == 'away': - away_tracking[2 * track_obj['id']] = obj['x'] - away_tracking[2 * track_obj['id'] + 1] = obj['y'] - - if track_obj['position'] == "Goalkeeper": - if track_obj['role'] == 'home': - home_gk_x = obj['x'] - elif track_obj['role'] == 'away': - away_gk_x = obj['x'] + df = df.groupby('period', group_keys=False).apply(normalize_period_time) - - # Determine the side of the home team based on the 
goalkeeper's position - if home_gk_x < away_gk_x: - home_side = 'left' - else: - home_side = 'right' + # --- インプレー番号の割り当て --- + for i in range(len(inplay_info_list)): + current_info = inplay_info_list[i] - home_side = [home_side] + # 日付を1900-01-01に揃える + start_time = normalize_time(current_info['timestamp']) + period = current_info['period'] + num = current_info['inplay_num'] + + period_mask = (df['period'] == period) + + # 次のインプレー開始時間を取得 + next_event_in_same_period = None + for j in range(i + 1, len(inplay_info_list)): + if int(inplay_info_list[j]['period']) == period: + next_event_in_same_period = normalize_time(inplay_info_list[j]['timestamp']) + break + + if next_event_in_same_period is not None: + # 同じピリオド内に次のインプレーがある場合: その直前まで + time_mask = (df['tmp_timestamp'] >= start_time) & (df['tmp_timestamp'] < next_event_in_same_period) + else: + # そのピリオド内で最後のインプレーの場合: ピリオドの最後まで + time_mask = (df['tmp_timestamp'] >= start_time) + + final_mask = period_mask & time_mask + df.loc[final_mask, 'inplay_num'] = num + + # --- データのクリーンアップ --- + # inplay_num が割り当てられなかった行(インプレー外)を削除 + df = df.dropna(subset=['inplay_num']) + + # tmp_timestamp を文字列フォーマットに戻す (%f はマイクロ秒なので下3桁をカット) + base_time = pd.Timestamp('1900-01-01 00:00:00') + df['match_time'] = (df['tmp_timestamp'] - base_time).dt.total_seconds() * 1000 + df = df[df['match_time'] % 200 == 0] + df['match_time'] = df['match_time'].astype(int) + df = df.drop(columns=['tmp_timestamp']) - df_list.append([match_id, period, time, minute, second, seconds, event_type, event_type_2, team, home_team, player, start_x, start_y, end_x, end_y, *home_tracking, *away_tracking, *home_side]) + # 型を整数に戻す + df['period'] = df['period'].astype(int) + df['inplay_num'] = df['inplay_num'].astype(int) + + return df.reset_index(drop=True) - # Define DataFrame columns - home_tracking_columns = [] - away_tracking_columns = [] - for i in range(1, 24): - home_tracking_columns.extend([f"h{i}_x", f"h{i}_y"]) - away_tracking_columns.extend([f"a{i}_x", f"a{i}_y"]) 
- columns = ["match_id", "period", "time", "minute", "second", 'seconds', "event_type", "event_type_2", "team", "home_team", "player", "start_x", "start_y","end_x","end_y"] + home_tracking_columns + away_tracking_columns + ["home_side"] + # Load the event data + with open(sb_event_path, 'rb') as f: + sb_event = pickle.load(f) + with open(sc_tracking_path, 'r', encoding='utf-8') as f: + sc_tracking = json.load(f) + with open(sc_match_path, 'r', encoding='utf-8') as f: + sc_match = json.load(f) + with open(sc_players_path, 'r', encoding='utf-8') as f: + sc_players = json.load(f) - # Convert the event list to a DataFrame - df = pd.DataFrame(df_list, columns=columns) + team_meta_df, player_meta_df = extract_meta_info_from_match(sc_match, sc_players) - #Sort the DataFrame by 'period' then 'seconds' - df = df.sort_values(by=["period", "seconds"]).reset_index(drop=True) + left_team_id = get_left_team_id(sc_tracking, team_meta_df, player_meta_df) - return df + tracking_df = process_all_tracking(sc_tracking, team_meta_df, player_meta_df, left_team_id) -def calculate_velocity_and_max_timestamp(data): - """ - Calculate the velocity for each time interval and find the timestamp with the highest velocity. + inplay_info_list = get_inplay_start_time(sb_event) - Parameters: - data (list): List of lists, where each sublist contains [timestamp, x, y, z]. + processed_tracking_df = get_inplay_tracking(tracking_df, inplay_info_list) - Returns: - tuple: (max_velocity_timestamp, max_velocity) - - max_velocity_timestamp: The timestamp with the highest velocity. - - max_velocity: The highest velocity value. 
- """ - # Extract timestamps, x, y, z coordinates - timestamps = [entry[0] for entry in data] - x = np.array([entry[1] for entry in data]) - y = np.array([entry[2] for entry in data]) - z = np.array([entry[3] for entry in data]) - - # Convert timestamps to seconds - time_seconds = np.array([ - (datetime.strptime(ts, "%H:%M:%S.%f") - datetime.strptime(timestamps[0], "%H:%M:%S.%f")).total_seconds() - for ts in timestamps - ]) - - # Calculate differences - delta_x = np.diff(x) - delta_y = np.diff(y) - delta_z = np.diff(z) - delta_t = np.diff(time_seconds) - - # Calculate velocity components and magnitude - vx = delta_x / delta_t - vy = delta_y / delta_t - vz = delta_z / delta_t - velocity_magnitude = np.sqrt(vx**2 + vy**2 + vz**2) - - # Find the index of the maximum velocity - max_velocity_index = np.argmax(velocity_magnitude) - max_velocity = velocity_magnitude[max_velocity_index] - max_velocity_timestamp = timestamps[max_velocity_index + 1] # Use +1 to get the ending timestamp of the interval - - return max_velocity_timestamp, max_velocity + return processed_tracking_df def load_pff2metrica(event_path:str, match_id:str = None) -> pd.DataFrame: """ @@ -839,178 +1105,6 @@ def apply_subtype(success_col, present_series): Metrica_df = Metrica_df[cols] return Metrica_df - -def load_datastadium( - datastadium_event_path: str, - datastadium_home_tracking_path: str, - datastadium_away_tracking_path: str - ) -> pd.DataFrame: - """ - Loads and processes event and tracking data from stadium event recordings. - - Args: - datastadium_event_path (str): Path to the CSV file containing event data. - datastadium_home_tracking_path (str): Path to the CSV file containing home team tracking data. - datastadium_away_tracking_path (str): Path to the CSV file containing away team tracking data. - - Returns: - pd.DataFrame: A DataFrame containing the merged and processed event and tracking data. 
- """ - # Load data - event = pd.read_csv(datastadium_event_path, encoding='shift_jis') - home_tracking = pd.read_csv(datastadium_home_tracking_path) - away_tracking = pd.read_csv(datastadium_away_tracking_path) - - # Define required columns and flags - required_columns = [ - "試合ID", "ホームアウェイF", "チーム名", "選手名", "アクション名", "F_成功", - "位置座標X", "位置座標Y", "敵陣F", "点差", "自スコア", "相手スコア", - "F_ゴール", "F_セーブ", "F_シュートGK以外", "F_ミスヒット", "ゴール角度", - "ゴール距離", "F_パス", "F_クロス", "F_ドリブル", "F_クリア", - "F_ハンドクリア", "F_ゴールキック", "F_コーナーキック", "F_直接フリーキック", - "F_間接フリーキック", "絶対時間秒数", "フレーム番号","距離" - ] - flags = [ - "F_ゴール", "F_セーブ", "F_シュートGK以外", "F_ミスヒット", "F_パス", - "F_クロス", "F_ドリブル", "F_クリア", "F_ハンドクリア", "F_ゴールキック", - "F_コーナーキック", "F_直接フリーキック", "F_間接フリーキック" - ] - event_type_dict = { - "前半開始": "First Half Start", "前半終了": "First Half End", "後半開始": "Second Half Start", - "後半終了": "Second Half End", "延長前半開始": "Overtime First Half Start", - "延長前半終了": "Overtime First Half End", "延長後半開始": "Overtime Second Half Start", - "延長後半終了": "Overtime Second Half End", "再延長前半開始": "Second Overtime First Half Start", - "再延長前半終了": "Second Overtime First Half End", "再延長後半開始": "Second Overtime Second Start", - "再延長後半終了": "Second Overtime Second End", "PK戦開始": "PK Start", "PK戦終了": "PK End", - "シュート": "Shoot", "GK": "GK", "直接FK": "Direct FK", "キャッチ": "Catch", - "警告(イエロー)": "YellowCard", "PK": "PK", "CK": "CK", "間接FK": "Indirect FK", - "オフサイド": "Offside", "退場(レッド)": "RedCard", "交代": "Change", "キックオフ": "KickOff", - "ファウルする": "Foul", "オウンゴール": "OwnGoal", "ホームパス": "HomePass", - "アウェイパス": "AwayPass", "PKパス": "PKPass", "ポジション変更": "Position Change", - "中断": "Suspension", "ドリブル": "Dribble", "スルーパス": "Through Pass", - "ハンドクリア": "Hand Clear", "ファウル受ける": "Foul", "ドロップボール": "Drop Ball", - "ボールアウト": "Ball Out", "インターセプト": "Intercept", "クリア": "Clear", - "ブロック": "Block", "スローイン": "ThrowIn", "クロス": "Cross", "トラップ": "Trap", - "PK合戦": "PK Battle", "試合再開": "Resume", "フィード": "Feed", "タッチ": "Touch", - "タックル": "Tackle", "フリックオン": "FrickOn", 
"試合中断": "Suspension", - "ポスト/バー": "Post Bar", "試合中断(試合中)": "Suspension(InGame)", - "試合再開(試合中)": "Resume(InGame)" - } - flag_dict = { - "F_ゴール": "Goal", "F_セーブ": "Save", "F_シュートGK以外": "Shot(not_GK)", - "F_ミスヒット": "MissHit", "F_パス": "Pass", "F_クロス": "Cross", "F_ドリブル": "Dribble", - "F_クリア": "Clear", "F_ハンドクリア": "HandClear", "F_ゴールキック": "GoalKick", - "F_コーナーキック": "CornerKick", "F_直接フリーキック": "DirectFreeKick", - "F_間接フリーキック": "IndirectFreeKick" - } - - # Filter columns and preprocess data - event = event[required_columns].copy() - event["絶対時間秒数"] = event["絶対時間秒数"].astype(float) - event = event.sort_values(by="絶対時間秒数") - - # Create event_type_2 column based on flags - def get_event_type_2(row): - event_types = [flag_dict[f] for f in flags if row[f] == 1] - return "/".join(event_types) if event_types else None - - event["event_type_2"] = event.apply(get_event_type_2, axis=1) - event = event.drop(columns=flags) - - # Rename columns - event.columns = [ - "match_id", "home", "team", "player", "event_type", "success", - "start_x", "start_y", "opp_field", "point_diff", "self_score", - "opp_score", "angle2goal", "dist2goal", "absolute_time", - "frame", "dist", "event_type_2" - ] - - # Reorder columns - event = event[[ - "match_id", "team", "home", "player", "frame", "absolute_time", - "event_type", "event_type_2", "success", "start_x", "start_y","dist", - "opp_field", "point_diff", "self_score", "opp_score", "angle2goal", - "dist2goal" - ]] - - # Convert event_type to English - event["event_type"] = event["event_type"].map(event_type_dict).fillna(event["event_type"]) - - # Calculate period, minute, and second - def calculate_time(row, half_start, period_flag): - time_elapsed = float(row["absolute_time"]) - half_start - return int(time_elapsed / 60), round(time_elapsed % 60, 4) - - period, minute, second = [], [], [] - half_start = float(event.iloc[0]["absolute_time"]) - period_flag = 1 - - for _, row in event.iterrows(): - if row["event_type"] == "Second Half Start": - 
period_flag = 2 - half_start = float(row["absolute_time"]) - - period.append(period_flag) - m, s = calculate_time(row, half_start, period_flag) - minute.append(m) - second.append(s) - - event["Period"] = period - event["Minute"] = minute - event["Second"] = second - - # Reorder columns - event = event[[ - "match_id", "Period", "Minute", "Second", "frame", "absolute_time", - "team", "home", "player", "event_type", "event_type_2", "success", - "start_x", "start_y", "dist", "opp_field", "point_diff", "self_score", - "opp_score", "angle2goal", "dist2goal" - ]] - - #reset the index - event.reset_index(drop=True, inplace=True) - - # get the tracking start time for 2nd half - tracking_start_time_2 = home_tracking[home_tracking["Period"] == 2].iloc[0]["Time [s]"] - - #sort both tracking data - home_tracking = home_tracking.sort_values(by="Time [s]").reset_index(drop=True) - away_tracking = away_tracking.sort_values(by="Time [s]").reset_index(drop=True) - - home_tracking_time = home_tracking["Time [s]"].round(2).values - tracking_col_home = [f"Home_{i}_x" for i in range(1, 15)] + [f"Home_{i}_y" for i in range(1, 15)] - tracking_col_away = [f"Away_{i}_x" for i in range(1, 15)] + [f"Away_{i}_y" for i in range(1, 15)] - - # Calculate event times vectorized - event_time = event["Minute"] * 60 + event["Second"] + tracking_start_time_2 * (event["Period"] == 2) - - # Find nearest indices using numpy - nearest_indices = np.searchsorted(home_tracking_time, event_time,side='left') - nearest_indices = np.clip(nearest_indices, 0, len(home_tracking_time) - 1) - - # Get the corresponding tracking data - home_tracking_data = home_tracking.iloc[nearest_indices][tracking_col_home].values - away_tracking_data = away_tracking.iloc[nearest_indices][tracking_col_away].values - - # pdb.set_trace() - - # Combine the results - new_df = pd.concat([event, pd.DataFrame(home_tracking_data, columns=tracking_col_home), - pd.DataFrame(away_tracking_data, columns=tracking_col_away)], axis=1) - - - # 
Create final DataFrame - columns = [ - "match_id", "absolute_time", "Period", "Minute", "Second", "team", "home", "player", - "event_type", "event_type_2", "success", "start_x", "start_y", "dist", - "opp_field", "point_diff", "self_score", "opp_score", - "angle2goal", "dist2goal"] + tracking_col_home + tracking_col_away - - final_df = pd.DataFrame(new_df, columns=columns) - - return final_df - -def load_robocup_2d(event_path: str, match_id: str = None, tracking_path: str = None) -> pd.DataFrame: """ Load event data from CSV file and optionally merge with tracking data. @@ -1079,87 +1173,4 @@ def load_robocup_2d(event_path: str, match_id: str = None, tracking_path: str = # Sort the DataFrame by 'seconds' df = df.sort_values(by="seconds").reset_index(drop=True) - return df - -if __name__ == "__main__": - import pdb - import os - #cd to ../PreProcessing - datafactory_path=os.getcwd()+"/test/sports/event_data/data/datafactory/datafactory_events.json" - metrica_event_json_path=os.getcwd()+"/test/sports/event_data/data/metrica/metrica_events.json" - metrica_event_csv_path=os.getcwd()+"/test/sports/event_data/data/metrica/Sample_Game_1/Sample_Game_1_RawEventsData.csv" - metrica_tracking_home_path=os.getcwd()+"/test/sports/event_data/data/metrica/Sample_Game_1/Sample_Game_1_RawTrackingData_Home_Team.csv" - metrica_tracking_away_path=os.getcwd()+"/test/sports/event_data/data/metrica/Sample_Game_1/Sample_Game_1_RawTrackingData_Away_Team.csv" - opta_f7_path=os.getcwd()+"/test/sports/event_data/data/opta/opta_f7.xml" - opta_f24_path=os.getcwd()+"/test/sports/event_data/data/opta/opta_f24.xml" - robocup_2d_event_path=os.getcwd()+"/test/sports/event_data/data/robocup_2d/202307091024-HELIOS2023_1-vs-CYRUS_0-pass.csv" - robocup_2d_tracking_path=os.getcwd()+"/test/sports/event_data/data/robocup_2d/202307091024-HELIOS2023_1-vs-CYRUS_0.csv" - sportec_event_path=os.getcwd()+"/test/sports/event_data/data/sportec/sportec_events.xml" - 
sportec_tracking_path=os.getcwd()+"/test/sports/event_data/data/sportec/sportec_positional.xml" - sportec_meta_path=os.getcwd()+"/test/sports/event_data/data/sportec/sportec_meta.xml" - statsbomb_event_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/events/3805010.json" - statsbomb_360_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/three-sixty/3805010.json" - statsbomb_api_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/api.json" - statsbomb_skillcorner_event_path="/data_pool_1/laliga_23/statsbomb/events" - statsbomb_skillcorner_tracking_path="/data_pool_1/laliga_23/skillcorner/tracking" - statsbomb_skillcorner_match_path="/data_pool_1/laliga_23/skillcorner/match" - wyscout_event_path=os.getcwd()+"/test/sports/event_data/data/wyscout/events_England.json" - wyscout_matches_path=os.getcwd()+"/test/sports/event_data/data/wyscout/matches_England.json" - datastadium_event_path=os.getcwd()+"/test/sports/event_data/data/datastadium/2019022307/play.csv" - datastadium_home_tracking_path=os.getcwd()+"/test/sports/event_data/data/datastadium/2019022307/home_tracking.csv" - datastadium_away_tracking_path=os.getcwd()+"/test/sports/event_data/data/datastadium/2019022307/away_tracking.csv" - - #test load_datafactory - # datafactory_df=load_datafactory(datafactory_path) - # datafactory_df.to_csv(os.getcwd()+"/test/sports/event_data/data/datafactory/test_data.csv",index=False) - - #test load_metrica - # metrica_df=load_metrica(metrica_event_json_path,1,metrica_tracking_home_path,metrica_tracking_away_path) - # metrica_df.to_csv(os.getcwd()+"/test/sports/event_data/data/metrica/test_data_json.csv",index=False) - # metrica_df=load_metrica(metrica_event_csv_path,1,metrica_tracking_home_path,metrica_tracking_away_path) - # metrica_df.to_csv(os.getcwd()+"/test/sports/event_data/data/metrica/test_data_csv.csv",index=False) - - #test load_opta_xml - # opta_df=load_opta_xml(opta_f24_path,1) - # 
opta_df.to_csv(os.getcwd()+"/test/sports/event_data/data/opta/test_data.csv",index=False) - - #test load_robocup_2d - # robocup_2d_df=load_robocup_2d(robocup_2d_event_path,1,robocup_2d_tracking_path) - # robocup_2d_df.to_csv(os.getcwd()+"/test/sports/event_data/data/robocup_2d/test_data.csv",index=False) - - #test load_sportec - # sportec_df=load_sportec(sportec_event_path,sportec_tracking_path,sportec_meta_path) - # sportec_df.to_csv(os.getcwd()+"/test/sports/event_data/data/sportec/test_data.csv",index=False) - - #test load_statsbomb with json file - # statsbomb_df=load_statsbomb(statsbomb_event_path,statsbomb_360_path) - # statsbomb_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_data.csv",index=False) - - # test load_statsbomb with api data - # statsbomb_df=load_statsbomb(match_id=3795108) - # statsbomb_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_api_data.csv",index=False) - - #test load_statsbomb_skillcorner - # statsbomb_skillcorner_df=load_statsbomb_skillcorner(statsbomb_skillcorner_event_path,statsbomb_skillcorner_tracking_path, - # statsbomb_skillcorner_match_path,3894907,1553748) - # statsbomb_skillcorner_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_data.csv",index=False) - - #test load_wyscout - # wyscout_df=load_wyscout(wyscout_event_path,wyscout_matches_path) - # wyscout_df.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_data.csv",index=False) - - - #test load_datastadium - # event=load_datastadium(datastadium_event_path,datastadium_home_tracking_path,datastadium_away_tracking_path) - # event.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/load.csv",index=False) - - #test load_soccertrack - soccer_track_event_path="/data_pool_1/soccertrackv2/2023-11-18/Event/event.csv" - soccer_track_tracking_path="/data_pool_1/soccertrackv2/2023-11-18/Tracking/tracking.xml" - soccer_track_meta_path="/data_pool_1/soccertrackv2/2023-11-18/Tracking/meta.xml" 
- df_soccertrack=load_bepro(soccer_track_event_path,soccer_track_tracking_path,soccer_track_meta_path,True) - df_soccertrack.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/soccertrack/test_load_function_sync.csv",index=False) - - print("----------------done-----------------") - # pdb.set_trace() - + return df \ No newline at end of file diff --git a/preprocessing/sports/phase_data/soccer/soccer_phase_class.py b/preprocessing/sports/phase_data/soccer/soccer_phase_class.py index 3468c3d..37dee12 100644 --- a/preprocessing/sports/phase_data/soccer/soccer_phase_class.py +++ b/preprocessing/sports/phase_data/soccer/soccer_phase_class.py @@ -20,559 +20,36 @@ if __name__ == '__main__': import soccer_load_data - import soccer_processing - import soccer_tracking_data else: from . import soccer_load_data - from . import soccer_processing - from . import soccer_tracking_data import pdb #create a class to wrap the data source class Soccer_phase_data: - def __init__(self,data_provider,bp_tracking_xml_path=None,bp_tracking_json_paths=None, - event_path=None,match_id=None,tracking_home_path=None,tracking_away_path=None, - tracking_path=None,meta_data=None,statsbomb_api_args=[], - statsbomb_match_id=None,skillcorner_match_id=None,max_workers=1,match_id_df=None, - statsbomb_event_dir=None, skillcorner_tracking_dir=None, skillcorner_match_dir=None, - preprocess_method=None,sb360_path=None,wyscout_matches_path=None, - st_track_path=None, st_meta_path=None,verbose=False, - preprocess_tracking=False): + def __init__(self,data_provider,bp_tracking_xml_path=None,bp_tracking_json_paths=None,bp_event_path=None,bp_meta_data=None, + sb_event_path=None, sc_tracking_path=None, sc_match_path=None, sc_players_path=None): self.data_provider = data_provider self.bp_tracking_xml_path = bp_tracking_xml_path self.bp_tracking_json_paths = bp_tracking_json_paths - self.event_path = event_path - self.match_id = match_id - self.tracking_home_path = tracking_home_path - 
self.tracking_away_path = tracking_away_path - self.tracking_path = tracking_path - self.meta_data = meta_data - self.statsbomb_api_args = statsbomb_api_args - self.statsbomb_match_id = statsbomb_match_id - self.sb360_path = sb360_path - self.skillcorner_match_id = skillcorner_match_id - self.max_workers = max_workers - self.match_id_df = match_id_df - self.statsbomb_event_dir = statsbomb_event_dir - self.skillcorner_tracking_dir = skillcorner_tracking_dir - self.skillcorner_match_dir = skillcorner_match_dir - self.preprocess_method = preprocess_method - self.wyscout_matches_path=wyscout_matches_path - self.st_track_path = st_track_path - self.st_meta_path = st_meta_path - self.preprocess_tracking = preprocess_tracking - self.verbose = verbose - self.call_preprocess = False + self.bp_event_path = bp_event_path + self.bp_meta_data = bp_meta_data + self.sb_event_path = sb_event_path + self.sc_tracking_path = sc_tracking_path + self.sc_match_path = sc_match_path + self.sc_players_path=sc_players_path - def load_data_single_file(self): + def load_data(self): #based on the data provider, load the dataloading function from load_data.py (single file) if self.data_provider == 'bepro': - df=soccer_load_data.load_bepro(self.bp_tracking_xml_path, self.bp_tracking_json_paths, self.event_path) - elif self.data_provider == 'pff_fc': - df=soccer_load_data.load_pff2metrica(self.event_path, match_id=self.match_id) - elif self.data_provider == 'robocup_2d': - df=soccer_load_data.load_robocup_2d(self.event_path,match_id=self.match_id,tracking_path=self.tracking_path) + df=soccer_load_data.load_bepro(self.bp_tracking_xml_path, self.bp_tracking_json_paths, self.bp_event_path, self.bp_meta_data) elif self.data_provider == 'statsbomb_skillcorner': - df=soccer_load_data.load_statsbomb_skillcorner(statsbomb_event_dir=self.statsbomb_event_dir, skillcorner_tracking_dir=self.skillcorner_tracking_dir, skillcorner_match_dir=self.skillcorner_match_dir, statsbomb_match_id=self.statsbomb_match_id, 
skillcorner_match_id=self.skillcorner_match_id) - if self.preprocess_tracking and not self.call_preprocess: - df=soccer_tracking_data.statsbomb_skillcorner_tracking_data_preprocessing(df) - if self.preprocess_method is not None and not self.call_preprocess: - df=soccer_tracking_data.statsbomb_skillcorner_event_data_preprocessing(df,process_event_coord=False) - elif self.data_provider == 'datastadium': - df=soccer_load_data.load_datastadium(self.event_path,self.tracking_home_path,self.tracking_away_path) + df=soccer_load_data.load_statsbomb_skillcorner(sb_event_path=self.sb_event_path, sc_tracking_path=self.sc_tracking_path, sc_match_path=self.sc_match_path, sc_players_path=self.sc_players_path) + elif self.data_provider == 'pff_fc': + df=soccer_load_data.load_pff2metrica(self.bp_event_path) + # elif self.data_provider == 'robocup_2d': + # df=soccer_load_data.load_robocup_2d(self.event_path,match_id=self.match_id,tracking_path=self.tracking_path) + # elif self.data_provider == 'datastadium': + # df=soccer_load_data.load_datastadium(self.event_path,self.tracking_home_path,self.tracking_away_path) else: raise ValueError('Data provider not supported or not found') return df - - def load_data(self): - print(f'Loading data from {self.data_provider}') - #check if the event path is a single file or a directory - if ((self.event_path is not None and os.path.isfile(self.event_path)) and self.data_provider != 'statsbomb') or \ - (self.data_provider == 'statsbomb' and self.statsbomb_match_id is None and os.path.isfile(self.event_path)) or \ - (self.data_provider == 'statsbomb_skillcorner' and self.statsbomb_match_id is not None): - df = self.load_data_single_file() - #load data from multiple files - elif (self.event_path is not None and os.path.isdir(self.event_path)) or self.data_provider == 'statsbomb' or \ - (self.data_provider == 'statsbomb_skillcorner' and self.statsbomb_match_id is None and self.skillcorner_match_id is None): - #statsbomb_skillcorner - if 
self.data_provider == 'statsbomb_skillcorner': - out_df_list = [] - self.match_id_df = pd.read_csv(self.match_id_df) - with ThreadPoolExecutor(max_workers=self.max_workers) as executor: - # Submit tasks to the executor - futures = [executor.submit(self.load_match_statsbomb_skillcorner, i, self.match_id_df, - self.statsbomb_event_dir,self.skillcorner_tracking_dir,self.skillcorner_match_dir) - for i in range(len(self.match_id_df))] - # Collect the results as they complete - for future in tqdm(as_completed(futures), total=len(futures)): - out_df_list.append(future.result()) - df = pd.concat(out_df_list) - #other data providers - elif self.data_provider in ['datafactory','opta','wyscout','pff_fc']: - event_path = self.event_path - files = sorted(os.listdir(self.event_path)) - files = [f for f in files if not f.startswith('.')] - if self.data_provider == "pff_fc": - #only json files - files = [f for f in files if f.endswith('.json')] - out_df_list = [] - if self.data_provider == "opta": - if self.match_id is None: - match_id=self.match_id - elif self.data_provider == "wyscout": - matches_path=self.wyscout_matches_path - count=0 - for f in tqdm(files, total=len(files)): - if self.data_provider == "opta": - if self.match_id is None: - self.match_id = match_id[count] - else: - self.match_id = count - count+=1 - elif self.data_provider == "wyscout": - self.wyscout_matches_path=os.path.join(matches_path, f.replace("events_","matches_")) - elif self.data_provider == "pff_fc": - self.match_id = f.split(".")[0] - self.event_path = os.path.join(event_path, f) - df = self.load_data_single_file() - out_df_list.append(df) - df = pd.concat(out_df_list) - self.event_path = event_path - if self.data_provider == "opta": - self.match_id = match_id - elif self.data_provider == "wyscout": - self.wyscout_matches_path=matches_path - # other data providers - elif self.data_provider in ['metrica','robocup_2d','sportec']: - #warnging that the event data and tracking data will be matched via 
the file name - print('Warning: Event data and tracking data will be matched via the file name') - event_path = self.event_path - files = sorted(os.listdir(self.event_path)) - files = [f for f in files if not f.startswith('.')] - out_df_list = [] - if self.data_provider in ['metrica']: - tracking_home_path = self.tracking_home_path - tracking_away_path = self.tracking_away_path - for f in files: - self.event_path = os.path.join(event_path, f) - self.tracking_home_path = os.path.join(tracking_home_path,f.replace("RawEventsData","RawTrackingData_Home_Team")) - self.tracking_away_path = os.path.join(tracking_away_path,f.replace("RawEventsData","RawTrackingData_Away_Team")) - #check if the tracking data exists - if os.path.isfile(self.tracking_home_path) and os.path.isfile(self.tracking_away_path): - df = self.load_data_single_file() - out_df_list.append(df) - else: - print(f'Tracking data not found for {f}') - df = pd.concat(out_df_list) - self.event_path = event_path - self.tracking_home_path = tracking_home_path - self.tracking_away_path = tracking_away_path - elif self.data_provider == 'robocup_2d': - tracking_path = self.tracking_path - for f in files: - self.event_path = os.path.join(event_path, f) - self.tracking_path = os.path.join(tracking_path,f.replace("pass","")) - self.match_id = f.replace("pass","").replace(".csv","") - if os.path.isfile(self.tracking_path): - df = self.load_data_single_file() - out_df_list.append(df) - else: - print(f'Tracking data not found for {f}') - df = pd.concat(out_df_list) - self.event_path = event_path - self.tracking_path = tracking_path - self.match_id = None - elif self.data_provider == 'sportec': - tracking_path = self.tracking_path - meta_path = self.meta_data - for f in files: - self.event_path = os.path.join(event_path, f) - self.tracking_path = os.path.join(tracking_path,f.replace("events","positional")) - self.meta_path = os.path.join(meta_path,f.replace("events","meta")) - if os.path.isfile(self.tracking_path) and 
os.path.isfile(self.meta_path): - df = self.load_data_single_file() - out_df_list.append(df) - else: - print(f'Tracking data or Meta data not found for {f}') - df = pd.concat(out_df_list) - self.event_path = event_path - self.tracking_path = tracking_path - self.meta_path = meta_path - # statsbomb - elif self.data_provider == 'statsbomb': - print('Warning: Event data and 360 data will be matched via the file name') - out_df_list = [] - if self.statsbomb_match_id is None: - files = sorted(os.listdir(self.event_path)) - files = [f for f in files if not f.startswith('.')] - event_path = self.event_path - sb360_path = self.sb360_path - def process_file(f): - event_path_local = os.path.join(event_path, f) - sb360_path_local = os.path.join(sb360_path, f) if sb360_path is not None else None - self.event_path = event_path_local - self.sb360_path = sb360_path_local - return self.load_data_single_file() - - with ThreadPoolExecutor(max_workers=self.max_workers) as executor: - futures = {executor.submit(process_file, f): f for f in files} - for future in tqdm(as_completed(futures), total=len(futures)): - result = future.result() - if result is not None: - out_df_list.append(result) - - df = pd.concat(out_df_list) - self.event_path = event_path - self.sb360_path = sb360_path - else: - if isinstance(self.statsbomb_match_id, list): - files = self.statsbomb_match_id - else: - files = [self.statsbomb_match_id] - - def process_id(f): - self.statsbomb_match_id = str(f) - return self.load_data_single_file() - - for f in tqdm(files, total=len(files)): - out_df_list.append(process_id(f)) - - df = pd.concat(out_df_list) - self.statsbomb_match_id = files - # datastadium - elif self.data_provider == "datastadium": - out_df_list = [] - - event_dir = self.event_path - - def process_event_folder(f): - # Define file paths for the current event folder - self.event_path = os.path.join(event_dir, f, 'play.csv') - self.tracking_home_path = os.path.join(event_dir, f, 'home_tracking.csv') - 
self.tracking_away_path = os.path.join(event_dir, f, 'away_tracking.csv') - - # Load data - df = self.load_data_single_file() - return df - - # Initialize ThreadPoolExecutor - with ThreadPoolExecutor(max_workers=self.max_workers) as executor: - # Get list of event folders - event_folders = sorted(f for f in os.listdir(self.event_path) if not (f.startswith('.') or f.startswith('@'))) - # Submit tasks to the executor - future_to_event = {executor.submit(process_event_folder, folder): folder for folder in event_folders} - # Collect results - out_df_list = [] - for future in tqdm(as_completed(future_to_event), total=len(future_to_event)): - try: - df = future.result() - out_df_list.append(df) - except Exception as e: - print(f'Error processing folder {future_to_event[future]}: {e}') - self.event_path = event_dir - df = pd.concat(out_df_list) - - else: - raise ValueError('Event path is not a valid file or directory') - print(f'Loaded data from {self.data_provider}') - return df - - def load_match_statsbomb_skillcorner(self,i, match_id_df, statsbomb_skillcorner_event_path, - statsbomb_skillcorner_tracking_path, statsbomb_skillcorner_match_path): - statsbomb_match_id = match_id_df.loc[i, "match_id_statsbomb"] - skillcorner_match_id = match_id_df.loc[i, "match_id_skillcorner"] - try: - statsbomb_skillcorner_df = soccer_load_data.load_statsbomb_skillcorner( - statsbomb_skillcorner_event_path, - statsbomb_skillcorner_tracking_path, - statsbomb_skillcorner_match_path, - statsbomb_match_id, - skillcorner_match_id - ) - except: #Exception as e: - # print("An error occurred:", e) - print(f"Skipped match statsbomb match_id: {statsbomb_match_id}") - statsbomb_skillcorner_df=None - return statsbomb_skillcorner_df - - def preprocessing_single_df(self,df): - df_out=None - if self.data_provider in ["statsbomb", "wyscout","statsbomb_skillcorner","datastadium"]: - if self.data_provider in ["statsbomb","statsbomb_skillcorner"]: - df = df.reset_index(drop=True) - 
df_out=soccer_processing.UIED_statsbomb(df) - elif self.data_provider == "datastadium": - df_out=soccer_processing.UIED_datastadium(df) - elif self.data_provider == "wyscout": - if self.preprocess_method == "UIED": - df_out=soccer_processing.UIED_wyscout(df) - elif self.preprocess_method == "LEM": - df_out=soccer_processing.lem(df) - elif self.preprocess_method == "NMSTPP": - df_out=soccer_processing.nmstpp(df) - elif self.preprocess_method == "SEQ2EVENT": - df_out=soccer_processing.seq2event(df) - else: - raise ValueError(f'Preprocessing method {self.preprocess_method} not found') - else: - raise ValueError(f'Preprocessing method not supported for {self.data_provider}') - return df_out - - def preprocessing(self): - self.call_preprocess = True - print(f'Preprocessing data from {self.data_provider} with method {self.preprocess_method}') - if self.preprocess_method is not None: - df = self.load_data() - out_df_list = [] - - # df_out=self.preprocessing_single_df(df) - # return df_out - - def process_single_match(match_id): - df_single = df[df.match_id == match_id] - return self.preprocessing_single_df(df_single) - - unique_match_ids = df.match_id.unique() - # unique_match_ids = [df.match_id.unique()[0]] - - with ThreadPoolExecutor(max_workers=self.max_workers) as executor: - future_to_match_id = {executor.submit(process_single_match, match_id): match_id for match_id in unique_match_ids} - - for future in tqdm(as_completed(future_to_match_id), total=len(future_to_match_id)): - match_id = future_to_match_id[future] - try: - df_single = future.result() - out_df_list.append(df_single) - except Exception as e: - print(f'Exception for match_id {match_id}: {e}') - - df = pd.concat(out_df_list) if len(out_df_list) > 1 else out_df_list[0] - df = df.reset_index(drop=True) - df['index_column'] = df.index - df = df.sort_values(by=['match_id', "index_column"]) - df = df.drop(columns=['index_column']) - else: - raise ValueError('Preprocessing method not found') - 
print(f'Preprocessed data from {self.data_provider} with method {self.preprocess_method}') - self.call_preprocess = False - return df - -if __name__ == '__main__': - datafactory_path=os.getcwd()+"/test/sports/event_data/data/datafactory/datafactory_events.json" - metrica_event_json_path=os.getcwd()+"/test/sports/event_data/data/metrica/metrica_events.json" - metrica_event_csv_path=os.getcwd()+"/test/sports/event_data/data/metrica/Sample_Game_1/Sample_Game_1_RawEventsData.csv" - metrica_tracking_home_path=os.getcwd()+"/test/sports/event_data/data/metrica/Sample_Game_1/Sample_Game_1_RawTrackingData_Home_Team.csv" - metrica_tracking_away_path=os.getcwd()+"/test/sports/event_data/data/metrica/Sample_Game_1/Sample_Game_1_RawTrackingData_Away_Team.csv" - opta_f7_path=os.getcwd()+"/test/sports/event_data/data/opta/opta_f7.xml" - opta_f24_path=os.getcwd()+"/test/sports/event_data/data/opta/opta_f24.xml" - robocup_2d_event_path=os.getcwd()+"/test/sports/event_data/data/robocup_2d/202307091024-HELIOS2023_1-vs-CYRUS_0-pass.csv" - robocup_2d_tracking_path=os.getcwd()+"/test/sports/event_data/data/robocup_2d/202307091024-HELIOS2023_1-vs-CYRUS_0.csv" - sportec_event_path=os.getcwd()+"/test/sports/event_data/data/sportec/sportec_events.xml" - sportec_tracking_path=os.getcwd()+"/test/sports/event_data/data/sportec/sportec_positional.xml" - sportec_meta_path=os.getcwd()+"/test/sports/event_data/data/sportec/sportec_meta.xml" - statsbomb_event_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/events/3805010.json" - statsbomb_360_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/three-sixty/3805010.json" - statsbomb_api_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/api.json" - statsbomb_skillcorner_event_path="/data_pool_1/laliga_23/statsbomb/events" - statsbomb_skillcorner_tracking_path="/data_pool_1/laliga_23/skillcorner/tracking" - statsbomb_skillcorner_match_path="/data_pool_1/laliga_23/skillcorner/match" - 
wyscout_event_path=os.getcwd()+"/test/sports/event_data/data/wyscout/events_England.json" - datastadium_event_path=os.getcwd()+"/test/sports/event_data/data/datastadium/2019022307/play.csv" - datastadium_tracking_home_path=os.getcwd()+"/test/sports/event_data/data/datastadium/2019022307/home_tracking.csv" - datastadium_tracking_away_path=os.getcwd()+"/test/sports/event_data/data/datastadium/2019022307/away_tracking.csv" - - #test single file - - #test load_datafactory - # datafactory_df=Event_data(data_provider='datafactory',event_path=datafactory_path).load_data() - # datafactory_df.to_csv(os.getcwd()+"/test/sports/event_data/data/datafactory/test_data_main.csv",index=False) - - #test load_metrica - # metrica_df=Event_data(data_provider='metrica',event_path=metrica_event_csv_path,match_id=1, - # tracking_home_path=metrica_tracking_home_path,tracking_away_path=metrica_tracking_away_path).load_data() - # metrica_df.to_csv(os.getcwd()+"/test/sports/event_data/data/metrica/test_data_csv_main.csv",index=False) - # metrica_df=Event_data(data_provider='metrica',event_path=metrica_event_json_path,match_id=1).load_data() - # metrica_df.to_csv(os.getcwd()+"/test/sports/event_data/data/metrica/test_data_json_main.csv",index=False) - - #test load_opta_xml - # opta_df=Event_data(data_provider='opta',event_path=opta_f24_path,match_id=1).load_data() - # opta_df.to_csv(os.getcwd()+"/test/sports/event_data/data/opta/test_data_main.csv",index=False) - - #test load_robocup_2d - # robocup_2d_df=Event_data(data_provider='robocup_2d',event_path=robocup_2d_event_path,match_id=1,tracking_path=robocup_2d_tracking_path).load_data() - # robocup_2d_df.to_csv(os.getcwd()+"/test/sports/event_data/data/robocup_2d/test_data_main.csv",index=False) - - #test load_sportec - # sportec_df = Event_data(data_provider='sportec', event_path=sportec_event_path, tracking_path=sportec_tracking_path, meta_data=sportec_meta_path).load_data() - # 
sportec_df.to_csv(os.getcwd()+"/test/sports/event_data/data/sportec/test_data_main.csv",index=False) - - #test load_statsbomb with json file - # statsbomb_df=Event_data(data_provider='statsbomb',event_path=statsbomb_event_path,sb360_path=statsbomb_360_path).load_data() - # statsbomb_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_data_main.csv",index=False) - - # test load_statsbomb with api data - # statsbomb_df=Event_data(data_provider='statsbomb',statsbomb_match_id=3795108).load_data() - # statsbomb_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_api_data_main.csv",index=False) - - #test load_statsbomb_skillcorner - # statsbomb_skillcorner_df=Event_data(data_provider='statsbomb_skillcorner', - # statsbomb_event_dir=statsbomb_skillcorner_event_path, - # skillcorner_tracking_dir=statsbomb_skillcorner_tracking_path, - # skillcorner_match_dir=statsbomb_skillcorner_match_path, - # statsbomb_match_id=3894907, - # skillcorner_match_id=1553748 - # ).load_data() - # statsbomb_skillcorner_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_data_main.csv",index=False) - - #test load_wyscout - # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path).load_data() - # wyscout_df.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_data_main.csv",index=False) - - # test load_datastadium - # datastadium_df=Event_data(data_provider='datastadium',event_path=datastadium_event_path, - # tracking_home_path=datastadium_tracking_home_path,tracking_away_path=datastadium_tracking_away_path).load_data() - # datastadium_df.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/load_class_single.csv",index=False) - - - - #test preprocessing - # seq2event - # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,preprocess_method="SEQ2EVENT",max_workers=10).preprocessing() - # 
wyscout_df.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_seq2event_main.csv",index=False) - - #test nmstpp - # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,preprocess_method="NMSTPP",max_workers=10).preprocessing() - # wyscout_df.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_nmstpp_main.csv",index=False) - - #test lem - # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,preprocess_method="LEM",max_workers=10).preprocessing() - # wyscout_df.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_lem_main.csv",index=False) - - #test UIED wyscout - # df_wyscout=Event_data(data_provider='wyscout',event_path=wyscout_event_path,preprocess_method="UIED",max_workers=10).preprocessing() - # df_wyscout.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_wyscout_UIED_main.csv",index=False) - - #test UIED statsbomb_skillcorner - # df_statsbomb_skillcorner=Event_data(data_provider='statsbomb_skillcorner', - # statsbomb_event_dir=statsbomb_skillcorner_event_path, - # skillcorner_tracking_dir=statsbomb_skillcorner_tracking_path, - # skillcorner_match_dir=statsbomb_skillcorner_match_path, - # statsbomb_match_id=3894907, - # skillcorner_match_id=1553748, - # preprocess_method="UIED", - # max_workers=10).preprocessing() - # df_statsbomb_skillcorner.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_preprocess_statsbomb_skillcorner_UIED_main.csv",index=False) - - #test UIED statsbomb_json - # df_statsbomb_json=Event_data(data_provider='statsbomb',event_path=statsbomb_event_path,sb360_path=statsbomb_360_path,preprocess_method="UIED").preprocessing() - # df_statsbomb_json.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_preprocess_statsbomb_json_UIED_main.csv",index=False) - - #test UIED statsbomb_api - # 
df_statsbomb_api=Event_data(data_provider='statsbomb',statsbomb_match_id=3795108,preprocess_method="UIED").preprocessing() - # df_statsbomb_api.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_preprocess_statsbomb_api_UIED_main.csv",index=False) - - #test UIED datastadium - # df_datastadium=Event_data(data_provider='datastadium',event_path=datastadium_event_path, - # tracking_home_path=datastadium_tracking_home_path,tracking_away_path=datastadium_tracking_away_path, - # preprocess_method="UIED").preprocessing() - # df_datastadium.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/preprocess_UIED_class_single.csv",index=False) - - - - - - - - - - # multiple files - # statsbomb_df=Event_data(data_provider='statsbomb',statsbomb_match_id=[3788742,3788741]).load_data() - # statsbomb_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_api_data_main_multi.csv",index=False) - - #test load_statsbomb_skillcorner - # statsbomb_skillcorner_df=Event_data(data_provider='statsbomb_skillcorner', - # statsbomb_event_dir=statsbomb_skillcorner_event_path, - # skillcorner_tracking_dir=statsbomb_skillcorner_tracking_path, - # skillcorner_match_dir=statsbomb_skillcorner_match_path, - # match_id_df=os.getcwd()+'/preprocessing/example/id_matching.csv', - # max_workers=10).load_data() - # statsbomb_skillcorner_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_data_main_multi.csv",index=False) - - #test load_statsbomb_json - # multi_event_path="/data_pool_1/statsbomb_2023/events_and_frames/data/events" - # multi_360_path="/data_pool_1/statsbomb_2023/events_and_frames/data/360-frames" - - # statsbomb_df=Event_data(data_provider='statsbomb',event_path=multi_event_path,sb360_path=multi_360_path,max_workers=10).load_data() - # statsbomb_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_data_main_multi.csv",index=False) - - #test load_wyscout - # 
wyscout_event_path="/home/c_yeung/workspace6/python/openstarlab/PreProcessing/test/sports/event_data/data/wyscout/event" - # wyscout_matches_path="/home/c_yeung/workspace6/python/openstarlab/PreProcessing/test/sports/event_data/data/wyscout/matches" - # wyscout_df=Event_data(data_provider='wyscout', - # event_path=wyscout_event_path, - # wyscout_matches_path=wyscout_matches_path, - # max_workers=10).load_data() - # wyscout_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_data_main_multi.csv",index=False) - - #test load_datastadium multiple files - # datastadium_df=Event_data(data_provider='datastadium',event_path=datastadium_dir,max_workers=10).load_data() - # datastadium_df.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/load_class_multi.csv",index=False) - - #test preprocessing multi files - # wyscout_event_path="/home/c_yeung/workspace6/python/openstarlab/PreProcessing/test/sports/event_data/data/wyscout/event" - # wyscout_matches_path="/home/c_yeung/workspace6/python/openstarlab/PreProcessing/test/sports/event_data/data/wyscout/matches" - # statsbomb_df=Event_data(data_provider='statsbomb',statsbomb_match_id=[3788742,3788741]).load_data() - # statsbomb_df.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_api_data_main_multi.csv",index=False) - #seq2event - # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,wyscout_matches_path=wyscout_matches_path, - # preprocess_method="SEQ2EVENT",max_workers=10).preprocessing() - # wyscout_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_seq2event_main_multi.csv",index=False) - - #nmstpp - # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,wyscout_matches_path=wyscout_matches_path, - # preprocess_method="NMSTPP",max_workers=10).preprocessing() - # wyscout_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_seq2event_main_multi.csv",index=False) 
- - #lem - # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,wyscout_matches_path=wyscout_matches_path, - # preprocess_method="LEM",max_workers=10).preprocessing() - # wyscout_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_seq2event_main_multi.csv",index=False) - - #UIED wyscout - # wyscout_df=Event_data(data_provider='wyscout',event_path=wyscout_event_path,wyscout_matches_path=wyscout_matches_path, - # preprocess_method="UIED",max_workers=10).preprocessing() - # wyscout_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_seq2event_main_multi.csv",index=False) - - #UIED statsbomb_skillcorner - # statsbomb_skillcorner_df=Event_data(data_provider='statsbomb_skillcorner', - # statsbomb_event_dir=statsbomb_skillcorner_event_path, - # skillcorner_tracking_dir=statsbomb_skillcorner_tracking_path, - # skillcorner_match_dir=statsbomb_skillcorner_match_path, - # match_id_df=os.getcwd()+'/preprocessing/example/id_matching.csv', - # preprocess_method="UIED", - # ).preprocessing() - # statsbomb_skillcorner_df.head(1000).to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_preprocess_statsbomb_skillcorner_UIED_main_multi.csv",index=False) - - #UIED statsbomb_json - # multi_event_path="/data_pool_1/statsbomb_2023/events_and_frames/data/events" - # multi_360_path="/data_pool_1/statsbomb_2023/events_and_frames/data/360-frames" - - # statsbomb_df=Event_data(data_provider='statsbomb',event_path=multi_event_path,sb360_path=multi_360_path,preprocess_method="UIED",max_workers=10).preprocessing() - # statsbomb_df.head(10000).to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_preprocess_statsbomb_json_UIED_main_multi.csv",index=False) - - #UIED statsbomb_api (could not test due to Max retries exceeded) - - #test UIED datastadium multiple files - # 
df_datastadium=Event_data(data_provider='datastadium',event_path=datastadium_dir,preprocess_method="UIED",max_workers=10).preprocessing() - # df_datastadium.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/preprocess_UIED_class_multi.csv",index=False) - - #test soccertrack - soccer_track_event_path="/data_pool_1/soccertrackv2/2024-03-18/Event/event.csv" - soccer_track_tracking_path="/data_pool_1/soccertrackv2/2024-03-18/Tracking/tracking.xml" - soccer_track_meta_path="/data_pool_1/soccertrackv2/2024-03-18/Tracking/meta.xml" - df_soccertrack=Soccer_phase_data('soccertrack',soccer_track_event_path, - st_track_path = soccer_track_tracking_path, - st_meta_path = soccer_track_meta_path, - verbose = True).load_data() - df_soccertrack.to_csv(os.getcwd()+"/test/sports/event_data/data/soccertrack/test_load_soccer_event_class.csv",index=False) - print("-----------done-----------") diff --git a/preprocessing/sports/phase_data/soccer/soccer_plot_row.py b/preprocessing/sports/phase_data/soccer/soccer_plot_row.py deleted file mode 100644 index f1024a8..0000000 --- a/preprocessing/sports/phase_data/soccer/soccer_plot_row.py +++ /dev/null @@ -1,175 +0,0 @@ -import matplotlib.pyplot as plt -import pandas as pd -import matplotlib.patches as patches -import os -import pdb - -FIELD_LENGTH = 105.0 # unit: meters -FIELD_WIDTH = 68.0 # unit: meters -GOAL_WIDTH = 7.32 # unit: meters -PENALTY_X = 105.0/2-16.5 # left point (unit: meters) -PENALTY_Y = 40.32 # upper point (unit: meters) - - -def plot_row_soccer(df, row, save_path): - if not isinstance(df, pd.DataFrame): - if isinstance(df, str): - df = pd.read_csv(df) - else: - raise ValueError("The input is not a dataframe or a path to a csv file") - - fig, ax = plt.subplots(figsize=(8, 6)) - fig.subplots_adjust(bottom=0.2) - - # Flip the y-axis - ax.invert_yaxis() - - # Plot the pitch - - # Center line - ax.plot([FIELD_LENGTH/2, FIELD_LENGTH/2], [0, FIELD_WIDTH], color="black", linewidth=0.7) - - # Penalty areas - # 
pdb.set_trace() - ax.plot([PENALTY_X+FIELD_LENGTH/2, FIELD_LENGTH], [(FIELD_WIDTH-PENALTY_Y)/2, (FIELD_WIDTH-PENALTY_Y)/2], color="black", linewidth=0.7) - ax.plot([PENALTY_X+FIELD_LENGTH/2, FIELD_LENGTH], [(FIELD_WIDTH+PENALTY_Y)/2, (FIELD_WIDTH+PENALTY_Y)/2], color="black", linewidth=0.7) - ax.plot([PENALTY_X+FIELD_LENGTH/2, PENALTY_X+FIELD_LENGTH/2,], [(FIELD_WIDTH-PENALTY_Y)/2, (FIELD_WIDTH+PENALTY_Y)/2], color="black", linewidth=0.7) - - ax.plot([FIELD_LENGTH/2-PENALTY_X, 0], [(FIELD_WIDTH-PENALTY_Y)/2, (FIELD_WIDTH-PENALTY_Y)/2], color="black", linewidth=0.7) - ax.plot([FIELD_LENGTH/2-PENALTY_X, 0], [(FIELD_WIDTH+PENALTY_Y)/2, (FIELD_WIDTH+PENALTY_Y)/2], color="black", linewidth=0.7) - ax.plot([FIELD_LENGTH/2-PENALTY_X, FIELD_LENGTH/2-PENALTY_X], [(FIELD_WIDTH-PENALTY_Y)/2, (FIELD_WIDTH+PENALTY_Y)/2], color="black", linewidth=0.7) - - # Goal areas - ax.plot([5.5, 0], [(FIELD_WIDTH-18.32)/2, (FIELD_WIDTH-18.32)/2], color="black", linewidth=0.7) - ax.plot([5.5, 0], [(FIELD_WIDTH+18.32)/2, (FIELD_WIDTH+18.32)/2], color="black", linewidth=0.7) - ax.plot([5.5, 5.5], [(FIELD_WIDTH-18.32)/2, (FIELD_WIDTH+18.32)/2], color="black", linewidth=0.7) - - ax.plot([FIELD_LENGTH-5.5, FIELD_LENGTH], [(FIELD_WIDTH-18.32)/2, (FIELD_WIDTH-18.32)/2], color="black", linewidth=0.7) - ax.plot([FIELD_LENGTH-5.5, FIELD_LENGTH], [(FIELD_WIDTH+18.32)/2, (FIELD_WIDTH+18.32)/2], color="black", linewidth=0.7) - ax.plot([FIELD_LENGTH-5.5, FIELD_LENGTH-5.5], [(FIELD_WIDTH-18.32)/2, (FIELD_WIDTH+18.32)/2], color="black", linewidth=0.7) - - # # Goals - # ax.plot([-2, -2], [(FIELD_WIDTH-GOAL_WIDTH)/2, (FIELD_WIDTH+GOAL_WIDTH)/2], color="black", linewidth=10) - # ax.plot([FIELD_LENGTH+2, FIELD_LENGTH+2], [(FIELD_WIDTH-GOAL_WIDTH)/2, (FIELD_WIDTH+GOAL_WIDTH)/2], color="black", linewidth=10) - - # Field outline - ax.plot([0, FIELD_LENGTH], [0, 0], color="black", linewidth=2) - ax.plot([0, FIELD_LENGTH], [FIELD_WIDTH, FIELD_WIDTH], color="black", linewidth=2) - ax.plot([0, 0], [0, FIELD_WIDTH], 
color="black", linewidth=2) - ax.plot([FIELD_LENGTH, FIELD_LENGTH], [0, FIELD_WIDTH], color="black", linewidth=2) - - # Center circle - c = patches.Circle(xy=(FIELD_LENGTH/2, FIELD_WIDTH/2), radius=9.15, fill=False, ec='black', linewidth=0.7) - ax.add_patch(c) - - # Penalty arcs - a = patches.Arc((11, FIELD_WIDTH/2), 9.15*2, 9.15*2, theta1=270+37, theta2=90-37, linewidth=0.7) - ax.add_patch(a) - a = patches.Arc((FIELD_LENGTH-11, FIELD_WIDTH/2), 9.15*2, 9.15*2, theta1=90+36, theta2=270-36, linewidth=0.7) - # a = patches.Arc((-FIELD_LENGTH / 2 + 11, 0), 9.15*2, 9.15*2, theta1=270+34, theta2=90-34, linewidth=0.7) - ax.add_patch(a) - - # Set axis limits - ax.set_xlim(-5, FIELD_LENGTH+5) - ax.set_ylim(FIELD_WIDTH+5, -5) - - # Plot the player positions - df = df.reset_index(drop=True) - - row_df = df.iloc[row] - - # Define possession team actions - team_actions =[ 'Pass_Ground Pass', 'Pass_Long_HighPass', - 'Carry_nan', 'Pass_High Pass', 'Pass_Low Pass', - 'Miscontrol_nan', - 'Dribble_nan', 'Clearance_nan', 'Pass_Cross', 'Ball Recovery_nan', - 'Offside_nan', 'Goal Keeper_nan', - 'Dribbled Past_nan', 'Pass_Corner', - 'Shot_Saved', 'Shot_Blocked', 'Shot_Wayward', 'Shot_Off T', 'Shot_Goal', 'Shot_Post', - 'Tactical Shift_nan', 'Shield_nan', - 'Own Goal Against_Own goal', 'Error_nan', - 'Shot_Saved Off Target', 'Ball Receipt*_nan', 'Pressure_nan', 'Interception_nan' - ] - - def plot_player(row_df, ax, switch=False): - if not switch: - for i in range(1, 24): - x = row_df[f"h{i}_x"]+FIELD_LENGTH/2 - y = -(row_df[f"h{i}_y"])+FIELD_WIDTH/2 - if x == 0 and y == 0: - continue - ax.plot(x, y, 'o', color='red') - for i in range(1, 24): - x = row_df[f"a{i}_x"]+FIELD_LENGTH/2 - y = -(row_df[f"a{i}_y"])+FIELD_WIDTH/2 - if x == 0 and y == 0: - continue - ax.plot(x, y, 'o', color='blue') - else: - for i in range(1, 24): - x = -(row_df[f"h{i}_x"])+FIELD_LENGTH/2 - y = (row_df[f"h{i}_y"])+FIELD_WIDTH/2 - if x == 0 and y == 0: - continue - ax.plot(x, y, 'o', color='red') - for i in range(1, 
24): - x = -(row_df[f"a{i}_x"])+FIELD_LENGTH/2 - y = (row_df[f"a{i}_y"])+FIELD_WIDTH/2 - if x == 0 and y == 0: - continue - ax.plot(x, y, 'o', color='blue') - - #check if col 'action' exists - switch_flag = False - if 'action' in df.columns: - x = row_df["start_x"] - y = row_df["start_y"] - home_team = row_df['home_team'] - home_side = row_df['home_side'] - if home_team == 1 and home_side == 'right': - plot_player(row_df, ax, switch=True) - switch_flag = True - elif home_team == 0 and home_side == 'left': - plot_player(row_df, ax, switch=True) - switch_flag = True - else: - plot_player(row_df, ax, switch=False) - switch_flag = False - elif 'event_type' in df.columns: - x = row_df["start_x"]*(1.05/1.2) - y = row_df["start_y"]*(0.68/0.8) - home_team = row_df['home_team'] - home_side = row_df['home_side'] - action = str(row_df["event_type"])+ "_" + str(row_df["event_type_2"]).replace("None","nan") - poss_team_action = True if action in team_actions else False - if poss_team_action: - if home_team == 1 and home_side == 'right': - plot_player(row_df, ax, switch=True) - switch_flag = True - elif home_team == 0 and home_side == 'left': - plot_player(row_df, ax, switch=True) - switch_flag = True - else: - plot_player(row_df, ax, switch=False) - switch_flag = False - else: - if home_team == 1 and home_side == 'right': - plot_player(row_df, ax, switch=False) - switch_flag = False - elif home_team == 0 and home_side == 'left': - plot_player(row_df, ax, switch=False) - switch_flag = False - else: - plot_player(row_df, ax, switch=True) - switch_flag = True - - - #plot the event location - ax.plot(x, y, 'o', color='black', markersize=3) - - # Set the figure title - ax.set_title(f"Row {row}, action: {action}, seconds: {row_df['seconds']}, home : {row_df.home_team}, switch: {switch_flag}\n red: home team, blue: away team, black: event location") - - # Save the plot - plt.savefig(save_path + f"/row_{row}.png") - plt.close(fig) diff --git 
a/preprocessing/sports/phase_data/soccer/soccer_processing.py b/preprocessing/sports/phase_data/soccer/soccer_processing.py deleted file mode 100644 index 3e9a665..0000000 --- a/preprocessing/sports/phase_data/soccer/soccer_processing.py +++ /dev/null @@ -1,1554 +0,0 @@ -import os -import pandas as pd -import numpy as np -import pdb - -def seq2event(data): - """ - Processes soccer match event data to determine possession, filter actions, - compute additional metrics, and normalize data. - - Parameters: - data (pd.DataFrame or str): A pandas DataFrame containing event data or a file path to a CSV file. - - Returns: - pd.DataFrame: A processed DataFrame with simplified and normalized event actions. - """ - - # Load data from DataFrame or file path - if isinstance(data, pd.DataFrame): - df = data - elif isinstance(data, str): - if os.path.exists(data): - df = pd.read_csv(data) - else: - raise FileNotFoundError("The file path does not exist") - else: - raise ValueError("The data must be a pandas DataFrame or a file path") - df = df.copy() - # Create 'action' column by concatenating 'event_type' and 'event_type_2' - df.loc[:, "action"] = df["event_type"].astype(str) + "_" + df["event_type_2"].astype(str) - - # Define possession team actions - possession_team_actions = [ - 'Free Kick_Goal kick', 'Free Kick_Throw in', 'Free Kick_Corner', 'Free Kick_Free Kick', - 'Free Kick_Free kick cross', 'Free Kick_Free kick shot', 'Free Kick_Penalty', 'Pass_Cross', - 'Pass_Hand pass', 'Pass_Head pass', 'Pass_High pass', 'Pass_Launch', 'Pass_Simple pass', - 'Pass_Smart pass', 'Shot_Shot', 'Shot_Goal', 'Free Kick_goal', 'Duel_Ground attacking duel_off dribble', - 'Others on the ball_Acceleration', 'Others on the ball_Clearance', 'Others on the ball_Touch_good', - 'Shot_Own_goal', 'Pass_Own_goal', 'Others on the ball_Own_goal' - ] - - possession = [] - seconds = [] - - # Determine possession and adjust seconds for second half - for i in range(len(df)): - if i == 0: - 
possession.append(df["team"].iloc[i]) - else: - if df["team"].iloc[i] == df["team"].iloc[i - 1]: - possession.append(df["team"].iloc[i]) - else: - if df["action"].iloc[i] in possession_team_actions: - possession.append(df["team"].iloc[i]) - else: - possession.append(df["team"].iloc[i - 1]) - - if df["period"].iloc[i] == "2H": - seconds.append(df["seconds"].iloc[i] + 60 * 60) - elif df["period"].iloc[i] == "E1": - seconds.append(df["seconds"].iloc[i] + 120 * 60) - elif df["period"].iloc[i] == "E2": - seconds.append(df["seconds"].iloc[i] + 150 * 60) - elif df["period"].iloc[i] == "P": - seconds.append(df["seconds"].iloc[i] + 180 * 60) - else: - seconds.append(df["seconds"].iloc[i]) - - df.loc[:, "possession_team"] = possession - df.loc[:, "seconds"] = seconds - - # Normalize time - df.loc[:, "seconds"] = df["seconds"] / df["seconds"].max() - #round numerical columns - df = df.round({"seconds": 4}) - - # Filter actions not by team in possession - df = df[df["team"] == df["possession_team"]].reset_index(drop=True) - - # Define simple actions - simple_actions = [ - 'Foul_Foul', 'Foul_Hand foul', 'Foul_Late card foul', 'Foul_Out of game foul', 'Foul_Protest', - 'Foul_Simulation', 'Foul_Time lost foul', 'Foul_Violent Foul', 'Offside_', 'Free Kick_Corner', - 'Free Kick_Free Kick', 'Free Kick_Free kick cross', 'Free Kick_Free kick shot', 'Free Kick_Goal kick', - 'Free Kick_Penalty', 'Free Kick_Throw in', 'Pass_Cross', 'Pass_Hand pass', 'Pass_Head pass', 'Pass_High pass', - 'Pass_Launch', 'Pass_Simple pass', 'Pass_Smart pass', 'Shot_Shot', 'Shot_Goal', 'Shot_Own_goal', 'Free Kick_goal', - 'Others on the ball_Own_goal', 'Pass_Own_goal', 'Duel_Ground attacking duel', 'Others on the ball_Acceleration', - 'Others on the ball_Clearance', 'Others on the ball_Touch', 'Others on the ball_Touch_good', - 'Duel_Ground attacking duel_off dribble' - ] - - # Filter out non-simple actions - df = df[df["action"].isin(simple_actions)].reset_index(drop=True) - - # Calculate match score - def 
calculate_match_score(df): - home_team_score_list = [] - away_team_score_list = [] - score_diff_list = [] - - for match_id in df.match_id.unique(): - home_team_score = 0 - away_team_score = 0 - #check if column home_team only have one unique value - if len(df[df["match_id"] == match_id].home_team.unique())>1: - home_team_id = df[df["match_id"] == match_id][df["home_team"]==1].team.unique()[0] - else: - home_team_id = df.team.unique()[0] - match_df = df[df["match_id"] == match_id].reset_index(drop=True) - - for i in range(len(match_df)): - if match_df.iloc[i].event_type_2 == "Goal": - if match_df["team"].iloc[i] == home_team_id: - home_team_score += 1 - else: - away_team_score += 1 - elif match_df.iloc[i].event_type_2 == "Own_goal": - if match_df["team"].iloc[i] == home_team_id: - away_team_score += 1 - else: - home_team_score += 1 - score_diff = home_team_score - away_team_score - home_team_score_list.append(home_team_score) - away_team_score_list.append(away_team_score) - score_diff_list.append(score_diff) - - return home_team_score_list, away_team_score_list, score_diff_list - - home_team_score_list, away_team_score_list, score_diff_list = calculate_match_score(df) - df["home_team_score"] = home_team_score_list - df["away_team_score"] = away_team_score_list - df["score_diff"] = score_diff_list - - # Set possession id - poss_id_list = [] - poss_id = 0 - for i in range(len(df)): - if i == 0: - poss_id_list.append(0) - else: - if df["possession_team"].iloc[i] == df["possession_team"].iloc[i - 1] and df["period"].iloc[i] == df["period"].iloc[i - 1]: - poss_id_list.append(poss_id) - else: - poss_id += 1 - poss_id_list.append(poss_id) - df["poss_id"] = poss_id_list - - - # Add a row in between the first and last row of each possession - new_df = [] - for poss_id in df.poss_id.unique(): - temp_df = df[df["poss_id"] == poss_id].reset_index(drop=True) - for j in range(len(temp_df)): - new_df.append(temp_df.iloc[j]) - new_row = temp_df.iloc[-1].copy() - new_row["action"] = 
"_" - new_df.append(new_row) - - # Concatenate all rows in new_df - new_df = pd.concat(new_df, axis=1).T.reset_index(drop=True) - - # Simplify actions - drop_list = [ - 'Foul_Foul', 'Foul_Hand foul', 'Foul_Late card foul', 'Foul_Out of game foul', - 'Foul_Protest', 'Foul_Simulation', 'Foul_Time lost foul', 'Foul_Violent Foul', 'Offside_', - 'Others on the ball_Own_goal', 'Pass_Own_goal' - ] - p_list = [ - "Free Kick_Goal kick", 'Free Kick_Throw in', 'Free Kick_Free Kick', 'Pass_Hand pass', - 'Pass_Head pass', 'Pass_High pass', 'Pass_Launch', 'Pass_Simple pass', 'Pass_Smart pass', - 'Others on the ball_Clearance' - ] - d_list = [ - 'Duel_Ground attacking duel_off dribble', 'Others on the ball_Acceleration', 'Others on the ball_Touch_good' - ] - x_list = [ - 'Free Kick_Corner', 'Free Kick_Free kick cross', 'Pass_Cross' - ] - s_list = [ - 'Free Kick_Free kick shot', 'Free Kick_Penalty', 'Shot_Shot', 'Shot_Goal', 'Shot_Own_goal' - ] - - new_df = new_df[~new_df["action"].isin(drop_list)].reset_index(drop=True) - action_list = [] - for action in new_df["action"]: - if action in p_list: - action_list.append("p") - elif action in d_list: - action_list.append("d") - elif action in x_list: - action_list.append("x") - elif action in s_list: - action_list.append("s") - elif action == "_": - action_list.append("_") - else: - action_list.append(action) - - new_df["action"] = action_list - - df = new_df.copy() - - # Calculate additional metrics - def calculate_additional_metrics(df): - time_diff_list = [] - distance_list = [] - distance2goal_list = [] - angle_list = [] - x_diff_list = [] - y_diff_list = [] - - for match_id in df.match_id.unique(): - match_df = df[df["match_id"] == match_id].reset_index(drop=True) - for i in range(len(match_df)): - if i == 0: - time_diff = 0 - distance = 0 - distance2goal = 0 - angle = 0.5 - x_diff = 0 - y_diff = 0 - elif match_df.iloc[i].action == "_": - time_diff = 0 - distance = 0 - distance2goal = 0 - angle = 0.5 - x_diff = 0 - y_diff = 0 - 
else: - time_diff = match_df["seconds"].iloc[i] - match_df["seconds"].iloc[i - 1] - distance = ((match_df["start_x"].iloc[i] * 1.05 - match_df["start_x"].iloc[i-1] * 1.05) ** 2 + - (match_df["start_y"].iloc[i] * 0.68 - match_df["start_y"].iloc[i-1] * 0.68) ** 2) ** 0.5 - distance2goal = (((match_df["start_x"].iloc[i] - 100/100) * 1.05) ** 2 + - ((match_df["start_y"].iloc[i] - 50/100) * 0.68) ** 2) ** 0.5 - angle = np.abs(np.arctan2((match_df["start_y"].iloc[i] - 50/100) * 0.68, - (match_df["start_x"].iloc[i] - 100/100) * 1.05)) - x_diff = match_df["start_x"].iloc[i] * 1.05 - match_df["start_x"].iloc[i-1] * 1.05 - y_diff = match_df["start_y"].iloc[i] * 0.68 - match_df["start_y"].iloc[i-1] * 0.68 - - time_diff_list.append(time_diff) - distance_list.append(distance) - distance2goal_list.append(distance2goal) - angle_list.append(angle) - x_diff_list.append(x_diff) - y_diff_list.append(y_diff) - - return time_diff_list, distance_list, distance2goal_list, angle_list, x_diff_list, y_diff_list - - # Scale and normalize columns - df["start_x"] = df["start_x"] / 100 - df["start_y"] = df["start_y"] / 100 - df["end_x"] = df["end_x"] / 100 - df["end_y"] = df["end_y"] / 100 - - (time_diff_list, distance_list, distance2goal_list, angle_list, - x_diff_list, y_diff_list) = calculate_additional_metrics(df) - - df["time_diff"] = time_diff_list - df["distance"] = distance_list - df["distance2goal"] = distance2goal_list - df["angle2goal"] = angle_list - df["x_diff"] = x_diff_list - df["y_diff"] = y_diff_list - - # Scale and normalize columns - # df["distance"] = df["distance"] / df["distance"].max() - # df["distance2goal"] = df["distance2goal"] / df["distance2goal"].max() - # df["angle2goal"] = df["angle2goal"] / df["angle2goal"].max() - # df["x_diff"] = df["x_diff"] / df["x_diff"].max() - # df["y_diff"] = df["y_diff"] / df["y_diff"].max() - - # Clip time differences to a maximum of 0.01 seconds - df["time_diff"] = np.clip(df["time_diff"], 0, 0.01) - - # Round numerical columns - df = 
df.round({"seconds": 4, "time_diff": 4, "distance": 4, "distance2goal": 4, "angle2goal": 4, - "start_x": 4, "start_y": 4, "end_x": 4, "end_y": 4, "x_diff": 4, "y_diff": 4}) - - # Reorder columns - df = df[[ - "comp", "match_id", "poss_id", "team", "action", "start_x", "start_y", "x_diff", "y_diff", - "distance", "distance2goal", "angle2goal", "seconds", "time_diff", "score_diff" - ]] - - return df - -def nmstpp(data): - """ - Processes soccer match event data to determine possession, filter actions, - compute additional metrics, and normalize data. - - Parameters: - data (pd.DataFrame or str): A pandas DataFrame containing event data or a file path to a CSV file. - - Returns: - pd.DataFrame: A processed DataFrame with simplified and normalized event actions. - """ - - # Load data from DataFrame or file path - if isinstance(data, pd.DataFrame): - df = data - elif isinstance(data, str): - if os.path.exists(data): - df = pd.read_csv(data) - else: - raise FileNotFoundError("The file path does not exist") - else: - raise ValueError("The data must be a pandas DataFrame or a file path") - - df=seq2event(df) - #define the zone clusters for Juego de Posición - centroid_x=[ 8.5 , 25.25, 41.75, 58.25, 74.75, 91.5,8.5 , 25.25, 41.75, 58.25, 74.75, - 91.5,33.5, 66.5,33.5, 66.5,33.5, 66.5,8.5,91.5] - centroid_y=[89.45, 89.45, 89.45, 89.45, 89.45, 89.45,10.55, 10.55, 10.55, 10.55, 10.55, 10.55, - 71.05, 71.05,50., 50.,28.95, 28.95, 50.,50.] 
- - #scale start_x and start_y by 100 - df["start_x"]=df["start_x"]*100 - df["start_y"]=df["start_y"]*100 - - #calculate the zone of the start_x and start_y - zone_list=[] - #get closest zone for each start_x and start_y - for i in range(len(df)): - min_dist=1000 - zone=-1 - for j in range(len(centroid_x)): - dist=np.sqrt((df["start_x"].iloc[i]-centroid_x[j])**2+(df["start_y"].iloc[i]-centroid_y[j])**2) - if dist1: - home_team=match_df[match_df["home_team"]==1].team.unique()[0] - else: - home_team=team_list[0] - home_score=0 - away_score=0 - is_goal=0 - for i in range(len(match_df)): - if match_df["team"].iloc[i]==home_team: - is_home_list.append(1) - if match_df["event_type_2"].iloc[i]=="Goal": - home_score+=1 - is_goal=1 - elif match_df["event_type_2"].iloc[i]=="Own_goal": - away_score+=1 - is_goal=1 - else: - is_home_list.append(0) - if match_df["event_type_2"].iloc[i]=="Goal": - away_score+=1 - is_goal=1 - elif match_df["event_type_2"].iloc[i]=="Own_goal": - home_score+=1 - is_goal=1 - home_score_list.append(home_score) - away_score_list.append(away_score) - is_goal_list.append(is_goal) - df["HomeScore"]=home_score_list - df["AwayScore"]=away_score_list - df["IsHome"]=is_home_list - df["IsGoal"]=is_goal_list - - #convert col accurate from TF to 1 and 0 - df['IsAccurate']=df['accurate'].astype(int) - - #create the EventType - event_type_list=[] - for i in range(len(df)): - event_type=df["event_type_2"].iloc[i] - if event_type=="Goal": - event_type_list.append("Shot") - elif event_type=="own-goal": - event_type_list.append("Shot") - else: - event_type_list.append(event_type) - - df["EventType"]=event_type_list - - #add row period_over and game_over - new_df=[] - for match in df.match_id.unique(): - match_df=df[df["match_id"]==match] - for period in match_df.period.unique(): - period_df=match_df[match_df["period"]==period] - for i in range(len(period_df)): - new_df.append(period_df.iloc[i]) - last_row=period_df.iloc[-1].copy() - #set the IsHome, IsGoal, 
IsAccurate, to 0 - last_row["IsHome"]=0 - last_row["IsGoal"]=0 - last_row["IsAccurate"]=0 - #check if it is the last period of the matchs - if period==match_df.period.unique()[-1]: - last_row["EventType"]="game_over" - new_df.append(last_row) - else: - last_row["EventType"]="period_over" - new_df.append(last_row) - df=pd.concat(new_df,axis=1).T.reset_index(drop=True) - - #reorder columns - df = df[[ - "comp", "match_id", "EventType", "IsGoal", "IsAccurate","IsHome", "Period", "Minute","Second","start_x","start_y","HomeScore","AwayScore" - ]] - - #rename columns - df.rename(columns={"start_x":"X","start_y":"Y"},inplace=True) - - #round numerical columns to 4 decimal places (period, minute, second, X, Y) - df = df.round({"Period": 4, "Minute": 4, "Second": 4, "X": 4, "Y": 4}) - - return df - -def UIED_wyscout(data): - """ - Processes soccer match event data to determine possession, filter actions, - compute additional metrics, and normalize data. - - Parameters: - data (pd.DataFrame or str): A pandas DataFrame containing event data or a file path to a CSV file. - provider (str): The provider of the event data. - - Returns: - pd.DataFrame: A processed DataFrame with simplified and normalized event actions. 
- """ - - # Load data from DataFrame or file path - if isinstance(data, pd.DataFrame): - df = data - elif isinstance(data, str): - if os.path.exists(data): - df = pd.read_csv(data) - else: - raise FileNotFoundError("The file path does not exist") - else: - raise ValueError("The data must be a pandas DataFrame or a file path") - - df=df.copy() - #get possession team only event - # Create 'action' column by concatenating 'event_type' and 'event_type_2' - df["action"] = df["event_type"].astype(str) + "_" + df["event_type_2"].astype(str) - - # Define possession team actions - possession_team_actions = [ - 'Free Kick_Goal kick', 'Free Kick_Throw in', 'Free Kick_Corner', 'Free Kick_Free Kick', - 'Free Kick_Free kick cross', 'Free Kick_Free kick shot', 'Free Kick_Penalty', 'Pass_Cross', - 'Pass_Hand pass', 'Pass_Head pass', 'Pass_High pass', 'Pass_Launch', 'Pass_Simple pass', - 'Pass_Smart pass', 'Shot_Shot', 'Shot_Goal', 'Free Kick_goal', 'Duel_Ground attacking duel_off dribble', - 'Others on the ball_Acceleration', 'Others on the ball_Clearance', 'Others on the ball_Touch_good', - 'Shot_Own_goal', 'Pass_Own_goal', 'Others on the ball_Own_goal' - ] - - possession = [] - # Determine possession - for i in range(len(df)): - if i == 0: - possession.append(df["team"].iloc[i]) - else: - if df["team"].iloc[i] == df["team"].iloc[i - 1]: - possession.append(df["team"].iloc[i]) - else: - if df["action"].iloc[i] in possession_team_actions: - possession.append(df["team"].iloc[i]) - else: - possession.append(df["team"].iloc[i - 1]) - - df["possession_team"] = possession - df = df[df["team"] == df["possession_team"]].reset_index(drop=True) - - #create the event related features (sucess, home_team, goal, home_score, away_score) - df["success"]=df["accurate"].astype(int) - home_team_list=[] - goal_list=[] - home_score_list=[] - away_score_list=[] - goal_diff_list=[] - for match in df.match_id.unique(): - match_df=df[df["match_id"]==match] - team_list=match_df["team"].unique() - #check 
if column home_team only have one unique value - if len(match_df.home_team.unique())>1: - home_team=match_df[match_df["home_team"]==1].team.unique()[0] - else: - home_team=team_list[0] - home_score=0 - away_score=0 - goal_diff=0 - for i in range(len(match_df)): - if match_df["team"].iloc[i]==home_team: - home_team_list.append(1) - if match_df["event_type_2"].iloc[i]=="Goal": - home_score+=1 - elif match_df["event_type_2"].iloc[i]=="Own_goal": - away_score+=1 - else: - home_team_list.append(0) - if match_df["event_type_2"].iloc[i]=="Goal": - away_score+=1 - elif match_df["event_type_2"].iloc[i]=="Own_goal": - home_score+=1 - goal_diff=home_score-away_score - goal_list.append(1) if match_df["event_type_2"].iloc[i]=="Goal" else goal_list.append(0) - home_score_list.append(home_score) - away_score_list.append(away_score) - goal_diff_list.append(goal_diff) - - df["home_team"]=home_team_list - df["goal"]=goal_list - df["home_score"]=home_score_list - df["away_score"]=away_score_list - df["goal_diff"]=goal_diff_list - - #group the event into simpliefied actions - pass_actions=['Free Kick_Goal kick', 'Free Kick_Throw in','Free Kick_Free Kick','Pass_Cross','Pass_Hand pass','Pass_Simple pass','Pass_Smart pass','Pass_Head pass'] - high_pass_actions=['Pass_High pass'] - shot_actions=['Free Kick_Free kick shot','Free Kick_Penalty','Shot_Shot', 'Shot_Goal','Shot_Own_goal'] - carray_actions=['Others on the ball_Acceleration'] - dribble_actions=['Duel_Ground attacking duel_off dribble', 'Others on the ball_Touch_good','Duel_Air duel'] - cross_actions=['Free Kick_Corner','Free Kick_Free kick cross'] - drop_actions=['Pass_Launch', 'Free Kick_goal', 'Others on the ball_Clearance','Pass_Own_goal', 'Others on the ball_Own_goal','Foul_Foul', 'Foul_Hand foul', 'Foul_Late card foul', 'Foul_Out of game foul', - 'Foul_Protest', 'Foul_Simulation', 'Foul_Time lost foul', 'Foul_Violent Foul', 'Offside_','Duel_Ground loose ball duel','Others on the ball_Touch','Offside_nan','Interruption_Ball 
out of the field', - 'Duel_Ground defending duel', 'Duel_Ground attacking duel', 'Goalkeeper leaving line_Goalkeeper leaving line', 'Interruption_Whistle', 'Save attempt_Reflexes', 'Save attempt_Save attempt' - ] - action_list=[] - for i in range(len(df)): - if df["action"].iloc[i] in pass_actions: - #devide short pass and long pass based on the distance (45) - distance=np.sqrt(((df["start_x"].iloc[i]-df["end_x"].iloc[i])*1.05)**2+((df["start_y"].iloc[i]-df["end_y"].iloc[i])*0.68)**2) - if distance>=45: - action_list.append("long_pass") - else: - action_list.append("short_pass") - elif df["action"].iloc[i] in high_pass_actions: - action_list.append("high_pass") - elif df["action"].iloc[i] in shot_actions: - action_list.append("shot") - elif df["action"].iloc[i] in carray_actions: - action_list.append("carry") - elif df["action"].iloc[i] in dribble_actions: - action_list.append("dribble") - elif df["action"].iloc[i] in cross_actions: - action_list.append("cross") - elif df["action"].iloc[i] in drop_actions: - action_list.append("drop") - else: - action= df["action"].iloc[i] - print(f"Warning: action {action} was not found in the action list, it will be dropped") - action_list.append("drop") - - df["action"]=action_list - #drop the drop actions - df=df[df["action"]!="drop"].reset_index(drop=True) - - #create the time related features (period, minute, second, delta_T) - period_list=[] - minute_list=[] - second_list=[] - delta_t_list=[] - for i in range(len(df)): - if df["period"].iloc[i]=="1H": - period_list.append(1) - elif df["period"].iloc[i]=="2H": - period_list.append(2) - elif df["period"].iloc[i]=="E1": - period_list.append(3) - elif df["period"].iloc[i]=="E2": - period_list.append(4) - elif df["period"].iloc[i]=="P": - period_list.append(5) - minute_list.append(df["seconds"].iloc[i]//60) - second_list.append((df["seconds"].iloc[i]%60).round(4)) - if i==0: - delta_t_list.append(0) - else: - if df.action.iloc[i-1]=="period_over" or 
df.action.iloc[i-1]=="game_over": - delta_t_list.append(0) - else: - delta_t_list.append((df["seconds"].iloc[i]-df["seconds"].iloc[i-1]).round(4)) - df["Period"]=period_list - df["Minute"]=minute_list - df["Second"]=second_list - df["delta_T"]=delta_t_list - - #create the location related features (deltaX, deltaY, distance, dist2goal, angle2goal) - delta_x_list=[] - delta_y_list=[] - dist_list=[] - dist2goal_list=[] - angle2goal_list=[] - for i in range(len(df)): - delta_x=df["start_x"].iloc[i]-df["start_x"].iloc[i-1] - delta_y=df["start_y"].iloc[i]-df["start_y"].iloc[i-1] - distance = ((df["start_x"].iloc[i] * 1.05 - df["start_x"].iloc[i-1] * 1.05) ** 2 + - (df["start_y"].iloc[i] * 0.68 - df["start_y"].iloc[i-1] * 0.68) ** 2) ** 0.5 - dist2goal = (((df["start_x"].iloc[i] - 100) * 1.05) ** 2 + - ((df["start_y"].iloc[i] - 50) * 0.68) ** 2) ** 0.5 - angle2goal = np.abs(np.arctan2((df["start_y"].iloc[i] - 50) * 0.68, - (df["start_x"].iloc[i] - 100) * 1.05)) - - delta_x_list.append(delta_x) - delta_y_list.append(delta_y) - dist_list.append(distance) - dist2goal_list.append(dist2goal) - angle2goal_list.append(angle2goal) - df["deltaX"]=delta_x_list - df["deltaY"]=delta_y_list - df["distance"]=dist_list - df["dist2goal"]=dist2goal_list - df["angle2goal"]=angle2goal_list - - #scale start_x and start_y by the field size - df["start_x"]=df["start_x"]*0.68 - df["start_y"]=df["start_y"]*1.05 - - #create the possession id, end of possession, end of period, end of game - poss_id_list = [] - poss_id = 0 - for match in df.match_id.unique(): - match_df = df[df["match_id"] == match] - for i in range(len(match_df)): - if i == 0: - poss_id_list.append(poss_id) - else: - if match_df["possession_team"].iloc[i] == match_df["possession_team"].iloc[i - 1]: - poss_id_list.append(poss_id) - else: - poss_id += 1 - poss_id_list.append(poss_id) - poss_id+=1 - df["poss_id"] = poss_id_list - - new_df = [] - for match in df.match_id.unique(): - match_df = df[df["match_id"] == match] - for period 
in match_df.Period.unique(): - period_df = match_df[match_df["Period"] == period] - for poss_id in period_df.poss_id.unique(): - poss_df = period_df[period_df["poss_id"] == poss_id] - for i in range(len(poss_df)): - new_df.append(poss_df.iloc[i]) - last_row = poss_df.iloc[-1].copy() - last_row["action"] = "_" - #change the value of the features to 0 - last_row['goal'] = 0 - last_row["success"]=0 - last_row["deltaX"]=0 - last_row["deltaY"]=0 - last_row["distance"]=0 - last_row["dist2goal"]=0 - last_row["angle2goal"]=0.5 - last_row["delta_T"]=0 - new_df.append(last_row) - last_row = period_df.iloc[-1].copy() - #change the value of the features to 0 - last_row['goal'] = 0 - last_row["success"]=0 - last_row["deltaX"]=0 - last_row["deltaY"]=0 - last_row["distance"]=0 - last_row["dist2goal"]=0 - last_row["angle2goal"]=0.5 - last_row["delta_T"]=0 - if period == df.Period.unique()[-1]: - last_row["action"] = "game_over" - new_df.append(last_row) - else: - last_row["action"] = "period_over" - new_df.append(last_row) - df = pd.concat(new_df, axis=1).T.reset_index(drop=True) - - #reorder columns - df = df[['match_id', 'poss_id', 'team', 'home_team', 'action', 'success', 'goal', 'home_score', 'away_score', 'goal_diff', 'Period', 'Minute', 'Second', 'seconds', "delta_T", 'start_x', 'start_y', 'deltaX', 'deltaY', 'distance', 'dist2goal', 'angle2goal']] - - #adjust the seconds column for different periods - seconds_list=[] - for i in range(len(df)): - if df["Period"].iloc[i]==1: - seconds_list.append(df["seconds"].iloc[i]) - elif df["Period"].iloc[i]==2: - seconds_list.append(df["seconds"].iloc[i]+60*60) - elif df["Period"].iloc[i]==3: - seconds_list.append(df["seconds"].iloc[i]+120*60) - elif df["Period"].iloc[i]==4: - seconds_list.append(df["seconds"].iloc[i]+150*60) - elif df["Period"].iloc[i]==5: - seconds_list.append(df["seconds"].iloc[i]+180*60) - df["seconds"]=seconds_list - - #reset the features value to 0 (angle2goal to 0.5)for beginning of each period - new_df=[] - for 
match in df.match_id.unique(): - match_df=df[df["match_id"]==match] - for period in match_df.Period.unique(): - period_df=match_df[match_df["Period"]==period].copy() - for i in range(len(period_df)): - if i==0: - first_row=period_df.iloc[i].copy() - first_row["deltaX"]=0 - first_row["deltaY"]=0 - first_row["distance"]=0 - first_row["dist2goal"]=0 - first_row["angle2goal"]=0.5 - first_row["delta_T"]=0 - new_df.append(first_row) - else: - new_df.append(period_df.iloc[i]) - df=pd.concat(new_df,axis=1).T.reset_index(drop=True) - - #convert seconds, distance, dist2goal, angle2goal, start_x, start_y into type float - df["seconds"]=df["seconds"].astype(float) - df["distance"]=df["distance"].astype(float) - df["dist2goal"]=df["dist2goal"].astype(float) - df["angle2goal"]=df["angle2goal"].astype(float) - df["start_x"]=df["start_x"].astype(float) - df["start_y"]=df["start_y"].astype(float) - - #round numerical columns to 4 decimal places (period, minute, second, X, Y) - df = df.round({"Period": 4, "Minute": 4, "Second": 4, "seconds": 4, "start_x": 4, "start_y": 4, "deltaX": 4, "deltaY": 4, "distance": 4, "dist2goal": 4, "angle2goal": 4}) - - return df - -def UIED_statsbomb(data): - """ - Processes soccer match event data to determine possession, filter actions, - compute additional metrics, and normalize data. - - Parameters: - data (pd.DataFrame or str): A pandas DataFrame containing event data or a file path to a CSV file. - provider (str): The provider of the event data. - - Returns: - pd.DataFrame: A processed DataFrame with simplified and normalized event actions. 
- """ - - # Load data from DataFrame or file path - if isinstance(data, pd.DataFrame): - df = data - elif isinstance(data, str): - if os.path.exists(data): - df = pd.read_csv(data) - else: - raise FileNotFoundError("The file path does not exist") - else: - raise ValueError("The data must be a pandas DataFrame or a file path") - - df=df.copy() - - #get possession team only event - df["action"] = df["event_type"].astype(str) + "_" + df["event_type_2"].astype(str).replace("None","nan") - - # Define possession team actions - - possession_team_actions =[ 'Pass_Ground Pass', 'Pass_Long_HighPass', - 'Carry_nan', 'Pass_High Pass', 'Pass_Low Pass', - 'Miscontrol_nan', - 'Dribble_nan', 'Clearance_nan', 'Pass_Cross', 'Ball Recovery_nan', - 'Offside_nan', 'Goal Keeper_nan', - 'Dribbled Past_nan', 'Pass_Corner', - 'Shot_Saved', 'Shot_Blocked', 'Shot_Wayward', 'Shot_Off T', 'Shot_Goal', 'Shot_Post', - 'Tactical Shift_nan', 'Shield_nan', - 'Own Goal Against_Own goal', 'Error_nan', - 'Shot_Saved Off Target'] - - # Determine possession - possession = [] - for i in range(len(df)): - if i == 0: - possession.append(df["team"].iloc[i]) - else: - if df["team"].iloc[i] == df["team"].iloc[i - 1]: - possession.append(df["team"].iloc[i]) - else: - if df["action"].iloc[i] in possession_team_actions: - possession.append(df["team"].iloc[i]) - else: - possession.append(df["team"].iloc[i - 1]) - - df["possession_team"] = possession - df = df[df["team"] == df["possession_team"]].reset_index(drop=True) - - #create the event related features (sucess, home_team, goal, home_score, away_score) - sucess_list=[] - home_team_list=[] - goal_list=[] - goal_diff_list=[] - home_score_list=[] - away_score_list=[] - for match in df.match_id.unique(): - match_df=df[df["match_id"]==match] - team_list=match_df["team"].unique() - if "home_team" in df.columns: - if df.home_team.unique().shape[0]!=1: - #team name in "team" and "home_team" indicate the home team - home_team= df[df["home_team"]==1]["team"].iloc[0] - 
else: - home_team=team_list[0] - else: - home_team=team_list[0] - home_score=0 - away_score=0 - for i in range(len(match_df)): - if match_df["team"].iloc[i]==home_team: - home_team_list.append(1) - if match_df["event_type_2"].iloc[i]=="Goal": - home_score+=1 - elif match_df["event_type_2"].iloc[i]=="Own_goal": - away_score+=1 - else: - home_team_list.append(0) - if match_df["event_type_2"].iloc[i]=="Goal": - away_score+=1 - elif match_df["event_type_2"].iloc[i]=="Own_goal": - home_score+=1 - if match_df["possession_team"].iloc[i]==match_df["possession_team"].iloc[i-1] and match_df["event_type"].iloc[i]!='Shot': - sucess_list.append(1) - elif match_df["possession_team"].iloc[i]==match_df["possession_team"].iloc[i-1] and match_df["event_type"].iloc[i]=='Shot': - if match_df["event_type_2"].iloc[i]=="Goal": - sucess_list.append(1) - else: - sucess_list.append(0) - else: - sucess_list.append(0) - goal_list.append(1) if match_df["event_type_2"].iloc[i]=="Goal" else goal_list.append(0) - home_score_list.append(home_score) - away_score_list.append(away_score) - goal_diff=home_score-away_score - goal_diff_list.append(goal_diff) - - df["success"]=sucess_list - #check if home_team is in the df columns - if "home_team" not in df.columns: - df["home_team"]=home_team_list - elif "home_team" in df.columns and df.home_team.unique().shape[0]==1: - df["home_team"]=home_team_list - df["goal"]=goal_list - df["home_score"]=home_score_list - df["away_score"]=away_score_list - df["goal_diff"]=goal_diff_list - - #group the event into simpliefied actions - ''' - all action - ['Starting XI_nan', 'Half Start_nan', 'Pass_Ground Pass', 'Ball Receipt*_nan', - 'Carry_nan', 'Pass_High Pass', 'Pass_Low Pass', 'Duel_nan', 'Pressure_nan', - 'Foul Committed_nan', 'Foul Won_nan', 'Miscontrol_nan', 'Block_nan', - 'Dribble_nan', 'Clearance_nan', 'Pass_Cross', 'Ball Recovery_nan', - 'Dispossessed_nan', 'Interception_nan', 'Offside_nan', 'Goal Keeper_nan', - 'Injury Stoppage_nan', 'Player Off_nan', 
'Referee Ball-Drop_nan', - 'Player On_nan', 'Dribbled Past_nan', 'Shot_Saved to Post', 'Pass_Corner', - 'Shot_Saved', 'Shot_Blocked', 'Shot_Wayward', 'Shot_Off T', 'Half End_nan', - 'Substitution_nan', '50/50_nan', 'Shot_Goal', 'Shot_Post', - 'Tactical Shift_nan', 'Bad Behaviour_nan', 'Shield_nan', - 'Own Goal Against_Own goal', 'Own Goal For_nan', 'Error_nan', - 'Shot_Saved Off Target'] - ''' - - pass_actions=['Pass_Ground Pass', 'Pass_Low Pass',] - high_pass_actions=['Pass_High Pass',] - shot_actions=['Shot_Saved to Post','Shot_Saved', 'Shot_Blocked', 'Shot_Wayward','Shot_Saved Off Target','Shot_Off T','Shot_Goal', 'Shot_Post',] - carray_actions=['Carry_nan','Carry_None'] - dribble_actions=['Dribble_nan', 'Shot_Off T',"Dribble_None"] - cross_actions=['Pass_Cross','Pass_Corner'] - drop_actions=['Starting XI_nan', 'Half Start_nan', 'Ball Receipt*_nan', 'Pressure_nan', 'Foul Committed_nan', 'Foul Won_nan', 'Miscontrol_nan', 'Block_nan', - 'Clearance_nan','Ball Recovery_nan','Dispossessed_nan', 'Interception_nan', 'Offside_nan', 'Goal Keeper_nan','Injury Stoppage_nan', 'Player Off_nan', 'Referee Ball-Drop_nan','Player On_nan', - 'Dribbled Past_nan','Half End_nan','Substitution_nan', '50/50_nan', 'Tactical Shift_nan', 'Bad Behaviour_nan', 'Shield_nan','Own Goal Against_Own goal', 'Own Goal For_nan', 'Error_nan','Duel_nan', - 'Ball Receipt*_None','Miscontrol_None','Duel_None','Pressure_None',"Ball Recovery_None","Substitution_None", - '50/50_None','Foul Committed_None','Error_None','Block_None','Bad Behaviour_None','Goal Keeper_None','Interception_None', - 'Half Start_None','Starting XI_None','Clearance_None','Interception_None','Tactical Shift_None','Dribbled Past_None',"Injury Stoppage_None",'Referee Ball-Drop_None','Dispossessed_None', - "Half End_None", "Own Goal Against_None","Own Goal Against_nan"] - - action_list=[] - for i in range(len(df)): - if df["action"].iloc[i] in pass_actions: - #devide short pass and long pass based on the distance (45) - 
distance=np.sqrt(((df["start_x"].iloc[i]-df["end_x"].iloc[i])*(1.05/1.2))**2+((df["start_y"].iloc[i]-df["end_y"].iloc[i])*(0.68/0.8))**2) - if distance>=45: - action_list.append("long_pass") - else: - action_list.append("short_pass") - elif df["action"].iloc[i] in high_pass_actions: - action_list.append("high_pass") - elif df["action"].iloc[i] in shot_actions: - action_list.append("shot") - elif df["action"].iloc[i] in carray_actions: - action_list.append("carry") - elif df["action"].iloc[i] in dribble_actions: - action_list.append("dribble") - elif df["action"].iloc[i] in cross_actions: - action_list.append("cross") - elif df["action"].iloc[i] in drop_actions: - action_list.append("drop") - else: - action= df["action"].iloc[i] - print(f"Warning: action {action} was not found in the action list, it will be dropped") - action_list.append("drop") - - df["action"]=action_list - #drop the drop actions - df=df[df["action"]!="drop"].reset_index(drop=True) - - #check if seconds is in df columns - if "seconds" not in df.columns: - df["seconds"]=df["minute"]*60+df["second"] - delta_t_list=[] - for i in range(len(df)): - if i==0: - delta_t_list.append(0) - else: - if df.action.iloc[i-1]=="period_over" or df.action.iloc[i-1]=="game_over": - delta_t_list.append(0) - else: - delta_t_list.append(df["seconds"].iloc[i]-df["seconds"].iloc[i-1]) - df["delta_T"]=delta_t_list - - #create the location related features (deltaX, deltaY, distance, dist2goal, angle2goal) - delta_x_list=[] - delta_y_list=[] - dist_list=[] - dist2goal_list=[] - angle2goal_list=[] - for i in range(len(df)): - delta_x=df["start_x"].iloc[i]-df["start_x"].iloc[i-1] - delta_y=df["start_y"].iloc[i]-df["start_y"].iloc[i-1] - distance = ((df["start_x"].iloc[i] * (1.05/1.2) - df["start_x"].iloc[i-1] * (1.05/1.2)) ** 2 + - (df["start_y"].iloc[i] * (0.68/0.8) - df["start_y"].iloc[i-1] * (0.68/0.8)) ** 2) ** 0.5 - dist2goal = (((df["start_x"].iloc[i] - 120) * (1.05/1.2)) ** 2 + - ((df["start_y"].iloc[i] - 40) * 
(0.68/0.8)) ** 2) ** 0.5 - angle2goal = np.abs(np.arctan2((df["start_y"].iloc[i] - 40) * (0.68/0.8), - (df["start_x"].iloc[i] - 120) * (1.05/1.2))) - - delta_x_list.append(delta_x) - delta_y_list.append(delta_y) - dist_list.append(distance) - dist2goal_list.append(dist2goal) - angle2goal_list.append(angle2goal) - df["deltaX"]=delta_x_list - df["deltaY"]=delta_y_list - df["distance"]=dist_list - df["dist2goal"]=dist2goal_list - df["angle2goal"]=angle2goal_list - - #scale the start_x and start_y to real pitch size - df["start_x"]=df["start_x"]*(1.05/1.2) - df["start_y"]=df["start_y"]*(0.68/0.8) - - #set possession_id - poss_id_list = [] - poss_id = 0 - for i in range(len(df)): - if i == 0: - poss_id_list.append(0) - else: - if df["possession_team"].iloc[i] == df["possession_team"].iloc[i - 1] and df["period"].iloc[i] == df["period"].iloc[i - 1]: - poss_id_list.append(poss_id) - else: - poss_id += 1 - poss_id_list.append(poss_id) - df["poss_id"] = poss_id_list - - #rename columns period to Period, minute to Minute, second to Second - df.rename(columns={"period":"Period","minute":"Minute","second":"Second"},inplace=True) - - new_df = [] - for match in df.match_id.unique(): - match_df = df[df["match_id"] == match] - for period in match_df.Period.unique(): - period_df = match_df[match_df["Period"] == period] - for poss_id in period_df.poss_id.unique(): - poss_df = period_df[period_df["poss_id"] == poss_id] - for i in range(len(poss_df)): - if poss_id==period_df.poss_id.unique()[0] and i==0: - first_row=poss_df.iloc[i].copy() - first_row["deltaX"]=0 - first_row["deltaY"]=0 - first_row["distance"]=0 - first_row["delta_T"]=0 - new_df.append(first_row) - else: - new_df.append(poss_df.iloc[i]) - last_row = poss_df.iloc[-1].copy() - last_row["action"] = "_" - #change the value of the features to 0 - last_row['goal']=0 - last_row["success"]=0 - last_row["deltaX"]=0 - last_row["deltaY"]=0 - last_row["distance"]=0 - last_row["dist2goal"]=0 - last_row["angle2goal"]=0.5 - 
last_row["delta_T"]=0 - new_df.append(last_row) - last_row = period_df.iloc[-1].copy() - #change the value of the features to 0 - last_row['goal']=0 - last_row["success"]=0 - last_row["deltaX"]=0 - last_row["deltaY"]=0 - last_row["distance"]=0 - last_row["dist2goal"]=0 - last_row["angle2goal"]=0.5 - last_row["delta_T"]=0 - if period == df.Period.unique()[-1]: - last_row["action"] = "game_over" - new_df.append(last_row) - else: - last_row["action"] = "period_over" - new_df.append(last_row) - df = pd.concat(new_df, axis=1).T.reset_index(drop=True) - - #remove carray action that have the same start and end location as the previous action (exclude "_" end of possession) - droplist=[] - for i in range(len(df)): - if df.start_x.iloc[i]==df.start_x.iloc[i-1] and df.start_y.iloc[i]==df.start_y.iloc[i-1]: - if df.action.iloc[i]=="carry" and df.action.iloc[i-1] not in ["_", "period_over", "game_over"]: - droplist.append(i) - - df.drop(droplist,inplace=True) - - new_df=[] - flag=False - for i in range(len(df)): - if i==len(df)-1: - new_df.append(df.iloc[i]) - break - if flag: - flag=False - new_df.append(row) - continue - if df.start_x.iloc[i]==df.start_x.iloc[i+1] and df.start_y.iloc[i]==df.start_y.iloc[i+1]: - if df.action.iloc[i]=="carry" and df.action.iloc[i+1] in ["short_pass", "long_pass", "high_pass", "shot", "dribble", "cross"]: - row=df.iloc[i].copy() - row["action"]=df.action.iloc[i+1] - flag=True - else: - new_df.append(df.iloc[i]) - else: - new_df.append(df.iloc[i]) - - df=pd.concat(new_df,axis=1).T.reset_index(drop=True) - - #adjust the seconds column for different periods - seconds_list=[] - for i in range(len(df)): - if df["Period"].iloc[i]==1: - seconds_list.append(df["seconds"].iloc[i]) - elif df["Period"].iloc[i]==2: - seconds_list.append(df["seconds"].iloc[i]+60*45) - elif df["Period"].iloc[i]==3: - seconds_list.append(df["seconds"].iloc[i]+60*90) - elif df["Period"].iloc[i]==4: - seconds_list.append(df["seconds"].iloc[i]+60*105) - elif 
df["Period"].iloc[i]==5: - seconds_list.append(df["seconds"].iloc[i]+60*120) - - #reset the features value to 0 (angle2goal to 0.5)for beginning of each period - new_df=[] - for match in df.match_id.unique(): - match_df=df[df["match_id"]==match] - for period in match_df.Period.unique(): - period_df=match_df[match_df["Period"]==period].copy() - for i in range(len(period_df)): - if i==0: - first_row=period_df.iloc[i].copy() - first_row["deltaX"]=0 - first_row["deltaY"]=0 - first_row["distance"]=0 - first_row["dist2goal"]=0 - first_row["angle2goal"]=0.5 - first_row["delta_T"]=0 - new_df.append(first_row) - else: - new_df.append(period_df.iloc[i]) - df=pd.concat(new_df,axis=1).T.reset_index(drop=True) - - #reorder columns - try: - sb360_columns = ["h"+str(i)+"_"+j for i in range(1, 12) for j in ["teammate", "actor", "keeper", "x", "y"]] + ["a"+str(i)+"_"+j for i in range(1, 12) for j in ["teammate", "actor", "keeper", "x", "y"]] - df = df[['match_id', 'poss_id', 'team', 'home_team', 'action', 'success', 'goal', 'home_score', 'away_score', 'goal_diff', 'Period', 'Minute', 'Second', 'seconds', "delta_T", 'start_x', 'start_y', 'deltaX', 'deltaY', 'distance', 'dist2goal', 'angle2goal']+sb360_columns] - #set the sb360 columns to 4 decimal places - for col in ["h"+str(i)+"_"+j for i in range(1, 12) for j in ["x", "y"]] + ["a"+str(i)+"_"+j for i in range(1, 12) for j in ["x", "y"]]: - #change the type of the column to float - df[col]=df[col].astype(float) - df[col]=df[col].round(4) - except: - try: - home_tracking_columns = [] - away_tracking_columns = [] - for i in range(1, 24): - home_tracking_columns.extend([f"h{i}_x", f"h{i}_y"]) - away_tracking_columns.extend([f"a{i}_x", f"a{i}_y"]) - df = df[['match_id', 'poss_id', 'team', 'home_team', 'action', 'success', 'goal', 'home_score', 'away_score', 'goal_diff', 'Period', 'Minute', 'Second', 'seconds', "delta_T", 'start_x', 'start_y', 'deltaX', 'deltaY', 'distance', 'dist2goal', 
'angle2goal']+home_tracking_columns+away_tracking_columns] - #set the home_tracking_columns and away_tracking_columns to 4 decimal places - # for col in home_tracking_columns+away_tracking_columns: - # df[col]=df[col].round(4) - except: - df = df[['match_id', 'poss_id', 'team', 'home_team', 'action', 'success', 'goal', 'home_score', 'away_score', 'goal_diff', 'Period', 'Minute', 'Second', 'seconds', "delta_T", 'start_x', 'start_y', 'deltaX', 'deltaY', 'distance', 'dist2goal', 'angle2goal']] - #convert seconds, distance, dist2goal, angle2goal, deltaX, deltaY,start_x, start_y into type float - df["seconds"]=df["seconds"].astype(float) - df["distance"]=df["distance"].astype(float) - df["dist2goal"]=df["dist2goal"].astype(float) - df["angle2goal"]=df["angle2goal"].astype(float) - df["deltaX"]=df["deltaX"].astype(float) - df["deltaY"]=df["deltaY"].astype(float) - df["delta_T"]=df["delta_T"].astype(float) - df["start_x"]=df["start_x"].astype(float) - df["start_y"]=df["start_y"].astype(float) - - #round numerical columns to 4 decimal places (period, minute, second, X, Y,deltaX, deltaY, distance, dist2goal, angle2goal) - df = df.round({"Period": 4, "Minute": 4, "Second": 4, "seconds": 4, "start_x": 4, "start_y": 4, "deltaX": 4, "deltaY": 4, "distance": 4, "dist2goal": 4, "angle2goal": 4, "delta_T": 4}) - - return df - -def UIED_datastadium(data): - """ - Processes football event data from a DataFrame or CSV file, creating various features for analysis. - - Parameters: - - data (pd.DataFrame or str): If a string, it should be a path to a CSV file. If a DataFrame, it should contain the event data. - - Returns: - - pd.DataFrame: Processed DataFrame with additional features and cleaned data. 
- """ - # Load data from DataFrame or file path - if isinstance(data, pd.DataFrame): - df = data - elif isinstance(data, str): - if os.path.exists(data): - df = pd.read_csv(data) - else: - raise FileNotFoundError("The file path does not exist") - else: - raise ValueError("The data must be a pandas DataFrame or a file path") - - df = df.copy() - - # Create 'action' column by concatenating 'event_type' and 'event_type_2' - df["action"] = df["event_type"].astype(str) + "_" + df["event_type_2"].astype(str) - #rename "_None" to "_nan" - df["action"]=df["action"].str.replace("_None","_nan") - - - # Define possession team actions - - all_cation=['First Half Start_nan', 'KickOff_Pass', 'Trap_nan', - 'AwayPass_Pass', 'Block_nan', 'Intercept_nan', 'Shoot_nan', - 'Post Bar_nan', 'Shoot_Goal', 'Ball Out_nan', 'Clear_Clear', - 'Through Pass_Pass', 'Cross_Pass/Cross', 'Touch_nan', - 'HomePass_Pass', 'Dribble_Dribble', 'ThrowIn_Pass', 'Offside_nan', - 'Indirect FK_Pass/IndirectFreeKick', 'GK_Pass/GoalKick', - 'CK_Pass/CornerKick', 'Foul_nan', 'Direct FK_Pass/DirectFreeKick', - 'Tackle_nan', 'Shoot_Save', 'Shoot_Shot(not_GK)', 'Catch_nan', - 'CK_Pass/Cross/CornerKick', 'Feed_Pass', 'Hand Clear_HandClear', - 'Shoot_Shot(not_GK)/MissHit', 'Direct FK_Save/DirectFreeKick', - 'Direct FK_Shot(not_GK)/DirectFreeKick', - 'Direct FK_Pass/Cross/DirectFreeKick', 'First Half End_nan', - 'Second Half Start_nan', 'Change_nan', 'Second Half End_nan',"YellowCard_nan", - "RedCard_nan","Suspension(InGame)_nan","Shoot_Save/MissHit","PK_Goal","FrickOn_Pass", - "Direct FK_DirectFreeKick","Drop Ball_nan","Direct FK_Goal/DirectFreeKick","Shoot_MissHit", - "ThrowIn_nan","OwnGoal_Goal","CK_Save/CornerKick","Indirect FK_Pass/Cross/IndirectFreeKick" - ] - - possession_team_actions = [ - 'KickOff_Pass', 'Trap_nan', - 'AwayPass_Pass','Shoot_nan','Post Bar_nan', 'Shoot_Goal','Clear_Clear', - 'Through Pass_Pass', 'Cross_Pass/Cross', 'Touch_nan','HomePass_Pass', 'Dribble_Dribble', 'ThrowIn_Pass', - 'Indirect 
FK_Pass/IndirectFreeKick', 'GK_Pass/GoalKick','CK_Pass/CornerKick','Direct FK_Pass/DirectFreeKick', - 'Shoot_Shot(not_GK)','Shoot_Save','CK_Pass/Cross/CornerKick', 'Feed_Pass', 'Hand Clear_HandClear','Shoot_Shot(not_GK)/MissHit', - 'Direct FK_Save/DirectFreeKick','Direct FK_Shot(not_GK)/DirectFreeKick', 'Direct FK_Pass/Cross/DirectFreeKick',"FrickOn_Pass", - "Direct FK_DirectFreeKick","Shoot_Save/MissHit","Indirect FK_Pass/Cross/IndirectFreeKick","Shoot_MissHit", - "Direct FK_Goal/DirectFreeKick","ThrowIn_nan","CK_Save/CornerKick"] - - possession = [] - # Determine possession - for i in range(len(df)): - if i == 0: - possession.append(df["team"].iloc[i]) - else: - if df.action.iloc[i] not in all_cation: - print(f"Warning: action {df.action.iloc[i]} was not found in the all action list") - if df["team"].iloc[i] == df["team"].iloc[i - 1]: - possession.append(df["team"].iloc[i]) - else: - if df["action"].iloc[i] in possession_team_actions: - possession.append(df["team"].iloc[i]) - else: - possession.append(df["team"].iloc[i - 1]) - - df["possession_team"] = possession - - #create the event related features (sucess, home_team, goal_diff, home_score, away_score) - #success is provided in the data - #drop all row with col home equal 0 then subtract 1 from home - df = df[df["home"] != 0].reset_index(drop=True) - - home_score = [] - away_score = [] - goal_diff = [] - home_team = [] - goal= [] - for i in range(len(df)): - if df["home"].iloc[i] == 1: - home_team.append(1) - home_score.append(df["self_score"].iloc[i]) - away_score.append(df["opp_score"].iloc[i]) - goal_diff.append(df["self_score"].iloc[i] - df["opp_score"].iloc[i]) - elif df["home"].iloc[i] == 2: - home_team.append(0) - home_score.append(df["opp_score"].iloc[i]) - away_score.append(df["self_score"].iloc[i]) - goal_diff.append(df["opp_score"].iloc[i] - df["self_score"].iloc[i]) - #check if Goal but not GoalKick is in the str of df["event_type_2"].iloc[i] - if "Goal" in str(df["event_type_2"].iloc[i]) and 
"GoalKick" not in str(df["event_type_2"].iloc[i]): - goal.append(1) - else: - goal.append(0) - - df["home_score"] = home_score - df["away_score"] = away_score - df["goal_diff"] = goal_diff - df["home_team"] = home_team - df["goal"] = goal - - #group the event into simpliefied actions - pass_actions=['KickOff_Pass','AwayPass_Pass','Through Pass_Pass', 'HomePass_Pass','ThrowIn_Pass', - 'Indirect FK_Pass/IndirectFreeKick', 'GK_Pass/GoalKick','Direct FK_Pass/DirectFreeKick', - "FrickOn_Pass","Direct FK_DirectFreeKick","Indirect FK_Pass/Cross/IndirectFreeKick", - "ThrowIn_nan" - ] - high_pass_actions=[] - shot_actions=['Shoot_nan','Shoot_Goal','Shoot_Save', 'Shoot_Shot(not_GK)','Shoot_Shot(not_GK)/MissHit','Direct FK_Save/DirectFreeKick', - 'Direct FK_Shot(not_GK)/DirectFreeKick', "Shoot_Save/MissHit","Shoot_MissHit","Direct FK_Goal/DirectFreeKick" - ] - carray_actions=[] - dribble_actions=['Dribble_Dribble'] - cross_actions=['Cross_Pass/Cross','CK_Pass/CornerKick','CK_Pass/Cross/CornerKick','Feed_Pass','Direct FK_Pass/Cross/DirectFreeKick', "CK_Save/CornerKick"] - drop_actions=['First Half Start_nan','Trap_nan','Block_nan', 'Intercept_nan','Post Bar_nan','Ball Out_nan','Clear_Clear','Touch_nan', - 'Offside_nan','Foul_nan','Tackle_nan','Catch_nan','Hand Clear_HandClear','First Half End_nan','Second Half Start_nan', - 'Change_nan', 'Second Half End_nan',"YellowCard_nan","RedCard_nan","Suspension(InGame)_nan","Drop Ball_nan","PK_Goal", - "OwnGoal_Goal" - ] - - - action_list=[] - for i in range(len(df)): - if df["action"].iloc[i] in pass_actions: - #devide short pass and long pass based on the distance (45) - distance=df.dist.iloc[i] - if distance>=45: - action_list.append("long_pass") - else: - action_list.append("short_pass") - elif df["action"].iloc[i] in high_pass_actions: - action_list.append("high_pass") - elif df["action"].iloc[i] in shot_actions: - action_list.append("shot") - elif df["action"].iloc[i] in carray_actions: - action_list.append("carry") - elif 
df["action"].iloc[i] in dribble_actions: - action_list.append("dribble") - elif df["action"].iloc[i] in cross_actions: - action_list.append("cross") - elif df["action"].iloc[i] in drop_actions: - action_list.append("drop") - else: - action= df["action"].iloc[i] - print(f"Warning: action {action} was not found in the action list, it will be dropped") - action_list.append("drop") - - df["action"]=action_list - #drop the drop actions - df=df[df["action"]!="drop"].reset_index(drop=True) - - #create the time related features (delta_T) - delta_t_list=[] - for i in range(len(df)): - if i==0: - delta_t_list.append(0) - else: - delta_t_list.append(df["absolute_time"].iloc[i]-df["absolute_time"].iloc[i-1]) - df["delta_T"]=delta_t_list - - #create the location related features (deltaX, deltaY, distance) - delta_x_list=[] - delta_y_list=[] - dist_list=[] - - for i in range(len(df)): - if i==0: - delta_x=0 - delta_y=0 - distance=0 - else: - delta_x=df["start_x"].iloc[i]-df["start_x"].iloc[i-1] - delta_y=df["start_y"].iloc[i]-df["start_y"].iloc[i-1] - distance = np.sqrt(delta_x**2+delta_y**2) - delta_x_list.append(delta_x) - delta_y_list.append(delta_y) - dist_list.append(distance) - df["deltaX"]=delta_x_list - df["deltaY"]=delta_y_list - df["distance"]=dist_list - - #create the possession id, end of possession, end of period, end of game - poss_id_list = [] - poss_id = 0 - for match in df.match_id.unique(): - match_df = df[df["match_id"] == match] - for i in range(len(match_df)): - if i == 0: - poss_id_list.append(poss_id) - else: - if match_df["possession_team"].iloc[i] == match_df["possession_team"].iloc[i - 1]: - poss_id_list.append(poss_id) - else: - poss_id += 1 - poss_id_list.append(poss_id) - poss_id+=1 - df["poss_id"] = poss_id_list - - new_df = [] - for match in df.match_id.unique(): - match_df = df[df["match_id"] == match] - for period in match_df.Period.unique(): - period_df = match_df[match_df["Period"] == period] - for poss_id in period_df.poss_id.unique(): - 
poss_df = period_df[period_df["poss_id"] == poss_id] - for i in range(len(poss_df)): - new_df.append(poss_df.iloc[i]) - last_row = poss_df.iloc[-1].copy() - last_row["action"] = "_" - #change the value of the features to 0 - last_row['goal']=0 - last_row["success"]=0 - last_row["deltaX"]=0 - last_row["deltaY"]=0 - last_row["distance"]=0 - last_row["dist2goal"]=0 - last_row["angle2goal"]=0.5 - last_row["delta_T"]=0 - new_df.append(last_row) - last_row = period_df.iloc[-1].copy() - #change the value of the features to 0 - last_row['goal']=0 - last_row["success"]=0 - last_row["deltaX"]=0 - last_row["deltaY"]=0 - last_row["distance"]=0 - last_row["dist2goal"]=0 - last_row["angle2goal"]=0.5 - last_row["delta_T"]=0 - if period == df.Period.unique()[-1]: - last_row["action"] = "game_over" - new_df.append(last_row) - else: - last_row["action"] = "period_over" - new_df.append(last_row) - df = pd.concat(new_df, axis=1).T.reset_index(drop=True) - - #create the seconds column - seconds_list=[] - for i in range(len(df)): - if df["Period"].iloc[i]==1: - seconds_list.append(df.Minute.iloc[i]*60+df.Second.iloc[i]) - elif df["Period"].iloc[i]==2: - seconds_list.append(df.Minute.iloc[i]*60+df.Second.iloc[i]+60*45) - - df["seconds"]=seconds_list - - #reset the features value to 0 (angle2goal to 0.5)for beginning of each period - new_df=[] - for match in df.match_id.unique(): - match_df=df[df["match_id"]==match] - for period in match_df.Period.unique(): - period_df=match_df[match_df["Period"]==period].copy() - for i in range(len(period_df)): - if i==0: - first_row=period_df.iloc[i].copy() - first_row["deltaX"]=0 - first_row["deltaY"]=0 - first_row["distance"]=0 - first_row["dist2goal"]=0 - first_row["angle2goal"]=0.5 - first_row["delta_T"]=0 - new_df.append(first_row) - else: - new_df.append(period_df.iloc[i]) - df=pd.concat(new_df,axis=1).T.reset_index(drop=True) - - #convert seconds, distance, dist2goal, angle2goal, start_x, start_y into type float - 
df["seconds"]=df["seconds"].astype(float) - df["distance"]=df["distance"].astype(float) - df["dist2goal"]=df["dist2goal"].astype(float) - df["angle2goal"]=df["angle2goal"].astype(float) - df["start_x"]=df["start_x"].astype(float) - df["start_y"]=df["start_y"].astype(float) - - #round numerical columns to 4 decimal places (period, minute, second, X, Y) - df = df.round({"Period": 4, "Minute": 4, "Second": 4, "seconds": 4, "start_x": 4, "start_y": 4, "deltaX": 4, "deltaY": 4, "distance": 4, "dist2goal": 4, "angle2goal": 4}) - - #reorder columns - tracking_col_home = [f"Home_{i}_x" for i in range(1, 15)] + [f"Home_{i}_y" for i in range(1, 15)] - tracking_col_away = [f"Away_{i}_x" for i in range(1, 15)] + [f"Away_{i}_y" for i in range(1, 15)] - df = df[['match_id', 'poss_id', 'team', 'home_team', 'action', 'success', 'goal', 'home_score', - 'away_score', 'goal_diff', 'Period', 'Minute', 'Second', 'seconds', "delta_T", 'start_x', - 'start_y', 'deltaX', 'deltaY', 'distance', 'dist2goal', 'angle2goal']+tracking_col_home+tracking_col_away] - - return df - - -if __name__ == '__main__': - import pdb - - # seq2event - # df_path=os.getcwd()+"/test/sports/event_data/data/wyscout/test_data.csv" - # df=seq2event(df_path) - # df.to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_seq2event.csv",index=False) - - # nmstpp - # df_path=os.getcwd()+"/test/sports/event_data/data/wyscout/test_data.csv" - # df=nmstpp(df_path) - # df.to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_nmstpp.csv",index=False) - - # lem - # df_path=os.getcwd()+"/test/sports/event_data/data/wyscout/test_data.csv" - # df=lem(df_path) - # df.to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_lem.csv",index=False) - - # UIED - # df_wyscout_path=os.getcwd()+"/test/sports/event_data/data/wyscout/test_data.csv" - # df_wyscout=UIED_wyscout(df_wyscout_path) - # 
df_wyscout.to_csv(os.getcwd()+"/test/sports/event_data/data/wyscout/test_preprocess_wyscout_UIED.csv",index=False) - - # df_statsbomb_skillcorner_path=os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_data.csv" - # df_statsbomb_skillcorner=UIED_statsbomb(df_statsbomb_skillcorner_path) - # df_statsbomb_skillcorner.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb_skillcorner/test_preprocess_statsbomb_skillcorner_UIED.csv",index=False) - - # df_statsbomb_json_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/test_data.csv" - # df_statsbomb_json=UIED_statsbomb(df_statsbomb_json_path) - # df_statsbomb_json.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_preprocess_statsbomb_json_UIED.csv",index=False) - - # df_statsbomb_api_path=os.getcwd()+"/test/sports/event_data/data/statsbomb/test_api_data.csv" - # df_statsbomb_api=UIED_statsbomb(df_statsbomb_api_path) - # df_statsbomb_api.to_csv(os.getcwd()+"/test/sports/event_data/data/statsbomb/test_preprocess_statsbomb_api_UIED.csv",index=False) - - # df_datastadium_path=os.getcwd()+"/test/sports/event_data/data/datastadium/load.csv" - # df_datastadium=UIED_datastadium(df_datastadium_path) - # df_datastadium.to_csv(os.getcwd()+"/test/sports/event_data/data/datastadium/preprocess_UIED.csv",index=False) - - print('-----------------end-----------------') - # pdb.set_trace() diff --git a/preprocessing/sports/phase_data/soccer/soccer_tracking_data.py b/preprocessing/sports/phase_data/soccer/soccer_tracking_data.py deleted file mode 100644 index 12e1f96..0000000 --- a/preprocessing/sports/phase_data/soccer/soccer_tracking_data.py +++ /dev/null @@ -1,115 +0,0 @@ -import pandas as pd -import os -import pdb -import pandas as pd - -def statsbomb_skillcorner_tracking_data_preprocessing(df_raw, save_path=None, process_event_coord=True): - """ - Preprocess tracking data for StatsBomb and SkillCorner data formats to standardize the coordinates - - Parameters: - - df (pd.DataFrame or str): 
DataFrame containing tracking data or a path to a CSV file. - Expected columns include 'home_team', 'home_side', and optional columns like 'action' or 'event_type'. - - save_path (str): Path to save the preprocessed data as a CSV file. - - process_event_coord (bool): Flag to scale event data coordinates to field dimensions. - - Steps: - 1. Load CSV if `df` is a file path; validate the input to ensure it is a DataFrame. - 2. Define possession team actions to categorize certain events as possession-related. - 3. Adjust player coordinates by shifting the origin to the center and flipping coordinates - if the home team plays on the right side (field normalization). - 4. Process each row based on the action or event type to determine whether switching - the field orientation is necessary. - 5. Save the modified DataFrame to the specified path. - - Notes: - - Assumes field dimensions of 105 x 68 meters. - - Applies scaling for event data start_x and start_y to adjust coordinates to the field dimensions. 
- """ - FIELD_LENGTH = 105.0 # Field length in meters - FIELD_WIDTH = 68.0 # Field width in meters - - # Load data if `df_raw` is a file path; validate input - if not isinstance(df_raw, pd.DataFrame): - if isinstance(df_raw, str): - df_raw = pd.read_csv(df_raw) - else: - raise ValueError("Input should be a DataFrame or a CSV file path") - - # Define list of team actions that imply possession - team_actions = [ - 'Pass_Ground Pass', 'Pass_Long_HighPass', 'Carry_nan', 'Pass_High Pass', 'Pass_Low Pass', - 'Miscontrol_nan', 'Dribble_nan', 'Clearance_nan', 'Pass_Cross', 'Ball Recovery_nan', - 'Offside_nan', 'Goal Keeper_nan', 'Dribbled Past_nan', 'Pass_Corner', 'Shot_Saved', - 'Shot_Blocked', 'Shot_Wayward', 'Shot_Off T', 'Shot_Goal', 'Shot_Post', - 'Tactical Shift_nan', 'Shield_nan', 'Own Goal Against_Own goal', 'Error_nan', - 'Shot_Saved Off Target', 'Ball Receipt*_nan', 'Pressure_nan', 'Interception_nan' - ] - - # Function to adjust coordinates based on field orientation - def adjust_coordinates(idx, switch_sides): - """ - Adjusts the x and y coordinates for players on the field based on field orientation. - - Parameters: - - idx (int): The index of the row to modify in df. - - switch_sides (bool): Flag indicating if coordinates should be flipped. 
- """ - for prefix in ['h', 'a']: # 'h' for home, 'a' for away - for i in range(1, 24): - x_col, y_col = f"{prefix}{i}_x", f"{prefix}{i}_y" - x, y = df.at[idx, x_col], df.at[idx, y_col] - - # Skip if x and y are zero (indicating missing data) - if x == 0 and y == 0: - continue - - # Adjust coordinates based on `switch_sides` flag - df.at[idx, x_col] = (-x if switch_sides else x) + FIELD_LENGTH / 2 - df.at[idx, y_col] = (y if switch_sides else -y) + FIELD_WIDTH / 2 - #round to 2 decimal places - df.at[idx, x_col] = round(df.at[idx, x_col], 2) - df.at[idx, y_col] = round(df.at[idx, y_col], 2) - - # Process each row - df = df_raw.copy() - for idx in range(len(df)): - home_team, home_side = df.at[idx, 'home_team'], df.at[idx, 'home_side'] - switch_sides = False # Default: no switch - - if 'action' in df.columns: - # Use switch condition based on the home team's side in possession - if (home_team == 1 and home_side == 'right') or (home_team == 0 and home_side == 'left'): - switch_sides = True - elif 'event_type' in df.columns: - if process_event_coord: - # Scale start_x and start_y for event data - df.at[idx, "start_x"] *= (1.05 / 1.2) - df.at[idx, "start_y"] *= (0.68 / 0.8) - #round to 2 decimal places - df.at[idx, "start_x"] = round(df.at[idx, "start_x"], 2) - df.at[idx, "start_y"] = round(df.at[idx, "start_y"], 2) - - action_type = f"{df.at[idx, 'event_type']}_{str(df.at[idx, 'event_type_2']).replace('None', 'nan')}" - is_possession_action = action_type in team_actions - - # Determine if coordinates should be switched based on possession action and home side - if is_possession_action: - switch_sides = (home_team == 1 and home_side == 'right') or (home_team == 0 and home_side == 'left') - else: - switch_sides = not ((home_team == 1 and home_side == 'right') or (home_team == 0 and home_side == 'left')) - - # Apply coordinate adjustment for each row by index - adjust_coordinates(idx, switch_sides) - - # Save the processed DataFrame to a CSV file - if save_path is not 
None: - df.to_csv(save_path, index=False) - - return df - -if __name__=="__main__": - df_path = os.getcwd() + "/test/sports/event_data/data/statsbomb_skillcorner/test_data.csv" - save_path = os.getcwd() + "/test/sports/event_data/data/statsbomb_skillcorner/track_data_preprocessed.csv" - statsbomb_skillcorner_tracking_data_preprocessing(df_path, save_path) - print("done") \ No newline at end of file From 33602c3717937f83428d4ade1a05b719d3199157 Mon Sep 17 00:00:00 2001 From: KurodaKento0505 Date: Mon, 29 Dec 2025 23:59:58 +0900 Subject: [PATCH 3/7] update README.md of phase data --- .../sports/phase_data/phase_class.py | 4 +- .../sports/phase_data/soccer/README.md | 10 +- .../phase_data/soccer/soccer_load_data.py | 550 +++--------------- .../phase_data/soccer/soccer_phase_class.py | 23 +- 4 files changed, 76 insertions(+), 511 deletions(-) diff --git a/preprocessing/sports/phase_data/phase_class.py b/preprocessing/sports/phase_data/phase_class.py index e745ba0..24ec6cb 100644 --- a/preprocessing/sports/phase_data/phase_class.py +++ b/preprocessing/sports/phase_data/phase_class.py @@ -1,6 +1,6 @@ class Phase_data: - soccer_data_provider = ['bepro', 'statsbomb_skillcorner', 'pff_fc'] - other_soccer_data_provider = ['robocup_2d', 'datastadium'] + soccer_data_provider = ['bepro', 'statsbomb_skillcorner'] + other_soccer_data_provider = ['pff_fc', 'robocup_2d', 'datastadium'] handball_data_provider = [] rocket_league_data_provider = ['carball'] diff --git a/preprocessing/sports/phase_data/soccer/README.md b/preprocessing/sports/phase_data/soccer/README.md index 39a2f2a..6c08873 100644 --- a/preprocessing/sports/phase_data/soccer/README.md +++ b/preprocessing/sports/phase_data/soccer/README.md @@ -8,7 +8,6 @@ You can find detailed documentation on supported data providers [here](https://o - Bepro - Statsbomb and Skillcorner -- (PFF FC) For data format examples, visit [Kloppy](https://github.com/PySport/kloppy/tree/master/kloppy/tests/files) @@ -18,11 +17,8 @@ For 
information on supported preprocessing methods, visit [this documentation](h ## Examples Here are some examples of how to download and preprocess data: -- **Wyscout Data (NMSTPP format):** - - [Read the Docs Example](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Event_data/Example/Football/Example_1/contents.html) - - [GitHub Example](https://github.com/open-starlab/PreProcessing/blob/master/example/NMSTPP_data.py) - - [Colab Example](https://colab.research.google.com/drive/1c7pAwXDVaT_XTYNHvgvxgmxj0E-6IEKH?authuser=1#scrollTo=p9AZJWlYfJYs) +- **Bepro Data:** + - [Read the Docs Example](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Phase_data/Data_Provider/Football/Bepro/contents.html) - **StatsBomb and SkillCorner Data:** - - [Read the Docs Example](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Event_data/Example/Football/Example_2/contents.html) - - [GitHub Example](https://github.com/open-starlab/PreProcessing/blob/master/example/statsbomb_skillcorner.py) + - [Read the Docs Example](https://openstarlab.readthedocs.io/en/latest/Pre_Processing/Sports/Phase_data/Data_Provider/Football/Statsbomb_Skillcorner/contents.html) \ No newline at end of file diff --git a/preprocessing/sports/phase_data/soccer/soccer_load_data.py b/preprocessing/sports/phase_data/soccer/soccer_load_data.py index 16cf4f3..a30c02e 100644 --- a/preprocessing/sports/phase_data/soccer/soccer_load_data.py +++ b/preprocessing/sports/phase_data/soccer/soccer_load_data.py @@ -1,11 +1,8 @@ -#Target data provider [Metrica,Robocup 2D simulation,Statsbomb,Wyscout,Opta data,DataFactory,sportec] - import json import pandas as pd pd.set_option('future.no_silent_downcasting', True) import numpy as np import xml.etree.ElementTree as ET -# from statsbombpy import sb import os import pickle from typing import List, Dict, Any @@ -19,11 +16,9 @@ def load_bepro(tracking_xml_path: str, tracking_json_paths: list, event_path: st coordinates to the correct 
scale for analysis. Args: + tracking_path (str): Path to the XML or JSON file containing tracking data. event_path (str): Path to the CSV file containing event data. - tracking_path (str): Path to the XML file containing tracking data. meta_path (str): Path to the XML file containing match metadata (pitch, teams, players, etc.). - verbose (bool, optional): If True, prints additional information about the merging process and - feature extraction. Default is False. Returns: pd.DataFrame: A DataFrame containing the merged and processed event and tracking data, @@ -48,42 +43,29 @@ def extract_tracking_data_from_xml(xml_path: str) -> List[Dict[str, Any]]: frame_number = int(frame.get("frameNumber")) match_time = int(frame.get("matchTime")) - # 処理対象の要素を の両方に拡張 - # findall("*") を使用することで、 の直下にある全ての要素(player, ballなど)を取得 for element in frame.findall("*"): - - # タグ名に基づいて player_id と loc の属性名を設定 if element.tag == "player": player_id = element.get("playerId") loc = element.get("loc") elif element.tag == "ball": - # ⭐ 変更点: タグの場合、player_id を "ball" とし、属性を取得 player_id = "ball" loc = element.get("loc") else: - # 予期しないタグはスキップ continue - - # loc 情報が存在しない場合はスキップ if loc is None: continue - # Convert loc string to float coordinates try: - # loc の形式は "[x, y]" を想定 x, y = map(float, loc.strip("[]").split(",")) - # 座標変換とデータ追加 tracking_data.append({ "frame": frame_number, "match_time": match_time, "player_id": player_id, - # 座標の正規化解除とフォーマット適用 (元のコードのロジックを維持) "x": "{:.2f}".format(x * 105 - 52.5), "y": "{:.2f}".format(y * 68 - 34.0) }) except ValueError: - # loc の形式が不正な場合 raise ValueError(f"Invalid location format for player {player_id} in frame {frame_number}") return tracking_data @@ -118,59 +100,39 @@ def extract_tracking_data_from_json(json_path: str, period: str) -> List[Dict[st return tracking_data def devide_by_period(tracking_data_list: List[dict]) -> List[pd.DataFrame]: - """ - トラッキングデータのリストに 'period' 列を追加し、periodごとに分割した - DataFrameのリストを返す。 - - 
frame番号が大きく減少する(リセットされる)ごとにperiodをインクリメントし、 - その直前の行で期間を終了する。 + """Splits tracking data into multiple DataFrames based on period resets detected via frame number decreases. Args: - tracking_data_list (list of dict): tracking_dataを格納したリスト。 + tracking_data_list (List[dict]): A list of dictionaries containing raw tracking data. + Returns: - List[pd.DataFrame]: 'period' 列が追加され、期間ごとに正確に分割されたDataFrameのリスト。 + List[pd.DataFrame]: A list of DataFrames, each representing a single period with an added 'period' column. """ if not tracking_data_list: return [] - # 1. リストをPandas DataFrameに変換し、オリジナルのインデックスを保持 df = pd.DataFrame(tracking_data_list) - # 2. periodの境界となるインデックス(frame番号がリセットされる行)を特定 - # 各フレームの最初の行のみを取得 first_occurrence_of_frame = df.drop_duplicates(subset=['frame', 'match_time'], keep='first') - # frame番号の差分を計算し、負になる箇所(リセット)を検出 - # .diff() は Series を返すため、インデックスは first_occurrence_of_frame のインデックスと一致する frame_diff = first_occurrence_of_frame['frame'].diff().fillna(0) period_reset_indices = frame_diff[frame_diff < 0].index - # 3. 分割点のインデックスリストを作成 - # リストの先頭 (0) を開始点として追加 split_indices = [0] - # リセットされたフレームのインデックスを取得 - # df.index.get_loc() を使わずに、直接 df のインデックスで操作する + for reset_idx in period_reset_indices: - # リセットが行われるフレームの直前のインデックスを分割点に追加 - # reset_idx は first_occurrence_of_frame のインデックスであり、df のインデックスと一致する if reset_idx > 0: split_indices.append(reset_idx) - - # リストの末尾(データの最終インデックス+1)を終了点として追加 + split_indices.append(len(df)) - # 重複を削除し、ソート split_indices = sorted(list(set(split_indices))) period_df_list = [] - # 4. 
分割とperiod番号の割り当て for i in range(len(split_indices) - 1): start_idx = split_indices[i] end_idx = split_indices[i+1] current_period = i + 1 - # DataFrameをスライス period_df = df.iloc[start_idx:end_idx].copy() - # 'period' 列を割り当て period_df.loc[:, 'period'] = current_period - # 不要な一時列をクリーンアップ(ここでは既に df に period がマッピングされていないので不要だが、念のため) period_df_list.append(period_df.reset_index(drop=True)) return period_df_list @@ -247,7 +209,6 @@ def extract_meta_info_from_json(json_path: str) -> dict: team_id = str(team_data.get("team_id")) team_name = str(team_data.get("team_name")) - # プレイヤー情報を保存 if "players" in team_data: for player in team_data["players"]: player_id = str(player.get("player_id")) @@ -278,93 +239,65 @@ def get_inplay_start_time(event_df: pd.DataFrame) -> pd.DataFrame: """ event_df = event_df.copy() - # List of strings specified as in-play start events START_EVENT_STRINGS = ['goalKick', 'throwIn', 'cornerKick', 'freeKick', 'goalAgainst'] - # 1. Extract the string up to the first space in 'filtered_event_types' - # Since NaN values may be present, replace them with an empty string ('') before str.split(). event_df.loc[:, 'first_event_type'] = event_df['filtered_event_types'].fillna('').str.split(' ').str[0] - # 2. Create a flag column to detect the start frame - # The first row is always considered the start of an in-play sequence is_start_frame = pd.Series(False, index=event_df.index) is_start_frame.iloc[0] = True - # Detect events containing the specified strings is_restart_event = event_df['first_event_type'].isin(START_EVENT_STRINGS) - # 3. 
Apply the restart logic - # Restart events other than 'goalAgainst': The current row marks the start of a new in-play sequence is_normal_restart = is_restart_event & (event_df['first_event_type'] != 'goalAgainst') is_start_frame = is_start_frame | is_normal_restart - # 'goalAgainst' restart event: The **next frame** marks the start of a new in-play sequence is_goal_against = event_df['first_event_type'] == 'goalAgainst' - # Set True for the row immediately following 'goalAgainst' (using shift(-1), the last row is ignored) - # This is OR combined with is_start_frame shifted_goal_against = is_goal_against.shift(periods=-1) filled_shifted = shifted_goal_against.fillna(False).astype(bool) is_start_frame = is_start_frame.astype(bool) is_start_frame = is_start_frame | filled_shifted - # 4. Calculate the in-play number - # Calculate the cumulative sum, which increments at every True (start frame) instance - # Since True is treated as 1 and False as 0, cumsum() yields the in-play number event_df.loc[:, 'inplay_num'] = is_start_frame.cumsum().astype(int) - # 5. Post-processing - # Delete the helper column created during intermediate processing and return the result event_df = event_df.drop(columns=['first_event_type'], errors='ignore') return event_df def get_tracking(tracking_df: pd.DataFrame, event_df: pd.DataFrame, player_info_df: Dict[str, Dict[str, str]]) -> pd.DataFrame: """ - トラッキングデータをフレームごとに集約し、チームサイドとポジション順に並べ替えた - ワイドフォーマットのDataFrameを作成し、インプレー番号を割り当てる。 + Aggregates tracking data per frame into a wide-format DataFrame sorted by team side and position, and assigns in-play IDs. Args: - tracking_df (pd.DataFrame): 処理されたトラッキングデータ (frame, period, x, y, player_idなどを含む)。 - event_df (pd.DataFrame): 処理されたイベントデータ (match_time, period, inplay_numなどを含む)。 - player_info_df (Dict[str, Dict[str, str]]): player_idに対するポジション、チームID、サイド情報を持つ辞書。 + tracking_df (pd.DataFrame): Processed tracking data containing frame, period, coordinates, and player IDs. 
+ event_df (pd.DataFrame): Processed event data containing match_time, period, and inplay_num. + player_info_df (Dict[str, Dict[str, str]]): Metadata mapping player IDs to positions, team IDs, and team sides. Returns: - pd.DataFrame: フレームごとのワイドフォーマットトラッキングデータ。 + pd.DataFrame: A wide-format tracking DataFrame structured frame-by-frame with normalized team orientations. """ - - # 標準的なポジション順序 (1から11の番号付けに使用) POSITION_ORDER = ['GK', 'CB', 'RWB', 'RB', 'LWB', 'LB', 'CDM', 'RM', 'CM', 'LM', 'CAM', 'RW', 'LW', 'CF'] - FPS = 25 # トラッキングデータのフレームレート # ----------------------------------------------- - # 0. プレイヤー情報の結合と前処理 + # 0. Player Information Merging and Preprocessing # ----------------------------------------------- event_df = event_df.copy() - # player_info_dfをDataFrameに変換し、トラッキングデータにマージ player_map_df = pd.DataFrame.from_dict(player_info_df, orient='index').reset_index().rename( columns={'index': 'player_id', 'side': 'team_side', 'team_name': 'team_name'} ) - # player_idの型を揃える tracking_df['player_id'] = tracking_df['player_id'].astype(str) - # プレイヤーのメタデータをトラッキングデータに結合 tracking_df = pd.merge(tracking_df, player_map_df, on='player_id', how='left') - # ボールの行のメタデータ ('player_id'='ball') を補完 tracking_df.loc[tracking_df['player_id'] == 'ball', ['team_id', 'team_name', 'team_side', 'position', 'player_name']] = \ ['ball', 'ball', 'ball', 'ball', 'ball'] # ----------------------------------------------- - # 1. チームサイド (left/right) の決定 (最初のフレームで固定) + # 1. 
Determine Team Side (left/right) based on initial frame # ----------------------------------------------- - - # 最初のフレームのGKデータのみを抽出 target_frame = tracking_df['frame'].min() + 10 gk_data_initial = tracking_df[(tracking_df['position'] == 'GK') & (tracking_df['frame'] == target_frame)] - # x座標が最小(マイナス側)のチームを 'left' チームとする left_team_id = gk_data_initial.loc[gk_data_initial['x'].idxmin(), 'team_id'] - # チームのメタデータを格納する辞書を作成(ワイドフォーマットの列作成に使用) team_meta = {} unique_teams = tracking_df[tracking_df['team_id'] != 'ball'][['team_id', 'team_name', 'team_side']].drop_duplicates() @@ -373,62 +306,38 @@ def get_tracking(tracking_df: pd.DataFrame, event_df: pd.DataFrame, player_info_ team_meta[f'{current_side}_team_id'] = row['team_id'] team_meta[f'{current_side}_team_name'] = row['team_name'] - team_meta[f'{current_side}_team_side'] = row['team_side'] # home/away + team_meta[f'{current_side}_team_side'] = row['team_side'] # ----------------------------------------------- - # 2. インプレー番号 (inplay_num) の割り当てロジック + # 2. Assign In-Play Numbers (inplay_num) # ----------------------------------------------- - - # tracking_df に inplay_num 列を追加し、全て NaN で初期化 - # このコードは関数内での処理を想定しているため、DataFrameのコピーを直接修正します。 tracking_df['inplay_num'] = np.nan - # 1. event_dfから各インプレーの開始/終了時刻を決定 - - # 'inplay_num' と 'match_time' の組み合わせを取得し、インプレー番号でソート inplay_times = event_df[['inplay_num', 'event_time']].drop_duplicates().sort_values('inplay_num') - # 各インプレー番号の開始時刻と終了時刻を計算 inplay_periods = inplay_times.groupby('inplay_num')['event_time'].agg(['min', 'max']).reset_index() inplay_periods.columns = ['inplay_num', 'start_time', 'end_time'] - # 2. 
tracking_df に inplay_num を割り当て - - # Period ごとに処理を行い、割り当てを確実にする for period in tracking_df['period'].unique(): - - # 当該ピリオドの tracking_df を抽出 - p_tracking = tracking_df[tracking_df['period'] == period].copy() - - # 当該ピリオドのインプレー期間を抽出 p_inplay_periods = inplay_periods.copy() - - # 各インプレー期間に対して tracking_df に inplay_num を割り当て + for _, row in p_inplay_periods.iterrows(): current_inplay_num = row['inplay_num'] start_time = row['start_time'] end_time = row['end_time'] - # 'match_time' が 'start_time' 以上かつ 'true_end_time' 以下のフレームに 'inplay_num' を設定 - # NumPyのwhere条件を使用して高速に処理 - - # グローバルな tracking_df のインデックスを取得 mask_index = tracking_df[ (tracking_df['period'] == period) & (tracking_df['match_time'] >= start_time) & (tracking_df['match_time'] <= end_time) ].index - # マスクされた行に inplay_num を割り当てる tracking_df.loc[mask_index, 'inplay_num'] = current_inplay_num - # 割り当てられなかった NaN の inplay_num はインプレー間の中断フレームと見なされます。 - # 最終的な final_tracking_df は tracking_df そのものです。 final_tracking_df = tracking_df.copy() # ----------------------------------------------- - # 3. プレイヤー順序の決定と結合キーの作成 + # 3. 
Determine Player Ordering and Join Keys # ----------------------------------------------- - is_player = (final_tracking_df['player_id'] != 'ball') side_calculated = np.where( final_tracking_df['team_id'] == left_team_id, @@ -442,26 +351,15 @@ def get_tracking(tracking_df: pd.DataFrame, event_df: pd.DataFrame, player_info_ final_tracking_df.loc[is_player, 'side'] = side_series.loc[is_player] final_tracking_df.loc[final_tracking_df['player_id'] == 'ball', 'side'] = 'ball' - # ポジションの順序をマッピング pos_map = {pos: order for order, pos in enumerate(POSITION_ORDER, 1)} - - # プレイヤーのみをフィルタリング player_df = final_tracking_df[final_tracking_df['player_id'] != 'ball'].copy() - - # ポジションの順序番号をDataFrameに追加 player_df.loc[:, 'pos_order'] = player_df['position'].map(pos_map) - - # 各チーム・各フレーム内でポジション順に連番 (1から11) を作成 player_df.loc[:, 'pos_rank'] = player_df.groupby(['frame', 'side'])['pos_order'].rank(method='first').astype(int) - - # ワイドフォーマットの列名を作成: 例: 'left_1_x', 'right_11_y' player_df.loc[:, 'variable'] = player_df['side'] + '_' + player_df['pos_rank'].astype(str) # ----------------------------------------------- - # 4. プレイヤーデータのワイドフォーマット化 (Pivot) + # 4. Transform Player Data to Wide Format (Pivot) # ----------------------------------------------- - - # ワイド化する値列をリスト化 value_cols = ['x', 'y', 'player_id', 'player_name', 'position'] wide_data_list = [] @@ -472,57 +370,44 @@ def get_tracking(tracking_df: pd.DataFrame, event_df: pd.DataFrame, player_info_ columns='variable', values=col, aggfunc='first' - ).add_suffix(f'_{col.replace("player_id", "id").replace("player_name", "name")}') # player_id -> left_1_id + ).add_suffix(f'_{col.replace("player_id", "id").replace("player_name", "name")}') wide_data_list.append(pivot_df) - # 全てのピボットテーブルを結合 wide_player_df = wide_data_list[0].join(wide_data_list[1:]) # ----------------------------------------------- - # 5. ボールデータとチームメタデータの抽出・結合 + # 5. 
Extract and Merge Ball Data and Team Metadata # ----------------------------------------------- - - # ボールデータを抽出 ball_df = final_tracking_df[final_tracking_df['player_id'] == 'ball'][['frame', 'x', 'y', 'match_time', 'period', 'inplay_num']].rename( columns={'x': 'ball_x', 'y': 'ball_y'} ).set_index(['frame', 'match_time', 'period', 'inplay_num']) - - # プレイヤーデータにボールデータを結合 final_tracking_df = wide_player_df.join(ball_df).reset_index() - # チームメタデータを追加 for col, value in team_meta.items(): final_tracking_df[col] = value # ----------------------------------------------- - # 6. 最終的な列の整形と順序調整 + # 6. Final Column Formatting and Reordering # ----------------------------------------------- - - # プレイヤー列を ID, Name, Position, x, y の順で生成 ordered_player_cols = [] for side in ['left', 'right']: - for i in range(1, 12): # 1番から11番まで + for i in range(1, 12): prefix = f'{side}_{i}_' - # ID, Name, Positionはデータに存在しない可能性もあるため、チェックしてから追加 ordered_player_cols.append(prefix + 'id') ordered_player_cols.append(prefix + 'name') ordered_player_cols.append(prefix + 'position') ordered_player_cols.append(prefix + 'x') ordered_player_cols.append(prefix + 'y') - # 最終的な列順序 (要望の形式に合わせる) base_cols = ['period', 'inplay_num', 'frame', 'match_time', 'ball_x', 'ball_y'] - # チームメタデータ列 team_cols = [] for side in ['left', 'right']: team_cols.extend([f'{side}_team_id', f'{side}_team_name', f'{side}_team_side']) final_cols = base_cols + team_cols + ordered_player_cols - - # 必要な列のみを選択し、順序を調整 (存在しない列は無視される) final_tracking_df = final_tracking_df.reindex(columns=final_cols) return final_tracking_df @@ -563,11 +448,10 @@ def load_statsbomb_skillcorner(sb_event_path: str, sc_tracking_path: str, sc_mat Load and merge StatsBomb event data with SkillCorner tracking data. Args: - statsbomb_event_dir (str): Directory path for StatsBomb event data. - skillcorner_tracking_dir (str): Directory path for SkillCorner tracking data. - skillcorner_match_dir (str): Directory path for SkillCorner match data. 
- statsbomb_match_id (str): Match ID for StatsBomb data. - skillcorner_match_id (str): Match ID for SkillCorner data. + statsbomb_event_path (str): File path for StatsBomb event data. + skillcorner_tracking_path (str): File path for SkillCorner tracking data. + skillcorner_match_path (str): File path for SkillCorner match data. + skillcorner_players_path (str): File path for SkillCorner players data. Returns: pd.DataFrame: Combined DataFrame with event and tracking data. @@ -579,31 +463,27 @@ def extract_meta_info_from_match(sc_match: dict, sc_players: list) -> dict: Args: sc_match (dict): Dataframe of match data file. + sc_players (dict): List of players data file. Returns: dict: Dictionary in the format: {team_id: {'team_name': str, 'team_side': str}}, {player_id: {'position': str, 'team_id': str, 'side': str}}. """ - # 結果を格納する辞書の初期化 team_meta_df = {} player_meta_df = {} player_trackable_map = {p['id']: p.get('trackable_object') for p in sc_players} - # 1. チーム情報の作成 - # Home Team home_id = sc_match['home_team']['id'] team_meta_df[home_id] = { 'team_name': sc_match['home_team']['name'], 'team_side': 'home' } - # Away Team away_id = sc_match['away_team']['id'] team_meta_df[away_id] = { 'team_name': sc_match['away_team']['name'], 'team_side': 'away' } - # 2. 選手情報の作成 for p in sc_match['players']: player_id = p['id'] trackable_id = player_trackable_map.get(player_id) @@ -617,6 +497,17 @@ def extract_meta_info_from_match(sc_match: dict, sc_players: list) -> dict: return team_meta_df, player_meta_df def get_left_team_id(sc_tracking, team_meta_df, player_meta_df): + """ + Identifies which team ID is attacking the left side of the pitch based on the goalkeeper's position. + + Args: + sc_tracking (list): Raw tracking data containing frame-by-frame object positions. + team_meta_df (dict/pd.DataFrame): Metadata for teams including names and IDs. + player_meta_df (dict/pd.DataFrame): Metadata for players including their team IDs and positions. 
+ + Returns: + int or None: The team ID assigned to the left side (x < 0), or None if not found. + """ all_team_ids = list(team_meta_df.keys()) for frame_data in sc_tracking: if frame_data['data']==None: @@ -636,28 +527,33 @@ def get_left_team_id(sc_tracking, team_meta_df, player_meta_df): def process_all_tracking(sc_tracking, team_meta_df, player_meta_df, left_team_id): """ - 全フレームをループし、ポジション順にソートされたフラットなリストを返す。 + Iterates through all frames to return a flattened DataFrame sorted by tactical positions. + + Args: + sc_tracking (list): Raw tracking data list. + team_meta_df (dict): Metadata containing team details. + player_meta_df (dict): Metadata containing player details. + left_team_id (int): The ID of the team currently on the left side. + + Returns: + pd.DataFrame: Processed tracking data with fixed columns for ball and 22 players (sorted by position). """ - - # ポジションの優先順位を辞書化(スコアが低いほど若い番号に割り当てられる) POSITION_ORDER = ['GK', 'CB', 'RCB', 'LCB', 'RWB', 'RB', 'LWB', 'LB', 'CDM', 'RDM', 'LDM', 'RM', 'CM', 'LM', 'CAM', 'RW', 'LW', 'CF'] pos_priority = {pos: i for i, pos in enumerate(POSITION_ORDER)} - # 左右のチームIDを特定 all_team_ids = list(team_meta_df.keys()) right_team_id = [tid for tid in all_team_ids if tid != left_team_id][0] all_frames_processed = [] for frame_data in sc_tracking: - # 基本情報の構築 res = { 'period': int(frame_data['period']) if pd.notna(frame_data['period']) else None, - 'inplay_num': None, # 予約列 + 'inplay_num': None, 'frame': frame_data['frame'], 'match_time': frame_data['timestamp'], - 'ball_x': None, # 後で更新 - 'ball_y': None, # 後で更新 + 'ball_x': None, + 'ball_y': None, 'left_team_id': left_team_id, 'left_team_name': team_meta_df[left_team_id]['team_name'], 'left_team_side': team_meta_df[left_team_id]['team_side'], @@ -666,18 +562,15 @@ def process_all_tracking(sc_tracking, team_meta_df, player_meta_df, left_team_id 'right_team_side': team_meta_df[right_team_id]['team_side'] } - # フレーム内のデータを「ボール」と「左右の選手リスト」に分ける left_players_in_frame = [] 
right_players_in_frame = [] for obj in frame_data['data']: - # ボールの処理 if 'z' in obj: res['ball_x'] = obj['x'] res['ball_y'] = obj['y'] continue - # 選手の処理 p_id = obj['track_id'] if p_id in player_meta_df: p_info = player_meta_df[p_id] @@ -687,7 +580,7 @@ def process_all_tracking(sc_tracking, team_meta_df, player_meta_df, left_team_id 'pos': p_info['position_acronym'], 'x': obj['x'], 'y': obj['y'], - 'priority': pos_priority.get(p_info['position_acronym'], 99) # 未定義は最後尾 + 'priority': pos_priority.get(p_info['position_acronym'], 99) } if p_info['team_id'] == left_team_id: @@ -695,16 +588,11 @@ def process_all_tracking(sc_tracking, team_meta_df, player_meta_df, left_team_id else: right_players_in_frame.append(player_data) - # ------------------------------------------------------- - # ⭐ ポジション順(同ポジションならID順)でソート - # ------------------------------------------------------- left_players_sorted = sorted(left_players_in_frame, key=lambda x: (x['priority'], x['id'])) right_players_sorted = sorted(right_players_in_frame, key=lambda x: (x['priority'], x['id'])) - # ソートされた順に left_1, left_2 ... 
と格納 (最大11人) for i in range(11): idx = i + 1 - # Left Team if i < len(left_players_sorted): p = left_players_sorted[i] res[f"left_{idx}_id"] = p['id'] @@ -713,7 +601,6 @@ def process_all_tracking(sc_tracking, team_meta_df, player_meta_df, left_team_id res[f"left_{idx}_x"] = p['x'] res[f"left_{idx}_y"] = p['y'] else: - # 11人に満たない場合はNaNで埋める(列順を維持するため重要) res[f"left_{idx}_id"] = None res[f"left_{idx}_name"] = None res[f"left_{idx}_position"] = None @@ -722,7 +609,6 @@ def process_all_tracking(sc_tracking, team_meta_df, player_meta_df, left_team_id for i in range(11): idx = i + 1 - # Right Team if i < len(right_players_sorted): p = right_players_sorted[i] res[f"right_{idx}_id"] = p['id'] @@ -743,15 +629,16 @@ def process_all_tracking(sc_tracking, team_meta_df, player_meta_df, left_team_id def get_inplay_start_time(event_df: pd.DataFrame): """ - event_dfにinplay_numを追加し、各インプレーの開始情報を辞書のリストで返す。 + Assigns in-play sequence numbers and identifies start times for each sequence from event data. + + Args: + event_df (pd.DataFrame): Dataframe of match events (passes, play patterns, etc.). + + Returns: + list: A list of dictionaries, each containing 'inplay_num', 'period', and 'timestamp' for sequence starts. """ - # データのコピーを作成 df = event_df.copy() - - # 開始情報を保持するリスト(辞書を格納) inplay_info_list = [] - - # インプレー番号を初期化 current_inplay = 0 continuing_patterns = ['Regular Play', 'From Counter', 'From Keeper'] @@ -761,32 +648,25 @@ def get_inplay_start_time(event_df: pd.DataFrame): curr_ev = df.iloc[i] next_ev = df.iloc[i + 1] - # pass_type が None の場合は判定をスキップ(元のロジックを維持) if pd.isna(next_ev['pass_type']): continue - # --- インプレーの切り替わり条件判定 --- is_new_inplay = False - # 1. 
試合終了後のデータ(時間が戻る場合)対策 next_ts = pd.Timestamp(next_ev['timestamp']).round('100ms') curr_ts = pd.Timestamp(curr_ev['timestamp']).round('100ms') if next_ts < curr_ts: is_new_inplay = True - # 条件A: play_patternの変化 elif curr_ev['play_pattern'] != next_ev['play_pattern']: if next_ev['play_pattern'] not in continuing_patterns: is_new_inplay = True - # 条件B: 特定の再開イベント elif next_ev['pass_type'] in restart_types: is_new_inplay = True - # --- インプレー番号の更新と情報の記録 --- if is_new_inplay: current_inplay += 1 - # 必要な情報を辞書形式で保存 inplay_info_list.append({ 'inplay_num': current_inplay, 'period': int(next_ev['period']), @@ -797,21 +677,23 @@ def get_inplay_start_time(event_df: pd.DataFrame): def get_inplay_tracking(tracking_df: pd.DataFrame, inplay_info_list: List) -> pd.DataFrame: """ - inplay_info_listを元に、トラッキングデータにinplay_numを付与し、 - インプレー外(区間外)のデータを削除する。 + Filters tracking data to include only in-play periods and assigns sequence numbers. + + Args: + tracking_df (pd.DataFrame): Processed tracking data. + inplay_info_list (list): List of dictionaries defining start times of in-play sequences. + + Returns: + pd.DataFrame: Tracking data filtered for in-play time, sampled at 5fps (200ms intervals). """ df = tracking_df.copy() - # 1. トラッキングデータの時間を統一された日付(1900-01-01)のTimestampに変換 - # これにより「時間・分・秒」のみの純粋な比較が可能になります df['tmp_timestamp'] = pd.to_datetime( df['match_time'], format='%H:%M:%S.%f', errors='coerce' ).map(lambda x: x.replace(year=1900, month=1, day=1) if pd.notna(x) else x) - # 2. 
インプレー情報の時間も同じ日付(1900-01-01)に統一 def normalize_period_time(group): period_start = group['tmp_timestamp'].min() - # 経過時間を計算し、1900-01-01 00:00:00 からの経過に変換し直す base = pd.Timestamp('1900-01-01 00:00:00') group['tmp_timestamp'] = base + (group['tmp_timestamp'] - period_start) return group @@ -823,18 +705,15 @@ def normalize_time(ts): df = df.groupby('period', group_keys=False).apply(normalize_period_time) - # --- インプレー番号の割り当て --- for i in range(len(inplay_info_list)): current_info = inplay_info_list[i] - # 日付を1900-01-01に揃える start_time = normalize_time(current_info['timestamp']) period = current_info['period'] num = current_info['inplay_num'] period_mask = (df['period'] == period) - # 次のインプレー開始時間を取得 next_event_in_same_period = None for j in range(i + 1, len(inplay_info_list)): if int(inplay_info_list[j]['period']) == period: @@ -842,27 +721,21 @@ def normalize_time(ts): break if next_event_in_same_period is not None: - # 同じピリオド内に次のインプレーがある場合: その直前まで time_mask = (df['tmp_timestamp'] >= start_time) & (df['tmp_timestamp'] < next_event_in_same_period) else: - # そのピリオド内で最後のインプレーの場合: ピリオドの最後まで time_mask = (df['tmp_timestamp'] >= start_time) final_mask = period_mask & time_mask df.loc[final_mask, 'inplay_num'] = num - # --- データのクリーンアップ --- - # inplay_num が割り当てられなかった行(インプレー外)を削除 df = df.dropna(subset=['inplay_num']) - # tmp_timestamp を文字列フォーマットに戻す (%f はマイクロ秒なので下3桁をカット) base_time = pd.Timestamp('1900-01-01 00:00:00') df['match_time'] = (df['tmp_timestamp'] - base_time).dt.total_seconds() * 1000 df = df[df['match_time'] % 200 == 0] df['match_time'] = df['match_time'].astype(int) df = df.drop(columns=['tmp_timestamp']) - # 型を整数に戻す df['period'] = df['period'].astype(int) df['inplay_num'] = df['inplay_num'].astype(int) @@ -888,289 +761,4 @@ def normalize_time(ts): processed_tracking_df = get_inplay_tracking(tracking_df, inplay_info_list) - return processed_tracking_df - -def load_pff2metrica(event_path:str, match_id:str = None) -> pd.DataFrame: - """ - Convert PFF-style event data to 
Metrica format. - - Parameters - ---------- - event_df : pd.DataFrame - Event data from PFF dataset with columns like: - - gameEvents_period - - gameEvents_playerName - - possessionEvents_receiverPlayerName - - possessionEvents_possessionEventType - - startTime, endTime, duration - - gameEvents_homeTeam - - various outcome types for success/failure - match_id : str, optional - Match identifier to add as a column, by default None - - Returns - ------- - Metrica_df : pd.DataFrame - DataFrame in Metrica format with columns: - ['Team', 'Type', 'Subtype', 'Period', 'Start Frame', 'Start Time [s]', - 'End Frame', 'End Time [s]', 'From', 'To', 'Start X', 'Start Y', 'End X', 'End Y'] - """ - with open(event_path, 'r') as f: - event_data = json.load(f) - event_df = pd.json_normalize(event_data, sep='_') - - def type_id2name(x): - """ - Map event type codes to descriptive names. - - Parameters - ---------- - x : str | int | float | None - Event type code (e.g., 'PA', 'SH', 'FO', etc.) - - Returns - ------- - str | None - Descriptive event type name, or None if not mapped. - """ - import math - if x in ['PA']: - x = "pass" - elif x in ['CR']: - x = "cross" - # elif x == 2: - # x = "throw_in" - # elif x == 5: - # x = "corner_crossed" - # elif x == 7: - # x = "take_on" - elif x in ['FO']: - x = "foul" - elif x in ['CH']: - x = "tackle" - # elif x == 10: - # x = "interception" - elif x in ['SH']: - x = "shot" - elif x in ['CL']: - x = "clearance" - elif x in ['BC']: - x = "dribble" - # elif x == 22: - # x = "goalkick" - elif x in ['IT', 'RE', 'TC']: - x = "other" - elif x is None or (isinstance(x, (float, int)) and math.isnan(x)): - x = None - else: - print(f"Unmapped event type: {x}") - return x - def extract_player_xy(row): - """ - Extracts the (x, y) coordinates of the player involved in a game event. - - Parameters - ---------- - row : pd.Series - A row from a DataFrame containing game event and player information. 
- Expected keys: - - "gameEvents_homeTeam" (bool): True if home team, False if away team. - - "homePlayers" (list|str): List or stringified list of home team players. - - "awayPlayers" (list|str): List or stringified list of away team players. - - "gameEvents_playerId" (int): ID of the player involved in the event. - - Returns - ------- - pd.Series - A Series with coordinates: - - "start_x" - - "start_y" - - "end_x" - - "end_y" - If the player is not found, all values are None. - """ - # choose player list - if row["gameEvents_homeTeam"] is True: - player_dict = row["homePlayers"] - elif row["gameEvents_homeTeam"] is False: - player_dict = row["awayPlayers"] - else: - return pd.Series([None, None, None, None], index=["start_x", "start_y", "end_x", "end_y"]) - - # find target player - player_dict = ast.literal_eval(player_dict) if type(player_dict) == str else player_dict - target_player = next((d for d in player_dict if d["playerId"] == row["gameEvents_playerId"]), None) - - if target_player: - return pd.Series( - [target_player["x"], target_player["y"], target_player["x"], target_player["y"]], - index=["start_x", "start_y", "end_x", "end_y"] - ) - else: - return pd.Series([None, None, None, None], index=["start_x", "start_y", "end_x", "end_y"]) - - # drop row where gameEvents_startGameClock is NaN - event_df = event_df.dropna(subset=['gameEvents_startGameClock']).reset_index(drop=True) - - # set column name - column_name = ['Team', - 'Type', - 'Subtype', - 'Period', - 'Start Frame', - 'Start Time [s]', - 'End Frame', - 'End Time [s]', - 'From', - 'To', - 'Start X', - 'Start Y', - 'End X', - 'End Y'] - Metrica_df = pd.DataFrame(columns=column_name) - Metrica_df['Period'] = event_df['gameEvents_period'] - event_df[["start_x", "start_y", "end_x", "end_y"]] = event_df.apply(extract_player_xy, axis=1) - Metrica_df['Start X'] = event_df['start_x'] #- 52.5 - Metrica_df['Start Y'] = event_df['start_y'] #- 34 - Metrica_df['End X'] = event_df['end_x'] #- 52.5 - 
Metrica_df['End Y'] = event_df['end_y'] #- 34 - Metrica_df['From'] = event_df['gameEvents_playerName'] - Metrica_df['To'] = event_df['possessionEvents_receiverPlayerName'] - Metrica_df['Type'] = event_df['possessionEvents_possessionEventType'] - Metrica_df['Type'] = Metrica_df['Type'].apply(type_id2name) - - idx = event_df.index - - def col(name): - """Safe getter: returns Series aligned to df (all NaN if col missing).""" - return event_df[name] if name in event_df.columns else pd.Series(pd.NA, index=idx) - - # Raw outcome columns - pass_out = col('possessionEvents_passOutcomeType') - cross_out = col('possessionEvents_crossOutcomeType') - shot_out = col('possessionEvents_shotOutcomeType') - clr_out = col('possessionEvents_clearanceOutcomeType') - tkl_out = col('possessionEvents_challengeOutcomeType') - carry_out = col('possessionEvents_ballCarryOutcome') - touch_out = col('possessionEvents_touchOutcomeType') - - # Per-action success masks (nullable booleans) - event_df['pass_success'] = pass_out.isin(['C']) - event_df['cross_success'] = cross_out.isin(['C']) - event_df['shot_success'] = shot_out.isin(['G']) - event_df['clearance_success'] = ~clr_out.isin(['B','D']) & clr_out.notna() - event_df['tackle_success'] = tkl_out.isin(['B','C','M']) - event_df['dribble_success'] = carry_out.isin(['R']) - event_df['touch_success'] = touch_out.isin(['R']) - - # Where each action is *present* (not NaN), assign Subtype based on its success - event_df['Subtype'] = np.nan - - def apply_subtype(success_col, present_series): - """Set Subtype for rows where this action is present.""" - is_present = present_series.notna() - success = event_df[success_col] == True - fail = event_df[success_col] == False - event_df.loc[is_present & success, 'Subtype'] = 'success' - event_df.loc[is_present & fail, 'Subtype'] = 'fail' - - apply_subtype('pass_success', pass_out) - apply_subtype('cross_success', cross_out) - apply_subtype('shot_success', shot_out) - apply_subtype('clearance_success', 
clr_out) - apply_subtype('tackle_success', tkl_out) - apply_subtype('dribble_success', carry_out) - apply_subtype('touch_success', touch_out) - Metrica_df['Subtype'] = event_df['Subtype'] - - fps = 29.97 - - Metrica_df['Start Time [s]'] = (event_df['gameEvents_startGameClock']).round().astype(int) - Metrica_df['End Time [s]'] = (event_df['duration'] + event_df['gameEvents_startGameClock']).round().astype(int) - - Metrica_df['Start Frame'] = ((event_df['startTime'] - event_df['startTime'][0]) * fps).round().astype(int) - end_frame = ((event_df['endTime'] - event_df['startTime'][0]) * fps).round() - Metrica_df['End Frame'] = end_frame.fillna(Metrica_df['Start Frame']).astype(int) - Metrica_df['Team'] = np.where(event_df['gameEvents_homeTeam'] == True, 'Home', - np.where(event_df['gameEvents_homeTeam'] == False, 'Away', None)) - - #drop rows where start_x or start_y is NaN - Metrica_df = Metrica_df.dropna(subset=['Start X', 'Start Y']) - Metrica_df = Metrica_df.reset_index(drop=True) - - if match_id is not None: - Metrica_df['match_id'] = match_id - cols = Metrica_df.columns.tolist() - cols = cols[-1:] + cols[:-1] - Metrica_df = Metrica_df[cols] - - return Metrica_df - """ - Load event data from CSV file and optionally merge with tracking data. - - Args: - event_path (str): Path to the CSV file containing event data. - match_id (str, optional): Identifier for the match. Defaults to None. - tracking_path (str, optional): Path to the CSV file containing tracking data. Defaults to None. - - Returns: - pd.DataFrame: DataFrame containing event and tracking data. 
- """ - # Load event data from CSV file - event_df = pd.read_csv(event_path) - - # Load tracking data if provided - if tracking_path: - tracking_df = pd.read_csv(tracking_path) - - # Define columns for the DataFrame - columns = ["match_id", "seconds", "event_type", "outcome", "team", "player", "start_x", "start_y", "end_x", "end_y"] - if tracking_path: - columns.extend([" l_score", " r_score", " b_x", " b_y"]) - for i in range(1, 12): - columns.extend([f" l{i}_x", f" l{i}_y"]) - for i in range(1, 12): - columns.extend([f" r{i}_x", f" r{i}_y"]) - - - # Initialize an empty list to store event details - event_list = [] - - # Iterate through event records - for index, record in event_df.iterrows(): - seconds = record.get('Time1', None) - event_type = record.get('Type', None) - outcome = record.get('Success', None) - team = record.get('Side1', None) - player = record.get('Unum1', None) - start_x = record.get('X1', None) - start_y = record.get('Y1', None) - end_x = record.get('X2', None) - end_y = record.get('Y2', None) - - # If tracking data is provided, merge with event details - if tracking_path: - if seconds in tracking_df[' cycle'].values: - tracking_record = tracking_df[tracking_df[' cycle'] == seconds] - if tracking_record.shape[0] != 1: - print(f"Error: Tracking record {index} has more than one row") - continue - - # Extract tracking data - tracking_values = tracking_record.iloc[0].to_dict() - - # tracking_values.pop(' cycle') # Remove the cycle column - tracking_values = {key: value for key, value in tracking_values.items() if key in columns} - # Append event and tracking details to the list - event_list.append([match_id, seconds, event_type, outcome, team, player, start_x, start_y, end_x, end_y, *tracking_values.values()]) - else: - # Append only event details - event_list.append([match_id, seconds, event_type, outcome, team, player, start_x, start_y, end_x, end_y]) - - # Convert the event list to a DataFrame - df = pd.DataFrame(event_list, columns=columns) - - 
# Sort the DataFrame by 'seconds' - df = df.sort_values(by="seconds").reset_index(drop=True) - - return df \ No newline at end of file + return processed_tracking_df \ No newline at end of file diff --git a/preprocessing/sports/phase_data/soccer/soccer_phase_class.py b/preprocessing/sports/phase_data/soccer/soccer_phase_class.py index 37dee12..3917c1d 100644 --- a/preprocessing/sports/phase_data/soccer/soccer_phase_class.py +++ b/preprocessing/sports/phase_data/soccer/soccer_phase_class.py @@ -1,28 +1,9 @@ #Target data provider [Metrica,Robocup 2D simulation,Statsbomb,Wyscout,Opta data,DataFactory,sportec] -''' -format of the data source -Metrica:csv and json (tracking data will be included in the future due to lack of matching data) -Robocup 2D simulation:csv and gz -Statsbomb: json -Wyscout: json -Opta data:xml -DataFactory:json -sportec:xml -DataStadium:csv -soccertrack:csv and xml -''' - -import os -import pandas as pd -from tqdm import tqdm -from concurrent.futures import ThreadPoolExecutor, as_completed - if __name__ == '__main__': import soccer_load_data else: from . 
import soccer_load_data -import pdb #create a class to wrap the data source class Soccer_phase_data: @@ -44,8 +25,8 @@ def load_data(self): df=soccer_load_data.load_bepro(self.bp_tracking_xml_path, self.bp_tracking_json_paths, self.bp_event_path, self.bp_meta_data) elif self.data_provider == 'statsbomb_skillcorner': df=soccer_load_data.load_statsbomb_skillcorner(sb_event_path=self.sb_event_path, sc_tracking_path=self.sc_tracking_path, sc_match_path=self.sc_match_path, sc_players_path=self.sc_players_path) - elif self.data_provider == 'pff_fc': - df=soccer_load_data.load_pff2metrica(self.bp_event_path) + # elif self.data_provider == 'pff_fc': + # df=soccer_load_data.load_pff2metrica(self.bp_event_path) # elif self.data_provider == 'robocup_2d': # df=soccer_load_data.load_robocup_2d(self.event_path,match_id=self.match_id,tracking_path=self.tracking_path) # elif self.data_provider == 'datastadium': From 5e8ce2a9c90ca534906c0e4001c6c8e8e2b8d011 Mon Sep 17 00:00:00 2001 From: KurodaKento0505 Date: Thu, 1 Jan 2026 14:10:23 +0900 Subject: [PATCH 4/7] update README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index d48b2c1..dd73f3e 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,9 @@ pip install -e . - [Space data in Basketball 🏀](https://github.com/open-starlab/PreProcessing/blob/master/preprocessing/sports/space_data/basketball/README.md) - [Space data in Football/Soccer ⚽](https://github.com/open-starlab/PreProcessing/blob/master/preprocessing/sports/space_data/soccer/README.md) +#### Phase Data +- [Phase data in Football/Soccer ⚽](https://github.com/open-starlab/PreProcessing/blob/master/preprocessing/sports/Phase_data/soccer/README.md) + ## RoadMap - [x] Release the package - [ ] Incorporate more functions @@ -54,6 +57,7 @@ pip install -e . Calvin Yeung
Calvin Yeung

💻 Kenjiro Ide
Kenjiro Ide

💻 Zheng Chen
Zheng Chen

💻 + Kento Kuroda
Kento Kuroda

💻 Keisuke Fujii
Keisuke Fujii

🧑‍💻 From 36d7b18d1b2b586867e80677d5319d4cc7e7914c Mon Sep 17 00:00:00 2001 From: KurodaKento0505 Date: Thu, 1 Jan 2026 14:13:32 +0900 Subject: [PATCH 5/7] update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dd73f3e..df0a30b 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ pip install -e . - [Space data in Football/Soccer ⚽](https://github.com/open-starlab/PreProcessing/blob/master/preprocessing/sports/space_data/soccer/README.md) #### Phase Data -- [Phase data in Football/Soccer ⚽](https://github.com/open-starlab/PreProcessing/blob/master/preprocessing/sports/Phase_data/soccer/README.md) +- [Phase data in Football/Soccer ⚽](https://github.com/open-starlab/PreProcessing/blob/master/preprocessing/sports/phase_data/soccer/README.md) ## RoadMap - [x] Release the package From 2ec0aca8534c4506badf846f3c46c892c79917e7 Mon Sep 17 00:00:00 2001 From: KurodaKento0505 Date: Tue, 6 Jan 2026 16:57:53 +0900 Subject: [PATCH 6/7] update version in pyproject.toml --- preprocessing/sports/phase_data/soccer/soccer_load_data.py | 1 - pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/preprocessing/sports/phase_data/soccer/soccer_load_data.py b/preprocessing/sports/phase_data/soccer/soccer_load_data.py index a30c02e..21811c2 100644 --- a/preprocessing/sports/phase_data/soccer/soccer_load_data.py +++ b/preprocessing/sports/phase_data/soccer/soccer_load_data.py @@ -1,6 +1,5 @@ import json import pandas as pd -pd.set_option('future.no_silent_downcasting', True) import numpy as np import xml.etree.ElementTree as ET import os diff --git a/pyproject.toml b/pyproject.toml index 9b1f134..9d3840c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "openstarlab_preprocessing" -version = "0.1.41" +version = "0.1.42" description = "openstarlab preprocessing package" readme = "README.md" requires-python = ">=3.8" From 
5b5db35a32d0fb378f973cc2a27ce5fb20f10774 Mon Sep 17 00:00:00 2001 From: KurodaKento0505 Date: Wed, 7 Jan 2026 16:28:31 +0900 Subject: [PATCH 7/7] remove local path from some scripts --- .../sports/phase_data/phase_class.py | 94 +++++++++---------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/preprocessing/sports/phase_data/phase_class.py b/preprocessing/sports/phase_data/phase_class.py index 24ec6cb..c41d4f6 100644 --- a/preprocessing/sports/phase_data/phase_class.py +++ b/preprocessing/sports/phase_data/phase_class.py @@ -18,52 +18,52 @@ def __new__(cls, data_provider, *args, **kwargs): raise ValueError(f'Unknown data provider: {data_provider}') -def main(): - import os - import argparse - import glob - args = argparse.ArgumentParser() - args.add_argument('--data_provider', required=True, choices=['bepro', 'statsbomb_skillcorner', 'pff_fc'], help='kind of data provider') - args.add_argument('--match_id', required=False, help='ID of match data') - args = args.parse_args() - data_provider = args.data_provider - base_dir = os.getcwd() + f"/test/sports/" - if data_provider == 'bepro': - match_ids = [str(match_id) for match_id in args.match_id.split(",")] - for match_id in match_ids: - # The format for bepro has changed from Match ID: 130000(?). 
- if int(match_id) >= 130000: - file_pattern = os.path.join(base_dir, 'tracking_data', data_provider, match_id, f"{match_id}_*_frame_data.json") - tracking_json_paths = sorted(glob.glob(file_pattern)) - meta_data = os.path.join(base_dir, 'tracking_data', data_provider, match_id, f"{match_id}_metadata.json") - event_csv_path = glob.glob(os.path.join(os.path.join(base_dir, 'event_data', data_provider, match_id), '*.csv')) - preprocessing_df=Phase_data(data_provider=data_provider, bp_tracking_json_paths=tracking_json_paths, event_path=event_csv_path[0], meta_data=meta_data).load_data() - else: - tracking_path=os.getcwd()+f"/test/sports/tracking_data/{data_provider}/{match_id}/{match_id}_tracker_box_data.xml" - meta_data = os.path.join(base_dir, 'tracking_data', data_provider, match_id, f"{match_id}_tracker_box_metadata.xml") - event_csv_path = glob.glob(os.path.join(os.path.join(base_dir, 'event_data', data_provider, match_id), '*.csv')) - preprocessing_df=Phase_data(data_provider=data_provider, bp_tracking_xml_path=tracking_path, event_path=event_csv_path[0], meta_data=meta_data).load_data() - output_file_path = os.path.join(base_dir, 'phase_data', data_provider, match_id, f"{match_id}_main_data.csv") - preprocessing_df.to_csv(output_file_path,index=False) - print(f"✅ All period tracking data saved successfully at {output_file_path}.") - elif data_provider == 'statsbomb_skillcorner': - sb_match_id = 3894537 # 843, 537 - sc_match_id = 1018887 # 1498966, 1018887 - sb_event_path=f'D:/lab/My_Research/Github/OpenSTARLab/PreProcessing/test/sports/event_data/statsbomb/{sb_match_id}_events.pkl' - sc_tracking_path=f'D:/lab/My_Research/Github/OpenSTARLab/PreProcessing/test/sports/tracking_data/skillcorner/LaLiga-2023-2024/tracking/{sc_match_id}.json' - sc_match_path=f'D:/lab/My_Research/Github/OpenSTARLab/PreProcessing/test/sports/tracking_data/skillcorner/LaLiga-2023-2024/match/{sc_match_id}.json' - 
sc_players_path='D:/lab/My_Research/Github/OpenSTARLab/PreProcessing/test/sports/tracking_data/skillcorner/LaLiga-2023-2024/players/players.json' - preprocessing_df=Phase_data(data_provider=data_provider, sb_event_path=sb_event_path, sc_tracking_path=sc_tracking_path, sc_match_path=sc_match_path, sc_players_path=sc_players_path).load_data() - output_file_dir = os.path.join(base_dir, 'phase_data', data_provider, f'{sb_match_id}_{sc_match_id}') - os.makedirs(output_file_dir, exist_ok=True) - output_file_path = os.path.join(output_file_dir, f"{sb_match_id}_{sc_match_id}_main_data.csv") - preprocessing_df.to_csv(output_file_path,index=False) - elif data_provider == 'pff_fc': - print('not yet') - output_file_path = os.path.join(base_dir, 'phase_data', data_provider, match_id, f"{match_id}_main_data.csv") - preprocessing_df.to_csv(output_file_path,index=False) - print(f"✅ All period tracking data saved successfully at {output_file_path}.") +# def main(): +# import os +# import argparse +# import glob +# args = argparse.ArgumentParser() +# args.add_argument('--data_provider', required=True, choices=['bepro', 'statsbomb_skillcorner', 'pff_fc'], help='kind of data provider') +# args.add_argument('--match_id', required=False, help='ID of match data') +# args = args.parse_args() +# data_provider = args.data_provider +# base_dir = os.getcwd() + f"path/to" +# if data_provider == 'bepro': +# match_ids = [str(match_id) for match_id in args.match_id.split(",")] +# for match_id in match_ids: +# # The format for bepro has changed from Match ID: 130000(?). 
+# if int(match_id) >= 130000: +# file_pattern = os.path.join(base_dir, 'tracking_data', data_provider, match_id, f"{match_id}_*_frame_data.json") +# tracking_json_paths = sorted(glob.glob(file_pattern)) +# meta_data = os.path.join(base_dir, 'tracking_data', data_provider, match_id, f"{match_id}_metadata.json") +# event_csv_path = glob.glob(os.path.join(os.path.join(base_dir, 'event_data', data_provider, match_id), '*.csv')) +# preprocessing_df=Phase_data(data_provider=data_provider, bp_tracking_json_paths=tracking_json_paths, event_path=event_csv_path[0], meta_data=meta_data).load_data() +# else: +# tracking_path=os.getcwd()+f"path/to/tracking_data/{data_provider}/{match_id}/{match_id}_tracker_box_data.xml" +# meta_data = os.path.join(base_dir, 'tracking_data', data_provider, match_id, f"{match_id}_tracker_box_metadata.xml") +# event_csv_path = glob.glob(os.path.join(os.path.join(base_dir, 'event_data', data_provider, match_id), '*.csv')) +# preprocessing_df=Phase_data(data_provider=data_provider, bp_tracking_xml_path=tracking_path, event_path=event_csv_path[0], meta_data=meta_data).load_data() +# output_file_path = os.path.join(base_dir, 'phase_data', data_provider, match_id, f"{match_id}_main_data.csv") +# preprocessing_df.to_csv(output_file_path,index=False) +# print(f"✅ All period tracking data saved successfully at {output_file_path}.") +# elif data_provider == 'statsbomb_skillcorner': +# sb_match_id = 3894537 # 843, 537 +# sc_match_id = 1018887 # 1498966, 1018887 +# sb_event_path=f'path/to/event_data/statsbomb/{sb_match_id}_events.pkl' +# sc_tracking_path=f'path/to/tracking_data/skillcorner/LaLiga-2023-2024/tracking/{sc_match_id}.json' +# sc_match_path=f'path/to/tracking_data/skillcorner/LaLiga-2023-2024/match/{sc_match_id}.json' +# sc_players_path='path/to/tracking_data/skillcorner/LaLiga-2023-2024/players/players.json' +# preprocessing_df=Phase_data(data_provider=data_provider, sb_event_path=sb_event_path, sc_tracking_path=sc_tracking_path, 
sc_match_path=sc_match_path, sc_players_path=sc_players_path).load_data() +# output_file_dir = os.path.join(base_dir, 'phase_data', data_provider, f'{sb_match_id}_{sc_match_id}') +# os.makedirs(output_file_dir, exist_ok=True) +# output_file_path = os.path.join(output_file_dir, f"{sb_match_id}_{sc_match_id}_main_data.csv") +# preprocessing_df.to_csv(output_file_path,index=False) +# elif data_provider == 'pff_fc': +# print('not yet') +# output_file_path = os.path.join(base_dir, 'phase_data', data_provider, match_id, f"{match_id}_main_data.csv") +# preprocessing_df.to_csv(output_file_path,index=False) +# print(f"✅ All period tracking data saved successfully at {output_file_path}.") -if __name__ == '__main__': - main() \ No newline at end of file +# if __name__ == '__main__': +# main() \ No newline at end of file