From 0d0ef339fe6085790100e02159acaf70af2f15b9 Mon Sep 17 00:00:00 2001 From: Kenjiro ide Date: Fri, 30 Jan 2026 17:17:27 +0900 Subject: [PATCH 1/5] Handle substitution jersey filling and tighten league validation --- .../soccer/cleaning/clean_event_data.py | 39 ++++++++++++------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/preprocessing/sports/SAR_data/soccer/cleaning/clean_event_data.py b/preprocessing/sports/SAR_data/soccer/cleaning/clean_event_data.py index f0cccdb..d815227 100644 --- a/preprocessing/sports/SAR_data/soccer/cleaning/clean_event_data.py +++ b/preprocessing/sports/SAR_data/soccer/cleaning/clean_event_data.py @@ -21,20 +21,29 @@ def get_changed_player_list(event_data: pd.DataFrame, league: str) -> Tuple[List Tuple[List[int], List[int]]: Tuple containing two lists of players who have changed in the home and away teams respectively """ - if league == "jleague" or league == "fifawc": - changed_player_list_in_home = list( - event_data.query("event_name == '交代' and home_away == 'HOME'")["jersey_number"].values.astype(int) - ) - changed_player_list_in_away = list( - event_data.query("event_name == '交代' and home_away == 'AWAY'")["jersey_number"].values.astype(int) - ) - elif league == "laliga": - changed_player_list_in_home = list( - event_data.query("event_name == 'Substitution' and home_away == 'HOME'")["jersey_number"].values.astype(int) - ) - changed_player_list_in_away = list( - event_data.query("event_name == 'Substitution' and home_away == 'AWAY'")["jersey_number"].values.astype(int) - ) + # Fill missing jersey_number for substitution events using player_name within same team. 
+ sub_event_name = {"jleague": "交代", "fifawc": "交代", "laliga": "Substitution"}.get(league) + assert sub_event_name is not None, f"Unsupported league: {league}" + + sub_mask = event_data["event_name"].eq(sub_event_name) + missing_mask = ( + sub_mask & event_data["jersey_number"].isna() & event_data["player_name"].notna() & event_data["home_away"].notna() + ) + if missing_mask.any(): + known = event_data.loc[ + event_data["jersey_number"].notna() & event_data["player_name"].notna() & event_data["home_away"].notna(), + ["home_away", "player_name", "jersey_number"], + ] + if not known.empty: + jersey_map = known.groupby(["home_away", "player_name"], sort=False)["jersey_number"].agg( + lambda s: s.value_counts().idxmax() + ) + keys = pd.MultiIndex.from_frame(event_data.loc[missing_mask, ["home_away", "player_name"]]) + event_data.loc[missing_mask, "jersey_number"] = keys.map(jersey_map).to_numpy() + + sub_df = event_data.loc[sub_mask, ["home_away", "jersey_number"]] + changed_player_list_in_home = sub_df.loc[sub_df["home_away"].eq("HOME"), "jersey_number"].dropna().astype(int).tolist() + changed_player_list_in_away = sub_df.loc[sub_df["home_away"].eq("AWAY"), "jersey_number"].dropna().astype(int).tolist() return changed_player_list_in_home, changed_player_list_in_away @@ -65,7 +74,7 @@ def get_timestamp(event_data: pd.DataFrame, league: str) -> Dict[str, int]: "second_start_frame": event_data.loc[event_data["event_name"] == "Half Start 2", "frame_id"].values[0], "second_end_frame": event_data.loc[event_data["event_name"] == "Half End 2", "frame_id"].values[0], } - except: + except IndexError: timestamp_dict = { "first_start_frame": event_data.loc[event_data["event_name"] == "Half Start 1", "frame_id"].values[0], "first_end_frame": event_data.loc[event_data["event_name"] == "Half End 1", "frame_id"].values[0], From 99f1d53df107d4a3c3e6cb14f3a0ef35dc1911bd Mon Sep 17 00:00:00 2001 From: Kenjiro ide Date: Fri, 30 Jan 2026 17:20:48 +0900 Subject: [PATCH 2/5] Replace pdb 
breakpoint with assertion --- .../soccer/cleaning/clean_tracking_data.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/preprocessing/sports/SAR_data/soccer/cleaning/clean_tracking_data.py b/preprocessing/sports/SAR_data/soccer/cleaning/clean_tracking_data.py index c33cc88..1ab455f 100644 --- a/preprocessing/sports/SAR_data/soccer/cleaning/clean_tracking_data.py +++ b/preprocessing/sports/SAR_data/soccer/cleaning/clean_tracking_data.py @@ -325,9 +325,7 @@ def interpolate_ball_tracking_data( interpolated_tracking_ball.duplicated(subset=["frame_id"], keep=False) ] print("duplicated_frame_id:", duplicated_frame_id) - import pdb - - pdb.set_trace() + raise AssertionError("There are still duplicate frame_ids after interpolation.") assert interpolated_tracking_ball["frame_id"].nunique() == len(interpolated_tracking_ball) return interpolated_tracking_ball @@ -748,9 +746,7 @@ def get_player_change_log_legacy( print("new_players_home:", new_players_home) print("changed_player_list_in_home:", changed_player_list_in_home) print("player_ever_on_pitch_home:", player_ever_on_pitch_home) - import pdb - - pdb.set_trace() + raise AssertionError("Jersey number mismatch.") if len(new_players_away := players_in_frame_away - player_ever_on_pitch_away) > 0: player_change_info.extend( @@ -973,9 +969,7 @@ def merge_ball_only_series(data): print(f"game_id: {tracking_data['game_id'].iloc[0]}") print(f"player_change_info: {player_change_info}") print(f"player_on_pitch_home: {player_on_pitch_home}") - import pdb - - pdb.set_trace() + raise AssertionError("Jersey number mismatch.") else: try: player_on_pitch_away.remove(player_change_info["player_out"]) @@ -985,9 +979,7 @@ def merge_ball_only_series(data): print(f"game_id: {tracking_data['game_id'].iloc[0]}") print(f"player_change_info: {player_change_info}") print(f"player_on_pitch_away: {player_on_pitch_away}") - import pdb - - pdb.set_trace() + raise AssertionError("Jersey number mismatch.") 
 new_tracking_data = pd.concat(new_data_list) new_tracking_data = new_tracking_data.sort_values(by=["half", "frame_id", "home_away", "jersey_number"]).reset_index( From c2fbeeefb1972b540a7b09f816e74859182f6eb2 Mon Sep 17 00:00:00 2001 From: Kenjiro ide Date: Fri, 30 Jan 2026 18:05:11 +0900 Subject: [PATCH 3/5] Change Japanese comments to English --- .../soccer/cleaning/clean_tracking_data.py | 54 +++++++++---------- .../soccer/state_preprocess/state_edms.py | 26 ++++----- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/preprocessing/sports/SAR_data/soccer/cleaning/clean_tracking_data.py b/preprocessing/sports/SAR_data/soccer/cleaning/clean_tracking_data.py index 1ab455f..b3bb37c 100644 --- a/preprocessing/sports/SAR_data/soccer/cleaning/clean_tracking_data.py +++ b/preprocessing/sports/SAR_data/soccer/cleaning/clean_tracking_data.py @@ -1214,7 +1214,7 @@ def resample_tracking_data( ] ].reset_index(drop=True) - # 選手数が11を超えた場合の修正処理(交代を考慮) + # Correction step for when the number of players exceeds 11 (accounting for substitutions) def fix_player_count(df): player_data = df.query("home_away != 'BALL'").copy() player_counts = player_data.groupby(["time_from_half_start", "half", "home_away"])["jersey_number"].nunique() @@ -1230,7 +1230,7 @@ def fix_player_count(df): current_players = df[current_mask]["jersey_number"].tolist() if len(current_players) > 11: - # 前後の時刻で実際にプレーしている選手を確認 + # Check actual players playing at surrounding times time_values = sorted( player_data[(player_data["half"] == half) & (player_data["home_away"] == team)][ "time_from_half_start" @@ -1239,8 +1239,8 @@ def fix_player_count(df): current_idx = time_values.index(time_point) - # 前後10フレームの選手を確認(交代の瞬間を考慮) - # より高速なベクトル化されたアプローチ + # Check the players in the surrounding 10 frames (accounting for the moment of substitution) + # More efficient vectorized approach time_values = np.array( sorted( player_data[(player_data["half"] == half) & (player_data["home_away"] == team)][ @@ -1251,12 +1251,12 @@ def 
fix_player_count(df): current_idx = np.where(time_values == time_point)[0][0] - # 前後10フレームのインデックス範囲を計算 + # Calculate the index range for the surrounding 10 frames start_idx = max(0, current_idx - 10) end_idx = min(len(time_values), current_idx + 11) context_time_range = time_values[start_idx:end_idx] - # 一度のクエリで該当する全フレームのデータを取得 + # Retrieve data for all relevant frames in a single query context_mask = ( player_data["time_from_half_start"].isin(context_time_range) & (player_data["half"] == half) @@ -1264,22 +1264,22 @@ def fix_player_count(df): ) context_data = player_data[context_mask] - # 正常なフレーム(11人)のみをフィルタリング + # Filter only valid frames (11 players) valid_frames = context_data.groupby("time_from_half_start")["jersey_number"].nunique() valid_times = valid_frames[valid_frames == 11].index - # 正常なフレームの選手を集計 + # Aggregate players from valid frames context_players = set( context_data[context_data["time_from_half_start"].isin(valid_times)]["jersey_number"].tolist() ) - # パディング選手と実選手を分離 - padding_players = [p for p in current_players if p < 0] # 負の背番号はパディング選手 - real_players = [p for p in current_players if p > 0] # 正の背番号は実選手 + # Separate padding players and real players + padding_players = [p for p in current_players if p < 0] # Negative jersey numbers are padding players + real_players = [p for p in current_players if p > 0] # Positive jersey numbers are real players - # 実選手が11人を超えている場合の処理 + # Processing when the number of real players exceeds 11 if len(real_players) > 11: - # 前後のフレームに出現する実選手を優先 + # Prioritize real players appearing in surrounding frames if len(context_players) >= 11: context_real_players = [p for p in real_players if p in context_players] other_real_players = [p for p in real_players if p not in context_players] @@ -1288,23 +1288,23 @@ def fix_player_count(df): if len(players_to_keep) < 11: players_to_keep.extend(other_real_players[: 11 - len(players_to_keep)]) else: - # 前後のフレーム情報が不十分な場合、背番号順で選択 + # If surrounding frame information is insufficient, 
select by jersey number players_to_keep = sorted(real_players)[:11] elif len(real_players) <= 11: - # 実選手が11人以下の場合、すべての実選手を保持 + # If the number of real players is 11 or less, keep all real players players_to_keep = real_players.copy() - # 不足分をパディング選手で補う(最大11人まで) + # Supplement the shortage with padding players (up to 11) needed_padding = 11 - len(players_to_keep) if needed_padding > 0 and padding_players: - # パディング選手を背番号順でソートして必要数だけ追加 - sorted_padding = sorted(padding_players, reverse=True) # -1, -2, -3...の順 + # Sort padding players by jersey number and add the required number + sorted_padding = sorted(padding_players, reverse=True) # -1, -2, -3... order players_to_keep.extend(sorted_padding[:needed_padding]) logger.info( f"Player selection at time {time_point}: Real={len([p for p in players_to_keep if p > 0])}, Padding={len([p for p in players_to_keep if p < 0])}" ) - # 選択されなかった選手を除去 + # Remove players who were not selected players_to_remove = [p for p in current_players if p not in players_to_keep] remove_mask = current_mask & df["jersey_number"].isin(players_to_remove) df = df[~remove_mask].copy() @@ -1512,7 +1512,7 @@ def parse_tracking_data(x): if isinstance(x, dict): if len(x) == 0: return None - # velocityキーが存在する辞書は空ではない + # Dictionaries containing the key "velocity" are not considered empty if "velocity" in x or "acceleration" in x or "position" in x: return x return x @@ -1529,7 +1529,7 @@ def parse_tracking_data(x): def clean_empty_data(series): def convert_empty(x): if isinstance(x, dict): - # 重要なキーが含まれている場合は空と判定しない + # Dictionaries containing important keys are not considered empty if any(key in x for key in ["velocity", "acceleration", "position"]): return x if len(x) == 0: @@ -1747,9 +1747,9 @@ def __get_player2vel(player_data: List[Dict[str, Any]]) -> Dict[str, Dict[str, f player_name = player.get("player_name") jersey_number = player.get("jersey_number", 0) - # パディング選手(負の背番号)をスキップ + # Skip padding players (negative jersey numbers) if jersey_number < 
 0: - # パディング選手のvelocityはデフォルトで0に設定 + # Set velocity of padding players to zero by default player2vel[f"padding_{jersey_number}"] = {"x": 0, "y": 0} continue @@ -1787,18 +1787,18 @@ def __get_player2vel(player_data: List[Dict[str, Any]]) -> Dict[str, Dict[str, f player_name = d.get("player_name") jersey_number = d.get("jersey_number", 0) - # パディング選手の処理 + # Process padding players (negative jersey numbers) if jersey_number < 0: d["acceleration"] = {"x": 0, "y": 0} continue - # 実選手の処理 + # Process real players if player_name and player_name in prev_player2vel: d["acceleration"] = deepcopy(prev_player2vel[player_name]) elif jersey_number < 0 and f"padding_{jersey_number}" in prev_player2vel: d["acceleration"] = {"x": 0, "y": 0} else: - if player_name: # 実選手のみログ出力 + if player_name: # Log output for real players only # Use substitution detection for better context is_substitution = __is_likely_substitution_scenario(player_name, jersey_number, tracking_data, idx) substitution_note = " (substitution detected)" if is_substitution else " (unexpected absence)" @@ -1829,7 +1829,7 @@ def __get_player2vel(player_data: List[Dict[str, Any]]) -> Dict[str, Dict[str, f player_name = d.get("player_name") jersey_number = d.get("jersey_number", 0) - # パディング選手の処理 + # Process padding players (negative jersey numbers) if jersey_number < 0: d["acceleration"] = {"x": 0, "y": 0} continue diff --git a/preprocessing/sports/SAR_data/soccer/state_preprocess/state_edms.py b/preprocessing/sports/SAR_data/soccer/state_preprocess/state_edms.py index e6efad4..2191715 100644 --- a/preprocessing/sports/SAR_data/soccer/state_preprocess/state_edms.py +++ b/preprocessing/sports/SAR_data/soccer/state_preprocess/state_edms.py @@ -67,15 +67,15 @@ def velocity_points(points, velocities): def calculate_voronoi(players: List[Player], ball: Ball, team_info: str, key: str): - # プレイヤーの位置情報を取得し、スケール変換 + # Get player position info and apply scale conversion points = [[player.position.x + 52.5, player.position.y + 34] for 
player in players] velocities = [[player.velocity.x, player.velocity.y] for player in players] - # プレイヤーのチーム情報を取得 + # Get player team information team = [player.team_name for player in players] - # ボールの位置 + # Ball position ball_loc = [ball.position.x + 52.5, ball.position.y + 34] player_name = [player.player_name for player in players] @@ -86,7 +86,7 @@ def calculate_voronoi(players: List[Player], ball: Ball, team_info: str, key: st # judge offside filtered_points, offside_f = judge_offside(points, ball_loc, team, team_info) - # ボロノイ図を計算 + # Calculate Voronoi diagram vor = Voronoi(filtered_points) return vor, team, player_name, offside_f @@ -256,34 +256,34 @@ def voronoi_finite_polygons_2d_cached(vor: Voronoi, radius: Optional[float] = No def weighted_area(polygon, weight_image, team, team_name): - # 多角形の頂点を取得 + # Get polygon vertices if isinstance(polygon, Polygon): vertices = np.array(polygon.exterior.coords, dtype=np.int32) else: raise TypeError("polygon must be a shapely.geometry.Polygon object") - # 多角形のマスクを作成 + # Create a mask for the polygon mask = np.zeros(weight_image.shape, dtype=np.uint8) - + # Create a path object from the polygon vertices path = mpath.Path(vertices) - + # Create a grid of coordinates for the mask - y_coords, x_coords = np.mgrid[0:weight_image.shape[0], 0:weight_image.shape[1]] + y_coords, x_coords = np.mgrid[0 : weight_image.shape[0], 0 : weight_image.shape[1]] points = np.vstack([x_coords.ravel(), y_coords.ravel()]).T - + # Check which points are inside the polygon mask_flat = path.contains_points(points) mask = mask_flat.reshape(weight_image.shape).astype(np.uint8) - # team が team_name と異なる場合、weight_image を x 軸方向に反転 + # If team differs from team_name, flip the weight image horizontally if team != team_name: weight_image = np.flip(weight_image, axis=1) - # 重み画像とマスクを掛け合わせて重み付き領域を取得 + # Multiply weight image and mask to get weighted region weighted_region = weight_image * mask - # 重み付き領域の合計を計算 + # Calculate the sum of the weighted 
region area = np.sum(weighted_region) return area From f83331c39bf8894f7d5bb127c2f4f95cef079df8 Mon Sep 17 00:00:00 2001 From: Kenjiro ide Date: Fri, 30 Jan 2026 18:06:04 +0900 Subject: [PATCH 4/5] Change Japanese comments to English --- preprocessing/sports/SAR_data/soccer/soccer_load_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/preprocessing/sports/SAR_data/soccer/soccer_load_data.py b/preprocessing/sports/SAR_data/soccer/soccer_load_data.py index 64c4e6a..b13924e 100644 --- a/preprocessing/sports/SAR_data/soccer/soccer_load_data.py +++ b/preprocessing/sports/SAR_data/soccer/soccer_load_data.py @@ -501,7 +501,7 @@ def load_single_fifawc(data_path: str, match_id: str): with open(event_file, "r") as f: event_df = json.load(f) - # Tracking Data処理 + # Tracking Data processing tracking_file = data_path / "Tracking Data" / f"{match_id}.jsonl.bz2" tracking_list = [] with bz2.open(tracking_file, "rt") as f: @@ -510,7 +510,7 @@ def load_single_fifawc(data_path: str, match_id: str): record = json.loads(line) tracking_list.append(record) - # Players Data処理 + # Players Data processing metadata_file = data_path / "Metadata" / f"{match_id}.json" roster_file = data_path / "Rosters" / f"{match_id}.json" with open(metadata_file, "r") as f: From b6e09d1ebbc720678a376bb6904a2ab54238a503 Mon Sep 17 00:00:00 2001 From: Kenjiro ide Date: Fri, 30 Jan 2026 21:20:38 +0900 Subject: [PATCH 5/5] Update project version from 0.1.42 to 0.1.43 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9d3840c..5338f75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "openstarlab_preprocessing" -version = "0.1.42" +version = "0.1.43" description = "openstarlab preprocessing package" readme = "README.md" requires-python = ">=3.8"