39 changes: 24 additions & 15 deletions preprocessing/sports/SAR_data/soccer/cleaning/clean_event_data.py
@@ -21,20 +21,29 @@ def get_changed_player_list(event_data: pd.DataFrame, league: str) -> Tuple[List[int], List[int]]:
Tuple[List[int], List[int]]:
Tuple containing two lists of players who have changed in the home and away teams respectively
"""
if league == "jleague" or league == "fifawc":
changed_player_list_in_home = list(
event_data.query("event_name == '交代' and home_away == 'HOME'")["jersey_number"].values.astype(int)
)
changed_player_list_in_away = list(
event_data.query("event_name == '交代' and home_away == 'AWAY'")["jersey_number"].values.astype(int)
)
elif league == "laliga":
changed_player_list_in_home = list(
event_data.query("event_name == 'Substitution' and home_away == 'HOME'")["jersey_number"].values.astype(int)
)
changed_player_list_in_away = list(
event_data.query("event_name == 'Substitution' and home_away == 'AWAY'")["jersey_number"].values.astype(int)
)
    # Fill missing jersey_number for substitution events using player_name within the same team.
sub_event_name = {"jleague": "交代", "fifawc": "交代", "laliga": "Substitution"}.get(league)
assert sub_event_name is not None, f"Unsupported league: {league}"

sub_mask = event_data["event_name"].eq(sub_event_name)
missing_mask = (
sub_mask & event_data["jersey_number"].isna() & event_data["player_name"].notna() & event_data["home_away"].notna()
)
if missing_mask.any():
known = event_data.loc[
event_data["jersey_number"].notna() & event_data["player_name"].notna() & event_data["home_away"].notna(),
["home_away", "player_name", "jersey_number"],
]
if not known.empty:
jersey_map = known.groupby(["home_away", "player_name"], sort=False)["jersey_number"].agg(
lambda s: s.value_counts().idxmax()
)
keys = pd.MultiIndex.from_frame(event_data.loc[missing_mask, ["home_away", "player_name"]])
event_data.loc[missing_mask, "jersey_number"] = keys.map(jersey_map).to_numpy()

sub_df = event_data.loc[sub_mask, ["home_away", "jersey_number"]]
changed_player_list_in_home = sub_df.loc[sub_df["home_away"].eq("HOME"), "jersey_number"].dropna().astype(int).tolist()
changed_player_list_in_away = sub_df.loc[sub_df["home_away"].eq("AWAY"), "jersey_number"].dropna().astype(int).tolist()
return changed_player_list_in_home, changed_player_list_in_away
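
A minimal sketch of the backfill technique above, with invented toy data (the column names follow the diff; the players and numbers are illustrative):

import pandas as pd

# Toy events: each substitution row is missing its jersey_number.
events = pd.DataFrame({
    "event_name": ["Pass", "Substitution", "Pass", "Substitution"],
    "home_away": ["HOME", "HOME", "AWAY", "AWAY"],
    "player_name": ["A", "A", "B", "B"],
    "jersey_number": [20, None, 7, None],
})

sub_mask = events["event_name"].eq("Substitution")
missing = sub_mask & events["jersey_number"].isna() & events["player_name"].notna()

# Most frequent jersey number per (side, player) among rows where it is known.
known = events[events["jersey_number"].notna()]
jersey_map = known.groupby(["home_away", "player_name"])["jersey_number"].agg(
    lambda s: s.value_counts().idxmax()
)

keys = pd.MultiIndex.from_frame(events.loc[missing, ["home_away", "player_name"]])
events.loc[missing, "jersey_number"] = keys.map(jersey_map).to_numpy()
print(events["jersey_number"].tolist())  # [20.0, 20.0, 7.0, 7.0]

Taking the per-player mode rather than the first match guards against occasional mis-keyed jersey numbers elsewhere in the event log.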


@@ -65,7 +74,7 @@ def get_timestamp(event_data: pd.DataFrame, league: str) -> Dict[str, int]:
"second_start_frame": event_data.loc[event_data["event_name"] == "Half Start 2", "frame_id"].values[0],
"second_end_frame": event_data.loc[event_data["event_name"] == "Half End 2", "frame_id"].values[0],
}
except:
except IndexError:
timestamp_dict = {
"first_start_frame": event_data.loc[event_data["event_name"] == "Half Start 1", "frame_id"].values[0],
"first_end_frame": event_data.loc[event_data["event_name"] == "Half End 1", "frame_id"].values[0],
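For context on the except IndexError narrowing: taking .values[0] on an empty selection raises IndexError, which is the only failure the fallback block is meant to handle. A toy sketch:

import pandas as pd

df = pd.DataFrame({"event_name": ["Half Start 1"], "frame_id": [100]})
try:
    frame = df.loc[df["event_name"] == "Half Start 2", "frame_id"].values[0]
except IndexError:
    # Empty selection has no element 0. A bare `except:` would also swallow
    # unrelated errors (KeyError, KeyboardInterrupt, ...), hiding real bugs.
    frame = None
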
@@ -325,9 +325,7 @@ def interpolate_ball_tracking_data(
interpolated_tracking_ball.duplicated(subset=["frame_id"], keep=False)
]
print("duplicated_frame_id:", duplicated_frame_id)
import pdb

pdb.set_trace()
raise AssertionError("There are still duplicate frame_ids after interpolation.")

assert interpolated_tracking_ball["frame_id"].nunique() == len(interpolated_tracking_ball)
return interpolated_tracking_ball
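
A self-contained sketch of the fail-fast pattern that replaces the pdb.set_trace() calls, using a toy frame with one duplicated frame_id:

import pandas as pd

ball = pd.DataFrame({"frame_id": [1, 2, 2, 3], "x": [0.0, 1.0, 1.1, 2.0]})
dupes = ball[ball.duplicated(subset=["frame_id"], keep=False)]
if not dupes.empty:
    print("duplicated_frame_id:", dupes)
    raise AssertionError("There are still duplicate frame_ids after interpolation.")

Raising instead of dropping into an interactive debugger keeps unattended batch preprocessing jobs from hanging; the same substitution is applied at the other pdb sites below.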
@@ -748,9 +746,7 @@ def get_player_change_log_legacy(
print("new_players_home:", new_players_home)
print("changed_player_list_in_home:", changed_player_list_in_home)
print("player_ever_on_pitch_home:", player_ever_on_pitch_home)
import pdb

pdb.set_trace()
raise AssertionError("Jersey number mismatch.")

if len(new_players_away := players_in_frame_away - player_ever_on_pitch_away) > 0:
player_change_info.extend(
@@ -973,9 +969,7 @@ def merge_ball_only_series(data):
print(f"game_id: {tracking_data['game_id'].iloc[0]}")
print(f"player_change_info: {player_change_info}")
print(f"player_on_pitch_home: {player_on_pitch_home}")
import pdb

pdb.set_trace()
raise AssertionError("Jersey number mismatch.")
else:
try:
player_on_pitch_away.remove(player_change_info["player_out"])
@@ -985,9 +979,7 @@
print(f"game_id: {tracking_data['game_id'].iloc[0]}")
print(f"player_change_info: {player_change_info}")
print(f"player_on_pitch_away: {player_on_pitch_away}")
import pdb

pdb.set_trace()
raise AssertionError("Jersey number mismatch.")

new_tracking_data = pd.concat(new_data_list)
new_tracking_data = new_tracking_data.sort_values(by=["half", "frame_id", "home_away", "jersey_number"]).reset_index(
@@ -1222,7 +1214,7 @@ def resample_tracking_data(
]
].reset_index(drop=True)

# 選手数が11を超えた場合の修正処理(交代を考慮)
# Fix processing when the number of players exceeds 11 (considering substitutions)
def fix_player_count(df):
player_data = df.query("home_away != 'BALL'").copy()
player_counts = player_data.groupby(["time_from_half_start", "half", "home_away"])["jersey_number"].nunique()
@@ -1238,7 +1230,7 @@ def fix_player_count(df):
current_players = df[current_mask]["jersey_number"].tolist()

if len(current_players) > 11:
# 前後の時刻で実際にプレーしている選手を確認
# Check actual players playing at surrounding times
time_values = sorted(
player_data[(player_data["half"] == half) & (player_data["home_away"] == team)][
"time_from_half_start"
@@ -1247,8 +1239,8 @@

current_idx = time_values.index(time_point)

# 前後10フレームの選手を確認(交代の瞬間を考慮)
# より高速なベクトル化されたアプローチ
                # Check players within 10 frames before and after (accounting for the moment of substitution)
# More efficient vectorized approach
time_values = np.array(
sorted(
player_data[(player_data["half"] == half) & (player_data["home_away"] == team)][
@@ -1259,35 +1251,35 @@

current_idx = np.where(time_values == time_point)[0][0]

# 前後10フレームのインデックス範囲を計算
# Calculate the index range for the surrounding 10 frames
start_idx = max(0, current_idx - 10)
end_idx = min(len(time_values), current_idx + 11)
context_time_range = time_values[start_idx:end_idx]

# 一度のクエリで該当する全フレームのデータを取得
# Retrieve data for all relevant frames in a single query
context_mask = (
player_data["time_from_half_start"].isin(context_time_range)
& (player_data["half"] == half)
& (player_data["home_away"] == team)
)
context_data = player_data[context_mask]

# 正常なフレーム(11人)のみをフィルタリング
# Filter only valid frames (11 players)
valid_frames = context_data.groupby("time_from_half_start")["jersey_number"].nunique()
valid_times = valid_frames[valid_frames == 11].index

# 正常なフレームの選手を集計
# Aggregate players from valid frames
context_players = set(
context_data[context_data["time_from_half_start"].isin(valid_times)]["jersey_number"].tolist()
)

# パディング選手と実選手を分離
padding_players = [p for p in current_players if p < 0] # 負の背番号はパディング選手
real_players = [p for p in current_players if p > 0] # 正の背番号は実選手
# Separate padding players and real players
padding_players = [p for p in current_players if p < 0] # Negative jersey numbers are padding players
real_players = [p for p in current_players if p > 0] # Positive jersey numbers are real players

# 実選手が11人を超えている場合の処理
# Processing when the number of real players exceeds 11
if len(real_players) > 11:
# 前後のフレームに出現する実選手を優先
# Prioritize real players appearing in surrounding frames
if len(context_players) >= 11:
context_real_players = [p for p in real_players if p in context_players]
other_real_players = [p for p in real_players if p not in context_players]
@@ -1296,23 +1288,23 @@ def fix_player_count(df):
if len(players_to_keep) < 11:
players_to_keep.extend(other_real_players[: 11 - len(players_to_keep)])
else:
# 前後のフレーム情報が不十分な場合、背番号順で選択
# If surrounding frame information is insufficient, select by jersey number
players_to_keep = sorted(real_players)[:11]
elif len(real_players) <= 11:
# 実選手が11人以下の場合、すべての実選手を保持
# If the number of real players is 11 or less, keep all real players
players_to_keep = real_players.copy()
# 不足分をパディング選手で補う(最大11人まで)
# Supplement the shortage with padding players (up to 11)
needed_padding = 11 - len(players_to_keep)
if needed_padding > 0 and padding_players:
# パディング選手を背番号順でソートして必要数だけ追加
sorted_padding = sorted(padding_players, reverse=True) # -1, -2, -3...の順
# Sort padding players by jersey number and add the required number
sorted_padding = sorted(padding_players, reverse=True) # -1, -2, -3... order
players_to_keep.extend(sorted_padding[:needed_padding])

logger.info(
f"Player selection at time {time_point}: Real={len([p for p in players_to_keep if p > 0])}, Padding={len([p for p in players_to_keep if p < 0])}"
)

# 選択されなかった選手を除去
# Remove players who were not selected
players_to_remove = [p for p in current_players if p not in players_to_keep]
remove_mask = current_mask & df["jersey_number"].isin(players_to_remove)
df = df[~remove_mask].copy()
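
A compressed sketch of the windowed repair above. Unlike the diff, which keeps duplicate timestamps in time_values, this assumes de-duplicated frame times for brevity; the data is invented:

import numpy as np
import pandas as pd

# Frame at t=0.2 spuriously contains 12 players; its neighbours have 11.
player_data = pd.DataFrame({
    "time_from_half_start": [0.1] * 11 + [0.2] * 12 + [0.3] * 11,
    "jersey_number": list(range(1, 12)) + list(range(1, 13)) + list(range(1, 12)),
})

time_values = np.array(sorted(player_data["time_from_half_start"].unique()))
current_idx = np.where(time_values == 0.2)[0][0]

# Index range for the +/-10-frame context window.
window = time_values[max(0, current_idx - 10): current_idx + 11]
context = player_data[player_data["time_from_half_start"].isin(window)]

# Keep only frames that contain exactly 11 players, then pool their jerseys.
per_frame = context.groupby("time_from_half_start")["jersey_number"].nunique()
valid_times = per_frame[per_frame == 11].index
context_players = set(
    context.loc[context["time_from_half_start"].isin(valid_times), "jersey_number"]
)
print(sorted(context_players))  # [1, ..., 11]; player 12 is the extra to drop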
@@ -1520,7 +1512,7 @@ def parse_tracking_data(x):
if isinstance(x, dict):
if len(x) == 0:
return None
# velocityキーが存在する辞書は空ではない
# Dictionaries containing the key "velocity" are not considered empty
if "velocity" in x or "acceleration" in x or "position" in x:
return x
return x
@@ -1537,7 +1529,7 @@ def parse_tracking_data(x):
def clean_empty_data(series):
def convert_empty(x):
if isinstance(x, dict):
# 重要なキーが含まれている場合は空と判定しない
# Dictionaries containing important keys are not considered empty
if any(key in x for key in ["velocity", "acceleration", "position"]):
return x
if len(x) == 0:
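
A runnable sketch of the "empty unless it carries an important key" rule; the final return None branch is an assumption, since the diff is truncated just after the length check:

def convert_empty(x):
    if isinstance(x, dict):
        # Dictionaries carrying important keys are never treated as empty.
        if any(key in x for key in ["velocity", "acceleration", "position"]):
            return x
        if len(x) == 0:
            return None  # assumed: empty dicts become missing values
    return x

assert convert_empty({}) is None
assert convert_empty({"velocity": {"x": 0.1, "y": 0.2}}) is not None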
@@ -1755,9 +1747,9 @@ def __get_player2vel(player_data: List[Dict[str, Any]]) -> Dict[str, Dict[str, float]]:
player_name = player.get("player_name")
jersey_number = player.get("jersey_number", 0)

# パディング選手(負の背番号)をスキップ
# Skip padding players (negative jersey numbers)
if jersey_number < 0:
# パディング選手のvelocityはデフォルトで0に設定
# Set velocity of padding players to zero by default
player2vel[f"padding_{jersey_number}"] = {"x": 0, "y": 0}
continue
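
A toy illustration of the padding convention used here (the player entries are invented): negative jersey numbers mark padding players, whose velocity defaults to a zero vector stored under a synthetic padding_<n> key:

player2vel = {}
for player in [
    {"player_name": "real_player", "jersey_number": 20, "velocity": {"x": 1.5, "y": -0.3}},
    {"player_name": None, "jersey_number": -1},
]:
    jersey_number = player.get("jersey_number", 0)
    if jersey_number < 0:
        # Padding players never move; store a zero vector under a synthetic key.
        player2vel[f"padding_{jersey_number}"] = {"x": 0, "y": 0}
        continue
    player2vel[player["player_name"]] = player["velocity"]
print(player2vel)  # {'real_player': {...}, 'padding_-1': {'x': 0, 'y': 0}}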

@@ -1795,18 +1787,18 @@ def __get_player2vel(player_data: List[Dict[str, Any]]) -> Dict[str, Dict[str, float]]:
player_name = d.get("player_name")
jersey_number = d.get("jersey_number", 0)

# パディング選手の処理
            # Padding players (negative jersey numbers): zero the acceleration and move on
if jersey_number < 0:
d["acceleration"] = {"x": 0, "y": 0}
continue

# 実選手の処理
# Process real players
if player_name and player_name in prev_player2vel:
d["acceleration"] = deepcopy(prev_player2vel[player_name])
elif jersey_number < 0 and f"padding_{jersey_number}" in prev_player2vel:
d["acceleration"] = {"x": 0, "y": 0}
else:
if player_name: # 実選手のみログ出力
if player_name: # Log output for real players only
# Use substitution detection for better context
is_substitution = __is_likely_substitution_scenario(player_name, jersey_number, tracking_data, idx)
substitution_note = " (substitution detected)" if is_substitution else " (unexpected absence)"
@@ -1837,7 +1829,7 @@ def __get_player2vel(player_data: List[Dict[str, Any]]) -> Dict[str, Dict[str, float]]:
player_name = d.get("player_name")
jersey_number = d.get("jersey_number", 0)

# パディング選手の処理
            # Padding players (negative jersey numbers): zero the acceleration and move on
if jersey_number < 0:
d["acceleration"] = {"x": 0, "y": 0}
continue
4 changes: 2 additions & 2 deletions preprocessing/sports/SAR_data/soccer/soccer_load_data.py
@@ -501,7 +501,7 @@ def load_single_fifawc(data_path: str, match_id: str):
with open(event_file, "r") as f:
event_df = json.load(f)

# Tracking Data処理
# Tracking Data processing
tracking_file = data_path / "Tracking Data" / f"{match_id}.jsonl.bz2"
tracking_list = []
with bz2.open(tracking_file, "rt") as f:
@@ -510,7 +510,7 @@
record = json.loads(line)
tracking_list.append(record)

# Players Data処理
# Players Data processing
metadata_file = data_path / "Metadata" / f"{match_id}.json"
roster_file = data_path / "Rosters" / f"{match_id}.json"
with open(metadata_file, "r") as f:
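A minimal standalone reader for the compressed tracking file shown above (the path and match id are illustrative):

import bz2
import json

tracking_list = []
with bz2.open("Tracking Data/10517.jsonl.bz2", "rt") as f:  # hypothetical match_id
    for line in f:
        line = line.strip()
        if line:  # skip blank lines before decoding
            tracking_list.append(json.loads(line))

Opening in text mode ("rt") lets json.loads consume each decompressed line directly, so the full file never has to fit in memory at once.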
26 changes: 13 additions & 13 deletions preprocessing/sports/SAR_data/soccer/state_preprocess/state_edms.py
@@ -67,15 +67,15 @@ def velocity_points(points, velocities):


def calculate_voronoi(players: List[Player], ball: Ball, team_info: str, key: str):
# プレイヤーの位置情報を取得し、スケール変換
# Get player position info and apply scale conversion
points = [[player.position.x + 52.5, player.position.y + 34] for player in players]

velocities = [[player.velocity.x, player.velocity.y] for player in players]

# プレイヤーのチーム情報を取得
# Get player team information
team = [player.team_name for player in players]

# ボールの位置
# Ball position
ball_loc = [ball.position.x + 52.5, ball.position.y + 34]

player_name = [player.player_name for player in players]
@@ -86,7 +86,7 @@ def calculate_voronoi(players: List[Player], ball: Ball, team_info: str, key: str):
# judge offside
filtered_points, offside_f = judge_offside(points, ball_loc, team, team_info)

# ボロノイ図を計算
# Calculate Voronoi diagram
vor = Voronoi(filtered_points)

return vor, team, player_name, offside_f
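
A minimal sketch of the coordinate shift plus Voronoi computation above, with invented player positions (offside filtering omitted):

import numpy as np
from scipy.spatial import Voronoi

# Pitch-centered coordinates: x in [-52.5, 52.5], y in [-34, 34].
positions = np.array([[-10.0, 5.0], [0.0, 0.0], [12.0, -8.0], [30.0, 20.0]])
points = positions + np.array([52.5, 34.0])  # shift into the [0,105] x [0,68] frame

vor = Voronoi(points)
print(vor.point_region)  # Voronoi region index owned by each player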
@@ -256,34 +256,34 @@ def voronoi_finite_polygons_2d_cached(vor: Voronoi, radius: Optional[float] = None):


def weighted_area(polygon, weight_image, team, team_name):
# 多角形の頂点を取得
# Get polygon vertices
if isinstance(polygon, Polygon):
vertices = np.array(polygon.exterior.coords, dtype=np.int32)
else:
raise TypeError("polygon must be a shapely.geometry.Polygon object")

# 多角形のマスクを作成
# Create a mask for the polygon
mask = np.zeros(weight_image.shape, dtype=np.uint8)

# Create a path object from the polygon vertices
path = mpath.Path(vertices)

# Create a grid of coordinates for the mask
y_coords, x_coords = np.mgrid[0:weight_image.shape[0], 0:weight_image.shape[1]]
y_coords, x_coords = np.mgrid[0 : weight_image.shape[0], 0 : weight_image.shape[1]]
points = np.vstack([x_coords.ravel(), y_coords.ravel()]).T

# Check which points are inside the polygon
mask_flat = path.contains_points(points)
mask = mask_flat.reshape(weight_image.shape).astype(np.uint8)

# team team_name と異なる場合、weight_image を x 軸方向に反転
# If team differs from team_name, flip the weight image horizontally
if team != team_name:
weight_image = np.flip(weight_image, axis=1)

# 重み画像とマスクを掛け合わせて重み付き領域を取得
# Multiply weight image and mask to get weighted region
weighted_region = weight_image * mask

# 重み付き領域の合計を計算
# Calculate the sum of the weighted region
area = np.sum(weighted_region)

return area
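
A self-contained sketch of the mask-based weighted area, with a toy weight map and rectangle; the diff's horizontal flip for the opposing team is noted in a comment:

import numpy as np
import matplotlib.path as mpath
from shapely.geometry import Polygon

weight_image = np.ones((68, 105))  # toy uniform weight map on a pitch-sized grid
polygon = Polygon([(10, 10), (40, 10), (40, 30), (10, 30)])

path = mpath.Path(np.array(polygon.exterior.coords))
y_coords, x_coords = np.mgrid[0 : weight_image.shape[0], 0 : weight_image.shape[1]]
points = np.vstack([x_coords.ravel(), y_coords.ravel()]).T
mask = path.contains_points(points).reshape(weight_image.shape)

# If the polygon belongs to the opposing team, the diff flips the weight map
# horizontally first: weight_image = np.flip(weight_image, axis=1)
area = (weight_image * mask).sum()
print(area)  # roughly 30 x 20 grid cells fall inside the rectangle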
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "openstarlab_preprocessing"
version = "0.1.42"
version = "0.1.43"
description = "openstarlab preprocessing package"
readme = "README.md"
requires-python = ">=3.8"