diff --git a/kloppy/infra/serializers/tracking/secondspectrum.py b/kloppy/infra/serializers/tracking/secondspectrum.py index 1ea812af8..45e4ad949 100644 --- a/kloppy/infra/serializers/tracking/secondspectrum.py +++ b/kloppy/infra/serializers/tracking/secondspectrum.py @@ -27,7 +27,7 @@ ) from kloppy.domain.services.frame_factory import create_frame -from kloppy.utils import Readable, performance_logging +from kloppy.utils import Readable, performance_logging, find_json_key from .deserializer import TrackingDataDeserializer @@ -143,6 +143,115 @@ def __validate_inputs(inputs: Dict[str, Readable]): if "raw_data" not in inputs: raise ValueError("Please specify a value for 'raw_data'") + @staticmethod + def __infer_frame_rate(raw_frames, n_samples=25): + total_time_difference = 0 + + prev_frame = None + for i, line_ in enumerate(raw_frames): + line_ = line_.strip().decode("ascii") + frame_data = json.loads(line_) + if i == 0: + prev_frame = frame_data + continue + + current_time = frame_data["gameClock"] + prev_time = prev_frame["gameClock"] + + prev_frame = frame_data + + time_difference = current_time - prev_time + total_time_difference += time_difference + + if i >= n_samples: + break + + return int(1 / (total_time_difference / n_samples)) + + @staticmethod + def __periods_from_raw_data(raw_frames): + def extract_period_boundaries(raw_frames): + start_frames = {} + end_frames = {} + + for i, line_ in enumerate(raw_frames): + line_ = line_.strip().decode("ascii") + frame_data = json.loads(line_) + + period = frame_data["period"] + game_clock = frame_data["gameClock"] + + if period not in start_frames: + start_frames[period] = game_clock + + end_frames[period] = game_clock + + # Combine results + periods = {} + for period in start_frames: + periods[period] = { + "start": start_frames[period], + "end": end_frames[period], + } + + return periods + + boundaries = extract_period_boundaries(raw_frames) + + periods = [] + for period_id, times in boundaries.items(): + periods.append( + Period( + id=int(period_id), + start_timestamp=timedelta(seconds=times["start"]), + end_timestamp=timedelta(seconds=times["end"]), + ) + ) + + return periods + + @staticmethod + def __periods_from_json_metadata(metadata_periods, frame_rate): + periods = [] + for period in metadata_periods: + start_frame_id = int(period["startFrameIdx"]) + end_frame_id = int(period["endFrameIdx"]) + if start_frame_id != 0 or end_frame_id != 0: + # Frame IDs are unix timestamps (in milliseconds) + periods.append( + Period( + id=int(period["number"]), + start_timestamp=timedelta( + seconds=start_frame_id / frame_rate + ), + end_timestamp=timedelta( + seconds=end_frame_id / frame_rate + ), + ) + ) + return periods + + @staticmethod + def __periods_from_xml_metadata(match, frame_rate): + periods = [] + for period in match.iterchildren(tag="period"): + start_frame_id = int(period.attrib["iStartFrame"]) + end_frame_id = int(period.attrib["iEndFrame"]) + if start_frame_id != 0 or end_frame_id != 0: + # Frame IDs are unix timestamps (in milliseconds) + periods.append( + Period( + id=int(period.attrib["iId"]), + start_timestamp=timedelta( + seconds=start_frame_id / frame_rate + ), + end_timestamp=timedelta( + seconds=end_frame_id / frame_rate + ), + ) + ) + return periods + def deserialize(self, inputs: SecondSpectrumInputs) -> TrackingDataset: metadata = None @@ -152,30 +261,35 @@ def deserialize(self, inputs: SecondSpectrumInputs) -> TrackingDataset: # it also contains the 'additional metadata'. 
# First do a 'peek' to determine the char first_byte = inputs.meta_data.read(1) + raw_frames = inputs.raw_data.readlines() + if first_byte == b"{": metadata = json.loads(first_byte + inputs.meta_data.read()) - frame_rate = int(metadata["fps"]) - pitch_size_height = float(metadata["pitchLength"]) - pitch_size_width = float(metadata["pitchWidth"]) - - periods = [] - for period in metadata["periods"]: - start_frame_id = int(period["startFrameIdx"]) - end_frame_id = int(period["endFrameIdx"]) - if start_frame_id != 0 or end_frame_id != 0: - # Frame IDs are unix timestamps (in milliseconds) - periods.append( - Period( - id=int(period["number"]), - start_timestamp=timedelta( - seconds=start_frame_id / frame_rate - ), - end_timestamp=timedelta( - seconds=end_frame_id / frame_rate - ), - ) - ) + frame_rate = find_json_key(metadata, "fps") + frame_rate = ( + int(frame_rate) + if frame_rate is not None + else self.__infer_frame_rate(raw_frames) + ) + + pitch_size_height = float( + find_json_key(metadata, "pitchLength") + ) + pitch_size_width = float(find_json_key(metadata, "pitchWidth")) + if not pitch_size_height or not pitch_size_width: + raise ValueError( + "Could not locate pitch dimension(s) in meta_data..." + ) + + metadata_periods = find_json_key(metadata, "periods") + + if metadata_periods is not None: + periods = self.__periods_from_json_metadata( + metadata_periods, frame_rate + ) + else: + periods = self.__periods_from_raw_data(raw_frames) else: match = objectify.fromstring( first_byte + inputs.meta_data.read() @@ -184,23 +298,7 @@ def deserialize(self, inputs: SecondSpectrumInputs) -> TrackingDataset: pitch_size_height = float(match.attrib["fPitchXSizeMeters"]) pitch_size_width = float(match.attrib["fPitchYSizeMeters"]) - periods = [] - for period in match.iterchildren(tag="period"): - start_frame_id = int(period.attrib["iStartFrame"]) - end_frame_id = int(period.attrib["iEndFrame"]) - if start_frame_id != 0 or end_frame_id != 0: - # Frame IDs are unix timestamps (in milliseconds) - periods.append( - Period( - id=int(period.attrib["iId"]), - start_timestamp=timedelta( - seconds=start_frame_id / frame_rate - ), - end_timestamp=timedelta( - seconds=end_frame_id / frame_rate - ), - ) - ) + periods = self.__periods_from_xml_metadata(match, frame_rate) # Default team initialisation home_team = Team(team_id="home", name="home", ground=Ground.HOME) @@ -215,11 +313,13 @@ def deserialize(self, inputs: SecondSpectrumInputs) -> TrackingDataset: inputs.additional_meta_data.read() ) - home_team_id = metadata["homeOptaId"] - away_team_id = metadata["awayOptaId"] + home_team = find_json_key(metadata, "HomeTeam") + away_team = find_json_key(metadata, "AwayTeam") + + if home_team is None and away_team is None: + home_team_id = metadata["homeOptaId"] + away_team_id = metadata["awayOptaId"] - # Tries to parse (short) team names from the description string - try: home_name = ( metadata["description"].split("-")[0].strip() ) @@ -229,18 +329,45 @@ def deserialize(self, inputs: SecondSpectrumInputs) -> TrackingDataset: .split(":")[0] .strip() ) - except: + elif home_team.get("Name", None) and away_team.get( + "Name", None + ): + home_team_id = home_team["Id"] + away_team_id = away_team["Id"] + home_name = home_team["Name"] + away_name = away_team["Name"] + else: home_name, away_name = "home", "away" + home_team_id, away_team_id = None, None teams[0].team_id = home_team_id teams[0].name = home_name teams[1].team_id = away_team_id teams[1].name = away_name - for team, team_str in zip( - teams, ["homePlayers", 
"awayPlayers"] - ): - for player_data in metadata[team_str]: + for team, team_str in zip(teams, ["home", "away"]): + if find_json_key(metadata, f"{team_str}Players"): + id_key = "optaId" + name_key = "name" + position_key = "position" + jersey_no_key = "number" + players = find_json_key( + metadata, f"{team_str}Players" + ) + + elif find_json_key(metadata, f"{team_str}Team"): + id_key = "Id" + name_key = "Name" + position_key = None + jersey_no_key = "JerseyNumber" + players = find_json_key( + metadata, f"{team_str}Team" + )["Players"] + + else: + raise Exception() + + for player_data in players: # We use the attributes field of Player to store the extra IDs provided by the # metadata. We designate the player_id to be the 'optaId' field as this is what's # used as 'player_id' in the raw frame data file @@ -250,16 +377,21 @@ def deserialize(self, inputs: SecondSpectrumInputs) -> TrackingDataset: if k in ["ssiId", "optaUuid"] } + position = player_data.get(position_key, None) player = Player( - player_id=player_data["optaId"], - name=player_data["name"], - starting=player_data["position"] != "SUB", + player_id=player_data[id_key], + name=player_data[name_key], + starting=player_data[position_key] != "SUB" + if position_key is not None + else None, starting_position=position_mapping.get( - player_data["position"], + position, PositionType.Unknown, - ), + ) + if position is not None + else PositionType.Unknown, team=team, - jersey_no=int(player_data["number"]), + jersey_no=int(player_data[jersey_no_key]), attributes=player_attributes, ) team.players.append(player) @@ -279,7 +411,7 @@ def _iter(): n = 0 sample = 1 / self.sample_rate - for line_ in inputs.raw_data.readlines(): + for line_ in raw_frames: line_ = line_.strip().decode("ascii") if not line_: continue @@ -326,20 +458,26 @@ def _iter(): orientation = Orientation.NOT_SET if metadata: - score = Score( - home=metadata["homeScore"], away=metadata["awayScore"] - ) - year, month, day = ( - metadata["year"], - metadata["month"], - metadata["day"], - ) - date = datetime(year, month, day, 0, 0, tzinfo=timezone.utc) - game_id = metadata["ssiId"] - else: - score = None - date = None - game_id = None + home_goals = find_json_key(metadata, "homeScore") + away_goals = find_json_key(metadata, "awayScore") + if home_goals is not None and away_goals is not None: + score = Score(home=home_goals, away=away_goals) + else: + score = None + + year = find_json_key(metadata, "year") + month = find_json_key(metadata, "month") + day = find_json_key(metadata, "day") + + if ( + (year is not None) + and (month is not None) + and (day is not None) + ): + date = datetime(year, month, day, 0, 0, tzinfo=timezone.utc) + else: + date = find_json_key(metadata, "date") + game_id = metadata["ssiId"] or find_json_key(metadata, "MatchId") metadata = Metadata( teams=teams, diff --git a/kloppy/tests/files/second_spectrum_fake_metadata2.json b/kloppy/tests/files/second_spectrum_fake_metadata2.json new file mode 100644 index 000000000..7f02091b2 --- /dev/null +++ b/kloppy/tests/files/second_spectrum_fake_metadata2.json @@ -0,0 +1,260 @@ +{ + "MatchId": "1234456", + "CompetitionName": "FK1 - FK2 Championship", + "CompetitionId": "1234456", + "KickOffTime": { + "Date": "1900-01-26", + "DateTime": "1900-01-26T18:45:00Z", + "UTCOffsetInHours": 2 + }, + "MatchDay": "MD3", + "SeasonYear": "1900", + "HomeTeam": { + "Id": "123", + "Name": "FK1", + "Players": [ + { + "Id": "pmafwsw7759idgzwmsae8absl4s043v0o2lt", + "Name": "y9xrbe545u3h", + "JerseyNumber": 1 + }, + { + "Id": 
"5mnvb8i8hxrram5dok68zmflj2i76ihmsfnq", + "Name": "pljb4cmv0t2z", + "JerseyNumber": 2 + }, + { + "Id": "fo4kmhzrknxs1ibs10a59swk54fhi3cozhtt", + "Name": "2hnoi7fknt5w", + "JerseyNumber": 3 + }, + { + "Id": "0j5hchyosbh83y8won01kb48hvulwmogumc8", + "Name": "z2224a1am6ne", + "JerseyNumber": 4 + }, + { + "Id": "o57l9ce1nihyjb1o24azg26bcw92g5dm0ij8", + "Name": "s3al5wlky1s7", + "JerseyNumber": 5 + }, + { + "Id": "5jp3vl5s2i7ly1cpjkk6g949tctsp6vascwk", + "Name": "xfkhouu9wppp", + "JerseyNumber": 6 + }, + { + "Id": "vsj66kfzeqm5mmvyfmimvjywlcok3qcn5ty9", + "Name": "fruoqyhyio73", + "JerseyNumber": 7 + }, + { + "Id": "o47hnemi4eso46t03330m91js45c180e1pp1", + "Name": "7axcwo4x6slo", + "JerseyNumber": 8 + }, + { + "Id": "mlspcvh7rswko6ababcqmfchzztryw7jhql9", + "Name": "xwol5cxmellx", + "JerseyNumber": 9 + }, + { + "Id": "67ub77tvf4i9d627odfe6ni33sq95bpjpumt", + "Name": "hcdfo582oz4g", + "JerseyNumber": 10 + }, + { + "Id": "ounznqziz3vtxl8npt4b0roamjaka3t6zob8", + "Name": "dacpohga5lht", + "JerseyNumber": 11 + }, + { + "Id": "ctglkajjq5qb8sx7dku5yoxm4oq40w3mnzns", + "Name": "antz61bhpdqb", + "JerseyNumber": 12 + }, + { + "Id": "g4hbf5qnu7r0qbawe2hkccwoq1j5l5hxnd9i", + "Name": "jnzm4acs4b8p", + "JerseyNumber": 13 + }, + { + "Id": "ynpvn297cz6cauziwfw40d6o7mhmwu60dzmw", + "Name": "6jjbi4p2nh2c", + "JerseyNumber": 14 + }, + { + "Id": "p5jrn0nyl5ik9yn8pkiin85d5zmuetm7bkcn", + "Name": "lab60egr8a0c", + "JerseyNumber": 15 + }, + { + "Id": "4oia2is8w1iji29l85d6uy7ig64sbx78x9d5", + "Name": "q9w9yolbn7nj", + "JerseyNumber": 16 + }, + { + "Id": "o2hdxbs3fyh7rh946af5tp9q8bgtmsqk97z3", + "Name": "8t3i0cmeticj", + "JerseyNumber": 17 + }, + { + "Id": "nulvti0zc3y4ztb323us9u61n3my7wu62trp", + "Name": "vi7hl0hgu9tw", + "JerseyNumber": 18 + }, + { + "Id": "44vxxq1yggcoq8981pxhznvm9okghwphl1y2", + "Name": "h8pglht9jmsk", + "JerseyNumber": 19 + }, + { + "Id": "712jbgbruvvh680sinq5vq18y7pwonx4y00r", + "Name": "iz21cqwk37wf", + "JerseyNumber": 20 + }, + { + "Id": "712jbgbruvvh680sinq5vq18y7pwonx4y00r", + "Name": "iz21cqwk37wf", + "JerseyNumber": 21 + }, + { + "Id": "712jbgbruvvh680sinq5vq18y7pwonx4y00r", + "Name": "iz21cqwk37wf", + "JerseyNumber": 22 + }, + { + "Id": "712jbgbruvvh680sinq5vq18y7pwonx4y00r", + "Name": "iz21cqwk37wf", + "JerseyNumber": 23 + } + ] + }, + "AwayTeam": { + "Id": "456", + "Name": "FK2", + "Players": [ + { + "Id": "exkzuxbcc3f14k0oltdsvm9zanvl4su165wh", + "Name": "c6gupnmywca0", + "JerseyNumber": 1 + }, + { + "Id": "ysg04b3ailsdl9lnj383s8pvbkv79cqoadei", + "Name": "6dhssmztl3h3", + "JerseyNumber": 2 + }, + { + "Id": "7vz2bctjhdrkadsz3m2j7k7eg4s6hls1mfax", + "Name": "gyqlxo5rcvhn", + "JerseyNumber": 3 + }, + { + "Id": "65nit6f1fd7ln3hncx9uvvuu2txk3ow31g9u", + "Name": "ne8uj6ah5npc", + "JerseyNumber": 4 + }, + { + "Id": "lhe8p6vcw0cfrirmfajjpszituqdehivmpgk", + "Name": "60j1yv1rbowl", + "JerseyNumber": 5 + }, + { + "Id": "9c3a074a7gf3gih7a91t2sm8giul379lksr4", + "Name": "640fzrot7njs", + "JerseyNumber": 6 + }, + { + "Id": "6x0yiuaapy9n7o9iyc38fwgd3h3jyjl1vyge", + "Name": "bne4o954hzyv", + "JerseyNumber": 7 + }, + { + "Id": "cdby8yb4xlfn7ciypeue25ju6krya9xxza6x", + "Name": "rnlx2ina5frs", + "JerseyNumber": 8 + }, + { + "Id": "gl0nvpbkywd24ybj8mswpudoxf7b595rzso8", + "Name": "jgi0qzedhei6", + "JerseyNumber": 9 + }, + { + "Id": "wdfydi2l8yh4k6rv8y6kzv5py0aje6f45e5p", + "Name": "4kqjztd25fvo", + "JerseyNumber": 10 + }, + { + "Id": "tlvtu04csq94o3vsszv7byjnp5x8x9pmmzxv", + "Name": "toyo8gi9temj", + "JerseyNumber": 11 + }, + { + "Id": "u2zp3ezfhjs8zo55ymwpuwjnjur78kyrstj5", + "Name": "1w02mn5dc07a", + 
"JerseyNumber": 12 + }, + { + "Id": "reaa7v7o8mao0khye3ktzdwhnord0vt4be6m", + "Name": "2vspafo5in7z", + "JerseyNumber": 13 + }, + { + "Id": "skjamknv30h4co4y7f3ge34msy3hain27dqs", + "Name": "tilwmvwn05aa", + "JerseyNumber": 14 + }, + { + "Id": "npf8sg35gso4cddzar8t7ns7yolg1g5kkv4p", + "Name": "xrnp0q462tbi", + "JerseyNumber": 15 + }, + { + "Id": "ux9jbiwmum1bb3hsxtjrvavwpd1137htu6qa", + "Name": "ic7yufm76u08", + "JerseyNumber": 16 + }, + { + "Id": "ph04dvcgiw7jd45wdhduzn0mo7yuo66dfiop", + "Name": "iy75axf2n8qy", + "JerseyNumber": 17 + }, + { + "Id": "disn4hcjgz3qc2i8ixkdfjrqvw1syeec11wk", + "Name": "7gfm06ifb1y2", + "JerseyNumber": 18 + }, + { + "Id": "fjyghm80048r4nw2oxfdd1dckx5mwer7c59h", + "Name": "ur700ddnrd78", + "JerseyNumber": 19 + }, + { + "Id": "p190348pmn7u3pidfn8rvuug8axaztv1ht8e", + "Name": "zdsqx30l848t", + "JerseyNumber": 20 + }, + { + "Id": "p190348pmn7u3pidfn8rvuug8axaztv1ht8e", + "Name": "zdsqx30l848t", + "JerseyNumber": 21 + }, + { + "Id": "p190348pmn7u3pidfn8rvuug8axaztv1ht8e", + "Name": "zdsqx30l848t", + "JerseyNumber": 22 + }, + { + "Id": "p190348pmn7u3pidfn8rvuug8axaztv1ht8e", + "Name": "zdsqx30l848t", + "JerseyNumber": 23 + } + ] + }, + "Stadium": { + "Id": "62085", + "Name": "Anonymous Stadium", + "PitchLength": 104.8512, + "PitchWidth": 67.9704 + } +} \ No newline at end of file diff --git a/kloppy/tests/test_secondspectrum.py b/kloppy/tests/test_secondspectrum.py index b43db3b49..646b4445e 100644 --- a/kloppy/tests/test_secondspectrum.py +++ b/kloppy/tests/test_secondspectrum.py @@ -19,6 +19,10 @@ class TestSecondSpectrumTracking: def meta_data(self, base_dir) -> str: return base_dir / "files/second_spectrum_fake_metadata.xml" + @pytest.fixture + def meta_data2(self, base_dir) -> str: + return base_dir / "files/second_spectrum_fake_metadata2.json" + @pytest.fixture def raw_data(self, base_dir) -> str: return base_dir / "files/second_spectrum_fake_data.jsonl" @@ -139,6 +143,95 @@ def test_correct_deserialization( assert isinstance(game_id, str) assert game_id == "1234456" + def test_correct_deserialization_2( + self, meta_data2: Path, raw_data: Path, additional_meta_data: Path + ): + dataset = secondspectrum.load( + meta_data=meta_data2, + raw_data=raw_data, + additional_meta_data=additional_meta_data, + only_alive=False, + coordinates="secondspectrum", + ) + + # Check provider, type, shape, etc + assert dataset.metadata.provider == Provider.SECONDSPECTRUM + assert dataset.dataset_type == DatasetType.TRACKING + assert len(dataset.records) == 376 + assert len(dataset.metadata.periods) == 2 + assert dataset.metadata.orientation == Orientation.AWAY_HOME + + print("A", dataset.metadata.periods[0].end_timestamp) + print("B", timedelta(seconds=2982240 / 25)) + # Check the Periods + assert dataset.metadata.periods[0].id == 1 + assert dataset.metadata.periods[0].start_timestamp == timedelta( + seconds=0 + ) + assert dataset.metadata.periods[0].end_timestamp == timedelta( + seconds=2976 + ) + + assert dataset.metadata.periods[1].id == 2 + assert dataset.metadata.periods[1].start_timestamp == timedelta( + seconds=9, microseconds=720000 + ) + assert dataset.metadata.periods[1].end_timestamp == timedelta( + seconds=3017, microseconds=720000 + ) + + # Check some timestamps + assert dataset.records[0].timestamp == timedelta( + seconds=0 + ) # First frame + assert dataset.records[20].timestamp == timedelta( + seconds=320.0 + ) # Later frame + assert dataset.records[187].timestamp == timedelta( + seconds=9.72 + ) # Second period + + # Check some players + home_player = 
dataset.metadata.teams[0].players[2] + assert home_player.player_id == "8xwx2" + assert dataset.records[0].players_coordinates[home_player] == Point( + x=-8.943903672572427, y=-28.171654132650365 + ) + + away_player = dataset.metadata.teams[1].players[3] + assert away_player.player_id == "2q0uv" + assert dataset.records[0].players_coordinates[away_player] == Point( + x=-45.11871334915762, y=-20.06459030559596 + ) + + # Check the ball + assert dataset.records[1].ball_coordinates == Point3D( + x=-23.147073918432426, y=13.69367399756424, z=0.0 + ) + + # Check pitch dimensions + pitch_dimensions = dataset.metadata.pitch_dimensions + assert pitch_dimensions.x_dim.min == pytest.approx(-52.425, abs=0.001) + assert pitch_dimensions.x_dim.max == pytest.approx(52.425, abs=0.001) + assert pitch_dimensions.y_dim.min == pytest.approx(-33.985, abs=0.001) + assert pitch_dimensions.y_dim.max == pytest.approx(33.985, abs=0.001) + + # Check enriched metadata + date = dataset.metadata.date + if date: + assert isinstance(date, datetime) + assert date == datetime(1900, 1, 26, 0, 0, tzinfo=timezone.utc) + + game_week = dataset.metadata.game_week + if game_week: + assert isinstance(game_week, str) + assert game_week == "1" + + game_id = dataset.metadata.game_id + if game_id: + assert isinstance(game_id, str) + assert game_id == "1234456" + def test_correct_normalized_deserialization( self, meta_data: Path, raw_data: Path, additional_meta_data: Path ): diff --git a/kloppy/utils.py b/kloppy/utils.py index b88a3100a..204fbdb62 100644 --- a/kloppy/utils.py +++ b/kloppy/utils.py @@ -178,3 +178,38 @@ def __get__(self, instance, owner): def snake_case(s: str) -> str: """Convert a string to snake_case.""" return re.sub(r"[\s\-]+", "_", s.strip()).lower() + + +def find_json_key(data, pattern, first_only=True): + def search(obj, path=""): + results = {} + + if isinstance(obj, dict): + for key, value in obj.items(): + current_path = f"{path}.{key}" if path else key + + if pattern.lower() in key.lower(): + if first_only: + return value + results[current_path] = value + + if isinstance(value, (dict, list)): + nested = search(value, current_path) + if first_only and nested is not None: + return nested + elif not first_only: + results.update(nested) + + elif isinstance(obj, list): + for i, item in enumerate(obj): + current_path = f"{path}[{i}]" if path else f"[{i}]" + if isinstance(item, (dict, list)): + nested = search(item, current_path) + if first_only and nested is not None: + return nested + elif not first_only: + results.update(nested) + + return results if not first_only else None + + return search(data)
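For reference, a minimal sketch of how the new find_json_key helper resolves keys, using a made-up metadata dictionary (not taken from the fixtures above): matching is a case-insensitive substring test on key names, the search recurses through nested dicts and lists, and with the default first_only=True the first hit is returned, or None when nothing matches.

    from kloppy.utils import find_json_key

    # Hypothetical metadata snippet for illustration only
    metadata = {
        "Stadium": {"PitchLength": 104.8512, "PitchWidth": 67.9704},
        "HomeTeam": {"Id": "123", "Name": "FK1"},
    }

    # Key matching is case-insensitive and recurses into nested objects,
    # so both lookups below succeed even though the casing differs and the
    # keys live one level down.
    find_json_key(metadata, "pitchLength")  # -> 104.8512
    find_json_key(metadata, "hometeam")     # -> {"Id": "123", "Name": "FK1"}

    # When no key matches, None is returned (first_only=True, the default);
    # the deserializer relies on this to fall back to values inferred from
    # the raw frames, e.g. the frame rate.
    find_json_key(metadata, "fps")          # -> None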