diff --git a/opensoar/competition/daily_results_page.py b/opensoar/competition/daily_results_page.py index ca6f17a..b1555da 100644 --- a/opensoar/competition/daily_results_page.py +++ b/opensoar/competition/daily_results_page.py @@ -24,6 +24,7 @@ def __init__(self, url): self.url = 'http://{}'.format(url) self._igc_directory = None # to be set in subclass + self._html_soup = None # to be set when the page is evaluated @property def igc_directory(self): @@ -36,20 +37,21 @@ def set_igc_directory(self, target_directory, competition_name, plane_class, dat def _get_html_soup(self) -> BeautifulSoup: # fix problem with SSL certificates # https://stackoverflow.com/questions/30551400/disable-ssl-certificate-validation-in-mechanize#35960702 - import ssl - try: - _create_unverified_https_context = ssl._create_unverified_context - except AttributeError: - # Legacy Python that doesn't verify HTTPS certificates by default - pass - else: - # Handle target environment that doesn't support HTTPS verification - ssl._create_default_https_context = _create_unverified_https_context - - # get entire html of page - html = urlopen(self.url).read() + if not self._html_soup: + import ssl + try: + _create_unverified_https_context = ssl._create_unverified_context + except AttributeError: + # Legacy Python that doesn't verify HTTPS certificates by default + pass + else: + # Handle target environment that doesn't support HTTPS verification + ssl._create_default_https_context = _create_unverified_https_context - return BeautifulSoup(html, "html.parser") + # get entire html of page + html = urlopen(self.url).read() + self._html_soup = BeautifulSoup(html, "html.parser") + return self._html_soup def igc_file_name(self, competition_id: str) -> str: """ @@ -122,4 +124,4 @@ def _select_task(tasks: List[Task]) -> Task: number_of_times_present.append(1) max_index, max_value = max(enumerate(number_of_times_present), key=operator.itemgetter(1)) - return tasks[max_index] \ No newline at end of file + return tasks[max_index] diff --git a/opensoar/competition/soaringspot.py b/opensoar/competition/soaringspot.py index 97e52b4..088ada4 100644 --- a/opensoar/competition/soaringspot.py +++ b/opensoar/competition/soaringspot.py @@ -257,46 +257,142 @@ class SoaringSpotDaily(DailyResultsPage): def __init__(self, url: str): super().__init__(url) - def _get_competitors_info(self, include_hc_competitors: bool) -> List[dict]: + def _get_competitors_info(self, include_hc_competitors: bool, include_dns_competitors: bool = False) -> List[dict]: """ - :param include_hc_competitors: include pilots which do not officially compete - :return: + Extract competitor information from a SoaringSpot daily results page. + + Args: + include_hc_competitors: Whether to include pilots competing hors-concours + include_dns_competitors: Whether to include pilots who did not start or finish + + Returns: + List of dictionaries with competitor information: + - ranking: Position in the competition or status (DNF/DNS) + - competition_id: Glider ID + - igc_url: URL to download the IGC file (None for DNF/DNS) + - pilot_name: Name of the pilot + - plane_model: Type of glider """ - - competitors_info = list() + competitors_info = [] table = self._get_html_soup().find("table") - for row in table.findAll('tr')[1:]: - if row.findAll('td')[0].text not in ["DNS", "DNF"]: - - ranking = row.findAll('td')[0].text - if ranking == 'HC': - if not include_hc_competitors: - continue - else: - ranking = int(ranking[:-1]) - - igc_url = None - competition_id = None - for link in row.findAll('a'): - data_content = link.get('data-content') - soup = BeautifulSoup(data_content, 'html.parser') - href = None - for a in soup.findAll('a'): - if 'download' in a.text.strip().lower(): - href = a.get('href') - - if href.startswith("http://") or href.startswith("https://"): # absolute URL - igc_url = href - else: # relative url - igc_url = urljoin(self.url, href) + if not table: + raise ValueError(f"Could not find results table in the page at {self.url}") + + # Default column indices (fallback values) + col_indices = { + 'ranking': 0, # First column typically has the ranking + 'cn': 2, # Third column typically has competition ID + 'pilot': 3, # Fourth column typically has pilot name + 'glider': 4 # Fifth column typically has glider model + } + + # Try to determine accurate column indices from headers + headers = table.find('thead').findAll('th') if table.find('thead') else [] + + if headers: + for i, header in enumerate(headers): + header_text = header.text.strip().lower() + + # Check for ranking column (could be # or empty for the first column) + if header_text == '#' or (i == 0 and header_text == ''): + col_indices['ranking'] = i + + # Check for CN column (competition number) + elif header_text == 'cn' or header_text.startswith('comp'): + col_indices['cn'] = i + + # Check for pilot/contestant column + elif 'contestant' in header_text or 'pilot' in header_text: + col_indices['pilot'] = i + + # Check for glider column + elif 'glider' in header_text or 'plane' in header_text: + col_indices['glider'] = i + + for row in table.findAll('tr')[1:]: # Skip header row + cells = row.findAll('td') + if not cells or len(cells) <= col_indices['cn']: # Need at least CN column + continue - competition_id = link.text.strip() + # Extract status/ranking from ranking column + ranking_idx = min(col_indices['ranking'], len(cells) - 1) + status = cells[ranking_idx].text.strip() + + # Skip DNF/DNS if not requested + if (status == "DNF" or status == "DNS") and not include_dns_competitors: + continue - competitors_info.append(dict(ranking=ranking, competition_id=competition_id, igc_url=igc_url)) + # Extract competition ID from CN column + cn_idx = min(col_indices['cn'], len(cells) - 1) + cn_cell = cells[cn_idx] + competition_id = cn_cell.text.strip() + + # Extract pilot name from pilot/contestant column + pilot_idx = min(col_indices['pilot'], len(cells) - 1) + contestant_cell = cells[pilot_idx] + pilot_name = contestant_cell.text.strip() + + # Try to find a div with flag inside contestant cell and remove it from pilot name + flag_div = contestant_cell.find('div', class_='flag') + if flag_div: + pilot_name = pilot_name.replace(flag_div.text, '').strip() + + # Extract plane model from glider column + plane_model = None + glider_idx = col_indices['glider'] + if glider_idx < len(cells): + plane_model = cells[glider_idx].text.strip() + + # Handle HC competitors + if status == "HC": + if not include_hc_competitors: + continue + ranking = status + # Handle DNF/DNS + elif status == "DNF" or status == "DNS": + ranking = status + competitors_info.append({ + "ranking": ranking, + "competition_id": competition_id, + "igc_url": None, + "pilot_name": pilot_name, + "plane_model": plane_model, + }) + continue + else: + # Normal competitors - extract numeric ranking + try: + ranking = int(status.rstrip(".")) + except ValueError: + ranking = status + + # Extract IGC URL + igc_url = None + for link in cn_cell.findAll('a'): + data_content = link.get('data-content') + if data_content: + popup_soup = BeautifulSoup(data_content, 'html.parser') + for a in popup_soup.findAll('a'): + if 'download' in a.text.lower() or '.igc' in a.text.lower(): + href = a.get('href') + if href: + if href.startswith("http://") or href.startswith("https://"): + igc_url = href + else: + igc_url = urljoin(self.url, href) + break + + competitors_info.append({ + "ranking": ranking, + "competition_id": competition_id, + "igc_url": igc_url, + "pilot_name": pilot_name, + "plane_model": plane_model, + }) return competitors_info - + def _get_competition_day_info(self) -> Tuple[str, datetime.date, str]: if self.url.startswith('https://') or self.url.startswith('http://'): diff --git a/tests/competition/test_soaringspot.py b/tests/competition/test_soaringspot.py index 569ccfc..309c447 100644 --- a/tests/competition/test_soaringspot.py +++ b/tests/competition/test_soaringspot.py @@ -130,17 +130,39 @@ def test_get_waypoints(self): self.assertEqual(w.is_line, is_line) def test_get_competitors(self): + # old format soaringspot_page = SoaringSpotDaily( 'https://www.soaringspot.com/en/sallandse-tweedaagse-2014/results/club/task-1-on-2014-06-21/daily') + competitors = soaringspot_page._get_competitors_info(include_hc_competitors=False, include_dns_competitors=True) + self.assertEqual(len(competitors), 10) + competitors = soaringspot_page._get_competitors_info(include_hc_competitors=False, include_dns_competitors=False) + self.assertEqual(len(competitors), 8) competitor_pk = soaringspot_page._get_competitors_info(include_hc_competitors=False)[2] self.assertEqual(competitor_pk['competition_id'], 'PK') self.assertEqual(competitor_pk['ranking'], 3) + self.assertEqual(competitor_pk['pilot_name'], 'Erik Berendes') + self.assertEqual(competitor_pk['plane_model'], 'Pik20D') expected_igc_url = 'https://archive.soaringspot.com/contest/013/1323/flights/2477/2597322754.igc' self.assertEqual(competitor_pk['igc_url'], expected_igc_url) + # new format + soaringspot_page = SoaringSpotDaily( + 'https://www.soaringspot.com/en_gb/pribina-cup-2025-nitra-2025/results/club/task-6-on-2025-04-24/daily') + + competitors = soaringspot_page._get_competitors_info(include_hc_competitors=False, include_dns_competitors=True) + self.assertEqual(len(competitors), 15) + competitors = soaringspot_page._get_competitors_info(include_hc_competitors=False, include_dns_competitors=False) + self.assertEqual(len(competitors), 13) + competitor_pk = soaringspot_page._get_competitors_info(include_hc_competitors=False)[1] + + self.assertEqual(competitor_pk['competition_id'], 'X11') + self.assertEqual(competitor_pk['ranking'], 2) + self.assertEqual(competitor_pk['pilot_name'], 'Kengo Matsumoto') + self.assertEqual(competitor_pk['plane_model'], 'ASW-20') + def test_get_competitors_info_relative_downloads(self): """relative IGC URLs""" soaringspot_page = SoaringSpotDaily(