Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 16 additions & 14 deletions opensoar/competition/daily_results_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def __init__(self, url):
self.url = 'http://{}'.format(url)

self._igc_directory = None # to be set in subclass
self._html_soup = None # to be set when the page is evaluated

@property
def igc_directory(self):
Expand All @@ -36,20 +37,21 @@ def set_igc_directory(self, target_directory, competition_name, plane_class, dat
def _get_html_soup(self) -> BeautifulSoup:
# fix problem with SSL certificates
# https://stackoverflow.com/questions/30551400/disable-ssl-certificate-validation-in-mechanize#35960702
import ssl
try:
_create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
# Legacy Python that doesn't verify HTTPS certificates by default
pass
else:
# Handle target environment that doesn't support HTTPS verification
ssl._create_default_https_context = _create_unverified_https_context

# get entire html of page
html = urlopen(self.url).read()
if not self._html_soup:
import ssl
try:
_create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
# Legacy Python that doesn't verify HTTPS certificates by default
pass
else:
# Handle target environment that doesn't support HTTPS verification
ssl._create_default_https_context = _create_unverified_https_context

return BeautifulSoup(html, "html.parser")
# get entire html of page
html = urlopen(self.url).read()
self._html_soup = BeautifulSoup(html, "html.parser")
return self._html_soup

def igc_file_name(self, competition_id: str) -> str:
"""
Expand Down Expand Up @@ -122,4 +124,4 @@ def _select_task(tasks: List[Task]) -> Task:
number_of_times_present.append(1)

max_index, max_value = max(enumerate(number_of_times_present), key=operator.itemgetter(1))
return tasks[max_index]
return tasks[max_index]
160 changes: 128 additions & 32 deletions opensoar/competition/soaringspot.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,46 +257,142 @@ class SoaringSpotDaily(DailyResultsPage):
def __init__(self, url: str):
super().__init__(url)

def _get_competitors_info(self, include_hc_competitors: bool, include_dns_competitors: bool = False) -> List[dict]:
    """
    Extract competitor information from a SoaringSpot daily results page.

    Args:
        include_hc_competitors: Whether to include pilots competing hors-concours
        include_dns_competitors: Whether to include pilots who did not start or finish

    Returns:
        List of dictionaries with competitor information:
        - ranking: Position in the competition or status (HC/DNF/DNS)
        - competition_id: Glider ID
        - igc_url: URL to download the IGC file (None for DNF/DNS)
        - pilot_name: Name of the pilot
        - plane_model: Type of glider

    Raises:
        ValueError: if the page contains no results table.
    """
    competitors_info = []

    table = self._get_html_soup().find("table")
    if not table:
        raise ValueError(f"Could not find results table in the page at {self.url}")

    # Default column indices (fallback for pages without a usable header row)
    col_indices = {
        'ranking': 0,  # First column typically has the ranking
        'cn': 2,       # Third column typically has competition ID
        'pilot': 3,    # Fourth column typically has pilot name
        'glider': 4,   # Fifth column typically has glider model
    }

    # Refine the column indices from the table headers when present.
    # Hoist the thead lookup so it is done once, not twice.
    thead = table.find('thead')
    headers = thead.findAll('th') if thead else []
    for i, header in enumerate(headers):
        header_text = header.text.strip().lower()
        # Ranking column: '#', or an unlabeled first column
        if header_text == '#' or (i == 0 and header_text == ''):
            col_indices['ranking'] = i
        # CN column (competition number)
        elif header_text == 'cn' or header_text.startswith('comp'):
            col_indices['cn'] = i
        # Pilot / contestant column
        elif 'contestant' in header_text or 'pilot' in header_text:
            col_indices['pilot'] = i
        # Glider column
        elif 'glider' in header_text or 'plane' in header_text:
            col_indices['glider'] = i

    for row in table.findAll('tr')[1:]:  # Skip header row
        cells = row.findAll('td')
        if not cells or len(cells) <= col_indices['cn']:  # Need at least CN column
            continue

        # Extract status/ranking from ranking column (clamped to row width)
        ranking_idx = min(col_indices['ranking'], len(cells) - 1)
        status = cells[ranking_idx].text.strip()

        # Skip DNF/DNS rows if not requested
        if status in ("DNF", "DNS") and not include_dns_competitors:
            continue

        # Extract competition ID from CN column
        cn_idx = min(col_indices['cn'], len(cells) - 1)
        cn_cell = cells[cn_idx]
        competition_id = cn_cell.text.strip()

        # Extract pilot name from pilot/contestant column
        pilot_idx = min(col_indices['pilot'], len(cells) - 1)
        contestant_cell = cells[pilot_idx]
        pilot_name = contestant_cell.text.strip()

        # Strip an embedded country-flag div's text from the pilot name, if any
        flag_div = contestant_cell.find('div', class_='flag')
        if flag_div:
            pilot_name = pilot_name.replace(flag_div.text, '').strip()

        # Extract plane model from glider column (may be absent on short rows)
        plane_model = None
        glider_idx = col_indices['glider']
        if glider_idx < len(cells):
            plane_model = cells[glider_idx].text.strip()

        if status == "HC":
            # Hors-concours pilots keep the 'HC' marker as their ranking
            if not include_hc_competitors:
                continue
            ranking = status
        elif status in ("DNF", "DNS"):
            # Did-not-fly rows carry no flight log; record them and move on
            competitors_info.append({
                "ranking": status,
                "competition_id": competition_id,
                "igc_url": None,
                "pilot_name": pilot_name,
                "plane_model": plane_model,
            })
            continue
        else:
            # Normal competitors - extract numeric ranking ('3.' -> 3),
            # falling back to the raw text for unexpected formats
            try:
                ranking = int(status.rstrip("."))
            except ValueError:
                ranking = status

        # Extract IGC URL from the download link hidden in the CN cell's
        # popup content; the first matching download link wins, so stop
        # scanning further links once one is found.
        igc_url = None
        for link in cn_cell.findAll('a'):
            data_content = link.get('data-content')
            if data_content:
                popup_soup = BeautifulSoup(data_content, 'html.parser')
                for a in popup_soup.findAll('a'):
                    if 'download' in a.text.lower() or '.igc' in a.text.lower():
                        href = a.get('href')
                        if href:
                            if href.startswith("http://") or href.startswith("https://"):
                                igc_url = href  # absolute URL
                            else:
                                igc_url = urljoin(self.url, href)  # relative URL
                            break
            if igc_url:
                break

        competitors_info.append({
            "ranking": ranking,
            "competition_id": competition_id,
            "igc_url": igc_url,
            "pilot_name": pilot_name,
            "plane_model": plane_model,
        })

    return competitors_info

def _get_competition_day_info(self) -> Tuple[str, datetime.date, str]:

if self.url.startswith('https://') or self.url.startswith('http://'):
Expand Down
22 changes: 22 additions & 0 deletions tests/competition/test_soaringspot.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,17 +130,39 @@ def test_get_waypoints(self):
self.assertEqual(w.is_line, is_line)

def test_get_competitors(self):
    # NOTE(review): live-network test — downloads real soaringspot.com pages,
    # so it depends on those pages remaining available and unchanged.

    # old format
    soaringspot_page = SoaringSpotDaily(
        'https://www.soaringspot.com/en/sallandse-tweedaagse-2014/results/club/task-1-on-2014-06-21/daily')

    # Including DNS/DNF rows yields 2 extra entries for this page (10 vs 8)
    competitors = soaringspot_page._get_competitors_info(include_hc_competitors=False, include_dns_competitors=True)
    self.assertEqual(len(competitors), 10)
    competitors = soaringspot_page._get_competitors_info(include_hc_competitors=False, include_dns_competitors=False)
    self.assertEqual(len(competitors), 8)
    # Third-ranked competitor on the old-format page
    competitor_pk = soaringspot_page._get_competitors_info(include_hc_competitors=False)[2]

    self.assertEqual(competitor_pk['competition_id'], 'PK')
    self.assertEqual(competitor_pk['ranking'], 3)
    self.assertEqual(competitor_pk['pilot_name'], 'Erik Berendes')
    self.assertEqual(competitor_pk['plane_model'], 'Pik20D')

    # Download link should resolve to an absolute archive URL
    expected_igc_url = 'https://archive.soaringspot.com/contest/013/1323/flights/2477/2597322754.igc'
    self.assertEqual(competitor_pk['igc_url'], expected_igc_url)

    # new format
    soaringspot_page = SoaringSpotDaily(
        'https://www.soaringspot.com/en_gb/pribina-cup-2025-nitra-2025/results/club/task-6-on-2025-04-24/daily')

    # Including DNS/DNF rows yields 2 extra entries for this page (15 vs 13)
    competitors = soaringspot_page._get_competitors_info(include_hc_competitors=False, include_dns_competitors=True)
    self.assertEqual(len(competitors), 15)
    competitors = soaringspot_page._get_competitors_info(include_hc_competitors=False, include_dns_competitors=False)
    self.assertEqual(len(competitors), 13)
    # Second-ranked competitor on the new-format page
    competitor_pk = soaringspot_page._get_competitors_info(include_hc_competitors=False)[1]

    self.assertEqual(competitor_pk['competition_id'], 'X11')
    self.assertEqual(competitor_pk['ranking'], 2)
    self.assertEqual(competitor_pk['pilot_name'], 'Kengo Matsumoto')
    self.assertEqual(competitor_pk['plane_model'], 'ASW-20')

def test_get_competitors_info_relative_downloads(self):
"""relative IGC URLs"""
soaringspot_page = SoaringSpotDaily(
Expand Down