2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -1,4 +1,4 @@
-default_stages: [pre-commit]
+default_stages: [pre-commit]
fail_fast: false
exclude: |
(?x)(
2 changes: 2 additions & 0 deletions camply/config/__init__.py
@@ -5,6 +5,7 @@
from .api_config import (
STANDARD_HEADERS,
RecreationBookingConfig,
+    ReserveAmericaConfig,
RIDBConfig,
YellowstoneConfig,
)
@@ -25,6 +26,7 @@
__all__ = [
"RecreationBookingConfig",
"RIDBConfig",
"ReserveAmericaConfig",
"STANDARD_HEADERS",
"CampsiteContainerFields",
"DataColumns",
8 changes: 8 additions & 0 deletions camply/config/api_config.py
@@ -97,6 +97,14 @@ class RecreationBookingConfig(APIConfig):
RATE_LIMITING = (1.01, 1.51)


class ReserveAmericaConfig(APIConfig):
"""
Reserve America API Configuration
"""

# TODO: Add cookies or other authentication to configuration


class UseDirectConfig(APIConfig):
"""
Reserve California API Configuration
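The TODO above leaves authentication unspecified. A minimal sketch of what the config could grow into, keeping the class-attribute style of RecreationBookingConfig; every name and value below is a hypothetical placeholder, not part of this PR:

class ReserveAmericaConfig(APIConfig):
    """
    Reserve America API Configuration
    """

    # Hypothetical placeholders for the authentication TODO above:
    RESERVE_AMERICA_BASE_URL = "https://www.reserveamerica.com"  # assumed endpoint
    COOKIE_ENV_VAR = "RESERVE_AMERICA_COOKIES"  # hypothetical env var holding session cookies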
3 changes: 3 additions & 0 deletions camply/providers/__init__.py
@@ -13,6 +13,7 @@
RecreationDotGovTicket,
RecreationDotGovTimedEntry,
)
+from .reserve_america.reserveamerica_provider import ReserveAmerica
from .usedirect.variations import (
AlabamaStateParks,
ArizonaStateParks,
@@ -49,6 +50,7 @@
AlabamaStateParks,
FairfaxCountyParks,
MinnesotaStateParks,
+    ReserveAmerica,
]

__all__ = [
@@ -73,4 +75,5 @@
"AlabamaStateParks",
"FairfaxCountyParks",
"MinnesotaStateParks",
"ReserveAmerica",
]
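With the provider exported, importing it mirrors the other providers; a usage sketch (the no-argument constructor is an assumption, not confirmed by this diff):

from camply.providers import ReserveAmerica

provider = ReserveAmerica()  # assumed no-arg constructor, like the other BaseProvider subclasses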
2 changes: 1 addition & 1 deletion camply/providers/recreation_dot_gov/recdotgov_provider.py
@@ -45,7 +45,7 @@ class RecreationDotGovBase(BaseProvider, ABC):

def __init__(self, api_key: Optional[str] = None):
"""
-        Initialize with Search Dates
+        Initialize API key and headers for the Recreation.gov API
"""
super().__init__()
if api_key is None:
3 changes: 3 additions & 0 deletions camply/providers/reserve_america/__init__.py
@@ -0,0 +1,3 @@
"""
ReserveAmerica __init__
"""
Empty file.
13 changes: 13 additions & 0 deletions camply/providers/reserve_america/reserve_america_scraper/items.py
@@ -0,0 +1,13 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class CampgroundAvailabilityItem(scrapy.Item):
parkId = scrapy.Field()
site = scrapy.Field()
date = scrapy.Field()
availability = scrapy.Field()
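For context, a sketch of how a spider callback might populate this item; every value is a placeholder, and the "A" availability code matches what the spider middleware below checks for:

from camply.providers.reserve_america.reserve_america_scraper.items import (
    CampgroundAvailabilityItem,
)

def parse(self, response):
    # Hypothetical spider callback -- all values are placeholders.
    item = CampgroundAvailabilityItem()
    item["parkId"] = "12345"      # placeholder park ID
    item["site"] = "A001"         # placeholder site ID
    item["date"] = "2025-04-01"   # ISO date, as parsed by the middleware
    item["availability"] = "A"    # "A" is treated as available downstream
    yield item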
@@ -0,0 +1,191 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import logging
import time
from datetime import datetime, timedelta

# Signals are used to hook middleware methods into the crawl lifecycle
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

from camply.containers.data_containers import AvailableCampsite

logger = logging.getLogger(__name__)

logging.getLogger("selenium.webdriver.remote.remote_connection").setLevel(logging.INFO)


class ReserveAmericaScraperSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.

@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s

def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.

# Should return None or raise an exception.
return None

def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.

# Must return an iterable of Request, or item objects.
for i in result:
yield i

def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.

# Should return either None or an iterable of Request or item objects.
pass

def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it does not have a response associated.

# Must return only requests (not items).
for r in start_requests:
yield r

def spider_opened(self, spider):
spider.logger.info("Spider opened: %s" % spider.name)


class CamplyReserveAmericaSpiderMiddleware:
"""
Spider middleware to convert scraped items (raw availability) into
`AvailableCampsite` objects. Accumulates them so they can be retrieved
by the spider or provider at the end of the crawl.
"""

def __init__(self):
# Store processed campsite objects here
self.available_campsites = []

@classmethod
def from_crawler(cls, crawler):
"""
Create the middleware and connect signals.
"""
middleware = cls()
# Connect signals
crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
crawler.signals.connect(middleware.item_scraped, signal=signals.item_scraped)
crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
return middleware

def spider_opened(self, spider):
logger.debug(f"Spider opened: {spider.name}")

def item_scraped(self, item, spider):
"""
Convert the scraped item (if it's available) into an `AvailableCampsite`,
then store it in `self.available_campsites`.
"""
availability = item.get("availability", "").lower()

# Only process items that are actually "Available"
if availability == "a":
# For example, parse the date from the item
date_str = item["date"] # e.g. '2025-04-01'
booking_date = datetime.strptime(date_str, "%Y-%m-%d")
# Create your `AvailableCampsite` object
campsite = AvailableCampsite(
campsite_id=item["site"],
booking_date=booking_date,
booking_end_date=booking_date + timedelta(days=1),
booking_nights=1,
campsite_site_name=str(item["site"]),
campsite_loop_name="Placeholder Loop Name",
# TODO: Replace with actual loop name
campsite_occupancy=[1, 1],
# TODO: Replace with actual occupancy
availability_status=item["availability"],
recreation_area="Placeholder Recreation Area",
# TODO: Replace with actual Recreation Area name
recreation_area_id=item["parkId"],
facility_name="Placeholder Facility Name",
# TODO: Replace with actual Facility name
facility_id=item["parkId"],
booking_url="placeholder.url",
# TODO: Replace with actual booking URL
)
self.available_campsites.append(campsite)

def spider_closed(self, spider, reason):
"""
When the spider finishes, optionally store the list of available_campsites
back onto the spider so the provider can retrieve them.
"""
spider.logger.info(f"Spider closed: {spider.name}, reason: {reason}")
spider.available_campsites = self.available_campsites


class HumanInTheDownloaderMiddleware:
    def __init__(self):
        self.driver = self.create_headful_driver()
        # TODO: Only use the headful driver if a captcha is detected.
        # [ ] Save session (cookies) that can be passed between headless and headful drivers
        # [ ] Write headless driver
        # [ ] Detect if captcha is present and switch to headful driver
        # [ ] Switch back to headless driver after captcha is solved
        # [ ] Save session (cookies) to configuration file

    @classmethod
    def from_crawler(cls, crawler):
        # Connect the spider_closed signal so spider_closed() below actually
        # runs and quits the Chrome driver; without this hookup it is never called.
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

def create_headful_driver(self):
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
service = ChromeService(ChromeDriverManager().install())
return webdriver.Chrome(service=service, options=chrome_options)
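
    def create_headless_driver(self):
        # Hypothetical sketch for the "[ ] Write headless driver" TODO above;
        # not part of this PR. Same options as the headful driver, plus
        # Chrome's headless mode.
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("--headless=new")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option("useAutomationExtension", False)
        service = ChromeService(ChromeDriverManager().install())
        return webdriver.Chrome(service=service, options=chrome_options)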

def process_request(self, request, spider):
# Process only requests marked with meta['selenium'].
if not request.meta.get("selenium"):
return None

self.driver.get(request.url)
time.sleep(3) # Wait for JavaScript elements to load.
body = self.driver.page_source

# Check for a captcha. Adjust this check as needed.
if "captcha" in body.lower():
spider.logger.info("Captcha detected!")
spider.logger.info(
"Please solve the captcha in the browser window, then press Enter to continue..."
)
input("Press Enter after solving the captcha...")
body = self.driver.page_source

# Attach the Selenium driver to the request meta so it's available later.
request.meta["driver"] = self.driver

return HtmlResponse(
url=self.driver.current_url, body=body, encoding="utf-8", request=request
)

def process_response(self, request, response, spider):
# Return the response unmodified.
return response

def process_exception(self, request, exception, spider):
        spider.logger.error(f"Exception in HumanInTheDownloaderMiddleware: {exception}")

def spider_closed(self, spider):
self.driver.quit()
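
Neither middleware is enabled anywhere in this diff. A sketch of the Scrapy settings that would activate them; the module path and priority numbers are assumptions, not taken from the PR:

# settings.py -- hypothetical module path and priorities
SPIDER_MIDDLEWARES = {
    "reserve_america_scraper.middlewares.CamplyReserveAmericaSpiderMiddleware": 543,
}
DOWNLOADER_MIDDLEWARES = {
    "reserve_america_scraper.middlewares.HumanInTheDownloaderMiddleware": 543,
}

Note that HumanInTheDownloaderMiddleware only intercepts requests that set request.meta["selenium"], so spiders must opt in per request.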
@@ -0,0 +1,33 @@
from datetime import datetime

import pandas as pd


class DataFramePipeline:
def __init__(self):
self.items = []

def process_item(self, item, spider):
self.items.append(dict(item))
return item

def close_spider(self, spider):
# Create a DataFrame
df = pd.DataFrame(self.items)

# Ensure there is only one unique parkId
if "parkId" in df.columns:
unique_park_ids = df["parkId"].unique()
if len(unique_park_ids) != 1:
spider.logger.error("Multiple or no unique parkId values found.")
return
park_id = unique_park_ids[0]
else:
spider.logger.error("parkId column not found in data.")
return

# Export to Parquet with parkId in filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
fpath = f"../data/availability_{park_id}_{timestamp}.parquet"
df.to_parquet(fpath, index=False)
spider.logger.info(f"Saved dataframe to {fpath}")