RTIInternational · samlamont · Jan 7, 2026 · Dec 10, 2025 · Dec 11, 2025 · Dec 12, 2025
diff --git a/src/teehr/loading/s3/clone_from_s3.py b/src/teehr/loading/s3/clone_from_s3.py
@@ -49,9 +49,22 @@ def subset_the_table(
     """Subset the dataset based on location and start/end time."""
     if table.name == "locations" and primary_location_ids is not None:
         sdf_in = sdf_in.filter(sdf_in.id.isin(primary_location_ids))
-    elif table.name == "location_attributes" and primary_location_ids is not None:
+        # warn user if any primary_location_ids were excluded
+        available_ids = set(
+            sdf_in.select("id").rdd.flatMap(
+                lambda x: x).collect()
+                )
+        missing_ids = set(primary_location_ids) - available_ids
+        if len(missing_ids) > 0:
+            logger.warning(
+                "The following primary_location_ids were not found in the "
+                f"locations table and will be excluded: {missing_ids}"
+            )
+    elif table.name == "location_attributes" and \
+            primary_location_ids is not None:
         sdf_in = sdf_in.filter(sdf_in.location_id.isin(primary_location_ids))
-    elif table.name == "location_crosswalks" and primary_location_ids is not None:
+    elif table.name == "location_crosswalks" and \
+            primary_location_ids is not None:
         sdf_in = sdf_in.filter(
             sdf_in.primary_location_id.isin(primary_location_ids)
         )
@@ -64,7 +77,8 @@ def subset_the_table(
         if primary_location_ids is not None:
             secondary_ids = (
                 ev.location_crosswalks.to_sdf()
-                .select("secondary_location_id").rdd.flatMap(lambda x: x).collect()
+                .select("secondary_location_id").rdd.flatMap(
+                    lambda x: x).collect()
             )
             sdf_in = sdf_in.filter(sdf_in.location_id.isin(secondary_ids))
     elif table.name == "joined_timeseries":
@@ -119,7 +133,6 @@ def clone_from_s3(
 
     Note: future version will allow subsetting the tables to clone.
     """
-
     # Make the Evaluation directories
     logger.info(f"Creating directories for evaluation: {evaluation_name}")
     Path(ev.cache_dir).mkdir()
@@ -174,7 +187,10 @@ def clone_from_s3(
         logger.debug(f"Making directory {table.dir}")
         Path(table.dir).mkdir()
 
-        logger.debug(f"Cloning {table.name} from {s3_dataset_path}/{table.name}/ to {table.dir}")
+        logger.debug(
+            f"Cloning {table.name} from {s3_dataset_path}/{table.name}/ to "
+            f"{table.dir}"
+            )
 
         sdf_in = table._read_files(
             path=f"{s3_dataset_path}/{table.name}/",
@@ -200,7 +216,8 @@ def clone_from_s3(
     dest = f"{ev.scripts_dir}/user_defined_fields.py"
     logger.debug(f"Copying from {source}/ to {dest}")
 
-    # ToDo: there is a permission issue that prevents copying the entire directory.
+    # ToDo: there is a permission issue that prevents copying the entire
+    # directory.
     # This works for now.
     with fsspec.open(source, 'r', anon=True) as file:
         with open(dest, 'w') as f:

diff --git a/tests/fetch/test_nwm_fetch_and_format.py b/tests/fetch/test_nwm_fetch_and_format.py
@@ -49,7 +49,8 @@ def test_nwm22_point_fetch_and_format(tmpdir):
         overwrite_output=True,
         nwm_version="nwm22",
         variable_mapper=TEST_NWM_VARIABLE_MAPPER,
-        timeseries_type="secondary"
+        timeseries_type="secondary",
+        drop_overlapping_assimilation_values=False
     )
 
     parquet_file = Path(tmpdir, "20230318T14.parquet")
@@ -90,7 +91,8 @@ def test_nwm30_point_fetch_and_format(tmpdir):
         ignore_missing_file=False,
         overwrite_output=True,
         variable_mapper=TEST_NWM_VARIABLE_MAPPER,
-        timeseries_type="secondary"
+        timeseries_type="secondary",
+        drop_overlapping_assimilation_values=False
     )
 
     parquet_file = Path(tmpdir, "20231101T00.parquet")
@@ -130,7 +132,8 @@ def test_nwm30_point_fetch_and_format_medium_range_member(tmpdir):
         ignore_missing_file=False,
         overwrite_output=True,
         variable_mapper=TEST_NWM_VARIABLE_MAPPER,
-        timeseries_type="secondary"
+        timeseries_type="secondary",
+        drop_overlapping_assimilation_values=False
     )
 
     parquet_file = Path(tmpdir, "20240222T00.parquet")
@@ -163,7 +166,8 @@ def test_nwm22_grid_fetch_and_format(tmpdir):
         overwrite_output=True,
         location_id_prefix=None,
         variable_mapper=None,
-        timeseries_type="primary"
+        timeseries_type="primary",
+        drop_overlapping_assimilation_values=False
     )
 
     parquet_file = Path(tmpdir, "20201218T00.parquet")
@@ -205,7 +209,8 @@ def test_nwm30_grid_fetch_and_format(tmpdir):
         overwrite_output=True,
         location_id_prefix=None,
         variable_mapper=TEST_NWM_VARIABLE_MAPPER,
-        timeseries_type="primary"
+        timeseries_type="primary",
+        drop_overlapping_assimilation_values=False
     )
 
     parquet_file = Path(tmpdir, "20231101T00.parquet")

diff --git a/tests/query/test_get_metrics_query.py b/tests/query/test_get_metrics_query.py
@@ -490,18 +490,20 @@ def test_ensemble_metrics(tmpdir):
     ).to_pandas()
 
     # check CRPS values
-    assert np.isclose(metrics_df.mean_crps_ensemble.values[0], 22.050798)
-    assert np.isclose(metrics_df.mean_crps_ensemble.values[1], 22.383705)
     assert np.isclose(
-        metrics_df.mean_crps_ensemble_skill_score.values[0], 0.20777595
+        metrics_df.mean_crps_ensemble.values[1], 22.3837, atol=1e-04
+        )
+    assert np.isclose(
+        metrics_df.mean_crps_ensemble_skill_score.values[0], 0.2078, atol=1e-04
     )
     assert np.isnan(metrics_df.mean_crps_ensemble_skill_score.values[2])
 
     # check Brier Score values
-    assert np.isclose(metrics_df.mean_brier_score.values[0], 0.18979715)
-    assert np.isclose(metrics_df.mean_brier_score.values[1], 0.19405437)
     assert np.isclose(
-        metrics_df.mean_brier_score_skill_score.values[0], 0.26453602
+        metrics_df.mean_brier_score.values[0], 0.1898, atol=1e-04
+    )
+    assert np.isclose(
+        metrics_df.mean_brier_score_skill_score.values[0], 0.2645, atol=1e-04
     )
     assert np.isnan(
         metrics_df.mean_brier_score_skill_score.values[2]