From d3687955360075cde5af6f9a14f35bf94fb41f6f Mon Sep 17 00:00:00 2001 From: Robin Message Date: Thu, 28 Sep 2023 11:54:47 +0100 Subject: [PATCH] Implement pooled_cov and scaling as per matchit R package --- methods/matching/find_pairs.py | 49 +++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/methods/matching/find_pairs.py b/methods/matching/find_pairs.py index 17740f9..f0ccdb8 100644 --- a/methods/matching/find_pairs.py +++ b/methods/matching/find_pairs.py @@ -15,6 +15,15 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s") +# Implement similar logic to the R matchit package +# https://github.com/kosukeimai/MatchIt/blob/7b7b48af995ad1d2ab83a72bad9a689bbb9e8151/R/aux_functions.R#L550 +def pooled_scaled_cov(treatment, control): + treatment -= np.mean(treatment, axis=0) + control -= np.mean(control, axis=0) + combined_scaled = np.concatenate([treatment, control]) + size = len(combined_scaled) + return np.cov(combined_scaled, rowvar=False) * (size-1) / (size-2) + def find_match_iteration( k_parquet_filename: str, s_parquet_filename: str, @@ -52,10 +61,36 @@ def find_match_iteration( # As well as all the LUC columns for later use luc_columns = [x for x in s_set.columns if x.startswith('luc')] - s_subset_for_cov = s_subset[['elevation', 'slope', 'access', \ - 'cpc0_u', 'cpc0_d', 'cpc5_u', 'cpc5_d', 'cpc10_u', 'cpc10_d']] - covarience = np.cov(s_subset_for_cov, rowvar=False) - invconv = np.linalg.inv(covarience) + distance_columns = [ + "elevation", "slope", "access", + "cpc0_u", "cpc0_d", + "cpc5_u", "cpc5_d", + "cpc10_u", "cpc10_d" + ] + + s_subset_for_cov = s_subset[distance_columns] + k_subset_for_cov = k_subset[distance_columns] + + logging.info("Scaling the columns...") + + # Scale the columns as per matchit + # https://github.com/kosukeimai/MatchIt/blob/7b7b48af995ad1d2ab83a72bad9a689bbb9e8151/R/dist_functions.R#L240 + combined = np.concatenate([s_subset_for_cov, k_subset_for_cov]) + means = np.mean(combined, axis=0) + stds = np.std(combined - means, axis=0) + s_subset_for_cov -= means + s_subset_for_cov /= stds + k_subset_for_cov -= means + k_subset_for_cov /= stds + + s_subset[distance_columns] = s_subset_for_cov + k_subset[distance_columns] = k_subset_for_cov + + logging.info("Calculating pooled covariance...") + covariance = pooled_scaled_cov(k_subset_for_cov, s_subset_for_cov) + + logging.info("Calculating inverse covariance...") + invconv = np.linalg.inv(covariance).astype(np.float32) for _, k_row in k_subset.iterrows(): # Methodology 6.5.7: find the matches. @@ -85,12 +120,6 @@ def find_match_iteration( # * slope # * accessibility # * coarsened proportional coverage - distance_columns = [ - "elevation", "slope", "access", - "cpc0_u", "cpc0_d", - "cpc5_u", "cpc5_d", - "cpc10_u", "cpc10_d" - ] k_soft = np.array(k_row[distance_columns].to_list()) just_cols = filtered_s[distance_columns].to_numpy()