From 466358c7d95ac547b1062c62e7c36e9b38a8c180 Mon Sep 17 00:00:00 2001
From: mgorecki <mila.gorecki@tuebingen.mpg.de>
Date: Sat, 8 Feb 2025 13:22:26 +0100
Subject: [PATCH 1/6] add class balancing to sampling

---
 folktexts/dataset.py | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/folktexts/dataset.py b/folktexts/dataset.py
index 081faa1..fe64f95 100755
--- a/folktexts/dataset.py
+++ b/folktexts/dataset.py
@@ -288,6 +288,7 @@ def sample_n_train_examples(
         self,
         n: int,
         reuse_examples: bool = False,
+        class_balancing: bool = False,
     ) -> tuple[pd.DataFrame, pd.Series]:
         """Return a set of samples from the training set.
 
@@ -304,11 +305,36 @@ def sample_n_train_examples(
         X, y : tuple[pd.DataFrame, pd.Series]
             The features and target data for the sampled examples.
         """
-        # TODO: make sure examples are class-balanced?
-        if reuse_examples:
-            example_indices = self._train_indices[:n]
-        else:
-            example_indices = self._rng.choice(self._train_indices, size=n, replace=False)
+        if class_balancing:
+
+            train_labels = self.get_target_data().iloc[self._train_indices]
+            unique_labels, counts = np.unique(train_labels, return_counts=True)
+            
+
+            per_label_n = n // len(unique_labels)
+            remaining = n % len(unique_labels) # distribute extra samples
+
+            if min(counts) < per_label_n:  
+                logging.error(f'Labels are very imbalanced: Attempting to sample {per_label_n}, 
+                              but minimal group size is {min(counts)}.')
+
+            example_indices = []
+            for i, label in enumerate(unique_labels):
+                class_indices = self._train_indices[train_labels == label]
+
+                if reuse_examples:
+                    selected = class_indices[:per_label_n + int(i < remaining)]
+                else: 
+                    selected = self._rng.choice(class_indices, size=per_label_n + int(i < remaining), replace=False)
+                example_indices.extend(selected)
+
+            # shuffle indices to ensure classes are mixed
+            example_indices = self._rng.permutation(example_indices)
+        else: 
+            if reuse_examples:
+                example_indices = self._train_indices[:n]
+            else:
+                example_indices = self._rng.choice(self._train_indices, size=n, replace=False)
 
         return (
             self.data.iloc[example_indices][self.task.features],

From 04b4d03f5f8833dae392ca7e85b2b3789485238b Mon Sep 17 00:00:00 2001
From: mgorecki <mila.gorecki@tuebingen.mpg.de>
Date: Mon, 10 Feb 2025 09:51:31 +0100
Subject: [PATCH 2/6] add class balancing CLI flag and few-shot prompt parameter

---
 folktexts/cli/run_acs_benchmark.py | 7 +++++++
 folktexts/dataset.py               | 3 +--
 folktexts/prompting.py             | 5 ++++-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/folktexts/cli/run_acs_benchmark.py b/folktexts/cli/run_acs_benchmark.py
index 13fee93..8e5cdf5 100755
--- a/folktexts/cli/run_acs_benchmark.py
+++ b/folktexts/cli/run_acs_benchmark.py
@@ -75,6 +75,13 @@ def list_of_strings(arg):
         default=False,
     )
 
+    parser.add_argument(
+        "--balance-few-shot-examples",
+        help="[bool] Whether to sample evenly from all classes in few-shot prompting",
+        action="store_true",
+        default=False,
+    )
+
     # Optionally, receive a list of features to use (subset of original list)
     parser.add_argument(
         "--use-feature-subset",
diff --git a/folktexts/dataset.py b/folktexts/dataset.py
index fe64f95..e81e69d 100755
--- a/folktexts/dataset.py
+++ b/folktexts/dataset.py
@@ -315,8 +315,7 @@ def sample_n_train_examples(
             remaining = n % len(unique_labels) # distribute extra samples
 
             if min(counts) < per_label_n:  
-                logging.error(f'Labels are very imbalanced: Attempting to sample {per_label_n}, 
-                              but minimal group size is {min(counts)}.')
+                logging.error(f"Labels are very imbalanced: Attempting to sample {per_label_n}, but minimal group size is {min(counts)}.")
 
             example_indices = []
             for i, label in enumerate(unique_labels):
diff --git a/folktexts/prompting.py b/folktexts/prompting.py
index 824c8dc..6eca9e0 100644
--- a/folktexts/prompting.py
+++ b/folktexts/prompting.py
@@ -64,6 +64,7 @@ def encode_row_prompt_few_shot(
     n_shots: int,
     question: QAInterface = None,
     reuse_examples: bool = False,
+    class_balancing: bool = False,
     custom_prompt_prefix: str = None,
 ) -> str:
     """Encode a question regarding a given row using few-shot prompting.
@@ -87,7 +88,9 @@ def encode_row_prompt_few_shot(
         The encoded few-shot prompt.
     """
     # Take `n_shots` random samples from the train set
-    X_examples, y_examples = dataset.sample_n_train_examples(n_shots, reuse_examples=reuse_examples)
+    X_examples, y_examples = dataset.sample_n_train_examples(n_shots, 
+                                                             reuse_examples=reuse_examples,
+                                                             class_balancing = class_balancing)
 
     # Start with task description
     prompt = ACS_FEW_SHOT_TASK_DESCRIPTION + "\n"

From 115ca49cda57b6c7dea22924c8016fc90a0a50ef Mon Sep 17 00:00:00 2001
From: mgorecki <mila.gorecki@tuebingen.mpg.de>
Date: Tue, 11 Feb 2025 10:33:36 +0100
Subject: [PATCH 3/6] add class balancing to benchmark config

---
 folktexts/benchmark.py             | 5 +++++
 folktexts/cli/run_acs_benchmark.py | 1 +
 2 files changed, 6 insertions(+)

diff --git a/folktexts/benchmark.py b/folktexts/benchmark.py
index 833dce9..50ba08c 100755
--- a/folktexts/benchmark.py
+++ b/folktexts/benchmark.py
@@ -42,6 +42,9 @@ class BenchmarkConfig:
     reuse_few_shot_examples : bool, optional
         Whether to reuse the same samples for few-shot prompting (or sample new
         ones every time), by default False.
+    balance_few_shot_examples : bool, optional
+        Whether to balance the samples for few-shot prompting with respect to 
+        their labels, by default False.
     batch_size : int | None, optional
         The batch size to use for inference.
     context_size : int | None, optional
@@ -62,6 +65,7 @@ class BenchmarkConfig:
     numeric_risk_prompting: bool = False
     few_shot: int | None = None
     reuse_few_shot_examples: bool = False
+    balance_few_shot_examples: bool = False
     batch_size: int | None = None
     context_size: int | None = None
     correct_order_bias: bool = True
@@ -540,6 +544,7 @@ def make_benchmark(
                 n_shots=config.few_shot,
                 dataset=dataset,
                 reuse_examples=config.reuse_few_shot_examples,
+                class_balancing=config.balance_few_shot_examples,
             )
 
         else:
diff --git a/folktexts/cli/run_acs_benchmark.py b/folktexts/cli/run_acs_benchmark.py
index 8e5cdf5..e43803f 100755
--- a/folktexts/cli/run_acs_benchmark.py
+++ b/folktexts/cli/run_acs_benchmark.py
@@ -154,6 +154,7 @@ def main():
         few_shot=args.few_shot,
         numeric_risk_prompting=args.numeric_risk_prompting,
         reuse_few_shot_examples=args.reuse_few_shot_examples,
+        balance_few_shot_examples=args.balance_few_shot_examples,
         batch_size=args.batch_size,
         context_size=args.context_size,
         correct_order_bias=not args.dont_correct_order_bias,

From 55b8b7a08540c3d8d080f2b81e9833051a6f7cdb Mon Sep 17 00:00:00 2001
From: mgorecki <mila.gorecki@tuebingen.mpg.de>
Date: Thu, 13 Feb 2025 14:02:57 +0100
Subject: [PATCH 4/6] fix order of few-shot examples

---
 folktexts/dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/folktexts/dataset.py b/folktexts/dataset.py
index e81e69d..d65bcca 100755
--- a/folktexts/dataset.py
+++ b/folktexts/dataset.py
@@ -327,8 +327,8 @@ def sample_n_train_examples(
                     selected = self._rng.choice(class_indices, size=per_label_n + int(i < remaining), replace=False)
                 example_indices.extend(selected)
 
-            # shuffle indices to ensure classes are mixed
-            example_indices = self._rng.permutation(example_indices)
+            # shuffle indices using seed to ensure classes are mixed
+            example_indices = np.random.default_rng(self._seed).permutation(example_indices)
         else: 
             if reuse_examples:
                 example_indices = self._train_indices[:n]

From be9ea7299c77751f092a73f69bdf0a9702cbb0c2 Mon Sep 17 00:00:00 2001
From: mgorecki <mila.gorecki@tuebingen.mpg.de>
Date: Thu, 13 Feb 2025 15:32:23 +0100
Subject: [PATCH 5/6] Revert "fix order of few-shot examples", use internal rng

This reverts commit 55b8b7a08540c3d8d080f2b81e9833051a6f7cdb.
---
 folktexts/dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/folktexts/dataset.py b/folktexts/dataset.py
index d65bcca..e81e69d 100755
--- a/folktexts/dataset.py
+++ b/folktexts/dataset.py
@@ -327,8 +327,8 @@ def sample_n_train_examples(
                     selected = self._rng.choice(class_indices, size=per_label_n + int(i < remaining), replace=False)
                 example_indices.extend(selected)
 
-            # shuffle indices using seed to ensure classes are mixed
-            example_indices = np.random.default_rng(self._seed).permutation(example_indices)
+            # shuffle indices to ensure classes are mixed
+            example_indices = self._rng.permutation(example_indices)
         else: 
             if reuse_examples:
                 example_indices = self._train_indices[:n]

From 5156f1f378a766bc4d65cdf51608816a671d6cb4 Mon Sep 17 00:00:00 2001
From: AndreFCruz <andrecruz97@gmail.com>
Date: Thu, 13 Feb 2025 15:41:07 -0500
Subject: [PATCH 6/6] fixing minor flake8 linter warnings

---
 folktexts/benchmark.py  |  2 +-
 folktexts/dataset.py    | 14 ++++++++------
 folktexts/evaluation.py |  2 +-
 folktexts/prompting.py  |  8 +++++---
 4 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/folktexts/benchmark.py b/folktexts/benchmark.py
index 50ba08c..86e9304 100755
--- a/folktexts/benchmark.py
+++ b/folktexts/benchmark.py
@@ -43,7 +43,7 @@ class BenchmarkConfig:
         Whether to reuse the same samples for few-shot prompting (or sample new
         ones every time), by default False.
     balance_few_shot_examples : bool, optional
-        Whether to balance the samples for few-shot prompting with respect to 
+        Whether to balance the samples for few-shot prompting with respect to
         their labels, by default False.
     batch_size : int | None, optional
         The batch size to use for inference.
diff --git a/folktexts/dataset.py b/folktexts/dataset.py
index e81e69d..c2548e8 100755
--- a/folktexts/dataset.py
+++ b/folktexts/dataset.py
@@ -309,13 +309,15 @@ def sample_n_train_examples(
 
             train_labels = self.get_target_data().iloc[self._train_indices]
             unique_labels, counts = np.unique(train_labels, return_counts=True)
-            
 
+            # Calculate number of samples to sample per label
             per_label_n = n // len(unique_labels)
-            remaining = n % len(unique_labels) # distribute extra samples
+            remaining = n % len(unique_labels)  # distribute extra samples
 
-            if min(counts) < per_label_n:  
-                logging.error(f"Labels are very imbalanced: Attempting to sample {per_label_n}, but minimal group size is {min(counts)}.")
+            if min(counts) < per_label_n:
+                logging.error(
+                    f"Labels are very imbalanced: Attempting to sample {per_label_n}, "
+                    f"but minimal group size is {min(counts)}.")
 
             example_indices = []
             for i, label in enumerate(unique_labels):
@@ -323,13 +325,13 @@ def sample_n_train_examples(
 
                 if reuse_examples:
                     selected = class_indices[:per_label_n + int(i < remaining)]
-                else: 
+                else:
                     selected = self._rng.choice(class_indices, size=per_label_n + int(i < remaining), replace=False)
                 example_indices.extend(selected)
 
             # shuffle indices to ensure classes are mixed
             example_indices = self._rng.permutation(example_indices)
-        else: 
+        else:
             if reuse_examples:
                 example_indices = self._train_indices[:n]
             else:
diff --git a/folktexts/evaluation.py b/folktexts/evaluation.py
index 1abb088..323f01d 100644
--- a/folktexts/evaluation.py
+++ b/folktexts/evaluation.py
@@ -9,7 +9,7 @@
 
 import logging
 import statistics
-from typing import Callable, Optional
+from typing import Callable
 
 import numpy as np
 from netcal.metrics import ECE
diff --git a/folktexts/prompting.py b/folktexts/prompting.py
index 6eca9e0..80960f5 100644
--- a/folktexts/prompting.py
+++ b/folktexts/prompting.py
@@ -88,9 +88,11 @@ def encode_row_prompt_few_shot(
         The encoded few-shot prompt.
     """
     # Take `n_shots` random samples from the train set
-    X_examples, y_examples = dataset.sample_n_train_examples(n_shots, 
-                                                             reuse_examples=reuse_examples,
-                                                             class_balancing = class_balancing)
+    X_examples, y_examples = dataset.sample_n_train_examples(
+        n_shots,
+        reuse_examples=reuse_examples,
+        class_balancing=class_balancing,
+    )
 
     # Start with task description
     prompt = ACS_FEW_SHOT_TASK_DESCRIPTION + "\n"