From 466358c7d95ac547b1062c62e7c36e9b38a8c180 Mon Sep 17 00:00:00 2001 From: mgorecki Date: Sat, 8 Feb 2025 13:22:26 +0100 Subject: [PATCH 1/6] add class balancing to sampling --- folktexts/dataset.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/folktexts/dataset.py b/folktexts/dataset.py index 081faa1..fe64f95 100755 --- a/folktexts/dataset.py +++ b/folktexts/dataset.py @@ -288,6 +288,7 @@ def sample_n_train_examples( self, n: int, reuse_examples: bool = False, + class_balancing: bool = False, ) -> tuple[pd.DataFrame, pd.Series]: """Return a set of samples from the training set. @@ -304,11 +305,36 @@ def sample_n_train_examples( X, y : tuple[pd.DataFrame, pd.Series] The features and target data for the sampled examples. """ - # TODO: make sure examples are class-balanced? - if reuse_examples: - example_indices = self._train_indices[:n] - else: - example_indices = self._rng.choice(self._train_indices, size=n, replace=False) + if class_balancing: + + train_labels = self.get_target_data().iloc[self._train_indices] + unique_labels, counts = np.unique(train_labels, return_counts=True) + + + per_label_n = n // len(unique_labels) + remaining = n % len(unique_labels) # distribute extra samples + + if min(counts) < per_label_n: + logging.error(f'Labels are very imbalanced: Attempting to sample {per_label_n}, + but minimal group size is {min(counts)}.') + + example_indices = [] + for i, label in enumerate(unique_labels): + class_indices = self._train_indices[train_labels == label] + + if reuse_examples: + selected = class_indices[:per_label_n + int(i < remaining)] + else: + selected = self._rng.choice(class_indices, size=per_label_n + int(i < remaining), replace=False) + example_indices.extend(selected) + + # shuffle indices to ensure classes are mixed + example_indices = self._rng.permutation(example_indices) + else: + if reuse_examples: + example_indices = self._train_indices[:n] + else: + example_indices = self._rng.choice(self._train_indices, size=n, replace=False) return ( self.data.iloc[example_indices][self.task.features], From 04b4d03f5f8833dae392ca7e85b2b3789485238b Mon Sep 17 00:00:00 2001 From: mgorecki Date: Mon, 10 Feb 2025 09:51:31 +0100 Subject: [PATCH 2/6] add class balancing --- folktexts/cli/run_acs_benchmark.py | 7 +++++++ folktexts/dataset.py | 3 +-- folktexts/prompting.py | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/folktexts/cli/run_acs_benchmark.py b/folktexts/cli/run_acs_benchmark.py index 13fee93..8e5cdf5 100755 --- a/folktexts/cli/run_acs_benchmark.py +++ b/folktexts/cli/run_acs_benchmark.py @@ -75,6 +75,13 @@ def list_of_strings(arg): default=False, ) + parser.add_argument( + "--balance-few-shot-examples", + help="[bool] Whether to sample evenly from all classes in few-shot prompting", + action="store_true", + default=False, + ) + # Optionally, receive a list of features to use (subset of original list) parser.add_argument( "--use-feature-subset", diff --git a/folktexts/dataset.py b/folktexts/dataset.py index fe64f95..e81e69d 100755 --- a/folktexts/dataset.py +++ b/folktexts/dataset.py @@ -315,8 +315,7 @@ def sample_n_train_examples( remaining = n % len(unique_labels) # distribute extra samples if min(counts) < per_label_n: - logging.error(f'Labels are very imbalanced: Attempting to sample {per_label_n}, - but minimal group size is {min(counts)}.') + logging.error(f"Labels are very imbalanced: Attempting to sample {per_label_n}, but minimal group size is {min(counts)}.") example_indices = [] for i, label in enumerate(unique_labels): diff --git a/folktexts/prompting.py b/folktexts/prompting.py index 824c8dc..6eca9e0 100644 --- a/folktexts/prompting.py +++ b/folktexts/prompting.py @@ -64,6 +64,7 @@ def encode_row_prompt_few_shot( n_shots: int, question: QAInterface = None, reuse_examples: bool = False, + class_balancing: bool = False, custom_prompt_prefix: str = None, ) -> str: """Encode a question regarding a given row using few-shot prompting. @@ -87,7 +88,9 @@ def encode_row_prompt_few_shot( The encoded few-shot prompt. """ # Take `n_shots` random samples from the train set - X_examples, y_examples = dataset.sample_n_train_examples(n_shots, reuse_examples=reuse_examples) + X_examples, y_examples = dataset.sample_n_train_examples(n_shots, + reuse_examples=reuse_examples, + class_balancing = class_balancing) # Start with task description prompt = ACS_FEW_SHOT_TASK_DESCRIPTION + "\n" From 115ca49cda57b6c7dea22924c8016fc90a0a50ef Mon Sep 17 00:00:00 2001 From: mgorecki Date: Tue, 11 Feb 2025 10:33:36 +0100 Subject: [PATCH 3/6] add class balancing to benchmark config --- folktexts/benchmark.py | 5 +++++ folktexts/cli/run_acs_benchmark.py | 1 + 2 files changed, 6 insertions(+) diff --git a/folktexts/benchmark.py b/folktexts/benchmark.py index 833dce9..50ba08c 100755 --- a/folktexts/benchmark.py +++ b/folktexts/benchmark.py @@ -42,6 +42,9 @@ class BenchmarkConfig: reuse_few_shot_examples : bool, optional Whether to reuse the same samples for few-shot prompting (or sample new ones every time), by default False. + balance_few_shot_examples : bool, optional + Whether to balance the samples for few-shot prompting with respect to + their labels, by default False. batch_size : int | None, optional The batch size to use for inference. context_size : int | None, optional @@ -62,6 +65,7 @@ class BenchmarkConfig: numeric_risk_prompting: bool = False few_shot: int | None = None reuse_few_shot_examples: bool = False + balance_few_shot_examples: bool = False batch_size: int | None = None context_size: int | None = None correct_order_bias: bool = True @@ -540,6 +544,7 @@ def make_benchmark( n_shots=config.few_shot, dataset=dataset, reuse_examples=config.reuse_few_shot_examples, + class_balancing=config.balance_few_shot_examples, ) else: diff --git a/folktexts/cli/run_acs_benchmark.py b/folktexts/cli/run_acs_benchmark.py index 8e5cdf5..e43803f 100755 --- a/folktexts/cli/run_acs_benchmark.py +++ b/folktexts/cli/run_acs_benchmark.py @@ -154,6 +154,7 @@ def main(): few_shot=args.few_shot, numeric_risk_prompting=args.numeric_risk_prompting, reuse_few_shot_examples=args.reuse_few_shot_examples, + balance_few_shot_examples=args.balance_few_shot_examples, batch_size=args.batch_size, context_size=args.context_size, correct_order_bias=not args.dont_correct_order_bias, From 55b8b7a08540c3d8d080f2b81e9833051a6f7cdb Mon Sep 17 00:00:00 2001 From: mgorecki Date: Thu, 13 Feb 2025 14:02:57 +0100 Subject: [PATCH 4/6] fix order of few-shot examples --- folktexts/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/folktexts/dataset.py b/folktexts/dataset.py index e81e69d..d65bcca 100755 --- a/folktexts/dataset.py +++ b/folktexts/dataset.py @@ -327,8 +327,8 @@ def sample_n_train_examples( selected = self._rng.choice(class_indices, size=per_label_n + int(i < remaining), replace=False) example_indices.extend(selected) - # shuffle indices to ensure classes are mixed - example_indices = self._rng.permutation(example_indices) + # shuffle indices using seed to ensure classes are mixed + example_indices = np.random.default_rng(self._seed).permutation(example_indices) else: if reuse_examples: example_indices = self._train_indices[:n] From be9ea7299c77751f092a73f69bdf0a9702cbb0c2 Mon Sep 17 00:00:00 2001 From: mgorecki Date: Thu, 13 Feb 2025 15:32:23 +0100 Subject: [PATCH 5/6] Revert "fix order of few-shot examples", use internal rng this reverts commit 55b8b7a08540c3d8d080f2b81e9833051a6f7cdb. --- folktexts/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/folktexts/dataset.py b/folktexts/dataset.py index d65bcca..e81e69d 100755 --- a/folktexts/dataset.py +++ b/folktexts/dataset.py @@ -327,8 +327,8 @@ def sample_n_train_examples( selected = self._rng.choice(class_indices, size=per_label_n + int(i < remaining), replace=False) example_indices.extend(selected) - # shuffle indices using seed to ensure classes are mixed - example_indices = np.random.default_rng(self._seed).permutation(example_indices) + # shuffle indices to ensure classes are mixed + example_indices = self._rng.permutation(example_indices) else: if reuse_examples: example_indices = self._train_indices[:n] From 5156f1f378a766bc4d65cdf51608816a671d6cb4 Mon Sep 17 00:00:00 2001 From: AndreFCruz Date: Thu, 13 Feb 2025 15:41:07 -0500 Subject: [PATCH 6/6] fixing minor flake8 linter warnings --- folktexts/benchmark.py | 2 +- folktexts/dataset.py | 14 ++++++++------ folktexts/evaluation.py | 2 +- folktexts/prompting.py | 8 +++++--- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/folktexts/benchmark.py b/folktexts/benchmark.py index 50ba08c..86e9304 100755 --- a/folktexts/benchmark.py +++ b/folktexts/benchmark.py @@ -43,7 +43,7 @@ class BenchmarkConfig: Whether to reuse the same samples for few-shot prompting (or sample new ones every time), by default False. balance_few_shot_examples : bool, optional - Whether to balance the samples for few-shot prompting with respect to + Whether to balance the samples for few-shot prompting with respect to their labels, by default False. batch_size : int | None, optional The batch size to use for inference. diff --git a/folktexts/dataset.py b/folktexts/dataset.py index e81e69d..c2548e8 100755 --- a/folktexts/dataset.py +++ b/folktexts/dataset.py @@ -309,13 +309,15 @@ def sample_n_train_examples( train_labels = self.get_target_data().iloc[self._train_indices] unique_labels, counts = np.unique(train_labels, return_counts=True) - + # Calculate number of samples to sample per label per_label_n = n // len(unique_labels) - remaining = n % len(unique_labels) # distribute extra samples + remaining = n % len(unique_labels) # distribute extra samples - if min(counts) < per_label_n: - logging.error(f"Labels are very imbalanced: Attempting to sample {per_label_n}, but minimal group size is {min(counts)}.") + if min(counts) < per_label_n: + logging.error( + f"Labels are very imbalanced: Attempting to sample {per_label_n}, " + f"but minimal group size is {min(counts)}.") example_indices = [] for i, label in enumerate(unique_labels): @@ -323,13 +325,13 @@ def sample_n_train_examples( if reuse_examples: selected = class_indices[:per_label_n + int(i < remaining)] - else: + else: selected = self._rng.choice(class_indices, size=per_label_n + int(i < remaining), replace=False) example_indices.extend(selected) # shuffle indices to ensure classes are mixed example_indices = self._rng.permutation(example_indices) - else: + else: if reuse_examples: example_indices = self._train_indices[:n] else: diff --git a/folktexts/evaluation.py b/folktexts/evaluation.py index 1abb088..323f01d 100644 --- a/folktexts/evaluation.py +++ b/folktexts/evaluation.py @@ -9,7 +9,7 @@ import logging import statistics -from typing import Callable, Optional +from typing import Callable import numpy as np from netcal.metrics import ECE diff --git a/folktexts/prompting.py b/folktexts/prompting.py index 6eca9e0..80960f5 100644 --- a/folktexts/prompting.py +++ b/folktexts/prompting.py @@ -88,9 +88,11 @@ def encode_row_prompt_few_shot( The encoded few-shot prompt. """ # Take `n_shots` random samples from the train set - X_examples, y_examples = dataset.sample_n_train_examples(n_shots, - reuse_examples=reuse_examples, - class_balancing = class_balancing) + X_examples, y_examples = dataset.sample_n_train_examples( + n_shots, + reuse_examples=reuse_examples, + class_balancing=class_balancing, + ) # Start with task description prompt = ACS_FEW_SHOT_TASK_DESCRIPTION + "\n"