-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathselect_samples.py
More file actions
106 lines (87 loc) · 3.82 KB
/
select_samples.py
File metadata and controls
106 lines (87 loc) · 3.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import os
import numpy as np
import argparse
import datetime
def set_seed(seed):
    """
    Seed NumPy's global random number generator and log the seed used.

    Args:
        seed: Value handed directly to ``np.random.seed``.
    """
    np.random.seed(seed)
    message = f"Random seed set to {seed}"
    print(message)
def select_samples(input_file, output_file, num_samples, seeds, layer_idx=-1):
    """
    Draw a fixed number of samples per seed from a saved array and save them.

    For every seed, ``num_samples`` distinct indices along axis 0 are drawn
    without replacement.  The per-seed selections are concatenated along
    axis 0 in seed order and written to ``output_file`` with ``np.save``.
    A plain-text summary of the run is written alongside it as
    ``<output_file without extension>_config.txt``.

    Each seed gets its own ``np.random.RandomState``.  This produces the same
    draws as seeding the global NumPy generator with that seed, but leaves
    the caller's global random state untouched.

    Args:
        input_file: Path to a ``.npy`` file holding a 4D array; the expected
            layout is (num_samples, pos/neg, num_layers, dim).
        output_file: Path of the ``.npy`` file to write.
        num_samples: Number of samples to select per seed; must lie between
            0 and the number of rows in the input (inclusive).
        seeds: Non-empty sequence of integer random seeds.
        layer_idx: Layer index.  NOTE: this value is only recorded in the
            config file; the data is not sliced by layer here.

    Raises:
        ValueError: If the input is not 4D, ``seeds`` is empty, or
            ``num_samples`` is outside ``[0, number of input rows]``.
    """
    print(f"Loading input file: {input_file}")
    data = np.load(input_file)
    print(f"Input data shape: {data.shape}")

    # Ensure correct data dimensionality
    if data.ndim != 4:
        raise ValueError(
            "Expected input data to be 4D (num_samples, pos/neg, num_layers, dim), "
            f"got {data.ndim}D"
        )

    total_samples = data.shape[0]

    # Fail early with a clear message instead of letting NumPy raise an
    # opaque error part-way through the run.  len() is used rather than
    # truthiness so that array-like seed containers are handled too.
    if len(seeds) == 0:
        raise ValueError("At least one seed is required")
    if not 0 <= num_samples <= total_samples:
        raise ValueError(
            f"num_samples must be between 0 and {total_samples} "
            f"(rows in input), got {num_samples}"
        )

    all_indices = np.arange(total_samples)

    # Collect selected samples for all seeds
    selected_samples = []
    for seed in seeds:
        print(f"\nSelecting samples with seed {seed}...")
        # A private generator keeps the selection reproducible without
        # clobbering the process-wide NumPy random state.
        rng = np.random.RandomState(seed)
        selected_indices = rng.choice(all_indices, num_samples, replace=False)
        selected_data = data[selected_indices]
        print(f"Selected samples shape (seed {seed}): {selected_data.shape}")
        selected_samples.append(selected_data)

    # Concatenate across seeds
    combined_samples = np.concatenate(selected_samples, axis=0)
    print(f"\nCombined samples shape: {combined_samples.shape}")

    # Save results
    print(f"\nSaving results to: {output_file}")
    np.save(output_file, combined_samples)

    # Save selection config next to the output file
    config_file = os.path.splitext(output_file)[0] + "_config.txt"
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    config_lines = [
        "Sample selection config\n",
        "=" * 50 + "\n\n",
        f"Input file: {input_file}\n",
        f"Output file: {output_file}\n",
        f"Num samples per seed: {num_samples}\n",
        f"Seeds: {seeds}\n",
        f"Layer index: {layer_idx}\n",
        f"Selection time: {timestamp}\n",
        f"\nOriginal data shape: {data.shape}\n",
        f"Selected data shape: {combined_samples.shape}\n",
    ]
    with open(config_file, "w", encoding="utf-8") as f:
        f.writelines(config_lines)
    print(f"Config saved to: {config_file}")
def main():
    """
    Command-line entry point: parse arguments and run the sample selection.

    Creates the output file's parent directory when the path has one, then
    delegates to ``select_samples`` with the parsed arguments.
    """
    parser = argparse.ArgumentParser(description='Select a fixed number of samples from training data')
    parser.add_argument('--input_file', type=str,
                        default='/nobackup2/taoleitian/neurips/embeddings/hh_rlhf/llama_instruct_10k/train_10k.npy',
                        help='Path to input file')
    parser.add_argument('--output_file', type=str,
                        default='/nobackup2/taoleitian/rm/vae_results/hh_rlhf/llama_sft_10k/seeds_samples/10k_4.npy',
                        help='Path to output file')
    parser.add_argument('--num_samples', type=int, default=1000,
                        help='Number of samples to select per seed')
    parser.add_argument('--seeds', type=int, nargs='+', default=[4],
                        help='List of random seeds')
    parser.add_argument('--layer_idx', type=int, default=-1,
                        help='Layer index to select, default is the last layer')
    args = parser.parse_args()

    # Create output directory.  os.path.dirname() returns "" for a bare
    # filename and os.makedirs("") raises FileNotFoundError, so only create
    # a directory when the path actually contains one.
    output_dir = os.path.dirname(args.output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Select samples
    select_samples(
        args.input_file,
        args.output_file,
        args.num_samples,
        args.seeds,
        args.layer_idx,
    )
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()