Counting rows for all feature files is highly inefficient because it loads them into memory first. A quick fix I made using a buffer:
```python
from pathlib import Path

import tqdm


def _fast_count_rows(file_paths: list[Path]) -> int:
    """Count rows by tallying newline bytes in fixed-size chunks."""
    total_lines = 0
    pbar = tqdm.tqdm(file_paths, desc="Counting rows")
    for file_path in pbar:
        pbar.set_postfix_str(f"{file_path.name}")
        with file_path.open("rb") as f:
            lines = 0
            buf_size = 1024 * 1024  # 1 MB buffer
            read_f = f.raw.read
            buf = read_f(buf_size)
            while buf:
                lines += buf.count(b"\n")
                buf = read_f(buf_size)
            total_lines += lines
    return total_lines
```
and in `model.create_vectorized_features()`:

```python
# ...
print("Preparing to vectorize raw features")
X_train_path = data_path / "X_train.dat"
y_train_path = data_path / "y_train.dat"
train_feature_paths = gather_feature_paths(data_path, "train")
# train_nrows = sum([1 for fp in train_feature_paths for _ in fp.open()])
train_nrows = _fast_count_rows(train_feature_paths) # monkey patch
X_test_path = data_path / "X_test.dat"
y_test_path = data_path / "y_test.dat"
test_feature_paths = gather_feature_paths(data_path, "test")
# test_nrows = sum([1 for fp in test_feature_paths for _ in fp.open()])
test_nrows = _fast_count_rows(test_feature_paths) # monkey patch
# ...
```
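For what it's worth, the speedup comes from scanning raw bytes for `b"\n"` rather than decoding and splitting every line. Here is a minimal sanity check of the patch (the sample file name and row contents below are made up for illustration):

```python
# Assumes _fast_count_rows from the snippet above is in scope.
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    sample = Path(tmp) / "train_features_0.jsonl"
    sample.write_text("".join(f'{{"row": {i}}}\n' for i in range(100_000)))

    naive = sum(1 for _ in sample.open())  # the old, slow count
    fast = _fast_count_rows([sample])      # the buffered count
    assert naive == fast == 100_000

    # Caveat: a file whose final line lacks a trailing newline is
    # undercounted by one, since only b"\n" bytes are tallied.
    sample.write_text("a\nb\nc")               # no trailing "\n"
    assert _fast_count_rows([sample]) == 2     # naive iteration gives 3
```

The trailing-newline caveat shouldn't matter if every feature file ends each row with `\n`, but it's worth keeping in mind before relying on the counts to size the memory-mapped arrays.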