Skip to content

Too slow row counting for feature files #11

@h0wl34

Description

@h0wl34

Counting rows for all feature files is highly inefficient, as it loads them into memory first. Here is a quick fix I made that reads each file through a fixed-size buffer instead:

def _fast_count_rows(file_paths: list[Path]) -> int:
    """Return the total number of lines across all files in *file_paths*.

    Each file is opened unbuffered in binary mode and read in 1 MiB chunks;
    lines are counted by counting ``b"\\n"`` bytes, so no text decoding is
    done and no per-line objects are created. A non-empty file whose last
    byte is not a newline contributes one extra line, which matches what
    iterating over the file line by line would yield.

    Progress is reported through a ``tqdm.tqdm`` bar labelled
    "Counting rows" whose postfix shows the name of the current file.

    Args:
        file_paths: Paths of the files to scan.

    Returns:
        The sum of the line counts of all files (0 for an empty list).
    """
    total_lines = 0
    buf_size = 1024 * 1024  # 1 MiB per read
    pbar = tqdm.tqdm(file_paths, desc="Counting rows")
    for file_path in pbar:
        pbar.set_postfix_str(f"{file_path.name}")
        # buffering=0 returns the raw file object directly, so every read()
        # below is a single chunk-sized read with no extra buffer copy.
        with file_path.open("rb", buffering=0) as f:
            # Start as if the previous byte were a newline so that an empty
            # file does not get an extra line added after the loop.
            last_byte = b"\n"
            while buf := f.read(buf_size):
                total_lines += buf.count(b"\n")
                last_byte = buf[-1:]
            # Count a final line that is not terminated by a newline.
            if last_byte != b"\n":
                total_lines += 1
    return total_lines

and in model.create_vectorized_features():

# ...
# Excerpt from model.create_vectorized_features(): the row totals for the
# train and test feature files are obtained from _fast_count_rows().
print("Preparing to vectorize raw features")
X_train_path = data_path / "X_train.dat"
y_train_path = data_path / "y_train.dat"
train_feature_paths = gather_feature_paths(data_path, "train")
# Previous approach, kept for reference: iterate over every line of every file.
# train_nrows = sum([1 for fp in train_feature_paths for _ in fp.open()])
train_nrows = _fast_count_rows(train_feature_paths) # direct replacement for the count above

X_test_path = data_path / "X_test.dat"
y_test_path = data_path / "y_test.dat"
test_feature_paths = gather_feature_paths(data_path, "test")
# Previous approach, kept for reference: iterate over every line of every file.
# test_nrows = sum([1 for fp in test_feature_paths for _ in fp.open()])
test_nrows = _fast_count_rows(test_feature_paths) # direct replacement for the count above
# ...

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions