From 1b9ea5b956905d5af4e9a506491f4f1d977579c8 Mon Sep 17 00:00:00 2001 From: Yun Zheng Hu Date: Tue, 17 Jun 2025 15:51:21 +0200 Subject: [PATCH 1/3] Add -Cn/--csv-no-header option to rdump This allows you to dump the fields as CSV but without the row header. Also fixes issue that in some cases the selected fields were not propagated to the adapter. --- flow/record/adapter/csvfile.py | 9 +++++++-- flow/record/tools/rdump.py | 16 +++++++++++++--- tests/test_regression.py | 20 ++++++++++++++++++++ 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/flow/record/adapter/csvfile.py b/flow/record/adapter/csvfile.py index 82a25fb5..48e77f7e 100644 --- a/flow/record/adapter/csvfile.py +++ b/flow/record/adapter/csvfile.py @@ -17,11 +17,12 @@ __usage__ = """ Comma-separated values (CSV) adapter --- -Write usage: rdump -w csvfile://[PATH]?lineterminator=[TERMINATOR] +Write usage: rdump -w csvfile://[PATH]?lineterminator=[TERMINATOR]&header=[HEADER] Read usage: rdump csvfile://[PATH]?fields=[FIELDS] [PATH]: path to file. Leave empty or "-" to output to stdout Optional parameters: + [HEADER]: if set to false, it will not print the CSV header (default: true) [TERMINATOR]: line terminator, default is \\r\\n [FIELDS]: comma-separated list of CSV fields (in case of missing CSV header) """ @@ -34,6 +35,7 @@ def __init__( fields: str | list[str] | None = None, exclude: str | list[str] | None = None, lineterminator: str = "\r\n", + header: str = "true", **kwargs, ): self.fp = None @@ -52,13 +54,16 @@ def __init__( self.fields = self.fields.split(",") if isinstance(self.exclude, str): self.exclude = self.exclude.split(",") + self.header = header.lower() == "true" def write(self, r: Record) -> None: rdict = r._asdict(fields=self.fields, exclude=self.exclude) if not self.desc or self.desc != r._desc: self.desc = r._desc self.writer = csv.DictWriter(self.fp, rdict, lineterminator=self.lineterminator) - self.writer.writeheader() + if self.header: + # Write header only if it is requested + self.writer.writeheader() self.writer.writerow(rdict) def flush(self) -> None: diff --git a/flow/record/tools/rdump.py b/flow/record/tools/rdump.py index f9759355..7e30ae4c 100644 --- a/flow/record/tools/rdump.py +++ b/flow/record/tools/rdump.py @@ -107,7 +107,7 @@ def main(argv: list[str] | None = None) -> int: output.add_argument("--skip", metavar="COUNT", type=int, default=0, help="Skip the first COUNT records") output.add_argument("-w", "--writer", metavar="OUTPUT", default=None, help="Write records to output") output.add_argument( - "-m", "--mode", default=None, choices=("csv", "json", "jsonlines", "line", "line-verbose"), help="Output mode" + "-m", "--mode", default=None, choices=("csv", "csv-no-header", "json", "jsonlines", "line", "line-verbose"), help="Output mode" ) output.add_argument( "--split", metavar="COUNT", default=None, type=int, help="Write record files smaller than COUNT records" @@ -180,6 +180,15 @@ def main(argv: list[str] | None = None) -> int: default=argparse.SUPPRESS, help="Short for --mode=line-verbose", ) + aliases.add_argument( + "-Cn", + "--csv-no-header", + action="store_const", + const="csv-no-header", + dest="mode", + default=argparse.SUPPRESS, + help="Short for --mode=csv-no-header", + ) args = parser.parse_args(argv) @@ -198,6 +207,7 @@ def main(argv: list[str] | None = None) -> int: if not args.writer: mode_to_uri = { "csv": "csvfile://", + "csv-no-header": "csvfile://?header=false", "json": "jsonfile://?indent=2&descriptors=false", "jsonlines": "jsonfile://?descriptors=false", "line": "line://", @@ -210,7 +220,7 @@ def main(argv: list[str] | None = None) -> int: "format_spec": args.format, } query = urlencode({k: v for k, v in qparams.items() if v}) - uri += "&" if urlparse(uri).query else "?" + query + uri += f"&{query}" if urlparse(uri).query else f"?{query}" if args.split: if not args.writer: @@ -221,7 +231,7 @@ def main(argv: list[str] | None = None) -> int: query_dict = dict(parse_qsl(parsed.query)) query_dict.update({"count": args.split, "suffix-length": args.suffix_length}) query = urlencode(query_dict) - uri = parsed.scheme + "://" + parsed.netloc + parsed.path + "?" + query + uri = f"{parsed.scheme}://{parsed.netloc}{parsed.path}?{query}" record_field_rewriter = None if fields or fields_to_exclude or args.exec_expression: diff --git a/tests/test_regression.py b/tests/test_regression.py index c26296fe..520eae20 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -6,6 +6,7 @@ import pathlib import subprocess import sys +from pathlib import Path from datetime import datetime, timezone from io import BytesIO from typing import Callable @@ -691,5 +692,24 @@ def test_record_writer_default_stdout(capsysbinary: pytest.CaptureFixture) -> No assert stdout.startswith(b"\x00\x00\x00\x0f\xc4\rRECORDSTREAM\n") +def test_rdump_selected_fields(capsysbinary: pytest.CaptureFixture) -> None: + """Test rdump regression where selected fields was not propagated properly to adapter.""" + + # Pastebin record used for this test + example_records_json_path = Path(__file__).parent.parent / "examples" / "records.json" + + # rdump --fields key,title,syntax --csv + rdump.main([str(example_records_json_path), "--fields", "key,title,syntax", "--csv"]) + captured = capsysbinary.readouterr() + assert captured.err == b"" + assert captured.out == b"key,title,syntax\r\nQ42eWSaF,A sample pastebin record,text\r\n" + + # rdump --fields key,title,syntax --csv + rdump.main([str(example_records_json_path), "--fields", "key,title,syntax", "--csv-no-header"]) + captured = capsysbinary.readouterr() + assert captured.err == b"" + assert captured.out == b"Q42eWSaF,A sample pastebin record,text\r\n" + + if __name__ == "__main__": __import__("standalone_test").main(globals()) From 0d3684a2ad8f58d9b0dd0ecfe4c3ea7c40bbb7de Mon Sep 17 00:00:00 2001 From: Yun Zheng Hu Date: Wed, 18 Jun 2025 12:55:57 +0200 Subject: [PATCH 2/3] Introduce boolean_argument function Test included. --- flow/record/adapter/csvfile.py | 4 ++-- flow/record/adapter/elastic.py | 11 ++++++----- flow/record/adapter/jsonfile.py | 4 ++-- flow/record/adapter/splunk.py | 4 ++-- flow/record/tools/rdump.py | 6 +++++- flow/record/utils.py | 29 +++++++++++++++++++++++++++++ tests/test_record_adapter.py | 2 +- tests/test_regression.py | 2 +- tests/test_utils.py | 21 +++++++++++++++++++++ 9 files changed, 69 insertions(+), 14 deletions(-) create mode 100644 tests/test_utils.py diff --git a/flow/record/adapter/csvfile.py b/flow/record/adapter/csvfile.py index 48e77f7e..cba8442c 100644 --- a/flow/record/adapter/csvfile.py +++ b/flow/record/adapter/csvfile.py @@ -9,7 +9,7 @@ from flow.record.adapter import AbstractReader, AbstractWriter from flow.record.base import Record, normalize_fieldname from flow.record.selector import make_selector -from flow.record.utils import is_stdout +from flow.record.utils import boolean_argument, is_stdout if TYPE_CHECKING: from collections.abc import Iterator @@ -54,7 +54,7 @@ def __init__( self.fields = self.fields.split(",") if isinstance(self.exclude, str): self.exclude = self.exclude.split(",") - self.header = header.lower() == "true" + self.header = boolean_argument(header) def write(self, r: Record) -> None: rdict = r._asdict(fields=self.fields, exclude=self.exclude) diff --git a/flow/record/adapter/elastic.py b/flow/record/adapter/elastic.py index 6a74bf28..c47e0bc9 100644 --- a/flow/record/adapter/elastic.py +++ b/flow/record/adapter/elastic.py @@ -19,6 +19,7 @@ from flow.record.base import Record, RecordDescriptor from flow.record.fieldtypes import fieldtype_for_value from flow.record.jsonpacker import JsonRecordPacker +from flow.record.utils import boolean_argument if TYPE_CHECKING: from collections.abc import Iterator @@ -72,9 +73,9 @@ def __init__( self.index = index self.uri = uri - verify_certs = str(verify_certs).lower() in ("1", "true") - http_compress = str(http_compress).lower() in ("1", "true") - self.hash_record = str(hash_record).lower() in ("1", "true") + verify_certs = boolean_argument(verify_certs) + http_compress = boolean_argument(http_compress) + self.hash_record = boolean_argument(hash_record) queue_size = int(queue_size) if not uri.lower().startswith(("http://", "https://")): @@ -216,8 +217,8 @@ def __init__( self.index = index self.uri = uri self.selector = selector - verify_certs = str(verify_certs).lower() in ("1", "true") - http_compress = str(http_compress).lower() in ("1", "true") + verify_certs = boolean_argument(verify_certs) + http_compress = boolean_argument(http_compress) if not uri.lower().startswith(("http://", "https://")): uri = "http://" + uri diff --git a/flow/record/adapter/jsonfile.py b/flow/record/adapter/jsonfile.py index 783ce485..20586160 100644 --- a/flow/record/adapter/jsonfile.py +++ b/flow/record/adapter/jsonfile.py @@ -8,7 +8,7 @@ from flow.record.adapter import AbstractReader, AbstractWriter from flow.record.fieldtypes import fieldtype_for_value from flow.record.selector import make_selector -from flow.record.utils import is_stdout +from flow.record.utils import boolean_argument, is_stdout if TYPE_CHECKING: from collections.abc import Iterator @@ -33,7 +33,7 @@ class JsonfileWriter(AbstractWriter): def __init__( self, path: str | Path | BinaryIO, indent: str | int | None = None, descriptors: bool = True, **kwargs ): - self.descriptors = str(descriptors).lower() in ("true", "1") + self.descriptors = boolean_argument(descriptors) self.fp = record.open_path_or_stream(path, "w") if isinstance(indent, str): indent = int(indent) diff --git a/flow/record/adapter/splunk.py b/flow/record/adapter/splunk.py index 6b618c61..b2a6cccc 100644 --- a/flow/record/adapter/splunk.py +++ b/flow/record/adapter/splunk.py @@ -18,7 +18,7 @@ from flow.record.adapter import AbstractReader, AbstractWriter from flow.record.jsonpacker import JsonRecordPacker -from flow.record.utils import to_base64, to_bytes, to_str +from flow.record.utils import boolean_argument, to_base64, to_bytes, to_str if TYPE_CHECKING: from flow.record.base import Record @@ -218,7 +218,7 @@ def __init__( self.token = f"Splunk {self.token}" # Assume verify=True unless specified otherwise. - self.verify = str(ssl_verify).lower() not in ("0", "false") + self.verify = boolean_argument(ssl_verify) if not self.verify: log.warning("Certificate verification is disabled") diff --git a/flow/record/tools/rdump.py b/flow/record/tools/rdump.py index 7e30ae4c..b630e003 100644 --- a/flow/record/tools/rdump.py +++ b/flow/record/tools/rdump.py @@ -107,7 +107,11 @@ def main(argv: list[str] | None = None) -> int: output.add_argument("--skip", metavar="COUNT", type=int, default=0, help="Skip the first COUNT records") output.add_argument("-w", "--writer", metavar="OUTPUT", default=None, help="Write records to output") output.add_argument( - "-m", "--mode", default=None, choices=("csv", "csv-no-header", "json", "jsonlines", "line", "line-verbose"), help="Output mode" + "-m", + "--mode", + default=None, + choices=("csv", "csv-no-header", "json", "jsonlines", "line", "line-verbose"), + help="Output mode", ) output.add_argument( "--split", metavar="COUNT", default=None, type=int, help="Write record files smaller than COUNT records" diff --git a/flow/record/utils.py b/flow/record/utils.py index 57d28949..2d2fc1a2 100644 --- a/flow/record/utils.py +++ b/flow/record/utils.py @@ -117,3 +117,32 @@ def remove_handler(self, callback: Callable[..., None]) -> None: def __call__(self, *args, **kwargs) -> None: for h in self.handlers: h(*args, **kwargs) + + +def boolean_argument(value: str | bool | int) -> bool: + """Convert a string, boolean, or integer to a boolean value. + + This function interprets various string representations of boolean values, + such as "true", "false", "1", "0", "yes", "no". + It also accepts boolean and integer values directly. + + Arguments: + value: The value to convert. Can be a string, boolean, or integer. + + Returns: + bool: The converted boolean value. + + Raises: + ValueError: If the value cannot be interpreted as a boolean. + """ + if isinstance(value, bool): + return value + if isinstance(value, int): + return bool(value) + if isinstance(value, str): + value = value.lower() + if value in ("true", "1", "y", "yes", "on"): + return True + if value in ("false", "0", "n", "no", "off"): + return False + raise ValueError(f"Invalid boolean argument: {value}") diff --git a/tests/test_record_adapter.py b/tests/test_record_adapter.py index ed20bc96..8f821d73 100644 --- a/tests/test_record_adapter.py +++ b/tests/test_record_adapter.py @@ -242,7 +242,7 @@ def test_record_adapter_archive(tmp_path: Path) -> None: # defaults to always archive by /YEAR/MONTH/DAY/ dir structure outdir = tmp_path.joinpath(f"{dt:%Y/%m/%d}") - assert len(list(outdir.iterdir())) + assert list(outdir.iterdir()) # read the archived records and test filename and counts count2 = 0 diff --git a/tests/test_regression.py b/tests/test_regression.py index 520eae20..e39640b0 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -6,9 +6,9 @@ import pathlib import subprocess import sys -from pathlib import Path from datetime import datetime, timezone from io import BytesIO +from pathlib import Path from typing import Callable from unittest.mock import MagicMock, patch diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..789bc1aa --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,21 @@ +import pytest + +from flow.record.utils import boolean_argument + + +def test_boolean_argument() -> None: + assert boolean_argument("True") is True + assert boolean_argument("true") is True + assert boolean_argument("trUe") is True + assert boolean_argument("False") is False + assert boolean_argument("false") is False + assert boolean_argument("1") is True + assert boolean_argument("0") is False + assert boolean_argument("yes") is True + assert boolean_argument("no") is False + assert boolean_argument("y") is True + assert boolean_argument("n") is False + assert boolean_argument("on") is True + assert boolean_argument("off") is False + with pytest.raises(ValueError, match="Invalid boolean argument: .*"): + boolean_argument("maybe") From 0c2132bd2eb82a510e15a217795db646a3e68b9b Mon Sep 17 00:00:00 2001 From: Yun Zheng Hu Date: Fri, 20 Jun 2025 14:52:53 +0200 Subject: [PATCH 3/3] Update test_utils.py Co-authored-by: Erik Schamper <1254028+Schamper@users.noreply.github.com> --- tests/test_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index 789bc1aa..441ae8de 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -17,5 +17,9 @@ def test_boolean_argument() -> None: assert boolean_argument("n") is False assert boolean_argument("on") is True assert boolean_argument("off") is False + assert boolean_argument(True) is True + assert boolean_argument(False) is False + assert boolean_argument(1) is True + assert boolean_argument(0) is False with pytest.raises(ValueError, match="Invalid boolean argument: .*"): boolean_argument("maybe")