diff --git a/README.md b/README.md index 8392529..ef85f42 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,16 @@ See below for an exhaustive list of configuration fields: // format, either "csv" or "excel" "format": "csv", + // field delimiter (optional), defaults to "," + "delimiter": "|", + + // if true, gzip-decompress the file before reading it as a csv + "unzip": true, + + // if specified, override the default CSV quoting config + // More info: https://docs.python.org/3/library/csv.html#csv.QUOTE_ALL + "quoting": "QUOTE_NONE", + // if the files don't have a header row, you can specify the field names "field_names": ["id", "first_name", "last_name"], diff --git a/tap_s3_csv/config.py b/tap_s3_csv/config.py index 9becc52..5703470 100644 --- a/tap_s3_csv/config.py +++ b/tap_s3_csv/config.py @@ -13,6 +13,9 @@ Required('pattern'): str, Required('key_properties'): [str], Required('format'): Any('csv', 'excel'), + Optional('unzip'): bool, + Optional('delimiter'): str, + Optional('quoting'): Any('QUOTE_MINIMAL', 'QUOTE_ALL', 'QUOTE_NONNUMERIC', 'QUOTE_NONE'), Optional('search_prefix'): str, Optional('field_names'): [str], Optional('worksheet_name'): str, diff --git a/tap_s3_csv/csv_handler.py b/tap_s3_csv/csv_handler.py index 491165a..0aa1c61 100644 --- a/tap_s3_csv/csv_handler.py +++ b/tap_s3_csv/csv_handler.py @@ -1,6 +1,7 @@ import codecs import csv import re +import gzip def generator_wrapper(reader): @@ -25,17 +26,27 @@ def generator_wrapper(reader): def get_row_iterator(table_spec, file_handle): + if table_spec.get('unzip'): + raw_stream = gzip.GzipFile(fileobj=file_handle._raw_stream) + else: + raw_stream = file_handle._raw_stream + # we use a protected member of the s3 object, _raw_stream, here to create # a generator for data from the s3 file. 
# pylint: disable=protected-access file_stream = codecs.iterdecode( - file_handle._raw_stream, encoding='utf-8') + raw_stream, encoding='utf-8') field_names = None if 'field_names' in table_spec: field_names = table_spec['field_names'] - reader = csv.DictReader(file_stream, fieldnames=field_names) + delimiter = table_spec.get('delimiter', ',') + + quote_config = table_spec.get('quoting', 'QUOTE_MINIMAL') + quoting = getattr(csv, quote_config) + + reader = csv.DictReader(file_stream, quoting=quoting, delimiter=delimiter, fieldnames=field_names) return generator_wrapper(reader)