Skip to content

Commit 2b198cf

Browse files
authored
#256 - sz_export --extended argument isn't working correctly for JSON output (#257)
1 parent fdf3eb0 commit 2b198cf

File tree

3 files changed

+37
-52
lines changed

3 files changed

+37
-52
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning].
77

88
## [Unreleased]
99

10+
## [0.0.31] - 2025-09-11
11+
12+
### Fixed in 0.0.31
13+
14+
- Extended argument wasn't working for JSON output
15+
1016
## [0.0.30] - 2025-09-10
1117

1218
### Changed in 0.0.30

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = python-tools
3-
version = 0.0.28
3+
version = 0.0.31
44
author = senzing
55
author_email = support@senzing.com
66
description = Python Tools

sz_tools/sz_export

Lines changed: 30 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ from _tool_helpers import (
1717
print_error,
1818
print_warning,
1919
)
20-
from senzing import SzEngineFlags, SzError
20+
from senzing import SzError
2121
from senzing_core import SzAbstractFactoryCore
2222

2323
MODULE_NAME = pathlib.Path(__file__).stem
@@ -66,7 +66,6 @@ VALID_FLAGS = [
6666

6767
def csv_fetch_next(handle, csv_header=None):
6868
"""Fetch next for CSV output"""
69-
7069
try:
7170
export_record = sz_engine.fetch_next(handle)
7271
except SzError as err:
@@ -79,17 +78,15 @@ def csv_fetch_next(handle, csv_header=None):
7978
# Check data doesn't exceed the csv field limit
8079
if len(export_record) > csv.field_size_limit():
8180
csv.field_size_limit(int(len(export_record) * 1.5))
82-
print(
83-
f" Increased CSV field limit size to: {csv.field_size_limit()}",
84-
)
81+
print(f"Increased CSV field limit size to: {csv.field_size_limit()}")
82+
8583
export_record_dict = next(csv.DictReader([export_record], fieldnames=csv_header)) if export_record else None
8684

8785
return export_record, export_record_dict
8886

8987

9088
def json_fetch_next(handle):
9189
"""Fetch next for JSON output"""
92-
9390
try:
9491
export_record = sz_engine.fetch_next(handle)
9592
except SzError as err:
@@ -100,7 +97,6 @@ def json_fetch_next(handle):
10097

10198
def do_stats_output(total_entity_count, start_time, batch_row_count):
10299
"""Print stats if output frequency interval and not disabled with -1. Reset batch row count if triggered"""
103-
104100
if args.outputFrequency != -1 and total_entity_count % args.outputFrequency == 0:
105101
time_now = datetime.now().strftime("%I:%M:%S %p").lower()
106102
rows_per_sec = int(
@@ -121,7 +117,6 @@ def do_stats_output(total_entity_count, start_time, batch_row_count):
121117

122118
def csv_export():
123119
"""Export data in CSV format"""
124-
125120
bad_count_inner = 0
126121
bad_count_outer = 0
127122
batch_row_count = 0
@@ -135,12 +130,7 @@ def csv_export():
135130

136131
# Create writer object and write the header row
137132
try:
138-
writer = csv.DictWriter(
139-
output_file,
140-
fieldnames=csv_header,
141-
dialect=csv.excel,
142-
quoting=csv.QUOTE_ALL,
143-
)
133+
writer = csv.DictWriter(output_file, fieldnames=csv_header, dialect=csv.excel, quoting=csv.QUOTE_ALL)
144134
writer.writeheader()
145135
except csv.Error as err:
146136
print_error(f"Could not create CSV writer for output or write CSV header: {err}", exit_=True)
@@ -151,7 +141,6 @@ def csv_export():
151141
export_record, export_record_dict = csv_fetch_next(export_handle, csv_header)
152142

153143
while export_record:
154-
155144
row_list = []
156145
fetched_rec_count += 1
157146
batch_row_count += 1
@@ -168,7 +157,6 @@ def csv_export():
168157

169158
# Keep fetching all export rows for the current RES_ENT
170159
while export_record_dict and export_record_dict["RESOLVED_ENTITY_ID"] == resolved_entity_id:
171-
172160
# Bypass bad rows
173161
if "RECORD_ID" not in export_record_dict:
174162
print_error(f"RECORD_ID is missing at line: {fetched_rec_count} - {export_record.strip()}")
@@ -208,14 +196,12 @@ def csv_export():
208196

209197
def json_export():
210198
"""Export data in JSON format"""
211-
212199
row_count = batch_row_count = 0
213200
start_time = time.time()
214201

215202
export_record = json_fetch_next(export_handle)
216203

217204
while export_record:
218-
219205
row_count += 1
220206
batch_row_count += 1
221207

@@ -226,7 +212,6 @@ def json_export():
226212
return row_count, 0, 1
227213

228214
start_time, batch_row_count = do_stats_output(row_count, start_time, batch_row_count)
229-
230215
export_record = json_fetch_next(export_handle)
231216

232217
return row_count, 0, 0
@@ -257,7 +242,7 @@ if __name__ == "__main__":
257242
help=textwrap.dedent(
258243
"""\
259244
260-
Path and file name to send output to.
245+
Path and file name to write output to.
261246
262247
"""
263248
),
@@ -410,8 +395,7 @@ if __name__ == "__main__":
410395
print_warning(
411396
textwrap.dedent(
412397
f"""
413-
414-
***************************************************** WARNING ****************************************************
398+
**************************************************** WARNING *****************************************************
415399
416400
Using the --extendCSVRelates (-xcr) argument with CSV output format will result in excessive and repeated data for
417401
related entities. Very rarely, if ever, is this option required!
@@ -420,60 +404,52 @@ if __name__ == "__main__":
420404
421405
Review the help with {MODULE_NAME} --help
422406
423-
******************************************************************************************************************
407+
**************************************************** WARNING *****************************************************
424408
"""
425409
),
426410
)
427-
428411
time.sleep(10)
429412

430413
print_warning(
431414
textwrap.dedent(
432415
f"""
433-
434-
******************************************************** WARNING *******************************************************
416+
****************************************************** WARNING *****************************************************
435417
436418
{MODULE_NAME} isn't intended for exporting large numbers of entities and associated data source record information.
437-
Beyond 100M+ data source records isn't suggested. For exporting overview entity and relationship data for
438-
analytical purposes outside of Senzing please review the following article:
419+
Exporting this way does not scale, exporting more than a few million records isn't recommended!
439420
440-
https://senzing.com/v4-replicating-to-data-warehouse/
421+
For better options see: https://senzing.com/v4-replicating-to-data-warehouse/
441422
442-
************************************************************************************************************************
423+
****************************************************** WARNING *****************************************************
443424
"""
444425
),
445426
)
446-
447427
time.sleep(5)
448428

429+
# Check can locate an engine configuration
430+
engine_config = get_engine_config(args.ini_file_name)
431+
432+
try:
433+
sz_factory = SzAbstractFactoryCore(MODULE_NAME, engine_config)
434+
sz_engine = sz_factory.create_engine()
435+
except SzError as err:
436+
print_error(err, exit_=True)
437+
sys.exit(1)
438+
449439
# Some CSV exports can be large especially with extended data. Is checked and increased in csv_fetch_next()
450440
csv.field_size_limit(300000)
451441

452-
# Fields to use with CSV output, list of fields to request data
453442
# For CSV these are unioned with the data returned by the flags to give final output
454-
csvFields = [
443+
csv_fields = [
455444
"RESOLVED_ENTITY_ID",
456445
"RELATED_ENTITY_ID",
457446
"MATCH_LEVEL",
458447
"MATCH_KEY",
459448
"DATA_SOURCE",
460449
"RECORD_ID",
461450
]
462-
if args.extended:
463-
csvFields.insert(2, "RESOLVED_ENTITY_NAME")
464-
csvFields.insert(6, "JSON_DATA")
465451

466-
# Check can locate an engine configuration
467-
engine_config = get_engine_config(args.ini_file_name)
468-
469-
try:
470-
sz_factory = SzAbstractFactoryCore(MODULE_NAME, engine_config)
471-
sz_engine = sz_factory.create_engine()
472-
except SzError as err:
473-
print_error(err, exit_=True)
474-
sys.exit(1)
475-
476-
# Convert strings to upper and if integers supplied convert from string to int
452+
# Accept string and int flags
477453
flags = [int(flag) if flag.isdigit() else flag.upper() for flag in args.flags]
478454

479455
# Get only the string flags to check against accepted flags
@@ -484,27 +460,30 @@ if __name__ == "__main__":
484460
print(", ".join(invalid_string_flags))
485461

486462
valid_flags = [flag for flag in flags if flag not in invalid_string_flags]
463+
464+
if args.extended:
465+
csv_fields.insert(2, "RESOLVED_ENTITY_NAME")
466+
csv_fields.insert(6, "JSON_DATA")
467+
valid_flags.extend(["SZ_ENTITY_INCLUDE_ENTITY_NAME", "SZ_ENTITY_INCLUDE_RECORD_JSON_DATA"])
468+
487469
final_flags = combine_engine_flags(valid_flags)
488470

489471
# Initialize the export
490472
export_output = args.output_file
491473
if args.compressFile:
492474
export_output = f"{args.output_file}.gz"
493475

494-
# Open file for export output
495476
with open_file(export_output) as output_file:
496-
# Create CSV or JSON export handle to fetch from
497477
try:
498478
if args.outputFormat == "CSV":
499-
CSV_FIELDS_STR = ", ".join(csvFields)
479+
CSV_FIELDS_STR = ", ".join(csv_fields)
500480
export_handle = sz_engine.export_csv_entity_report(CSV_FIELDS_STR, final_flags)
501481
else:
502482
export_handle = sz_engine.export_json_entity_report(final_flags)
503483
except SzError as err:
504484
print_error(f"Could not initialize export: {err}", exit_=True)
505485

506486
export_start = time.time()
507-
508487
row_count, bad_rec_count, exit_code = csv_export() if args.outputFormat == "CSV" else json_export()
509488

510489
if exit_code:

0 commit comments

Comments
 (0)