@@ -17,7 +17,7 @@ from _tool_helpers import (
1717 print_error ,
1818 print_warning ,
1919)
20- from senzing import SzEngineFlags , SzError
20+ from senzing import SzError
2121from senzing_core import SzAbstractFactoryCore
2222
2323MODULE_NAME = pathlib .Path (__file__ ).stem
@@ -66,7 +66,6 @@ VALID_FLAGS = [
6666
6767def csv_fetch_next (handle , csv_header = None ):
6868 """Fetch next for CSV output"""
69-
7069 try :
7170 export_record = sz_engine .fetch_next (handle )
7271 except SzError as err :
@@ -79,17 +78,15 @@ def csv_fetch_next(handle, csv_header=None):
7978 # Check data doesn't exceed the csv field limit
8079 if len (export_record ) > csv .field_size_limit ():
8180 csv .field_size_limit (int (len (export_record ) * 1.5 ))
82- print (
83- f" Increased CSV field limit size to: { csv .field_size_limit ()} " ,
84- )
81+ print (f"Increased CSV field limit size to: { csv .field_size_limit ()} " )
82+
8583 export_record_dict = next (csv .DictReader ([export_record ], fieldnames = csv_header )) if export_record else None
8684
8785 return export_record , export_record_dict
8886
8987
9088def json_fetch_next (handle ):
9189 """Fetch next for JSON output"""
92-
9390 try :
9491 export_record = sz_engine .fetch_next (handle )
9592 except SzError as err :
@@ -100,7 +97,6 @@ def json_fetch_next(handle):
10097
10198def do_stats_output (total_entity_count , start_time , batch_row_count ):
10299 """Print stats if output frequency interval and not disabled with -1. Reset batch row count if triggered"""
103-
104100 if args .outputFrequency != - 1 and total_entity_count % args .outputFrequency == 0 :
105101 time_now = datetime .now ().strftime ("%I:%M:%S %p" ).lower ()
106102 rows_per_sec = int (
@@ -121,7 +117,6 @@ def do_stats_output(total_entity_count, start_time, batch_row_count):
121117
122118def csv_export ():
123119 """Export data in CSV format"""
124-
125120 bad_count_inner = 0
126121 bad_count_outer = 0
127122 batch_row_count = 0
@@ -135,12 +130,7 @@ def csv_export():
135130
136131 # Create writer object and write the header row
137132 try :
138- writer = csv .DictWriter (
139- output_file ,
140- fieldnames = csv_header ,
141- dialect = csv .excel ,
142- quoting = csv .QUOTE_ALL ,
143- )
133+ writer = csv .DictWriter (output_file , fieldnames = csv_header , dialect = csv .excel , quoting = csv .QUOTE_ALL )
144134 writer .writeheader ()
145135 except csv .Error as err :
146136 print_error (f"Could not create CSV writer for output or write CSF header: { err } " , exit_ = True )
@@ -151,7 +141,6 @@ def csv_export():
151141 export_record , export_record_dict = csv_fetch_next (export_handle , csv_header )
152142
153143 while export_record :
154-
155144 row_list = []
156145 fetched_rec_count += 1
157146 batch_row_count += 1
@@ -168,7 +157,6 @@ def csv_export():
168157
169158 # Keep fetching all export rows for the current RES_ENT
170159 while export_record_dict and export_record_dict ["RESOLVED_ENTITY_ID" ] == resolved_entity_id :
171-
172160 # Bypass bad rows
173161 if "RECORD_ID" not in export_record_dict :
174162 print_error (f"RECORD_ID is missing at line: { fetched_rec_count } - { export_record .strip ()} " )
@@ -208,14 +196,12 @@ def csv_export():
208196
209197def json_export ():
210198 """Export data in JSON format"""
211-
212199 row_count = batch_row_count = 0
213200 start_time = time .time ()
214201
215202 export_record = json_fetch_next (export_handle )
216203
217204 while export_record :
218-
219205 row_count += 1
220206 batch_row_count += 1
221207
@@ -226,7 +212,6 @@ def json_export():
226212 return row_count , 0 , 1
227213
228214 start_time , batch_row_count = do_stats_output (row_count , start_time , batch_row_count )
229-
230215 export_record = json_fetch_next (export_handle )
231216
232217 return row_count , 0 , 0
@@ -257,7 +242,7 @@ if __name__ == "__main__":
257242 help = textwrap .dedent (
258243 """\
259244
260- Path and file name to send output to.
245+ Path and file name to write output to.
261246
262247 """
263248 ),
@@ -410,8 +395,7 @@ if __name__ == "__main__":
410395 print_warning (
411396 textwrap .dedent (
412397 f"""
413-
414- ***************************************************** WARNING ****************************************************
398+ **************************************************** WARNING *****************************************************
415399
416400 Using the --extendCSVRelates (-xcr) argument with CSV output format will result in excessive and repeated data for
417401 related entities. Very rarely, if ever, is this option required!
@@ -420,60 +404,52 @@ if __name__ == "__main__":
420404
421405 Review the help with { MODULE_NAME } --help
422406
423- ************************************************************* *****************************************************
407+ **************************************************** WARNING *****************************************************
424408 """
425409 ),
426410 )
427-
428411 time .sleep (10 )
429412
430413 print_warning (
431414 textwrap .dedent (
432415 f"""
433-
434- ******************************************************** WARNING *******************************************************
416+ ****************************************************** WARNING *****************************************************
435417
436418 { MODULE_NAME } isn't intended for exporting large numbers of entities and associated data source record information.
437- Beyond 100M+ data source records isn't suggested. For exporting overview entity and relationship data for
438- analytical purposes outside of Senzing please review the following article:
419+ Exporting this way does not scale, exporting more than a few million records isn't recommended!
439420
440- https://senzing.com/v4-replicating-to-data-warehouse/
421+ For better options see: https://senzing.com/v4-replicating-to-data-warehouse/
441422
442- ******************************************************************* *****************************************************
423+ ****************************************************** WARNING *****************************************************
443424 """
444425 ),
445426 )
446-
447427 time .sleep (5 )
448428
429+ # Check that an engine configuration can be located
430+ engine_config = get_engine_config (args .ini_file_name )
431+
432+ try :
433+ sz_factory = SzAbstractFactoryCore (MODULE_NAME , engine_config )
434+ sz_engine = sz_factory .create_engine ()
435+ except SzError as err :
436+ print_error (err , exit_ = True )
437+ sys .exit (1 )
438+
449439 # Some CSV exports can be large, especially with extended data. The limit is checked and increased in csv_fetch_next()
450440 csv .field_size_limit (300000 )
451441
452- # Fields to use with CSV output, list of fields to request data
453442 # For CSV these are unioned with the data returned by the flags to give final output
454- csvFields = [
443+ csv_fields = [
455444 "RESOLVED_ENTITY_ID" ,
456445 "RELATED_ENTITY_ID" ,
457446 "MATCH_LEVEL" ,
458447 "MATCH_KEY" ,
459448 "DATA_SOURCE" ,
460449 "RECORD_ID" ,
461450 ]
462- if args .extended :
463- csvFields .insert (2 , "RESOLVED_ENTITY_NAME" )
464- csvFields .insert (6 , "JSON_DATA" )
465451
466- # Check can locate an engine configuration
467- engine_config = get_engine_config (args .ini_file_name )
468-
469- try :
470- sz_factory = SzAbstractFactoryCore (MODULE_NAME , engine_config )
471- sz_engine = sz_factory .create_engine ()
472- except SzError as err :
473- print_error (err , exit_ = True )
474- sys .exit (1 )
475-
476- # Convert strings to upper and if integers supplied convert from string to int
452+ # Accept string and int flags
477453 flags = [int (flag ) if flag .isdigit () else flag .upper () for flag in args .flags ]
478454
479455 # Get only the string flags to check against accepted flags
@@ -484,27 +460,30 @@ if __name__ == "__main__":
484460 print (", " .join (invalid_string_flags ))
485461
486462 valid_flags = [flag for flag in flags if flag not in invalid_string_flags ]
463+
464+ if args .extended :
465+ csv_fields .insert (2 , "RESOLVED_ENTITY_NAME" )
466+ csv_fields .insert (6 , "JSON_DATA" )
467+ valid_flags .extend (["SZ_ENTITY_INCLUDE_ENTITY_NAME" , "SZ_ENTITY_INCLUDE_RECORD_JSON_DATA" ])
468+
487469 final_flags = combine_engine_flags (valid_flags )
488470
489471 # Initialize the export
490472 export_output = args .output_file
491473 if args .compressFile :
492474 export_output = f"{ args .output_file } .gz"
493475
494- # Open file for export output
495476 with open_file (export_output ) as output_file :
496- # Create CSV or JSON export handle to fetch from
497477 try :
498478 if args .outputFormat == "CSV" :
499- CSV_FIELDS_STR = ", " .join (csvFields )
479+ CSV_FIELDS_STR = ", " .join (csv_fields )
500480 export_handle = sz_engine .export_csv_entity_report (CSV_FIELDS_STR , final_flags )
501481 else :
502482 export_handle = sz_engine .export_json_entity_report (final_flags )
503483 except SzError as err :
504484 print_error (f"Could not initialize export: { err } " , exit_ = True )
505485
506486 export_start = time .time ()
507-
508487 row_count , bad_rec_count , exit_code = csv_export () if args .outputFormat == "CSV" else json_export ()
509488
510489 if exit_code :
0 commit comments