3636# Suppress logging by default; enable for local testing
3737ENABLE_LOGGING = True
3838LOG_LEVEL = logging .DEBUG
39- LOG_PATH = "correlation_search_test2 .log"
39+ LOG_PATH = "correlation_search .log"
4040
4141
4242class SavedSearchKeys (StrEnum ):
@@ -904,6 +904,58 @@ def notable_in_risk_dm(self) -> bool:
904904 if isinstance (event , NotableEvent ):
905905 return True
906906 return False
907+
908+ def validate_risk_notable_events (self ) -> IntegrationTestResult | None :
909+ try :
910+ # Validate risk events
911+ if self .has_risk_analysis_action :
912+ self .logger .debug ("Checking for matching risk events" )
913+ if self .risk_event_exists ():
914+ # TODO (PEX-435): should this in the retry loop? or outside it?
915+ # -> I've observed there being a missing risk event (15/16) on
916+ # the first few tries, so this does help us check for true
917+ # positives; BUT, if we have lots of failing detections, this
918+ # will definitely add to the total wait time
919+ # -> certain types of failures (e.g. risk message, or any value
920+ # checking) should fail testing automatically
921+ # -> other types, like those based on counts of risk events,
922+ # should happen should fail more slowly as more events may be
923+ # produced
924+ self .validate_risk_events ()
925+ else :
926+ raise ValidationFailed (
927+ f"TEST FAILED: No matching risk event created for: { self .name } "
928+ )
929+ else :
930+ self .logger .debug (
931+ f"No risk action defined for '{ self .name } '"
932+ )
933+
934+ # Validate notable events
935+ if self .has_notable_action :
936+ self .logger .debug ("Checking for matching notable events" )
937+ # NOTE: because we check this last, if both fail, the error message about notables will
938+ # always be the last to be added and thus the one surfaced to the user
939+ if self .notable_event_exists ():
940+ # TODO (PEX-435): should this in the retry loop? or outside it?
941+ self .validate_notable_events ()
942+ pass
943+ else :
944+ raise ValidationFailed (
945+ f"TEST FAILED: No matching notable event created for: { self .name } "
946+ )
947+ else :
948+ self .logger .debug (
949+ f"No notable action defined for '{ self .name } '"
950+ )
951+
952+ return None
953+ except ValidationFailed as e :
954+ self .logger .error (f"Risk/notable validation failed: { e } " )
955+ return IntegrationTestResult (
956+ status = TestResultStatus .FAIL ,
957+ message = f"TEST FAILED: { e } "
958+ )
907959
908960 # NOTE: it would be more ideal to switch this to a system which gets the handle of the saved search job and polls
909961 # it for completion, but that seems more tricky
@@ -934,10 +986,6 @@ def test(
934986
935987 # keep track of time slept and number of attempts for exponential backoff (base 2)
936988 elapsed_sleep_time = 0
937- num_tries = 0
938-
939- # set the initial base sleep time
940- time_to_sleep = TimeoutConfig .BASE_SLEEP
941989
942990 try :
943991 # first make sure the indexes are currently empty and the detection is starting from a disabled state
@@ -970,115 +1018,19 @@ def test(
9701018 self .update_pbar (TestingStates .FORCE_RUN )
9711019 self .force_run ()
9721020
973- # # loop so long as the elapsed time is less than max_sleep
974- # while elapsed_sleep_time < max_sleep:
975- # # sleep so the detection job can finish
976- # self.logger.info(
977- # f"Waiting {time_to_sleep} for {self.name} so it can finish"
978- # )
979- # self.update_pbar(TestingStates.VALIDATING)
980- # # time.sleep(time_to_sleep)
981- # self.logger.info(
982- # f"Skipping sleeping time for testing purposes"
983- # )
984- # elapsed_sleep_time += time_to_sleep
985-
986- # self.logger.info(
987- # f"Validating detection (attempt #{num_tries + 1} - {elapsed_sleep_time} seconds elapsed of "
988- # f"{max_sleep} max)"
989- # )
990-
991- # # reset the result to None on each loop iteration
992- # result = None
993-
994- max_retries = 10
995- initial_wait = 2
996- max_wait = 60
997- max_total_wait = 300
1021+ max_retries = 3
9981022
9991023 current_turn = 1
1000- wait_time = initial_wait
1001- total_waited = 0
10021024
1003- while current_turn <= max_retries and total_waited < max_total_wait :
1025+ while current_turn <= max_retries :
10041026 current_turn += 1
10051027
10061028 self .logger .info (
10071029 f"Skipping sleeping time for testing purposes"
10081030 )
10091031
1010- if current_turn > 3 :
1011- time .sleep (wait_time )
1012- total_waited += wait_time
1013- self .logger .info (f"Waiting { wait_time } s before retry { current_turn } ..." )
1014-
1015- wait_time = min (wait_time * 2 , max_wait )
1016-
1017- # Rerun the search job
1018- job = self .dispatch ()
1019- self .logger .info (f"Force running detection '{ self .name } ' with job ID: { job .sid } " )
1020-
1021- time_to_execute = 0
1022-
1023- # Check if the job is finished
1024- while not job .is_done ():
1025- self .logger .info (f"Job { job .sid } is still running..." )
1026- time .sleep (1 )
1027- time_to_execute += 1
1028-
1029- self .logger .info (f"Job { job .sid } has finished running in { time_to_execute } seconds." )
1030-
10311032 # reset the result to None on each loop iteration
1032- result = None
1033- try :
1034- # Validate risk events
1035- if self .has_risk_analysis_action :
1036- self .logger .debug ("Checking for matching risk events" )
1037- if self .risk_event_exists ():
1038- # TODO (PEX-435): should this in the retry loop? or outside it?
1039- # -> I've observed there being a missing risk event (15/16) on
1040- # the first few tries, so this does help us check for true
1041- # positives; BUT, if we have lots of failing detections, this
1042- # will definitely add to the total wait time
1043- # -> certain types of failures (e.g. risk message, or any value
1044- # checking) should fail testing automatically
1045- # -> other types, like those based on counts of risk events,
1046- # should happen should fail more slowly as more events may be
1047- # produced
1048- self .validate_risk_events ()
1049- else :
1050- raise ValidationFailed (
1051- f"TEST FAILED: No matching risk event created for: { self .name } "
1052- )
1053- else :
1054- self .logger .debug (
1055- f"No risk action defined for '{ self .name } '"
1056- )
1057-
1058- # Validate notable events
1059- if self .has_notable_action :
1060- self .logger .debug ("Checking for matching notable events" )
1061- # NOTE: because we check this last, if both fail, the error message about notables will
1062- # always be the last to be added and thus the one surfaced to the user
1063- if self .notable_event_exists ():
1064- # TODO (PEX-435): should this in the retry loop? or outside it?
1065- self .validate_notable_events ()
1066- pass
1067- else :
1068- raise ValidationFailed (
1069- f"TEST FAILED: No matching notable event created for: { self .name } "
1070- )
1071- else :
1072- self .logger .debug (
1073- f"No notable action defined for '{ self .name } '"
1074- )
1075- except ValidationFailed as e :
1076- self .logger .error (f"Risk/notable validation failed: { e } " )
1077- result = IntegrationTestResult (
1078- status = TestResultStatus .FAIL ,
1079- message = f"TEST FAILED: { e } " ,
1080- wait_duration = elapsed_sleep_time ,
1081- )
1033+ result = self .validate_risk_notable_events ()
10821034
10831035 # if result is still None, then all checks passed and we can break the loop
10841036 if result is None :
@@ -1088,16 +1040,37 @@ def test(
10881040 wait_duration = elapsed_sleep_time ,
10891041 )
10901042 break
1043+
1044+ if result != None and result .status == TestResultStatus .FAIL :
1045+ elapsed = 0
10911046
1092- # # increment number of attempts to validate detection
1093- # num_tries += 1
1047+ for i in range (10 ):
1048+ time .sleep (10 )
1049+ start_time = time .time ()
10941050
1095- # # compute the next time to sleep for
1096- # time_to_sleep = 2**num_tries
1051+ job = self .dispatch ()
1052+ time_to_execute = 0
1053+ # Check if the job is finished
1054+ while not job .is_done ():
1055+ self .logger .info (f"Job { job .sid } is still running..." )
1056+ time .sleep (1 )
1057+ time_to_execute += 1
1058+ self .logger .info (f"Job { job .sid } has finished running in { time_to_execute } seconds." )
10971059
1098- # # if the computed time to sleep will exceed max_sleep, adjust appropriately
1099- # if (elapsed_sleep_time + time_to_sleep) > max_sleep:
1100- # time_to_sleep = max_sleep - elapsed_sleep_time
1060+ result = self .validate_risk_notable_events ()
1061+ end_time = time .time ()
1062+
1063+ elapsed = elapsed + end_time - start_time + 10
1064+
1065+ if result is None :
1066+ result = IntegrationTestResult (
1067+ status = TestResultStatus .PASS ,
1068+ message = f"TEST PASSED: Expected risk and/or notable events were created for: { self .name } "
1069+ )
1070+ self .logger .info (
1071+ f"Test passed in { i } th retry after { elapsed } seconds"
1072+ )
1073+ break
11011074
11021075 # TODO (PEX-436): should cleanup be in a finally block so it runs even on exception?
11031076 # cleanup the created events, disable the detection and return the result
0 commit comments