Skip to content

Commit 22def48

Browse files
committed
Update retry logic
1 parent af111b7 commit 22def48

File tree

1 file changed

+84
-111
lines changed

1 file changed

+84
-111
lines changed

contentctl/objects/correlation_search.py

Lines changed: 84 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
# Suppress logging by default; enable for local testing
3737
ENABLE_LOGGING = True
3838
LOG_LEVEL = logging.DEBUG
39-
LOG_PATH = "correlation_search_test2.log"
39+
LOG_PATH = "correlation_search.log"
4040

4141

4242
class SavedSearchKeys(StrEnum):
@@ -904,6 +904,58 @@ def notable_in_risk_dm(self) -> bool:
904904
if isinstance(event, NotableEvent):
905905
return True
906906
return False
907+
908+
def validate_risk_notable_events(self) -> IntegrationTestResult | None:
909+
try:
910+
# Validate risk events
911+
if self.has_risk_analysis_action:
912+
self.logger.debug("Checking for matching risk events")
913+
if self.risk_event_exists():
914+
# TODO (PEX-435): should this in the retry loop? or outside it?
915+
# -> I've observed there being a missing risk event (15/16) on
916+
# the first few tries, so this does help us check for true
917+
# positives; BUT, if we have lots of failing detections, this
918+
# will definitely add to the total wait time
919+
# -> certain types of failures (e.g. risk message, or any value
920+
# checking) should fail testing automatically
921+
# -> other types, like those based on counts of risk events,
922+
# should happen should fail more slowly as more events may be
923+
# produced
924+
self.validate_risk_events()
925+
else:
926+
raise ValidationFailed(
927+
f"TEST FAILED: No matching risk event created for: {self.name}"
928+
)
929+
else:
930+
self.logger.debug(
931+
f"No risk action defined for '{self.name}'"
932+
)
933+
934+
# Validate notable events
935+
if self.has_notable_action:
936+
self.logger.debug("Checking for matching notable events")
937+
# NOTE: because we check this last, if both fail, the error message about notables will
938+
# always be the last to be added and thus the one surfaced to the user
939+
if self.notable_event_exists():
940+
# TODO (PEX-435): should this in the retry loop? or outside it?
941+
self.validate_notable_events()
942+
pass
943+
else:
944+
raise ValidationFailed(
945+
f"TEST FAILED: No matching notable event created for: {self.name}"
946+
)
947+
else:
948+
self.logger.debug(
949+
f"No notable action defined for '{self.name}'"
950+
)
951+
952+
return None
953+
except ValidationFailed as e:
954+
self.logger.error(f"Risk/notable validation failed: {e}")
955+
return IntegrationTestResult(
956+
status=TestResultStatus.FAIL,
957+
message=f"TEST FAILED: {e}"
958+
)
907959

908960
# NOTE: it would be more ideal to switch this to a system which gets the handle of the saved search job and polls
909961
# it for completion, but that seems more tricky
@@ -934,10 +986,6 @@ def test(
934986

935987
# keep track of time slept and number of attempts for exponential backoff (base 2)
936988
elapsed_sleep_time = 0
937-
num_tries = 0
938-
939-
# set the initial base sleep time
940-
time_to_sleep = TimeoutConfig.BASE_SLEEP
941989

942990
try:
943991
# first make sure the indexes are currently empty and the detection is starting from a disabled state
@@ -970,115 +1018,19 @@ def test(
9701018
self.update_pbar(TestingStates.FORCE_RUN)
9711019
self.force_run()
9721020

973-
# # loop so long as the elapsed time is less than max_sleep
974-
# while elapsed_sleep_time < max_sleep:
975-
# # sleep so the detection job can finish
976-
# self.logger.info(
977-
# f"Waiting {time_to_sleep} for {self.name} so it can finish"
978-
# )
979-
# self.update_pbar(TestingStates.VALIDATING)
980-
# # time.sleep(time_to_sleep)
981-
# self.logger.info(
982-
# f"Skipping sleeping time for testing purposes"
983-
# )
984-
# elapsed_sleep_time += time_to_sleep
985-
986-
# self.logger.info(
987-
# f"Validating detection (attempt #{num_tries + 1} - {elapsed_sleep_time} seconds elapsed of "
988-
# f"{max_sleep} max)"
989-
# )
990-
991-
# # reset the result to None on each loop iteration
992-
# result = None
993-
994-
max_retries = 10
995-
initial_wait = 2
996-
max_wait = 60
997-
max_total_wait = 300
1021+
max_retries = 3
9981022

9991023
current_turn = 1
1000-
wait_time = initial_wait
1001-
total_waited = 0
10021024

1003-
while current_turn <= max_retries and total_waited < max_total_wait:
1025+
while current_turn <= max_retries:
10041026
current_turn += 1
10051027

10061028
self.logger.info(
10071029
f"Skipping sleeping time for testing purposes"
10081030
)
10091031

1010-
if current_turn > 3:
1011-
time.sleep(wait_time)
1012-
total_waited += wait_time
1013-
self.logger.info(f"Waiting {wait_time}s before retry {current_turn}...")
1014-
1015-
wait_time = min(wait_time * 2, max_wait)
1016-
1017-
# Rerun the search job
1018-
job = self.dispatch()
1019-
self.logger.info(f"Force running detection '{self.name}' with job ID: {job.sid}")
1020-
1021-
time_to_execute = 0
1022-
1023-
# Check if the job is finished
1024-
while not job.is_done():
1025-
self.logger.info(f"Job {job.sid} is still running...")
1026-
time.sleep(1)
1027-
time_to_execute += 1
1028-
1029-
self.logger.info(f"Job {job.sid} has finished running in {time_to_execute} seconds.")
1030-
10311032
# reset the result to None on each loop iteration
1032-
result = None
1033-
try:
1034-
# Validate risk events
1035-
if self.has_risk_analysis_action:
1036-
self.logger.debug("Checking for matching risk events")
1037-
if self.risk_event_exists():
1038-
# TODO (PEX-435): should this in the retry loop? or outside it?
1039-
# -> I've observed there being a missing risk event (15/16) on
1040-
# the first few tries, so this does help us check for true
1041-
# positives; BUT, if we have lots of failing detections, this
1042-
# will definitely add to the total wait time
1043-
# -> certain types of failures (e.g. risk message, or any value
1044-
# checking) should fail testing automatically
1045-
# -> other types, like those based on counts of risk events,
1046-
# should happen should fail more slowly as more events may be
1047-
# produced
1048-
self.validate_risk_events()
1049-
else:
1050-
raise ValidationFailed(
1051-
f"TEST FAILED: No matching risk event created for: {self.name}"
1052-
)
1053-
else:
1054-
self.logger.debug(
1055-
f"No risk action defined for '{self.name}'"
1056-
)
1057-
1058-
# Validate notable events
1059-
if self.has_notable_action:
1060-
self.logger.debug("Checking for matching notable events")
1061-
# NOTE: because we check this last, if both fail, the error message about notables will
1062-
# always be the last to be added and thus the one surfaced to the user
1063-
if self.notable_event_exists():
1064-
# TODO (PEX-435): should this in the retry loop? or outside it?
1065-
self.validate_notable_events()
1066-
pass
1067-
else:
1068-
raise ValidationFailed(
1069-
f"TEST FAILED: No matching notable event created for: {self.name}"
1070-
)
1071-
else:
1072-
self.logger.debug(
1073-
f"No notable action defined for '{self.name}'"
1074-
)
1075-
except ValidationFailed as e:
1076-
self.logger.error(f"Risk/notable validation failed: {e}")
1077-
result = IntegrationTestResult(
1078-
status=TestResultStatus.FAIL,
1079-
message=f"TEST FAILED: {e}",
1080-
wait_duration=elapsed_sleep_time,
1081-
)
1033+
result = self.validate_risk_notable_events()
10821034

10831035
# if result is still None, then all checks passed and we can break the loop
10841036
if result is None:
@@ -1088,16 +1040,37 @@ def test(
10881040
wait_duration=elapsed_sleep_time,
10891041
)
10901042
break
1043+
1044+
if result != None and result.status == TestResultStatus.FAIL:
1045+
elapsed = 0
10911046

1092-
# # increment number of attempts to validate detection
1093-
# num_tries += 1
1047+
for i in range(10):
1048+
time.sleep(10)
1049+
start_time = time.time()
10941050

1095-
# # compute the next time to sleep for
1096-
# time_to_sleep = 2**num_tries
1051+
job = self.dispatch()
1052+
time_to_execute = 0
1053+
# Check if the job is finished
1054+
while not job.is_done():
1055+
self.logger.info(f"Job {job.sid} is still running...")
1056+
time.sleep(1)
1057+
time_to_execute += 1
1058+
self.logger.info(f"Job {job.sid} has finished running in {time_to_execute} seconds.")
10971059

1098-
# # if the computed time to sleep will exceed max_sleep, adjust appropriately
1099-
# if (elapsed_sleep_time + time_to_sleep) > max_sleep:
1100-
# time_to_sleep = max_sleep - elapsed_sleep_time
1060+
result = self.validate_risk_notable_events()
1061+
end_time = time.time()
1062+
1063+
elapsed = elapsed + end_time - start_time + 10
1064+
1065+
if result is None:
1066+
result = IntegrationTestResult(
1067+
status=TestResultStatus.PASS,
1068+
message=f"TEST PASSED: Expected risk and/or notable events were created for: {self.name}"
1069+
)
1070+
self.logger.info(
1071+
f"Test passed in {i}th retry after {elapsed} seconds"
1072+
)
1073+
break
11011074

11021075
# TODO (PEX-436): should cleanup be in a finally block so it runs even on exception?
11031076
# cleanup the created events, disable the detection and return the result

0 commit comments

Comments
 (0)