@@ -61,31 +61,135 @@ def ensure_data_dir():
6161 return data_dir
6262
6363
# Compiled once at import time so each call only has to run the searches.
# Patterns are matched against lower-cased text, hence no IGNORECASE flag.
#
# A leading ``\b`` keeps a trigger word from matching inside a longer word
# (e.g. "act as a" inside "contract as a priority"). A trailing ``\b`` is
# only placed after short alternatives ("a", "an", "if", "to", "all", ...)
# so that plural/inflected forms of the longer ones ("restrictions",
# "checks") are still caught by prefix. "admin" and "jailbreak" are left
# unanchored on purpose so compounds such as "sysadmin mode" still match.
_INJECTION_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'\bignore\s+(previous|above|prior|all\b)',
        r'\boverride\s+(policy|restriction|rule|permission|security)',
        r'\bsystem\s*prompt',
        r'\byou\s+are\s+(now|an?)\b',
        r'\bact\s+as\s+(if|an?)\b',
        r'\bpretend\s+(to|that|you)\b',
        r'\bbypass\s+(security|check|restriction|auth)',
        r'\bgrant\s+(me|access|permission)\s+(anyway|regardless|now)\b',
        r'\bdisregard\s+(policy|rule|restriction|previous)',
        r'admin\s+(mode|access|override)',
        r'\bsudo\b',
        r'jailbreak',
        r'\bdo\s+not\s+(check|verify|validate|restrict)',
        r'\bskip\s+(validation|verification|check)',
        r'\btrust\s+level\s*[:=]',
        r'\bscore\s*[:=]+\s*[\d.]',
    )
)


def detect_injection(justification: str) -> bool:
    """
    Detect prompt-injection and manipulation patterns in justifications.

    The text is lower-cased and searched against a fixed list of regular
    expressions covering instruction-override phrases ("ignore previous",
    "bypass security"), role-play prompts ("you are now", "act as"),
    privilege keywords ("sudo", "admin mode") and attempts to dictate the
    outcome ("trust level:", "score = 0.9").

    Args:
        justification: Free-text justification to inspect.

    Returns:
        True if any pattern matches, False otherwise. Empty or ``None``
        input contains nothing to match and returns False.
    """
    if not justification:
        return False

    text = justification.lower()
    return any(pattern.search(text) for pattern in _INJECTION_PATTERNS)
93+
94+
def score_justification(justification: str) -> float:
    """
    Score the quality of a justification with hardened validation.

    Safety checks run first and short-circuit the scoring:
    - Non-string input, injection patterns (via ``detect_injection``) and
      empty/whitespace-only text score 0.0.
    - Text longer than 500 characters, fewer than 3 words, a unique-word
      ratio below 0.4, or a scoring-keyword density above 0.5 score 0.1.

    Criteria (after safety checks):
    - Length: +0.15 above 20 characters, +0.10 more above 50
    - Contains a task-related keyword: +0.20
    - Contains a specificity keyword: +0.20
    - No test/debug keywords: +0.15
    - Structural coherence (a verb-like word and an object noun): +0.20

    Args:
        justification: Free-text justification supplied by the requester.

    Returns:
        A score between 0.0 and 1.0, rounded to two decimal places.
    """
    # Anything that is not text cannot justify a request; treat it like
    # an empty justification instead of failing on ``.strip()``.
    if not isinstance(justification, str):
        return 0.0

    # ----- Hard reject: injection patterns -----
    if detect_injection(justification):
        return 0.0

    # ----- Hard reject: empty or whitespace-only -----
    stripped = justification.strip()
    if not stripped:
        return 0.0

    # ----- Hard cap: excessively long justifications are suspicious -----
    max_length = 500
    if len(stripped) > max_length:
        return 0.1  # Suspiciously long; allow re-submission with concise text

    words = stripped.split()
    word_count = len(words)

    # ----- Hard reject: too few words to be meaningful -----
    if word_count < 3:
        return 0.1

    # ----- Repetition / padding detection -----
    # Surrounding punctuation is stripped before comparing so that
    # "report", "report," and "report!" count as the same word; otherwise
    # varying the punctuation is enough to defeat the padding check.
    edge_punctuation = '.,;:!?"\'()[]{}'
    unique_words = {word.lower().strip(edge_punctuation) for word in words}
    if len(unique_words) / word_count < 0.4:
        return 0.1  # More than 60% repeated words: likely padding

    # ----- Keyword-stuffing detection -----
    task_keywords = re.findall(
        r'\b(task|purpose|need|require|generate|analyze|create|process)\b',
        stripped, re.IGNORECASE,
    )
    specificity_keywords = re.findall(
        r'\b(specific|particular|exact|quarterly|annual|report|summary)\b',
        stripped, re.IGNORECASE,
    )
    total_matched = len(task_keywords) + len(specificity_keywords)
    if total_matched / word_count > 0.5:
        return 0.1  # More than half the words are scoring keywords: stuffing

    # ----- Scoring (defensive caps per category) -----
    score = 0.0

    # Length contribution (max 0.25)
    if len(stripped) > 20:
        score += 0.15
    if len(stripped) > 50:
        score += 0.10

    # Task keyword presence (flat 0.20 no matter how many matched)
    if task_keywords:
        score += 0.20

    # Specificity keyword presence (flat 0.20 no matter how many matched)
    if specificity_keywords:
        score += 0.20

    # No test/debug markers (max 0.15)
    if not re.search(r'\b(test|debug|try|experiment)\b', stripped, re.IGNORECASE):
        score += 0.15

    # Structural coherence (max 0.20): the text must mention both a
    # verb-like word and an object noun from the lists below.
    has_verb = bool(re.search(
        r'\b(is|are|was|were|need|needs|require|requires|must|should|will|'
        r'generate|generating|analyze|analyzing|create|creating|process|processing|'
        r'prepare|preparing|compile|compiling|review|reviewing|access|accessing|'
        r'retrieve|retrieving|export|exporting|send|sending|run|running)\b',
        stripped, re.IGNORECASE,
    ))
    has_noun_object = bool(re.search(
        r'\b(data|report|records|invoices?|orders?|customers?|accounts?|'
        r'transactions?|files?|emails?|results?|metrics?|statistics?|'
        r'analysis|documents?|exports?|payments?|entries|logs?|summaries)\b',
        stripped, re.IGNORECASE,
    ))
    if has_verb and has_noun_object:
        score += 0.20

    # Round away binary floating-point noise from the additions so that
    # callers comparing against a threshold see the intended two-decimal
    # value.
    return min(round(score, 2), 1.0)
90194
91195
0 commit comments