Classifier_PII_LATAM/test_improvements.py at main · andresveraf/Classifier_PII_LATAM · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
"""
Test script to demonstrate all improvements:
1. Error handling
2. Input validation
3. Edge case handling
4. Logging
5. Performance monitoring
6. Batch error recovery
"""

import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent))

from inference.pipeline import PII_ValidationPipeline, PipelineError
from utils import ValidationError


def test_edge_cases():
    """Run the pipeline on missing, empty and whitespace-only text.

    Each sample is validated in turn. On success the sample and the
    ``is_pii`` / ``validation_path`` fields of the result are printed; any
    exception raised while validating or reporting is printed as a failure
    and the loop moves on to the next sample.
    """
    rule = "=" * 60
    print(rule)
    print("TEST 1: Edge Cases")
    print(rule)

    validator = PII_ValidationPipeline()

    # Inputs that carry no usable text: None, "", and spaces only.
    blank_samples = [
        {"text": None, "entity_type": "ID", "country": "CL"},
        {"text": "", "entity_type": "PHONE", "country": "BR"},
        {"text": "   ", "entity_type": "EMAIL", "country": "CL"},
    ]

    for sample in blank_samples:
        try:
            outcome = validator.validate(
                sample["text"],
                sample["entity_type"],
                sample["country"],
            )
            print(f"\n✓ Empty/None test: {sample}")
            print(f"  Result: is_pii={outcome['is_pii']}, path={outcome['validation_path']}")
        except Exception as exc:
            # Demo script: report the problem and keep going with the next sample.
            print(f"\n✗ Failed: {exc}")

    print(f"\n{rule}\n")


def test_input_validation():
    """Feed the pipeline deliberately invalid arguments and report the outcome.

    Three calls are made: an unknown entity type, an unknown country code and
    a 20000-character text. For each one, a ``ValidationError`` or
    ``PipelineError`` is reported as the expected result; a call that returns
    normally is reported as a miss. Any other exception type propagates.
    """
    rule = "=" * 60
    print(rule)
    print("TEST 2: Input Validation")
    print(rule)

    validator = PII_ValidationPipeline()

    # (label shown to the user, positional arguments for validate())
    rejected_inputs = (
        ("Invalid entity type", ("test", "INVALID_TYPE", "CL")),
        ("Invalid country", ("test", "ID", "XX")),
        ("Too long text", ("x" * 20000, "ID", "CL")),
    )

    for number, (label, call_args) in enumerate(rejected_inputs, start=1):
        print(f"\n{number}. {label}:")
        try:
            validator.validate(*call_args)
            # Reaching this line means the invalid input was accepted.
            print("  ✗ Should have raised ValidationError")
        except (ValidationError, PipelineError) as err:
            print(f"  ✓ Correctly caught: {err}")

    print(f"\n{rule}\n")


def test_batch_error_recovery():
    """Send a batch that mixes usable and blank items through ``validate_batch``.

    Prints how many results came back, then one summary line per result with
    its PII verdict and ``validation_path``.
    """
    rule = "=" * 60
    print(rule)
    print("TEST 3: Batch Error Recovery")
    print(rule)

    validator = PII_ValidationPipeline()

    # Items 2 and 4 carry no text (empty string / None); the rest are populated.
    mixed_items = [
        {"text": "15.783.037-6", "entity_type": "ID", "country": "CL"},
        {"text": "", "entity_type": "PHONE", "country": "BR"},
        {"text": "juan.perez@gmail.com", "entity_type": "EMAIL", "country": "CL"},
        {"text": None, "entity_type": "ID", "country": "CL"},
        {"text": "+56912345678", "entity_type": "PHONE", "country": "CL"},
    ]

    print(f"\nProcessing batch of {len(mixed_items)} items (including invalid ones)...")
    outcomes = validator.validate_batch(mixed_items)

    print(f"\n✓ Batch completed: {len(outcomes)} results")
    print("\nResults summary:")
    for position, outcome in enumerate(outcomes, start=1):
        if outcome['is_pii']:
            verdict = "✓ PII"
        else:
            verdict = "✗ NOT PII"
        print(f"  Item {position}: {verdict} (path: {outcome['validation_path']})")

    print(f"\n{rule}\n")


def test_logging_and_monitoring():
    """Run a handful of validations, then print the pipeline's performance stats.

    The validation results themselves are discarded; the calls exist only to
    give ``get_performance_stats()`` something to report. The ``'validate'``
    and ``'overall'`` sections are printed only when present in the stats.
    """
    rule = "=" * 60
    thin_rule = "-" * 60

    print(rule)
    print("TEST 4: Logging & Performance Monitoring")
    print(rule)

    validator = PII_ValidationPipeline()

    # (text, entity_type, country) triples used purely to generate activity.
    samples = (
        ("15.783.037-6", "ID", "CL"),
        ("+56912345678", "PHONE", "CL"),
        ("juan.perez@gmail.com", "EMAIL", "CL"),
        ("123.456.789-00", "ID", "BR"),
        ("Juan Carlos Pérez", "PER", "CL"),
    )

    print(f"\nPerforming {len(samples)} validations...")
    for text, entity_type, country in samples:
        validator.validate(text, entity_type, country)

    # Header is printed before the stats are fetched, as in the report layout.
    print(f"\n{thin_rule}")
    print("Performance Statistics:")
    print(thin_rule)
    perf = validator.get_performance_stats()

    if 'validate' in perf:
        per_call = perf['validate']
        print(f"  Validations: {per_call['count']}")
        print(f"  Avg Time: {per_call['avg_time_ms']:.2f}ms")
        print(f"  Errors: {per_call['errors']}")

    if 'overall' in perf:
        totals = perf['overall']
        print(f"  Total Operations: {totals['total_operations']}")
        print(f"  Throughput: {totals['throughput_ops_per_sec']:.2f} ops/sec")

    print("\n✓ Check logs/ directory for detailed logs")

    print(f"\n{rule}\n")


def test_successful_validations():
    """Compare the pipeline's ``is_pii`` verdict against expected labels.

    For every labelled sample, prints whether the verdict matched, the
    expected and actual values, and the result's ``validation_path`` and
    ``confidence``. Finishes with an accuracy line over all samples.
    """
    rule = "=" * 60
    print(rule)
    print("TEST 5: Successful Validations")
    print(rule)

    validator = PII_ValidationPipeline()

    # (text, entity_type, country, expected is_pii)
    labelled_samples = (
        # Expected to be flagged as PII
        ("15.783.037-6", "ID", "CL", True),
        ("123.456.789-00", "ID", "BR", True),
        ("+56912345678", "PHONE", "CL", True),
        ("juan.perez@gmail.com", "EMAIL", "CL", True),
        # Expected to be rejected (false positives)
        ("123456789", "ID", "CL", False),
        ("Article 15.783.037-6 of the law", "ID", "CL", False),
    )

    hits = 0
    for text, entity_type, country, expected in labelled_samples:
        outcome = validator.validate(text, entity_type, country)
        matched = outcome['is_pii'] == expected
        hits += matched

        if matched:
            mark = "✓"
        else:
            mark = "✗"
        # Long texts are cut to 40 characters for the headline.
        print(f"\n{mark} {text[:40]}")
        print(f"   Expected: {expected}, Got: {outcome['is_pii']}")
        print(f"   Path: {outcome['validation_path']}, Confidence: {outcome['confidence']:.2f}")

    total = len(labelled_samples)
    print(f"\n{rule}")
    print(f"Accuracy: {hits}/{total} ({100*hits/total:.0f}%)")
    print(f"{rule}\n")


def main():
    """Run all demonstrations in order and report the overall outcome.

    The five ``test_*`` functions are called one after another. If any of
    them raises, the exception and its traceback are printed and the
    remaining demonstrations are skipped.

    Returns:
        int: ``0`` when every demonstration ran to completion, ``1`` when one
        of them raised. Intended to be used as the process exit status so
        shells and CI can detect a failed run.
    """
    print("\n" + "=" * 60)
    print("TESTING PII VALIDATION IMPROVEMENTS")
    print("=" * 60 + "\n")

    try:
        # Test 1: Edge cases
        test_edge_cases()

        # Test 2: Input validation
        test_input_validation()

        # Test 3: Batch error recovery
        test_batch_error_recovery()

        # Test 4: Logging and monitoring
        test_logging_and_monitoring()

        # Test 5: Successful validations
        test_successful_validations()

        print("=" * 60)
        print("ALL TESTS COMPLETED SUCCESSFULLY")
        print("=" * 60)
        print("\nImprovements implemented:")
        print("  ✓ 1. Comprehensive error handling")
        print("  ✓ 2. Input validation with custom exceptions")
        print("  ✓ 3. Edge case handling (None/empty text)")
        print("  ✓ 4. Structured logging system")
        print("  ✓ 5. Performance monitoring")
        print("  ✓ 6. Batch error recovery")
        print("  ✓ 7. Model loading validation")
        print("\nCheck logs/ directory for detailed execution logs")
        print("=" * 60 + "\n")

    except Exception as e:
        # Top-level boundary: report the failure in full, then signal it to
        # the caller instead of letting the run look successful.
        print(f"\n✗ Test suite failed: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0


if __name__ == "__main__":
    # Propagate main()'s status so a failed run exits non-zero.
    sys.exit(main())