Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/Devoplus.DataGuardian/DataGuardianEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ public DataGuardianEngine(DataGuardianOptions opt, Ner.INerRecognizer? ner = nul
new Recognizers.IbanRecognizer(),
new Recognizers.CreditCardRecognizer(),
new Recognizers.TcknRecognizer(),
new Recognizers.VknRecognizer(),
new Recognizers.SgkRecognizer(),
new Recognizers.PassportRecognizer(),
new Recognizers.LicensePlateRecognizer(),
new Recognizers.DobRecognizer(),
new Recognizers.AddressRecognizer()
};
Expand Down
110 changes: 110 additions & 0 deletions src/Devoplus.DataGuardian/DataGuardianMiddleware.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ namespace Devoplus.DataGuardian;

public sealed class DataGuardianMiddleware
{
private const int JsonSearchLimit = 100;

private readonly RequestDelegate _next;
private readonly DataGuardianEngine _engine;
private readonly DataGuardianOptions _opt;
Expand Down Expand Up @@ -155,6 +157,11 @@ private bool IsAllowed(HttpContext ctx)

private static string Redact(string text, IEnumerable<PiiHit> hits, DataGuardianOptions opt)
{
if (opt.Redaction == RedactionStyle.JsonSafe)
{
return RedactJsonSafe(text, hits, opt);
}

var sb = new StringBuilder(text);
var toRedact = hits.Where(h => opt.RedactTypes.Contains(h.Type)).OrderByDescending(h => h.Start).ToList();
foreach (var h in toRedact)
Expand All @@ -180,4 +187,107 @@ private static string Redact(string text, IEnumerable<PiiHit> hits, DataGuardian
}
return sb.ToString();
}

/// <summary>
/// Redacts PII hits in a JSON payload. The text is parsed first; when that (or the
/// JSON-aware redaction that follows) throws, the text is redacted as plain text instead.
/// </summary>
/// <param name="text">Payload text the hit offsets refer to.</param>
/// <param name="hits">All hits found in <paramref name="text"/>.</param>
/// <param name="opt">Options supplying the set of PII types to redact.</param>
/// <returns>The redacted text.</returns>
private static string RedactJsonSafe(string text, IEnumerable<PiiHit> hits, DataGuardianOptions opt)
{
    try
    {
        using (var document = System.Text.Json.JsonDocument.Parse(text))
        {
            // Keep only the hits whose type is configured for redaction.
            var selected = new List<PiiHit>();
            foreach (var candidate in hits)
            {
                if (opt.RedactTypes.Contains(candidate.Type))
                {
                    selected.Add(candidate);
                }
            }

            return RedactJsonElement(text, document.RootElement, selected);
        }
    }
    catch
    {
        // Any failure above: redo the work through the regular path with the Partial style.
        var fallbackOptions = new DataGuardianOptions
        {
            Redaction = RedactionStyle.Partial,
            RedactTypes = opt.RedactTypes
        };
        return Redact(text, hits, fallbackOptions);
    }
}

/// <summary>
/// Masks every hit in <paramref name="originalText"/> that <c>IsWithinJsonValue</c> reports as
/// being in a value position; hits reported as not being in a value are left untouched.
/// </summary>
/// <param name="originalText">JSON text the hit offsets index into.</param>
/// <param name="element">Root element of the parsed document; forwarded to <c>IsWithinJsonValue</c>.</param>
/// <param name="hits">Hits to redact.</param>
/// <returns>The text with the qualifying hits replaced by their masked form.</returns>
private static string RedactJsonElement(string originalText, System.Text.Json.JsonElement element, List<PiiHit> hits)
{
    var sb = new StringBuilder(originalText);

    // Walk from the last hit to the first: a replacement may be shorter than the text it
    // replaces (see the e-mail mask), and going backwards keeps earlier offsets valid.
    foreach (var hit in hits.OrderByDescending(h => h.Start))
    {
        // Ignore hits that are empty, have a negative extent, or no longer fit in the buffer.
        if (hit.Start < 0 || hit.Length <= 0 || hit.Start + hit.Length > sb.Length) continue;

        if (!IsWithinJsonValue(originalText, hit.Start, element)) continue;

        // NOTE(review): a hit on an unquoted number is also masked with '*', which leaves
        // text that is no longer valid JSON - confirm whether callers need parseable output.
        var redacted = MaskJsonValue(sb.ToString(hit.Start, hit.Length));

        sb.Remove(hit.Start, hit.Length);
        sb.Insert(hit.Start, redacted);
    }

    return sb.ToString();
}

/// <summary>Builds the partially masked replacement for a single matched value.</summary>
private static string MaskJsonValue(string value)
{
    // Very short values are masked completely.
    if (value.Length <= 3)
    {
        return new string('*', value.Length);
    }

    if (value.Contains('@')) // Email-like
    {
        var parts = value.Split('@');
        if (parts.Length != 2)
        {
            // More than one '@': keep only the first character.
            return value[..1] + new string('*', value.Length - 1);
        }

        var local = parts[0];
        var localPart = local.Length > 2
            ? local[..1] + new string('*', local.Length - 1)
            : new string('*', local.Length);

        // Only the first label is starred and only the last label is kept, so a domain
        // with more than two labels comes out shorter than it went in.
        var domainParts = parts[1].Split('.');
        var domain = domainParts.Length > 1
            ? new string('*', domainParts[0].Length) + "." + domainParts[^1]
            : new string('*', parts[1].Length);

        return localPart + "@" + domain;
    }

    // Generic value: keep one or two characters at each end and star the middle.
    var visibleChars = Math.Max(1, Math.Min(2, value.Length / 3));
    return value[..visibleChars] + new string('*', value.Length - 2 * visibleChars) + value[^visibleChars..];
}

/// <summary>
/// Decides whether <paramref name="position"/> lies in a JSON value rather than in an object
/// property name, by scanning <paramref name="json"/> from its start up to that position while
/// tracking string/escape state and the stack of open containers.
/// </summary>
/// <remarks>
/// Unlike a backward search for the nearest ':' or ',', this classifies array elements, root-level
/// scalars and string values containing structural characters as values. The scan is linear in
/// <paramref name="position"/> for each call. The text is expected to be well-formed JSON.
/// </remarks>
/// <param name="json">The JSON text.</param>
/// <param name="position">Zero-based character offset to classify.</param>
/// <param name="root">Root element of the parsed document; not consulted by this method.</param>
/// <returns>
/// <c>false</c> when the offset is out of range or falls inside a property name; <c>true</c> otherwise.
/// </returns>
private static bool IsWithinJsonValue(string json, int position, System.Text.Json.JsonElement root)
{
    if (string.IsNullOrEmpty(json) || position < 0 || position >= json.Length) return false;

    var containers = new Stack<bool>(); // true = object, false = array
    bool inString = false;
    bool escaped = false;
    bool stringIsKey = false;
    char lastStructural = '\0';

    // A string is a property name only when it directly follows '{' or ',' inside an object.
    bool NextStringIsKey() =>
        containers.Count > 0 && containers.Peek() && (lastStructural == '{' || lastStructural == ',');

    for (int i = 0; i < position; i++)
    {
        char c = json[i];

        if (inString)
        {
            // Inside a string only an unescaped quote matters; structural characters are plain text.
            if (escaped) escaped = false;
            else if (c == '\\') escaped = true;
            else if (c == '"') inString = false;
            continue;
        }

        switch (c)
        {
            case '"':
                stringIsKey = NextStringIsKey();
                inString = true;
                break;
            case '{':
                containers.Push(true);
                lastStructural = c;
                break;
            case '[':
                containers.Push(false);
                lastStructural = c;
                break;
            case '}':
            case ']':
                if (containers.Count > 0) containers.Pop();
                lastStructural = c;
                break;
            case ':':
            case ',':
                lastStructural = c;
                break;
        }
    }

    // Position is inside a string: it is a value unless that string is a property name.
    if (inString) return !stringIsKey;

    // Position is on the opening quote of a string: classify the string that starts here.
    if (json[position] == '"') return !NextStringIsKey();

    // Outside any string, well-formed JSON has no property names (numbers, true/false/null).
    return true;
}
}
8 changes: 4 additions & 4 deletions src/Devoplus.DataGuardian/DataGuardianOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ public sealed class DataGuardianOptions

public Dictionary<string, double> Weights { get; set; } = new()
{
["TCKN"] = 10, ["CREDIT_CARD"] = 9, ["IBAN_TR"] = 8,
["DOB"] = 7, ["ADDRESS"] = 6, ["PHONE"] = 5, ["EMAIL"] = 4, ["PERSON"] = 3
["TCKN"] = 10, ["CREDIT_CARD"] = 9, ["VKN"] = 9, ["IBAN_TR"] = 8, ["PASSPORT"] = 8,
["DOB"] = 7, ["SGK"] = 7, ["ADDRESS"] = 6, ["PHONE"] = 5, ["LICENSE_PLATE"] = 5, ["EMAIL"] = 4, ["PERSON"] = 3
};

public int MaxCountPerType { get; set; } = 5;
Expand Down Expand Up @@ -47,7 +47,7 @@ public sealed class DataGuardianOptions
// Action mode
public ActionMode Action { get; set; } = ActionMode.Tag; // Tag by default
public double RedactAt { get; set; } = 0; // Redact when risk >= RedactAt
public HashSet<string> RedactTypes { get; set; } = new() { "EMAIL","PHONE","TCKN","CREDIT_CARD","IBAN_TR","DOB" };
public HashSet<string> RedactTypes { get; set; } = new() { "EMAIL","PHONE","TCKN","CREDIT_CARD","IBAN_TR","DOB","VKN","SGK","LICENSE_PLATE","PASSPORT" };
public RedactionStyle Redaction { get; set; } = RedactionStyle.MaskAll;

// Headers toggle
Expand All @@ -59,4 +59,4 @@ public sealed class DataGuardianOptions

// Supporting enums
public enum ActionMode { None, Tag, Redact, Block }
public enum RedactionStyle { MaskAll, Partial, Hash }
public enum RedactionStyle { MaskAll, Partial, Hash, JsonSafe }
23 changes: 23 additions & 0 deletions src/Devoplus.DataGuardian/Recognizers/LicensePlateRecognizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace Devoplus.DataGuardian.Recognizers;

/// <summary>
/// Finds Turkish vehicle license plates in text and reports them as <c>LICENSE_PLATE</c> hits.
/// </summary>
public sealed class LicensePlateRecognizer : IPiiRecognizer
{
    // Plate shape: 2 digits, 1-3 letters, 2-4 digits, with at most one whitespace character
    // between the groups (both "34 ABC 1234" and "34ABC1234" match).
    // Examples: "34 ABC 1234", "06 XY 9876", "01 A 1234"
    // CultureInvariant keeps the case-insensitive letter match independent of the process
    // culture; under tr-TR casing rules 'i'/'I' do not pair up the way [A-Z] expects.
    static readonly Regex Rx = new(@"\b\d{2}\s?[A-Z]{1,3}\s?\d{2,4}\b", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    /// <summary>Scans <paramref name="text"/> for plate-shaped tokens.</summary>
    /// <param name="text">Text to scan.</param>
    /// <param name="lang">Language code; anything other than "tr" yields no hits.</param>
    /// <returns>One hit per regex match, in match order; empty when the language is not "tr".</returns>
    public IReadOnlyList<PiiHit> Analyze(string text, string lang)
    {
        if (lang != "tr") return System.Array.Empty<PiiHit>();

        var list = new List<PiiHit>();
        foreach (Match m in Rx.Matches(text))
        {
            list.Add(new PiiHit("LICENSE_PLATE", m.Index, m.Length));
        }
        return list;
    }
}
23 changes: 23 additions & 0 deletions src/Devoplus.DataGuardian/Recognizers/PassportRecognizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace Devoplus.DataGuardian.Recognizers;

/// <summary>
/// Finds passport-number-shaped tokens in text and reports them as <c>PASSPORT</c> hits.
/// </summary>
public sealed class PassportRecognizer : IPiiRecognizer
{
    // Turkish passport format: 1 letter + 8 digits (e.g., "U12345678").
    // CultureInvariant keeps the case-insensitive letter match independent of the process
    // culture; under tr-TR casing rules 'i'/'I' do not pair up the way [A-Z] expects.
    static readonly Regex Rx = new(@"\b[A-Z]\d{8}\b", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    /// <summary>Scans <paramref name="text"/> for passport-shaped tokens.</summary>
    /// <param name="text">Text to scan.</param>
    /// <param name="lang">Language code; only "tr" and "en" are analyzed.</param>
    /// <returns>One hit per regex match, in match order; empty for any other language.</returns>
    public IReadOnlyList<PiiHit> Analyze(string text, string lang)
    {
        // Support both Turkish and English contexts
        if (lang != "tr" && lang != "en") return System.Array.Empty<PiiHit>();

        var list = new List<PiiHit>();
        foreach (Match m in Rx.Matches(text))
        {
            list.Add(new PiiHit("PASSPORT", m.Index, m.Length));
        }
        return list;
    }
}
21 changes: 21 additions & 0 deletions src/Devoplus.DataGuardian/Recognizers/SgkRecognizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace Devoplus.DataGuardian.Recognizers;

/// <summary>
/// Reports every standalone run of exactly twelve digits as an <c>SGK</c> hit.
/// No checksum is applied and the language argument is not consulted.
/// </summary>
public sealed class SgkRecognizer : IPiiRecognizer
{
    static readonly Regex TwelveDigits = new(@"\b\d{12}\b", RegexOptions.Compiled);

    /// <summary>Scans <paramref name="text"/> for twelve-digit tokens.</summary>
    /// <param name="text">Text to scan.</param>
    /// <param name="lang">Language code; unused here.</param>
    /// <returns>One hit per match, in match order.</returns>
    public IReadOnlyList<PiiHit> Analyze(string text, string lang)
    {
        var found = new List<PiiHit>();

        // Step through the matches one at a time; the pattern itself is the only filter.
        for (var match = TwelveDigits.Match(text); match.Success; match = match.NextMatch())
        {
            found.Add(new PiiHit("SGK", match.Index, match.Length));
        }

        return found;
    }
}
44 changes: 44 additions & 0 deletions src/Devoplus.DataGuardian/Recognizers/VknRecognizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;

namespace Devoplus.DataGuardian.Recognizers;

/// <summary>
/// Reports standalone ten-digit tokens that pass the VKN check-digit test as <c>VKN</c> hits.
/// The language argument is not consulted.
/// </summary>
public sealed class VknRecognizer : IPiiRecognizer
{
    static readonly Regex TenDigits = new(@"\b\d{10}\b", RegexOptions.Compiled);

    /// <summary>Scans <paramref name="text"/> for ten-digit tokens with a valid check digit.</summary>
    /// <param name="text">Text to scan.</param>
    /// <param name="lang">Language code; unused here.</param>
    /// <returns>One hit per valid match, in match order.</returns>
    public IReadOnlyList<PiiHit> Analyze(string text, string lang)
    {
        var found = new List<PiiHit>();

        for (var match = TenDigits.Match(text); match.Success; match = match.NextMatch())
        {
            if (!IsValid(match.Value))
            {
                continue;
            }

            found.Add(new PiiHit("VKN", match.Index, match.Length));
        }

        return found;
    }

    /// <summary>
    /// Checks the tenth digit against the value derived from the first nine.
    /// </summary>
    static bool IsValid(string s)
    {
        if (s.Length != 10)
        {
            return false;
        }

        int total = 0;
        for (int index = 0; index < 9; index++)
        {
            int digit = s[index] - '0';

            // Offset the digit by its distance from the end, then weight it by 2^(9 - index) mod 9.
            int shifted = (digit + (9 - index)) % 10;
            int weighted = (shifted * (1 << (9 - index))) % 9;

            // A non-zero digit whose weighted value wrapped to 0 counts as 9.
            if (shifted != 0 && weighted == 0)
            {
                weighted = 9;
            }

            total += weighted;
        }

        int expectedCheckDigit = (10 - (total % 10)) % 10;
        return (s[9] - '0') == expectedCheckDigit;
    }
}
113 changes: 113 additions & 0 deletions tests/Devoplus.DataGuardian.Tests/IntegrationTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
using Devoplus.DataGuardian;
using Xunit;
using System.Linq;

/// <summary>
/// Engine-level checks for the VKN, SGK, passport and license-plate recognizers
/// and for the default weights and redact types that accompany them.
/// </summary>
public class IntegrationTests
{
    // Builds an engine whose language is forced to Turkish.
    private static DataGuardianEngine CreateTurkishEngine()
    {
        var options = new DataGuardianOptions { LanguageOverride = "tr" };
        return new DataGuardianEngine(options);
    }

    [Fact]
    public void Engine_Detects_All_Turkish_Identifiers()
    {
        var engine = CreateTurkishEngine();

        var text = @"
TCKN: 10000000146
VKN: 8590095528
SGK: 123456789012
Passport: U12345678
License Plate: 34 ABC 1234
Email: test@example.com
Phone: 05551234567
";

        var (risk, counts, _) = engine.AnalyzeDetailed(text);

        // A positive score and at least five distinct PII types are expected.
        Assert.True(risk > 0);
        Assert.True(counts.Count >= 5);

        // Each identifier type present in the sample must have been counted.
        Assert.Contains("TCKN", counts.Keys);
        Assert.Contains("VKN", counts.Keys);
        Assert.Contains("SGK", counts.Keys);
        Assert.Contains("PASSPORT", counts.Keys);
        Assert.Contains("LICENSE_PLATE", counts.Keys);
    }

    [Fact]
    public void Engine_Calculates_Risk_With_New_Detectors()
    {
        // Floor that the VKN + TCKN + passport sample has to exceed.
        const double ExpectedMinimumHighRisk = 5.0;

        var engine = CreateTurkishEngine();

        // Sample made of high-weight identifiers.
        var (highRisk, _, _) = engine.AnalyzeDetailed("VKN: 8590095528, TCKN: 10000000146, Passport: U12345678");

        // Sample made of a single low-weight identifier.
        var (lowRisk, _, _) = engine.AnalyzeDetailed("Email: test@example.com");

        Assert.True(highRisk > lowRisk);
        Assert.True(highRisk > ExpectedMinimumHighRisk);
    }

    [Fact]
    public void Middleware_JsonSafe_Mode_Works_E2E()
    {
        // Only detection and option wiring are exercised here; no middleware is run.
        var options = new DataGuardianOptions
        {
            Redaction = RedactionStyle.JsonSafe,
            Action = ActionMode.Redact,
            RedactAt = 0,
            LanguageOverride = "tr"
        };
        var engine = new DataGuardianEngine(options);
        var payload = "{\"email\":\"test@example.com\",\"vkn\":\"8590095528\"}";

        var (risk, counts, _) = engine.AnalyzeDetailed(payload);

        // Detection inside a JSON payload.
        Assert.True(risk > 0);
        Assert.Contains("EMAIL", counts.Keys);
        Assert.Contains("VKN", counts.Keys);

        // Redaction configuration.
        Assert.Equal(RedactionStyle.JsonSafe, options.Redaction);
        Assert.Contains("VKN", options.RedactTypes);
        Assert.Contains("EMAIL", options.RedactTypes);
    }

    [Fact]
    public void Engine_Respects_New_Default_Weights()
    {
        var defaults = new DataGuardianOptions();

        // Each added type must be present with its expected default weight.
        Assert.True(defaults.Weights.ContainsKey("VKN"));
        Assert.Equal(9, defaults.Weights["VKN"]);

        Assert.True(defaults.Weights.ContainsKey("PASSPORT"));
        Assert.Equal(8, defaults.Weights["PASSPORT"]);

        Assert.True(defaults.Weights.ContainsKey("SGK"));
        Assert.Equal(7, defaults.Weights["SGK"]);

        Assert.True(defaults.Weights.ContainsKey("LICENSE_PLATE"));
        Assert.Equal(5, defaults.Weights["LICENSE_PLATE"]);
    }

    [Fact]
    public void Engine_Respects_New_RedactTypes()
    {
        var defaults = new DataGuardianOptions();

        // Each added type must be redacted by default.
        Assert.Contains("VKN", defaults.RedactTypes);
        Assert.Contains("SGK", defaults.RedactTypes);
        Assert.Contains("LICENSE_PLATE", defaults.RedactTypes);
        Assert.Contains("PASSPORT", defaults.RedactTypes);
    }
}
Loading