Skip to content

Commit 62ee46b

Browse files
Add CSV
1 parent 997b319 commit 62ee46b

File tree

7 files changed

+254
-11
lines changed

7 files changed

+254
-11
lines changed

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jobs:
1616
goarch: [ amd64, arm64]
1717
steps:
1818
- uses: actions/checkout@v4
19-
- uses: martijnboers/go-release-action@v1
19+
- uses: wangyoucao577/go-release-action@v1
2020
with:
2121
github_token: ${{ secrets.GITHUB_TOKEN }}
2222
goos: ${{ matrix.goos }}

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
## Description
2-
`unaware` is a command-line tool for masking sensitive data within XML and JSON files. It processes data from files or `stdin` and anonymizes specified property values. Masked values mimick the length and appearance of the original data types.
2+
`unaware` is a command-line tool for masking sensitive data within JSON, XML, and CSV files. It processes data from files or `stdin` and anonymizes specified property values. Masked values mimic the length and appearance of the original data types.
33

44
The program is a cross-platform, statically linked binary with no external dependencies. It leverages streaming and concurrency to efficiently process large files entirely offline.
55

@@ -13,7 +13,7 @@ Alternatively, check the releases page for pre-built binaries.
1313

1414
### Usage
1515
```
16-
Anonymize data in JSON and XML files by replacing values with realistic-looking fakes.
16+
Anonymize data in JSON, XML, and CSV files by replacing values with realistic-looking fakes.
1717
1818
Use the -method hashed option to preserve relationships by ensuring identical input values get the same masked output value. By default every run uses a random salt, use STATIC_SALT=test123 environment variable for consistent masking.
1919
@@ -22,7 +22,7 @@ Use the -method hashed option to preserve relationships by ensuring identical in
2222
-exclude value
2323
Glob pattern to exclude keys from masking (can be specified multiple times)
2424
-format string
25-
The format of the input data (json or xml) (default "json")
25+
The format of the input data (json, xml, or csv) (default "json")
2626
-in string
2727
Input file path (default: stdin)
2828
-include value

pkg/csv.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
package pkg

import (
	"encoding/csv"
	"errors"
	"fmt"
	"io"
	"sync"
)
9+
10+
// csvProcessor handles the reading, concurrent masking, and writing of CSV data.
11+
type csvProcessor struct {
12+
methodFactory func() *masker
13+
include []string
14+
exclude []string
15+
}
16+
17+
// newCSVProcessor creates a new processor for CSV files.
18+
func newCSVProcessor(strategy MaskingStrategy, include, exclude []string) *csvProcessor {
19+
return &csvProcessor{
20+
methodFactory: func() *masker {
21+
return newMasker(strategy)
22+
},
23+
include: include,
24+
exclude: exclude,
25+
}
26+
}
27+
28+
// Process orchestrates the reading and assembling of the CSV data.
29+
func (p *csvProcessor) Process(r io.Reader, w io.Writer, cpuCount int) error {
30+
csvReader := csv.NewReader(r)
31+
32+
// Read the header to get column names.
33+
header, err := csvReader.Read()
34+
if err == io.EOF {
35+
return nil // Handle empty file
36+
}
37+
if err != nil {
38+
return fmt.Errorf("error reading CSV header: %w", err)
39+
}
40+
41+
// The chunkReader function reads one row at a time and converts it into a map,
42+
// which is the "chunk" our concurrent runner will process.
43+
chunkReader := func() (any, error) {
44+
record, err := csvReader.Read()
45+
if err != nil {
46+
return nil, err // Let the runner handle io.EOF
47+
}
48+
// Convert the row (a slice of strings) into a map using the header.
49+
// This provides the necessary key (column name) for masking.
50+
rowMap := make(map[string]any, len(header))
51+
for i, value := range record {
52+
if i < len(header) {
53+
rowMap[header[i]] = value
54+
}
55+
}
56+
return rowMap, nil
57+
}
58+
59+
assembler := &csvAssembler{
60+
header: header,
61+
writer: csv.NewWriter(w),
62+
}
63+
runner := newConcurrentRunner(p.methodFactory, cpuCount, p.include, p.exclude)
64+
65+
return runner.Run(w, chunkReader, assembler)
66+
}
67+
68+
// csvAssembler is responsible for writing the processed data back into a CSV format.
69+
type csvAssembler struct {
70+
header []string
71+
writer *csv.Writer
72+
// A mutex is needed because multiple workers will call WriteItem concurrently.
73+
mu sync.Mutex
74+
}
75+
76+
func (a *csvAssembler) WriteStart(w io.Writer) error {
77+
return a.writer.Write(a.header)
78+
}
79+
80+
func (a *csvAssembler) WriteItem(w io.Writer, item any, isFirst bool) error {
81+
// The concurrent runner might call this from multiple goroutines,
82+
// so we lock to ensure writes are not interleaved.
83+
a.mu.Lock()
84+
defer a.mu.Unlock()
85+
86+
rowMap, ok := item.(map[string]any)
87+
if !ok {
88+
return fmt.Errorf("csv assembler expected map[string]any, but got %T", item)
89+
}
90+
91+
// Convert the map back into a slice of strings in the correct order.
92+
record := make([]string, len(a.header))
93+
for i, key := range a.header {
94+
if val, ok := rowMap[key]; ok {
95+
record[i] = fmt.Sprintf("%v", val)
96+
}
97+
}
98+
99+
return a.writer.Write(record)
100+
}
101+
102+
func (a *csvAssembler) WriteEnd(w io.Writer) error {
103+
// The csv.Writer needs to be flushed to ensure all buffered data is written.
104+
a.writer.Flush()
105+
return a.writer.Error()
106+
}

pkg/engine.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ func Start(format string, cpuCount int, r io.Reader, w io.Writer, strategy Maski
3232
p = newJSONProcessor(strategy, include, exclude)
3333
case "xml":
3434
p = newXMLProcessor(strategy, include, exclude)
35+
case "csv":
36+
p = newCSVProcessor(strategy, include, exclude)
3537
default:
3638
return fmt.Errorf("unsupported format: %s", format)
3739
}

test/csv_test.go

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
package test
2+
3+
import (
4+
"bytes"
5+
"encoding/csv"
6+
"strings"
7+
"testing"
8+
9+
"github.com/stretchr/testify/assert"
10+
"github.com/stretchr/testify/require"
11+
12+
"unaware/pkg"
13+
)
14+
15+
func TestCSVProcessing_Hashed(t *testing.T) {
16+
salt := []byte("csv-salt")
17+
input := `id,name,email,ip_address,notes
18+
1,John Doe,john.doe@example.com,192.168.1.1,some notes
19+
2,Jane Smith,jane.smith@example.net,10.0.0.2,more data`
20+
21+
var buf bytes.Buffer
22+
err := pkg.Start("csv", 2, strings.NewReader(input), &buf, pkg.Hashed(salt), nil, nil)
23+
require.NoError(t, err)
24+
25+
output := buf.String()
26+
assert.NotEmpty(t, output)
27+
28+
// Verify header is preserved
29+
assert.True(t, strings.HasPrefix(output, "id,name,email,ip_address,notes\n"))
30+
31+
// Verify original sensitive data is gone
32+
assert.NotContains(t, output, "John Doe")
33+
assert.NotContains(t, output, "john.doe@example.com")
34+
assert.NotContains(t, output, "192.168.1.1")
35+
assert.NotContains(t, output, "Jane Smith")
36+
assert.NotContains(t, output, "jane.smith@example.net")
37+
assert.NotContains(t, output, "10.0.0.2")
38+
39+
// Verify the structure is a valid CSV with the correct number of records
40+
r := csv.NewReader(strings.NewReader(output))
41+
records, err := r.ReadAll()
42+
require.NoError(t, err)
43+
assert.Len(t, records, 3) // Header + 2 rows
44+
assert.Len(t, records[1], 5)
45+
assert.Len(t, records[2], 5)
46+
}
47+
48+
func TestCSVProcessing_WithInclude(t *testing.T) {
49+
salt := []byte("csv-include-salt")
50+
input := `id,name,email,ip_address,notes
51+
1,John Doe,john.doe@example.com,192.168.1.1,some notes
52+
2,Jane Smith,jane.smith@example.net,10.0.0.2,more data`
53+
54+
// Only include 'email' and 'ip_address' for masking
55+
include := []string{"email", "ip_address"}
56+
57+
var buf bytes.Buffer
58+
err := pkg.Start("csv", 2, strings.NewReader(input), &buf, pkg.Hashed(salt), include, nil)
59+
require.NoError(t, err)
60+
61+
output := buf.String()
62+
63+
// Verify sensitive data that should be masked is gone
64+
assert.NotContains(t, output, "john.doe@example.com")
65+
assert.NotContains(t, output, "192.168.1.1")
66+
67+
// Verify data that should be preserved is still there
68+
assert.Contains(t, output, "John Doe")
69+
assert.Contains(t, output, "Jane Smith")
70+
assert.Contains(t, output, "some notes")
71+
72+
// Verify structure and read the output to check specific fields
73+
r := csv.NewReader(strings.NewReader(output))
74+
records, err := r.ReadAll()
75+
require.NoError(t, err)
76+
require.Len(t, records, 3)
77+
78+
// Check first data row
79+
assert.Equal(t, "1", records[1][0])
80+
assert.Equal(t, "John Doe", records[1][1])
81+
assert.NotEqual(t, "john.doe@example.com", records[1][2]) // Masked
82+
assert.NotEqual(t, "192.168.1.1", records[1][3]) // Masked
83+
assert.Equal(t, "some notes", records[1][4])
84+
}
85+
86+
func TestCSVProcessing_WithExclude(t *testing.T) {
87+
salt := []byte("csv-exclude-salt")
88+
input := `id,name,email,ip_address,notes
89+
1,John Doe,john.doe@example.com,192.168.1.1,some notes`
90+
91+
// Exclude 'id' and 'notes' from masking
92+
exclude := []string{"id", "notes"}
93+
94+
var buf bytes.Buffer
95+
err := pkg.Start("csv", 2, strings.NewReader(input), &buf, pkg.Hashed(salt), nil, exclude)
96+
require.NoError(t, err)
97+
98+
output := buf.String()
99+
100+
// Verify sensitive data that should be masked is gone
101+
assert.NotContains(t, output, "John Doe")
102+
assert.NotContains(t, output, "john.doe@example.com")
103+
104+
// Verify data that should be preserved is still there
105+
assert.Contains(t, output, "1,") // id is preserved
106+
assert.Contains(t, output, ",some notes") // notes is preserved
107+
108+
// Verify structure
109+
r := csv.NewReader(strings.NewReader(output))
110+
records, err := r.ReadAll()
111+
require.NoError(t, err)
112+
require.Len(t, records, 2)
113+
assert.Equal(t, "1", records[1][0]) // Preserved
114+
assert.NotEqual(t, "John Doe", records[1][1]) // Masked
115+
assert.NotEqual(t, "john.doe@example.com", records[1][2]) // Masked
116+
assert.NotEqual(t, "192.168.1.1", records[1][3]) // Masked
117+
assert.Equal(t, "some notes", records[1][4]) // Preserved
118+
}
119+
120+
func TestEmptyReader_CSV(t *testing.T) {
121+
var buf bytes.Buffer
122+
err := pkg.Start("csv", 1, strings.NewReader(""), &buf, pkg.Random(), nil, nil)
123+
require.NoError(t, err)
124+
assert.Equal(t, "", buf.String())
125+
}
126+
127+
func TestReaderError_CSV(t *testing.T) {
128+
errorReader := &errorReader{}
129+
var buf bytes.Buffer
130+
err := pkg.Start("csv", 1, errorReader, &buf, pkg.Random(), nil, nil)
131+
require.Error(t, err)
132+
}

test/json_test.go

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package test
33
import (
44
"bytes"
55
"encoding/json"
6-
"io"
76
"strings"
87
"testing"
98

@@ -197,9 +196,3 @@ func TestJSONStreamingNestedArray(t *testing.T) {
197196
err = json.Unmarshal([]byte(output), &result)
198197
require.NoError(t, err, "Output should be valid JSON. Got: %s", output)
199198
}
200-
201-
type errorReader struct{}
202-
203-
func (r *errorReader) Read(p []byte) (n int, err error) {
204-
return 0, io.ErrUnexpectedEOF
205-
}

test/test_utils.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
package test
2+
3+
import "io"
4+
5+
// errorReader is a helper for testing that simulates an error during reading.
6+
type errorReader struct{}
7+
8+
func (r *errorReader) Read(p []byte) (n int, err error) {
9+
return 0, io.ErrUnexpectedEOF
10+
}

0 commit comments

Comments
 (0)