-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathparse.js
More file actions
48 lines (37 loc) · 1.49 KB
/
parse.js
File metadata and controls
48 lines (37 loc) · 1.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
const fs = require("fs");
const path = require("path");
/**
 * Collects the text of every `.txt` file in the `SOURCE_TEXT` directory that
 * sits next to this module and writes the result to `output.json`.
 *
 * For each file, all CR and LF characters are removed, then the share of
 * lowercase ASCII letters among all ASCII letters is computed. Files whose
 * share is below 1% (including files that contain no ASCII letters at all)
 * are skipped and counted as flagged; every other file contributes its
 * stripped text as one string in the output array.
 *
 * Side effects: logs progress to the console and synchronously writes
 * `output.json` (a JSON array of strings) relative to the current working
 * directory.
 *
 * @returns {void}
 * @throws {Error} If the source directory or one of its files cannot be
 *   read, or if `output.json` cannot be written.
 */
function parse() {
  const output = [];
  const flagged = [];
  let fileCount = 0;

  const asciiLetter = /[A-Za-z]/g;
  const asciiLowercase = /[a-z]/g;

  const textDir = path.join(__dirname, "./SOURCE_TEXT");

  // Keep only regular files: a directory that happens to be named
  // "something.txt" would otherwise make readFileSync throw EISDIR and abort
  // the whole run. statSync follows symlinks, so links to text files are
  // still picked up.
  const files = fs
    .readdirSync(textDir)
    .filter(
      (file) =>
        file.endsWith(".txt") &&
        fs.statSync(path.join(textDir, file)).isFile()
    );

  console.log(`[*] Found ${files.length} files`);

  for (const file of files) {
    const filePath = path.join(textDir, file);
    const content = fs.readFileSync(filePath, "utf8");

    // Line breaks are dropped outright (not replaced by spaces).
    const trimmedContent = content.replace(/[\r\n]/g, "");

    const totalLetters = (trimmedContent.match(asciiLetter) || []).length;
    const lowercaseCount = (trimmedContent.match(asciiLowercase) || []).length;

    // Guard the division: a file without any ASCII letters counts as 0%.
    const lowercasePercent =
      totalLetters === 0 ? 0 : (lowercaseCount / totalLetters) * 100;

    if (lowercasePercent < 1) {
      console.log(
        `[!] Skipping ${file} — only ${lowercasePercent.toFixed(2)}% lowercase (${lowercaseCount}/${totalLetters})`
      );
      flagged.push({
        file,
        lowercasePercent: Number(lowercasePercent.toFixed(4)),
        lowercaseCount,
        totalLetters,
      });
      continue;
    }

    output.push(trimmedContent);
    fileCount++;
  }

  console.log(`[+] Files successfully parsed, ${fileCount} files saved`);

  if (flagged.length > 0) {
    console.log(`[i] ${flagged.length} files flagged`);
  }

  fs.writeFileSync("output.json", JSON.stringify(output), "utf8");
}
module.exports = parse;