-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathProgram.cs
More file actions
91 lines (72 loc) · 2.73 KB
/
Program.cs
File metadata and controls
91 lines (72 loc) · 2.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
// See https://aka.ms/new-console-template for more information
using System.Diagnostics;
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.XPath;
// Entry point: expects the path of a TMX file as the first command-line argument.
// Exit code 0 = the file was processed, 1 = no file was given.
Console.WriteLine("Process TMX");

// Guard clause: without a path there is nothing to do. Say so on stderr instead of
// exiting silently, so a caller can tell why the exit code is 1.
if (args.Length == 0)
{
    Console.Error.WriteLine("Usage: pass the path of the TMX file to process as the first argument.");
    return 1;
}

// Convert the TMX file; the output is written next to it (see ProcessTMX).
ProcessTMX(args[0]);
Console.WriteLine("Run completed.");
return 0;
// Reads the TMX (XML) file at fileName and writes a tab-separated file named
// fileName + ".cleanonly.tsv" with a header row "EN<TAB>ES" followed by one row per
// translation unit: the cleaned en-US segment, a tab, the cleaned es-ES segment.
//
// A translation unit is counted as discarded (and not written) when:
//   - it has no en-US or no es-ES <tuv>/<seg>,
//   - either side is empty after RemoveMarkup, or
//   - the cleaned source text starts with "Ramon", "Diana" or "Edgar".
// The processed/discarded totals are printed to the console at the end.
//
// Exceptions from loading the XML or creating the output file propagate to the caller.
static void ProcessTMX(string fileName)
{
    XmlDocument xmlDoc = new();
    xmlDoc.Load(fileName);

    // The namespace manager is needed so the XPath below may use the "xml" prefix in
    // @xml:lang; XmlNamespaceManager binds that prefix by default.
    XmlNamespaceManager xmlNamespaceManager = new(xmlDoc.NameTable);

    // using declaration: the writer is flushed and closed even if an exception escapes the loop.
    using StreamWriter outfile = new(fileName + ".cleanonly.tsv");
    outfile.WriteLine("EN\tES");

    // Select every <tu> element anywhere in the document.
    var tuNodes = xmlDoc.SelectNodes("//tu");
    Console.WriteLine($"tuNodes.Count: {tuNodes?.Count ?? 0}");

    int discarded = 0;
    int processed = 0;

    if (tuNodes is not null)
    {
        foreach (XmlNode tuNode in tuNodes)
        {
            var segSource = tuNode.SelectSingleNode("./tuv[@xml:lang='en-US']/seg", xmlNamespaceManager);
            var segTarget = tuNode.SelectSingleNode("./tuv[@xml:lang='es-ES']/seg", xmlNamespaceManager);

            // A unit missing either language cannot produce a row; discard it rather than
            // dereferencing a null node.
            if (segSource is null || segTarget is null)
            {
                discarded++;
                continue;
            }

            string cleanSourceText = RemoveMarkup(segSource.InnerText);
            string cleanTargetText = RemoveMarkup(segTarget.InnerText);

            if (string.IsNullOrEmpty(cleanSourceText) || string.IsNullOrEmpty(cleanTargetText))
            {
                discarded++;
                continue;
            }

            // NOTE(review): the reason these three prefixes are excluded is not stated in
            // this file - confirm before changing the list. Ordinal comparison keeps the
            // match exact and independent of the current culture.
            if (cleanSourceText.StartsWith("Ramon", StringComparison.Ordinal)
                || cleanSourceText.StartsWith("Diana", StringComparison.Ordinal)
                || cleanSourceText.StartsWith("Edgar", StringComparison.Ordinal))
            {
                discarded++;
                continue;
            }

            processed++;
            // NOTE(review): RemoveMarkup replaces tabs but not embedded line breaks, so a
            // segment containing a newline would spill over several TSV rows.
            outfile.WriteLine($"{cleanSourceText}\t{cleanTargetText}");
        }
    }

    Console.WriteLine($"Processed: {processed} Discarded: {discarded}");
}
// Returns the text of a segment with tag-like markup and list decoration removed.
//
// Steps, in order:
//   1. delete every "<...>" run (a '<', one or more non-'>' characters, a '>');
//   2. replace tabs with spaces and delete every "• " (bullet + space);
//   3. drop one leading '-' and then one leading '■';
//   4. if the text starts with a double quote, prefix it with a second one;
//   5. collapse each run of dots into a single '.';
//   6. trim leading and trailing white space.
// The leading-character checks run before the final trim, so they only fire when the
// marker is the very first character.
static string RemoveMarkup(string tuv)
{
    // Remove anything that looks like an XML/HTML tag.
    string plainText = Regex.Replace(tuv, @"<[^>]+>", "");

    // Other cleanup.
    plainText = plainText.Replace("\t", " ");
    plainText = plainText.Replace("• ", "");

    // The char overloads of StartsWith compare ordinally. The string overloads are
    // culture-sensitive and can report a match when an ignorable character (such as a
    // soft hyphen) precedes the marker, in which case [1..] would cut the wrong character.
    if (plainText.StartsWith('-'))
    {
        plainText = plainText[1..];
    }

    if (plainText.StartsWith('■'))
    {
        plainText = plainText[1..];
    }

    // NOTE(review): doubling a leading quote presumably keeps TSV readers from treating
    // the field as a quoted one - confirm against the consumer of the output.
    if (plainText.StartsWith('"'))
    {
        plainText = "\"" + plainText;
    }

    // "..." (or any longer run of dots) becomes ".".
    plainText = Regex.Replace(plainText, @"\.+", ".");

    return plainText.Trim();
}