1- #include < print>
1+ #include < charconv>
2+ #include < format>
23#include < stack>
34#include < stdexcept>
45#include < string>
@@ -25,7 +26,8 @@ class XMLParser {
2526 std::string currentTag = " " ;
2627 std::string currentTagClose = " " ;
2728 std::string currentBody = " " ;
28- std::string currentPath = " " ;
29+ std::string currentPath = " " ;
30+ std::string currentEntity = " " ;
2931 auto tagStack = std::stack<std::string> {};
3032
3133 for (char ch : sv) {
@@ -43,12 +45,12 @@ class XMLParser {
4345 else {
4446 state = States::TagName;
4547 currentTag.push_back (ch);
46- if (currentPath.size () > 0 && currentPath[currentPath.size () - 2 ] != ' .' ) {
47- currentPath.push_back (' .' );
48- }
49- currentPath.push_back (ch);
48+ if (currentPath.size () >= 2 && currentPath[currentPath.size () - 2 ] != ' .' ) {
49+ currentPath.push_back (' .' );
50+ }
51+ currentPath.push_back (ch);
5052 }
51- break ;
53+ break ;
5254 }
5355 case States::TagName: {
5456 if (ch == ' ' )
@@ -59,7 +61,7 @@ class XMLParser {
5961 currentTag = " " ;
6062 } else {
6163 currentTag.push_back (ch);
62- currentPath.push_back (ch);
64+ currentPath.push_back (ch);
6365 }
6466 break ;
6567 }
@@ -74,20 +76,33 @@ class XMLParser {
7476 case States::Body: {
7577 if (ch == ' <' ) {
7678 state = States::Tag;
79+ } else if (ch == ' &' ) {
80+ state = States::Entity;
7781 } else {
7882 currentBody.push_back (ch);
7983 }
8084 break ;
8185 }
86+ case States::Entity: {
87+ if (ch == ' ;' ) {
88+ // Decode entity and append it to currentBody
89+ state = States::Body;
90+ currentBody.push_back (decodeXMLEntity (currentEntity));
91+ currentEntity = " " ;
92+ } else {
93+ currentEntity.push_back (ch);
94+ }
95+ break ;
96+ }
8297 case States::Tag: {
8398 if (ch == ' /' ) {
8499 state = States::TagClose;
85100 if (currentTagClose.size () == 0 )
86101 currentTagClose = tagStack.top ();
87102 } else {
88103 currentTag.push_back (ch);
89- currentPath.push_back (' .' );
90- currentPath.push_back (ch);
104+ currentPath.push_back (' .' );
105+ currentPath.push_back (ch);
91106 state = States::Processing;
92107 }
93108 break ;
@@ -118,10 +133,10 @@ class XMLParser {
118133 state = States::Body;
119134
120135 // Cleanup
121- tagStack.pop ();
122- if (auto pos = currentPath.find_last_of (' .' ); pos != std::string::npos) {
123- currentPath.erase (pos, std::string::npos);
124- }
136+ tagStack.pop ();
137+ if (auto pos = currentPath.find_last_of (' .' ); pos != std::string::npos) {
138+ currentPath.erase (pos, std::string::npos);
139+ }
125140 currentBody = " " ;
126141 break ;
127142 }
@@ -135,13 +150,49 @@ class XMLParser {
135150 throw std::runtime_error (" Something went wrong" );
136151 }
137152
// Decodes the name of an XML entity reference (the text between '&' and ';')
// into the single character it stands for.
//
// Supported forms:
//   - the five predefined XML entities: quot, apos, lt, gt, amp
//   - decimal character references:     #NNN
//   - hexadecimal character references: #xHHH / #XHHH
//
// The return type is a single `char`, so only numeric references whose value
// fits in one byte (0..0xFF) can be represented; anything larger is rejected
// instead of being silently truncated.
//
// @param entity  Entity name without the surrounding '&' and ';'.
// @return        The decoded character.
// @throws std::runtime_error if the entity is unknown, malformed (empty
//         digits, trailing garbage, sign characters) or out of range.
char decodeXMLEntity (const std::string& entity) {
    // Predefined XML escape characters
    if (entity == "quot")
        return '"';
    if (entity == "apos")
        return '\'';
    if (entity == "lt")
        return '<';
    if (entity == "gt")
        return '>';
    if (entity == "amp")
        return '&';

    // Numeric character references (e.g. ETags using &#34; for quotes)
    if (entity.size () > 1 && entity[0] == '#') {
        const bool isHex = entity[1] == 'x' || entity[1] == 'X';
        const char* const first = entity.data () + (isHex ? 2 : 1);
        const char* const last = entity.data () + entity.size ();

        int code = 0;
        const auto result = std::from_chars (first, last, code, isHex ? 16 : 10);

        // Require that the whole digit sequence was consumed (rejects "#12abc")
        // and that the value is a non-negative single byte (from_chars accepts
        // a leading '-', and wide code points would be truncated by the cast).
        const bool parsedAll = result.ec == std::errc {} && result.ptr == last;
        if (parsedAll && code >= 0 && code <= 0xFF) {
            return static_cast<char> (code);
        }
    }

    // Reached for unknown names as well as malformed/out-of-range numbers.
    throw std::runtime_error ("Unknown XML entity: &" + entity + ";");
}
187+
138188private:
139189 enum class States : int {
140190 Start,
141191 Processing,
142192 TagName,
143193 TagAttr,
144194 Body,
195+ Entity,
145196 Tag,
146197 TagClose,
147198 Emit,
0 commit comments