Skip to content

Commit 3b510bb

Browse files
committed
Add XML entity parsing
1 parent da17674 commit 3b510bb

4 files changed

Lines changed: 99 additions & 30 deletions

File tree

src/s3cpp/s3.hpp

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
class S3Client {
55
public:
6+
// TODO(cristian): We should accept and define the endpoint url here
67
S3Client(const std::string& access, const std::string& secret)
78
: Client(HttpClient())
89
, Signer(AWSSigV4Signer(access, secret))
@@ -17,14 +18,10 @@ class S3Client {
1718
}
1819

1920
void list_objects(const std::string& bucket, const std::string& prefix) {
20-
// TODO(cristian): Decide what to do with Host, if it will always be the same as the URL,
21-
// then we can autoamtically create this header on the HttpClient
22-
// TODO(cristian): This is currently hardcoded to point to MinIO Docker IP-Port...
21+
// TODO(cristian): Decide what to do with the Host header
2322

24-
const std::string empty_payload_hash = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";
25-
HttpRequest req = Client.get(std::format("http://127.0.0.1:9000/{}?prefix={}&max-keys=2", bucket, prefix))
26-
.header("Host", "127.0.0.1")
27-
.header("X-Amz-Content-Sha256", empty_payload_hash);
23+
HttpRequest req = Client.get(std::format("http://127.0.0.1:9000/{}?prefix={}", bucket, prefix))
24+
.header("Host", "127.0.0.1");
2825
Signer.sign(req);
2926

3027
HttpResponse res = req.execute();

src/s3cpp/xml.hpp

Lines changed: 65 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
#include <print>
1+
#include <charconv>
2+
#include <format>
23
#include <stack>
34
#include <stdexcept>
45
#include <string>
@@ -25,7 +26,8 @@ class XMLParser {
2526
std::string currentTag = "";
2627
std::string currentTagClose = "";
2728
std::string currentBody = "";
28-
std::string currentPath = "";
29+
std::string currentPath = "";
30+
std::string currentEntity = "";
2931
auto tagStack = std::stack<std::string> {};
3032

3133
for (char ch : sv) {
@@ -43,12 +45,12 @@ class XMLParser {
4345
else {
4446
state = States::TagName;
4547
currentTag.push_back(ch);
46-
if (currentPath.size() > 0 && currentPath[currentPath.size() - 2] != '.') {
47-
currentPath.push_back('.');
48-
}
49-
currentPath.push_back(ch);
48+
if (currentPath.size() >= 2 && currentPath[currentPath.size() - 2] != '.') {
49+
currentPath.push_back('.');
50+
}
51+
currentPath.push_back(ch);
5052
}
51-
break;
53+
break;
5254
}
5355
case States::TagName: {
5456
if (ch == ' ')
@@ -59,7 +61,7 @@ class XMLParser {
5961
currentTag = "";
6062
} else {
6163
currentTag.push_back(ch);
62-
currentPath.push_back(ch);
64+
currentPath.push_back(ch);
6365
}
6466
break;
6567
}
@@ -74,20 +76,33 @@ class XMLParser {
7476
case States::Body: {
7577
if (ch == '<') {
7678
state = States::Tag;
79+
} else if (ch == '&') {
80+
state = States::Entity;
7781
} else {
7882
currentBody.push_back(ch);
7983
}
8084
break;
8185
}
86+
case States::Entity: {
87+
if (ch == ';') {
88+
// Decode entity and append it to currentBody
89+
state = States::Body;
90+
currentBody.push_back(decodeXMLEntity(currentEntity));
91+
currentEntity = "";
92+
} else {
93+
currentEntity.push_back(ch);
94+
}
95+
break;
96+
}
8297
case States::Tag: {
8398
if (ch == '/') {
8499
state = States::TagClose;
85100
if (currentTagClose.size() == 0)
86101
currentTagClose = tagStack.top();
87102
} else {
88103
currentTag.push_back(ch);
89-
currentPath.push_back('.');
90-
currentPath.push_back(ch);
104+
currentPath.push_back('.');
105+
currentPath.push_back(ch);
91106
state = States::Processing;
92107
}
93108
break;
@@ -118,10 +133,10 @@ class XMLParser {
118133
state = States::Body;
119134

120135
// Cleanup
121-
tagStack.pop();
122-
if (auto pos = currentPath.find_last_of('.'); pos != std::string::npos) {
123-
currentPath.erase(pos, std::string::npos);
124-
}
136+
tagStack.pop();
137+
if (auto pos = currentPath.find_last_of('.'); pos != std::string::npos) {
138+
currentPath.erase(pos, std::string::npos);
139+
}
125140
currentBody = "";
126141
break;
127142
}
@@ -135,13 +150,49 @@ class XMLParser {
135150
throw std::runtime_error("Something went wrong");
136151
}
137152

153+
/// @brief Decodes one XML entity or character reference into a single character.
///
/// Handles the five predefined XML entities (quot, apos, lt, gt, amp) and
/// numeric character references in decimal ("#34") or hexadecimal ("#x22").
///
/// @param entity The text found between '&' and ';' (both delimiters excluded).
/// @return The decoded character.
/// @throws std::runtime_error if the entity is not one of the predefined names,
///         is a malformed numeric reference, or names a code point that cannot
///         be stored in a single `char` (anything outside U+0001..U+007F).
char decodeXMLEntity(const std::string& entity) {
    // Predefined XML entities
    if (entity == "quot")
        return '"';
    if (entity == "apos")
        return '\'';
    if (entity == "lt")
        return '<';
    if (entity == "gt")
        return '>';
    if (entity == "amp")
        return '&';

    // Numeric character references (i.e. ETags using quotes)
    if (entity.size() > 1 && entity.front() == '#') {
        // Hex: #xhhhh, Decimal: #dddd
        const bool isHex = entity[1] == 'x' || entity[1] == 'X';
        const char* first = entity.data() + (isHex ? 2 : 1);
        const char* last = entity.data() + entity.size();

        int code = 0;
        const auto [ptr, ec] = std::from_chars(first, last, code, isHex ? 16 : 10);

        // Require the whole reference to be consumed so that inputs such as
        // "#34abc" are rejected instead of being decoded as 34.
        if (ec == std::errc {} && ptr == last) {
            // The return type is a single char, so a wider code point would be
            // silently truncated; fail loudly instead of corrupting the value.
            if (code < 1 || code > 0x7F) {
                throw std::runtime_error("Unsupported XML character reference: &" + entity + ";");
            }
            return static_cast<char>(code);
        }
    }

    throw std::runtime_error("Unknown XML entity: &" + entity + ";");
}
187+
138188
private:
139189
enum class States : int {
140190
Start,
141191
Processing,
142192
TagName,
143193
TagAttr,
144194
Body,
195+
Entity,
145196
Tag,
146197
TagClose,
147198
Emit,

test/auth_test.cpp

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -75,22 +75,15 @@ TEST(AUTH, MinIOBasicRequest) {
7575
HttpClient client {};
7676

7777
// prepare request
78-
const std::string host = "127.0.0.1:9000";
79-
const std::string URI = "/";
80-
const std::string URL = std::format("http://{}{}", host, URI);
8178
const std::string empty_payload_hash = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";
82-
HttpRequest req = client.get(URL)
83-
.header("Host", host)
79+
HttpRequest req = client.get("http://127.0.0.1:9000/")
80+
.header("Host", "127.0.0.1")
8481
.header("X-Amz-Content-Sha256", empty_payload_hash);
8582
signer.sign(req);
8683

8784
try {
8885
HttpResponse resp = req.execute();
8986
EXPECT_EQ(resp.status(), 200);
90-
// std::println("RESPONSE STATUS: {}", resp.status());
91-
// std::println("RESPONSE HEADERS: {}", resp.headers());
92-
// std::println("RESPONSE BODY: {}", resp.body());
93-
9487
} catch (const std::exception& e) {
9588
// Our exception in the GitHub CI will be "Couldn't connect to server"
9689
// will be exactly returned as a runtime error like so:

test/xml_test.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,31 @@ TEST(XML, XMLAWSListBucket) {
9999
EXPECT_EQ(XMLValues[3].tag, "ListAllMyBucketsResult.Buckets.Bucket.CreationDate");
100100
EXPECT_EQ(XMLValues[3].value, "2025-12-07T14:32:30.240Z");
101101
}
102+
103+
TEST(XML, XMLHandleDecimalEntity) {
    // The ETag body wraps its text in decimal character references (&#34;),
    // which the parser is expected to decode into literal double quotes.
    // Pretty-printed, the document under test is:
    //
    //   <?xml version="1.0" encoding="UTF-8"?>
    //   <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
    //     <Contents>
    //       <ETag>&#34;This ETag has quotes!&#34;</ETag>
    //     </Contents>
    //   </ListBucketResult>
    constexpr char kDocument[] = R"(<?xml version="1.0" encoding="UTF-8"?><ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Contents><ETag>&#34;This ETag has quotes!&#34;</ETag></Contents></ListBucketResult>)";

    XMLParser xmlParser {};
    auto nodes = xmlParser.parse(kDocument);

    // Only the ETag leaf carries a body, so exactly one value is expected.
    EXPECT_EQ(nodes.size(), 1);
    EXPECT_EQ(nodes[0].tag, "ListBucketResult.Contents.ETag");
    EXPECT_EQ(nodes[0].value, "\"This ETag has quotes!\"");
}
116+
117+
TEST(XML, XMLHandleHexEntity) {
    // The ETag body wraps its text in hexadecimal character references
    // (&#x22;), which the parser is expected to decode into literal double
    // quotes. Pretty-printed, the document under test is:
    //
    //   <?xml version="1.0" encoding="UTF-8"?>
    //   <ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
    //     <Contents>
    //       <ETag>&#x22;This ETag has quotes!&#x22;</ETag>
    //     </Contents>
    //   </ListBucketResult>
    constexpr char kDocument[] = R"(<?xml version="1.0" encoding="UTF-8"?><ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Contents><ETag>&#x22;This ETag has quotes!&#x22;</ETag></Contents></ListBucketResult>)";

    XMLParser xmlParser {};
    auto nodes = xmlParser.parse(kDocument);

    // Only the ETag leaf carries a body, so exactly one value is expected.
    EXPECT_EQ(nodes.size(), 1);
    EXPECT_EQ(nodes[0].tag, "ListBucketResult.Contents.ETag");
    EXPECT_EQ(nodes[0].value, "\"This ETag has quotes!\"");
}

0 commit comments

Comments
 (0)