diff --git a/docs/simfil-language.md b/docs/simfil-language.md index 660448a..ace9fa9 100644 --- a/docs/simfil-language.md +++ b/docs/simfil-language.md @@ -136,7 +136,7 @@ count(mylist.*) ## Types -Simfil supports the following scalar types: `null`, `bool`, `int`, `float` (double precision), `string` and `re`. +Simfil supports the following scalar types: `null`, `bool`, `int`, `float` (double precision), `string`, `bytes` and `re`. Additionally, the `model` type represents compound object/array container nodes. All values but `null` and `false` are considered `true`, implicit boolean conversion takes place for operators `and` and `or` only. @@ -151,6 +151,11 @@ The following types can be target types for a cast: * `int` - Converts the value to an integer. Returns 0 on failure. * `float` - Converts the value to a float. Returns 0 on failure. * `string` - Converts the value to a string. Boolean values are converted to either "true" or "false". +* `bytes` - Converts the value to bytes. + +Byte literals are written using the `b` prefix, e.g. `b"hello"` or `b'hello'`. +Escape sequences `\n`, `\r`, `\t`, `\\`, `\"`, and `\'` are supported. +Bytes can also be written explicitly using `\xNN` (hex), e.g. `b"\x41\x00"`. ## Operators @@ -161,12 +166,12 @@ The following types can be target types for a cast: | `[ a ]` | Array/Object subscript, index expression can be of type `int` or `string`. | | `{ a }` | Sub-Query (inside sub-query `_` represents the value the query is applied to). | | `. b` or `a . b` | Direct field access; returns the value of field `b` or `null`. | -| `a as b` | Cast a to type b (one of `bool`, `int`, `float` or `string`). | +| `a as b` | Cast a to type b (one of `bool`, `int`, `float`, `string` or `bytes`). | | `a ?` | Get boolean value of `a` (see ##Types). | | `a ...` | Unpacks `a` to a list of values (see function `range` under [Functions](#Functions) for example) | -| `typeof a` | Returns the type of the value of its expression (`"null"`, `"bool"`, `"int"`, `"float"` or `"string"`). | +| `typeof a` | Returns the type of the value of its expression (`"null"`, `"bool"`, `"int"`, `"float"`, `"string"` or `"bytes"`). | | `not a` | Boolean not. | -| `# a` | Returns the length of a string or array value. | +| `# a` | Returns the length of a string, bytes, or array value. | | `~ a` | Bitwise not. | | `- a` | Unary minus. | | `a * b` | Multiplication. | diff --git a/include/simfil/byte-array.h b/include/simfil/byte-array.h new file mode 100644 index 0000000..9b804ac --- /dev/null +++ b/include/simfil/byte-array.h @@ -0,0 +1,129 @@ +// Copyright (c) Navigation Data Standard e.V. - See "LICENSE" file. +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace simfil +{ + +struct ByteArray +{ + std::string bytes; + + ByteArray() = default; + + explicit ByteArray(const char* data) + : bytes(data) + {} + + explicit ByteArray(std::string_view data) + : bytes(data) + {} + + explicit ByteArray(std::string data) + : bytes(std::move(data)) + {} + + auto operator==(const ByteArray&) const -> bool = default; + + [[nodiscard]] static std::optional fromHex(std::string_view hex) + { + if (hex.size() % 2 != 0) + return std::nullopt; + + std::string decoded; + decoded.reserve(hex.size() / 2); + for (size_t i = 0; i < hex.size(); i += 2) { + const auto upper = decodeHexNibble(hex[i]); + const auto lower = decodeHexNibble(hex[i + 1]); + if (upper < 0 || lower < 0) + return std::nullopt; + decoded.push_back(static_cast((upper << 4) | lower)); + } + + return ByteArray{std::move(decoded)}; + } + + [[nodiscard]] std::optional decodeBigEndianI64() const + { + if (bytes.size() > 8) { + for (size_t i = 8; i < bytes.size(); ++i) { + if (static_cast(bytes[i]) != 0) + return std::nullopt; + } + } + + const size_t count = bytes.size() <= 8 ? bytes.size() : 8; + uint64_t value = 0; + for (size_t i = 0; i < count; ++i) { + value = (value << 8) | static_cast(bytes[i]); + } + + int64_t signedValue = 0; + std::memcpy(&signedValue, &value, sizeof(signedValue)); + return signedValue; + } + + [[nodiscard]] std::string toHex(bool uppercase = true) const + { + std::string out; + out.reserve(bytes.size() * 2); + + if (uppercase) { + for (unsigned char byte : bytes) + fmt::format_to(std::back_inserter(out), FMT_STRING("{:02X}"), byte); + } else { + for (unsigned char byte : bytes) + fmt::format_to(std::back_inserter(out), FMT_STRING("{:02x}"), byte); + } + + return out; + } + + [[nodiscard]] std::string toLiteral() const + { + std::string out; + out.reserve(bytes.size() + 3); + out += "b\""; + + for (unsigned char byte : bytes) { + switch (byte) { + case '\\': out += "\\\\"; break; + case '"': out += "\\\""; break; + case '\n': out += "\\n"; break; + case '\r': out += "\\r"; break; + case '\t': out += "\\t"; break; + default: + if (byte < 0x20 || byte >= 0x7f) + fmt::format_to(std::back_inserter(out), FMT_STRING("\\x{:02X}"), byte); + else + out.push_back(static_cast(byte)); + break; + } + } + + out.push_back('"'); + return out; + } + + [[nodiscard]] static auto decodeHexNibble(char c) -> int + { + if ('0' <= c && c <= '9') + return c - '0'; + if ('a' <= c && c <= 'f') + return c - 'a' + 10; + if ('A' <= c && c <= 'F') + return c - 'A' + 10; + return -1; + } +}; + +} // namespace simfil diff --git a/include/simfil/model/model.h b/include/simfil/model/model.h index 0e02c93..66bdd81 100644 --- a/include/simfil/model/model.h +++ b/include/simfil/model/model.h @@ -2,6 +2,7 @@ #pragma once #include "simfil/model/string-pool.h" +#include "simfil/byte-array.h" #include "tl/expected.hpp" #if defined(SIMFIL_WITH_MODEL_JSON) # include "nlohmann/json.hpp" @@ -9,7 +10,10 @@ #include #include +#include #include +#include +#include #include #include @@ -19,6 +23,38 @@ namespace simfil { +namespace res +{ +// Tag type for ADL-based resolve hooks implemented by model libraries. +template +struct tag {}; +} + +namespace detail +{ +template +concept HasModelType = requires { typename T::ModelType; }; + +template +using ModelTypeOf = typename T::ModelType; +} + +/** + * ADL customization point for typed node resolution. + * Libraries define resolveInternal(tag, model, node) in their namespace. + */ +template +model_ptr resolveInternal(res::tag, ModelType const&, ModelNode const&) = delete; + +class ModelPool; + +// Built-in resolve hooks for core node types. Declared here so ADL sees them +// across translation units without relying on friend injection. +template<> +model_ptr resolveInternal(res::tag, ModelPool const&, ModelNode const&); +template<> +model_ptr resolveInternal(res::tag, ModelPool const&, ModelNode const&); + /** * Basic node model which only resolves trivial node types. */ @@ -58,6 +94,60 @@ class Model : public std::enable_shared_from_this */ virtual tl::expected resolve(ModelNode const& n, ResolveFn const& cb) const; + /** + * Resolve a node to a specific ModelNode subtype using ADL hooks. + * This provides a clean cast API without exposing model internals. + */ + template + model_ptr resolve(ModelNodeAddress const& address) const + { + if constexpr (std::is_same_v) { + return ModelNode::Ptr::make(shared_from_this(), address); + } + return resolve(*ModelNode::Ptr::make(shared_from_this(), address)); + } + + template + model_ptr resolve(ModelNodeAddress const& address, ScalarValueType data) const + { + if constexpr (std::is_same_v) { + return ModelNode::Ptr::make(shared_from_this(), address, std::move(data)); + } + return resolve(*ModelNode::Ptr::make(shared_from_this(), address, std::move(data))); + } + + template + model_ptr resolve(ModelNode::Ptr const& node) const + { + return resolve(*node); + } + + template + model_ptr resolve(ModelNode const& node) const + { + if constexpr (std::is_same_v) { + return model_ptr(node); + } + else { + if constexpr (!detail::HasModelType) { + static_assert(detail::HasModelType, "Target must provide a ModelType alias."); + return {}; + } + else { + using ModelType = detail::ModelTypeOf; +#if !defined(NDEBUG) + // In debug builds, validate the model type to catch misuse early. + auto typedModel = dynamic_cast(this); + assert(typedModel && "resolve called on incompatible model type."); + return resolveInternal(res::tag{}, *typedModel, node); +#else + // In release builds, avoid RTTI overhead on this hot path. + return resolveInternal(res::tag{}, *static_cast(this), node); +#endif + } + } + } + /** Add a small scalar value and get its model node view */ ModelNode::Ptr newSmallValue(bool value); ModelNode::Ptr newSmallValue(int16_t value); @@ -88,6 +178,8 @@ class ModelPool : public Model template friend struct BaseArray; public: + // Keep Model::resolve overloads visible alongside the virtual resolve override. + using Model::resolve; /** * The pool consists of multiple ModelNode columns, * each for a different data type. Each column @@ -100,6 +192,7 @@ class ModelPool : public Model Double, String, PooledString, + ByteArray, FirstCustomColumnId = 128, }; @@ -154,14 +247,9 @@ class ModelPool : public Model ModelNode::Ptr newValue(int64_t const& value); ModelNode::Ptr newValue(double const& value); ModelNode::Ptr newValue(std::string_view const& value); + ModelNode::Ptr newValue(simfil::ByteArray const& value); ModelNode::Ptr newValue(StringId handle); - /** Node-type-specific resolve-functions */ - [[nodiscard]] - model_ptr resolveObject(ModelNode::Ptr const& n) const; - [[nodiscard]] - model_ptr resolveArray(ModelNode::Ptr const& n) const; - /** Access the field name storage */ [[nodiscard]] std::shared_ptr strings() const; diff --git a/include/simfil/model/nodes.h b/include/simfil/model/nodes.h index 8992e51..d475a1b 100644 --- a/include/simfil/model/nodes.h +++ b/include/simfil/model/nodes.h @@ -8,6 +8,7 @@ #include "arena.h" #include "string-pool.h" +#include "simfil/byte-array.h" #include "simfil/error.h" #include @@ -51,6 +52,7 @@ enum class ValueType Int, Float, String, + Bytes, TransientObject, Object, Array @@ -63,22 +65,23 @@ using ScalarValueType = std::variant< int64_t, double, std::string, - std::string_view>; + std::string_view, + ByteArray>; namespace detail { - // Passkey for ModelNode construction: ModelNode types take this in their constructors so only - // model_ptr (and ModelPool via a shared key instance) can default/in-place construct them. - // This avoids per-node friendship and keeps IDEs happy across library boundaries. - struct mp_key - { - mp_key() = delete; - private: - explicit mp_key(int) {} - template friend struct ::simfil::model_ptr; - friend class ::simfil::Model; - friend class ::simfil::ModelPool; - }; +// Passkey for ModelNode construction: ModelNode types take this in their constructors so only +// model_ptr (and ModelPool via a shared key instance) can default/in-place construct them. +// This avoids per-node friendship and keeps IDEs happy across library boundaries. +struct mp_key +{ + mp_key() = delete; +private: + explicit mp_key(int) {} + template friend struct ::simfil::model_ptr; + friend class ::simfil::Model; + friend class ::simfil::ModelPool; +}; } /** @@ -206,6 +209,25 @@ struct ModelNodeAddress } }; +namespace detail +{ +// Shared storage entry for object fields across all BaseObject instantiations. +// Keeps the underlying ArrayArena type identical regardless of ModelType. +struct ObjectField +{ + ObjectField() = default; + ObjectField(StringId name, ModelNodeAddress a) : name_(name), node_(a) {} + StringId name_ = StringPool::Empty; + ModelNodeAddress node_; + + template + void serialize(S& s) { + s.value2b(name_); + s.object(node_); + } +}; +} + /** Semantic view onto a particular node in a ModelPool. */ struct ModelNode { @@ -413,13 +435,15 @@ struct ValueNode final : public ModelNodeBase * a reference to a Model-derived pool type. * @tparam ModelType Model-derived type. */ -template +template struct MandatoryDerivedModelNodeBase : public ModelNodeBase { - inline ModelType& model() const {return *modelPtr();} // NOLINT + using ModelType = ModelTypeT; + + inline ModelTypeT& model() const {return *modelPtr();} // NOLINT protected: - template + template inline ModelType_* modelPtr() const { return static_cast(const_cast(model_.get())); } @@ -570,24 +594,9 @@ struct BaseObject : public MandatoryDerivedModelNodeBase protected: /** - * Object field - a name and a tree node address. - * These are stored in the ModelPools Field array arena. + * Object fields are stored in the model's shared object-field arena. */ - struct Field - { - Field() = default; - Field(StringId name, ModelNodeAddress a) : name_(name), node_(a) {} - StringId name_ = StringPool::Empty; - ModelNodeAddress node_; - - template - void serialize(S& s) { - s.value2b(name_); - s.object(node_); - } - }; - - using Storage = ArrayArena; + using Storage = ArrayArena; using ModelNode::model_; using MandatoryDerivedModelNodeBase::model; @@ -637,6 +646,8 @@ template(*this)); diff --git a/include/simfil/operator.h b/include/simfil/operator.h index 6e874cd..98febb0 100644 --- a/include/simfil/operator.h +++ b/include/simfil/operator.h @@ -8,6 +8,7 @@ #include "fmt/format.h" #include +#include #include #include #include @@ -129,6 +130,11 @@ struct OperatorLen return static_cast(s.size()); } + auto operator()(const ByteArray& s) const + { + return static_cast(s.bytes.size()); + } + auto operator()(const ModelNode& n) const { return static_cast(n.size()); @@ -171,6 +177,12 @@ struct OperatorTypeof return n; } + auto operator()(const ByteArray&) const -> std::string_view + { + static auto n = "bytes"sv; + return n; + } + auto operator()(const ModelNode&) const -> std::string_view { static auto n = "model"sv; @@ -211,6 +223,13 @@ struct OperatorAsInt return (int64_t)0; } + auto operator()(const ByteArray& v) const + { + if (auto decoded = v.decodeBigEndianI64()) + return *decoded; + return (int64_t)0; + } + auto operator()(const ModelNode&) const { return (int64_t)0; @@ -270,6 +289,11 @@ struct OperatorAsString return v; } + auto operator()(const ByteArray& v) const -> std::string + { + return v.bytes; + } + auto operator()(const ModelNode& v) const { return ""s; @@ -290,6 +314,50 @@ struct OperatorAsString NULL_AS("null"s); }; +struct OperatorAsBytes +{ + NAME("bytes") + DENY_OTHER() + + auto operator()(const ByteArray& v) const -> ByteArray + { + return v; + } + + auto operator()(bool v) const -> ByteArray + { + return ByteArray{std::string(1, static_cast(v ? 1 : 0))}; + } + + auto operator()(int64_t v) const -> ByteArray + { + auto raw = static_cast(v); + auto bytes = std::string(8, '\0'); + for (size_t i = 0; i < bytes.size(); ++i) { + bytes[bytes.size() - i - 1] = static_cast((raw >> (i * 8)) & 0xFFu); + } + return ByteArray{std::move(bytes)}; + } + + auto operator()(const std::string& v) const -> ByteArray + { + return ByteArray{v}; + } + + auto operator()(const ModelNode&) const + { + return ByteArray{}; + } + + auto operator()(const TransientObject&) const + { + // Handled by MetaType::unaryOp + return ByteArray{}; + } + + NULL_AS(ByteArray{"null"}); +}; + #undef DENY_OTHER #undef NULL_AS #undef NULL_AS_NULL @@ -475,6 +543,39 @@ struct OperatorEq DECL_OPERATION(double, double, ==) DECL_OPERATION(const std::string&, const std::string&, ==) + auto operator()(const ByteArray& l, const ByteArray& r) const + { + return l.bytes == r.bytes; + } + + auto operator()(const ByteArray& l, int64_t r) const + { + if (auto decoded = l.decodeBigEndianI64()) + return *decoded == r; + return false; + } + + auto operator()(int64_t l, const ByteArray& r) const + { + if (auto decoded = r.decodeBigEndianI64()) + return l == *decoded; + return false; + } + + auto operator()(const ByteArray& l, double r) const + { + if (auto decoded = l.decodeBigEndianI64()) + return static_cast(*decoded) == r; + return false; + } + + auto operator()(double l, const ByteArray& r) const + { + if (auto decoded = r.decodeBigEndianI64()) + return l == static_cast(*decoded); + return false; + } + auto operator()(NullType, NullType) const { return true; @@ -502,6 +603,42 @@ struct OperatorLt DECL_OPERATION(double, double, <) DECL_OPERATION(const std::string&, const std::string&, <) + auto operator()(const ByteArray& l, const ByteArray& r) const + { + return std::lexicographical_compare( + l.bytes.begin(), l.bytes.end(), + r.bytes.begin(), r.bytes.end(), + [](unsigned char a, unsigned char b) { return a < b; }); + } + + auto operator()(const ByteArray& l, int64_t r) const + { + if (auto decoded = l.decodeBigEndianI64()) + return *decoded < r; + return false; + } + + auto operator()(int64_t l, const ByteArray& r) const + { + if (auto decoded = r.decodeBigEndianI64()) + return l < *decoded; + return false; + } + + auto operator()(const ByteArray& l, double r) const + { + if (auto decoded = l.decodeBigEndianI64()) + return static_cast(*decoded) < r; + return false; + } + + auto operator()(double l, const ByteArray& r) const + { + if (auto decoded = r.decodeBigEndianI64()) + return l < static_cast(*decoded); + return false; + } + auto operator()(NullType, NullType) const { return false; @@ -518,6 +655,31 @@ struct OperatorLtEq DECL_OPERATION(double, double, <=) DECL_OPERATION(const std::string&, const std::string&, <=) + auto operator()(const ByteArray& l, const ByteArray& r) const + { + return OperatorLt()(l, r) || OperatorEq()(l, r); + } + + auto operator()(const ByteArray& l, int64_t r) const + { + return OperatorLt()(l, r) || OperatorEq()(l, r); + } + + auto operator()(int64_t l, const ByteArray& r) const + { + return OperatorLt()(l, r) || OperatorEq()(l, r); + } + + auto operator()(const ByteArray& l, double r) const + { + return OperatorLt()(l, r) || OperatorEq()(l, r); + } + + auto operator()(double l, const ByteArray& r) const + { + return OperatorLt()(l, r) || OperatorEq()(l, r); + } + auto operator()(NullType, NullType) const { return true; diff --git a/include/simfil/simfil.h b/include/simfil/simfil.h index 17c02d5..1481844 100644 --- a/include/simfil/simfil.h +++ b/include/simfil/simfil.h @@ -26,7 +26,7 @@ struct ModelNode; * Param: * any If true, wrap expression with call to `any(...)`. * Param: - * autoWildcard If true, expand constant expressions to `** = `. + * autoWildcard If true, expand constant expressions to `** == `. */ auto compile(Environment& env, std::string_view query, bool any = true, bool autoWildcard = false) -> tl::expected; diff --git a/include/simfil/token.h b/include/simfil/token.h index 6851c87..cb9fdc3 100644 --- a/include/simfil/token.h +++ b/include/simfil/token.h @@ -10,6 +10,7 @@ #include #include "simfil/error.h" +#include "simfil/byte-array.h" namespace simfil { @@ -32,6 +33,7 @@ struct Token INT, // FLOAT, // STRING, // [r]"..." or [r]'...' + BYTES, // [b]"..." or [b]'...' REGEXP, // A string prefixed by re or RE WORD, // SELF, // _ @@ -73,6 +75,7 @@ struct Token std::variant< std::monostate, std::string, + ByteArray, int64_t, double > value; diff --git a/include/simfil/value.h b/include/simfil/value.h index 5c871c3..3805a75 100644 --- a/include/simfil/value.h +++ b/include/simfil/value.h @@ -7,6 +7,7 @@ #include #include "model/nodes.h" +#include "simfil/byte-array.h" #include "transient.h" namespace simfil @@ -58,6 +59,11 @@ struct ValueToString return v; } + auto operator()(const ByteArray& v) const + { + return v.toLiteral(); + } + auto operator()(const TransientObject&) const { return ""s; @@ -84,6 +90,7 @@ inline auto valueType2String(ValueType t) -> const char* case ValueType::Int: return "int"; case ValueType::Float: return "float"; case ValueType::String: return "string"; + case ValueType::Bytes: return "bytes"; case ValueType::TransientObject: return "transient"; case ValueType::Object: return "object"; case ValueType::Array: return "array"; @@ -97,7 +104,7 @@ inline auto valueType2String(ValueType t) -> const char* */ struct TypeFlags { - std::bitset<9> flags; + std::bitset<10> flags; auto test(ValueType type) const { @@ -158,6 +165,11 @@ struct ValueType4CType { static constexpr ValueType Type = ValueType::String; }; +template <> +struct ValueType4CType { + static constexpr ValueType Type = ValueType::Bytes; +}; + template <> struct ValueType4CType { static constexpr ValueType Type = ValueType::TransientObject; @@ -201,6 +213,11 @@ struct ValueTypeInfo { using Type = std::string; }; +template <> +struct ValueTypeInfo { + using Type = ByteArray; +}; + template <> struct ValueTypeInfo { using Type = TransientObject; @@ -240,6 +257,16 @@ struct ValueAs } }; +template <> +struct ValueAs +{ + template + static inline auto get(const VariantType& v) noexcept -> decltype(auto) + { + return std::get(v); + } +}; + template <> struct ValueAs { @@ -404,6 +431,8 @@ class Value return fn(this->template as()); case ValueType::String: return fn(this->template as()); + case ValueType::Bytes: + return fn(this->template as()); case ValueType::TransientObject: return fn(this->template as()); case ValueType::Object: @@ -438,6 +467,7 @@ class Value void operator() (double const& v) {result = v;} void operator() (std::string const& v) {result = v;} void operator() (std::string_view const& v) {result = v;} + void operator() (ByteArray const& v) {result = v;} void operator() (TransientObject const&) {} void operator() (ModelNode::Ptr const&) {} ScalarValueType result; @@ -471,6 +501,7 @@ class Value double, std::string, std::string_view, + ByteArray, TransientObject, ModelNode::Ptr> value; }; diff --git a/src/model/json.cpp b/src/model/json.cpp index c140bc7..1cbe693 100644 --- a/src/model/json.cpp +++ b/src/model/json.cpp @@ -34,6 +34,18 @@ static auto build(const json& j, ModelPool & model) -> tl::expectedis_boolean() && it->get()) { + auto hex = j.find("hex"); + if (hex == j.end() || !hex->is_string()) + return tl::unexpected(Error::ParserError, "Invalid tagged bytes object: expected string field 'hex'"); + + auto decoded = ByteArray::fromHex(hex->get()); + if (!decoded) + return tl::unexpected(Error::ParserError, "Invalid tagged bytes object: hex decode failed"); + + return model.newValue(std::move(*decoded)); + } + auto object = model.newObject(j.size()); for (auto&& [key, value] : j.items()) { auto child = build(value, model); diff --git a/src/model/model.cpp b/src/model/model.cpp index 4e23392..294c8bc 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -2,6 +2,7 @@ #include "simfil/model/arena.h" #include "simfil/model/bitsery-traits.h" #include "simfil/model/nodes.h" +#include "simfil/byte-array.h" #include #include @@ -86,6 +87,7 @@ struct ModelPool::Impl std::string stringData_; sfl::segmented_vector strings_; + sfl::segmented_vector byteArrays_; Object::Storage objectMemberArrays_; Array::Storage arrayMemberArrays_; @@ -101,6 +103,7 @@ struct ModelPool::Impl s.container(columns_.double_, maxColumnSize); s.text1b(columns_.stringData_, maxColumnSize); s.container(columns_.strings_, maxColumnSize); + s.container(columns_.byteArrays_, maxColumnSize); s.ext(columns_.objectMemberArrays_, bitsery::ext::ArrayArenaExt{}); s.ext(columns_.arrayMemberArrays_, bitsery::ext::ArrayArenaExt{}); @@ -242,6 +245,7 @@ void ModelPool::clear() clear_and_shrink(columns.double_); clear_and_shrink(columns.strings_); clear_and_shrink(columns.stringData_); + clear_and_shrink(columns.byteArrays_); clear_and_shrink(columns.objectMemberArrays_); clear_and_shrink(columns.arrayMemberArrays_); } @@ -293,6 +297,15 @@ tl::expected ModelPool::resolve(ModelNode const& n, ResolveFn const mpKey_)); break; } + case ByteArray: { + auto idx = n.addr().index(); + if (auto err = checkBounds(impl_->columns_.byteArrays_)) + return tl::unexpected(*err); + auto& val = impl_->columns_.byteArrays_[idx]; + auto view = std::string_view(impl_->columns_.stringData_).substr(val.offset_, val.length_); + cb(ValueNode(simfil::ByteArray{view}, shared_from_this(), mpKey_)); + break; + } case PooledString: { auto str = lookupStringId(static_cast(n.addr().index())); cb(ValueNode(str.value_or(std::string_view{}), shared_from_this(), mpKey_)); @@ -388,23 +401,39 @@ ModelNode::Ptr ModelPool::newValue(std::string_view const& value) ModelNodeAddress{String, (uint32_t)impl_->columns_.strings_.size()-1}); } +ModelNode::Ptr ModelPool::newValue(simfil::ByteArray const& value) +{ + impl_->columns_.byteArrays_.emplace_back(Impl::StringRange{ + (uint32_t)impl_->columns_.stringData_.size(), + (uint32_t)value.bytes.size() + }); + impl_->columns_.stringData_.append(value.bytes.data(), value.bytes.size()); + return ModelNode::Ptr::make( + shared_from_this(), + ModelNodeAddress{ByteArray, (uint32_t)impl_->columns_.byteArrays_.size()-1}); +} + ModelNode::Ptr ModelPool::newValue(StringId handle) { return ModelNode::Ptr::make( shared_from_this(), ModelNodeAddress{PooledString, static_cast(handle)}); } -model_ptr ModelPool::resolveObject(const ModelNode::Ptr& n) const { - if (n->addr_.column() != Objects) +// Core ADL resolve hooks for base Object/Array nodes. +template<> +model_ptr resolveInternal(res::tag, ModelPool const& model, ModelNode const& node) +{ + if (node.addr().column() != ModelPool::Objects) raise("Cannot cast this node to an object."); - return model_ptr::make(shared_from_this(), n->addr_); + return model_ptr::make(model.shared_from_this(), node.addr()); } -model_ptr ModelPool::resolveArray(ModelNode::Ptr const& n) const +template<> +model_ptr resolveInternal(res::tag, ModelPool const& model, ModelNode const& node) { - if (n->addr_.column() != Arrays) + if (node.addr().column() != ModelPool::Arrays) raise("Cannot cast this node to an array."); - return model_ptr::make(shared_from_this(), n->addr_); + return model_ptr::make(model.shared_from_this(), node.addr()); } std::shared_ptr ModelPool::strings() const @@ -451,6 +480,8 @@ ModelPool::SerializationSizeStats ModelPool::serializationSizeStats() const [&](auto& s) { s.text1b(impl_->columns_.stringData_, maxColumnSize); }); stats.stringRangeBytes = measureBytes( [&](auto& s) { s.container(impl_->columns_.strings_, maxColumnSize); }); + stats.stringRangeBytes += measureBytes( + [&](auto& s) { s.container(impl_->columns_.byteArrays_, maxColumnSize); }); stats.objectMemberBytes = measureBytes( [&](auto& s) { s.ext(impl_->columns_.objectMemberArrays_, bitsery::ext::ArrayArenaExt{}); }); stats.arrayMemberBytes = measureBytes( diff --git a/src/model/nodes.cpp b/src/model/nodes.cpp index 663c5e2..affd11c 100644 --- a/src/model/nodes.cpp +++ b/src/model/nodes.cpp @@ -124,7 +124,16 @@ nlohmann::json ModelNode::toJson() const [&j](auto&& v) { using T = decltype(v); - if constexpr (!std::is_same_v, std::monostate>) { + if constexpr (std::is_same_v, ByteArray>) { + auto bytes = nlohmann::json::object(); + bytes["_bytes"] = true; + if (auto decoded = v.decodeBigEndianI64()) + bytes["number"] = *decoded; + else + bytes["number"] = nullptr; + bytes["hex"] = v.toHex(false); + j = std::move(bytes); + } else if constexpr (!std::is_same_v, std::monostate>) { j = std::forward(v); } else { j = nullptr; diff --git a/src/simfil.cpp b/src/simfil.cpp index 6f69da5..5da371d 100644 --- a/src/simfil.cpp +++ b/src/simfil.cpp @@ -44,6 +44,7 @@ static constexpr std::string_view TypenameBool("bool"); static constexpr std::string_view TypenameInt("int"); static constexpr std::string_view TypenameFloat("float"); static constexpr std::string_view TypenameString("string"); +static constexpr std::string_view TypenameBytes("bytes"); } /** @@ -243,6 +244,8 @@ class CastParser : public InfixParselet return std::make_unique>(std::move(left)); if (name == strings::TypenameString) return std::make_unique>(std::move(left)); + if (name == strings::TypenameBytes) + return std::make_unique>(std::move(left)); return unexpected(Error::InvalidType, fmt::format("Invalid type name for cast '{}'", name)); }()); @@ -680,6 +683,7 @@ namespace const ScalarParser intParser; const ScalarParser floatParser; const ScalarParser stringParser; +const ScalarParser bytesParser; const RegExpParser regexpParser; const UnaryOpParser negateParser; const UnaryOpParser bitInvParser; @@ -726,6 +730,7 @@ static auto setupParser(Parser& p) p.prefixParsers[Token::INT] = &intParser; p.prefixParsers[Token::FLOAT] = &floatParser; p.prefixParsers[Token::STRING] = &stringParser; + p.prefixParsers[Token::BYTES] = &bytesParser; p.prefixParsers[Token::REGEXP] = ®expParser; /* Unary Operators */ diff --git a/src/token.cpp b/src/token.cpp index 094a99d..d6ad1f1 100644 --- a/src/token.cpp +++ b/src/token.cpp @@ -41,6 +41,8 @@ auto Token::toString() const -> std::string return std::to_string(std::get(value)); case Token::STRING: return "'"s + std::get(value) + "'"s; + case Token::BYTES: + return std::get(value).toLiteral(); case Token::REGEXP: return "re'"s + std::get(value) + "'"s; case Token::WORD: @@ -97,6 +99,7 @@ auto Token::toString(Type t) -> std::string case Token::INT: return ""; case Token::FLOAT: return ""; case Token::STRING: return ""; + case Token::BYTES: return ""; case Token::REGEXP: return ""; case Token::WORD: return ""; }; @@ -252,6 +255,11 @@ std::optional scanStringLiteral(Scanner& s) s.match("r'") || s.match("R'") || s.match("r\"") || s.match("R\""); + // Test for byte strings + const auto bytes = + s.match("b'") || s.match("B'") || + s.match("b\"") || s.match("B\""); + // Test for regexp const auto regexp = s.match("re'") || s.match("RE'") || @@ -259,6 +267,8 @@ std::optional scanStringLiteral(Scanner& s) if (raw) s.skip(1); + else if (bytes) + s.skip(1); else if (regexp) s.skip(2); @@ -284,6 +294,19 @@ std::optional scanStringLiteral(Scanner& s) else text.push_back('\\'); } else { + if (bytes && (s.at(0) == 'x' || s.at(0) == 'X')) { + const auto upper = ByteArray::decodeHexNibble(s.at(1)); + const auto lower = ByteArray::decodeHexNibble(s.at(2)); + if (upper < 0 || lower < 0) { + s.fail("Invalid hex escape sequence"); + return {}; + } + + text.push_back(static_cast((upper << 4) | lower)); + s.skip(3); + continue; + } + switch (s.at(0)) { case 'n': text.push_back('\n'); break; case 'r': text.push_back('\r'); break; @@ -309,6 +332,8 @@ std::optional scanStringLiteral(Scanner& s) if (regexp) return Token(Token::REGEXP, text, begin, s.pos()); + if (bytes) + return Token(Token::BYTES, ByteArray{text}, begin, s.pos()); return Token(Token::STRING, text, begin, s.pos()); } @@ -475,6 +500,8 @@ auto tokenize(std::string_view expr) -> expected, Error> else if (auto t = scanSyntax(s)) tokens.push_back(std::move(*t)); else { + if (s.hasError() && s.error().message.rfind("Invalid hex escape sequence", 0) == 0) + return unexpected(std::move(s.error())); if (s.at(0) != '\0') return unexpected(s.fail("Invalid input")); } diff --git a/test/complex.cpp b/test/complex.cpp index bffd18b..5901df9 100644 --- a/test/complex.cpp +++ b/test/complex.cpp @@ -102,6 +102,27 @@ TEST_CASE("Multimap JSON", "[multimap.serialization]") { REQUIRE(model->toJson() == nlohmann::json::parse(R"([{"a":[1],"b":[1,2,3],"c":[[1],2],"_multimap":true}])")); } +TEST_CASE("Tagged bytes JSON", "[bytes.serialization]") { + auto model = std::make_shared(); + auto root = model->newObject(1); + model->addRoot(root); + root->addField("raw", model->newValue(ByteArray{"A normal string"})); + + auto expected = nlohmann::json::parse( + R"([{"raw":{"_bytes":true,"number":null,"hex":"41206e6f726d616c20737472696e67"}}])"); + REQUIRE(model->toJson() == expected); + + auto roundTrip = json::parse(model->toJson().dump()); + REQUIRE(roundTrip); + auto roundTripRoot = roundTrip.value()->root(0); + REQUIRE(roundTripRoot); + REQUIRE(roundTripRoot.value()->toJson() == expected); + + auto invalidHex = json::parse(R"([{"raw":{"_bytes":true,"hex":"abc"}}])"); + REQUIRE_FALSE(invalidHex); + REQUIRE(invalidHex.error().message == "Invalid tagged bytes object: hex decode failed"); +} + TEST_CASE("Serialization", "[complex.serialization]") { auto model = json::parse(invoice); REQUIRE(model); diff --git a/test/operator.cpp b/test/operator.cpp index 7e78ed0..dfb5b7f 100644 --- a/test/operator.cpp +++ b/test/operator.cpp @@ -95,6 +95,7 @@ TEST_CASE("Unary operators", "[operator.unary]") { REQUIRE(op(int64_t(42)) == "int"); REQUIRE(op(3.14) == "float"); REQUIRE(op("hello"s) == "string"); + REQUIRE(op(ByteArray{"ff"}) == "bytes"); } } @@ -129,6 +130,30 @@ TEST_CASE("Type conversion operators", "[operator.conversion]") { REQUIRE(op(""s) == 0.0); REQUIRE(op(NullType{}) == 0.0); } + + SECTION("OperatorAsString") { + OperatorAsString op; + REQUIRE(op(ByteArray{"89899"}) == "89899"); + } + + SECTION("OperatorAsBytes") { + OperatorAsBytes op; + REQUIRE(op("A normal string"s).bytes == "A normal string"); + REQUIRE(op(ByteArray{"ff"}).bytes == "ff"); + REQUIRE(op(true).bytes == std::string(1, char(1))); + REQUIRE(op(false).bytes == std::string(1, char(0))); + + auto intBytes = op(int64_t(0xff)); + REQUIRE(intBytes.bytes.size() == 8); + REQUIRE((unsigned char)intBytes.bytes.back() == 0xff); + REQUIRE(intBytes.decodeBigEndianI64().value_or(0) == int64_t(0xff)); + + auto negBytes = op(int64_t(-1)); + REQUIRE(negBytes.bytes.size() == 8); + REQUIRE(negBytes.decodeBigEndianI64().value_or(0) == int64_t(-1)); + + REQUIRE_INVALID_OPERANDS(op(3.14)); + } } TEST_CASE("Binary arithmetic operators", "[operator.binary.arithmetic]") { @@ -220,5 +245,12 @@ TEST_CASE("Binary comparison operators", "[operator.binary.comparison]") { REQUIRE(op(int64_t(5), 5.0) == true); REQUIRE(op(5.0, int64_t(5)) == true); REQUIRE(op(int64_t(5), 5.1) == false); + REQUIRE(op(ByteArray{"89899"}, "normal-string"s) == false); + } + + SECTION("OperatorGt") { + OperatorGt op; + REQUIRE(op(ByteArray{"89899"}, int64_t(5)) == true); + REQUIRE(op(ByteArray{"89899"}, "normal-string"s) == false); } } diff --git a/test/simfil.cpp b/test/simfil.cpp index 21c07fe..66db4e0 100644 --- a/test/simfil.cpp +++ b/test/simfil.cpp @@ -116,6 +116,8 @@ TEST_CASE("OperatorConst", "[ast.operator]") { REQUIRE_AST("'a'<='b'", "true"); REQUIRE_AST("'b'>'a'", "true"); REQUIRE_AST("'b'>='b'", "true"); + REQUIRE_AST("b\"89899\" > 5", "true"); + REQUIRE_AST("b\"89899\" > \"normal-string\"", "false"); /* Null behaviour */ REQUIRE_AST("1 Type static auto asInt(const std::string_view input) {return getFirst(input, Token::Type::INT);} static auto asFloat(const std::string_view input) {return getFirst(input, Token::Type::FLOAT);} static auto asStr(const std::string_view input) {return getFirst(input, Token::Type::STRING);} +static auto asBytes(const std::string_view input) {return getFirst(input, Token::Type::BYTES);} static auto asRegexp(const std::string_view input) {return getFirst(input, Token::Type::REGEXP);} static auto asWord(const std::string_view input) {return getFirst(input, Token::Type::WORD);} static auto asError(const std::string_view input) { @@ -104,6 +105,24 @@ TEST_CASE("Tokenize strings", "[token.string]") { REQUIRE(asRegexp("RE''") == ""); REQUIRE(asRegexp("re'\"'") == "\""); + /* b'...' */ + REQUIRE(asBytes("b''").bytes == ""); + REQUIRE(asBytes("B''").bytes == ""); + REQUIRE(asBytes("b'abc'").bytes == "abc"); + REQUIRE(asBytes("b'\\'abc\\''").bytes == "'abc'"); + REQUIRE(asBytes("b'\\x41'").bytes == "A"); + + /* b"..." */ + REQUIRE(asBytes("b\"\"").bytes == ""); + REQUIRE(asBytes("B\"\"").bytes == ""); + REQUIRE(asBytes("b\"abc\"").bytes == "abc"); + REQUIRE(asBytes("b\"\\\"abc\\\"\"").bytes == "\"abc\""); + REQUIRE(asBytes("b\"\\x41\"").bytes == "A"); + REQUIRE(asBytes("b\"A\\x00\\xFF\\\"\\\\\"").bytes == std::string{"A\0\xFF\"\\", 5}); + REQUIRE(asError("b\"\\x\"").starts_with("Invalid hex escape sequence")); + REQUIRE(asError("b\"\\x0\"").starts_with("Invalid hex escape sequence")); + REQUIRE(asError("b\"\\xGG\"").starts_with("Invalid hex escape sequence")); + /* Quote mismatch */ REQUIRE(asError("'abc") == "Quote mismatch at 4"); REQUIRE(asError("abc'") == "Quote mismatch at 4"); @@ -160,7 +179,7 @@ TEST_CASE("Token location", "[token.location]") { } TEST_CASE("Token to string", "[token.to-string]") { - auto tokens = tokenize("1 1.5 'Æthervial' re'.* Familiar' abc ()[]{},:._ * ** " + auto tokens = tokenize("1 1.5 'Æthervial' b'beef' re'.* Familiar' abc ()[]{},:._ * ** " "null true false + - * / % << >> & | ^ ~ not and or " "== != < <= > >= ? # typeof as ..."); REQUIRE(tokens); @@ -171,6 +190,7 @@ TEST_CASE("Token to string", "[token.to-string]") { REQUIRE_STR("1"); REQUIRE_STR("1.500000"); REQUIRE_STR("'Æthervial'"); + REQUIRE_STR("b\"beef\""); REQUIRE_STR("re'.* Familiar'"); REQUIRE_STR("abc"); REQUIRE_STR("("); REQUIRE_STR(")"); @@ -189,3 +209,11 @@ TEST_CASE("Token to string", "[token.to-string]") { REQUIRE_STR("?"); REQUIRE_STR("#"); REQUIRE_STR("typeof"); REQUIRE_STR("as"); REQUIRE_STR("..."); } + +TEST_CASE("Byte token roundtrip", "[token.bytes-roundtrip]") { + auto bytes = ByteArray{std::string{"A\0\xFF\"\\\n\t", 7}}; + Token token(Token::BYTES, bytes, 0, 0); + + auto roundTripped = asBytes(token.toString()); + REQUIRE(roundTripped == bytes); +} diff --git a/test/value.cpp b/test/value.cpp index bc4b088..c01640a 100644 --- a/test/value.cpp +++ b/test/value.cpp @@ -4,6 +4,7 @@ #include "simfil/value.h" #include "simfil/model/model.h" +#include "simfil/token.h" #include "simfil/transient.h" using namespace simfil; @@ -76,6 +77,12 @@ TEST_CASE("Value Constructors", "[value.value-constructor]") { REQUIRE(val.as() == "world"); } + SECTION("Make ByteArray") { + auto val = Value::make(ByteArray{"bytes"}); + REQUIRE(val.isa(ValueType::Bytes)); + REQUIRE(val.as().bytes == "bytes"); + } + SECTION("Type constructor") { Value val(ValueType::Null); REQUIRE(val.isa(ValueType::Null)); @@ -184,6 +191,11 @@ TEST_CASE("Value As", "[value.as]") { auto ptr = val.as(); REQUIRE(!!ptr); } + + SECTION("as()") { + auto val = Value::make(ByteArray{"abc"}); + REQUIRE(val.as().bytes == "abc"); + } SECTION("as()") { auto model = std::make_shared(); @@ -308,6 +320,14 @@ TEST_CASE("Value toString() method", "[value.toString]") { REQUIRE(Value::make(int64_t(-123)).toString() == "-123"); REQUIRE(Value::make(double(3.14)).toString().find("3.14") == 0); REQUIRE(Value::make("Ponder"s).toString() == "Ponder"); + REQUIRE(Value::make(ByteArray{"A normal string"}).toString() == "b\"A normal string\""); + + auto bytes = ByteArray{std::string{"A\0\xFF\"\\", 5}}; + auto repr = Value::make(bytes).toString(); + auto tokens = tokenize(repr); + REQUIRE(tokens); + REQUIRE(tokens->at(0).type == Token::BYTES); + REQUIRE(std::get(tokens->at(0).value) == bytes); } TEST_CASE("Value utility methods", "[value.utilities]") { @@ -360,6 +380,7 @@ TEST_CASE("valueType2String() function", "[value.type2string]") { REQUIRE(valueType2String(ValueType::Int) == "int"s); REQUIRE(valueType2String(ValueType::Float) == "float"s); REQUIRE(valueType2String(ValueType::String) == "string"s); + REQUIRE(valueType2String(ValueType::Bytes) == "bytes"s); REQUIRE(valueType2String(ValueType::TransientObject) == "transient"s); REQUIRE(valueType2String(ValueType::Object) == "object"s); REQUIRE(valueType2String(ValueType::Array) == "array"s);