/**
 * @brief Tag action for tagged automaton transitions
 *
 * Represents one capture operation attached to an edge: opening or closing
 * the capture group identified by @c group_id.
 */
struct TagAction {
    enum class Type : uint8_t {
        OPEN_GROUP,  // Start capturing group_id
        CLOSE_GROUP  // End capturing group_id
    };

    Type type;        // Which capture operation to perform
    size_t group_id;  // Capture group the operation applies to

    /**
     * @param t  Operation kind
     * @param id Capture group the operation applies to
     */
    TagAction(Type t, size_t id) : type(t), group_id(id) {}

    /// @return An OPEN_GROUP action for @p group_id
    static TagAction open(size_t group_id) { return TagAction(Type::OPEN_GROUP, group_id); }

    /// @return A CLOSE_GROUP action for @p group_id
    static TagAction close(size_t group_id) { return TagAction(Type::CLOSE_GROUP, group_id); }

    /// @return true when this action opens a group
    bool is_open() const { return type == Type::OPEN_GROUP; }

    /// @return true when this action closes a group
    bool is_close() const { return type == Type::CLOSE_GROUP; }
};

/**
 * @brief Capture slots for group position tracking
 *
 * Stores one start and one end position per group id in two vectors indexed
 * by group id (instead of a map). The vectors grow on demand when a group
 * beyond the current capacity is written.
 *
 * Every mutating call returns the value it overwrote so that the caller can
 * roll the change back with undo_open() / undo_close() instead of copying the
 * whole structure. Only changes the caller records and undoes are rolled back.
 *
 * NOTE(review): open_group() writes only the start slot. Re-opening a group
 * that was already closed leaves the earlier end position in place, so
 * is_group_complete() / to_map() pair the new start with the older end until
 * the group is closed again. Confirm this is intended for repeated groups.
 */
struct CaptureSlots {
    /// Sentinel stored in a slot that has not been written
    static constexpr size_t UNSET = static_cast<size_t>(-1);

    std::vector<size_t> start_positions;  // group_id -> start position (UNSET if not started)
    std::vector<size_t> end_positions;    // group_id -> end position (UNSET if not ended)

    CaptureSlots() = default;

    /**
     * @brief Grows both vectors so that @p group_id is a valid index
     *
     * New slots are filled with UNSET. Each vector is checked on its own so
     * the two cannot end up with different usable ranges.
     */
    void ensure_capacity(size_t group_id) {
        if (start_positions.size() <= group_id) {
            start_positions.resize(group_id + 1, UNSET);
        }
        if (end_positions.size() <= group_id) {
            end_positions.resize(group_id + 1, UNSET);
        }
    }

    /**
     * @brief Records @p position as the start of @p group_id
     * @return The previous start value (UNSET if none), to be passed to undo_open()
     */
    size_t open_group(size_t group_id, size_t position) {
        ensure_capacity(group_id);
        const size_t previous = start_positions[group_id];
        start_positions[group_id] = position;
        return previous;
    }

    /**
     * @brief Records @p position as the end of @p group_id
     * @return The previous end value (UNSET if none), to be passed to undo_close()
     */
    size_t close_group(size_t group_id, size_t position) {
        ensure_capacity(group_id);
        const size_t previous = end_positions[group_id];
        end_positions[group_id] = position;
        return previous;
    }

    /**
     * @brief Restores the start slot of @p group_id to @p prev_value
     *
     * A group id beyond the current capacity was never written, so there is
     * nothing to restore and the call is a no-op rather than an
     * out-of-bounds write.
     */
    void undo_open(size_t group_id, size_t prev_value) {
        if (group_id < start_positions.size()) {
            start_positions[group_id] = prev_value;
        }
    }

    /**
     * @brief Restores the end slot of @p group_id to @p prev_value
     *
     * No-op for a group id beyond the current capacity (see undo_open()).
     */
    void undo_close(size_t group_id, size_t prev_value) {
        if (group_id < end_positions.size()) {
            end_positions[group_id] = prev_value;
        }
    }

    /// @return true when both the start and the end slot of @p group_id are set
    bool is_group_complete(size_t group_id) const {
        return group_id < start_positions.size() && group_id < end_positions.size() &&
               start_positions[group_id] != UNSET && end_positions[group_id] != UNSET;
    }

    /**
     * @brief Converts the slots into a map
     * @return group_id -> (start, end) for every group whose start and end are both set
     */
    std::map<size_t, std::pair<size_t, size_t>> to_map() const {
        std::map<size_t, std::pair<size_t, size_t>> result;
        for (size_t id = 0; id < start_positions.size(); ++id) {
            if (is_group_complete(id)) {
                result[id] = {start_positions[id], end_positions[id]};
            }
        }
        return result;
    }
};

/**
 * @brief Simulation state for tagged NFA traversal
 *
 * Bundles the set of active regex paths with the capture slots kept for each
 * of them. Copy and move operations are the compiler-generated ones.
 *
 * NOTE(review): the template parameter list was not legible in the reviewed
 * text; a single RegexData parameter is assumed from the member types.
 */
template <typename RegexData>
struct SimulationState {
    std::vector<RegexData> active_paths;         // Currently active regex paths
    std::map<RegexData, CaptureSlots> captures;  // Capture slots per regex
};
requirements for how many times should the edge be repeated. - std::map> group_markers; // group boundaries per regex path + std::map> tag_actions; // tag actions per regex path for capture tracking Node* to; EdgeInfo() = default; EdgeInfo(const EdgeInfo& info) { @@ -190,8 +264,8 @@ namespace { paths[x.first] = std::nullopt; } } - for (auto x : info.group_markers) { - group_markers[x.first] = x.second; + for (auto x : info.tag_actions) { + tag_actions[x.first] = x.second; } to = info.to; } @@ -335,14 +409,14 @@ namespace { std::optional limits = std::nullopt); /** - * @brief Adds a child node with group markers for capture tracking + * @brief Adds a child node with tag actions for capture tracking * * @param child Existing node * @param regex Regex data that is being used to identify the regex that the edge is part of - * @param markers Group markers to attach to this edge + * @param actions Tag actions to attach to this edge transition * @param limits Pointer to the shared limit of the edge (nullptr if no limit is applied) */ - void connect_with(Node* child, RegexData regex, const std::vector& markers, + void connect_with(Node* child, RegexData regex, const std::vector& actions, std::optional limits = std::nullopt); /** @@ -373,7 +447,7 @@ namespace { template void match_with_groups_helper(ConstIterator begin, ConstIterator end, size_t position, const std::vector& paths, const Node* prev, - std::map& capture_states, + std::map& capture_slots, std::vector>& results) const; #ifdef DEBUG @@ -418,8 +492,7 @@ namespace matcher { template static SubTree> process(std::vector*>, RegexData, ConstIterator&, ConstIterator, const bool, - size_t& group_counter, - std::vector& pending_markers); + size_t& group_counter, std::vector& pending_actions); public: /** diff --git a/include/matcher/impl/core.cpp b/include/matcher/impl/core.cpp index 6adce72..e93678c 100644 --- a/include/matcher/impl/core.cpp +++ b/include/matcher/impl/core.cpp @@ -142,11 +142,11 @@ namespace matcher { 
template SubTree> RegexMatcher::process( std::vector*> parents, RegexData regex, ConstIterator& it, ConstIterator end, - const bool inBrackets, size_t& group_counter, std::vector& pending_markers) { + const bool inBrackets, size_t& group_counter, std::vector& pending_actions) { SubTree> answer = {{}, {}}; std::vector>> nodeLayers = {{parents, parents}}; - // Save initial pending markers to restore on alternation - std::vector initial_markers = pending_markers; + // Save initial pending actions to restore on alternation + std::vector initial_actions = pending_actions; for (; it != end; it++) { if (*it == ')' && inBrackets) { break; @@ -156,19 +156,19 @@ namespace matcher { SubTree> newNodes = processSet(latest_parents.get_leafs(), regex, it); for (auto parent : latest_parents.get_leafs()) { for (auto newNode : newNodes.get_leafs()) { - parent->connect_with(newNode, regex, pending_markers); + parent->connect_with(newNode, regex, pending_actions); } } - pending_markers.clear(); + pending_actions.clear(); nodeLayers.push_back(newNodes); } else if (*it == '(') { // start of a regex in brackets (capture group) size_t current_group_id = group_counter++; - pending_markers.push_back(GroupMarker(current_group_id, true)); // group start + pending_actions.push_back(TagAction::open(current_group_id)); // OPEN_GROUP tag it++; SubTree> newLayer = process(nodeLayers.back().get_leafs(), regex, it, end, true, group_counter, - pending_markers); // leaves it at the closing bracket - pending_markers.push_back(GroupMarker(current_group_id, false)); // group end + pending_actions); // leaves it at the closing bracket + pending_actions.push_back(TagAction::close(current_group_id)); // CLOSE_GROUP tag nodeLayers.push_back(newLayer); } else if (*it == '|') { answer.roots.insert(answer.roots.end(), nodeLayers[1].get_leafs().begin(), @@ -176,8 +176,8 @@ namespace matcher { answer.leafs.insert(answer.leafs.end(), nodeLayers.back().get_leafs().begin(), nodeLayers.back().get_leafs().end()); 
nodeLayers.resize(1); - // Restore initial markers for the next alternative branch - pending_markers = initial_markers; + // Restore initial actions for the next alternative branch + pending_actions = initial_actions; } else if (*it == '{') { [[maybe_unused]] Limits* limits = processLimit(nodeLayers[nodeLayers.size() - 2], nodeLayers.back(), regex, it); @@ -207,9 +207,9 @@ namespace matcher { nextNode = new Node(sym); } for (auto parent : nodeLayers.back().get_leafs()) { - parent->connect_with(nextNode, regex, pending_markers); + parent->connect_with(nextNode, regex, pending_actions); } - pending_markers.clear(); + pending_actions.clear(); nodeLayers.push_back({{nextNode}, {nextNode}}); } } @@ -220,7 +220,7 @@ namespace matcher { Node* end_of_regex = new Node(symbol::EOR); SubTree> final_answer = {answer.get_roots(), {end_of_regex}}; for (auto parent : answer.leafs) { - parent->connect_with(end_of_regex, regex, pending_markers); + parent->connect_with(end_of_regex, regex, pending_actions); } return final_answer; } @@ -233,8 +233,8 @@ namespace matcher { void RegexMatcher::add_regex(Iterable str, RegexData uid) { auto it = std::cbegin(str); size_t group_counter = 0; - std::vector pending_markers; - process(std::vector{&root}, uid, it, std::cend(str), false, group_counter, pending_markers); + std::vector pending_actions; + process(std::vector{&root}, uid, it, std::cend(str), false, group_counter, pending_actions); } template diff --git a/include/matcher/impl/node.cpp b/include/matcher/impl/node.cpp index a0556b0..5d64a48 100644 --- a/include/matcher/impl/node.cpp +++ b/include/matcher/impl/node.cpp @@ -140,7 +140,7 @@ namespace { template void Node::connect_with(Node* child, RegexData regex, - const std::vector& markers, std::optional limit) { + const std::vector& actions, std::optional limit) { if (auto existing_child = neighbours.find(child->current_symbol); existing_child != neighbours.end()) { if (auto it = existing_child->second.paths.find(regex); it != 
existing_child->second.paths.end()) { if (!it->second.has_value() && limit == std::nullopt) { @@ -160,18 +160,18 @@ namespace { } else { neighbours[child->current_symbol].paths.emplace(regex, limit); } - // Append group markers for this regex path - if (!markers.empty()) { - auto& existing_markers = neighbours[child->current_symbol].group_markers[regex]; - existing_markers.insert(existing_markers.end(), markers.begin(), markers.end()); + // Append tag actions for this regex path + if (!actions.empty()) { + auto& existing_actions = neighbours[child->current_symbol].tag_actions[regex]; + existing_actions.insert(existing_actions.end(), actions.begin(), actions.end()); } return; } neighbours[child->current_symbol].paths.emplace(regex, limit); neighbours[child->current_symbol].to = child; - // Store group markers for this regex path - if (!markers.empty()) { - neighbours[child->current_symbol].group_markers[regex] = markers; + // Store tag actions for this regex path + if (!actions.empty()) { + neighbours[child->current_symbol].tag_actions[regex] = actions; } } @@ -279,8 +279,8 @@ namespace { std::vector> Node::match_with_groups(ConstIterator begin, ConstIterator end) const { std::vector> results; - std::map capture_states; - match_with_groups_helper(begin, end, 0, {}, nullptr, capture_states, results); + std::map capture_slots; + match_with_groups_helper(begin, end, 0, {}, nullptr, capture_slots, results); return results; } @@ -288,8 +288,7 @@ namespace { template void Node::match_with_groups_helper( ConstIterator begin, ConstIterator end, size_t position, const std::vector& paths, const Node* prev, - std::map& capture_states, - std::vector>& results) const { + std::map& capture_slots, std::vector>& results) const { if (begin == end) { // Check for end-of-regex marker if (auto it = this->neighbours.find(symbol::EOR); it != this->neighbours.end()) { @@ -317,24 +316,19 @@ namespace { } } if (to_include) { - // Process any group markers on the EOR edge - if (auto 
markers_it = it->second.group_markers.find(pathId); - markers_it != it->second.group_markers.end()) { - for (const auto& marker : markers_it->second) { - if (marker.is_start) { - capture_states[pathId].open_groups[marker.group_id] = position; + // Process any tag actions on the EOR edge + if (auto actions_it = it->second.tag_actions.find(pathId); + actions_it != it->second.tag_actions.end()) { + for (const auto& action : actions_it->second) { + if (action.is_open()) { + capture_slots[pathId].open_group(action.group_id, position); } else { - if (auto open_it = capture_states[pathId].open_groups.find(marker.group_id); - open_it != capture_states[pathId].open_groups.end()) { - capture_states[pathId].completed_groups[marker.group_id] = {open_it->second, - position}; - capture_states[pathId].open_groups.erase(open_it); - } + capture_slots[pathId].close_group(action.group_id, position); } } } - // Create result with captured groups - results.emplace_back(pathId, capture_states[pathId].completed_groups); + // Create result with captured groups using CaptureSlots::to_map() + results.emplace_back(pathId, capture_slots[pathId].to_map()); } } } @@ -375,27 +369,22 @@ namespace { } if (!new_paths.empty()) { - // Save capture states before recursion - std::map saved_states = capture_states; + // Track undo operations for efficient backtracking (avoid full map copy) + // Each entry: (pathId, group_id, is_open, prev_value) + std::vector> undo_stack; - // Process group markers for this edge - // Group markers are processed BEFORE consuming the current character - // - Group-start markers: record current position as start - // - Group-end markers: record current position as end (exclusive) + // Process tag actions for this edge transition + // Tag actions are executed BEFORE consuming the current character for (RegexData pathId : new_paths) { - if (auto markers_it = it->second.group_markers.find(pathId); - markers_it != it->second.group_markers.end()) { - for (const auto& marker : 
markers_it->second) { - if (marker.is_start) { - capture_states[pathId].open_groups[marker.group_id] = position; + if (auto actions_it = it->second.tag_actions.find(pathId); + actions_it != it->second.tag_actions.end()) { + for (const auto& action : actions_it->second) { + if (action.is_open()) { + size_t prev = capture_slots[pathId].open_group(action.group_id, position); + undo_stack.emplace_back(pathId, action.group_id, true, prev); } else { - if (auto open_it = capture_states[pathId].open_groups.find(marker.group_id); - open_it != capture_states[pathId].open_groups.end()) { - // End position is current position (before consuming this character) - capture_states[pathId].completed_groups[marker.group_id] = {open_it->second, - position}; - capture_states[pathId].open_groups.erase(open_it); - } + size_t prev = capture_slots[pathId].close_group(action.group_id, position); + undo_stack.emplace_back(pathId, action.group_id, false, prev); } } } @@ -408,10 +397,17 @@ namespace { } it->second.to->match_with_groups_helper(next_begin, end, next_position, new_paths, this, - capture_states, results); + capture_slots, results); - // Restore capture states after recursion - capture_states = saved_states; + // Undo capture slot changes (reverse order) + for (auto rit = undo_stack.rbegin(); rit != undo_stack.rend(); ++rit) { + const auto& [pathId, group_id, is_open, prev_value] = *rit; + if (is_open) { + capture_slots[pathId].undo_open(group_id, prev_value); + } else { + capture_slots[pathId].undo_close(group_id, prev_value); + } + } // Restore limits for (const auto& [pathId, old_limits] : current_paths) { diff --git a/tests/main.cpp b/tests/main.cpp index 825ebd2..a56cb93 100644 --- a/tests/main.cpp +++ b/tests/main.cpp @@ -149,42 +149,8 @@ TEST(RegexMatcherGroups, multiple_regexes_with_groups) { // Both regexes should match with their respective groups } -TEST(RegexMatcherGroups, performance_comparison) { - matcher::RegexMatcher root; - 
root.add_regex(std::string("d(abc|def)*g+"), 0); - root.add_regex(std::string("d(abc)*g+"), 1); - root.add_regex(std::string("(a)?"), 2); - - std::string test_str = "dabcabcg"; - const int iterations = 10000; - - // Warm up - for (int i = 0; i < 100; i++) { - root.match(test_str); - root.match_with_groups(test_str); - } - - auto t1 = high_resolution_clock::now(); - for (int i = 0; i < iterations; i++) { - auto r = root.match(test_str); - } - auto t2 = high_resolution_clock::now(); - for (int i = 0; i < iterations; i++) { - auto r = root.match_with_groups(test_str); - } - auto t3 = high_resolution_clock::now(); - - auto match_time = duration(t2 - t1).count() / iterations; - auto match_groups_time = duration(t3 - t2).count() / iterations; - - std::cout << "\n\tPerformance comparison (" << iterations << " iterations):\n"; - std::cout << "\t match(): " << match_time << " ns/call\n"; - std::cout << "\t match_with_groups(): " << match_groups_time << " ns/call\n"; - std::cout << "\t Overhead: " << (match_groups_time / match_time - 1) * 100 << "%\n"; - - // Just ensure both return same number of matches - EXPECT_EQ(root.match(test_str).size(), root.match_with_groups(test_str).size()); -} +// Performance benchmarks have been moved to benchmarks.cpp using Google Benchmark +// Run: ./build/tests/benchmarks.exe int main(int argc, char** argv) { std::cout << "RegexMatcher VERSION: " << RegexMatcher_VERSION_MAJOR << "." << RegexMatcher_VERSION_MINOR << "."