Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 95 additions & 22 deletions include/matcher/core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <vector>
#include <optional>
#include <sstream>
#include <cstdint>

#ifdef DEBUG
#include <iostream>
Expand Down Expand Up @@ -55,27 +56,100 @@
struct Limits;

/**
 * @brief Tag action for tagged automaton transitions
 *
 * Represents a single capture operation: opening or closing the capture
 * group identified by group_id. Instances are normally created through the
 * open() / close() factories rather than by spelling out the Type.
 */
struct TagAction {
    /// Kind of capture operation. The underlying type is one byte wide.
    enum class Type : std::uint8_t {
        OPEN_GROUP,  // Start capturing group_id
        CLOSE_GROUP  // End capturing group_id
    };

    Type type;        // operation to perform
    size_t group_id;  // capture group the operation applies to

    /**
     * @param t  Operation kind
     * @param id Capture group the operation applies to
     */
    constexpr TagAction(Type t, size_t id) noexcept : type(t), group_id(id) {}

    /// @return An OPEN_GROUP action for @p group_id
    static constexpr TagAction open(size_t group_id) noexcept { return TagAction(Type::OPEN_GROUP, group_id); }

    /// @return A CLOSE_GROUP action for @p group_id
    static constexpr TagAction close(size_t group_id) noexcept { return TagAction(Type::CLOSE_GROUP, group_id); }

    /// @return true when this action opens a group
    constexpr bool is_open() const noexcept { return type == Type::OPEN_GROUP; }

    /// @return true when this action closes a group
    constexpr bool is_close() const noexcept { return type == Type::CLOSE_GROUP; }
};

/**
 * @brief Capture slots for efficient group position tracking
 *
 * Keeps, for every capture group, the position at which the group was opened
 * and the position at which it was closed. Both tables are vectors indexed by
 * group_id (O(1) access instead of a map lookup) and grow automatically when
 * a group beyond the current size is touched.
 *
 * open_group() / close_group() return the value they overwrite; handing that
 * value back to undo_open() / undo_close() reverts the change, so backtracking
 * does not need a full copy of the structure.
 */
struct CaptureSlots {
    /// Sentinel stored in a slot that has not been written yet.
    static constexpr size_t UNSET = static_cast<size_t>(-1);

    std::vector<size_t> start_positions;  // group_id -> start position (UNSET if not started)
    std::vector<size_t> end_positions;    // group_id -> end position (UNSET if not ended)

    CaptureSlots() = default;

    /**
     * @brief Makes sure both tables have a slot for @p group_id
     *
     * New slots are filled with UNSET. Each table is checked on its own so the
     * two stay usable even if one of the public vectors was resized externally.
     */
    void ensure_capacity(size_t group_id) {
        if (group_id >= start_positions.size()) {
            start_positions.resize(group_id + 1, UNSET);
        }
        if (group_id >= end_positions.size()) {
            end_positions.resize(group_id + 1, UNSET);
        }
    }

    /**
     * @brief Records @p position as the start of @p group_id
     * @return The previous start value (UNSET if none), for use with undo_open()
     */
    size_t open_group(size_t group_id, size_t position) {
        ensure_capacity(group_id);
        const size_t prev = start_positions[group_id];
        start_positions[group_id] = position;
        return prev;
    }

    /**
     * @brief Records @p position as the end of @p group_id
     * @return The previous end value (UNSET if none), for use with undo_close()
     */
    size_t close_group(size_t group_id, size_t position) {
        ensure_capacity(group_id);
        const size_t prev = end_positions[group_id];
        end_positions[group_id] = position;
        return prev;
    }

    /// Restore previous start value. Sizes the tables first so an undo for a
    /// group that was never touched cannot write out of bounds.
    void undo_open(size_t group_id, size_t prev_value) {
        ensure_capacity(group_id);
        start_positions[group_id] = prev_value;
    }

    /// Restore previous end value. Sizes the tables first so an undo for a
    /// group that was never touched cannot write out of bounds.
    void undo_close(size_t group_id, size_t prev_value) {
        ensure_capacity(group_id);
        end_positions[group_id] = prev_value;
    }

    /// @return true when both a start and an end position are recorded for @p group_id
    bool is_group_complete(size_t group_id) const {
        return group_id < start_positions.size() && group_id < end_positions.size() &&
               start_positions[group_id] != UNSET && end_positions[group_id] != UNSET;
    }

    /**
     * @brief Exports every complete group as group_id -> (start, end)
     *
     * Groups that are missing a start or an end position are left out.
     */
    std::map<size_t, std::pair<size_t, size_t>> to_map() const {
        std::map<size_t, std::pair<size_t, size_t>> result;
        const size_t count = start_positions.size() < end_positions.size() ? start_positions.size()
                                                                           : end_positions.size();
        for (size_t i = 0; i < count; ++i) {
            if (start_positions[i] != UNSET && end_positions[i] != UNSET) {
                // Keys arrive in ascending order, so the end hint is always the right spot.
                result.emplace_hint(result.end(), i, std::pair<size_t, size_t>(start_positions[i], end_positions[i]));
            }
        }
        return result;
    }
};

/**
 * @brief Simulation state for tagged NFA traversal
 *
 * Groups the per-traversal data in one value: the regex paths that are still
 * active and, for each regex, the capture slots recorded for it.
 * The type is copyable and movable; all special members are compiler-generated.
 *
 * @tparam RegexData Identifier type of a regex; used as the key of @c captures
 */
template <class RegexData>
struct SimulationState {
    // Currently active regex paths
    std::vector<RegexData> active_paths;
    // Capture slots per regex
    std::map<RegexData, CaptureSlots> captures;

    SimulationState() = default;

    // Move operations
    SimulationState(SimulationState&&) = default;
    SimulationState& operator=(SimulationState&&) = default;

    // Copy operations
    SimulationState(const SimulationState&) = default;
    SimulationState& operator=(const SimulationState&) = default;
};

/**
Expand Down Expand Up @@ -179,7 +253,7 @@
struct EdgeInfo {
std::map<T, std::optional<Limits*>>
paths; // each path may have different requirements for how many times should the edge be repeated.
std::map<T, std::vector<GroupMarker>> group_markers; // group boundaries per regex path
std::map<T, std::vector<TagAction>> tag_actions; // tag actions per regex path for capture tracking
Node* to;
EdgeInfo() = default;
EdgeInfo(const EdgeInfo& info) {
Expand All @@ -190,8 +264,8 @@
paths[x.first] = std::nullopt;
}
}
for (auto x : info.group_markers) {
group_markers[x.first] = x.second;
for (auto x : info.tag_actions) {
tag_actions[x.first] = x.second;
}
to = info.to;
}
Expand Down Expand Up @@ -335,14 +409,14 @@
std::optional<Limits*> limits = std::nullopt);

/**
* @brief Adds a child node with group markers for capture tracking
* @brief Adds a child node with tag actions for capture tracking
*
* @param child Existing node
* @param regex Regex data that is being used to identify the regex that the edge is part of
* @param markers Group markers to attach to this edge
* @param actions Tag actions to attach to this edge transition
* @param limits Pointer to the shared limit of the edge (nullptr if no limit is applied)
*/
void connect_with(Node<RegexData, char_t>* child, RegexData regex, const std::vector<GroupMarker>& markers,
void connect_with(Node<RegexData, char_t>* child, RegexData regex, const std::vector<TagAction>& actions,
std::optional<Limits*> limits = std::nullopt);

/**
Expand Down Expand Up @@ -373,7 +447,7 @@
template <typename ConstIterator>
void match_with_groups_helper(ConstIterator begin, ConstIterator end, size_t position,
const std::vector<RegexData>& paths, const Node* prev,
std::map<RegexData, CaptureState>& capture_states,
std::map<RegexData, CaptureSlots>& capture_slots,
std::vector<matcher::MatchResult<RegexData>>& results) const;

#ifdef DEBUG
Expand Down Expand Up @@ -404,7 +478,7 @@

namespace matcher {
template <typename RegexData, typename char_t>
class RegexMatcher {

Check warning on line 481 in include/matcher/core.hpp

View workflow job for this annotation

GitHub Actions / gather_annotations

‘matcher::RegexMatcher<int, char>’ has a field ‘matcher::RegexMatcher<int, char>::root’ whose type uses the anonymous namespace [-Wsubobject-linkage]
Node<RegexData, char_t> root;

template <typename ConstIterator>
Expand All @@ -418,8 +492,7 @@
template <typename ConstIterator>
static SubTree<Node<RegexData, char_t>> process(std::vector<Node<RegexData, char_t>*>, RegexData,
ConstIterator&, ConstIterator, const bool,
size_t& group_counter,
std::vector<GroupMarker>& pending_markers);
size_t& group_counter, std::vector<TagAction>& pending_actions);

public:
/**
Expand Down
30 changes: 15 additions & 15 deletions include/matcher/impl/core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,11 +142,11 @@ namespace matcher {
template <typename ConstIterator>
SubTree<Node<RegexData, char_t>> RegexMatcher<RegexData, char_t>::process(
std::vector<Node<RegexData, char_t>*> parents, RegexData regex, ConstIterator& it, ConstIterator end,
const bool inBrackets, size_t& group_counter, std::vector<GroupMarker>& pending_markers) {
const bool inBrackets, size_t& group_counter, std::vector<TagAction>& pending_actions) {
SubTree<Node<RegexData, char_t>> answer = {{}, {}};
std::vector<SubTree<Node<RegexData, char_t>>> nodeLayers = {{parents, parents}};
// Save initial pending markers to restore on alternation
std::vector<GroupMarker> initial_markers = pending_markers;
// Save initial pending actions to restore on alternation
std::vector<TagAction> initial_actions = pending_actions;
for (; it != end; it++) {
if (*it == ')' && inBrackets) {
break;
Expand All @@ -156,28 +156,28 @@ namespace matcher {
SubTree<Node<RegexData, char_t>> newNodes = processSet(latest_parents.get_leafs(), regex, it);
for (auto parent : latest_parents.get_leafs()) {
for (auto newNode : newNodes.get_leafs()) {
parent->connect_with(newNode, regex, pending_markers);
parent->connect_with(newNode, regex, pending_actions);
}
}
pending_markers.clear();
pending_actions.clear();
nodeLayers.push_back(newNodes);
} else if (*it == '(') { // start of a regex in brackets (capture group)
size_t current_group_id = group_counter++;
pending_markers.push_back(GroupMarker(current_group_id, true)); // group start
pending_actions.push_back(TagAction::open(current_group_id)); // OPEN_GROUP tag
it++;
SubTree<Node<RegexData, char_t>> newLayer =
process(nodeLayers.back().get_leafs(), regex, it, end, true, group_counter,
pending_markers); // leaves it at the closing bracket
pending_markers.push_back(GroupMarker(current_group_id, false)); // group end
pending_actions); // leaves it at the closing bracket
pending_actions.push_back(TagAction::close(current_group_id)); // CLOSE_GROUP tag
nodeLayers.push_back(newLayer);
} else if (*it == '|') {
answer.roots.insert(answer.roots.end(), nodeLayers[1].get_leafs().begin(),
nodeLayers[1].get_leafs().end());
answer.leafs.insert(answer.leafs.end(), nodeLayers.back().get_leafs().begin(),
nodeLayers.back().get_leafs().end());
nodeLayers.resize(1);
// Restore initial markers for the next alternative branch
pending_markers = initial_markers;
// Restore initial actions for the next alternative branch
pending_actions = initial_actions;
} else if (*it == '{') {
[[maybe_unused]] Limits* limits =
processLimit(nodeLayers[nodeLayers.size() - 2], nodeLayers.back(), regex, it);
Expand Down Expand Up @@ -207,9 +207,9 @@ namespace matcher {
nextNode = new Node<RegexData, char_t>(sym);
}
for (auto parent : nodeLayers.back().get_leafs()) {
parent->connect_with(nextNode, regex, pending_markers);
parent->connect_with(nextNode, regex, pending_actions);
}
pending_markers.clear();
pending_actions.clear();
nodeLayers.push_back({{nextNode}, {nextNode}});
}
}
Expand All @@ -220,7 +220,7 @@ namespace matcher {
Node<RegexData, char_t>* end_of_regex = new Node<RegexData, char_t>(symbol<char_t>::EOR);
SubTree<Node<RegexData, char_t>> final_answer = {answer.get_roots(), {end_of_regex}};
for (auto parent : answer.leafs) {
parent->connect_with(end_of_regex, regex, pending_markers);
parent->connect_with(end_of_regex, regex, pending_actions);
}
return final_answer;
}
Expand All @@ -233,8 +233,8 @@ namespace matcher {
void RegexMatcher<RegexData, char_t>::add_regex(Iterable str, RegexData uid) {
auto it = std::cbegin(str);
size_t group_counter = 0;
std::vector<GroupMarker> pending_markers;
process(std::vector{&root}, uid, it, std::cend(str), false, group_counter, pending_markers);
std::vector<TagAction> pending_actions;
process(std::vector{&root}, uid, it, std::cend(str), false, group_counter, pending_actions);
}

template <typename RegexData, typename char_t>
Expand Down
Loading
Loading