From de2ad6333dcbe1bec8f2fc83385445a345528c8f Mon Sep 17 00:00:00 2001 From: Jeremy Castagno Date: Tue, 23 Dec 2025 10:44:31 -0500 Subject: [PATCH] Add ID index functionality for efficient feature lookup by OSM ID --- examples/query-id/CMakeLists.txt | 2 + examples/query-id/main.cpp | 72 ++++++ include/geodesk/feature/FeatureStore.h | 9 + include/geodesk/feature/FeaturesBase.h | 44 ++++ include/geodesk/feature/IdIndex.h | 84 +++++++ src/clarisma/store/IndexFile.cpp | 9 +- src/feature/FeatureStore.cpp | 9 + src/feature/IdIndex.cpp | 301 +++++++++++++++++++++++++ 8 files changed, 526 insertions(+), 4 deletions(-) create mode 100644 examples/query-id/CMakeLists.txt create mode 100644 examples/query-id/main.cpp create mode 100644 include/geodesk/feature/IdIndex.h create mode 100644 src/feature/IdIndex.cpp diff --git a/examples/query-id/CMakeLists.txt b/examples/query-id/CMakeLists.txt new file mode 100644 index 0000000..85b54b7 --- /dev/null +++ b/examples/query-id/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(query-id main.cpp) +target_link_libraries(query-id PRIVATE geodesk) diff --git a/examples/query-id/main.cpp b/examples/query-id/main.cpp new file mode 100644 index 0000000..1ace5bd --- /dev/null +++ b/examples/query-id/main.cpp @@ -0,0 +1,72 @@ +// Copyright (c) 2024 Clarisma / GeoDesk contributors +// SPDX-License-Identifier: LGPL-3.0-only + +#include +#include +#include +#include +#include + +using namespace geodesk; + +template +void queryAndDisplay(Collection collection, uint64_t id, const char* typeName) +{ + auto start = std::chrono::high_resolution_clock::now(); + auto feature = collection.byId(id); + auto end = std::chrono::high_resolution_clock::now(); + auto us = std::chrono::duration_cast(end - start).count(); + + if (feature) + { + std::cout << typeName << " " << id << " (" << us << " us):" << std::endl; + for (Tag tag : feature->tags()) + { + std::cout << " " << tag.key() << " = " << tag.value() << std::endl; + } + } + else + { + std::cout << typeName << " " << id << " not found (" << us << " us)" << std::endl; + } +} + +int main(int argc, char* argv[]) +{ + if (argc < 2) + { + std::cerr << "Usage: " << argv[0] << " [n|w|r]" << std::endl; + std::cerr << " Example: " << argv[0] << " planet.gol w12345" << std::endl; + return 1; + } + + Features features(argv[1]); + std::cout << "Loaded " << argv[1] << std::endl; + + if (argc >= 3) + { + const char* arg = argv[2]; + char typeChar = std::tolower(static_cast(arg[0])); + uint64_t id = std::strtoull(arg + 1, nullptr, 10); + + try + { + switch (typeChar) + { + case 'n': queryAndDisplay(features.nodes(), id, "Node"); break; + case 'w': queryAndDisplay(features.ways(), id, "Way"); break; + case 'r': queryAndDisplay(features.relations(), id, "Relation"); break; + default: + std::cerr << "Unknown type '" << arg[0] << "'. Use n, w, or r." << std::endl; + return 1; + } + } + catch (const QueryException& e) + { + std::cerr << "Error: " << e.what() << std::endl; + return 1; + } + } + + return 0; +} diff --git a/include/geodesk/feature/FeatureStore.h b/include/geodesk/feature/FeatureStore.h index 50dd2ff..c563ac5 100644 --- a/include/geodesk/feature/FeatureStore.h +++ b/include/geodesk/feature/FeatureStore.h @@ -3,6 +3,8 @@ #pragma once +#include +#include #include #include #ifdef GEODESK_PYTHON @@ -27,6 +29,7 @@ class PyFeatures; // not namespaced for now namespace geodesk { +class IdIndex; class MatcherHolder; // Possible threadpool alternatives: @@ -171,6 +174,10 @@ class GEODESK_API FeatureStore final : public clarisma::FreeStore TilePtr fetchTile(Tip tip) const; static bool isTileValid(const byte* p); + /// @brief Returns the ID index if available, or nullptr if not. + /// ID index files are created by `gol build -i`. + IdIndex* idIndex(); + struct Metadata; class Transaction; @@ -216,6 +223,8 @@ class GEODESK_API FeatureStore final : public clarisma::FreeStore #endif clarisma::ThreadPool executor_; ZoomLevels zoomLevels_; + std::unique_ptr idIndex_; + mutable std::once_flag idIndexInitFlag_; friend class Transaction; }; diff --git a/include/geodesk/feature/FeaturesBase.h b/include/geodesk/feature/FeaturesBase.h index c84da24..f8b7fc8 100644 --- a/include/geodesk/feature/FeaturesBase.h +++ b/include/geodesk/feature/FeaturesBase.h @@ -4,6 +4,7 @@ #pragma once #include +#include #include #include #include @@ -102,6 +103,49 @@ class FeaturesBase [[nodiscard]] std::optional first() const; [[nodiscard]] T one() const; + /// @brief Looks up a feature by its OSM ID. + /// + /// Requires ID index files created by `gol build -i`. + /// + /// @param id The OSM ID to look up + /// @return The feature if found, or `std::nullopt` if not found + /// @throws QueryException if ID indexes are not available + /// + [[nodiscard]] std::optional byId(uint64_t id) const + { + IdIndex* idx = store()->idIndex(); + if (!idx) + { + throw QueryException("ID indexes not available (build GOL with -i flag)"); + } + + // Determine the feature type from the template parameter T + // T is Node, Way, Relation, or Feature (FeatureBase) + FeatureType type; + if constexpr (std::is_same_v) + { + type = FeatureType::NODE; + } + else if constexpr (std::is_same_v) + { + type = FeatureType::WAY; + } + else if constexpr (std::is_same_v) + { + type = FeatureType::RELATION; + } + else + { + static_assert( + std::is_same_v || std::is_same_v || std::is_same_v, + "byId() is only available on Nodes, Ways, or Relations collections"); + } + + FeaturePtr ptr = idx->findById(id, type); + if (ptr.isNull()) return std::nullopt; + return T(store(), ptr); + } + // NOLINTNEXTLINE(google-explicit-constructor) [[nodiscard]] operator std::vector() const; diff --git a/include/geodesk/feature/IdIndex.h b/include/geodesk/feature/IdIndex.h new file mode 100644 index 0000000..ea8c953 --- /dev/null +++ b/include/geodesk/feature/IdIndex.h @@ -0,0 +1,84 @@ +// Copyright (c) 2024 Clarisma / GeoDesk contributors +// SPDX-License-Identifier: LGPL-3.0-only + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace geodesk { + +class FeatureStore; + +/// @brief Provides O(1) lookup of features by their OSM ID. +/// +/// This class manages the optional ID index files created by `gol build -i`. +/// The index files map OSM IDs to pile numbers, which are then converted +/// to tile index positions (tips) for feature retrieval. +/// +/// Lookup chain: OSM_ID -> pile -> tip -> tile -> linear scan +/// +class IdIndex +{ +public: + /// Constructs an IdIndex for the given FeatureStore. + /// If index files don't exist, isAvailable() returns false. + explicit IdIndex(FeatureStore* store); + + ~IdIndex() = default; + + // Non-copyable, non-movable (owns file handles) + IdIndex(const IdIndex&) = delete; + IdIndex& operator=(const IdIndex&) = delete; + IdIndex(IdIndex&&) = delete; + IdIndex& operator=(IdIndex&&) = delete; + + /// Returns true if ID index files are available and loaded. + bool isAvailable() const noexcept { return available_; } + + /// Finds a feature by its OSM ID. + /// @param id The OSM ID to look up + /// @param type The feature type (NODE, WAY, or RELATION) + /// @return The FeaturePtr if found, or an empty FeaturePtr if not found + FeaturePtr findById(uint64_t id, FeatureType type); + +private: + /// Extra bits for ways and relations to encode tile pair flags. + static constexpr int PILEPAIR_EXTRA_BITS = 2; + + /// Calculates bit width needed to store tile count values. + /// Uses same formula as gol-tool: 32 - countLeadingZeros(tileCount) + static int calculateBaseBitWidth(uint32_t tileCount); + + /// Builds the pile-to-tip mapping by walking the tile index. + void buildPileToTip(); + + /// Scans a tile to find a feature by ID. + FeaturePtr scanTileForId(TilePtr tile, uint64_t id, FeatureType type) const; + + /// Scans a single index (NODE, WAY, AREA, or RELATION) for a feature. + FeaturePtr scanIndexForId(DataPtr pIndex, uint64_t id, FeatureType type) const; + + /// Recursively scans R-tree branches for a feature. + FeaturePtr scanBranchForId(DataPtr p, uint64_t id, FeatureType type, bool isNode) const; + + /// Scans a leaf node for the feature. + FeaturePtr scanNodeLeafForId(DataPtr p, uint64_t id) const; + + /// Scans a leaf (way/relation) for the feature. + FeaturePtr scanLeafForId(DataPtr p, uint64_t id, FeatureType type) const; + + FeatureStore* store_; + bool available_ = false; + uint32_t maxPile_ = 0; + std::unique_ptr pileToTip_; + clarisma::IndexFile nodeIndex_; + clarisma::IndexFile wayIndex_; + clarisma::IndexFile relationIndex_; +}; + +} // namespace geodesk diff --git a/src/clarisma/store/IndexFile.cpp b/src/clarisma/store/IndexFile.cpp index bbc3dac..2e3579c 100644 --- a/src/clarisma/store/IndexFile.cpp +++ b/src/clarisma/store/IndexFile.cpp @@ -2,13 +2,12 @@ // SPDX-License-Identifier: LGPL-3.0-only #include -#include namespace clarisma { IndexFile::IndexFile() : slotsPerSegment_(0), - maxEntryCount_(std::numeric_limits::max()), // TODO + maxEntryCount_(0), valueWidth_(0) { } @@ -19,6 +18,8 @@ void IndexFile::open(const char* filename, OpenMode mode, int valueWidth) assert(valueWidth > 0 && valueWidth <= 32); valueWidth_ = valueWidth; slotsPerSegment_ = static_cast(SEGMENT_LENGTH) * 8 / valueWidth_; + // Calculate max entry count based on actual file size + maxEntryCount_ = static_cast(mainMappingSize()) * 8 / valueWidth_; } IndexFile::CellRef IndexFile::getCell(int64_t key) @@ -35,8 +36,8 @@ IndexFile::CellRef IndexFile::getCell(int64_t key) return ref; } -// TODO: Clarify that it is legal to call get() if IndexFile is not open, -// which will always return 0 because maxEntryCount_==0 +// It is legal to call get() if IndexFile is not open; +// it will always return 0 because maxEntryCount_==0 uint32_t IndexFile::get(uint64_t key) { if (key >= maxEntryCount_) [[unlikely]] diff --git a/src/feature/FeatureStore.cpp b/src/feature/FeatureStore.cpp index a0df785..184db99 100644 --- a/src/feature/FeatureStore.cpp +++ b/src/feature/FeatureStore.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: LGPL-3.0-only #include +#include #include #include #include @@ -132,6 +133,14 @@ TilePtr FeatureStore::fetchTile(Tip tip) const return TilePtr(pagePointer(entry.page())); } +IdIndex* FeatureStore::idIndex() +{ + std::call_once(idIndexInitFlag_, [this]() + { + idIndex_ = std::make_unique(this); + }); + return idIndex_->isAvailable() ? idIndex_.get() : nullptr; +} void FeatureStore::readIndexSchema(DataPtr p) diff --git a/src/feature/IdIndex.cpp b/src/feature/IdIndex.cpp new file mode 100644 index 0000000..f236d53 --- /dev/null +++ b/src/feature/IdIndex.cpp @@ -0,0 +1,301 @@ +// Copyright (c) 2024 Clarisma / GeoDesk contributors +// SPDX-License-Identifier: LGPL-3.0-only + +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; +using namespace TileConstants; + +namespace geodesk { + +int IdIndex::calculateBaseBitWidth(uint32_t tileCount) +{ + // Same formula as gol-tool: 32 - countLeadingZeros(tileCount) + // This gives the minimum bits needed to represent values 0..tileCount + if (tileCount == 0) return 1; + int leadingZeros = 0; + uint32_t n = tileCount; + if (n <= 0x0000FFFF) { leadingZeros += 16; n <<= 16; } + if (n <= 0x00FFFFFF) { leadingZeros += 8; n <<= 8; } + if (n <= 0x0FFFFFFF) { leadingZeros += 4; n <<= 4; } + if (n <= 0x3FFFFFFF) { leadingZeros += 2; n <<= 2; } + if (n <= 0x7FFFFFFF) { leadingZeros += 1; } + return 32 - leadingZeros; +} + +IdIndex::IdIndex(FeatureStore* store) : store_(store) +{ + // Get GOL path from parent class FreeStore::fileName() + std::string golPath = store->fileName(); + + // Strip .gol extension if present + if (golPath.size() > 4 && golPath.substr(golPath.size() - 4) == ".gol") + { + golPath = golPath.substr(0, golPath.size() - 4); + } + + std::string indexDir = golPath + "-indexes"; + + fs::path nodePath = fs::path(indexDir) / "nodes.idx"; + fs::path wayPath = fs::path(indexDir) / "ways.idx"; + fs::path relPath = fs::path(indexDir) / "relations.idx"; + + // Check if all three index files exist + if (!fs::exists(nodePath) || !fs::exists(wayPath) || !fs::exists(relPath)) + { + return; // available_ stays false + } + + try + { + // Calculate bit width based on tile count (same formula as gol-tool) + int baseBits = calculateBaseBitWidth(store->tileCount()); + int nodeBits = baseBits; + int wayBits = baseBits + PILEPAIR_EXTRA_BITS; + int relBits = baseBits + PILEPAIR_EXTRA_BITS; + + // Open each index with its correct bit width: + // - nodes.idx: baseBits (pile only) + // - ways.idx: baseBits + 2 (pile << 2 | tile_pair_flags) + // - relations.idx: baseBits + 2 (pile << 2 | tile_pair_flags) + nodeIndex_.open(nodePath.string().c_str(), + clarisma::FileHandle::OpenMode::READ, nodeBits); + wayIndex_.open(wayPath.string().c_str(), + clarisma::FileHandle::OpenMode::READ, wayBits); + relationIndex_.open(relPath.string().c_str(), + clarisma::FileHandle::OpenMode::READ, relBits); + + buildPileToTip(); + available_ = true; + } + catch (...) + { + // If any file fails to open, leave available_ = false + } +} + +void IdIndex::buildPileToTip() +{ + maxPile_ = store_->tileCount(); + pileToTip_ = std::make_unique(maxPile_ + 1); + pileToTip_[0] = Tip(); // Pile 0 is invalid/not-found sentinel + + // gol-tool assigns pile numbers using TileIndexWalker traversal order + // (depth-first). We must use the same order to correctly map pile -> tip. + // See gol-tool's TileCatalog constructor for the canonical implementation. + + TileIndexWalker tiw(store_->tileIndex(), store_->zoomLevels(), + Box::ofWorld(), nullptr); + uint32_t pile = 0; + do + { + pile++; + pileToTip_[pile] = tiw.currentTip(); + } + while (tiw.next() && pile < maxPile_); +} + +FeaturePtr IdIndex::findById(uint64_t id, FeatureType type) +{ + if (!available_) return FeaturePtr(); + + // Select the appropriate index file based on feature type + clarisma::IndexFile* index; + switch (type) + { + case FeatureType::NODE: + index = &nodeIndex_; + break; + case FeatureType::WAY: + index = &wayIndex_; + break; + case FeatureType::RELATION: + index = &relationIndex_; + break; + default: + return FeaturePtr(); + } + + // Look up value from the index + uint32_t indexValue = index->get(id); + if (indexValue == 0) return FeaturePtr(); + + // For ways and relations, the index stores a "pilePair" which is + // (pile << 2) | tile_pair_flags. We need to right-shift by 2 to get the pile. + // For nodes, the index stores the pile directly. + uint32_t pile = (type == FeatureType::NODE) ? indexValue : (indexValue >> 2); + + if (pile == 0 || pile > maxPile_) return FeaturePtr(); + + // Convert pile to tip using our depth-first mapping + Tip tip = pileToTip_[pile]; + TilePtr tile = store_->fetchTile(tip); + if (!tile) return FeaturePtr(); + + return scanTileForId(tile, id, type); +} + +FeaturePtr IdIndex::scanTileForId(TilePtr tile, uint64_t id, FeatureType type) const +{ + FeaturePtr result; + + if (type == FeatureType::NODE) + { + result = scanIndexForId(tile + NODE_INDEX_OFS, id, type); + } + else if (type == FeatureType::WAY) + { + // Ways can be in WAY_INDEX or AREA_INDEX (if closed/area) + result = scanIndexForId(tile + WAY_INDEX_OFS, id, type); + if (result.isNull()) + { + result = scanIndexForId(tile + AREA_INDEX_OFS, id, type); + } + } + else if (type == FeatureType::RELATION) + { + // Relations can be in RELATION_INDEX or AREA_INDEX (if multipolygon) + result = scanIndexForId(tile + RELATION_INDEX_OFS, id, type); + if (result.isNull()) + { + result = scanIndexForId(tile + AREA_INDEX_OFS, id, type); + } + } + + return result; +} + +FeaturePtr IdIndex::scanIndexForId(DataPtr pIndex, uint64_t id, FeatureType type) const +{ + // Get root pointer from index offset + int32_t ptr = pIndex.getInt(); + if (ptr == 0) return FeaturePtr(); // No features of this type in tile + + // Walk all key-index branches (no key filtering for ID lookup) + DataPtr p = pIndex + ptr; + bool isNode = (type == FeatureType::NODE); + + for (;;) + { + ptr = p.getInt(); + int32_t last = ptr & 1; + + // Scan this branch (ptr ^ last clears the last bit) + FeaturePtr found = scanBranchForId(p + (ptr ^ last), id, type, isNode); + if (!found.isNull()) return found; + + if (last != 0) break; + p += 8; // Key-index entries are 8 bytes + } + + return FeaturePtr(); +} + +FeaturePtr IdIndex::scanBranchForId(DataPtr p, uint64_t id, FeatureType type, bool isNode) const +{ + // Entry size: 20 bytes for all types (ptr + 16-byte bbox) + constexpr int BRANCH_ENTRY_SIZE = 20; + + for (;;) + { + int32_t ptr = p.getInt(); + int32_t last = ptr & 1; + + // Get child pointer (clear low 2 bits: last flag and leaf flag) + DataPtr pChild = p + (ptr & 0xffff'fffc); + + FeaturePtr found; + if (ptr & 2) + { + // Leaf node + if (isNode) + { + found = scanNodeLeafForId(pChild, id); + } + else + { + found = scanLeafForId(pChild, id, type); + } + } + else + { + // Branch node - recurse + found = scanBranchForId(pChild, id, type, isNode); + } + + if (!found.isNull()) return found; + if (last != 0) break; + p += BRANCH_ENTRY_SIZE; + } + + return FeaturePtr(); +} + +FeaturePtr IdIndex::scanNodeLeafForId(DataPtr p, uint64_t id) const +{ + // Node leaf entry layout: + // p+0: x coordinate (4 bytes) + // p+4: y coordinate (4 bytes) + // p+8: flags (4 bytes) - FeaturePtr starts here + // Entry size: 20 + (flags & 4) bytes + + for (;;) + { + int32_t flags = (p + 8).getInt(); + + // FeaturePtr for nodes is at p+8 (where flags are) + FeaturePtr feature(p + 8); + + if (feature.id() == id) + { + return feature; + } + + if (flags & 1) break; // Last entry flag + + // Entry size: 20 bytes + 4 extra if relation member (flags bit 2) + p += 20 + (flags & 4); + } + + return FeaturePtr(); +} + +FeaturePtr IdIndex::scanLeafForId(DataPtr p, uint64_t id, FeatureType type) const +{ + // Way/Relation leaf entry layout: + // p+0: minX (4 bytes) + // p+4: minY (4 bytes) + // p+8: maxX (4 bytes) + // p+12: maxY (4 bytes) + // p+16: flags (4 bytes) - FeaturePtr starts here + // Entry size: 32 bytes fixed + + for (;;) + { + int32_t flags = (p + 16).getInt(); + + // FeaturePtr for ways/relations is at p+16 + FeaturePtr feature(p + 16); + + if (feature.id() == id) + { + // For AREA index, verify the type matches (contains both ways and relations) + if (feature.type() == type) + { + return feature; + } + } + + if (flags & 1) break; // Last entry flag + p += 32; + } + + return FeaturePtr(); +} + +} // namespace geodesk