Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -358,3 +358,4 @@ compile_commands.json

cmake-build-debug/
.cache
_codeql_detected_source_root
14 changes: 14 additions & 0 deletions tx_service/include/cc/cc_entry.h
Original file line number Diff line number Diff line change
Expand Up @@ -2030,8 +2030,22 @@ struct LruPage

CcMap *parent_map_{nullptr};

// The value of CcShard::access_counter_ at the time this page was last
// repositioned in the LRU list by UpdateLruList(). Comparing two pages'
// last_access_ts_ values determines which was accessed more recently
// (the larger value is more recent).
uint64_t last_access_ts_{0};

// True when at least one entry on this page has a payload whose size
// exceeds txservice_large_value_threshold. Set lazily by CanBeCleaned when
// a large payload is first detected. Once set it is never cleared (even if
// the large entries are later evicted) because the flag is only used as a
// cheap signal to keep the page in the large-value zone.
//
// Large-value pages are kept in the tail (recent) zone of the LRU list so
// that they are evicted only after all small-value pages have been evicted.
bool has_large_value_{false};

// The largest commit ts of dirty cc entries on this page. This value might
// be larger than the actual max commit ts of cc entries. Currently used to
// decide if this page has dirty data after a given ts.
Expand Down
9 changes: 9 additions & 0 deletions tx_service/include/cc/cc_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,15 @@ class CcMap
{
}

// Reports whether the payload-size-aware large-value zone eviction policy
// is active for this map. This base implementation always returns false, so
// map types that do not override it (RangeCcMap, CatalogCcMap, etc.) are
// unaffected and the policy has no effect on EloqSQL / EloqDoc tables.
// ObjectCcMap (EloqKV) overrides this and enables the policy only when
// txservice_large_value_threshold is greater than zero.
virtual bool IsLargeValueZoneEnabled() const
{
return false;
}

virtual std::pair<size_t, LruPage *> CleanPageAndReBalance(
LruPage *page,
KickoutCcEntryCc *kickout_cc = nullptr,
Expand Down
42 changes: 41 additions & 1 deletion tx_service/include/cc/cc_page_clean_guard.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,16 @@ struct CcPageCleanGuard
return clean_obj_cnt_;
}

// Returns true if CanBeCleaned set the page's has_large_value_ flag for the
// first time during this clean pass, i.e. the page was just discovered to
// hold a large-value entry. Despite the name, no entry is blocked from
// eviction by this flag: it only signals CleanPageAndReBalance to call
// UpdateLruList so that the page is moved from the small-value zone into
// the large-value zone immediately.
bool HasBlockedLargeValue() const
{
return has_blocked_large_value_;
}

protected:
struct CanBeCleanedResult
{
Expand Down Expand Up @@ -343,6 +353,9 @@ struct CcPageCleanGuard
uint64_t dirty_freed_cnt_{0};
bool evicted_valid_key_{false};
uint64_t clean_obj_cnt_{0};
// Set by CanBeCleaned when it marks the page as a large-value page for the
// first time (the entry itself remains evictable). Mutable so it can be set
// from the const CanBeCleaned override.
mutable bool has_blocked_large_value_{false};

private:
std::bitset<CcPage<KeyT, ValueT, VersionedRecord, RangePartitioned>::
Expand Down Expand Up @@ -399,7 +412,34 @@ struct CcPageCleanGuardWithoutKickoutCc
return {false, false};
}

return {(cce->IsFree() && !cce->GetBeingCkpt()), false};
if (!cce->IsFree() || cce->GetBeingCkpt())
{
return {false, false};
}

// Payload-size-aware eviction (ObjectCcMap / EloqKV only): mark the
// page as a large-value page so that UpdateLruList places it in the
// large-value zone (tail end) of the LRU list. The page is still
// evictable here — protection is positional (large-value pages are
// evicted only after all small-value pages). Setting
// has_blocked_large_value_ signals CleanPageAndReBalance to re-zone
// the page immediately via UpdateLruList in case it was in the
// small-value zone.
// IsLargeValueZoneEnabled() returns false for all non-ObjectCcMap
// types (RangeCcMap, CatalogCcMap, etc.), so this block is a no-op
// for EloqSQL and EloqDoc tables.
if (this->page_->parent_map_ != nullptr &&
this->page_->parent_map_->IsLargeValueZoneEnabled() &&
cce->PayloadSize() > txservice_large_value_threshold)
{
if (!this->page_->has_large_value_)
{
this->page_->has_large_value_ = true;
this->has_blocked_large_value_ = true;
}
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这个算法存在两个问题:

shard.access_counter_是一个shard level的值,某个热点key就可以使它变得很大。用户不知道怎么配置txservice_large_value_eviction_age,也不想配置。

large value容易沉淀到LRU list的头部,但是因为它的age小于txservice_large_value_eviction_age,所以不会被踢掉。这造成了LRU 扫描的时候做了一些无效的工作,扫描了不能被踢的large value。

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in commit 55925be.

Both problems are resolved by replacing the absolute txservice_large_value_eviction_age threshold with a self-calibrating relative LRU-position check:

uint64_t now        = cc_shard_->AccessCounter();
uint64_t page_age   = now - page_->last_access_ts_;
uint64_t total_span = now - cc_shard_->LruOldestTs();
if (page_age * 2 < total_span)   // page in recent half → protect
    return {false, false};
// page in old half → allow eviction (fall through)

Problem 1 (unconfigurable threshold): txservice_large_value_eviction_age is removed entirely. The check compares a page's age against the total span of the LRU list, so it self-calibrates regardless of how fast access_counter_ grows. No user configuration is needed.

Problem 2 (cold large values blocking the scan): A cold large-value page near the LRU head has page_age ≈ total_span, so page_age * 2 >= total_span — it is immediately eligible for eviction rather than wasting scan work.


return {true, false};
}

bool IsCleanTarget(
Expand Down
49 changes: 44 additions & 5 deletions tx_service/include/cc/cc_shard.h
Original file line number Diff line number Diff line change
Expand Up @@ -1005,11 +1005,35 @@ class CcShard
clean_start_ccp_ = ccp;
}

/// Returns a pointer to the sentinel head node of the LRU list. The
/// sentinel is a member of the shard, so the result is never null. The
/// first real page is LruHead()->lru_next_. Used by tests to traverse the
/// list.
const LruPage *LruHead() const
{
return &head_ccp_;
}

bool OutOfMemory()
{
return clean_start_ccp_ != nullptr && clean_start_ccp_ == &tail_ccp_;
}

/// Returns the current value of access_counter_, the shard-wide counter
/// whose value is assigned to a page's last_access_ts_ by UpdateLruList().
uint64_t AccessCounter() const
{
return access_counter_;
}

/**
* @brief Returns the current head of the large-value zone — the first
* (oldest) large-value page in the LRU list, i.e. the one closest to the
* sentinel head.
*
* @return The value of lru_large_value_zone_head_. Per that member's
* documentation, this is the address of the tail sentinel when the list
* holds no large-value pages.
*
* Used for testing: verifies that the large-value zone is non-empty and
* correctly maintained.
*/
const LruPage *LruLargeValueZoneHead() const
{
return lru_large_value_zone_head_;
}

SystemHandler *GetSystemHandler()
{
return system_handler_;
Expand Down Expand Up @@ -1273,18 +1297,33 @@ class CcShard
// simplifies handling of empty and one-element lists.
LruPage head_ccp_, tail_ccp_;
/**
* @brief Each time a page is accessed and moved to the tail of the LRU
* list, the counter is incremented and assigned to the page. Since in a
* double-linked list there is no way to determine the relative order of two
* pages, we use the number to indicate if a page precedes or succeeds the
* other in the list.
* @brief A monotonically-increasing shard-wide counter. It is
* incremented and assigned to a page's last_access_ts_ every time
* UpdateLruList() is called, i.e. whenever any page in this shard is
* moved to its target position in the LRU list.
*
* Primary use: since a doubly-linked list provides no O(1) way to compare
* the positions of two arbitrary nodes, comparing page1.last_access_ts_
* against page2.last_access_ts_ lets the merge/redistribute code determine
* which of the two pages was accessed more recently without traversing the
* list.
*/
uint64_t access_counter_{0};

// Page to start looking for cc entries to kick out on LRU chain.
LruPage *clean_start_ccp_;

// Head of the large-value zone in the LRU list. Large-value pages are
// clustered at the tail (recent) end of the list so they are evicted only
// after all small-value pages have been evicted. This pointer points to the
// first (oldest) large-value page, i.e. the boundary between the two zones:
//
// head ← [small-value pages] ← lru_large_value_zone_head_ ← [large-value
// pages] ← tail
//
// It equals &tail_ccp_ when no large-value pages are in the list.
LruPage *lru_large_value_zone_head_;

// The number of ccentry in all the ccmap of this ccshard.
uint64_t size_;

Expand Down
28 changes: 28 additions & 0 deletions tx_service/include/cc/object_cc_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ class ObjectCcMap : public TemplateCcMap<KeyT, ValueT, false, false>
using TemplateCcMap<KeyT, ValueT, false, false>::Type;
using TemplateCcMap<KeyT, ValueT, false, false>::CleanEntry;

// Enables the payload-size-aware large-value zone policy for EloqKV. The
// policy is active only when txservice_large_value_threshold is greater
// than zero; otherwise this returns false and the policy is off for this
// map.
bool IsLargeValueZoneEnabled() const override
{
return txservice_large_value_threshold > 0;
}

bool Execute(ApplyCc &req) override
{
TX_TRACE_ACTION_WITH_CONTEXT(
Expand Down Expand Up @@ -1201,6 +1207,10 @@ class ObjectCcMap : public TemplateCcMap<KeyT, ValueT, false, false>
CommitCommandOnPayload(
cce->payload_.cur_payload_, status, *cmd);
}
if (status == RecordStatus::Normal)
{
MaybeMarkAndRezoneAsLargeValue(ccp, cce->PayloadSize());
}

// Reset the dirty status.
cce->SetDirtyPayload(nullptr);
Expand Down Expand Up @@ -1441,6 +1451,10 @@ class ObjectCcMap : public TemplateCcMap<KeyT, ValueT, false, false>
cce->ReleaseForwardEntry();
shard_->ForwardStandbyMessage(entry_ptr.release());
}
if (payload_status == RecordStatus::Normal)
{
MaybeMarkAndRezoneAsLargeValue(ccp, cce->PayloadSize());
}
bool was_dirty = cce->IsDirty();
cce->SetCommitTsPayloadStatus(commit_ts, payload_status);
this->OnCommittedUpdate(cce, was_dirty);
Expand Down Expand Up @@ -1653,6 +1667,7 @@ class ObjectCcMap : public TemplateCcMap<KeyT, ValueT, false, false>
}
cce->payload_.PassInCurrentPayload(std::move(object_uptr));
object_uptr = nullptr;
MaybeMarkAndRezoneAsLargeValue(cc_page, cce->PayloadSize());
}
else
{
Expand Down Expand Up @@ -1981,6 +1996,10 @@ class ObjectCcMap : public TemplateCcMap<KeyT, ValueT, false, false>
cce->payload_.cur_payload_ == nullptr
? RecordStatus::Deleted
: RecordStatus::Normal;
if (payload_status == RecordStatus::Normal)
{
MaybeMarkAndRezoneAsLargeValue(ccp, cce->PayloadSize());
}
bool was_dirty = cce->IsDirty();
cce->SetCommitTsPayloadStatus(commit_ts, payload_status);
this->OnCommittedUpdate(cce, was_dirty);
Expand Down Expand Up @@ -2453,6 +2472,11 @@ class ObjectCcMap : public TemplateCcMap<KeyT, ValueT, false, false>
++TemplateCcMap<KeyT, ValueT, false, false>::normal_obj_sz_;
}

if (payload_status == RecordStatus::Normal)
{
MaybeMarkAndRezoneAsLargeValue(ccp, cce->PayloadSize());
}

this->OnCommittedUpdate(cce, was_dirty);

// Must update dirty_commit_ts. Otherwise, this entry may be
Expand Down Expand Up @@ -2583,6 +2607,10 @@ class ObjectCcMap : public TemplateCcMap<KeyT, ValueT, false, false>
{
size_t offset = 0;
cce->payload_.DeserializeCurrentPayload(rec_str.data(), offset);
if (status == RecordStatus::Normal)
{
MaybeMarkAndRezoneAsLargeValue(ccp, cce->PayloadSize());
}
}
else
{
Expand Down
Loading