Import wiredtiger: 6f3dbbf2ed12faffad4a3e274d012c61e58874f5 from branch mongodb-master (#54433)

GitOrigin-RevId: 36bf4d0bf6ecb584bdbe873d873f281bc5cc72a6
This commit is contained in:
Siddhartha Mahajan 2026-05-26 16:56:53 +10:00 committed by MongoDB Bot
parent e7ead1c8cf
commit c5ae18cad2
22 changed files with 2362 additions and 969 deletions

View File

@ -0,0 +1,21 @@
FROM ubuntu:26.04
RUN apt-get update && \
apt-get install -y --no-install-recommends \
aspell aspell-en ca-certificates ccache clang clang-format clang-tidy \
clangd cmake curl doxygen ed file g++ gcc gdb git liblz4-dev libsnappy-dev \
libsodium-dev libzstd-dev lld lldb llvm make ninja-build python-is-python3 \
python3 python3-dev python3-pip python3-venv swig universal-ctags \
zlib1g-dev \
&& rm -rf /var/lib/apt/lists/*
RUN useradd -m -s /bin/bash wiredtiger
USER wiredtiger
RUN python -m venv /home/wiredtiger/venv
ENV PATH="/home/wiredtiger/venv/bin:$PATH"
RUN pip install --no-cache-dir \
find_libpython==0.4.0 gcovr==5.0 psutil==5.9.4 ruff==0.4.5
WORKDIR /workdir

View File

@ -0,0 +1,21 @@
{
"name": "WiredTiger",
"build": {
"dockerfile": "Dockerfile"
},
"customizations": {
"vscode": {
"extensions": [
"llvm-vs-code-extensions.vscode-clangd",
"ms-python.python",
"ms-vscode.cmake-tools",
"ms-vscode.cpptools",
"redhat.vscode-yaml"
],
"settings": {
"C_Cpp.intelliSenseEngine": "disabled",
"clangd.path": "clangd"
}
}
}
}

View File

@ -12,7 +12,6 @@ cscope.out
cscope.po.out
build/
build_*/
dist/clang-format
/docs/
test-compatibility-run/
COMPATIBILITY_TEST/

View File

@ -58,6 +58,20 @@ choco install swig
choco install python --pre
```
##### Docker
Alternatively, the repository ships a Dockerfile in `.devcontainer/Dockerfile`
with all required and optional build dependencies pre-installed. Build the image
and mount the repository:
```bash
docker build -t wiredtiger-dev .devcontainer
docker run -it -v "$PWD":/workdir wiredtiger-dev
```
The `.devcontainer/devcontainer.json` also makes this image usable as a
[Dev Container](https://containers.dev/) in supporting editors (e.g. VS Code).
### Building the WiredTiger Library
Building the WiredTiger library is relatively straightforward. Navigate to the top level of the WiredTiger repository and run the following commands:

View File

@ -6,38 +6,16 @@ setup_trap
cd_top
check_fast_mode_flag
venv="$TOP_DIR/.venv"
download_clang_format() {
version=$1
arch_and_os="$(uname -m)-$(uname)"
archive=dist/clang-format.tar.gz
# Adding more clang-format binaries requires uploading them to boxes.10gen
# You can either get the clang-format binary from the llvm releases page
# (https://github.com/llvm/llvm-project/releases) or compile clang-format yourself.
# Place the binary in dist/ and confirm that s_clang_format runs correctly, then
# tar a folder containing just the clang-format binary with the format:
# clang-format-llvm-${version}-${arch_and_os}/
# clang-format
# into a tarball named clang-format-llvm-${version}-${arch_and_os}.tar.gz
# The tarball should extract using the tar command below.
# This tarball can then be uploaded via a Jira request to the BUILD team.
if [[ "$arch_and_os" =~ ^("aarch64-Linux"|"x86_64-Darwin"|"arm64-Darwin"|"x86_64-Linux")$ ]] ; then
curl -s https://s3.amazonaws.com/boxes.10gen.com/build/clang-format-llvm-"$version"-"$arch_and_os".tar.gz -o $archive
tar --strip=1 -C dist/ -xf $archive clang-format-llvm-"$version"-"$arch_and_os"/clang-format && rm $archive
chmod +x ./dist/clang-format
if [[ "$arch_and_os" =~ ^("x86_64-Darwin"|"arm64-Darwin")$ ]] ; then
# Needed to get around the macOS code signing issue.
xattr -c ./dist/clang-format
fi
else
echo "$0: unsupported architecture and OS combination '$arch_and_os' to run clang_format"
return 1
fi
python3 -m venv "$venv"
"$venv/bin/pip" install --quiet --disable-pip-version-check "clang-format==$version"
}
# Override existing Clang-Format versions in the PATH.
export PATH="${PWD}/dist":$PATH
export PATH="$venv/bin":$PATH
# Check if Clang-Format is already available with the desired version.
desired_version="12.0.1"

View File

@ -194,17 +194,28 @@ setup_doxygen()
exit 0
}
# Do not build the documentation if doxygen version is not compatible.
# Select Doxyfile based on doxygen major.minor version.
v=$(doxygen --version)
case "$v" in
1.8.17) doxyfile="Doxyfile";;
1.9.1) doxyfile="Doxyfile.9";;
1.9.3) doxyfile="Doxyfile.9";;
1.11.*) doxyfile="Doxyfile.11";;
*)
echo "$0 skipped: unsupported version of doxygen: $v, not 1.8.17, 1.9.1, 1.9.3 or 1.11.*"
exit 0
esac
major=$(cut -d. -f1 <<< "$v")
minor=$(cut -d. -f2 <<< "$v")
# Doxygen < 1.8:
if [ "$major" -lt 1 ] || { [ "$major" -eq 1 ] && [ "$minor" -lt 8 ]; }; then
echo "$0 failed: doxygen $v is too old, need 1.8 or later"
exit 1
# Doxygen 1.8:
elif [ "$major" -eq 1 ] && [ "$minor" -eq 8 ]; then
doxyfile="Doxyfile"
# Doxygen 1.9 - 1.10:
elif [ "$major" -eq 1 ] && [ "$minor" -lt 11 ]; then
doxyfile="Doxyfile.9"
# Doxygen >= 1.11:
else
doxyfile="Doxyfile.11"
fi
}
build()

View File

@ -418,6 +418,7 @@ conn_stats = [
# Note eviction_server_evict_attempt - eviction_server_evict_fail = evict page successes by eviction server.
EvictStat('eviction_server_skip_checkpointing_trees', 'eviction server skips trees that are being checkpointed'),
EvictStat('eviction_server_skip_dirty_pages_during_checkpoint', 'eviction server skips dirty pages during a running checkpoint'),
EvictStat('eviction_server_skip_disagg_trees_checkpointed', 'eviction server skips disaggregated trees already visited by the ongoing checkpoint'),
EvictStat('eviction_server_skip_history_store_pages_with_updates_during_checkpoint', 'eviction server skips clean history store pages with updates when a precise checkpoint is in progress'),
EvictStat('eviction_server_skip_ingest_trees', 'eviction server skips ingest btrees in disagg'),
EvictStat('eviction_server_skip_intl_page_non_aggressive', 'eviction server skipped the internal pages if eviction is not in aggressive mode'),

View File

@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger",
"branch": "mongodb-master",
"commit": "3c6ec61a01a4dbdb3351121c0a624d3996e28c03"
"commit": "6f3dbbf2ed12faffad4a3e274d012c61e58874f5"
}

View File

@ -1202,10 +1202,6 @@ typedef int int_void;
};
%enddef
SIDESTEP_METHOD(__wt_page_log, pl_begin_checkpoint,
(WT_SESSION *session, int checkpoint_id),
(self, session, checkpoint_id))
SIDESTEP_METHOD(__wt_page_log, pl_complete_checkpoint,
(WT_SESSION *session, WT_PAGE_LOG_COMPLETE_CHECKPOINT_ARGS *args),
(self, session, args))

View File

@ -105,15 +105,14 @@ __wt_conn_calc_read_load(WT_SESSION_IMPL *session)
uint8_t load;
load_control = &S2C(session)->load_control;
/*
* Avoid division by zero if the cache size has not yet been set in a shared cache.
*/
bytes_max = load_control->read_load_max;
bytes_inuse = __wt_cache_bytes_inuse(S2C(session)->cache);
if (F_ISSET(load_control, WT_CONN_LOAD_CONTROL)) {
bytes_max = load_control->read_load_max;
bytes_inuse = __wt_cache_bytes_inuse(S2C(session)->cache);
load = __conn_calc_load_pct(bytes_inuse, bytes_max);
__wt_atomic_store_uint8_relaxed(&load_control->read_load, load);
WT_STAT_CONN_SET(session, read_load, load);
load = __conn_calc_load_pct(bytes_inuse, bytes_max);
__wt_atomic_store_uint8_relaxed(&load_control->read_load, load);
WT_STAT_CONN_SET(session, read_load, load);
}
return;
}
@ -128,15 +127,14 @@ __wt_conn_calc_write_load(WT_SESSION_IMPL *session)
uint64_t bytes_dirty, bytes_max;
uint8_t load;
/*
* Avoid division by zero if the cache size has not yet been set in a shared cache.
*/
load_control = &S2C(session)->load_control;
bytes_max = load_control->write_load_max;
bytes_dirty = __wt_cache_dirty_inuse(S2C(session)->cache);
if (F_ISSET(load_control, WT_CONN_LOAD_CONTROL)) {
bytes_max = load_control->write_load_max;
bytes_dirty = __wt_cache_dirty_inuse(S2C(session)->cache);
load = __conn_calc_load_pct(bytes_dirty, bytes_max);
__wt_atomic_store_uint8_relaxed(&load_control->write_load, load);
WT_STAT_CONN_SET(session, write_load, load);
load = __conn_calc_load_pct(bytes_dirty, bytes_max);
__wt_atomic_store_uint8_relaxed(&load_control->write_load, load);
WT_STAT_CONN_SET(session, write_load, load);
}
return;
}

View File

@ -2106,7 +2106,7 @@ __clayered_put(WT_SESSION_IMPL *session, WT_CURSOR_LAYERED *clayered, const WT_I
if (!leader) {
/*
* FIXME-WT-16812: Investigate whether this function can be called below the cursor layer.
* FIXME-WT-17425: Investigate whether this function can be called below the cursor layer.
* Doing so would remove the cursor write operation dependency on the truncate list.
*/
WT_RET(__wt_layered_table_truncate_detect_write_conflict(
@ -2188,7 +2188,7 @@ __clayered_remove_follower(
WT_RET(__clayered_reset_cursors(clayered, true));
/*
* FIXME-WT-16812: Investigate whether this function can be called below the cursor layer. Doing
* FIXME-WT-17425: Investigate whether this function can be called below the cursor layer. Doing
* so would remove the write cursor operations dependency on the truncate list.
*/
WT_RET(__wt_layered_table_truncate_detect_write_conflict(

View File

@ -703,7 +703,7 @@ Get an empty page instead and mark it instantiated.}
Return \c WT_NOTFOUND instead of reading the page if we are skipping deleted pages.}
@row{bt_split.c, Hook, in \c __split_parent_discard_ref,
Discard the \c page_del field of the }\c WT_REF.
Discard the \c page_del field of the \c WT_REF.}
@row{bt_split.c, Hook, in \c __split_parent,
For VLCS trees\, avoid discarding the leftmost child even if it's deleted.}

View File

@ -330,7 +330,7 @@ __wti_evict_walk(WT_SESSION_IMPL *session, WTI_EVICT_QUEUE *queue)
uint32_t evict_walk_period;
u_int loop_count, max_entries, retries, slot, start_slot;
u_int total_candidates;
bool dhandle_list_locked;
bool aggressive, dhandle_list_locked;
WT_TRACK_OP_INIT(session);
@ -429,13 +429,29 @@ retry:
continue;
}
/*
* Skip disaggregated btrees that have already been visited by the ongoing checkpoint when
* we are looking only for dirty pages and the cache is not under pressure. Every modified
* page in such a tree belongs to the next checkpoint and would fail the post-lock recheck,
* so walking only inflates the worker failure rate. When looking for clean or update pages,
* or when eviction is aggressive, walk anyway: any candidates the workers can lay hands on
* are better than starving the cache.
*/
aggressive = __wt_evict_aggressive(session);
if (!F_ISSET(evict, WT_EVICT_CACHE_CLEAN | WT_EVICT_CACHE_UPDATES) && !aggressive &&
__wt_btree_disagg_checkpointed(session, btree)) {
WT_STAT_CONN_INCR(session, eviction_server_skip_disagg_trees_checkpointed);
__evict_disagg_btree_skip_count(session, btree);
continue;
}
/*
* Skip files that are configured to stick in cache until we become aggressive.
*
* If the file is contributing heavily to our cache usage then ignore the "stickiness" of
* its pages.
*/
if (btree->evict_priority != 0 && !__wt_evict_aggressive(session) &&
if (btree->evict_priority != 0 && !aggressive &&
!__evict_btree_dominating_cache(session, btree)) {
WT_STAT_CONN_INCR(session, eviction_server_skip_trees_stick_in_cache);
__evict_disagg_btree_skip_count(session, btree);
@ -524,7 +540,7 @@ retry:
* If eviction is not in aggressive mode, sleep a bit to give the checkpoint thread a
* chance to gather its handles.
*/
if (F_ISSET_ATOMIC_32(conn, WT_CONN_CKPT_GATHER) && !__wt_evict_aggressive(session)) {
if (F_ISSET_ATOMIC_32(conn, WT_CONN_CKPT_GATHER) && !aggressive) {
__wt_sleep(0, 10);
WT_STAT_CONN_INCR(session, eviction_walk_sleeps);
}

View File

@ -2242,6 +2242,21 @@ __wt_btree_can_discard(WT_SESSION_IMPL *session)
return (__wt_materialization_check(session, rec_lsn_max));
}
/*
* __wt_btree_disagg_checkpointed --
* Return true when a disaggregated btree has been visited by the current global checkpoint and
* that checkpoint is still running. While this holds, every modified page in the btree belongs
* to the next checkpoint and cannot be evicted.
*/
static WT_INLINE bool
__wt_btree_disagg_checkpointed(WT_SESSION_IMPL *session, WT_BTREE *btree)
{
return (F_ISSET(btree, WT_BTREE_DISAGGREGATED) &&
__wt_atomic_load_uint64_acquire(&btree->checkpoint_gen) ==
__wt_gen(session, WT_GEN_CHECKPOINT) &&
__wt_atomic_load_bool_v_acquire(&S2C(session)->txn_global.checkpoint_running));
}
/*
* __wt_page_can_evict --
* Check whether a page can be evicted.
@ -2252,8 +2267,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp)
WT_BTREE *btree;
WT_PAGE *page;
WT_PAGE_MODIFY *mod;
uint64_t checkpoint_gen;
bool checkpoint_running, modified;
bool modified;
if (inmem_splitp != NULL)
*inmem_splitp = false;
@ -2374,16 +2388,10 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp)
* It is safe to evict when checkpoint is not running because we have opened a new checkpoint
* before we set the checkpoint running flag to false.
*/
if (modified && F_ISSET(btree, WT_BTREE_DISAGGREGATED) && !WT_SESSION_BTREE_SYNC(session)) {
checkpoint_gen = __wt_atomic_load_uint64_acquire(&btree->checkpoint_gen);
if (checkpoint_gen == __wt_gen(session, WT_GEN_CHECKPOINT)) {
checkpoint_running =
__wt_atomic_load_bool_v_acquire(&S2C(session)->txn_global.checkpoint_running);
if (checkpoint_running) {
WT_STAT_CONN_DSRC_INCR(session, cache_eviction_blocked_disagg_next_checkpoint);
return (false);
}
}
if (modified && !WT_SESSION_BTREE_SYNC(session) &&
__wt_btree_disagg_checkpointed(session, btree)) {
WT_STAT_CONN_DSRC_INCR(session, cache_eviction_blocked_disagg_next_checkpoint);
return (false);
}
/*

View File

@ -1988,6 +1988,8 @@ static WT_INLINE bool __wt_block_eligible_for_sweep(WT_BM *bm, WT_BLOCK *block)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static WT_INLINE bool __wt_btree_can_discard(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static WT_INLINE bool __wt_btree_disagg_checkpointed(WT_SESSION_IMPL *session, WT_BTREE *btree)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static WT_INLINE bool __wt_btree_syncing_by_other_sessions(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static WT_INLINE bool __wt_cache_full(WT_SESSION_IMPL *session)

View File

@ -576,6 +576,7 @@ struct __wt_connection_stats {
int64_t eviction_root_pages_skipped;
int64_t eviction_server_skip_history_store_pages_with_updates_during_checkpoint;
int64_t eviction_server_skip_dirty_pages_during_checkpoint;
int64_t eviction_server_skip_disagg_trees_checkpointed;
int64_t eviction_server_skip_ingest_trees;
int64_t eviction_server_skip_intl_page_with_active_child;
int64_t eviction_server_skip_metatdata_with_history;

File diff suppressed because it is too large Load Diff

View File

@ -2023,6 +2023,7 @@ static const char *const __stats_connection_desc[] = {
"cache: eviction server skips clean history store pages with updates when a precise checkpoint "
"is in progress",
"cache: eviction server skips dirty pages during a running checkpoint",
"cache: eviction server skips disaggregated trees already visited by the ongoing checkpoint",
"cache: eviction server skips ingest btrees in disagg",
"cache: eviction server skips internal pages as it has an active child",
"cache: eviction server skips metadata pages with history",
@ -3107,6 +3108,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->eviction_root_pages_skipped = 0;
stats->eviction_server_skip_history_store_pages_with_updates_during_checkpoint = 0;
stats->eviction_server_skip_dirty_pages_during_checkpoint = 0;
stats->eviction_server_skip_disagg_trees_checkpointed = 0;
stats->eviction_server_skip_ingest_trees = 0;
stats->eviction_server_skip_intl_page_with_active_child = 0;
stats->eviction_server_skip_metatdata_with_history = 0;
@ -4179,6 +4181,8 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *
from, eviction_server_skip_history_store_pages_with_updates_during_checkpoint);
to->eviction_server_skip_dirty_pages_during_checkpoint +=
WT_STAT_CONN_READ(from, eviction_server_skip_dirty_pages_during_checkpoint);
to->eviction_server_skip_disagg_trees_checkpointed +=
WT_STAT_CONN_READ(from, eviction_server_skip_disagg_trees_checkpointed);
to->eviction_server_skip_ingest_trees +=
WT_STAT_CONN_READ(from, eviction_server_skip_ingest_trees);
to->eviction_server_skip_intl_page_with_active_child +=

View File

@ -10,8 +10,8 @@
/*
* Selects which truncate-list entries __truncate_search considers: those visible to the calling
* transaction (committed truncates we may need to honor) or those not visible (uncommitted
* truncates that may conflict with our writes).
* transaction (committed and within its read timestamp) or those not visible (uncommitted, or
* committed at a timestamp beyond its read timestamp) that may conflict with our writes.
*/
typedef enum { WT_TRUNCATE_SEARCH_VISIBLE, WT_TRUNCATE_SEARCH_NOT_VISIBLE } WT_TRUNCATE_SEARCH_MODE;
@ -259,15 +259,14 @@ __truncate_search(WT_SESSION_IMPL *session, WT_LAYERED_TABLE *layered_table, con
TAILQ_FOREACH (entry, &layered_table->truncateqh, q) {
WT_STAT_CONN_INCR(session, layered_truncate_list_search_entries_walked);
if (mode == WT_TRUNCATE_SEARCH_VISIBLE) {
wt_timestamp_t start_ts, durable_ts;
__truncate_read_entry_timestamps(entry, &start_ts, &durable_ts);
if (!__wt_txn_visible(session, entry->txn_id, start_ts, durable_ts))
continue;
} else if (mode == WT_TRUNCATE_SEARCH_NOT_VISIBLE) {
if (__wt_txn_visible_id(session, entry->txn_id))
continue;
}
wt_timestamp_t start_ts, durable_ts;
__truncate_read_entry_timestamps(entry, &start_ts, &durable_ts);
const bool visible = __wt_txn_visible(session, entry->txn_id, start_ts, durable_ts);
const bool want_visible = (mode == WT_TRUNCATE_SEARCH_VISIBLE);
if (visible != want_visible)
continue;
WT_RET(__key_within_truncate_range(
session, collator, &entry->start_key, &entry->stop_key, key, is_foundp));
@ -287,7 +286,7 @@ __truncate_search(WT_SESSION_IMPL *session, WT_LAYERED_TABLE *layered_table, con
* Search if the current key we are modifying conflicts with any uncommitted truncates in the
* layered table truncate list.
*
* FIXME-WT-16812: Investigate whether this function can be called below the cursor layer. Doing so
* FIXME-WT-17425: Investigate whether this function can be called below the cursor layer. Doing so
* would remove the write cursor operations dependency on the truncate list.
*/
int
@ -347,7 +346,10 @@ __wt_layered_table_truncate_detect_non_ingest_write_conflict(WT_SESSION_IMPL *se
TAILQ_FOREACH (entry, &layered_table->truncateqh, q) {
WT_STAT_CONN_INCR(session, layered_truncate_list_search_entries_walked);
if (__wt_txn_visible_id(session, entry->txn_id))
wt_timestamp_t start_ts, durable_ts;
__truncate_read_entry_timestamps(entry, &start_ts, &durable_ts);
if (__wt_txn_visible(session, entry->txn_id, start_ts, durable_ts))
continue;
/* Does the new range start within an existing range? */

File diff suppressed because it is too large Load Diff

View File

@ -92,13 +92,26 @@ class test_config09(wttest.WiredTigerTestCase):
self.assertEqual(val, 1024)
self.update_tables()
val = self.get_stat(stat.conn.checkpoint_handle_applied)
applied = self.get_stat(stat.conn.checkpoint_handle_applied)
# We cannot assert it is equal to half because there could be other
# internal tables in the count. Assert it is less than 75% and at least
# half.
self.assertGreaterEqual(val, self.ntables // 2)
self.assertLess(val, self.ntables // 4 * 3)
val = self.get_stat(stat.conn.checkpoint_handle_skipped)
self.assertNotEqual(val, 0)
self.assertGreaterEqual(applied, self.ntables // 2)
self.assertLess(applied, self.ntables // 4 * 3)
skipped = self.get_stat(stat.conn.checkpoint_handle_skipped)
self.assertNotEqual(skipped, 0)
# The handle stats should be reset at the start of each gather and
# then set to the per-checkpoint value. Two back-to-back checkpoints
# over the same working set must therefore publish the same value;
# if the reset is missing, the second value will be roughly double
# the first.
locked_1 = self.get_stat(stat.conn.checkpoint_handle_locked)
meta_checked_1 = self.get_stat(stat.conn.checkpoint_handle_meta_checked)
self.session.checkpoint()
locked_2 = self.get_stat(stat.conn.checkpoint_handle_locked)
meta_checked_2 = self.get_stat(stat.conn.checkpoint_handle_meta_checked)
self.assertEqual(locked_1, locked_2)
self.assertEqual(meta_checked_1, meta_checked_2)
self.conn.close()

View File

@ -0,0 +1,249 @@
#!/usr/bin/env python3
#
# Public Domain 2014-present MongoDB, Inc.
# Public Domain 2008-2014 WiredTiger, Inc.
#
# This is free and unencumbered software released into the public domain.
#
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
#
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# test_layered_fast_truncate18.py
# Write conflict detection for follower fast truncate (truncate-truncate
# conflicts only).
import unittest
from contextlib import closing, nullcontext
from typing import Iterable
from helper_disagg import disagg_test_class, gen_disagg_storages
from wiredtiger import WiredTigerError
from wtscenario import make_scenarios
import wttest
def range_inclusive(start: int, stop: int) -> range:
"""Return a range covering [start, stop] inclusive."""
return range(start, stop + 1)
@disagg_test_class
class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
"""
Write conflict detection for follower fast truncate (truncate-truncate
conflicts only).
"""
uris = [
("layered", {"uri": "layered:fast_truncate"}),
("table", {"uri": "table:fast_truncate"}),
]
disagg_storages = gen_disagg_storages(disagg_only=True)
scenarios = make_scenarios(disagg_storages, uris)
conn_config = 'disaggregated=(role="leader"),'
CONFLICT_MSG = "/conflict between concurrent operations/"
def session_create_config(self) -> str:
"""Return a config string for session.create() based on table URI."""
cfg = "key_format=i,value_format=S"
if self.uri.startswith("table"):
cfg += ",block_manager=disagg,type=layered"
return cfg
def auto_closing_cursor(self, session) -> closing:
"""Return a cursor that auto-closes as it goes out of scope."""
return closing(session.open_cursor(self.uri))
def auto_closing_session(self) -> closing:
"""Return a session that auto-closes as it goes out of scope."""
return closing(self.conn.open_session())
def populate(self, keys: Iterable[int]):
"""Insert each key with a placeholder value in a single transaction."""
with self.auto_closing_cursor(self.session) as cursor:
with self.transaction():
for key in keys:
cursor[key] = "v"
def setup_leader(self, keys: Iterable[int] | None = None):
"""Create the table on the leader and optionally populate stable."""
self.session.create(self.uri, self.session_create_config())
if keys is not None:
self.populate(keys)
self.session.checkpoint()
def setup_follower(self, keys: Iterable[int] | None = None):
"""Switch to follower role and optionally write keys to ingest."""
self.reopen_disagg_conn('disaggregated=(role="follower"),')
if keys is not None:
self.populate(keys)
def cursor_for_key(self, key: int | None, session):
"""Return a cursor with its key set, or None if key is None."""
if key is None:
return nullcontext(None)
cursor = self.auto_closing_cursor(session)
cursor.thing.set_key(key)
return cursor
def truncate(self, session, start_key: int | None, stop_key: int | None):
"""Execute a truncate from start to stop key inclusive."""
with (
self.cursor_for_key(start_key, session) as start,
self.cursor_for_key(stop_key, session) as stop,
):
uri = self.uri if (start is None and stop is None) else None
session.truncate(uri, start, stop, None)
def test_same_txn_truncates_no_self_conflict(self):
# A follower with stable keys 1-100.
self.setup_leader(keys=range_inclusive(1, 100))
self.setup_follower()
# Within a single transaction: truncate 30-60, then truncate 40-80.
with self.transaction():
self.truncate(self.session, 30, 60)
self.truncate(self.session, 40, 80)
# The transaction committed; no WT_ROLLBACK raised.
def test_overlapping_truncates_conflict_with_ingest(self):
# A follower with stable keys 1-100 and ingest key 45.
self.setup_leader(keys=range_inclusive(1, 100))
self.setup_follower(keys=[45])
# txn A begins a truncate over 30-60 and leaves it uncommitted.
session_a = self.session
session_a.begin_transaction()
self.truncate(session_a, 30, 60)
# txn B truncates overlapping range 40-70 and gets WT_ROLLBACK.
with (
self.auto_closing_session() as session_b,
self.transaction(session=session_b, rollback=True),
):
self.assertRaisesException(
WiredTigerError,
lambda: self.truncate(session_b, 40, 70),
self.CONFLICT_MSG,
)
def test_overlapping_truncates_conflict_no_ingest(self):
# A follower with stable keys 1-100 and an empty ingest table.
self.setup_leader(keys=range_inclusive(1, 100))
self.setup_follower()
# txn A begins a truncate over 30-60 and leaves it uncommitted.
session_a = self.session
session_a.begin_transaction()
self.truncate(session_a, 30, 60)
# txn B truncates overlapping range 40-70 and gets WT_ROLLBACK.
with (
self.auto_closing_session() as session_b,
self.transaction(session=session_b, rollback=True),
):
self.assertRaisesException(
WiredTigerError,
lambda: self.truncate(session_b, 40, 70),
self.CONFLICT_MSG,
)
def test_non_overlapping_truncates_no_conflict(self):
# A follower with stable keys 1-100.
self.setup_leader(keys=range_inclusive(1, 100))
self.setup_follower()
# txn A truncates 10-30 and leaves it uncommitted.
session_a = self.session
session_a.begin_transaction()
self.truncate(session_a, 10, 30)
# txn B truncates 50-70 (no overlap) and commits successfully.
with (
self.auto_closing_session() as session_b,
self.transaction(session=session_b),
):
self.truncate(session_b, 50, 70)
def test_rolled_back_truncate_no_residual(self):
# A follower with stable keys 1-100.
self.setup_leader(keys=range_inclusive(1, 100))
self.setup_follower()
# txn A truncates 30-60 then explicitly rolls back.
session_a = self.session
with self.transaction(session=session_a, rollback=True):
self.truncate(session_a, 30, 60)
# txn B truncates the same range 30-60 and commits without WT_ROLLBACK.
with (
self.auto_closing_session() as session_b,
self.transaction(session=session_b),
):
self.truncate(session_b, 30, 60)
def test_invisible_committed_truncate_conflicts(self):
# A follower with stable keys 1-100.
self.setup_leader(keys=range_inclusive(1, 100))
self.setup_follower()
# txn A commits a truncate over 30-60 at ts=10 (invisible to txn B).
self.conn.set_timestamp("oldest_timestamp=" + self.timestamp_str(1))
with self.transaction(commit_timestamp=10):
self.truncate(self.session, 30, 60)
# txn B (read_ts=5) truncates overlapping range 40-70 and gets
# WT_ROLLBACK.
with (
self.auto_closing_session() as session_b,
self.transaction(
session=session_b, read_timestamp=5, rollback=True
),
):
self.assertRaisesException(
WiredTigerError,
lambda: self.truncate(session_b, 40, 70),
self.CONFLICT_MSG,
)
def test_visible_committed_truncate_no_conflict(self):
# A follower with stable keys 1-100.
self.setup_leader(keys=range_inclusive(1, 100))
self.setup_follower()
# txn A commits a truncate over 30-60 at ts=5 (visible to txn B).
self.conn.set_timestamp("oldest_timestamp=" + self.timestamp_str(1))
with self.transaction(commit_timestamp=5):
self.truncate(self.session, 30, 60)
# txn B (read_ts=10) truncates overlapping range 40-70 without
# WT_ROLLBACK.
with (
self.auto_closing_session() as session_b,
self.transaction(session=session_b, read_timestamp=10),
):
self.truncate(session_b, 40, 70)
if __name__ == "__main__":
wttest.run()