Import wiredtiger: 9d2e4ce64fa8248ce21daa252e79528da59bc5d5 from branch mongodb-master (#54508)

GitOrigin-RevId: f0a6209b8e95fa30174606036a240e325e2d1947
This commit is contained in:
Alex Pullen 2026-05-27 15:10:04 +10:00 committed by MongoDB Bot
parent 837806ab05
commit e7ebec401e
59 changed files with 1112 additions and 1601 deletions

View File

@ -413,34 +413,34 @@ if(ENABLE_DEBUG_INFO AND NOT WT_DEBUG_FLAGS_INITIALIZED)
set(BUILD_TYPES_WITH_DEBUG_INFO ${BUILD_MODES})
list(REMOVE_ITEM BUILD_TYPES_WITH_DEBUG_INFO Release)
set(DEBUG_INFO_FLAGS)
if(GNU_C_COMPILER OR CLANG_C_COMPILER)
# Higher debug levels `-g3`/`-ggdb3` emit additional debug information, including
# macro definitions that allow us to evaluate macros such as `p S2C(session)` inside of gdb.
# This needs to be in DWARF version 2 format or later - and should be by default - but
# we'll specify version 4 here to be safe.
list(APPEND DEBUG_INFO_FLAGS -g3 -gdwarf-4)
# DWARF v4 is supplied explicitly to be safe across toolchain defaults.
set(debug_info_flags "-g3 -gdwarf-4")
if(CLANG_C_COMPILER)
# Clang requires one additional flag to output macro debug information.
list(APPEND DEBUG_INFO_FLAGS -glldb -fdebug-macro)
string(APPEND debug_info_flags " -glldb -fdebug-macro")
else()
list(APPEND DEBUG_INFO_FLAGS -ggdb3)
string(APPEND debug_info_flags " -ggdb3")
endif()
add_cmake_compiler_flags(
FLAGS ${DEBUG_INFO_FLAGS}
LANGUAGES C CXX
BUILD_TYPES ${BUILD_TYPES_WITH_DEBUG_INFO}
)
foreach(build_type IN LISTS BUILD_TYPES_WITH_DEBUG_INFO)
string(TOUPPER "${build_type}" BT)
set(CMAKE_C_FLAGS_${BT}
"${CMAKE_C_FLAGS_${BT}} ${debug_info_flags}" CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_${BT}
"${CMAKE_CXX_FLAGS_${BT}} ${debug_info_flags}" CACHE STRING "" FORCE)
endforeach()
endif()
# MSVC: ensure linker produces PDBs.
if(MSVC_C_COMPILER)
add_cmake_linker_flags(
FLAGS "/DEBUG"
BINARIES EXE SHARED
BUILD_TYPES ${BUILD_TYPES_WITH_DEBUG_INFO}
)
foreach(build_type IN LISTS BUILD_TYPES_WITH_DEBUG_INFO)
string(TOUPPER "${build_type}" BT)
set(CMAKE_EXE_LINKER_FLAGS_${BT}
"${CMAKE_EXE_LINKER_FLAGS_${BT}} /DEBUG" CACHE STRING "" FORCE)
set(CMAKE_SHARED_LINKER_FLAGS_${BT}
"${CMAKE_SHARED_LINKER_FLAGS_${BT}} /DEBUG" CACHE STRING "" FORCE)
endforeach()
endif()
# Mark that we've set the initial debug flags

View File

@ -80,29 +80,36 @@ function(define_build_mode mode)
string(REPLACE ";" " " cxx_flags "${DEFINE_BUILD_CXX_COMPILER_FLAGS}")
string(REPLACE ";" " " linker_flags "${linker_flags}")
string(TOUPPER ${mode} build_mode)
set(CMAKE_C_FLAGS_${build_mode}
"${c_flags}" CACHE STRING
"Flags used by the C compiler for ${mode} build type or configuration." FORCE)
set(CMAKE_CXX_FLAGS_${build_mode}
"${cxx_flags}" CACHE STRING
"Flags used by the C++ compiler for ${mode} build type or configuration." FORCE)
# Seed the default flags for this build mode exactly once per build dir.
if(NOT WT_BUILD_MODE_${build_mode}_FLAGS_INITIALIZED)
set(CMAKE_C_FLAGS_${build_mode}
"${c_flags}" CACHE STRING
"Flags used by the C compiler for ${mode} build type or configuration." FORCE)
set(CMAKE_EXE_LINKER_FLAGS_${build_mode}
"${linker_flags}" CACHE STRING
"Linker flags to be used to create executables for ${mode} build type." FORCE)
set(CMAKE_CXX_FLAGS_${build_mode}
"${cxx_flags}" CACHE STRING
"Flags used by the C++ compiler for ${mode} build type or configuration." FORCE)
set(CMAKE_SHARED_LINKER_FLAGS_${build_mode}
"${linker_flags}" CACHE STRING
"Linker flags to be used to create shared libraries for ${mode} build type." FORCE)
set(CMAKE_EXE_LINKER_FLAGS_${build_mode}
"${linker_flags}" CACHE STRING
"Linker flags to be used to create executables for ${mode} build type." FORCE)
set(CMAKE_MODULE_LINKER_FLAGS_${build_mode}
"${linker_flags}" CACHE STRING
"Linker flags to be used to create shared modules for ${mode} build type." FORCE)
set(CMAKE_SHARED_LINKER_FLAGS_${build_mode}
"${linker_flags}" CACHE STRING
"Linker flags to be used to create shared libraries for ${mode} build type." FORCE)
set(CMAKE_MODULE_LINKER_FLAGS_${build_mode}
"${linker_flags}" CACHE STRING
"Linker flags to be used to create shared modules for ${mode} build type." FORCE)
set(WT_BUILD_MODE_${build_mode}_FLAGS_INITIALIZED TRUE CACHE INTERNAL
"WiredTiger ${mode} build mode flags have been initialized")
endif()
mark_as_advanced(
CMAKE_CXX_FLAGS_${build_mode}
CMAKE_C_FLAGS_${build_mode}
CMAKE_CXX_FLAGS_${build_mode}
CMAKE_EXE_LINKER_FLAGS_${build_mode}
CMAKE_SHARED_LINKER_FLAGS_${build_mode}
CMAKE_MODULE_LINKER_FLAGS_${build_mode}

View File

@ -441,99 +441,6 @@ function(add_cmake_flag included_flags flag)
endif()
endfunction()
# add_cmake_compiler_flags(FLAGS <flags...> LANGUAGES <languages...> BUILD_TYPES <build_types...>)
# Append compiler flags to the per-build-type flag variable of every requested language, i.e.
# CMAKE_<LANGUAGE>_FLAGS_<BUILD_TYPE>. Each flag is handed to add_cmake_flag individually, which
# is relied upon to avoid adding the same flag twice.
#   FLAGS <flags...> - one or more compilation flags to add
#   LANGUAGES <languages...> - one or more languages (C, CXX, etc.)
#   BUILD_TYPES <build_types...> - one or more build types (Debug, RelWithDebInfo, Release, etc.)
# Any missing keyword aborts the configure step with a FATAL_ERROR.
function(add_cmake_compiler_flags)
    cmake_parse_arguments(PARSE_ARGV 0 "COMPILER_FLAGS" "" "" "FLAGS;LANGUAGES;BUILD_TYPES")

    # All three multi-value keywords are mandatory; report the first one that is missing.
    foreach(required_keyword FLAGS LANGUAGES BUILD_TYPES)
        if(NOT COMPILER_FLAGS_${required_keyword})
            message(FATAL_ERROR "add_cmake_compiler_flags: ${required_keyword} argument is required")
        endif()
    endforeach()

    # Visit every language/build-type pair and add the flags one at a time.
    foreach(language ${COMPILER_FLAGS_LANGUAGES})
        foreach(config ${COMPILER_FLAGS_BUILD_TYPES})
            # CMake's per-configuration variables use the upper-cased build type name.
            string(TOUPPER "${config}" config_upper)
            set(flags_variable "CMAKE_${language}_FLAGS_${config_upper}")

            # Make sure the target variable exists before flags are appended to it.
            if(NOT DEFINED ${flags_variable})
                set(${flags_variable} "")
            endif()

            foreach(compiler_flag ${COMPILER_FLAGS_FLAGS})
                add_cmake_flag(${flags_variable} "${compiler_flag}")
            endforeach()
        endforeach()
    endforeach()
endfunction()
# add_cmake_linker_flags(FLAGS <flags...> BINARIES <binaries...> BUILD_TYPES <build_types...>)
# Append linker flags to the per-build-type linker flag variable of every requested binary type,
# i.e. CMAKE_<BINARY>_LINKER_FLAGS_<BUILD_TYPE>. Each flag is handed to add_cmake_flag
# individually, which is relied upon to avoid adding the same flag twice.
#   FLAGS <flags...> - one or more linker flags to add
#   BINARIES <binaries...> - one or more binary types (EXE, SHARED, MODULE, etc.)
#   BUILD_TYPES <build_types...> - one or more build types (Debug, RelWithDebInfo, Release, etc.)
# Any missing keyword aborts the configure step with a FATAL_ERROR.
function(add_cmake_linker_flags)
    cmake_parse_arguments(PARSE_ARGV 0 "LINKER_FLAGS" "" "" "FLAGS;BINARIES;BUILD_TYPES")

    # All three multi-value keywords are mandatory; report the first one that is missing.
    foreach(required_keyword FLAGS BINARIES BUILD_TYPES)
        if(NOT LINKER_FLAGS_${required_keyword})
            message(FATAL_ERROR "add_cmake_linker_flags: ${required_keyword} argument is required")
        endif()
    endforeach()

    # Visit every binary-type/build-type pair and add the flags one at a time.
    foreach(binary_kind ${LINKER_FLAGS_BINARIES})
        foreach(config ${LINKER_FLAGS_BUILD_TYPES})
            # CMake's per-configuration variables use the upper-cased build type name.
            string(TOUPPER "${config}" config_upper)
            set(flags_variable "CMAKE_${binary_kind}_LINKER_FLAGS_${config_upper}")

            # Make sure the target variable exists before flags are appended to it.
            if(NOT DEFINED ${flags_variable})
                set(${flags_variable} "")
            endif()

            foreach(linker_flag ${LINKER_FLAGS_FLAGS})
                add_cmake_flag(${flags_variable} "${linker_flag}")
            endforeach()
        endforeach()
    endforeach()
endfunction()
# replace_compile_options(flag_var [REMOVE <flags...>] [ADD <flags...>])
# A helper function that removes specified compiler flags from a flag variable and optionally adds new ones.
# This is useful for replacing default compiler flags with custom ones while maintaining clean flag strings.

View File

@ -4,11 +4,9 @@ include(cmake/rcpc_test.cmake)
# ARMv8-A is the 64-bit ARM architecture, turn on the optional CRC.
# If the compilation check in rcpc_test passes also turn on the RCpc instructions.
if(HAVE_RCPC)
add_cmake_flag(CMAKE_C_FLAGS -march=armv8.2-a+rcpc+crc)
add_cmake_flag(CMAKE_CXX_FLAGS -march=armv8.2-a+rcpc+crc)
add_compile_options(-march=armv8.2-a+rcpc+crc)
else()
add_cmake_flag(CMAKE_C_FLAGS -march=armv8-a+crc)
add_cmake_flag(CMAKE_CXX_FLAGS -march=armv8-a+crc)
add_compile_options(-march=armv8-a+crc)
endif()
# moutline-atomics preserves backwards compatibility with Arm v8.0 systems but also supports
@ -17,6 +15,6 @@ endif()
# the flag.
check_c_compiler_flag("-moutline-atomics" has_moutline_atomics)
if(has_moutline_atomics)
add_cmake_flag(CMAKE_C_FLAGS -moutline-atomics)
add_compile_options(-moutline-atomics)
endif()
unset(has_moutline_atomics CACHE)

View File

@ -1,4 +1,3 @@
# See https://www.sifive.com/blog/all-aboard-part-1-compiler-args
# for background on the `rv64imafdc` and `lp64d` arguments here.
add_cmake_flag(CMAKE_C_FLAGS -march=rv64imafdc)
add_cmake_flag(CMAKE_C_FLAGS -mabi=lp64d)
add_compile_options(-march=rv64imafdc -mabi=lp64d)

View File

@ -1,6 +1,5 @@
set(WT_POSIX ON CACHE BOOL "")
# Linux requires '_GNU_SOURCE' to be defined for access to GNU/Linux extension functions
# e.g. Access to 'pthread_setname_np' on Linux. Append this macro to our compiler flags
# for Linux-based builds.
add_cmake_flag(CMAKE_C_FLAGS -D_GNU_SOURCE)
# e.g. 'pthread_setname_np'.
add_compile_definitions(_GNU_SOURCE)

View File

@ -689,8 +689,7 @@ connection_runtime_config = [
if true, for operations with snapshot isolation the cursor temporarily releases any page
that requires force eviction, then repositions back to the page for further operations.
A page release encourages eviction of hot or large pages, which is more likely to
succeed without a cursor keeping the page pinned. Note: This setting is not compatible
with disaggregated storage.''',
succeed without a cursor keeping the page pinned.''',
type='boolean'),
Config('disagg_address_cookie_upgrade', 'none', r'''
modify the disaggregated block manager to pretend that it is a newer version to test
@ -2238,7 +2237,13 @@ methods = {
),
'WT_CONNECTION.set_file_system' : Method([]),
'WT_CONNECTION.set_key_provider' : Method([]),
'WT_CONNECTION.set_key_provider' : Method([
Config('version', '0', r'''
the key provider API version. Version 0 uses the pull model
(WiredTiger calls WT_KEY_PROVIDER::get_key). Version 1 uses
the push model''',
min=0, max=1),
]),
'WT_CONNECTION.load_extension' : Method([
Config('config', '', r'''

View File

@ -125,21 +125,23 @@ ENDOFTEXT
# Parallel execution: if it's the main invocation of the script, collect the file names
# to process and run them in subprocesses.
# Search for files, skipping some well-known 3rd party directories.
find [a-z]* -name '*.[ch]' \
# Search for files in explicit source directories, skipping any absent in this tree.
dirs=()
for d in bench dist docs examples ext lang oss src test tools; do
[ -d "$d" ] && dirs+=("$d")
done
find "${dirs[@]}" \
-name '*.[ch]' \
-o -name '*.cpp' \
-o -name '*.in' \
-o -name '*.py' \
-o -name '*.swig' |
sed -e '/Makefile.in/d' \
-e '/^build\//d' \
-e '/^cmake\//d' \
-e '/checksum\/power8\//d' \
-e '/checksum\/zseries\//d' \
-e '/\/3rdparty\//d' \
-e '/\/node_modules\//d' \
-e '/^tools\/wt-mcp\/\.venv\//d' \
-e '/^venv\//d' \
-e '/dist\/__/d' \
-e 's/^\.\///' |
do_in_parallel || RET=1

View File

@ -26,23 +26,12 @@ fi
# Get what could be the ticket id.
ticket_id=$(echo "$branch_name" | cut -d "-" -f-2)
search_function="grep -Iinr --exclude-dir=.git"
# Find the name of the build folders WiredTiger has been compiled in.
# Users can name this folder anything, but it needs to be in the rootdir and to contain CMakeFiles
build_files=$(find ../ -maxdepth 2 -name CMakeFiles)
for build_dir in $build_files; do
build_folder=$(basename $(dirname $build_dir))
search_function="$search_function --exclude-dir=$build_folder"
done
search_function="$search_function $ticket_id ../ 2>&1"
# Check for comments related to the ticket.
if eval "$search_function >/dev/null" ; then
echo "There are comments mentioning $ticket_id in the code, please check if they need to be \
resolved:"
eval "$search_function"
# Check for comments related to the ticket. git grep searches only tracked files, so build
# directories and temporary files are excluded automatically. Note: newly created files that are
# not yet added to git will be missed, but they can be checked once they are tracked.
if git -C .. grep -Iin "$ticket_id" > /dev/null 2>&1; then
echo "There are comments mentioning $ticket_id in the code, please check if they need to be resolved:"
git -C .. grep -Iin "$ticket_id"
fi
exit 0

View File

@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger",
"branch": "mongodb-master",
"commit": "6f3dbbf2ed12faffad4a3e274d012c61e58874f5"
"commit": "9d2e4ce64fa8248ce21daa252e79528da59bc5d5"
}

View File

@ -1612,10 +1612,6 @@ __debug_update_dump_flags(WT_DBG *ds, WT_UPDATE *upd)
ds->f(ds, ", prepare-restored-from-ds"));
++flag_num;
}
if (F_ISSET(upd, WT_UPDATE_PREPARE_ROLLBACK)) {
WT_RET(flag_num == 0 ? ds->f(ds, "prepare-rollback") : ds->f(ds, ", prepare-rollback"));
++flag_num;
}
if (F_ISSET(upd, WT_UPDATE_RESTORED_FAST_TRUNCATE)) {
WT_RET(flag_num == 0 ? ds->f(ds, "fast-truncate") : ds->f(ds, ", fast-truncate"));
++flag_num;

View File

@ -396,18 +396,6 @@ __wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UP
if (__wt_atomic_load_uint64_v_relaxed(&upd->txnid) == WT_TXN_ABORTED)
continue;
/*
* Prepare transaction rollback adds a globally visible tombstone to the update chain to
* remove the entire key. Treating these globally visible tombstones as obsolete and
* trimming update list can cause problems if the update chain is getting accessed somewhere
* else. To avoid this problem, skip these globally visible tombstones from the update
* obsolete check.
*/
if (F_ISSET(upd, WT_UPDATE_PREPARE_ROLLBACK)) {
first = NULL;
continue;
}
/* Cannot truncate the updates if we need to remove the updates from the history store. */
if (F_ISSET(upd, WT_UPDATE_HS_MAX_STOP)) {
first = NULL;

View File

@ -901,6 +901,17 @@ static const uint8_t confchk_WT_CONNECTION_rollback_to_stable_jump[WT_CONFIG_JUM
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
static const WT_CONFIG_CHECK confchk_WT_CONNECTION_set_key_provider[] = {
{"version", "int", NULL, "min=0,max=1", NULL, 0, NULL, WT_CONFIG_COMPILED_TYPE_INT, 70, 0, 1,
NULL},
{NULL, NULL, NULL, NULL, NULL, 0, NULL, 0, 0, 0, 0, NULL}};
static const uint8_t confchk_WT_CONNECTION_set_key_provider_jump[WT_CONFIG_JUMP_TABLE_SIZE] = {0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1};
static const WT_CONFIG_CHECK confchk_WT_CONNECTION_set_timestamp[] = {
{"durable_timestamp", "string", NULL, NULL, NULL, 0, NULL, WT_CONFIG_COMPILED_TYPE_STRING, 3,
INT64_MIN, INT64_MAX, NULL},
@ -4210,7 +4221,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
confchk_WT_CONNECTION_rollback_to_stable, 2, confchk_WT_CONNECTION_rollback_to_stable_jump, 12,
WT_CONF_SIZING_NONE, false},
{"WT_CONNECTION.set_file_system", "", NULL, 0, NULL, 13, WT_CONF_SIZING_NONE, false},
{"WT_CONNECTION.set_key_provider", "", NULL, 0, NULL, 14, WT_CONF_SIZING_NONE, false},
{"WT_CONNECTION.set_key_provider", "version=0", confchk_WT_CONNECTION_set_key_provider, 1,
confchk_WT_CONNECTION_set_key_provider_jump, 14, WT_CONF_SIZING_NONE, false},
{"WT_CONNECTION.set_timestamp",
"durable_timestamp=,force=false,oldest_timestamp=,"
"stable_disaggregated_schema_epoch=,stable_timestamp=",

View File

@ -1448,7 +1448,7 @@ __conn_open_session(WT_CONNECTION *wt_conn, WT_EVENT_HANDLER *event_handler, con
session_ret = NULL;
WT_ERR(__wt_open_session(conn, event_handler, config, true, &session_ret));
session_ret->name = "connection-open-session";
__wt_atomic_store_ptr_relaxed(&session_ret->name, "connection-open-session");
*wt_sessionp = &session_ret->iface;
err:
@ -2907,16 +2907,13 @@ err:
static int
__conn_set_key_provider(WT_CONNECTION *wt_conn, WT_KEY_PROVIDER *key_provider, const char *config)
{
WT_CONFIG_ITEM cval;
WT_CONNECTION_IMPL *conn;
WT_DECL_RET;
WT_SESSION_IMPL *session;
conn = (WT_CONNECTION_IMPL *)wt_conn;
CONNECTION_API_CALL_NOCONF(conn, session, set_key_provider);
/* The configuration string has no use but may be useful at a later time. */
if (config != NULL)
WT_ERR_MSG(session, EINVAL, "key provider configuration currently not supported.");
CONNECTION_API_CALL(conn, session, set_key_provider, config, cfg);
/* You can only enable the key provider system in disaggregated mode. */
if (__wt_conn_is_disagg(session))
@ -2928,6 +2925,10 @@ __conn_set_key_provider(WT_CONNECTION *wt_conn, WT_KEY_PROVIDER *key_provider, c
if (conn->key_provider != NULL)
WT_ERR_MSG(session, EINVAL, "key provider system must be configured with early_load set");
WT_ERR(__wt_config_gets(session, cfg, "version", &cval));
if (cval.val == 1)
F_SET(conn, WT_CONN_KEY_PROVIDER_PUSH);
conn->key_provider = key_provider;
err:

View File

@ -435,6 +435,10 @@ __wt_disagg_put_crypt_helper(WT_SESSION_IMPL *session)
if (session->ckpt.crash_trigger_point == KEY_PROVIDER_CRASH_BEFORE_KEY_ROTATION)
__wt_debug_crash(session);
/* The pull-model get_key API is disabled when the push-model is configured. */
if (F_ISSET(conn, WT_CONN_KEY_PROVIDER_PUSH))
return (ENOTSUP);
/* Check for a new encryption key data. If the size is 0, there is none so we can skip. */
WT_ERR(key_provider->get_key(key_provider, (WT_SESSION *)session, &crypt));
if (crypt.keys.size == 0)

View File

@ -1155,13 +1155,11 @@ __clayered_iterate_constituents(WT_CURSOR_LAYERED *clayered, uint32_t iter_flag)
* prepared conflict occurs. Prepared updates are always ignored on the stable cursor, making it
* safe to check the WT_CURSTD_KEY_INT flag.
*/
if (((WT_CURSOR_BTREE *)c_ingest)->ref == NULL && !F_ISSET(c_stable, WT_CURSTD_KEY_INT)) {
/*
* Move the stable cursor first to ensure it is advanced, even if a prepared conflict occurs
* on the ingest cursor.
*/
WT_ERR_NOTFOUND_OK(__clayered_constituent_iter_helper(clayered, c_stable, forward), false);
bool fresh_start =
(((WT_CURSOR_BTREE *)c_ingest)->ref == NULL && !F_ISSET(c_stable, WT_CURSTD_KEY_INT));
if (fresh_start) {
WT_ERR_NOTFOUND_OK(__clayered_constituent_iter_helper(clayered, c_ingest, forward), false);
WT_ERR_NOTFOUND_OK(__clayered_constituent_iter_helper(clayered, c_stable, forward), false);
goto done;
}
@ -1226,7 +1224,13 @@ __clayered_iterate_constituents(WT_CURSOR_LAYERED *clayered, uint32_t iter_flag)
done:
err:
if (ret == 0 || ret == WT_PREPARE_CONFLICT) {
if (ret == WT_PREPARE_CONFLICT && fresh_start)
/*
* Prepare conflict on the very first key of a fresh walk: ingest is blocked before stable
* has advanced. Reset ingest so the next call restarts cleanly.
*/
WT_TRET(__clayered_reset_cursors(clayered, false));
else if (ret == 0 || ret == WT_PREPARE_CONFLICT) {
if (!F_ISSET(clayered, iter_flag)) {
F_CLR(clayered, WT_CLAYERED_ITERATE_NEXT | WT_CLAYERED_ITERATE_PREV);
F_SET(clayered, iter_flag);
@ -2961,14 +2965,11 @@ __wt_clayered_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner,
WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
if (cval.len != 0)
WT_RET_MSG(session, ENOTSUP, "Layered trees do not support opening by checkpoint");
WT_RET_MSG(session, EINVAL, "Layered trees do not support opening by checkpoint");
WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
if (cval.val != 0)
WT_RET_MSG(session, ENOTSUP, "Layered trees do not support bulk loading");
if (FLD_ISSET(S2C(session)->debug.flags, WT_CONN_DEBUG_CURSOR_REPOSITION))
WT_RET_MSG(session, ENOTSUP, "Layered trees do not support cursor reposition");
WT_RET_MSG(session, EINVAL, "Layered trees do not support bulk loading");
/* Get the layered tree, and hold a reference to it until the cursor is closed. */
WT_RET(__wt_session_get_dhandle(session, uri, NULL, cfg, 0));

View File

@ -186,13 +186,7 @@ static WT_INLINE WT_UPDATE *
__curversion_tombstone_next_upd(
WT_SESSION_IMPL *session, WT_CURSOR_VERSION *version_cursor, WT_UPDATE *tombstone)
{
/*
* show_prepared_rollback currently targets ingest-table style rollback updates (in-memory
* trees), where rollback metadata lives on aborted prepared value updates and no globally
* visible tombstone with PREPARE_ROLLBACK flag is prepended. If this feature is extended to
* non-in-memory trees, we need additional handling for globally visible PREPARE_ROLLBACK
* tombstones and their underlying aborted value updates.
*/
/* Stop at a globally visible tombstone: nothing older is relevant. */
if (__wt_txn_upd_visible_all(session, tombstone))
return (NULL);

View File

@ -1549,20 +1549,19 @@ struct __wt_update {
/* When introducing a new flag, consider adding it to WT_UPDATE_SELECT_FOR_DS. */
/* AUTOMATIC FLAG VALUE GENERATION START 0 */
#define WT_UPDATE_DELETE_DURABLE 0x0001u /* Key has been removed from disk image. */
#define WT_UPDATE_DS 0x0002u /* Update has been chosen to the data store. */
#define WT_UPDATE_DURABLE 0x0004u /* Update has been durable. */
#define WT_UPDATE_HS 0x0008u /* Update has been written to hs. */
#define WT_UPDATE_HS_MAX_STOP 0x0010u /* Update has been written to hs with a max stop. */
#define WT_UPDATE_PREPARE_DURABLE 0x0020u /* Prepared update has been durable. */
#define WT_UPDATE_PREPARE_RESTORED_FROM_DS 0x0040u /* Prepared update restored from data store. */
#define WT_UPDATE_PREPARE_ROLLBACK 0x0080u /* Tombstone that rolled back by a prepared update.*/
#define WT_UPDATE_RESTORED_FAST_TRUNCATE 0x0100u /* Fast truncate instantiation. */
#define WT_UPDATE_RESTORED_FROM_DS 0x0200u /* Update restored from data store. */
#define WT_UPDATE_RESTORED_FROM_HS 0x0400u /* Update restored from history store. */
#define WT_UPDATE_RESTORED_FROM_INGEST 0x0800u /* Update restored from ingest btree. */
#define WT_UPDATE_RTS_DRYRUN_ABORT 0x1000u /* Used by dry run to mark a would-be abort. */
/* AUTOMATIC FLAG VALUE GENERATION STOP 16 */
#define WT_UPDATE_DELETE_DURABLE 0x001u /* Key has been removed from disk image. */
#define WT_UPDATE_DS 0x002u /* Update has been chosen to the data store. */
#define WT_UPDATE_DURABLE 0x004u /* Update has been durable. */
#define WT_UPDATE_HS 0x008u /* Update has been written to hs. */
#define WT_UPDATE_HS_MAX_STOP 0x010u /* Update has been written to hs with a max stop. */
#define WT_UPDATE_PREPARE_DURABLE 0x020u /* Prepared update has been durable. */
#define WT_UPDATE_PREPARE_RESTORED_FROM_DS 0x040u /* Prepared update restored from data store. */
#define WT_UPDATE_RESTORED_FAST_TRUNCATE 0x080u /* Fast truncate instantiation. */
#define WT_UPDATE_RESTORED_FROM_DS 0x100u /* Update restored from data store. */
#define WT_UPDATE_RESTORED_FROM_HS 0x200u /* Update restored from history store. */
#define WT_UPDATE_RESTORED_FROM_INGEST 0x400u /* Update restored from ingest btree. */
#define WT_UPDATE_RTS_DRYRUN_ABORT 0x800u /* Used by dry run to mark a would-be abort. */
/* AUTOMATIC FLAG VALUE GENERATION STOP 16 */
uint16_t flags;
/* There are several cases we should select the update irrespective of visibility to write to the

View File

@ -1083,6 +1083,280 @@ __wt_cell_leaf_value_parse(WT_PAGE *page, WT_CELL *cell)
return (WT_ERROR); \
} while (0)
/*
* __cell_unpack_addr_cell --
* Unpack the validity window and optional fast-truncate record for an addr cell.
*
* *pp is the read cursor into the cell: it is stepped past the second descriptor byte here and
* handed to __wt_vunpack_uint for every packed field. The time aggregate is written to
* unpack_addr->ta and the fast-truncate record, when present, to unpack_addr->page_del.
*
* When end is NULL, 0 is passed to __wt_vunpack_uint as the remaining length and false is passed
* to the validity check (presumably meaning "no bounds checking / not verifying" -- confirm
* against those helpers).
*
* Returns WT_ERROR if unpack_addr is NULL, 0 on success; the unpack and validity-check calls are
* wrapped in WT_RET.
*/
static WT_INLINE int
__cell_unpack_addr_cell(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CELL *cell,
const uint8_t **pp, const void *end, WT_CELL_UNPACK_ADDR *unpack_addr)
{
WT_PAGE_DELETED *page_del;
WT_TIME_AGGREGATE *ta;
uint8_t flags;
bool has_fast_truncate, prepare_fast_truncate;
/* Return an error if we're not unpacking a cell of this type. */
if (unpack_addr == NULL)
return (WT_ERROR);
ta = &unpack_addr->ta;
/*
* A fast-truncate record is only read for a WT_CELL_ADDR_DEL cell on a page whose header has
* WT_PAGE_FT_UPDATE set.
*/
has_fast_truncate = unpack_addr->raw == WT_CELL_ADDR_DEL && F_ISSET(dsk, WT_PAGE_FT_UPDATE);
prepare_fast_truncate = false;
if ((cell->__chunk[0] & WT_CELL_SECOND_DESC) != 0) {
flags = *(*pp)++; /* skip second descriptor byte */
WT_CELL_LEN_CHK(*pp, 0, dsk, end);
/*
* On a fast-truncate cell the prepare bit describes the page_del record unpacked below, not
* the time aggregate.
*/
if (LF_ISSET(WT_CELL_PREPARE)) {
if (has_fast_truncate)
prepare_fast_truncate = true;
else
ta->prepare = 1;
}
/*
* Each field is present only if its flag bit is set, and the fields are read in this fixed
* order.
*/
if (LF_ISSET(WT_CELL_TS_START))
WT_RET(
__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &ta->oldest_start_ts));
if (LF_ISSET(WT_CELL_TXN_START))
WT_RET(__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &ta->newest_txn));
if (LF_ISSET(WT_CELL_TS_DURABLE_START)) {
WT_RET(__wt_vunpack_uint(
pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &ta->newest_start_durable_ts));
/* Stored as a delta from the oldest start timestamp. */
ta->newest_start_durable_ts += ta->oldest_start_ts;
}
if (LF_ISSET(WT_CELL_TS_STOP)) {
WT_RET(
__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &ta->newest_stop_ts));
/* Stored as a delta from the oldest start timestamp. */
ta->newest_stop_ts += ta->oldest_start_ts;
}
if (LF_ISSET(WT_CELL_TXN_STOP)) {
WT_RET(
__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &ta->newest_stop_txn));
/* Stored as a delta from the newest transaction id. */
ta->newest_stop_txn += ta->newest_txn;
}
if (LF_ISSET(WT_CELL_TS_DURABLE_STOP)) {
WT_RET(__wt_vunpack_uint(
pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &ta->newest_stop_durable_ts));
/* Stored as a delta from the newest stop timestamp. */
ta->newest_stop_durable_ts += ta->newest_stop_ts;
}
WT_RET(__wt_check_addr_validity(session, ta, end != NULL));
}
/* Without a fast-truncate record there is nothing further to unpack here. */
if (!has_fast_truncate)
return (0);
/* Unpack the fast-truncate page_del record. */
page_del = &unpack_addr->page_del;
WT_RET(
__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), (uint64_t *)&page_del->txnid));
if (prepare_fast_truncate) {
/* Prepared truncate: the transaction id is followed by the prepare timestamp and prepared id. */
page_del->prepare_state = WT_PREPARE_INPROGRESS;
page_del->committed = false;
WT_RET(
__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &page_del->prepare_ts));
/* The prepare timestamp doubles as the start timestamp of the deletion. */
page_del->pg_del_start_ts = page_del->prepare_ts;
WT_RET(
__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &page_del->prepared_id));
/* Explicitly initialize the durable timestamp to WT_TS_NONE. */
page_del->pg_del_durable_ts = WT_TS_NONE;
WT_ASSERT_ALWAYS(session,
!F_ISSET(S2C(session), WT_CONN_PRESERVE_PREPARED) ||
page_del->prepared_id != WT_PREPARED_ID_NONE,
"Read prepared record with no prepared id when preserve prepared is enabled.");
} else {
/* Committed truncate: the transaction id is followed by the start and durable timestamps. */
page_del->prepare_state = WT_PREPARE_INIT;
page_del->committed = true;
WT_RET(__wt_vunpack_uint(
pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &page_del->pg_del_start_ts));
WT_RET(__wt_vunpack_uint(
pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &page_del->pg_del_durable_ts));
}
page_del->selected_for_write = true;
return (0);
}
/*
* __cell_unpack_value_window --
* Unpack the validity window for a value cell (called when WT_CELL_SECOND_DESC is set).
*
* flags is the cell's second descriptor byte; each packed field is read only if its flag bit is
* set. The timestamps are read into temporaries first because their meaning depends on whether
* WT_CELL_PREPARE is set; the transaction ids are read directly into tw. *pp is handed to
* __wt_vunpack_uint for every packed field.
*
* When end is NULL, 0 is passed to __wt_vunpack_uint as the remaining length and false is passed
* to the validity check (presumably meaning "no bounds checking / not verifying" -- confirm
* against those helpers).
*
* Returns 0 on success; the unpack and validity-check calls are wrapped in WT_RET.
*/
static WT_INLINE int
__cell_unpack_value_window(
WT_SESSION_IMPL *session, const uint8_t **pp, const void *end, uint8_t flags, WT_TIME_WINDOW *tw)
{
wt_timestamp_t temp_start_ts, temp_durable_start_ts, temp_stop_ts, temp_durable_stop_ts;
/*
* Defaults for fields absent from the cell: WT_TS_NONE for the start, durable start and durable
* stop values, WT_TS_MAX for the stop value.
*/
temp_start_ts = temp_durable_start_ts = temp_durable_stop_ts = WT_TS_NONE;
temp_stop_ts = WT_TS_MAX;
if (LF_ISSET(WT_CELL_TS_START))
WT_RET(__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &temp_start_ts));
if (LF_ISSET(WT_CELL_TXN_START))
WT_RET(__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &tw->start_txn));
if (LF_ISSET(WT_CELL_TS_DURABLE_START))
WT_RET(
__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &temp_durable_start_ts));
if (LF_ISSET(WT_CELL_TS_STOP))
WT_RET(__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &temp_stop_ts));
if (LF_ISSET(WT_CELL_TXN_STOP)) {
WT_RET(__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &tw->stop_txn));
/* The stop transaction id is stored as a delta from the start transaction id. */
tw->stop_txn += tw->start_txn;
}
if (LF_ISSET(WT_CELL_TS_DURABLE_STOP))
WT_RET(
__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &temp_durable_stop_ts));
/* Load temporary values to the right fields. */
if (LF_ISSET(WT_CELL_PREPARE)) {
bool preserve_prepared = F_ISSET(S2C(session), WT_CONN_PRESERVE_PREPARED);
/*
* We can compare the txn_id only here, but cannot do it everywhere else because when
* recovering, all transaction ids are reset to WT_TXN_NONE, so we cannot compare the
* transaction ids.
*/
if (tw->start_txn == tw->stop_txn && temp_stop_ts == WT_TS_NONE) {
/*
* This is a special case where both transaction start and stop are in prepared state.
* The prepared record is written with the preserve prepared config enabled. The same
* prepared id is packed to WT_CELL_TS_DURABLE_START. Since temp_stop_ts here stores the
* difference between start_prepared_id and stop_prepared_id, temp_stop_ts must be 0.
*
* NOTE(review): the code below assigns the same temp_start_ts to both prepare
* timestamps, so the delta described above looks like it is between the start and stop
* prepare timestamps rather than the prepared ids -- confirm.
*/
if (temp_durable_start_ts != WT_TS_NONE) {
WT_ASSERT(session, temp_durable_stop_ts == WT_TS_NONE);
tw->start_prepare_ts = temp_start_ts;
tw->start_prepared_id = temp_durable_start_ts;
tw->stop_prepare_ts = temp_start_ts;
tw->stop_prepared_id = temp_durable_start_ts;
} else {
/* No prepared id was packed, which is only legal without preserve prepared. */
WT_ASSERT_ALWAYS(session, !preserve_prepared,
"Read prepared record with no prepared id when preserve prepared is "
"enabled.");
WT_ASSERT(session, temp_durable_start_ts == temp_durable_stop_ts);
tw->start_prepare_ts = tw->stop_prepare_ts = temp_start_ts;
}
} else if (tw->stop_txn != WT_TXN_MAX) {
/*
* This case happens where the transaction start is committed, but the transaction stop
* is prepared. In this case, we store the start timestamp and durable start timestamp
* in WT_CELL_TS_START and WT_CELL_TS_DURABLE_START, prepare ts in WT_CELL_TS_STOP.
*/
tw->start_ts = temp_start_ts;
/*
* The durable start timestamp is stored as a delta from the start timestamp; when it is
* absent the durable start timestamp equals the start timestamp.
*/
if (temp_durable_start_ts != WT_TS_NONE)
tw->durable_start_ts = temp_durable_start_ts + tw->start_ts;
else
tw->durable_start_ts = tw->start_ts;
WT_ASSERT(session, temp_stop_ts != WT_TS_MAX);
/* The stop prepare timestamp is stored as a delta from the start timestamp. */
tw->stop_prepare_ts = tw->start_ts + temp_stop_ts;
/*
* The prepared record is written with the preserve prepared config enabled. We store
* the prepared id in WT_CELL_TS_DURABLE_STOP.
*/
if (temp_durable_stop_ts != WT_TS_NONE)
tw->stop_prepared_id = temp_durable_stop_ts;
else
WT_ASSERT_ALWAYS(session, !preserve_prepared,
"Read prepared record with no prepared id when preserve prepared is "
"enabled.");
} else {
WT_ASSERT(session, tw->start_ts == WT_TS_NONE);
/*
* This case happens when only transaction start is prepared, and there is no
* transaction stop. In this case, we store the prepare ts in WT_CELL_TS_START.
*/
tw->start_prepare_ts = temp_start_ts;
/*
* The prepared record is written with the preserve prepared config enabled. We store
* prepared id in WT_CELL_TS_DURABLE_START.
*/
if (temp_durable_start_ts != WT_TS_NONE)
tw->start_prepared_id = temp_durable_start_ts;
else
WT_ASSERT_ALWAYS(session, !preserve_prepared,
"Read prepared record with no prepared id when preserve prepared is "
"enabled.");
}
} else {
/*
* Not prepared: the durable start and stop timestamps are deltas from the start timestamp,
* and the durable stop timestamp is a delta from the stop timestamp. A missing durable
* value defaults to the corresponding commit timestamp; for the stop side that default is
* only applied when a stop timestamp exists (is not WT_TS_MAX).
*/
if (LF_ISSET(WT_CELL_TS_START))
tw->start_ts = temp_start_ts;
if (LF_ISSET(WT_CELL_TS_DURABLE_START))
tw->durable_start_ts = temp_durable_start_ts + tw->start_ts;
else
tw->durable_start_ts = tw->start_ts;
if (LF_ISSET(WT_CELL_TS_STOP))
tw->stop_ts = temp_stop_ts + tw->start_ts;
if (LF_ISSET(WT_CELL_TS_DURABLE_STOP))
tw->durable_stop_ts = temp_durable_stop_ts + tw->stop_ts;
else if (tw->stop_ts != WT_TS_MAX)
tw->durable_stop_ts = tw->stop_ts;
}
/* Sanity-check the assembled time window before handing it back. */
__cell_assert_tw_has_ts_for_garbage_collection_table(session, tw);
WT_RET(__cell_check_value_validity(session, tw, end != NULL));
return (0);
}
/*
 * __cell_unpack_data_len --
 *     Unpack the data length for a cell (all cases except WT_CELL_VALUE_COPY). On success the
 *     unpack structure's data, size and __len fields describe the cell's payload; an unknown raw
 *     cell type yields WT_ERROR.
 */
static WT_INLINE int
__cell_unpack_data_len(
  WT_CELL *cell, WT_CELL_UNPACK_COMMON *unpack, const uint8_t **pp, const void *end)
{
    uint64_t data_len;
    bool size_adjusted;

    /* Classify the cell: no payload, overflow payload, plain payload, or unknown. */
    switch (unpack->raw) {
    case WT_CELL_DEL:
        /* A deleted cell carries no data, its length is whatever has been consumed so far. */
        unpack->__len = WT_PTRDIFF32(*pp, cell);
        return (0);
    case WT_CELL_KEY_OVFL:
    case WT_CELL_KEY_OVFL_RM:
    case WT_CELL_VALUE_OVFL:
    case WT_CELL_VALUE_OVFL_RM:
        /* Overflow items are flagged, then decoded exactly like the other payload cells. */
        F_SET(unpack, WT_CELL_UNPACK_OVERFLOW);
        break;
    case WT_CELL_ADDR_DEL:
    case WT_CELL_ADDR_DEL_VISIBLE_ALL:
    case WT_CELL_ADDR_INT:
    case WT_CELL_ADDR_LEAF:
    case WT_CELL_ADDR_LEAF_NO:
    case WT_CELL_KEY:
    case WT_CELL_KEY_PFX:
    case WT_CELL_VALUE:
        break;
    default:
        return (WT_ERROR); /* Unknown cell type. */
    }

    /* The cell is followed by a packed data length and a chunk of data. */
    WT_RET(__wt_vunpack_uint(pp, end == NULL ? 0 : WT_PTRDIFF(end, *pp), &data_len));

    /*
     * If the size was what prevented us from using a short cell, it's larger than the adjustment
     * size, and it was decremented when packed so it takes up less room: add the adjustment back.
     */
    if (unpack->raw == WT_CELL_KEY || unpack->raw == WT_CELL_KEY_PFX)
        size_adjusted = true;
    else
        size_adjusted = unpack->raw == WT_CELL_VALUE && unpack->v == 0 &&
          (cell->__chunk[0] & WT_CELL_SECOND_DESC) == 0;
    if (size_adjusted)
        data_len += WT_CELL_SIZE_ADJUST;

    unpack->data = *pp;
    unpack->size = (uint32_t)data_len;
    unpack->__len = WT_PTRDIFF32(*pp, cell) + unpack->size;
    return (0);
}
/*
* __wt_cell_unpack_safe --
* Unpack a WT_CELL into a structure, with optional boundary checks.
@ -1097,15 +1371,13 @@ __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CE
WT_TIME_WINDOW tw;
} copy;
WT_CELL_UNPACK_COMMON *unpack;
WT_PAGE_DELETED *page_del;
WT_TIME_AGGREGATE *ta;
WT_TIME_WINDOW *tw;
uint64_t v;
const uint8_t *p;
uint8_t flags;
bool copy_cell, has_fast_truncate, prepare_fast_truncate;
bool copy_cell;
copy_cell = has_fast_truncate = prepare_fast_truncate = false;
copy_cell = false;
copy.len = 0; /* [-Wconditional-uninitialized] */
copy.v = 0; /* [-Wconditional-uninitialized] */
@ -1113,13 +1385,11 @@ __wt_cell_unpack_safe(WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_CE
unpack = (WT_CELL_UNPACK_COMMON *)unpack_value;
tw = &unpack_value->tw;
WT_TIME_WINDOW_INIT(tw);
ta = NULL;
} else {
WT_ASSERT(session, unpack_value == NULL);
unpack = (WT_CELL_UNPACK_COMMON *)unpack_addr;
ta = &unpack_addr->ta;
WT_TIME_AGGREGATE_INIT(ta);
WT_TIME_AGGREGATE_INIT(&unpack_addr->ta);
tw = NULL;
}
@ -1189,60 +1459,7 @@ copy_cell_restart:
case WT_CELL_ADDR_INT:
case WT_CELL_ADDR_LEAF:
case WT_CELL_ADDR_LEAF_NO:
/* Return an error if we're not unpacking a cell of this type. */
if (unpack_addr == NULL)
return (WT_ERROR);
/*
* A committed fast-truncate cell may be written without WT_CELL_SECOND_DESC when its time
* aggregate is globally visible. Compute this flag before the SECOND_DESC early-exit so the
* page_del block is always unpacked for fast-truncate addr-del cells.
*/
has_fast_truncate = unpack->raw == WT_CELL_ADDR_DEL && F_ISSET(dsk, WT_PAGE_FT_UPDATE);
if ((cell->__chunk[0] & WT_CELL_SECOND_DESC) == 0)
break;
flags = *p++; /* skip second descriptor byte */
WT_CELL_LEN_CHK(p, 0, dsk, end);
if (LF_ISSET(WT_CELL_PREPARE)) {
/*
* For a prepared fast-truncate, the prepare state is recorded in the time aggregate. We
* cannot have a prepared fast-truncate and a prepared time aggregate at the same time.
* Otherwise, it would be a write conflict.
*/
if (has_fast_truncate)
prepare_fast_truncate = true;
else
ta->prepare = 1;
}
if (LF_ISSET(WT_CELL_TS_START))
WT_RET(
__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &ta->oldest_start_ts));
if (LF_ISSET(WT_CELL_TXN_START))
WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &ta->newest_txn));
if (LF_ISSET(WT_CELL_TS_DURABLE_START)) {
WT_RET(__wt_vunpack_uint(
&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &ta->newest_start_durable_ts));
ta->newest_start_durable_ts += ta->oldest_start_ts;
}
if (LF_ISSET(WT_CELL_TS_STOP)) {
WT_RET(
__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &ta->newest_stop_ts));
ta->newest_stop_ts += ta->oldest_start_ts;
}
if (LF_ISSET(WT_CELL_TXN_STOP)) {
WT_RET(
__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &ta->newest_stop_txn));
ta->newest_stop_txn += ta->newest_txn;
}
if (LF_ISSET(WT_CELL_TS_DURABLE_STOP)) {
WT_RET(__wt_vunpack_uint(
&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &ta->newest_stop_durable_ts));
ta->newest_stop_durable_ts += ta->newest_stop_ts;
}
WT_RET(__wt_check_addr_validity(session, ta, end != NULL));
WT_RET(__cell_unpack_addr_cell(session, dsk, cell, &p, end, unpack_addr));
break;
case WT_CELL_DEL:
case WT_CELL_VALUE:
@ -1257,158 +1474,10 @@ copy_cell_restart:
break;
flags = *p++; /* skip second descriptor byte */
WT_CELL_LEN_CHK(p, 0, dsk, end);
wt_timestamp_t temp_start_ts, temp_durable_start_ts, temp_stop_ts, temp_durable_stop_ts;
temp_start_ts = temp_durable_start_ts = temp_durable_stop_ts = WT_TS_NONE;
temp_stop_ts = WT_TS_MAX;
if (LF_ISSET(WT_CELL_TS_START))
WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &temp_start_ts));
if (LF_ISSET(WT_CELL_TXN_START))
WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &tw->start_txn));
if (LF_ISSET(WT_CELL_TS_DURABLE_START))
WT_RET(
__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &temp_durable_start_ts));
if (LF_ISSET(WT_CELL_TS_STOP))
WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &temp_stop_ts));
if (LF_ISSET(WT_CELL_TXN_STOP)) {
WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &tw->stop_txn));
tw->stop_txn += tw->start_txn;
}
if (LF_ISSET(WT_CELL_TS_DURABLE_STOP))
WT_RET(
__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &temp_durable_stop_ts));
/* Load temporary values to the right fields. */
if (LF_ISSET(WT_CELL_PREPARE)) {
bool preserve_prepared = F_ISSET(S2C(session), WT_CONN_PRESERVE_PREPARED);
/*
* We can compare the txn_id only here, but cannot do it everywhere else because when
* recovering, all transaction ids are reset to WT_TXN_NONE, so we cannot compare the
* transaction ids.
*/
if (tw->start_txn == tw->stop_txn && temp_stop_ts == WT_TS_NONE) {
/*
* This is a special case where both transaction start and stop are in prepared
* state. The prepared record is written with the preserve prepared config enabled.
* The same prepared id is packed to WT_CELL_TS_DURABLE_START. Since temp_stop_ts
* here stores the difference between start_prepared_id and stop_prepared_id,
* temp_stop_ts must be 0.
*/
if (temp_durable_start_ts != WT_TS_NONE) {
WT_ASSERT(session, temp_durable_stop_ts == WT_TS_NONE);
tw->start_prepare_ts = temp_start_ts;
tw->start_prepared_id = temp_durable_start_ts;
tw->stop_prepare_ts = temp_start_ts;
tw->stop_prepared_id = temp_durable_start_ts;
} else {
WT_ASSERT_ALWAYS(session, !preserve_prepared,
"Read prepared record with no prepared id when preserve prepared is "
"enabled.");
WT_ASSERT(session, temp_durable_start_ts == temp_durable_stop_ts);
tw->start_prepare_ts = tw->stop_prepare_ts = temp_start_ts;
}
} else if (tw->stop_txn != WT_TXN_MAX) {
/*
* This case happens where the transaction start is committed, but the transaction
* stop is prepared. In this case, we store the start timestamp and durable start
* timestamp in WT_CELL_TS_START and WT_CELL_TS_DURABLE_START, prepare ts in
* WT_CELL_TS_STOP.
*/
tw->start_ts = temp_start_ts;
/*
* The prepared record is written with the preserve prepared config enabled. We
* store the prepared id in WT_CELL_TS_DURABLE_STOP.
*/
if (temp_durable_start_ts != WT_TS_NONE)
tw->durable_start_ts = temp_durable_start_ts + tw->start_ts;
else
tw->durable_start_ts = tw->start_ts;
WT_ASSERT(session, temp_stop_ts != WT_TS_MAX);
tw->stop_prepare_ts = tw->start_ts + temp_stop_ts;
if (temp_durable_stop_ts != WT_TS_NONE)
tw->stop_prepared_id = temp_durable_stop_ts;
else
WT_ASSERT_ALWAYS(session, !preserve_prepared,
"Read prepared record with no prepared id when preserve prepared is "
"enabled.");
} else {
WT_ASSERT(session, tw->start_ts == WT_TS_NONE);
/*
* This case happens when only transaction start is prepared, and there is no
* transaction stop. In this case, we store the prepare ts in WT_CELL_TS_START.
*/
tw->start_prepare_ts = temp_start_ts;
/*
* The prepared record is written with the preserve prepared config enabled. We
* store prepared id in WT_CELL_TS_DURABLE_START.
*/
if (temp_durable_start_ts != WT_TS_NONE)
tw->start_prepared_id = temp_durable_start_ts;
else
WT_ASSERT_ALWAYS(session, !preserve_prepared,
"Read prepared record with no prepared id when preserve prepared is "
"enabled.");
}
} else {
if (LF_ISSET(WT_CELL_TS_START))
tw->start_ts = temp_start_ts;
if (LF_ISSET(WT_CELL_TS_DURABLE_START))
tw->durable_start_ts = temp_durable_start_ts + tw->start_ts;
else
tw->durable_start_ts = tw->start_ts;
if (LF_ISSET(WT_CELL_TS_STOP))
tw->stop_ts = temp_stop_ts + tw->start_ts;
if (LF_ISSET(WT_CELL_TS_DURABLE_STOP))
tw->durable_stop_ts = temp_durable_stop_ts + tw->stop_ts;
else if (tw->stop_ts != WT_TS_MAX)
tw->durable_stop_ts = tw->stop_ts;
}
__cell_assert_tw_has_ts_for_garbage_collection_table(session, tw);
WT_RET(__cell_check_value_validity(session, tw, end != NULL));
WT_RET(__cell_unpack_value_window(session, &p, end, flags, tw));
break;
}
/* Unpack any fast-truncate information. */
if (has_fast_truncate) {
page_del = &unpack_addr->page_del;
WT_RET(__wt_vunpack_uint(
&p, end == NULL ? 0 : WT_PTRDIFF(end, p), (uint64_t *)&page_del->txnid));
if (prepare_fast_truncate) {
page_del->prepare_state = WT_PREPARE_INPROGRESS;
page_del->committed = false;
/*
* For prepared fast-truncates, the prepared state is shared with the time aggregate but
* the prepare timestamp and the prepared id are stored in the page_del block.
*/
WT_RET(
__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &page_del->prepare_ts));
page_del->pg_del_start_ts = page_del->prepare_ts;
WT_RET(
__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &page_del->prepared_id));
/* Explicitly initialize the durable timestamp to WT_TS_NONE. */
page_del->pg_del_durable_ts = WT_TS_NONE;
WT_ASSERT_ALWAYS(session,
!F_ISSET(S2C(session), WT_CONN_PRESERVE_PREPARED) ||
page_del->prepared_id != WT_PREPARED_ID_NONE,
"Read prepared record with no prepared id when preserve prepared is enabled.");
} else {
page_del->prepare_state = WT_PREPARE_INIT;
page_del->committed = true;
WT_RET(__wt_vunpack_uint(
&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &page_del->pg_del_start_ts));
WT_RET(__wt_vunpack_uint(
&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &page_del->pg_del_durable_ts));
}
page_del->selected_for_write = true;
}
/*
* Check for an RLE count or record number that optionally follows the cell descriptor byte on
* column-store variable-length pages.
@ -1441,48 +1510,9 @@ copy_cell_restart:
cell = (WT_CELL *)((uint8_t *)cell - v);
goto copy_cell_restart;
case WT_CELL_KEY_OVFL:
case WT_CELL_KEY_OVFL_RM:
case WT_CELL_VALUE_OVFL:
case WT_CELL_VALUE_OVFL_RM:
/*
* Set overflow flag.
*/
F_SET(unpack, WT_CELL_UNPACK_OVERFLOW);
/* FALLTHROUGH */
case WT_CELL_ADDR_DEL:
case WT_CELL_ADDR_DEL_VISIBLE_ALL:
case WT_CELL_ADDR_INT:
case WT_CELL_ADDR_LEAF:
case WT_CELL_ADDR_LEAF_NO:
case WT_CELL_KEY:
case WT_CELL_KEY_PFX:
case WT_CELL_VALUE:
/*
* The cell is followed by a 4B data length and a chunk of data.
*/
WT_RET(__wt_vunpack_uint(&p, end == NULL ? 0 : WT_PTRDIFF(end, p), &v));
/*
* If the size was what prevented us from using a short cell, it's larger than the
* adjustment size. Decrement/increment it when packing/unpacking so it takes up less room.
*/
if (unpack->raw == WT_CELL_KEY || unpack->raw == WT_CELL_KEY_PFX ||
(unpack->raw == WT_CELL_VALUE && unpack->v == 0 &&
(cell->__chunk[0] & WT_CELL_SECOND_DESC) == 0))
v += WT_CELL_SIZE_ADJUST;
unpack->data = p;
unpack->size = (uint32_t)v;
unpack->__len = WT_PTRDIFF32(p, cell) + unpack->size;
break;
case WT_CELL_DEL:
unpack->__len = WT_PTRDIFF32(p, cell);
break;
default:
return (WT_ERROR); /* Unknown cell type. */
WT_RET(__cell_unpack_data_len(cell, unpack, &p, end));
break;
}
done:

View File

@ -157,6 +157,7 @@ WT_CONF_API_DECLARE(WT_CONNECTION, open_session, 3, 9);
WT_CONF_API_DECLARE(WT_CONNECTION, query_timestamp, 1, 1);
WT_CONF_API_DECLARE(WT_CONNECTION, reconfigure, 21, 132);
WT_CONF_API_DECLARE(WT_CONNECTION, rollback_to_stable, 1, 2);
WT_CONF_API_DECLARE(WT_CONNECTION, set_key_provider, 1, 1);
WT_CONF_API_DECLARE(WT_CONNECTION, set_timestamp, 1, 5);
WT_CONF_API_DECLARE(WT_CURSOR, bound, 1, 3);
WT_CONF_API_DECLARE(WT_CURSOR, reconfigure, 1, 3);

View File

@ -1198,15 +1198,16 @@ struct __wt_connection_impl {
#define WT_CONN_CKPT_CLEANUP_RECLAIM_SPACE 0x0008u
#define WT_CONN_CKPT_SYNC 0x0010u
#define WT_CONN_IN_MEMORY 0x0020u
#define WT_CONN_LIVE_RESTORE_FS 0x0040u
#define WT_CONN_PRECISE_CHECKPOINT 0x0080u
#define WT_CONN_PRESERVE_PREPARED 0x0100u
#define WT_CONN_READONLY 0x0200u
#define WT_CONN_RECOVERING 0x0400u
#define WT_CONN_RECOVERING_METADATA 0x0800u
#define WT_CONN_RECOVERY_COMPLETE 0x1000u
#define WT_CONN_SALVAGE 0x2000u
#define WT_CONN_WAS_BACKUP 0x4000u
#define WT_CONN_KEY_PROVIDER_PUSH 0x0040u
#define WT_CONN_LIVE_RESTORE_FS 0x0080u
#define WT_CONN_PRECISE_CHECKPOINT 0x0100u
#define WT_CONN_PRESERVE_PREPARED 0x0200u
#define WT_CONN_READONLY 0x0400u
#define WT_CONN_RECOVERING 0x0800u
#define WT_CONN_RECOVERING_METADATA 0x1000u
#define WT_CONN_RECOVERY_COMPLETE 0x2000u
#define WT_CONN_SALVAGE 0x4000u
#define WT_CONN_WAS_BACKUP 0x8000u
/* AUTOMATIC FLAG VALUE GENERATION STOP 32 */
wt_shared uint32_t flags;

View File

@ -32,14 +32,15 @@ __wt_single_thread_check_start(WT_SESSION_IMPL *s)
if (!WT_SESSION_IS_DEFAULT(s) && s->thread_check.owning_thread != current_tid) {
ret = __wt_spin_trylock(s, &s->thread_check.lock);
const char *session_name = __wt_atomic_load_ptr_relaxed(&s->name);
WT_ASSERT_ALWAYS(s, ret == 0,
"Session %" PRIu32
" is accessed concurrently by multiple threads: "
"current thread %" PRIuMAX ", owning thread %" PRIuMAX
" (active op: %s, last op: %s, api depth: %u, dhandle: %s)",
s->id, current_tid, s->thread_check.owning_thread, s->name != NULL ? s->name : "none",
s->lastop != NULL ? s->lastop : "none", s->api_call_counter,
s->dhandle != NULL ? s->dhandle->name : "none");
s->id, current_tid, s->thread_check.owning_thread,
session_name != NULL ? session_name : "none", s->lastop != NULL ? s->lastop : "none",
s->api_call_counter, s->dhandle != NULL ? s->dhandle->name : "none");
s->thread_check.owning_thread = current_tid;
}

View File

@ -1514,7 +1514,7 @@ __wt_txn_read_upd_list_internal(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
*restored_updp = NULL;
__wt_upd_value_clear(cbt->upd_value);
for (; upd != NULL; upd = upd->next) {
for (; upd != NULL; upd = __wt_atomic_load_ptr_relaxed(&upd->next)) {
/* Skip reserved place-holders, they're never visible. */
if (upd->type == WT_UPDATE_RESERVE)
continue;

View File

@ -2258,12 +2258,11 @@ struct __wt_connection {
* isolation the cursor temporarily releases any page that requires force eviction\, then
* repositions back to the page for further operations. A page release encourages eviction of
* hot or large pages\, which is more likely to succeed without a cursor keeping the page
* pinned. Note: This setting is not compatible with disaggregated storage., a boolean flag;
* default \c false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;eviction, if true\, modify internal
* algorithms to change skew to force history store eviction to happen more aggressively. This
* includes but is not limited to not skewing newest\, not favoring leaf pages\, and modifying
* the eviction score mechanism., a boolean flag; default \c false.}
* pinned., a boolean flag; default \c false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;eviction, if
* true\, modify internal algorithms to change skew to force history store eviction to happen
* more aggressively. This includes but is not limited to not skewing newest\, not favoring
* leaf pages\, and modifying the eviction score mechanism., a boolean flag; default \c false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;eviction_checkpoint_ts_ordering, if true\, act as if eviction
* is being run in parallel to checkpoint. We should return EBUSY in eviction if we detect any
* timestamp ordering issue., a boolean flag; default \c false.}
@ -3013,7 +3012,11 @@ struct __wt_connection {
*
* @param connection the connection handle
* @param km the key provider structure
* @configempty{WT_CONNECTION.set_key_provider, see dist/api_data.py}
* @configstart{WT_CONNECTION.set_key_provider, see dist/api_data.py}
* @config{version, the key provider API version. Version 0 uses the pull model (WiredTiger
* calls WT_KEY_PROVIDER::get_key). Version 1 uses the push model., an integer between \c 0 and
* \c 1; default \c 0.}
* @configend
* @errors
*/
int __F(set_key_provider)(
@ -3191,27 +3194,26 @@ struct __wt_connection {
* cursor_reposition, if true\, for operations with snapshot isolation the cursor temporarily
* releases any page that requires force eviction\, then repositions back to the page for further
* operations. A page release encourages eviction of hot or large pages\, which is more likely to
* succeed without a cursor keeping the page pinned. Note: This setting is not compatible with
* disaggregated storage., a boolean flag; default \c false.}
* succeed without a cursor keeping the page pinned., a boolean flag; default \c false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;eviction, if true\, modify internal algorithms to change skew to
* force history store eviction to happen more aggressively. This includes but is not limited to
* not skewing newest\, not favoring leaf pages\, and modifying the eviction score mechanism., a
* boolean flag; default \c false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;eviction_checkpoint_ts_ordering,
* if true\, act as if eviction is being run in parallel to checkpoint. We should return EBUSY in
* eviction if we detect any timestamp ordering issue., a boolean flag; default \c false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;log_retention, adjust log removal to retain at least this number
* of log files. (Warning: this option can remove log files required for recovery if no checkpoints
* have yet been done and the number of log files exceeds the configured value. As WiredTiger
* cannot detect the difference between a system that has not yet checkpointed and one that will
* never checkpoint\, it might discard log files before any checkpoint is done.) Ignored if set to
* 0., an integer between \c 0 and \c 1024; default \c 0.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;
* eviction, if true\, modify internal algorithms to change skew to force history store eviction to
* happen more aggressively. This includes but is not limited to not skewing newest\, not favoring
* leaf pages\, and modifying the eviction score mechanism., a boolean flag; default \c false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;eviction_checkpoint_ts_ordering, if true\, act as if eviction is
* being run in parallel to checkpoint. We should return EBUSY in eviction if we detect any
* timestamp ordering issue., a boolean flag; default \c false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;
* log_retention, adjust log removal to retain at least this number of log files. (Warning: this
* option can remove log files required for recovery if no checkpoints have yet been done and the
* number of log files exceeds the configured value. As WiredTiger cannot detect the difference
* between a system that has not yet checkpointed and one that will never checkpoint\, it might
* discard log files before any checkpoint is done.) Ignored if set to 0., an integer between \c 0
* and \c 1024; default \c 0.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;realloc_exact, if true\, reallocation
* of memory will only provide the exact amount requested. This will help with spotting memory
* allocation issues more easily., a boolean flag; default \c false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;realloc_malloc, if true\, every realloc call will force a new
* memory allocation by using malloc., a boolean flag; default \c false.}
* realloc_exact, if true\, reallocation of memory will only provide the exact amount requested.
* This will help with spotting memory allocation issues more easily., a boolean flag; default \c
* false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;realloc_malloc, if true\, every realloc call will force a
* new memory allocation by using malloc., a boolean flag; default \c false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;rollback_error, return a WT_ROLLBACK error from a transaction
* operation about every Nth operation to simulate a collision., an integer between \c 0 and \c 10M;
* default \c 0.}

View File

@ -349,18 +349,6 @@ __rec_save_delete_hs_upd_and_free_obs_updates(WT_SESSION_IMPL *session, WTI_RECO
break;
}
/*
* Prepare transaction rollback adds a globally visible tombstone to the update chain to
* remove the entire key. Treating these globally visible tombstones as obsolete and
* trimming update list can cause problems if the update chain is getting accessed somewhere
* else. To avoid this problem, skip these globally visible tombstones from the update
* obsolete check.
*/
if (F_ISSET(delete_upd, WT_UPDATE_PREPARE_ROLLBACK)) {
visible_all_upd = NULL;
break;
}
/* Track the first self-contained value that is globally visible. */
if (F_ISSET(r, WT_REC_CHECKPOINT) && visible_all_upd == NULL && delete_upd->next != NULL &&
WT_UPDATE_DATA_VALUE(delete_upd) && __wt_txn_upd_visible_all(session, delete_upd))
@ -737,14 +725,13 @@ __rec_upd_select(WT_SESSION_IMPL *session, WTI_RECONCILE *r, WT_CELL_UNPACK_KV *
bool *has_newer_updatesp, bool *write_prepare, size_t *upd_memsizep)
{
WT_CONNECTION_IMPL *conn;
WT_UPDATE *upd, *prepare_rollback_tombstone;
WT_UPDATE *upd;
wt_timestamp_t max_ts;
uint64_t max_txn, session_txnid, txnid;
uint8_t prepare_state;
bool is_hs_page;
conn = S2C(session);
prepare_rollback_tombstone = NULL;
max_ts = WT_TS_NONE;
max_txn = WT_TXN_NONE;
is_hs_page = F_ISSET(session->dhandle, WT_DHANDLE_HS);
@ -773,15 +760,8 @@ __rec_upd_select(WT_SESSION_IMPL *session, WTI_RECONCILE *r, WT_CELL_UNPACK_KV *
upd->prepare_state == WT_PREPARE_INPROGRESS);
/* Ignore the prepared update if the rollback timestamp is stable. */
if (upd->upd_rollback_ts != WT_TS_NONE &&
upd->upd_rollback_ts <= r->rec_start_pinned_stable_ts) {
/*
* If we have seen a tombstone that rolled back the prepared update, delete the key
* from the disk.
*/
if (prepare_rollback_tombstone != NULL)
break;
upd->upd_rollback_ts <= r->rec_start_pinned_stable_ts)
continue;
}
txnid = upd->upd_saved_txnid;
}
@ -807,7 +787,6 @@ __rec_upd_select(WT_SESSION_IMPL *session, WTI_RECONCILE *r, WT_CELL_UNPACK_KV *
session_txnid != WT_TXN_NONE && txnid == session_txnid) {
*upd_memsizep += WT_UPDATE_MEMSIZE(upd);
*has_newer_updatesp = true;
WT_ASSERT(session, prepare_rollback_tombstone == NULL);
WT_ASSERT(session, !upd_select->skip_aborted_prepared_value);
continue;
}
@ -841,16 +820,6 @@ __rec_upd_select(WT_SESSION_IMPL *session, WTI_RECONCILE *r, WT_CELL_UNPACK_KV *
*upd_memsizep += WT_UPDATE_MEMSIZE(upd);
*has_newer_updatesp = true;
/*
* If we have already seen a globally visible tombstone from prepared rollback, the
* update we are now skipping is the aborted prepared update that the tombstone rolled
* back, and its rollback is not yet stable (otherwise we would have broken out of the
* loop above). The rollback decision is not durable, so the rollback tombstone is not
* safe to write to disk. Drop it from consideration so the fallback after the loop does
* not select it for write; we will revisit this key in a later reconcile once the
* rollback becomes stable.
*/
prepare_rollback_tombstone = NULL;
/*
* Same reason as the aborted-prepared skip earlier: this rolled-back prepared value has
* no in-chain fallback, so the on-disk cell must not be dropped on this reconciliation.
@ -884,8 +853,6 @@ __rec_upd_select(WT_SESSION_IMPL *session, WTI_RECONCILE *r, WT_CELL_UNPACK_KV *
WT_ASSERT(session, !is_hs_page);
*upd_memsizep += WT_UPDATE_MEMSIZE(upd);
*has_newer_updatesp = true;
/* We should write nothing to disk. */
prepare_rollback_tombstone = NULL;
/*
* Same reason as the aborted-prepared skip earlier: this rolled-back prepared
@ -925,8 +892,7 @@ __rec_upd_select(WT_SESSION_IMPL *session, WTI_RECONCILE *r, WT_CELL_UNPACK_KV *
* commit/rollback. But it is enough to help us catch some issues.
*/
WT_ASSERT_ALWAYS(session,
!F_ISSET(r, WT_REC_EVICT) || prepare_rollback_tombstone != NULL ||
upd->next != NULL ||
!F_ISSET(r, WT_REC_EVICT) || upd->next != NULL ||
(WT_REC_HAS_ON_DISK(vpack) && !WT_TIME_WINDOW_HAS_PREPARE(&vpack->tw)),
"leaked prepared update.");
} else
@ -999,46 +965,12 @@ __rec_upd_select(WT_SESSION_IMPL *session, WTI_RECONCILE *r, WT_CELL_UNPACK_KV *
}
}
if (F_ISSET(conn, WT_CONN_PRESERVE_PREPARED) && F_ISSET(upd, WT_UPDATE_PREPARE_ROLLBACK) &&
!F_ISSET(upd, WT_UPDATE_SELECT_FOR_DS))
prepare_rollback_tombstone = upd;
/*
* Always select the newest visible update if precise checkpoint is not enabled. Otherwise,
* select the first update that is smaller or equal to the pinned timestamp.
*/
else if (upd_select->upd == NULL) {
if (upd_select->upd == NULL)
upd_select->upd = upd;
if (prepare_rollback_tombstone != NULL) {
/*
* Not checking upd->txnid == WT_TXN_ABORTED here because when doing prepared
* rollback, we first insert the rollback tombstone then mark the prepare aborted,
* so this assert can fire if we race with prepared rollback.
*/
WT_ASSERT(session,
*write_prepare &&
(prepare_state == WT_PREPARE_INPROGRESS || prepare_state == WT_PREPARE_LOCKED));
#ifdef HAVE_DIAGNOSTIC
/*
* Walk from the rollback tombstone to the current prepared update; the only updates
* permitted in between are reserve updates. Any other update would mean an unknown
* entry slipped in front of the prepared update we are about to select.
*/
WT_UPDATE *scan;
for (scan = prepare_rollback_tombstone->next; scan != NULL && scan != upd;
scan = scan->next)
WT_ASSERT(
session, scan->type == WT_UPDATE_RESERVE && scan->txnid == WT_TXN_ABORTED);
WT_ASSERT(session, scan == upd);
#endif
/* We skipped the prepare rollback tombstone. */
WT_ASSERT(session, *has_newer_updatesp);
/*
* If we have seen a tombstone that rolled back the prepared update, this must be
* the prepared update. No need to walk further.
*/
prepare_rollback_tombstone = NULL;
}
}
/* Track the selected update transaction id and timestamp. */
if (max_txn < txnid)
@ -1052,13 +984,6 @@ __rec_upd_select(WT_SESSION_IMPL *session, WTI_RECONCILE *r, WT_CELL_UNPACK_KV *
max_ts = upd->upd_start_ts;
}
/*
* If we see a globally visible tombstone that deletes a key because of prepared rollback,
* keep walking to see if we should write the prepared update instead.
*/
if (prepare_rollback_tombstone != NULL)
continue;
/*
* We only need to walk the whole update chain if we are evicting metadata as it is written
* with read uncommitted isolation and we may see a committed update followed by uncommitted
@ -1068,10 +993,6 @@ __rec_upd_select(WT_SESSION_IMPL *session, WTI_RECONCILE *r, WT_CELL_UNPACK_KV *
break;
}
/* The prepare rollback is stable. Delete the key by selecting the rollback tombstone. */
if (upd_select->upd == NULL && prepare_rollback_tombstone != NULL)
upd_select->upd = prepare_rollback_tombstone;
/*
* Track the most recent transaction in the page. We store this in the tree at the end of
* reconciliation in the service of checkpoints, it is used to avoid discarding trees from

View File

@ -2631,7 +2631,7 @@ __open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const
session_ret->iface = F_ISSET(conn, WT_CONN_READONLY) ? stds_readonly : stds;
session_ret->iface.connection = &conn->iface;
session_ret->name = NULL;
__wt_atomic_store_ptr_relaxed(&session_ret->name, NULL);
session_ret->id = i;
#ifdef HAVE_UNITTEST_ASSERTS

View File

@ -78,13 +78,15 @@ __wt_session_dump(WT_SESSION_IMPL *session, WT_SESSION_IMPL *dump_session, bool
WT_CURSOR *cursor;
WT_DECL_ITEM(buf);
WT_DECL_RET;
const char *session_name;
WT_ERR(__wt_scr_alloc(session, 0, &buf));
WT_ERR(__wt_msg(
session, "Session: ID: %" PRIu32 " @: 0x%p", dump_session->id, (void *)dump_session));
WT_ERR(
__wt_msg(session, " Name: %s", dump_session->name == NULL ? "EMPTY" : dump_session->name));
session_name = __wt_atomic_load_ptr_relaxed(&dump_session->name);
WT_ERR(__wt_msg(session, " Name: %s", session_name == NULL ? "EMPTY" : session_name));
WT_ERR(__wt_msg(session, " Last operation: %s",
dump_session->lastop == NULL ? "NONE" : dump_session->lastop));
WT_ERR(__wt_msg(session, " Current dhandle: %s",

View File

@ -415,6 +415,8 @@ __wt_hazard_check_assert(WT_SESSION_IMPL *session, void *ref, bool waitfor)
break;
__wt_sleep(0, 10 * WT_THOUSAND);
}
const char *session_name = __wt_atomic_load_ptr_relaxed(&s->name);
#ifdef HAVE_DIAGNOSTIC
/*
* In diagnostic mode we also track the file and line where the hazard pointer is set. If this
@ -422,10 +424,11 @@ __wt_hazard_check_assert(WT_SESSION_IMPL *session, void *ref, bool waitfor)
*/
__wt_errx(session,
"hazard pointer reference to discarded object: (%p: session %p name %s: %s, line %d)",
(void *)hp->ref, (void *)s, s->name == NULL ? "UNKNOWN" : s->name, hp->func, hp->line);
(void *)hp->ref, (void *)s, session_name == NULL ? "UNKNOWN" : session_name, hp->func,
hp->line);
#else
__wt_errx(session, "hazard pointer reference to discarded object: (%p: session %p name %s)",
(void *)hp->ref, (void *)s, s->name == NULL ? "UNKNOWN" : s->name);
(void *)hp->ref, (void *)s, session_name == NULL ? "UNKNOWN" : session_name);
#endif
return (false);
}

View File

@ -958,7 +958,7 @@ __txn_prepare_rollback_restore_hs_update(
}
/* Append the update to the end of the chain. */
WT_RELEASE_WRITE_WITH_BARRIER(upd_chain->next, upd);
__wt_atomic_store_ptr_relaxed(&upd_chain->next, upd);
__wt_cache_page_inmem_incr(session, page, total_size, false);
@ -1034,30 +1034,35 @@ __txn_search_prepared_op(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_ITEM *key
/*
* __txn_prepare_rollback_delete_key --
* Prepend a global visible tombstone to the head of the update chain to delete the key for
* prepare rollback.
* Append a globally visible tombstone to the tail of the update chain to delete the key for
* prepare rollback. Placing the tombstone below the prepared update encodes its role by
* position so reconciliation and pruning see a normal globally visible tombstone without
* needing a distinguishing flag.
*/
static int
__txn_prepare_rollback_delete_key(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
__txn_prepare_rollback_delete_key(WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd_chain)
{
WT_DECL_RET;
WT_UPDATE *tombstone;
size_t not_used;
size_t size;
tombstone = NULL;
WT_ASSERT(session, upd_chain != NULL);
WT_ERR(__wt_upd_alloc_tombstone(session, &tombstone, &not_used));
F_SET(tombstone, WT_UPDATE_PREPARE_ROLLBACK);
WT_WITH_BTREE(session, btree,
ret = btree->type == BTREE_ROW ?
__wt_row_modify(cbt, &cbt->iface.key, NULL, &tombstone, WT_UPDATE_INVALID, false, false) :
__wt_col_modify(cbt, cbt->recno, NULL, &tombstone, WT_UPDATE_INVALID, false, false));
WT_ERR(ret);
tombstone = NULL;
size = 0;
WT_RET(__wt_upd_alloc_tombstone(session, &tombstone, &size));
err:
__wt_free(session, tombstone);
return (ret);
/*
* Walk to the end of the chain. The caller guarantees that the chain at this point consists
* only of updates from the resolving prepared transaction (and aborted reserve entries), so the
* tail's next pointer is stable.
*/
while (upd_chain->next != NULL)
upd_chain = upd_chain->next;
__wt_atomic_store_ptr_relaxed(&upd_chain->next, tombstone);
__wt_cache_page_inmem_incr(session, page, size, false);
return (0);
}
/*
@ -1218,28 +1223,24 @@ __wt_txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_BTREE *btree,
* If the prepared update is a single tombstone, we don't need to do anything special and we can
* directly resolve it in memory.
*
* If the prepared update is not a tombstone or we have multiple prepared updates in the same
* transaction. There are four base cases:
* Otherwise there are three resolve cases:
*
* 1) Prepared updates are on the update chain.
* commit: simply resolve the updates on chain.
* rollback: simply resolve the updates on chain.
* 1) Prepared updates are on the update chain (RESOLVE_UPDATE_CHAIN).
* commit: resolve the updates on chain.
* rollback: if the prepared update is the only update and there is no on-disk value,
* append a globally visible tombstone to delete the key.
*
* 2) Prepared updates are written to the data store.
* If there is no older updates written to the history store:
* commit: simply resolve the prepared updates in memory.
* rollback: delete the whole key.
*
* If there are older updates written to the history store:
* 2) Prepared updates are written to the data store (RESOLVE_PREPARE_ON_DISK).
* If there are older updates in the history store:
* commit: restore the newest history store update with a max stop time point to the
* update chain. Reconciliation should know when to delete it from the history
* store.
* rollback:restore the newest update in the history store to the update chain.
* Reconciliation should know when to delete it from the history store.
* update chain.
* rollback: restore the newest history store update to the update chain.
* If there are no older updates in the history store:
* commit: resolve the prepared updates in memory.
* rollback: append a globally visible tombstone to delete the key.
*
* 4) We are running an in-memory database:
* commit: resolve the prepared updates in memory.
* rollback: if the prepared update is written to the disk image, delete the whole key.
* 3) We are running an in-memory database (RESOLVE_IN_MEMORY).
* commit/rollback: resolve the prepared updates in memory only.
*/
/*
@ -1264,7 +1265,7 @@ __wt_txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_BTREE *btree,
if (!commit && first_committed_upd == NULL) {
tw_found = __wt_read_cell_time_window(cbt, &tw);
if (!tw_found)
WT_ERR(__txn_prepare_rollback_delete_key(session, btree, cbt));
WT_ERR(__txn_prepare_rollback_delete_key(session, page, head_upd));
else
WT_ASSERT_ALWAYS(
session, !WT_TIME_WINDOW_HAS_PREPARE(&tw), "no committed update to fallback to.");
@ -1303,12 +1304,12 @@ __wt_txn_resolve_prepared_op(WT_SESSION_IMPL *session, WT_BTREE *btree,
else {
ret = 0;
/*
* Allocate a tombstone and prepend it to the row so when we reconcile the update chain
* we don't copy the prepared cell, which is now associated with a rolled back prepare,
* and instead write nothing.
* Append a globally visible tombstone to the end of the chain. When reconciliation
* later drops the rolled-back prepared cell, the tombstone remains as the correct
* post-rollback state for the key.
*/
if (!commit)
WT_ERR(__txn_prepare_rollback_delete_key(session, btree, cbt));
WT_ERR(__txn_prepare_rollback_delete_key(session, page, head_upd));
}
break;
case RESOLVE_IN_MEMORY:
@ -2764,13 +2765,15 @@ __wt_verbose_dump_txn_one(
buf_len = 512;
WT_RET(__wt_scr_alloc(session, buf_len, &buf));
const char *session_name = __wt_atomic_load_ptr_relaxed(&txn_session->name);
WT_ERR(__wt_snprintf((char *)buf->data, buf_len,
"session ID: %" PRIu32 ", txn ID: %" PRIu64 ", pinned ID: %" PRIu64
", metadata pinned ID: %" PRIu64 ", name: %s",
txn_session->id, __wt_atomic_load_uint64_v_relaxed(&txn_shared->id),
__wt_atomic_load_uint64_v_relaxed(&txn_shared->pinned_id),
__wt_atomic_load_uint64_v_relaxed(&txn_shared->metadata_pinned),
txn_session->name == NULL ? "EMPTY" : txn_session->name));
session_name == NULL ? "EMPTY" : session_name));
if (error_code != 0)
WT_ERR_MSG(session, error_code, "%s, %s", (char *)buf->data,

View File

@ -326,6 +326,26 @@ TEST_CASE_METHOD(kp_fixture, "Persist key, failure", "[key_provider]")
free(const_cast<void *>(crypt.keys.data));
}
/*
 * Check that the "version" setting passed to set_key_provider() controls whether
 * WT_CONN_KEY_PROVIDER_PUSH is set on the connection: version=0 leaves the flag clear and
 * version=1 sets it. Both calls are expected to return 0.
 */
TEST_CASE_METHOD(kp_fixture, "set_key_provider version selects push mode", "[key_provider]")
{
WT_CONNECTION *wt_conn = conn.get_wt_connection();
WT_CONNECTION_IMPL *conn_impl = conn.get_wt_connection_impl();
/* Value-initialized stub provider; the test only inspects the return code and the connection flag. */
WT_KEY_PROVIDER stub = {};
/* version=0 (default): push flag stays clear. */
REQUIRE(wt_conn->set_key_provider(wt_conn, &stub, "version=0") == 0);
REQUIRE(!F_ISSET(conn_impl, WT_CONN_KEY_PROVIDER_PUSH));
/*
 * NOTE(review): presumably a second set_key_provider() call is rejected while a provider is
 * still installed, hence the manual reset of the pointer -- confirm.
 */
conn_impl->key_provider = nullptr; /* Allow reconfiguration. */
/* version=1: push flag is set. */
REQUIRE(wt_conn->set_key_provider(wt_conn, &stub, "version=1") == 0);
REQUIRE(F_ISSET(conn_impl, WT_CONN_KEY_PROVIDER_PUSH));
/* Cleanup so the fixture destructor doesn't see a stale provider. */
conn_impl->key_provider = nullptr;
F_CLR(conn_impl, WT_CONN_KEY_PROVIDER_PUSH);
}
TEST_CASE_METHOD(kp_fixture, "Key always expires", "[key_provider]")
{
kp_ptr_t kp = kp_init("key_expires=0");

View File

@ -1489,10 +1489,6 @@ config_disagg_storage(void)
config_off(NULL, "ops.compaction");
config_off(NULL, "background_compact");
/* Cursor reposition is not supported for disaggregated storage. */
config_off(NULL, "debug.cursor_reposition");
config_off(NULL, "stress.evict_reposition");
/* Tiered storage is not supported with disagg */
config_single(NULL, "tiered_storage.storage_source=off", true);
}

View File

@ -0,0 +1,208 @@
#!/usr/bin/env python3
#
# Public Domain 2014-present MongoDB, Inc.
# Public Domain 2008-2014 WiredTiger, Inc.
#
# This is free and unencumbered software released into the public domain.
#
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
#
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# helper_layered_fast_truncate.py
# Shared helpers for the layered fast truncate Python tests.
from contextlib import closing
from itertools import chain
from typing import Iterable
import wiredtiger
def concat(*iterables):
    """Flatten the given iterables, in argument order, into one new list."""
    merged = []
    for iterable in iterables:
        merged.extend(iterable)
    return merged
def range_inclusive(start, stop):
    """Build a range over the closed interval [start, stop]."""
    # range() excludes its upper bound, so step one past the requested stop.
    exclusive_end = stop + 1
    return range(start, exclusive_end)
class LayeredFastTruncateConfigMixin:
    """
    Shared helpers for the layered fast truncate test suite.

    This is a mixin: it has no state of its own and relies on the test class it is combined
    with to supply self.session, self.conn, self.uri, self.transaction(),
    self.timestamp_str() and the disaggregated-storage helpers called below.
    """

    def key(self, n):
        """
        Convert an int into a key; override in subclasses that use a different
        key format. The default returns n unchanged.
        """
        return n

    def session_create_config(self):
        """
        Return the session.create() config string. When self.uri starts with
        'table', 'block_manager=disagg,type=layered' is appended; a missing
        self.uri is treated as an empty string.
        """
        cfg = 'key_format=i,value_format=S'
        uri = getattr(self, 'uri', '')
        if uri.startswith('table'):
            cfg += ',block_manager=disagg,type=layered'
        return cfg

    def auto_closing_cursor(self, config=None):
        """Return a context manager yielding a cursor on self.uri that is closed on exit."""
        return closing(self.session.open_cursor(self.uri, None, config))

    def populate(self, keys, value='v'):
        """Insert each key with a placeholder value in a single transaction."""
        with self.auto_closing_cursor() as cursor:
            with self.transaction():
                for key in keys:
                    cursor[self.key(key)] = value

    def setup_leader(self, keys=None, extra_cfg=''):
        """
        Create the table on the leader and optionally populate stable. The
        follower picks up these keys via the initial checkpoint.

        extra_cfg is appended verbatim to session_create_config(), so it must
        start with a comma when non-empty.
        """
        self.session.create(self.uri, self.session_create_config() + extra_cfg)
        if keys is not None:
            self.populate(keys)
        # Checkpoint whether or not keys were written.
        self.session.checkpoint()

    def setup_follower(self, keys=None):
        """Switch to follower role and optionally write keys to ingest."""
        self.reopen_disagg_conn('disaggregated=(role="follower"),')
        if keys is not None:
            self.populate(keys)

    def truncate(self, start_key=None, stop_key=None, commit_timestamp=None):
        """
        Truncate [start_key, stop_key] inclusive on self.uri. Either bound
        may be None for an open-ended side. If commit_timestamp is set,
        the truncate transaction commits at that timestamp.

        Both bounds are routed through self.key(). The bound cursors are
        closed even if the truncate raises.
        """
        start = stop = None
        try:
            if start_key is not None:
                start = self.session.open_cursor(self.uri)
                start.set_key(self.key(start_key))
            if stop_key is not None:
                stop = self.session.open_cursor(self.uri)
                stop.set_key(self.key(stop_key))
            # session.truncate() needs a URI iff both cursors are NULL.
            uri = self.uri if (start is None and stop is None) else None
            with self.transaction(commit_timestamp=commit_timestamp):
                self.session.truncate(uri, start, stop, None)
        finally:
            if start is not None:
                start.close()
            if stop is not None:
                stop.close()

    def visible_keys(self, forward=True):
        """
        Return all keys visible via a scan (forward or backward). Keys are
        returned in scan order, so a backward scan yields them from last to first.
        """
        result = []
        with self.auto_closing_cursor() as cursor:
            step = cursor.next if forward else cursor.prev
            with self.transaction(rollback=True):
                while step() == 0:
                    result.append(cursor.get_key())
        return result

    def key_exists(self, key):
        """Return True if key is visible to a search in its own transaction."""
        with self.auto_closing_cursor() as cursor:
            with self.transaction(rollback=True):
                cursor.set_key(self.key(key))
                return cursor.search() == 0

    def search_near_key(self, key):
        """
        Run search_near. Returns (exact, found_key). exact follows WT
        convention: 0 = exact, 1 = positioned above, -1 = positioned
        below, or WT_NOTFOUND if no visible keys exist (in which case
        found_key is None).
        """
        with self.auto_closing_cursor() as cursor:
            with self.transaction(rollback=True):
                cursor.set_key(self.key(key))
                exact = cursor.search_near()
                if exact == wiredtiger.WT_NOTFOUND:
                    return exact, None
                # Read the key while the cursor is still positioned inside the transaction.
                return exact, cursor.get_key()

    def leader_checkpoint(self, ts=None):
        """
        Set timestamps and checkpoint on the leader. When ts is given, stable
        is moved to ts and oldest is set to 1 first; the checkpoint always runs.
        """
        if ts is not None:
            self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(ts) +
              ',oldest_timestamp=' + self.timestamp_str(1))
        self.session.checkpoint()

    def step_up(self):
        """Promote self.conn_follow to leader; the original leader steps down."""
        self.ignoreStdoutPattern('Picking up the same checkpoint')
        self.disagg_switch_follower_and_leader(self.conn_follow)

    def open_follower(self, table_config='key_format=i,value_format=S'):
        """
        Open a separate follower connection (distinct from setup_follower
        which reopens the existing connection). Returns (conn, session).

        The table is created on the new connection with table_config and the
        follower is advanced to the checkpoint of self.conn before returning.
        """
        conn = self.wiredtiger_open(
            'follower',
            self.extensionsConfig() +
            ',create,cache_size=50MB,statistics=(all),disaggregated=(role="follower")')
        session = conn.open_session('')
        session.create(self.uri, table_config)
        self.disagg_advance_checkpoint(conn, self.conn)
        return conn, session

    def search_at(self, session, key, ts):
        """
        Search for key under a read_timestamp; return (ret, value). value is
        None unless the search returned 0.

        Note: key is handed to the cursor as-is and is not routed through
        self.key(), so callers must pass it already in the table's key format.
        """
        cur = session.open_cursor(self.uri)
        try:
            with self.transaction(session=session, read_timestamp=ts, rollback=True):
                cur.set_key(key)
                ret = cur.search()
                val = cur.get_value() if ret == 0 else None
            return ret, val
        finally:
            cur.close()

    def evict_range(self, session, start, stop, step=1, read_timestamp=10):
        """
        Evict the page(s) backing keys [start, stop] on the given session.

        Every step-th key in the range is searched and then reset on a cursor
        opened with debug=(release_evict), inside a rolled-back transaction
        reading at read_timestamp (default 10, the value previously hard-coded).
        Keys are used as-is and are not routed through self.key().
        """
        evict_cur = session.open_cursor(self.uri, None, 'debug=(release_evict)')
        try:
            with self.transaction(session=session, read_timestamp=read_timestamp,
              rollback=True):
                for i in range(start, stop + 1, step):
                    evict_cur.set_key(i)
                    evict_cur.search()
                    evict_cur.reset()
        finally:
            evict_cur.close()

    def get_stat(self, conn, stat_key):
        """
        Read a connection statistic on the given connection and return
        element [2] of the statistics cursor entry for stat_key.

        A temporary session and statistics cursor are opened for the read;
        both are closed before returning, including when the lookup raises.
        """
        with closing(conn.open_session('')) as stat_session:
            with closing(stat_session.open_cursor('statistics:')) as stat_cursor:
                return stat_cursor[stat_key][2]

View File

@ -4,6 +4,8 @@
test_autoclose.py
test_config02.py
test_config09.py
test_cursor13.py # FIXME: WT-15369
test_cursor21.py # FIXME: WT-15369
test_drop03.py
test_dump.py
test_dump01.py

View File

@ -56,15 +56,7 @@ class test_cursor13_base(wttest.WiredTigerTestCase):
def caching_stats(self):
hs_stats_uri = 'statistics:file:WiredTigerHS.wt'
max_tries = 100
# Cursor cache/reopen stats are updated with plain (non-atomic) int64 add/subtract
# operations.
# A recent increment by another core may not yet be visible to this reader. Re-reading
# in a tight Python loop cannot force coherence; the fix is to pause briefly on retry
# so store buffers drain and cache lines propagate.
retry_sleep = 0.005 # seconds
for i in range(max_tries):
if i > 0:
time.sleep(retry_sleep)
hs_stats_before = self.session.open_cursor(hs_stats_uri, None, None)
conn_stats = self.session.open_cursor('statistics:', None, None)
hs_stats_after = self.session.open_cursor(hs_stats_uri, None, None)
@ -86,14 +78,7 @@ class test_cursor13_base(wttest.WiredTigerTestCase):
hs_after[0] += hs_disagg_stat_after[stat.dsrc.cursor_cache][2]
hs_after[1] += hs_disagg_stat_after[stat.dsrc.cursor_reopen][2]
report = [totals[0],
hs_before[0],
hs_disagg_stat_before[stat.dsrc.cursor_cache][2],
hs_stats_before[stat.dsrc.cursor_cache][2]]
self.pr(' '.join(map(str, report)))
hs_disagg_stat_before.close()
hs_disagg_stat_after.close()
self.pr(str(totals[0]) + " " + str(hs_before[0]) + " " + str(hs_disagg_stat_before[stat.dsrc.cursor_cache][2]) + " " + str(hs_stats_before[stat.dsrc.cursor_cache][2]))
hs_stats_before.close()
hs_stats_after.close()
@ -526,7 +511,6 @@ class test_cursor13_big(test_cursor13_big_base):
self.assertEqual(end_stats[0] - begin_stats[0], self.closecount)
self.assertEqual(end_stats[1] - begin_stats[1], self.opencount)
@wttest.skip_for_hook("disagg", "layered dhandles are never swept: FIXME-WT-16982")
class test_cursor13_sweep(test_cursor13_big_base):
# Set dhandle sweep configuration so that dhandles should be closed within
# two seconds of all the cursors for the dhandle being closed (cached).

View File

@ -31,7 +31,7 @@
import wttest
from wtscenario import make_scenarios
from wiredtiger import stat, WiredTigerError
from wiredtiger import stat
class test_cursor21(wttest.WiredTigerTestCase):
uri = "table:test_cursor21"
@ -71,7 +71,6 @@ class test_cursor21(wttest.WiredTigerTestCase):
self.assertEqual(reposition_count, 0)
return reposition_count
@wttest.skip_for_hook("disagg", "layered tables don't support cursor reposition")
def test_cursor21(self):
format = 'key_format={},value_format={}'.format(self.key_format, self.value_format)
reposition_count = 0
@ -126,15 +125,3 @@ class test_cursor21(wttest.WiredTigerTestCase):
reposition_count += self.check_reposition(reposition_count)
cursor.close()
self.session.close()
@wttest.only_for_hook("disagg", "check reposition is disabled for disaggregated storage")
def test_cursor21_dsc(self):
# Skip the test if reposition is disabled or it's column store (unsupported in disagg).
if not self.reposition or self.scenario_name == 'column.reposition':
return
format = 'key_format={},value_format={}'.format(self.key_format, self.value_format)
self.session.create(self.uri, format)
msg = '/Operation not supported/'
self.assertRaisesWithMessage(WiredTigerError,
lambda: self.session.open_cursor(self.uri), msg)

View File

@ -34,7 +34,6 @@ import wiredtiger
from wtscenario import make_scenarios
WT_TS_MAX = 2**64 - 1
WT_UPDATE_PREPARE_ROLLBACK = 0x080
class test_cursor24(wttest.WiredTigerTestCase):
uri = 'file:test_cursor24.wt'

View File

@ -181,7 +181,7 @@ class test_cursor25(wttest.WiredTigerTestCase):
cursor[1] = 10
self.session.commit_transaction("commit_timestamp=" + self.timestamp_str(1))
# Prepared overwrite + rollback. No PREPARE_ROLLBACK tombstone because
# Prepared overwrite + rollback. No rollback tombstone appended because
# first_committed_upd != NULL.
session2 = self.conn.open_session()
cursor2 = session2.open_cursor(self.uri, None)
@ -229,7 +229,7 @@ class test_cursor25(wttest.WiredTigerTestCase):
cursor[1] = 10
self.session.commit_transaction("commit_timestamp=" + self.timestamp_str(1))
# Prepared delete + rollback. No PREPARE_ROLLBACK tombstone because
# Prepared delete + rollback. No rollback tombstone appended because
# first_committed_upd != NULL.
session2 = self.conn.open_session()
cursor2 = session2.open_cursor(self.uri, None)

View File

@ -199,10 +199,13 @@ class test_layered69(test_prepare_preserve_prepare_base):
session_prepare.rollback_transaction(f'rollback_timestamp={self.timestamp_str(45)}')
session_prepare.close()
# Verify checkpoint skips writing a page to disk
# Verify checkpoint skips writing a page to disk. When the page was evicted before the
# prepare, the prior committed delete tombstone is gone from memory, so the prepare
# rollback appends a fresh tail tombstone with no durable flag set; that tombstone gets
# re-saved and causes one extra write here.
self.checkpoint_and_verify_stats({
wiredtiger.stat.dsrc.rec_time_window_prepared: False,
stat: False,
stat: self.evict,
}, self.uri)
# Make stable timestamp equal to prepare timestamp - this should allow checkpoint to reconcile prepared update

View File

@ -28,12 +28,13 @@
import unittest, wttest, wiredtiger
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
# test_layered_fast_truncate01.py
# Test basic fast truncate functionality.
@disagg_test_class
class test_layered_fast_truncate01(wttest.WiredTigerTestCase):
class test_layered_fast_truncate01(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
conn_config = 'disaggregated=(role="leader"),'
@ -48,6 +49,9 @@ class test_layered_fast_truncate01(wttest.WiredTigerTestCase):
nitems = 1000
def key(self, n):
return str(n)
def session_create_config(self):
cfg = 'key_format=S,value_format=S'
if self.uri.startswith('table'):

View File

@ -32,10 +32,11 @@
import wiredtiger, wttest
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
@disagg_test_class
class test_layered_fast_truncate02(wttest.WiredTigerTestCase):
class test_layered_fast_truncate02(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
uri = 'layered:test_layered_fast_truncate02'
nrows = 5000
@ -48,11 +49,6 @@ class test_layered_fast_truncate02(wttest.WiredTigerTestCase):
disagg_storages = gen_disagg_storages('test_layered_fast_truncate02', disagg_only=True)
scenarios = make_scenarios(disagg_storages)
def leader_checkpoint(self, ts):
self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(ts) +
',oldest_timestamp=' + self.timestamp_str(1))
self.session.checkpoint()
def setup_leader(self):
self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1))
self.session.create(self.uri, 'key_format=i,value_format=S')
@ -74,44 +70,12 @@ class test_layered_fast_truncate02(wttest.WiredTigerTestCase):
evict_cur.close()
self.session.rollback_transaction()
def truncate_and_checkpoint(self, trunc_start, trunc_stop, ts):
# Fast-truncate rows [trunc_start, trunc_stop] on the leader and checkpoint.
c_start = self.session.open_cursor(self.uri)
c_start.set_key(trunc_start)
c_stop = self.session.open_cursor(self.uri)
c_stop.set_key(trunc_stop)
self.session.begin_transaction()
self.session.truncate(None, c_start, c_stop, None)
self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
c_start.close()
c_stop.close()
self.leader_checkpoint(ts)
def open_follower(self):
conn = self.wiredtiger_open(
'follower',
self.extensionsConfig() + ',create,cache_size=50MB,statistics=(all),disaggregated=(role="follower")')
sess = conn.open_session('')
sess.create(self.uri, 'key_format=i,value_format=S')
self.disagg_advance_checkpoint(conn, self.conn)
return conn, sess
def search_at(self, sess, key, ts):
cur = sess.open_cursor(self.uri)
txn_cfg = ('read_timestamp=' + self.timestamp_str(ts))
sess.begin_transaction(txn_cfg)
cur.set_key(key)
ret = cur.search()
val = cur.get_value() if ret == 0 else None
sess.rollback_transaction()
cur.close()
return ret, val
def test_visibility(self):
# At ts=20 (equal to truncation at ts=20): truncated keys return WT_NOTFOUND, boundary and
# exterior keys return their values. At ts=15 (before truncation): all keys are visible.
self.setup_leader()
self.truncate_and_checkpoint(self.trunc_start, self.trunc_stop, 20)
self.truncate(self.trunc_start, self.trunc_stop, commit_timestamp=20)
self.leader_checkpoint(20)
conn, sess = self.open_follower()
# Truncation is visible: deleted keys are gone, surrounding keys survive.
@ -137,7 +101,8 @@ class test_layered_fast_truncate02(wttest.WiredTigerTestCase):
# Reading at a timestamp before the truncation must still find all rows, including those
# later deleted. Verifies MVCC correctness across the follower checkpoint boundary.
self.setup_leader()
self.truncate_and_checkpoint(self.trunc_start, self.trunc_stop, 20)
self.truncate(self.trunc_start, self.trunc_stop, commit_timestamp=20)
self.leader_checkpoint(20)
conn, sess = self.open_follower()
for key in [self.trunc_start, self.trunc_mid, self.trunc_stop]:
@ -161,7 +126,8 @@ class test_layered_fast_truncate02(wttest.WiredTigerTestCase):
# Forward and backward scans must skip the entire truncated range without visiting any
# deleted key. search_near on a deleted key must land outside the range.
self.setup_leader()
self.truncate_and_checkpoint(self.trunc_start, self.trunc_stop, 20)
self.truncate(self.trunc_start, self.trunc_stop, commit_timestamp=20)
self.leader_checkpoint(20)
conn, sess = self.open_follower()
expected = self.nrows - (self.trunc_stop - self.trunc_start + 1)

View File

@ -33,11 +33,12 @@
import wiredtiger, wttest
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
from wiredtiger import stat
@disagg_test_class
class test_layered_fast_truncate03(wttest.WiredTigerTestCase):
class test_layered_fast_truncate03(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
uri = 'layered:test_layered_fast_truncate03'
nrows = 5000
@ -49,17 +50,6 @@ class test_layered_fast_truncate03(wttest.WiredTigerTestCase):
disagg_storages = gen_disagg_storages('test_layered_fast_truncate03', disagg_only=True)
scenarios = make_scenarios(disagg_storages)
def get_stat(self, conn, stat_key):
s = conn.open_session('')
val = s.open_cursor('statistics:')[stat_key][2]
s.close()
return val
def leader_checkpoint(self, ts):
self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(ts) +
',oldest_timestamp=' + self.timestamp_str(1))
self.session.checkpoint()
def setup_leader(self, extra_cfg=''):
self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1))
self.session.create(self.uri, 'key_format=i,value_format=S' + extra_cfg)
@ -81,58 +71,16 @@ class test_layered_fast_truncate03(wttest.WiredTigerTestCase):
evict_cur.close()
self.session.rollback_transaction()
def truncate_and_checkpoint(self, trunc_start, trunc_stop, ts):
# Fast-truncate rows [trunc_start, trunc_stop] on the leader and checkpoint.
c_start = self.session.open_cursor(self.uri)
c_start.set_key(trunc_start)
c_stop = self.session.open_cursor(self.uri)
c_stop.set_key(trunc_stop)
self.session.begin_transaction()
self.session.truncate(None, c_start, c_stop, None)
self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
c_start.close()
c_stop.close()
self.leader_checkpoint(ts)
def open_follower(self):
conn = self.wiredtiger_open(
'follower',
self.extensionsConfig() + ',create,cache_size=50MB,statistics=(all),disaggregated=(role="follower")')
sess = conn.open_session('')
sess.create(self.uri, 'key_format=i,value_format=S')
self.disagg_advance_checkpoint(conn, self.conn)
return conn, sess
def advance_follower(self, conn):
self.leader_checkpoint(20)
self.disagg_advance_checkpoint(conn, self.conn)
def evict_range(self, sess, start, stop, step=1):
evict_cur = sess.open_cursor(self.uri, None, 'debug=(release_evict)')
sess.begin_transaction('read_timestamp=' + self.timestamp_str(10))
for i in range(start, stop + 1, step):
evict_cur.set_key(i)
evict_cur.search()
evict_cur.reset()
evict_cur.close()
sess.rollback_transaction()
def search_at(self, sess, key, ts):
cur = sess.open_cursor(self.uri)
txn_cfg = ('read_timestamp=' + self.timestamp_str(ts))
sess.begin_transaction(txn_cfg)
cur.set_key(key)
ret = cur.search()
val = cur.get_value() if ret == 0 else None
sess.rollback_transaction()
cur.close()
return ret, val
def test_no_dirty_on_read(self):
# Reading fast-truncated pages on the follower must never dirty them. Verifies this holds
# across a full load-evict-reload cycle for both single and bulk page reads.
self.setup_leader()
self.truncate_and_checkpoint(self.trunc_start, self.trunc_stop, 20)
self.truncate(self.trunc_start, self.trunc_stop, commit_timestamp=20)
self.leader_checkpoint(20)
conn, sess = self.open_follower()
sample = list(range(self.trunc_start, self.trunc_stop + 1, 10))
dirty_before = self.get_stat(conn, stat.conn.cache_pages_dirty)
@ -168,7 +116,8 @@ class test_layered_fast_truncate03(wttest.WiredTigerTestCase):
# restore a subset of truncated keys, those keys must be visible while the rest
# remain deleted.
self.setup_leader(',leaf_page_max=4096')
self.truncate_and_checkpoint(self.trunc_start, self.trunc_stop, 20)
self.truncate(self.trunc_start, self.trunc_stop, commit_timestamp=20)
self.leader_checkpoint(20)
conn, sess = self.open_follower()
sample = list(range(self.trunc_start, self.trunc_stop + 1, 10))
dirty_before = self.get_stat(conn, stat.conn.cache_pages_dirty)
@ -226,7 +175,8 @@ class test_layered_fast_truncate03(wttest.WiredTigerTestCase):
# Closing and reopening the follower connection must not lose the deleted state.
# The same checkpoint must still show truncated keys as WT_NOTFOUND after a cold start.
self.setup_leader()
self.truncate_and_checkpoint(self.trunc_start, self.trunc_stop, 20)
self.truncate(self.trunc_start, self.trunc_stop, commit_timestamp=20)
self.leader_checkpoint(20)
truncated_keys = [self.trunc_start, self.trunc_start + 100, self.trunc_stop]
non_truncated_keys = [1, self.trunc_start - 1, self.trunc_stop + 1, self.nrows]
@ -250,7 +200,8 @@ class test_layered_fast_truncate03(wttest.WiredTigerTestCase):
# Reading a deleted page at a timestamp before the truncation forces it to load from disk.
# The key must be found, cache_read_deleted must increment, and the page must not be dirtied.
self.setup_leader()
self.truncate_and_checkpoint(self.trunc_start, self.trunc_stop, 20)
self.truncate(self.trunc_start, self.trunc_stop, commit_timestamp=20)
self.leader_checkpoint(20)
conn, sess = self.open_follower()
dirty_before = self.get_stat(conn, stat.conn.cache_pages_dirty)

View File

@ -26,9 +26,9 @@
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
import unittest
import wttest, wiredtiger
import wttest
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
# test_layered_fast_truncate04.py
@ -37,7 +37,7 @@ from wtscenario import make_scenarios
# open-ended truncation, multiple truncated ranges, and mixed
# update-then-truncate workloads.
@disagg_test_class
class test_layered_fast_truncate04(wttest.WiredTigerTestCase):
class test_layered_fast_truncate04(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
conn_config = 'disaggregated=(role="leader"),'
@ -54,8 +54,7 @@ class test_layered_fast_truncate04(wttest.WiredTigerTestCase):
# digits so that lexicographic order matches numeric order.
nitems = 1000
@staticmethod
def key(n):
def key(self, n):
return f'{n:04d}'
def session_create_config(self):
@ -66,104 +65,35 @@ class test_layered_fast_truncate04(wttest.WiredTigerTestCase):
# Populate the table on the leader, checkpoint, then reopen as follower.
def setup_follower(self):
self.session.create(self.uri, self.session_create_config())
cursor = self.session.open_cursor(self.uri)
for i in range(self.nitems):
self.session.begin_transaction()
cursor[self.key(i)] = 'value'
self.session.commit_transaction()
cursor.close()
self.session.checkpoint()
self.setup_leader(keys=range(self.nitems))
super().setup_follower()
follower_config = (
'disaggregated=(role="follower",'
f'checkpoint_meta="{self.disagg_get_complete_checkpoint_meta()}")'
)
self.reopen_conn(config=follower_config)
# Truncate the range [start, stop] (inclusive). If stop is None, truncate
# from start to the end of the table.
def truncate_range(self, start, stop):
c1 = self.session.open_cursor(self.uri)
c1.set_key(self.key(start))
c2 = None
if stop is not None:
c2 = self.session.open_cursor(self.uri)
c2.set_key(self.key(stop))
self.session.begin_transaction()
self.session.truncate(None, c1, c2, None)
self.session.commit_transaction()
c1.close()
if c2 is not None:
c2.close()
# Return all keys visible via a forward scan.
def scan_forward(self):
cursor = self.session.open_cursor(self.uri)
self.session.begin_transaction()
keys = []
while cursor.next() == 0:
keys.append(cursor.get_key())
self.session.rollback_transaction()
cursor.close()
return keys
# Return all keys visible via a backward scan.
def scan_backward(self):
cursor = self.session.open_cursor(self.uri)
self.session.begin_transaction()
keys = []
while cursor.prev() == 0:
keys.append(cursor.get_key())
self.session.rollback_transaction()
cursor.close()
return list(reversed(keys)) # reverse so order matches forward scan
# Return all keys visible via a forward and a backward scan; assert both
# match the expected list.
def assert_scan(self, expected):
self.assertEqual(self.visible_keys(), expected, 'forward scan mismatch')
self.assertEqual(list(reversed(self.visible_keys(forward=False))), expected,
'backward scan mismatch')
# Run search_near in its own transaction; return (exact, landed_key).
def search_near(self, key):
cursor = self.session.open_cursor(self.uri)
self.session.begin_transaction()
cursor.set_key(self.key(key))
exact = cursor.search_near()
landed = cursor.get_key()
self.session.rollback_transaction()
cursor.close()
return exact, landed
# Run search in its own transaction; return the return value (0 or WT_NOTFOUND).
def search(self, key):
cursor = self.session.open_cursor(self.uri)
self.session.begin_transaction()
cursor.set_key(self.key(key))
ret = cursor.search()
self.session.rollback_transaction()
cursor.close()
return ret
# Assert forward and backward scans both return the expected key list.
def assert_scan(self, expected):
self.assertEqual(self.scan_forward(), expected, 'forward scan mismatch')
self.assertEqual(self.scan_backward(), expected, 'backward scan mismatch')
return self.search_near_key(key)
# Write a single key/value pair in its own transaction.
def put(self, key, value='v'):
cursor = self.session.open_cursor(self.uri)
self.session.begin_transaction()
cursor[self.key(key)] = value
self.session.commit_transaction()
cursor.close()
self.populate([key], value=value)
def test_cursor_scan_skips_truncated_range(self):
# Forward and backward scans must skip every key in the truncated range.
self.setup_follower()
self.truncate_range(100, 700)
self.truncate(100, 700)
self.assert_scan([self.key(i) for i in range(self.nitems) if i < 100 or i > 700])
def test_search_near_inside_truncated_range(self):
# search_near for a key deep inside a truncated range must land outside
# the range and must not report an exact match.
self.setup_follower()
self.truncate_range(100, 700)
self.truncate(100, 700)
exact, landed = self.search_near(400)
self.assertFalse(self.key(100) <= landed <= self.key(700),
@ -175,7 +105,7 @@ class test_layered_fast_truncate04(wttest.WiredTigerTestCase):
# as candidates for search_near. Test both directions by placing the
# single visible ingest key above or below the search key.
self.setup_follower()
self.truncate_range(0, self.nitems - 1)
self.truncate(0, self.nitems - 1)
# Scenario 1: ingest 0600 above search key 0500 forward (exact=1).
self.put(600, 'ingest-live')
@ -197,7 +127,7 @@ class test_layered_fast_truncate04(wttest.WiredTigerTestCase):
# The start and stop keys of the range are inclusive, so search_near at
# either boundary must land strictly outside the range.
self.setup_follower()
self.truncate_range(100, 700)
self.truncate(100, 700)
for boundary in (100, 700):
_, landed = self.search_near(boundary)
@ -207,22 +137,22 @@ class test_layered_fast_truncate04(wttest.WiredTigerTestCase):
def test_truncate_to_end_of_table(self):
# Open-ended truncate from key 500; only 0-499 remain visible.
self.setup_follower()
self.truncate_range(500, None)
self.truncate(500, None)
self.assert_scan([self.key(i) for i in range(500)])
def test_multiple_truncate_ranges(self):
# Two disjoint bounded ranges; scans must skip both.
self.setup_follower()
self.truncate_range(100, 300)
self.truncate_range(600, 800)
self.truncate(100, 300)
self.truncate(600, 800)
self.assert_scan([self.key(i) for i in range(self.nitems)
if not (100 <= i <= 300) and not (600 <= i <= 800)])
def test_mixed_bounded_and_open_ended_truncates(self):
# Bounded [100, 300] combined with open-ended [600, end]; 0-99 and 301-599 visible.
self.setup_follower()
self.truncate_range(100, 300)
self.truncate_range(600, None)
self.truncate(100, 300)
self.truncate(600, None)
self.assert_scan([self.key(i) for i in range(self.nitems)
if i < 100 or (301 <= i <= 599)])
@ -230,7 +160,7 @@ class test_layered_fast_truncate04(wttest.WiredTigerTestCase):
# Open-ended truncate captures a snapshot of "end" at commit time. Keys
# appended afterwards are new data and must remain visible.
self.setup_follower()
self.truncate_range(800, None)
self.truncate(800, None)
for i in range(1000, 1100):
self.put(i, 'appended')
@ -244,23 +174,23 @@ class test_layered_fast_truncate04(wttest.WiredTigerTestCase):
self.setup_follower()
for i in range(200, 401):
self.put(i, 'updated')
self.truncate_range(100, 700)
self.truncate(100, 700)
self.assert_scan([self.key(i) for i in range(self.nitems) if i < 100 or i > 700])
self.assertEqual(self.search(300), wiredtiger.WT_NOTFOUND,
self.assertFalse(self.key_exists(300),
'search must hide an updated-then-truncated key')
def test_search_returns_not_found_in_truncated_range(self):
# search() goes through a different read path than scans and search_near;
# both boundaries and interior keys must return WT_NOTFOUND.
self.setup_follower()
self.truncate_range(100, 700)
self.truncate(100, 700)
for k in (400, 100, 700):
self.assertEqual(self.search(k), wiredtiger.WT_NOTFOUND,
self.assertFalse(self.key_exists(k),
f'search({self.key(k)}) inside range must be hidden')
for k in (99, 701):
self.assertEqual(self.search(k), 0,
self.assertTrue(self.key_exists(k),
f'search({self.key(k)}) outside range must succeed')
def test_search_near_direction_in_truncated_range(self):
@ -269,24 +199,24 @@ class test_layered_fast_truncate04(wttest.WiredTigerTestCase):
self.setup_follower()
# Bounded range [100, 700]. Forward finds 0701.
self.truncate_range(100, 700)
self.truncate(100, 700)
self.assertEqual(self.search_near(400), (1, self.key(701)), 'forward scenario')
# Add open-ended truncate [800, end]. Forward exhausts, falls back to 0799.
self.truncate_range(800, None)
self.truncate(800, None)
self.assertEqual(self.search_near(900), (-1, self.key(799)), 'backward scenario')
def test_overlapping_truncated_ranges_scan(self):
# Two overlapping ranges [100, 400] and [300, 700]: scans must skip the
# full union [100, 700], not just one range at a time.
self.setup_follower()
self.truncate_range(100, 400)
self.truncate_range(300, 700)
self.truncate(100, 400)
self.truncate(300, 700)
self.assert_scan([self.key(i) for i in range(self.nitems)
if i < 100 or i > 700])
def test_entire_table_truncated(self):
# Truncate every key; both scans must be empty.
self.setup_follower()
self.truncate_range(0, self.nitems - 1)
self.truncate(0, self.nitems - 1)
self.assert_scan([])

View File

@ -28,6 +28,7 @@
import wttest
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
# test_layered_fast_truncate05.py
@ -35,7 +36,7 @@ from wtscenario import make_scenarios
# standby (follower) node.
@disagg_test_class
class test_layered_fast_truncate05(wttest.WiredTigerTestCase):
class test_layered_fast_truncate05(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
conn_config = 'disaggregated=(role="leader"),'
@ -52,8 +53,7 @@ class test_layered_fast_truncate05(wttest.WiredTigerTestCase):
# digits so that lexicographic order matches numeric order.
nitems = 1000
# Format integer `n` as a string zero-padded to at least four digits.
# Defined as a regular method, so existing `self.key(n)` call sites keep working.
def key(self, n):
    return f'{n:04d}'
def session_create_config(self):
@ -64,36 +64,8 @@ class test_layered_fast_truncate05(wttest.WiredTigerTestCase):
# Populate the table on the leader, checkpoint, then reopen as follower.
def setup_follower(self):
self.session.create(self.uri, self.session_create_config())
cursor = self.session.open_cursor(self.uri)
for i in range(self.nitems):
self.session.begin_transaction()
cursor[self.key(i)] = 'value'
self.session.commit_transaction()
cursor.close()
self.session.checkpoint()
follower_config = (
'disaggregated=(role="follower",'
f'checkpoint_meta="{self.disagg_get_complete_checkpoint_meta()}")'
)
self.reopen_conn(config=follower_config)
# Truncate the range [start, stop] (inclusive). If stop is None, truncate
# from start to the end of the table.
def truncate_range(self, start, stop):
c1 = self.session.open_cursor(self.uri)
c1.set_key(self.key(start))
c2 = None
if stop is not None:
c2 = self.session.open_cursor(self.uri)
c2.set_key(self.key(stop))
self.session.begin_transaction()
self.session.truncate(None, c1, c2, None)
self.session.commit_transaction()
c1.close()
if c2 is not None:
c2.close()
self.setup_leader(keys=range(self.nitems))
super().setup_follower()
# Draw `samples` random keys and assert none fall inside [low, high].
def sample_assert_random(self, low, high, samples=200):
@ -110,7 +82,7 @@ class test_layered_fast_truncate05(wttest.WiredTigerTestCase):
def test_random_cursor_skips_truncated_range(self):
# 200 random samples must all land outside the truncated range.
self.setup_follower()
self.truncate_range(100, 700)
self.truncate(100, 700)
self.sample_assert_random(100, 700)
def test_random_cursor_skips_truncated_range_with_live_ingest(self):
@ -125,5 +97,5 @@ class test_layered_fast_truncate05(wttest.WiredTigerTestCase):
self.session.commit_transaction()
cursor.close()
self.truncate_range(100, 700)
self.truncate(100, 700)
self.sample_assert_random(100, 700)

View File

@ -34,10 +34,11 @@
import wttest
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
@disagg_test_class
class test_layered_fast_truncate06(wttest.WiredTigerTestCase):
class test_layered_fast_truncate06(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
conn_config = 'disaggregated=(role="leader"),'
nrows = 100
@ -50,14 +51,6 @@ class test_layered_fast_truncate06(wttest.WiredTigerTestCase):
'test_layered_fast_truncate06', disagg_only=True)
scenarios = make_scenarios(disagg_storages, uris)
def visible_keys(self):
c = self.session.open_cursor(self.uri)
keys = []
while c.next() == 0:
keys.append(c.get_key())
c.close()
return keys
def session_create_config(self):
cfg = 'key_format=i,value_format=S'
if self.uri.startswith('table:'):
@ -65,8 +58,8 @@ class test_layered_fast_truncate06(wttest.WiredTigerTestCase):
return cfg
def setup_follower(self):
# Create the table on the leader, load nrows, checkpoint, then reopen the
# connection as a follower picking up that checkpoint.
# Create the table on the leader, load nrows with per-row commit timestamps,
# checkpoint, then reopen the connection as a follower picking up that checkpoint.
self.session.create(self.uri, self.session_create_config())
cursor = self.session.open_cursor(self.uri)
@ -77,32 +70,29 @@ class test_layered_fast_truncate06(wttest.WiredTigerTestCase):
cursor.close()
self.session.checkpoint()
follower_config = ('disaggregated=(role="follower",'
f'checkpoint_meta="{self.disagg_get_complete_checkpoint_meta()}")')
self.reopen_conn(config=follower_config)
super().setup_follower()
# Truncate the range delimited by `start` and `stop` in a single committed
# transaction, using one cursor per bound.
def follower_truncate(self, start, stop):
    session = self.session

    # Open and set the start cursor first, then the stop cursor.
    bound_cursors = []
    for bound in (start, stop):
        cur = session.open_cursor(self.uri)
        cur.set_key(bound)
        bound_cursors.append(cur)
    first, last = bound_cursors

    session.begin_transaction()
    session.truncate(None, first, last, None)
    session.commit_transaction()

    for cur in bound_cursors:
        cur.close()
def visible_keys_simple(self):
    # Forward scan that does not begin a transaction of its own; the test
    # checks what a scan outside an explicit transaction returns.
    cur = self.session.open_cursor(self.uri)
    found = []
    while True:
        # Any non-zero return from next() ends the scan.
        if cur.next() != 0:
            break
        found.append(cur.get_key())
    cur.close()
    return found
def test_verify_preserves_follower_truncate(self):
self.setup_follower()
self.follower_truncate(30, 60)
self.truncate(30, 60)
expected = [i for i in range(1, self.nrows + 1) if i < 30 or i > 60]
# Before verify: a scan does not return the truncated rows.
self.assertEqual(self.visible_keys(), expected)
self.assertEqual(self.visible_keys_simple(), expected)
# Verify the layered URI. This triggers a close + reopen of the dhandle.
self.session.verify(self.uri)
# After verify: a scan must still not return the truncated rows.
self.assertEqual(self.visible_keys(), expected)
self.assertEqual(self.visible_keys_simple(), expected)

View File

@ -26,7 +26,7 @@
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
# test_layered_fast_truncate06.py
# test_layered_fast_truncate07.py
# Follower-initiated truncate stores a bounded range in the truncate list.
# Verifies NULL start/stop from the session API are resolved to the table's
# first/last visible key, both via the verbose log line and by the row set
@ -34,19 +34,20 @@
import wttest
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
@disagg_test_class
class test_layered_fast_truncate06(wttest.WiredTigerTestCase):
class test_layered_fast_truncate07(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
conn_config = 'verbose=[layered:3],disaggregated=(role="leader"),'
uri = 'layered:test_layered_fast_truncate06'
uri = 'layered:test_layered_fast_truncate07'
key_formats = [
('string', dict(key_format='S')),
('int', dict(key_format='i')),
]
disagg_storages = gen_disagg_storages('test_layered_fast_truncate06', disagg_only=True)
disagg_storages = gen_disagg_storages('test_layered_fast_truncate07', disagg_only=True)
scenarios = make_scenarios(disagg_storages, key_formats)
nitems = 100
@ -59,42 +60,17 @@ class test_layered_fast_truncate06(wttest.WiredTigerTestCase):
# Text form of key number `n`: zero-padded to four digits when key_format is
# 'S', plain str(n) for any other key format.
def key_str(self, n):
    if self.key_format == 'S':
        return f'{n:04d}'
    return str(n)
def session_create_config(self):
return f'key_format={self.key_format},value_format=S'
# Create and load the table, checkpoint it, then reopen the connection in
# the follower role attached to that checkpoint.
def setup_follower(self):
    # Create the table once, using the shared config builder.
    self.session.create(self.uri, self.session_create_config())
    self.insert_range(1, self.nitems)
    self.session.checkpoint()
    # The follower keeps the layered verbose setting and points at the
    # checkpoint metadata reported by disagg_get_complete_checkpoint_meta().
    follower_config = ('verbose=[layered:3],disaggregated=(role="follower",'
                       f'checkpoint_meta="{self.disagg_get_complete_checkpoint_meta()}")')
    self.reopen_conn(config=follower_config)
def truncate(self, start=None, stop=None):
c_start = c_stop = None
if start is not None:
c_start = self.session.open_cursor(self.uri)
c_start.set_key(self.key(start))
if stop is not None:
c_stop = self.session.open_cursor(self.uri)
c_stop.set_key(self.key(stop))
# Use the table uri if both start and stop cursors are not given.
uri = self.uri if (c_start is None and c_stop is None) else None
self.session.begin_transaction()
self.session.truncate(uri, c_start, c_stop, None)
self.session.commit_transaction()
if c_start is not None:
c_start.close()
if c_stop is not None:
c_stop.close()
def visible_keys(self, forward=True):
c = self.session.open_cursor(self.uri)
step = c.next if forward else c.prev
keys = []
while step() == 0:
keys.append(c.get_key())
c.close()
return keys
def insert_range(self, lo, hi):
c = self.session.open_cursor(self.uri)
for i in range(lo, hi + 1):
@ -103,6 +79,16 @@ class test_layered_fast_truncate06(wttest.WiredTigerTestCase):
self.session.commit_transaction()
c.close()
def follower_visible_keys(self, forward=True):
    # Scan the table without beginning a transaction and collect every key,
    # stepping with next() when `forward` is true and prev() otherwise. Keys
    # are returned in the order the cursor visited them.
    cur = self.session.open_cursor(self.uri)
    advance = getattr(cur, 'next' if forward else 'prev')
    collected = []
    while True:
        if advance() != 0:
            break
        collected.append(cur.get_key())
    cur.close()
    return collected
# Keys in [1, nitems] minus [start, stop] (inclusive on both ends).
def expected_keys(self, start, stop):
return [self.key(i) for i in range(1, self.nitems + 1)
@ -117,59 +103,59 @@ class test_layered_fast_truncate06(wttest.WiredTigerTestCase):
def test_bounded_range(self):
self.setup_follower()
self.truncate(start=30, stop=60)
self.truncate(start_key=30, stop_key=60)
self.assert_trunc_log(30, 60)
self.assertEqual(self.visible_keys(), self.expected_keys(30, 60))
self.assertEqual(self.follower_visible_keys(), self.expected_keys(30, 60))
def test_null_start_resolves_to_first_key(self):
self.setup_follower()
self.truncate(start=None, stop=60)
self.truncate(start_key=None, stop_key=60)
self.assert_trunc_log(1, 60)
self.assertEqual(self.visible_keys(), self.expected_keys(1, 60))
self.assertEqual(self.follower_visible_keys(), self.expected_keys(1, 60))
def test_null_stop_resolves_to_last_key(self):
self.setup_follower()
self.truncate(start=30, stop=None)
self.truncate(start_key=30, stop_key=None)
self.assert_trunc_log(30, self.nitems)
self.assertEqual(self.visible_keys(), self.expected_keys(30, self.nitems))
self.assertEqual(self.follower_visible_keys(), self.expected_keys(30, self.nitems))
def test_both_null_is_full_table(self):
self.setup_follower()
self.truncate(start=None, stop=None)
self.truncate(start_key=None, stop_key=None)
self.assert_trunc_log(1, self.nitems)
self.assertEqual(self.visible_keys(), [])
self.assertEqual(self.follower_visible_keys(), [])
# An open-ended truncate captures "end" at commit time, not dynamically. Keys appended
# after stop should be visible.
def test_open_ended_truncate_does_not_hide_later_appends(self):
self.setup_follower()
self.truncate(start=80, stop=None)
self.truncate(start_key=80, stop_key=None)
self.assert_trunc_log(80, self.nitems)
self.insert_range(200, 210)
expected = [self.key(i) for i in range(1, 80)] + \
[self.key(i) for i in range(200, 211)]
self.assertEqual(self.visible_keys(), expected)
self.assertEqual(self.follower_visible_keys(), expected)
def test_bounded_and_end_open_ended_overlap(self):
self.setup_follower()
self.truncate(start=20, stop=60)
self.truncate(start_key=20, stop_key=60)
self.assert_trunc_log(20, 60)
self.truncate(start=50, stop=None)
self.truncate(start_key=50, stop_key=None)
# key 50-60 was deleted by the first truncate; search_near positions it on the
# nearest in-bound key, 61.
self.assert_trunc_log(61, self.nitems)
expected = [self.key(i) for i in range(1, 20)]
self.assertEqual(self.visible_keys(), expected)
self.assertEqual(self.visible_keys(forward=False), list(reversed(expected)))
self.assertEqual(self.follower_visible_keys(), expected)
self.assertEqual(self.follower_visible_keys(forward=False), list(reversed(expected)))
def test_bounded_and_start_open_ended_overlap(self):
self.setup_follower()
self.truncate(start=20, stop=60)
self.truncate(start_key=20, stop_key=60)
self.assert_trunc_log(20, 60)
self.truncate(start=0, stop=30)
self.truncate(start_key=0, stop_key=30)
# key 20-30 was deleted by the first truncate; search_near positions it on the
# nearest live key, 19.
self.assert_trunc_log(1, 19)
expected = [self.key(i) for i in range(61, self.nitems + 1)]
self.assertEqual(self.visible_keys(), expected)
self.assertEqual(self.visible_keys(forward=False), list(reversed(expected)))
self.assertEqual(self.follower_visible_keys(), expected)
self.assertEqual(self.follower_visible_keys(forward=False), list(reversed(expected)))

View File

@ -33,68 +33,59 @@
from contextlib import closing
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
import wttest
@disagg_test_class
class test_layered_fast_truncate08(wttest.WiredTigerTestCase):
class test_layered_fast_truncate08(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
test_name = __qualname__
disagg_storages = gen_disagg_storages(test_name, disagg_only=True)
scenarios = make_scenarios(disagg_storages)
conn_config = 'disaggregated=(role="leader"),'
def setup_layered_table(self, layered_uri: str):
uri = f"layered:{test_name}"
def session_create_config(self):
return "key_format=i,value_format=u"
# Store `value` under every key in `keys`, all inside one transaction.
# The cursor is opened before the transaction starts and closed after it ends.
def populate(self, keys, value=b"v"):
    cursor_ctx = closing(self.session.open_cursor(self.uri))
    with cursor_ctx as cur, self.transaction():
        for k in keys:
            cur[k] = value
def setup_layered_table(self):
# Create the table and produce the initial checkpoint that the follower
# will attach to.
session_config = "key_format=i,value_format=u"
self.session.create(layered_uri, session_config)
self.session.checkpoint()
def setup_follower(self, layered_uri: str):
self.reopen_disagg_conn('disaggregated=(role="follower"),')
self.setup_leader()
def setup_follower(self, keys=range(100)):
super().setup_follower()
# Add updates on the ingest that can be truncated later.
with closing(self.session.open_cursor(layered_uri)) as cursor:
with self.transaction():
for i in range(100):
cursor[i] = b"v"
self.populate(keys)
def truncate(self, layered_uri: str, start_key: int, stop_key: int):
# Truncate between start and stop keys inclusive.
with (
closing(self.session.open_cursor(layered_uri)) as start_cursor,
closing(self.session.open_cursor(layered_uri)) as stop_cursor,
):
start_cursor.set_key(start_key)
stop_cursor.set_key(stop_key)
with self.transaction():
self.session.truncate(None, start_cursor, stop_cursor, None)
# Collect the values stored under the integer keys start_key..stop_key
# (both ends included) in the object named by `uri`.
#
# uri:       object to open the cursor on.
# start_key: first key to probe.
# stop_key:  last key to probe.
# Returns the values of the keys whose search() returned 0, in key order;
# keys with any other search() result are skipped.
def get_values(self, uri, start_key, stop_key):
    # Return values of any keys between start and stop inclusive that exist.
    values = []
    with closing(self.session.open_cursor(uri)) as cursor:
        for i in range(start_key, stop_key + 1):
            cursor.set_key(i)
            if cursor.search() == 0:
                values.append(cursor.get_value())
    return values
def test_follower_truncate_writes_tombstone_to_ingest(self):
# Set up a follower with existing ingest updates.
layered_uri = f"layered:{self.test_name}"
self.setup_layered_table(layered_uri)
self.setup_follower(layered_uri)
self.setup_layered_table()
self.setup_follower()
# Truncate a range of keys.
start_key = 20
stop_key = 80
self.truncate(layered_uri, start_key, stop_key)
self.truncate(start_key, stop_key)
# Examine what the truncate actually wrote to the ingest file.
ingest_uri = f"file:{self.test_name}.wt_ingest"

View File

@ -28,12 +28,13 @@
import wiredtiger, wttest
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
# test_layered_fast_truncate09.py
# Follower truncate-list visibility coverage.
@disagg_test_class
class test_layered_fast_truncate09(wttest.WiredTigerTestCase):
class test_layered_fast_truncate09(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
conn_config = 'disaggregated=(role="leader"),'
@ -49,7 +50,6 @@ class test_layered_fast_truncate09(wttest.WiredTigerTestCase):
def setUp(self):
super().setUp()
self.setup_follower()
def session_create_config(self):
@ -85,7 +85,7 @@ class test_layered_fast_truncate09(wttest.WiredTigerTestCase):
c_start.close()
c_stop.close()
def search_key(self, session, key):
def search_in(self, session, key):
cursor = session.open_cursor(self.uri)
cursor.set_key(key)
ret = cursor.search()
@ -93,7 +93,7 @@ class test_layered_fast_truncate09(wttest.WiredTigerTestCase):
cursor.close()
return ret, value
def search_near_key(self, session, key):
def search_near_in(self, session, key):
cursor = session.open_cursor(self.uri)
cursor.set_key(key)
exact = cursor.search_near()
@ -118,9 +118,9 @@ class test_layered_fast_truncate09(wttest.WiredTigerTestCase):
with self.transaction(session=self.session, rollback=True):
self.truncate_range(self.session, 100, 700)
ret = self.search_key(self.session, 150)[0]
ret = self.search_in(self.session, 150)[0]
self.assertEqual(ret, wiredtiger.WT_NOTFOUND)
exact, landed = self.search_near_key(self.session, 150)
exact, landed = self.search_near_in(self.session, 150)
self.assertNotEqual(exact, 0)
if exact < 0:
self.assertEqual(landed, 99)
@ -136,8 +136,8 @@ class test_layered_fast_truncate09(wttest.WiredTigerTestCase):
session2 = self.conn.open_session()
try:
with self.transaction(session=session2, rollback=True):
self.assertEqual(self.search_key(session2, 150), (0, 'value'))
self.assertEqual(self.search_near_key(session2, 150), (0, 150))
self.assertEqual(self.search_in(session2, 150), (0, 'value'))
self.assertEqual(self.search_near_in(session2, 150), (0, 150))
self.assertEqual(self.next_key_after(session2, 149), 150)
finally:
session2.close()
@ -145,14 +145,14 @@ class test_layered_fast_truncate09(wttest.WiredTigerTestCase):
def test_rollback_restores_visibility(self):
with self.transaction(session=self.session, rollback=True):
self.truncate_range(self.session, 100, 700)
ret = self.search_key(self.session, 150)[0]
ret = self.search_in(self.session, 150)[0]
self.assertEqual(ret, wiredtiger.WT_NOTFOUND)
session2 = self.conn.open_session()
try:
with self.transaction(session=session2, rollback=True):
self.assertEqual(self.search_key(session2, 150), (0, 'value'))
self.assertEqual(self.search_near_key(session2, 150), (0, 150))
self.assertEqual(self.search_in(session2, 150), (0, 'value'))
self.assertEqual(self.search_near_in(session2, 150), (0, 150))
self.assertEqual(self.next_key_after(session2, 149), 150)
finally:
session2.close()
@ -163,14 +163,14 @@ class test_layered_fast_truncate09(wttest.WiredTigerTestCase):
session2 = self.conn.open_session()
try:
with self.transaction(session=session2, read_timestamp=20, rollback=True):
self.assertEqual(self.search_key(session2, 150), (0, 'value'))
self.assertEqual(self.search_near_key(session2, 150), (0, 150))
self.assertEqual(self.search_in(session2, 150), (0, 'value'))
self.assertEqual(self.search_near_in(session2, 150), (0, 150))
self.assertEqual(self.next_key_after(session2, 149), 150)
with self.transaction(session=session2, read_timestamp=30, rollback=True):
ret = self.search_key(session2, 150)[0]
ret = self.search_in(session2, 150)[0]
self.assertEqual(ret, wiredtiger.WT_NOTFOUND)
exact, landed = self.search_near_key(session2, 150)
exact, landed = self.search_near_in(session2, 150)
self.assertNotEqual(exact, 0)
if exact < 0:
self.assertEqual(landed, 99)
@ -188,16 +188,16 @@ class test_layered_fast_truncate09(wttest.WiredTigerTestCase):
session2 = self.conn.open_session()
try:
with self.transaction(session=session2, read_timestamp=30, rollback=True):
ret = self.search_key(session2, 350)[0]
ret = self.search_in(session2, 350)[0]
self.assertEqual(ret, wiredtiger.WT_NOTFOUND)
self.assertEqual(self.search_key(session2, 500), (0, 'value'))
self.assertEqual(self.search_in(session2, 500), (0, 'value'))
with self.transaction(session=session2, read_timestamp=40, rollback=True):
ret = self.search_key(session2, 350)[0]
ret = self.search_in(session2, 350)[0]
self.assertEqual(ret, wiredtiger.WT_NOTFOUND)
ret = self.search_key(session2, 500)[0]
ret = self.search_in(session2, 500)[0]
self.assertEqual(ret, wiredtiger.WT_NOTFOUND)
exact, landed = self.search_near_key(session2, 150)
exact, landed = self.search_near_in(session2, 150)
self.assertNotEqual(exact, 0)
if exact < 0:
self.assertEqual(landed, 99)

View File

@ -33,26 +33,16 @@
# the logical union of the stable and ingest tables, independent of which
# table any given key actually lives in.
from contextlib import closing
from itertools import chain
from typing import Iterable
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import (
LayeredFastTruncateConfigMixin, concat, range_inclusive,
)
from wtscenario import make_scenarios
import wttest
def concat(*iterables: Iterable[int]) -> list[int]:
    """Flatten the given iterables, in argument order, into one new list."""
    merged: list[int] = []
    for chunk in iterables:
        merged.extend(chunk)
    return merged
def range_inclusive(start: int, stop: int) -> range:
    """Build the range start..stop in which `stop` itself is included."""
    past_the_end = stop + 1
    return range(start, past_the_end)
@disagg_test_class
class test_layered_fast_truncate10(wttest.WiredTigerTestCase):
class test_layered_fast_truncate10(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
"""
Data location semantics (stable vs ingest).
@ -70,60 +60,6 @@ class test_layered_fast_truncate10(wttest.WiredTigerTestCase):
scenarios = make_scenarios(disagg_storages, uris)
conn_config = 'disaggregated=(role="leader"),'
# Build the create() config string: integer keys and string values, plus the
# disagg block manager and layered type when the URI starts with "table".
def session_create_config(self):
    base = "key_format=i,value_format=S"
    if not self.uri.startswith("table"):
        return base
    return base + ",block_manager=disagg,type=layered"
def auto_closing_cursor(self):
"""Return a cursor that auto-closes as it goes out of scope."""
return closing(self.session.open_cursor(self.uri))
def populate(self, keys: Iterable[int]):
"""Insert each key with a placeholder value in a single transaction."""
with self.auto_closing_cursor() as cursor:
with self.transaction():
for key in keys:
cursor[key] = "v"
def setup_leader(self, keys: Iterable[int] | None = None):
"""
Create the table on the leader and optionally pre-populate stable.
The follower will pick up these keys via the initial checkpoint.
"""
self.session.create(self.uri, self.session_create_config())
if keys is not None:
self.populate(keys)
self.session.checkpoint()
def setup_follower(self, keys: Iterable[int] | None = None):
"""Switch to follower role and optionally write keys to ingest."""
self.reopen_disagg_conn('disaggregated=(role="follower"),')
if keys is not None:
self.populate(keys)
def truncate(self, start_key: int, stop_key: int):
"""Truncate between start and stop keys inclusive."""
with (
self.auto_closing_cursor() as start_cursor,
self.auto_closing_cursor() as stop_cursor,
):
start_cursor.set_key(start_key)
stop_cursor.set_key(stop_key)
with self.transaction():
self.session.truncate(None, start_cursor, stop_cursor, None)
def visible_keys(self) -> list[int]:
"""Return all keys visible via a forward scan, in key order."""
result = []
with self.auto_closing_cursor() as cursor:
with self.transaction(rollback=True):
while cursor.next() == 0:
result.append(cursor.get_key())
return result
def test_truncate_range_with_both_tables_empty(self):
# Stable and ingest are both empty.
self.setup_leader()

View File

@ -34,27 +34,17 @@
# Open-ended truncates should not apply to keys written after the truncate
# commits.
from contextlib import closing, nullcontext
from itertools import chain
from typing import Iterable
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import (
LayeredFastTruncateConfigMixin, concat, range_inclusive,
)
from wiredtiger import WiredTigerError
from wtscenario import make_scenarios
import wttest
def concat(*iterables: Iterable[int]) -> list[int]:
"""Concatenate any number of iterables into a single list."""
return list(chain.from_iterable(iterables))
def range_inclusive(start: int, stop: int) -> range:
"""Return a range covering [start, stop] inclusive."""
return range(start, stop + 1)
@disagg_test_class
class test_layered_fast_truncate11(wttest.WiredTigerTestCase):
class test_layered_fast_truncate11(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
"""
Range specification (start / end / open-ended).
@ -73,67 +63,6 @@ class test_layered_fast_truncate11(wttest.WiredTigerTestCase):
scenarios = make_scenarios(disagg_storages, uris)
conn_config = 'disaggregated=(role="leader"),'
def session_create_config(self):
cfg = "key_format=i,value_format=S"
if self.uri.startswith("table"):
cfg += ",block_manager=disagg,type=layered"
return cfg
def auto_closing_cursor(self) -> closing:
"""Return a cursor that auto-closes as it goes out of scope."""
return closing(self.session.open_cursor(self.uri))
def populate(self, keys: Iterable[int]):
"""Insert each key with a placeholder value in a single transaction."""
with self.auto_closing_cursor() as cursor:
with self.transaction():
for key in keys:
cursor[key] = "v"
def setup_leader(self, keys: Iterable[int] | None = None):
"""
Create the table on the leader and optionally pre-populate stable.
The follower will pick up these keys via the initial checkpoint.
"""
self.session.create(self.uri, self.session_create_config())
if keys is not None:
self.populate(keys)
self.session.checkpoint()
def setup_follower(self, keys: Iterable[int] | None = None):
"""Switch to follower role and optionally write keys to ingest."""
self.reopen_disagg_conn('disaggregated=(role="follower"),')
if keys is not None:
self.populate(keys)
def cursor_for_key(self, key: int | None):
"""Return a cursor with its key set, or None if key is None."""
if key is None:
return nullcontext(None) # Open-ended truncate.
cursor = self.auto_closing_cursor()
cursor.thing.set_key(key)
return cursor
def truncate(self, start_key: int | None, stop_key: int | None):
"""Truncate [start_key, stop_key] inclusive; None means open end."""
with (
self.cursor_for_key(start_key) as start,
self.cursor_for_key(stop_key) as stop,
):
# WT requires a URI when both cursors are absent.
uri = self.uri if (start is None and stop is None) else None
with self.transaction():
self.session.truncate(uri, start, stop, None)
def visible_keys(self) -> list[int]:
"""Return all keys visible via a forward scan, in key order."""
result = []
with self.auto_closing_cursor() as cursor:
with self.transaction(rollback=True):
while cursor.next() == 0:
result.append(cursor.get_key())
return result
def test_truncate_with_null_start_key(self):
# Set up a follower with keys 1-100.
self.setup_leader()

View File

@ -32,27 +32,16 @@
# Verify that forward scans, backward scans, next_random, search, and
# search_near all treat truncated keys as non-existent on a follower.
from contextlib import closing, nullcontext
from itertools import chain
from typing import Iterable
from helper_disagg import disagg_test_class, gen_disagg_storages
from wiredtiger import WT_NOTFOUND
from helper_layered_fast_truncate import (
LayeredFastTruncateConfigMixin, concat, range_inclusive,
)
from wtscenario import make_scenarios
import wttest
def concat(*iterables: Iterable[int]) -> list[int]:
"""Concatenate any number of iterables into a single list."""
return list(chain.from_iterable(iterables))
def range_inclusive(start: int, stop: int) -> range:
"""Return a range covering [start, stop] inclusive."""
return range(start, stop + 1)
@disagg_test_class
class test_layered_fast_truncate12(wttest.WiredTigerTestCase):
class test_layered_fast_truncate12(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
"""
Cursor iteration and searches over truncated ranges.
@ -69,76 +58,7 @@ class test_layered_fast_truncate12(wttest.WiredTigerTestCase):
scenarios = make_scenarios(disagg_storages, uris)
conn_config = 'disaggregated=(role="leader"),'
def session_create_config(self):
    """Build the session.create() config; "table" URIs get the layered/disagg suffix."""
    base = "key_format=i,value_format=S"
    if not self.uri.startswith("table"):
        return base
    return base + ",block_manager=disagg,type=layered"
def auto_closing_cursor(self, config: str | None = None) -> closing:
"""Return a cursor that auto-closes as it goes out of scope."""
return closing(self.session.open_cursor(self.uri, None, config))
def populate(self, keys: Iterable[int]):
    """Write the value "v" under each given key, all inside one transaction."""
    with self.auto_closing_cursor() as writer, self.transaction():
        for k in keys:
            writer[k] = "v"
def setup_leader(self, keys: Iterable[int] | None = None):
"""
Create the table on the leader and optionally pre-populate stable.
The follower will pick up these keys via the initial checkpoint.
"""
self.session.create(self.uri, self.session_create_config())
if keys is not None:
self.populate(keys)
self.session.checkpoint()
def setup_follower(self, keys: Iterable[int] | None = None):
"""Switch to follower role and optionally write keys to ingest."""
self.reopen_disagg_conn('disaggregated=(role="follower"),')
if keys is not None:
self.populate(keys)
def cursor_for_key(self, key: int | None):
"""Return a cursor with its key set, or None if key is None."""
if key is None:
return nullcontext(None)
cursor = self.auto_closing_cursor()
cursor.thing.set_key(key)
return cursor
def truncate(self, start_key: int | None, stop_key: int | None):
"""Truncate [start_key, stop_key] inclusive; None means open end."""
with (
self.cursor_for_key(start_key) as start,
self.cursor_for_key(stop_key) as stop,
):
uri = self.uri if (start is None and stop is None) else None
with self.transaction():
self.session.truncate(uri, start, stop, None)
def visible_keys(self) -> list[int]:
    """Collect, in scan order, every key that a forward cursor walk yields."""
    with self.auto_closing_cursor() as scan, self.transaction(rollback=True):
        found = []
        # next() reports 0 while it lands on a record; any other code ends the walk.
        while True:
            if scan.next() != 0:
                break
            found.append(scan.get_key())
    return found
def backward_visible_keys(self) -> list[int]:
    """Collect every key that a backward (prev) cursor walk yields, in walk order."""
    with self.auto_closing_cursor() as scan, self.transaction(rollback=True):
        found = []
        # prev() reports 0 while it lands on a record; any other code ends the walk.
        while True:
            if scan.prev() != 0:
                break
            found.append(scan.get_key())
    return found
def random_sample_keys(self, n: int) -> list[int]:
def random_sample_keys(self, n):
"""Return n keys drawn from a next_random cursor."""
result = []
with self.auto_closing_cursor("next_random=true") as cursor:
@ -148,27 +68,6 @@ class test_layered_fast_truncate12(wttest.WiredTigerTestCase):
result.append(cursor.get_key())
return result
def search_key(self, key: int) -> int:
    """
    Run cursor.search() for key inside a rolled-back transaction and return
    its status (documented by the original as 0 on exact match or WT_NOTFOUND).
    """
    with self.cursor_for_key(key) as probe, self.transaction(rollback=True):
        status = probe.search()
    return status
def search_near_key(self, key: int) -> tuple[int, int | None]:
    """
    Run search_near for key inside a rolled-back transaction.

    Returns (exact, found_key). Per the original notes, exact follows the WT
    convention: 0 = exact, 1 = positioned above, -1 = positioned below, or
    WT_NOTFOUND if no visible keys exist; in that last case found_key is None.
    """
    with self.cursor_for_key(key) as probe, self.transaction(rollback=True):
        exact = probe.search_near()
        # Only read the key back when the cursor was actually positioned.
        landed_on = None if exact == WT_NOTFOUND else probe.get_key()
    return exact, landed_on
def test_forward_scan_skips_truncated_range(self):
# Set up a follower with keys 1-100.
self.setup_leader()
@ -194,7 +93,7 @@ class test_layered_fast_truncate12(wttest.WiredTigerTestCase):
reversed(range_inclusive(61, 100)),
reversed(range_inclusive(1, 29)),
)
self.assertEqual(self.backward_visible_keys(), expected)
self.assertEqual(self.visible_keys(forward=False), expected)
def test_next_random_never_lands_in_truncated_range(self):
# Set up a follower with keys 1-100.
@ -219,7 +118,7 @@ class test_layered_fast_truncate12(wttest.WiredTigerTestCase):
# Searching for a key inside the truncated range should return
# WT_NOTFOUND.
self.assertEqual(self.search_key(45), WT_NOTFOUND)
self.assertFalse(self.key_exists(45))
def test_search_at_inclusive_truncate_boundary(self):
# Set up a follower with keys 1-100.
@ -230,12 +129,12 @@ class test_layered_fast_truncate12(wttest.WiredTigerTestCase):
self.truncate(30, 60)
# The boundary keys should be invisible.
self.assertEqual(self.search_key(30), WT_NOTFOUND)
self.assertEqual(self.search_key(60), WT_NOTFOUND)
self.assertFalse(self.key_exists(30))
self.assertFalse(self.key_exists(60))
# The keys just outside the truncated range should still be found.
self.assertEqual(self.search_key(29), 0)
self.assertEqual(self.search_key(61), 0)
self.assertTrue(self.key_exists(29))
self.assertTrue(self.key_exists(61))
def test_search_near_inside_truncated_range(self):
# Set up a follower with keys 1-100.

View File

@ -32,26 +32,16 @@
# Verify that subsequent operations - additional truncates, per-key removes,
# and reinsertion - compose correctly with a prior committed truncate.
from contextlib import closing, nullcontext
from itertools import chain
from typing import Iterable
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import (
LayeredFastTruncateConfigMixin, concat, range_inclusive,
)
from wtscenario import make_scenarios
import wttest
def concat(*iterables: Iterable[int]) -> list[int]:
    """Join all supplied iterables end to end and return the result as a list."""
    return [item for piece in iterables for item in piece]
def range_inclusive(start: int, stop: int) -> range:
    """Build a range whose last element is stop itself (both ends included)."""
    exclusive_end = stop + 1
    return range(start, exclusive_end)
@disagg_test_class
class test_layered_fast_truncate13(wttest.WiredTigerTestCase):
class test_layered_fast_truncate13(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
"""
Interactions with existing truncates.
@ -68,72 +58,13 @@ class test_layered_fast_truncate13(wttest.WiredTigerTestCase):
scenarios = make_scenarios(disagg_storages, uris)
conn_config = 'disaggregated=(role="leader"),'
def session_create_config(self):
    """Build the session.create() config; "table" URIs get the layered/disagg suffix."""
    base = "key_format=i,value_format=S"
    if not self.uri.startswith("table"):
        return base
    return base + ",block_manager=disagg,type=layered"
def auto_closing_cursor(self, config: str | None = None) -> closing:
"""Return a cursor that auto-closes as it goes out of scope."""
return closing(self.session.open_cursor(self.uri, None, config))
def populate(self, keys: Iterable[int]):
    """Write the value "v" under each given key, all inside one transaction."""
    with self.auto_closing_cursor() as writer, self.transaction():
        for k in keys:
            writer[k] = "v"
def setup_leader(self, keys: Iterable[int] | None = None):
"""
Create the table on the leader and optionally pre-populate stable. The
follower will pick up these keys via the initial checkpoint.
"""
self.session.create(self.uri, self.session_create_config())
if keys is not None:
self.populate(keys)
self.session.checkpoint()
def setup_follower(self, keys: Iterable[int] | None = None):
"""Switch to follower role and optionally write keys to ingest."""
self.reopen_disagg_conn('disaggregated=(role="follower"),')
if keys is not None:
self.populate(keys)
def cursor_for_key(self, key: int | None):
"""Return a cursor with its key set, or None if key is None."""
if key is None:
return nullcontext(None)
cursor = self.auto_closing_cursor()
cursor.thing.set_key(key)
return cursor
def truncate(self, start_key: int | None, stop_key: int | None):
"""Truncate [start_key, stop_key] inclusive; None means open end."""
with (
self.cursor_for_key(start_key) as start,
self.cursor_for_key(stop_key) as stop,
):
uri = self.uri if (start is None and stop is None) else None
with self.transaction():
self.session.truncate(uri, start, stop, None)
def remove_key(self, key: int):
def remove_key(self, key):
"""Remove a single key in a transaction."""
with self.cursor_for_key(key) as cursor:
with self.auto_closing_cursor() as cursor:
cursor.set_key(self.key(key))
with self.transaction():
cursor.remove()
def visible_keys(self) -> list[int]:
    """Collect, in scan order, every key that a forward cursor walk yields."""
    with self.auto_closing_cursor() as scan, self.transaction(rollback=True):
        found = []
        # next() reports 0 while it lands on a record; any other code ends the walk.
        while True:
            if scan.next() != 0:
                break
            found.append(scan.get_key())
    return found
def test_per_key_removes_before_truncate(self):
# Set up a follower with keys 1-100.
self.setup_leader()
@ -226,10 +157,12 @@ class test_layered_fast_truncate13(wttest.WiredTigerTestCase):
# Truncate keys 30-60 and reinsert key 45 within the same transaction.
with self.transaction():
with (
self.cursor_for_key(30) as start,
self.cursor_for_key(60) as stop,
self.auto_closing_cursor() as start,
self.auto_closing_cursor() as stop,
self.auto_closing_cursor() as cursor,
):
start.set_key(self.key(30))
stop.set_key(self.key(60))
self.session.truncate(None, start, stop, None)
cursor[45] = "v"

View File

@ -29,15 +29,14 @@
# test_layered_fast_truncate14.py
# Ensure next() skips truncated stable keys after search_near lands on an ingest key.
from contextlib import closing
from typing import Iterable
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
import wttest
@disagg_test_class
class test_layered_fast_truncate14(wttest.WiredTigerTestCase):
class test_layered_fast_truncate14(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
"""next() skips truncated stable keys after search_near lands on an ingest key."""
uris = [
@ -49,43 +48,7 @@ class test_layered_fast_truncate14(wttest.WiredTigerTestCase):
scenarios = make_scenarios(disagg_storages, uris)
conn_config = 'disaggregated=(role="leader"),'
def session_create_config(self):
    """Build the session.create() config; "table" URIs get the layered/disagg suffix."""
    base = "key_format=i,value_format=S"
    needs_layered = self.uri.startswith("table")
    return base + ",block_manager=disagg,type=layered" if needs_layered else base
def auto_closing_cursor(self):
    """Open a cursor on self.uri and wrap it so leaving a `with` block closes it."""
    raw_cursor = self.session.open_cursor(self.uri)
    return closing(raw_cursor)
def populate(self, keys: Iterable[int]):
    """Write the value "v" under each given key, all inside one transaction."""
    with self.auto_closing_cursor() as writer, self.transaction():
        for k in keys:
            writer[k] = "v"
def setup_leader(self, keys: Iterable[int] | None = None):
self.session.create(self.uri, self.session_create_config())
if keys is not None:
self.populate(keys)
self.session.checkpoint()
def setup_follower(self, keys: Iterable[int] | None = None):
self.reopen_disagg_conn('disaggregated=(role="follower"),')
if keys is not None:
self.populate(keys)
def truncate(self, start_key: int, stop_key: int):
    """Truncate between two key-bearing cursors (start_key, stop_key) in one transaction."""
    with self.auto_closing_cursor() as lower:
        with self.auto_closing_cursor() as upper:
            lower.set_key(start_key)
            upper.set_key(stop_key)
            with self.transaction():
                # No URI is passed: the two cursors identify the range.
                self.session.truncate(None, lower, upper, None)
def keys_after_search_near(self, search_key: int) -> list[int]:
def keys_after_search_near(self, search_key):
"""
Position on search_key via search_near (must be an exact match), then
return all keys yielded by subsequent next() calls.

View File

@ -30,16 +30,14 @@
# Validate edge scenario where no tombstones are written when ingest keys sit outside
# the range. Follower truncate tombstones ingest keys only inside the range.
from contextlib import closing
from typing import Iterable
from helper_disagg import disagg_test_class, gen_disagg_storages
from wiredtiger import WT_NOTFOUND
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
import wttest
@disagg_test_class
class test_layered_fast_truncate15(wttest.WiredTigerTestCase):
class test_layered_fast_truncate15(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
"""Follower truncate tombstones only ingest keys inside the range."""
uris = [
@ -51,65 +49,15 @@ class test_layered_fast_truncate15(wttest.WiredTigerTestCase):
scenarios = make_scenarios(disagg_storages, uris)
conn_config = 'disaggregated=(role="leader"),'
def session_create_config(self):
    """Build the session.create() config; "table" URIs get the layered/disagg suffix."""
    base = "key_format=i,value_format=S"
    needs_layered = self.uri.startswith("table")
    return base + ",block_manager=disagg,type=layered" if needs_layered else base
def auto_closing_cursor(self):
    """Open a cursor on self.uri and wrap it so leaving a `with` block closes it."""
    raw_cursor = self.session.open_cursor(self.uri)
    return closing(raw_cursor)
def populate(self, keys: Iterable[int]):
    """Write the value "v" under each given key, all inside one transaction."""
    with self.auto_closing_cursor() as writer, self.transaction():
        for k in keys:
            writer[k] = "v"
def setup_leader(self, keys: Iterable[int] | None = None):
self.session.create(self.uri, self.session_create_config())
if keys is not None:
self.populate(keys)
self.session.checkpoint()
def setup_follower(self, keys: Iterable[int] | None = None):
self.reopen_disagg_conn('disaggregated=(role="follower"),')
if keys is not None:
self.populate(keys)
def truncate(self, start_key: int, stop_key: int):
    """Truncate between two key-bearing cursors (start_key, stop_key) in one transaction."""
    with self.auto_closing_cursor() as lower:
        with self.auto_closing_cursor() as upper:
            lower.set_key(start_key)
            upper.set_key(stop_key)
            with self.transaction():
                # No URI is passed: the two cursors identify the range.
                self.session.truncate(None, lower, upper, None)
def search_key(self, key: int) -> int:
    """Set key on a fresh cursor and return cursor.search()'s status from a rolled-back transaction."""
    with self.auto_closing_cursor() as probe, self.transaction(rollback=True):
        probe.set_key(key)
        status = probe.search()
    return status
def visible_keys(self) -> list[int]:
    """Collect, in scan order, every key that a forward cursor walk yields."""
    with self.auto_closing_cursor() as scan, self.transaction(rollback=True):
        found = []
        # next() reports 0 while it lands on a record; any other code ends the walk.
        while True:
            if scan.next() != 0:
                break
            found.append(scan.get_key())
    return found
def test_ingest_keys_flanking_range_not_tombstoned(self):
# Ingest keys flank the range on both sides with none inside; neither should be tombstoned.
self.setup_leader(keys=[0, 10, 20, 30])
self.setup_follower(keys=[5, 25])
self.truncate(10, 20)
self.assertEqual(self.search_key(10), WT_NOTFOUND,
self.assertFalse(self.key_exists(10),
"key 10 must be deleted (stable-only, inside truncate range)")
self.assertEqual(self.search_key(25), 0,
self.assertTrue(self.key_exists(25),
"key 25 must be visible (ingest key, outside truncate range)")
def test_scan_correct_when_ingest_keys_flank_range(self):
@ -126,10 +74,8 @@ class test_layered_fast_truncate15(wttest.WiredTigerTestCase):
self.setup_follower(keys=[5])
self.truncate(10, 15)
self.assertEqual(self.search_key(10), WT_NOTFOUND,
"key 10 must be deleted")
self.assertEqual(self.search_key(5), 0,
"key 5 must be visible")
self.assertFalse(self.key_exists(10), "key 10 must be deleted")
self.assertTrue(self.key_exists(5), "key 5 must be visible")
def test_ingest_key_only_above_range(self):
# All ingest keys are above the range; none should be tombstoned.
@ -137,10 +83,8 @@ class test_layered_fast_truncate15(wttest.WiredTigerTestCase):
self.setup_follower(keys=[15])
self.truncate(5, 10)
self.assertEqual(self.search_key(10), WT_NOTFOUND,
"key 10 must be deleted")
self.assertEqual(self.search_key(15), 0,
"key 15 must be visible")
self.assertFalse(self.key_exists(10), "key 10 must be deleted")
self.assertTrue(self.key_exists(15), "key 15 must be visible")
def test_multiple_ingest_keys_both_sides_no_ingest_in_range(self):
# Multiple ingest keys on both sides of the range; none inside; all should stay visible.
@ -149,10 +93,10 @@ class test_layered_fast_truncate15(wttest.WiredTigerTestCase):
self.truncate(10, 15)
for k in [10, 15]:
self.assertEqual(self.search_key(k), WT_NOTFOUND,
self.assertFalse(self.key_exists(k),
f"key {k} must be deleted (stable-only, inside truncate range)")
for k in [3, 7, 18, 22]:
self.assertEqual(self.search_key(k), 0,
self.assertTrue(self.key_exists(k),
f"key {k} must be visible (ingest key, outside truncate range)")
if __name__ == "__main__":

View File

@ -28,13 +28,14 @@
import wttest, wiredtiger
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
# test_layered_fast_truncate16.py
# Verify that pending follower truncates land on stable when the follower steps up,
# across the variety of per-key shapes and edge cases.
@disagg_test_class
class test_layered_fast_truncate_stepup(wttest.WiredTigerTestCase):
class test_layered_fast_truncate_stepup(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
conn_config = 'disaggregated=(role="leader")'
uri = 'layered:test_layered_fast_truncate_stepup'
@ -53,22 +54,10 @@ class test_layered_fast_truncate_stepup(wttest.WiredTigerTestCase):
self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(ts))
self.session.checkpoint()
# Open a separate follower connection, create the table on both sides, leader populates,
# follower picks up the checkpoint. After this, follower-side ops run on session_follow.
def setup_follower(self):
self.conn_follow = self.wiredtiger_open(
'follower',
self.extensionsConfig() + ',create,disaggregated=(role="follower")')
self.session_follow = self.conn_follow.open_session('')
self.session.create(self.uri, 'key_format=i,value_format=S')
self.session_follow.create(self.uri, 'key_format=i,value_format=S')
self.populate_on_leader()
self.disagg_advance_checkpoint(self.conn_follow)
# Step up the follower (which becomes the new leader) and step the original leader down.
def step_up(self):
self.ignoreStdoutPattern('Picking up the same checkpoint')
self.disagg_switch_follower_and_leader(self.conn_follow)
self.conn_follow, self.session_follow = self.open_follower()
def write_kv(self, key, value, ts):
cursor = self.session_follow.open_cursor(self.uri)
@ -97,26 +86,18 @@ class test_layered_fast_truncate_stepup(wttest.WiredTigerTestCase):
c_stop.close()
def assert_visible(self, keys, value=None, ts=None):
self.session_follow.begin_transaction('read_timestamp=' + self.timestamp_str(ts))
cursor = self.session_follow.open_cursor(self.uri)
for k in keys:
cursor.set_key(k)
self.assertEqual(cursor.search(), 0, f"key {k} should be visible at ts={ts}")
ret, val = self.search_at(self.session_follow, k, ts)
self.assertEqual(ret, 0, f"key {k} should be visible at ts={ts}")
if value is not None:
expected = value(k) if callable(value) else value
self.assertEqual(cursor.get_value(), expected)
cursor.close()
self.session_follow.rollback_transaction()
self.assertEqual(val, expected)
def assert_deleted(self, keys, ts):
self.session_follow.begin_transaction('read_timestamp=' + self.timestamp_str(ts))
cursor = self.session_follow.open_cursor(self.uri)
for k in keys:
cursor.set_key(k)
self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND,
ret, _ = self.search_at(self.session_follow, k, ts)
self.assertEqual(ret, wiredtiger.WT_NOTFOUND,
f"key {k} should be deleted at ts={ts}")
cursor.close()
self.session_follow.rollback_transaction()
def assert_keys_gone(self, ranges):
# Sweep the populated key space: keys inside any (lo, hi) inclusive range must be

View File

@ -28,6 +28,7 @@
import wiredtiger, wttest
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin
from wtscenario import make_scenarios
from wiredtiger import stat
@ -35,7 +36,7 @@ from wiredtiger import stat
# Verify that step-up replay uses fast page truncation (WT_REF_DELETED) when
# replaying follower truncates.
@disagg_test_class
class test_layered_fast_truncate17(wttest.WiredTigerTestCase):
class test_layered_fast_truncate17(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
conn_config = 'disaggregated=(role="leader")'
uri = 'layered:test_layered_ft_replay'
@ -45,12 +46,6 @@ class test_layered_fast_truncate17(wttest.WiredTigerTestCase):
disagg_storages = gen_disagg_storages('test_layered_ft_replay', disagg_only=True)
scenarios = make_scenarios(disagg_storages)
def get_stat(self, conn, stat_key):
    """
    Read one statistic through a temporary session on `conn`.

    Opens a session, indexes a 'statistics:' cursor with `stat_key`, and
    returns element [2] of that entry. The session is closed even when the
    lookup raises, so a bad key cannot leave a stray session on the connection.
    """
    s = conn.open_session('')
    try:
        # NOTE(review): index 2 is presumably the numeric value of the
        # statistic entry - confirm against the statistics cursor format.
        return s.open_cursor('statistics:')[stat_key][2]
    finally:
        s.close()
def populate_on_leader(self, ts=10):
cursor = self.session.open_cursor(self.uri)
for i in range(self.nitems):
@ -58,23 +53,12 @@ class test_layered_fast_truncate17(wttest.WiredTigerTestCase):
cursor[i] = 'v'
self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts))
cursor.close()
self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(ts) +
',oldest_timestamp=' + self.timestamp_str(1))
self.session.checkpoint()
self.leader_checkpoint(ts)
def setup_follower(self):
self.conn_follow = self.wiredtiger_open(
'follower',
self.extensionsConfig() + ',create,statistics=(all),disaggregated=(role="follower")')
self.session_follow = self.conn_follow.open_session('')
self.session.create(self.uri, self.table_config)
self.session_follow.create(self.uri, self.table_config)
self.populate_on_leader()
self.disagg_advance_checkpoint(self.conn_follow)
def step_up(self):
self.ignoreStdoutPattern('Picking up the same checkpoint')
self.disagg_switch_follower_and_leader(self.conn_follow)
self.conn_follow, self.session_follow = self.open_follower(self.table_config)
def truncate_range(self, start_key, stop_key, ts):
c_start = self.session_follow.open_cursor(self.uri)

View File

@ -30,22 +30,16 @@
# Write conflict detection for follower fast truncate (truncate-truncate
# conflicts only).
import unittest
from contextlib import closing, nullcontext
from typing import Iterable
from helper_disagg import disagg_test_class, gen_disagg_storages
from helper_layered_fast_truncate import LayeredFastTruncateConfigMixin, range_inclusive
from wiredtiger import WiredTigerError
from wtscenario import make_scenarios
import wttest
def range_inclusive(start: int, stop: int) -> range:
    """Build a range whose last element is stop itself (both ends included)."""
    exclusive_end = stop + 1
    return range(start, exclusive_end)
@disagg_test_class
class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
class test_layered_fast_truncate18(LayeredFastTruncateConfigMixin, wttest.WiredTigerTestCase):
"""
Write conflict detection for follower fast truncate (truncate-truncate
conflicts only).
@ -62,51 +56,32 @@ class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
CONFLICT_MSG = "/conflict between concurrent operations/"
def session_create_config(self) -> str:
    """Build the session.create() config; "table" URIs get the layered/disagg suffix."""
    base = "key_format=i,value_format=S"
    if not self.uri.startswith("table"):
        return base
    return base + ",block_manager=disagg,type=layered"
# These helpers are local to 18 because they all take an explicit session
# (the conflict tests drive two sessions concurrently). The equivalent
# mixin helpers are bound to self.session and so are not reusable here.
def auto_closing_cursor(self, session) -> closing:
"""Return a cursor that auto-closes as it goes out of scope."""
def cursor_on(self, session):
"""Return a cursor on the given session that auto-closes."""
return closing(session.open_cursor(self.uri))
def auto_closing_session(self) -> closing:
def auto_closing_session(self):
"""Return a session that auto-closes as it goes out of scope."""
return closing(self.conn.open_session())
def populate(self, keys: Iterable[int]):
    """Write the value "v" under each given key via self.session, in one transaction."""
    with self.auto_closing_cursor(self.session) as writer, self.transaction():
        for k in keys:
            writer[k] = "v"
def setup_leader(self, keys: Iterable[int] | None = None):
"""Create the table on the leader and optionally populate stable."""
self.session.create(self.uri, self.session_create_config())
if keys is not None:
self.populate(keys)
self.session.checkpoint()
def setup_follower(self, keys: Iterable[int] | None = None):
"""Switch to follower role and optionally write keys to ingest."""
self.reopen_disagg_conn('disaggregated=(role="follower"),')
if keys is not None:
self.populate(keys)
def cursor_for_key(self, key: int | None, session):
def cursor_for_key(self, key, session):
"""Return a cursor with its key set, or None if key is None."""
if key is None:
return nullcontext(None)
cursor = self.auto_closing_cursor(session)
cursor = self.cursor_on(session)
cursor.thing.set_key(key)
return cursor
def truncate(self, session, start_key: int | None, stop_key: int | None):
"""Execute a truncate from start to stop key inclusive."""
def truncate_on(self, session, start_key, stop_key):
"""
Truncate [start_key, stop_key] inclusive on the given session.
Caller manages the transaction (the conflict tests inspect the
truncate's failure/success inside a hand-managed txn).
"""
with (
self.cursor_for_key(start_key, session) as start,
self.cursor_for_key(stop_key, session) as stop,
@ -121,8 +96,8 @@ class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
# Within a single transaction: truncate 30-60, then truncate 40-80.
with self.transaction():
self.truncate(self.session, 30, 60)
self.truncate(self.session, 40, 80)
self.truncate_on(self.session, 30, 60)
self.truncate_on(self.session, 40, 80)
# The transaction committed; no WT_ROLLBACK raised.
@ -134,7 +109,7 @@ class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
# txn A begins a truncate over 30-60 and leaves it uncommitted.
session_a = self.session
session_a.begin_transaction()
self.truncate(session_a, 30, 60)
self.truncate_on(session_a, 30, 60)
# txn B truncates overlapping range 40-70 and gets WT_ROLLBACK.
with (
@ -143,7 +118,7 @@ class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
):
self.assertRaisesException(
WiredTigerError,
lambda: self.truncate(session_b, 40, 70),
lambda: self.truncate_on(session_b, 40, 70),
self.CONFLICT_MSG,
)
@ -155,7 +130,7 @@ class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
# txn A begins a truncate over 30-60 and leaves it uncommitted.
session_a = self.session
session_a.begin_transaction()
self.truncate(session_a, 30, 60)
self.truncate_on(session_a, 30, 60)
# txn B truncates overlapping range 40-70 and gets WT_ROLLBACK.
with (
@ -164,7 +139,7 @@ class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
):
self.assertRaisesException(
WiredTigerError,
lambda: self.truncate(session_b, 40, 70),
lambda: self.truncate_on(session_b, 40, 70),
self.CONFLICT_MSG,
)
@ -176,14 +151,14 @@ class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
# txn A truncates 10-30 and leaves it uncommitted.
session_a = self.session
session_a.begin_transaction()
self.truncate(session_a, 10, 30)
self.truncate_on(session_a, 10, 30)
# txn B truncates 50-70 (no overlap) and commits successfully.
with (
self.auto_closing_session() as session_b,
self.transaction(session=session_b),
):
self.truncate(session_b, 50, 70)
self.truncate_on(session_b, 50, 70)
def test_rolled_back_truncate_no_residual(self):
# A follower with stable keys 1-100.
@ -193,14 +168,14 @@ class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
# txn A truncates 30-60 then explicitly rolls back.
session_a = self.session
with self.transaction(session=session_a, rollback=True):
self.truncate(session_a, 30, 60)
self.truncate_on(session_a, 30, 60)
# txn B truncates the same range 30-60 and commits without WT_ROLLBACK.
with (
self.auto_closing_session() as session_b,
self.transaction(session=session_b),
):
self.truncate(session_b, 30, 60)
self.truncate_on(session_b, 30, 60)
def test_invisible_committed_truncate_conflicts(self):
# A follower with stable keys 1-100.
@ -210,7 +185,7 @@ class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
# txn A commits a truncate over 30-60 at ts=10 (invisible to txn B).
self.conn.set_timestamp("oldest_timestamp=" + self.timestamp_str(1))
with self.transaction(commit_timestamp=10):
self.truncate(self.session, 30, 60)
self.truncate_on(self.session, 30, 60)
# txn B (read_ts=5) truncates overlapping range 40-70 and gets
# WT_ROLLBACK.
@ -222,7 +197,7 @@ class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
):
self.assertRaisesException(
WiredTigerError,
lambda: self.truncate(session_b, 40, 70),
lambda: self.truncate_on(session_b, 40, 70),
self.CONFLICT_MSG,
)
@ -234,7 +209,7 @@ class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
# txn A commits a truncate over 30-60 at ts=5 (visible to txn B).
self.conn.set_timestamp("oldest_timestamp=" + self.timestamp_str(1))
with self.transaction(commit_timestamp=5):
self.truncate(self.session, 30, 60)
self.truncate_on(self.session, 30, 60)
# txn B (read_ts=10) truncates overlapping range 40-70 without
# WT_ROLLBACK.
@ -242,7 +217,7 @@ class test_layered_fast_truncate18(wttest.WiredTigerTestCase):
self.auto_closing_session() as session_b,
self.transaction(session=session_b, read_timestamp=10),
):
self.truncate(session_b, 40, 70)
self.truncate_on(session_b, 40, 70)
if __name__ == "__main__":

View File

@ -0,0 +1,112 @@
#!/usr/bin/env python3
#
# Public Domain 2014-present MongoDB, Inc.
# Public Domain 2008-2014 WiredTiger, Inc.
#
# This is free and unencumbered software released into the public domain.
#
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
#
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
import wiredtiger, wttest
from helper_disagg import disagg_test_class
# test_layered_prepare03.py
# Forward iteration on a layered cursor after the very first next() returns
# WT_PREPARE_CONFLICT must resume correctly and return all visible keys.
@disagg_test_class
class test_layered_prepare03(wttest.WiredTigerTestCase):
    conn_base_config = 'precise_checkpoint=true,'
    conn_config = conn_base_config + 'disaggregated=(role="leader")'

    def safe_next(self, cursor):
        '''
        Advance the cursor. A WiredTigerError whose text mentions
        WT_PREPARE_CONFLICT is converted into that error code; any other
        error is re-raised unchanged.
        '''
        try:
            status = cursor.next()
        except wiredtiger.WiredTigerError as err:
            if 'WT_PREPARE_CONFLICT' not in str(err):
                raise
            status = wiredtiger.WT_PREPARE_CONFLICT
        return status

    def test_iterate_after_prepare_conflict_on_first_key(self):
        '''
        When the very first next() on a layered cursor reports
        WT_PREPARE_CONFLICT, iteration after the conflict is resolved must
        start from the beginning and yield every stable key.
        '''
        uri = 'table:test_layered_prepare03'
        stable_keys = ['1', '2', '3']

        # Leader side: commit the stable keys at timestamp 100, then checkpoint.
        create_config = 'key_format=S,value_format=S,block_manager=disagg,type=layered'
        self.session.create(uri, create_config)
        with self.transaction(session=self.session, commit_timestamp=100):
            writer = self.session.open_cursor(uri)
            for key in stable_keys:
                writer[key] = f'stable_{key}'
            writer.close()
        self.conn.set_timestamp('stable_timestamp=' + self.timestamp_str(200))
        self.session.checkpoint()

        # Follower side: open a second connection and pick up that checkpoint.
        follower_config = (self.extensionsConfig() + ',create,' + self.conn_base_config
            + 'disaggregated=(role="follower")')
        conn_follow = self.wiredtiger_open('follower', follower_config)
        self.disagg_advance_checkpoint(conn_follow)

        # Leave a prepared (unresolved) update on key '1' so the reader's
        # first next() runs into it.
        prep_session = conn_follow.open_session('')
        prep_cursor = prep_session.open_cursor(uri)
        prep_session.begin_transaction()
        prep_cursor['1'] = 'prepared_update'
        prep_cursor.close()
        prepare_config = 'prepare_timestamp=' + self.timestamp_str(300)
        prepare_config += ',prepared_id=' + self.prepared_id_str(1)
        prep_session.prepare_transaction(prepare_config)

        # The reader runs under read-committed isolation, where the prepared
        # update surfaces as a conflict.
        iter_session = conn_follow.open_session('')
        iter_session.begin_transaction('isolation=read-committed')
        iter_cursor = iter_session.open_cursor(uri)

        first_status = self.safe_next(iter_cursor)
        self.assertEqual(first_status, wiredtiger.WT_PREPARE_CONFLICT)

        # Resolve the conflict by rolling the prepared transaction back, then
        # drain the cursor.
        prep_session.rollback_transaction()
        collected = []
        while True:
            status = iter_cursor.next()
            if status != 0:
                break
            collected.append(iter_cursor.get_key())
        self.assertEqual(status, wiredtiger.WT_NOTFOUND)
        self.assertEqual(collected, stable_keys)

        iter_cursor.close()
        iter_session.rollback_transaction()
        prep_session.close()
        conn_follow.close()

View File

@ -87,7 +87,7 @@ class test_prepare35(test_prepare_preserve_prepare_base):
session_evict.close()
# Step 4: Rollback the first prepared transaction
# This prepends a globally visible tombstone
# This appends a globally visible tombstone to the tail of the update chain
session_prepare.rollback_transaction("rollback_timestamp=" + self.timestamp_str(35))
session_prepare.close()

View File

@ -160,13 +160,13 @@ class test_prepare47(wttest.WiredTigerTestCase):
evict_session.close()
def test_aborted_prepared_with_lost_disk_fallback(self):
# Theory: at rollback time, first_committed_upd is NULL (no committed update behind
# the prepared insert) but tw_found is true (on-disk cell with stop is the fallback),
# so __txn_prepare_rollback_delete_key is not called and no rollback tombstone is
# prepended. Later, a reconcile drops the on-disk cell (its stop is globally visible
# and nothing is selected for the key), erasing the only fallback. A subsequent
# reconcile that walks the surviving aborted prepared update has neither a rollback
# tombstone nor an on-disk fallback, tripping the leaked-prepared-update assertion.
# Theory: at rollback time there is no committed update behind the prepared insert,
# but there is an on-disk cell with a stop that serves as the fallback, so no rollback
# tombstone is appended to the chain. Later, a reconcile drops the on-disk cell (its
# stop is globally visible and nothing is selected for the key), erasing the only
# fallback. A subsequent reconcile that walks the surviving aborted prepared update has
# neither a rollback tombstone nor an on-disk fallback, tripping the
# leaked-prepared-update assertion.
insert_ts = 20
delete_ts = 30
oldest_after_delete = 31
@ -224,8 +224,9 @@ class test_prepare47(wttest.WiredTigerTestCase):
self.conn.set_timestamp(
'stable_timestamp=' + self.timestamp_str(stable_unstable))
# Roll back with rollback_ts ahead of stable; first_committed_upd is NULL but
# tw_found is true so no rollback tombstone is prepended.
# Roll back with rollback_ts ahead of stable; there is no committed update behind the
# prepared insert but the on-disk cell exists, so no rollback tombstone is appended to
# the chain.
self.session.rollback_transaction(
'rollback_timestamp=' + self.timestamp_str(rollback_ts))