Import wiredtiger: 3c6ec61a01a4dbdb3351121c0a624d3996e28c03 from branch mongodb-master (#54418)

Co-authored-by: wt-vendoring-bot <wt-vendoring-bot@mongodb.com>
GitOrigin-RevId: 0bbb8b1c0435ef423df293b6af6f4a39fdb390df
This commit is contained in:
wt-vendoring-bot[bot] 2026-05-26 11:33:17 +10:00 committed by MongoDB Bot
parent ece8a907d7
commit e7ead1c8cf
46 changed files with 1504 additions and 1357 deletions

View File

@ -1,8 +1,13 @@
{
"version": 3,
"configurePresets": [
{
"name": "default",
"displayName": "Default"
},
{
"name": "linux",
"inherits": "default",
"hidden": true,
"condition": {
"type": "equals",
@ -15,6 +20,7 @@
},
{
"name": "linux-v4",
"inherits": "default",
"hidden": true,
"condition": {
"type": "equals",
@ -52,5 +58,13 @@
"CMAKE_CXX_COMPILER": "$env{MONGODBTOOLCHAIN_BIN}/g++"
}
}
],
"buildPresets": [
{
"name": "default",
"displayName": "Default",
"configurePreset": "default",
"jobs": 0
}
]
}

View File

@ -1,186 +0,0 @@
include(cmake/helpers.cmake)
### Auto configure options and checks that we can infer from our toolchain environment.
# Intrinsic header probes. Each call defines a boolean configuration option that
# reflects whether the named header can be included with the current toolchain.

# x86 SIMD intrinsics.
config_include(HAVE_X86INTRIN_H "Include header x86intrin.h exists."
    FILE "x86intrin.h")

# ARM NEON intrinsics.
config_include(HAVE_ARM_NEON_INTRIN_H "Include header arm_neon.h exists."
    FILE "arm_neon.h")
# Function symbol probes. Each call defines a boolean configuration option that
# reflects whether FUNC is declared by the listed header(s) and can be linked.

# File allocation, synchronisation and advice.
config_func(HAVE_FALLOCATE "Function fallocate exists."
    FUNC "fallocate" FILES "fcntl.h")

# Only evaluated when WT_DARWIN is not set; otherwise the option is forced off.
config_func(HAVE_FDATASYNC "Function fdatasync exists."
    FUNC "fdatasync" FILES "unistd.h" DEPENDS "NOT WT_DARWIN")

# Clocks.
config_func(HAVE_CLOCK_GETTIME "Function clock_gettime exists."
    FUNC "clock_gettime" FILES "time.h")

config_func(HAVE_GETTIMEOFDAY "Function gettimeofday exists."
    FUNC "gettimeofday" FILES "sys/time.h")

# POSIX advisory and allocation interfaces.
config_func(HAVE_POSIX_FADVISE "Function posix_fadvise exists."
    FUNC "posix_fadvise" FILES "fcntl.h")

config_func(HAVE_POSIX_FALLOCATE "Function posix_fallocate exists."
    FUNC "posix_fallocate" FILES "fcntl.h")

config_func(HAVE_POSIX_MADVISE "Function posix_madvise exists."
    FUNC "posix_madvise" FILES "sys/mman.h")

config_func(HAVE_POSIX_MEMALIGN "Function posix_memalign exists."
    FUNC "posix_memalign" FILES "stdlib.h")

# Resource limits: two headers are supplied as a semicolon-separated list.
config_func(HAVE_SETRLIMIT "Function setrlimit exists."
    FUNC "setrlimit" FILES "sys/time.h;sys/resource.h")

config_func(HAVE_SYNC_FILE_RANGE "Function sync_file_range exists."
    FUNC "sync_file_range" FILES "fcntl.h")

# timer_create is checked while linking against the rt library.
config_func(HAVE_TIMER_CREATE "Function timer_create exists."
    FUNC "timer_create" FILES "signal.h;time.h" LIBS "rt")
# Library probes. Each call searches for the named library and stores the result in
# the given configuration option; when HEADER is supplied, the directory containing
# that header is looked up as well.

# Optional memory-kind allocator.
config_lib(HAVE_LIBMEMKIND "memkind library exists."
    LIB "memkind" HEADER "memkind.h")

# System libraries.
config_lib(HAVE_LIBPTHREAD "Pthread library exists." LIB "pthread")
config_lib(HAVE_LIBRT "rt library exists." LIB "rt")
config_lib(HAVE_LIBDL "dl library exists." LIB "dl")
config_lib(HAVE_LIBCXX "stdc++ library exists." LIB "stdc++")
config_lib(HAVE_LIBACCEL_CONFIG "accel-config library exists." LIB "accel-config")

# Compression libraries.
config_lib(HAVE_LIBLZ4 "lz4 library exists."
    LIB "lz4" HEADER "lz4.h")

config_lib(HAVE_LIBSNAPPY "snappy library exists."
    LIB "snappy" HEADER "snappy.h")

config_lib(HAVE_LIBZ "zlib library exists."
    LIB "z" HEADER "zlib.h")

config_lib(HAVE_LIBZSTD "zstd library exists."
    LIB "zstd" HEADER "zstd.h")

config_lib(HAVE_LIBQPL "qpl library exists."
    LIB "qpl" HEADER "qpl/qpl.h")

# Encryption library.
config_lib(HAVE_LIBSODIUM "sodium library exists."
    LIB "sodium" HEADER "sodium.h")
# Compile-and-run probe: builds the small test program next to this file, linked
# against pthread, and records whether it ran successfully. Only evaluated when
# HAVE_LIBPTHREAD is set.
config_compile(HAVE_PTHREAD_COND_MONOTONIC
    "If pthread condition variables support monotonic clocks."
    SOURCE "${CMAKE_CURRENT_LIST_DIR}/compile_test/pthread_cond_monotonic_test.c"
    LIBS "pthread"
    DEPENDS "HAVE_LIBPTHREAD"
)
# Record the target byte order. WORDS_BIGENDIAN defaults to FALSE and is switched to
# TRUE only when the C toolchain reports a big-endian target.
#
# The expansion is quoted on purpose: CMAKE_C_BYTE_ORDER can be empty (byte order
# not determinable) or undefined (CMake releases that predate the variable), and an
# unquoted empty expansion would leave if() with a malformed condition and abort
# configuration instead of falling through to the little-endian default.
set(WORDS_BIGENDIAN FALSE)
if("${CMAKE_C_BYTE_ORDER}" STREQUAL "BIG_ENDIAN")
    set(WORDS_BIGENDIAN TRUE)
endif()

View File

@ -1,40 +0,0 @@
/*
* Copyright (c) 2014-present MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
* See the file LICENSE for redistribution information.
*/
#include <errno.h>
#include <pthread.h>
#include <stdlib.h>
#include <time.h>
/*
 * main --
 *     Configure-time probe: exits with status 0 when a condition variable can be bound to
 *     CLOCK_MONOTONIC and a timed wait against a deadline taken from that clock behaves as
 *     expected; exits with status 1 as soon as any step fails.
 */
int
main(void)
{
    int ret;
    pthread_condattr_t condattr;
    pthread_cond_t cond;
    pthread_mutex_t mtx;
    struct timespec ts;

    /* Create a condition variable whose timed waits are measured on the monotonic clock. */
    if ((ret = pthread_condattr_init(&condattr)) != 0)
        exit(1);
    if ((ret = pthread_condattr_setclock(&condattr, CLOCK_MONOTONIC)) != 0)
        exit(1);
    if ((ret = pthread_cond_init(&cond, &condattr)) != 0)
        exit(1);
    if ((ret = pthread_mutex_init(&mtx, NULL)) != 0)
        exit(1);

    /* Deadline: one second from now, read from the same clock the condition variable uses. */
    if ((ret = clock_gettime(CLOCK_MONOTONIC, &ts)) != 0)
        exit(1);
    ts.tv_sec += 1;

    if ((ret = pthread_mutex_lock(&mtx)) != 0)
        exit(1);

    /*
     * Nothing in this program signals the condition, so the wait is expected to end by timing
     * out. An interrupted wait is tolerated as well; any other error fails the probe.
     */
    if ((ret = pthread_cond_timedwait(&cond, &mtx, &ts)) != 0 && ret != EINTR && ret != ETIMEDOUT)
        exit(1);

    exit(0);
}

View File

@ -70,95 +70,54 @@
*/
#cmakedefine HAVE_UNITTEST_ASSERTS 1
/* Define to 1 if you have the `fallocate' function. */
#cmakedefine HAVE_FALLOCATE 1
/* Define to 1 if you have the `fdatasync' function. */
#cmakedefine HAVE_FDATASYNC 1
/* Define to 1 if you have the `clock_gettime' function. */
#cmakedefine HAVE_CLOCK_GETTIME 1
/* Define to 1 if you have the `gettimeofday' function. */
#cmakedefine HAVE_GETTIMEOFDAY 1
/* Define to 1 if you have the `dl' library (-ldl). */
#cmakedefine HAVE_LIBDL 1
/* Define to 1 if you have the `stdc++' library (-lstdc++). */
#cmakedefine HAVE_LIBCXX 1
/* Define to 1 if you have the `accel-config' library (-laccel-config). */
#cmakedefine HAVE_LIBACCEL_CONFIG 1
/* Define to 1 if you have the `lz4' library (-llz4). */
#cmakedefine HAVE_LIBLZ4 1
/* Define to 1 if you have the `memkind' library (-lmemkind). */
#cmakedefine HAVE_LIBMEMKIND 1
/* Define to 1 if the user has explicitly enable memkind builds. */
/* Define to 1 if the user has explicitly enabled memkind builds. */
#cmakedefine ENABLE_MEMKIND 1
/* Define to 1 if you have the `pthread' library (-lpthread). */
#cmakedefine HAVE_LIBPTHREAD 1
/* Define to 1 if you have the `rt' library (-lrt). */
#cmakedefine HAVE_LIBRT 1
/* Define to 1 if you have the `snappy' library (-lsnappy). */
#cmakedefine HAVE_LIBSNAPPY 1
/* Define to 1 if the user has set enable antithesis. */
#cmakedefine ENABLE_ANTITHESIS 1
/* Define to 1 if you have the `z' library (-lz). */
#cmakedefine HAVE_LIBZ 1
/* Define to 1 if you have the `zstd' library (-lzstd). */
#cmakedefine HAVE_LIBZSTD 1
/* Define to 1 if you have the `qpl' library (-lqpl). */
#cmakedefine HAVE_LIBQPL 1
/* Define to 1 if you have the `sodium' library (-lsodium). */
#cmakedefine HAVE_LIBSODIUM 1
/* Automatically set by the build system, turns on or off optional RCpc ARM instructions. */
#cmakedefine HAVE_RCPC 1
/* Define to 1 to disable any crc32 hardware support. */
#cmakedefine HAVE_NO_CRC32_HARDWARE
#cmakedefine HAVE_NO_CRC32_HARDWARE 1
/* Define to 1 if you have the `posix_fadvise' function. */
#cmakedefine HAVE_POSIX_FADVISE 1
/*
* Compile-time platform feature flags.
*/
/* Define to 1 if you have the `posix_fallocate' function. */
#cmakedefine HAVE_POSIX_FALLOCATE 1
/* POSIX.1-2001 functions available on every supported POSIX target. */
#if defined(__linux__) || defined(__APPLE__) || defined(__NetBSD__)
#define HAVE_CLOCK_GETTIME 1
#define HAVE_GETTIMEOFDAY 1
#define HAVE_POSIX_MADVISE 1
#define HAVE_POSIX_MEMALIGN 1
#define HAVE_SETRLIMIT 1
#endif
/* Define to 1 if you have the `posix_madvise' function. */
#cmakedefine HAVE_POSIX_MADVISE 1
/* POSIX functions macOS does not implement. */
#if defined(__linux__) || defined(__NetBSD__)
#define HAVE_FDATASYNC 1
#define HAVE_POSIX_FADVISE 1
#define HAVE_POSIX_FALLOCATE 1
#define HAVE_PTHREAD_COND_MONOTONIC 1
#define HAVE_TIMER_CREATE 1
#endif
/* Define to 1 if `posix_memalign' works. */
#cmakedefine HAVE_POSIX_MEMALIGN 1
/* Linux-specific syscalls and extensions. */
#if defined(__linux__)
#define HAVE_FALLOCATE 1
#define HAVE_SYNC_FILE_RANGE 1
#endif
/* Define to 1 if pthread condition variables support monotonic clocks. */
#cmakedefine HAVE_PTHREAD_COND_MONOTONIC 1;
/* Architecture-specific intrinsic headers. */
#if defined(__x86_64__) || defined(_M_X64)
#define HAVE_X86INTRIN_H 1
#endif
/* Define to 1 if you have the `setrlimit' function. */
#cmakedefine HAVE_SETRLIMIT 1
/* Define to 1 if you have the `sync_file_range' function. */
#cmakedefine HAVE_SYNC_FILE_RANGE 1
/* Define to 1 if you have the `timer_create' function. */
#cmakedefine HAVE_TIMER_CREATE 1
/* Define to 1 if you have the <x86intrin.h> header file. */
#cmakedefine HAVE_X86INTRIN_H 1
/* Define to 1 if you have the <arm_neon.h> header file. */
#cmakedefine HAVE_ARM_NEON_INTRIN_H 1
#if defined(__aarch64__) || defined(_M_ARM64)
#define HAVE_ARM_NEON_INTRIN_H 1
#endif
/* Spinlock type from mutex.h. */
#cmakedefine SPINLOCK_TYPE @SPINLOCK_TYPE_CONFIG_VAR@

View File

@ -50,15 +50,10 @@ macro(define_wiredtiger_library target type)
C_STANDARD 11
)
# Ensure we link any available library dependencies to our wiredtiger target.
if(HAVE_LIBPTHREAD)
target_link_libraries(${target} PUBLIC ${HAVE_LIBPTHREAD})
endif()
if(HAVE_LIBRT)
target_link_libraries(${target} PUBLIC ${HAVE_LIBRT})
endif()
if(HAVE_LIBDL)
target_link_libraries(${target} PUBLIC ${HAVE_LIBDL})
# System library dependencies.
target_link_libraries(${target} PUBLIC Threads::Threads ${CMAKE_DL_LIBS})
if(WT_LINUX)
target_link_libraries(${target} PUBLIC rt)
endif()
if(ENABLE_MEMKIND)
target_link_libraries(${target} PRIVATE wt::memkind)
@ -88,11 +83,8 @@ macro(define_wiredtiger_library target type)
if(HAVE_BUILTIN_EXTENSION_IAA)
target_link_libraries(${target} PRIVATE iaacodec)
if(HAVE_LIBCXX)
target_link_libraries(${target} PRIVATE ${HAVE_LIBCXX})
endif()
if(HAVE_LIBACCEL_CONFIG)
target_link_libraries(${target} PRIVATE ${HAVE_LIBACCEL_CONFIG})
target_link_libraries(${target} PRIVATE wt::accel_config)
endif()
endif()

View File

@ -1,8 +1,3 @@
include(CheckIncludeFiles)
include(CheckSymbolExists)
include(CheckLibraryExists)
include(CheckTypeSize)
# Helper function for evaluating a list of dependencies. Mostly used by the
# "config_X" helpers to evaluate the dependencies required to enable the config
# option.
@ -244,248 +239,132 @@ function(config_bool config_name description)
endif()
endfunction()
# config_func(config_name description FUNC <function-symbol> FILES <include-headers> [DEPENDS <deps>] [LIBS <library-dependencies>])
# Defines a boolean (0/1) configuration option based on whether a given function symbol exists.
# The configuration option is stored in the cmake cache and can be exported to the wiredtiger config header.
# config_name - name of the configuration option.
# description - docstring to describe the configuration option (viewable in the cmake-gui).
# FUNC <function-symbol> - function symbol we want to search for.
# FILES <include-headers> - header(s) (semicolon separated) in which we expect the function symbol to be declared, e.g. a std header.
# DEPENDS <deps> - list of dependencies (semicolon separated) required for the configuration to be evaluated.
# If any of the dependencies aren't met the configuration value will be set to '0' (false).
# LIBS <library-dependencies> - a list of any additional library dependencies needed to successfully link with the function symbol.
function(config_func config_name description)
# wt_find_library(NAME <name>
# [CMAKE_TARGET <target>]
# [PACKAGE <pkg> TARGET <target>]
# [PKGCONFIG_MODULE <mod>]
# [LIBRARY <libname>]
# [HEADER <hdr>])
#
# Discover a third-party library through CMake's canonical lookup chain:
#
# 1. find_package(<PACKAGE> QUIET)
# Tries MODULE mode (CMake-shipped Find<Pkg>.cmake) then CONFIG mode
# (library-shipped <Pkg>Config.cmake).
# 2. pkg_check_modules(... IMPORTED_TARGET <PKGCONFIG_MODULE>)
# Falls back to pkg-config metadata.
# 3. find_library(<LIBRARY>) + find_path(<HEADER>)
# Raw filesystem search; constructs an UNKNOWN IMPORTED target.
#
# Each step is attempted only if the relevant arguments are provided. The first
# successful step wins; the rest are skipped.
#
# On success:
# HAVE_LIB${upper(NAME)} cache variable set ON.
# wt::${CMAKE_TARGET or NAME} alias created from the discovered imported target.
# On failure:
# HAVE_LIB${upper(NAME)} cache variable set OFF.
#
# Examples:
# wt_find_library(NAME lz4
# PACKAGE lz4 TARGET LZ4::lz4
# PKGCONFIG_MODULE liblz4
# HEADER lz4.h)
#
# wt_find_library(NAME z CMAKE_TARGET zlib
# PACKAGE ZLIB TARGET ZLIB::ZLIB
# PKGCONFIG_MODULE zlib
# HEADER zlib.h)
function(wt_find_library)
cmake_parse_arguments(
PARSE_ARGV
2
"CONFIG_FUNC"
0
"WTLIB"
""
"FUNC;DEPENDS;FILES;LIBS"
"NAME;CMAKE_TARGET;PACKAGE;TARGET;PKGCONFIG_MODULE;LIBRARY;HEADER"
""
)
if (NOT "${CONFIG_FUNC_UNPARSED_ARGUMENTS}" STREQUAL "")
message(FATAL_ERROR "Unknown arguments to config_func: ${CONFIG_FUNC_UNPARSED_ARGUMENTS}")
if(NOT WTLIB_NAME)
message(FATAL_ERROR "wt_find_library: NAME is required")
endif()
# We require an include header (not optional).
if ("${CONFIG_FUNC_FILES}" STREQUAL "")
message(FATAL_ERROR "No file list passed")
endif()
# We require a function symbol (not optional).
if ("${CONFIG_FUNC_FUNC}" STREQUAL "")
message(FATAL_ERROR "No function passed")
if(WTLIB_PACKAGE AND NOT WTLIB_TARGET)
message(FATAL_ERROR "wt_find_library(${WTLIB_NAME}): PACKAGE requires TARGET")
endif()
# Check that the configs dependencies are enabled before setting it to a visible enabled state.
eval_dependency("${CONFIG_FUNC_DEPENDS}" enabled)
if(enabled)
set(CMAKE_REQUIRED_LIBRARIES "${CONFIG_FUNC_LIBS}")
check_symbol_exists(${CONFIG_FUNC_FUNC} "${CONFIG_FUNC_FILES}" has_symbol_${config_name})
set(CMAKE_REQUIRED_LIBRARIES)
set(has_symbol "0")
if(has_symbol_${config_name})
set(has_symbol ${has_symbol_${config_name}})
endif()
# Set an internal cache variable "${config_name}_DISABLED" to capture its enabled/disabled state.
# We want to ensure we capture a transition from a disabled to enabled state when dependencies are met.
if(${config_name}_DISABLED)
unset(${config_name}_DISABLED CACHE)
set(${config_name} ${has_symbol} CACHE BOOL "${description}" FORCE)
else()
set(${config_name} ${has_symbol} CACHE BOOL "${description}")
endif()
# 'check_symbol_exists' sets our given temp variable into the cache. Clear this so it doesn't persist between
# configuration runs.
unset(has_symbol_${config_name} CACHE)
string(TOUPPER "${WTLIB_NAME}" _name_upper)
set(_have_var "HAVE_LIB${_name_upper}")
if(WTLIB_CMAKE_TARGET)
set(_alias "wt::${WTLIB_CMAKE_TARGET}")
else()
# Config doesn't meet dependency requirements, set a disabled state.
set(${config_name} OFF CACHE INTERNAL "" FORCE)
set(${config_name}_DISABLED ON CACHE INTERNAL "" FORCE)
endif()
endfunction()
# config_include(config_name description FILE <include-header> [DEPENDS <deps>])
# Defines a boolean (0/1) configuration option based on whether a given include header exists.
# The configuration option is stored in the cmake cache and can be exported to the wiredtiger config header.
# config_name - name of the configuration option.
# description - docstring to describe the configuration option (viewable in the cmake-gui).
# FILE <include-header> - header we want to search for e.g a std header.
# DEPENDS <deps> - list of dependencies (semicolon separated) required for the configuration to be evaluated.
# If any of the dependencies aren't met the configuration value will be set to '0' (false).
function(config_include config_name description)
cmake_parse_arguments(
PARSE_ARGV
2
"CONFIG_INCLUDE"
""
"FILE;DEPENDS"
""
)
if (NOT "${CONFIG_INCLUDE_UNPARSED_ARGUMENTS}" STREQUAL "")
message(FATAL_ERROR "Unknown arguments to config_include: ${CONFIG_INCLUDE_UNPARSED_ARGUMENTS}")
endif()
# We require a include header (not optional).
if ("${CONFIG_INCLUDE_FILE}" STREQUAL "")
message(FATAL_ERROR "No include file passed")
set(_alias "wt::${WTLIB_NAME}")
endif()
# Check that the configs dependencies are enabled before setting it to a visible enabled state.
eval_dependency("${CONFIG_INCLUDE_DEPENDS}" enabled)
if(enabled)
check_include_files(${CONFIG_INCLUDE_FILE} has_include_${config_name})
set(has_include "0")
if(has_include_${config_name})
set(has_include ${has_include_${config_name}})
endif()
# Set an internal cache variable "${config_name}_DISABLED" to capture its enabled/disabled state.
# We want to ensure we capture a transition from a disabled to enabled state when dependencies are met.
if(${config_name}_DISABLED)
unset(${config_name}_DISABLED CACHE)
set(${config_name} ${has_include} CACHE BOOL "${description}" FORCE)
else()
set(${config_name} ${has_include} CACHE BOOL "${description}")
endif()
# 'check_include_files' sets our given temp variable into the cache. Clear this so it doesn't persist between
# configuration runs.
unset(has_include_${config_name} CACHE)
# Guard against repeated work.
if(TARGET ${_alias})
return()
endif()
if(WTLIB_LIBRARY)
set(_libname "${WTLIB_LIBRARY}")
else()
set(${config_name} OFF CACHE INTERNAL "" FORCE)
set(${config_name}_DISABLED ON CACHE INTERNAL "" FORCE)
endif()
endfunction()
# config_lib(config_name description LIB <library> [DEPENDS <deps>] [HEADER <file>])
# Defines a configuration option based on whether a given library exists (the library's location when found, 0 otherwise).
# The configuration option is stored in the cmake cache and can be exported to the wiredtiger config header.
# config_name - name of the configuration option.
# description - docstring to describe the configuration option (viewable in the cmake-gui).
# LIB <library> - library we are searching for (defined as if we are linking against it e.g -lpthread).
# HEADER <file> - optional header expected to accompany the library; when found, its directory is stored in ${config_name}_INCLUDES.
# DEPENDS <deps> - list of dependencies (semicolon separated) required for the configuration to be evaluated.
# If any of the dependencies aren't met the configuration value will be set to '0' (false).
function(config_lib config_name description)
cmake_parse_arguments(
PARSE_ARGV
2
"CONFIG_LIB"
""
"LIB;DEPENDS;HEADER"
""
)
if (NOT "${CONFIG_LIB_UNPARSED_ARGUMENTS}" STREQUAL "")
message(FATAL_ERROR "Unknown arguments to config_lib: ${CONFIG_LIB_UNPARSED_ARGUMENTS}")
endif()
# We require a library (not optional).
if ("${CONFIG_LIB_LIB}" STREQUAL "")
message(FATAL_ERROR "No library passed")
set(_libname "${WTLIB_NAME}")
endif()
# Check that the configs dependencies are enabled before setting it to a visible enabled state.
eval_dependency("${CONFIG_LIB_DEPENDS}" enabled)
if(enabled)
message(CHECK_START "Looking for library ${CONFIG_LIB_LIB}")
find_library(has_lib_${config_name} ${CONFIG_LIB_LIB})
set(has_lib "0")
set(has_include "")
if(has_lib_${config_name})
set(has_lib ${has_lib_${config_name}})
if (CONFIG_LIB_HEADER)
find_path(include_path_${config_name} ${CONFIG_LIB_HEADER})
if (include_path_${config_name})
message(CHECK_PASS "found ${has_lib_${config_name}}, include path ${include_path_${config_name}}")
set(has_include ${include_path_${config_name}})
else()
message(CHECK_PASS "found ${has_lib_${config_name}}")
endif()
unset(include_path_${config_name} CACHE)
else()
message(CHECK_PASS "found ${has_lib_${config_name}}")
message(CHECK_START "Looking for library ${_libname}")
set(_imported "")
# Step 1: find_package (MODULE then CONFIG by default).
if(WTLIB_PACKAGE)
find_package(${WTLIB_PACKAGE} QUIET)
if(${WTLIB_PACKAGE}_FOUND AND TARGET ${WTLIB_TARGET})
set(_imported ${WTLIB_TARGET})
endif()
endif()
# Step 2: pkg-config.
if(NOT _imported AND WTLIB_PKGCONFIG_MODULE)
find_package(PkgConfig QUIET)
if(PkgConfig_FOUND)
pkg_check_modules(${_name_upper} QUIET IMPORTED_TARGET ${WTLIB_PKGCONFIG_MODULE})
if(${_name_upper}_FOUND)
set(_imported "PkgConfig::${_name_upper}")
endif()
else()
message(CHECK_FAIL "not found")
endif()
# Set an internal cache variable "${config_name}_DISABLED" to capture its enabled/disabled state.
# We want to ensure we capture a transition from a disabled to enabled state when dependencies are met.
if(${config_name}_DISABLED)
unset(${config_name}_DISABLED CACHE)
set(${config_name} ${has_lib} CACHE STRING "${description}" FORCE)
set(${config_name}_INCLUDES ${has_include} CACHE STRING "Additional include paths for ${config_name}" FORCE)
else()
set(${config_name} ${has_lib} CACHE STRING "${description}")
set(${config_name}_INCLUDES ${has_include} CACHE STRING "Additional include paths for ${config_name}")
endif()
# Step 3: raw find_library + find_path.
if(NOT _imported)
find_library(${_name_upper}_LIBRARY ${_libname})
if(WTLIB_HEADER)
find_path(${_name_upper}_INCLUDE_DIR ${WTLIB_HEADER})
endif()
# 'check_library_exists' sets our given temp variable into the cache. Clear this so it doesn't persist between
# configuration runs.
unset(has_lib_${config_name} CACHE)
if(${_name_upper}_LIBRARY AND (NOT WTLIB_HEADER OR ${_name_upper}_INCLUDE_DIR))
set(_raw "wt_imported_${WTLIB_NAME}")
if(NOT TARGET ${_raw})
add_library(${_raw} UNKNOWN IMPORTED GLOBAL)
set_target_properties(${_raw} PROPERTIES
IMPORTED_LOCATION "${${_name_upper}_LIBRARY}")
if(WTLIB_HEADER)
set_target_properties(${_raw} PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${${_name_upper}_INCLUDE_DIR}")
endif()
endif()
set(_imported ${_raw})
endif()
endif()
if(_imported)
set(${_have_var} ON CACHE INTERNAL "${WTLIB_NAME} available on system")
if(NOT TARGET ${_alias})
add_library(${_alias} ALIAS ${_imported})
endif()
message(CHECK_PASS "found")
else()
message(STATUS "Not looking for library ${CONFIG_LIB_LIB}: disabled")
set(${config_name} 0 CACHE INTERNAL "" FORCE)
set(${config_name}_DISABLED ON CACHE INTERNAL "" FORCE)
endif()
endfunction()
# config_compile(config_name description SOURCE <source-file> [DEPENDS <deps>] [LIBS <library-dependencies>])
# Defines a boolean (0/1) configuration option based on whether a source file can be successfully compiled and run. Used
# to determine if more fine grained functionality is supported on a given target environment (beyond what function
# symbols, libraries and headers are available). The configuration option is stored in the cmake cache and can be
# exported to the wiredtiger config header.
# config_name - name of the configuration option.
# description - docstring to describe the configuration option (viewable in the cmake-gui).
# SOURCE <source-file> - specific source file we want to test compile.
# DEPENDS <deps> - list of dependencies (semicolon separated) required for the configuration to be evaluated.
# If any of the dependencies aren't met the configuration value will be set to '0' (false).
# LIBS <library-dependencies> - a list of any additional library dependencies needed to successfully compile the source.
function(config_compile config_name description)
cmake_parse_arguments(
PARSE_ARGV
2
"CONFIG_COMPILE"
""
"SOURCE;DEPENDS;LIBS"
""
)
if (NOT "${CONFIG_COMPILE_UNPARSED_ARGUMENTS}" STREQUAL "")
message(FATAL_ERROR "Unknown arguments to config_compile: ${CONFIG_COMPILE_UNPARSED_ARGUMENTS}")
endif()
# We require a source file (not optional).
if ("${CONFIG_COMPILE_SOURCE}" STREQUAL "")
message(FATAL_ERROR "No source passed")
endif()
# Check that the configs dependencies are enabled before setting it to a visible enabled state.
eval_dependency("${CONFIG_COMPILE_DEPENDS}" enabled)
if(enabled)
# Test compile the source file.
try_run(
can_run_${config_name} can_compile_${config_name}
${CMAKE_CURRENT_BINARY_DIR}
${CONFIG_COMPILE_SOURCE}
LINK_LIBRARIES "${CONFIG_COMPILE_LIBS}"
)
set(can_run "0")
if((NOT "${can_run_${config_name}}" STREQUAL "FAILED_TO_RUN") AND
("${can_run_${config_name}}" STREQUAL "0"))
set(can_run "1")
endif()
# Set an internal cache variable "${config_name}_DISABLED" to capture its enabled/disabled state.
# We want to ensure we capture a transition from a disabled to enabled state when dependencies are met.
if(${config_name}_DISABLED)
unset(${config_name}_DISABLED CACHE)
set(${config_name} ${can_run} CACHE STRING "${description}" FORCE)
else()
set(${config_name} ${can_run} CACHE STRING "${description}")
endif()
# 'try_run' sets our given temp variable into the cache. Clear this so it doesn't persist between
# configuration runs.
unset(can_run_${config_name} CACHE)
unset(can_compile_${config_name} CACHE)
else()
set(${config_name} 0 CACHE INTERNAL "" FORCE)
set(${config_name}_DISABLED ON CACHE INTERNAL "" FORCE)
set(${_have_var} OFF CACHE INTERNAL "${WTLIB_NAME} available on system")
message(CHECK_FAIL "not found")
endif()
endfunction()

View File

@ -25,16 +25,10 @@ install(TARGETS ${wt_targets}
if(WT_POSIX)
# Establish the link flags for the private libraries used by WiredTiger. 'Private' in this context refers
# to libraries WT links against that aren't exposed to the applications using it.
set(private_libs)
if(HAVE_LIBPTHREAD)
set(private_libs "${private_libs} -lpthread")
endif()
if(HAVE_LIBRT)
set(private_libs " -lpthread -ldl")
if(WT_LINUX)
set(private_libs "${private_libs} -lrt")
endif()
if(HAVE_LIBDL)
set(private_libs "${private_libs} -ldl")
endif()
if(ENABLE_MEMKIND)
set(private_libs "${private_libs} -lmemkind")
endif()
@ -58,10 +52,7 @@ if(WT_POSIX)
endif()
if(HAVE_BUILTIN_EXTENSION_IAA)
set(private_libs "${private_libs} -lqpl")
if(HAVE_LIBCXX)
set(private_libs "${private_libs} -lstdc++")
endif()
if(HAVE_LIBACCEL_CONFIG)
if(HAVE_LIBACCEL_CONFIG)
set(private_libs "${private_libs} -laccel-config")
endif()
endif()

View File

@ -1,22 +1,13 @@
if(NOT HAVE_LIBQPL)
# We don't need to construct a iaa library target.
return()
endif()
# Intel IAA / QPL + libaccel-config: capability detection + imported targets.
#
# Layer 1 (capability): HAVE_LIBQPL, HAVE_LIBACCEL_CONFIG
# Layer 2 (default policy): cmake/configs/base.cmake
# Layer 3 (user toggle): ENABLE_IAA / HAVE_BUILTIN_EXTENSION_IAA
if(TARGET wt::qpl)
# Avoid redefining the imported library.
return()
endif()
# Produces target wt::qpl when the library is available.
wt_find_library(NAME qpl
HEADER qpl/qpl.h)
# Define the imported iaa library target that can be subsequently linked across the build system.
# We use the double colons (::) as a convention to tell CMake that the target name is associated
# with an IMPORTED target (which allows CMake to issue a diagnostic message if the library wasn't found).
add_library(wt::qpl STATIC IMPORTED GLOBAL)
set_target_properties(wt::qpl PROPERTIES
IMPORTED_LOCATION ${HAVE_LIBQPL}
)
if (HAVE_LIBQPL_INCLUDES)
set_target_properties(wt::qpl PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES ${HAVE_LIBQPL_INCLUDES}
)
endif()
# Produces target wt::accel_config when the library is available.
wt_find_library(NAME accel_config
LIBRARY accel-config)

View File

@ -1,23 +1,11 @@
if(NOT HAVE_LIBLZ4)
# We don't need to construct a lz4 library target.
return()
endif()
# lz4: capability detection + imported target.
#
# Layer 1 (capability): HAVE_LIBLZ4
# Layer 2 (default policy): cmake/configs/base.cmake
# Layer 3 (user toggle): ENABLE_LZ4 / HAVE_BUILTIN_EXTENSION_LZ4
if(TARGET wt::lz4)
# Avoid redefining the imported library, given this file can be used as an include.
return()
endif()
# Define the imported lz4 library target that can be subsequently linked across the build system.
# We use the double colons (::) as a convention to tell CMake that the target name is associated
# with an IMPORTED target (which allows CMake to issue a diagnostic message if the library wasn't found).
add_library(wt::lz4 SHARED IMPORTED GLOBAL)
set_target_properties(wt::lz4 PROPERTIES
IMPORTED_LOCATION ${HAVE_LIBLZ4}
IMPORTED_IMPLIB ${HAVE_LIBLZ4}
)
if (HAVE_LIBLZ4_INCLUDES)
set_target_properties(wt::lz4 PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES ${HAVE_LIBLZ4_INCLUDES}
)
endif()
# Produces target wt::lz4 when the library is available.
wt_find_library(NAME lz4
PACKAGE lz4 TARGET LZ4::lz4
PKGCONFIG_MODULE liblz4
HEADER lz4.h)

View File

@ -1,27 +1,9 @@
if(NOT HAVE_LIBMEMKIND)
# We can't construct a memkind library target.
return()
endif()
# memkind: capability detection + imported target.
#
# Layer 1 (capability): HAVE_LIBMEMKIND
# Layer 2 (default policy): cmake/configs/base.cmake (DEFAULT OFF)
# Layer 3 (user toggle): ENABLE_MEMKIND
if (NOT ENABLE_MEMKIND)
# We don't want to construct a memkind library target.
return()
endif()
if(TARGET wt::memkind)
# Avoid redefining the imported library.
return()
endif()
# Define the imported memkind library target that can be subsequently linked across the build system.
# We use the double colons (::) as a convention to tell CMake that the target name is associated
# with an IMPORTED target (which allows CMake to issue a diagnostic message if the library wasn't found).
add_library(wt::memkind SHARED IMPORTED GLOBAL)
set_target_properties(wt::memkind PROPERTIES
IMPORTED_LOCATION ${HAVE_LIBMEMKIND}
)
if (HAVE_LIBMEMKIND_INCLUDES)
set_target_properties(wt::memkind PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES ${HAVE_LIBMEMKIND_INCLUDES}
)
endif()
# Produces target wt::memkind when the library is available.
wt_find_library(NAME memkind
HEADER memkind.h)

View File

@ -0,0 +1,17 @@
# pthread (POSIX threading): capability detection.
#
# Uses CMake's standard Threads module, which selects pthread on POSIX and
# Win32 threading on Windows. Consumers link the Threads::Threads imported
# target.
#
# Layer 1 (capability): HAVE_LIBPTHREAD true when threading is available.
# pthread is non-optional on supported platforms; configure fails without it.
if(TARGET Threads::Threads)
    # The threading target already exists, so the lookup below can be skipped. The
    # capability flag is still published first: without this, a configure run in
    # which Threads::Threads was created before this file executed would leave
    # HAVE_LIBPTHREAD undefined.
    set(HAVE_LIBPTHREAD TRUE CACHE INTERNAL "pthread available on system")
    return()
endif()

# Ask the Threads module to prefer the compiler's -pthread flag where it is supported.
set(THREADS_PREFER_PTHREAD_FLAG ON)

# REQUIRED: configuration stops here when no usable threading library is found.
find_package(Threads REQUIRED)
set(HAVE_LIBPTHREAD ${Threads_FOUND} CACHE INTERNAL "pthread available on system")

View File

@ -1,23 +1,11 @@
if(NOT HAVE_LIBSNAPPY)
# We don't need to construct a snappy library target.
return()
endif()
# snappy: capability detection + imported target.
#
# Layer 1 (capability): HAVE_LIBSNAPPY
# Layer 2 (default policy): cmake/configs/base.cmake
# Layer 3 (user toggle): ENABLE_SNAPPY / HAVE_BUILTIN_EXTENSION_SNAPPY
if(TARGET wt::snappy)
# Avoid redefining the imported library.
return()
endif()
# Define the imported snappy library target that can be subsequently linked across the build system.
# We use the double colons (::) as a convention to tell CMake that the target name is associated
# with an IMPORTED target (which allows CMake to issue a diagnostic message if the library wasn't found).
add_library(wt::snappy SHARED IMPORTED GLOBAL)
set_target_properties(wt::snappy PROPERTIES
IMPORTED_LOCATION ${HAVE_LIBSNAPPY}
IMPORTED_IMPLIB ${HAVE_LIBSNAPPY}
)
if (HAVE_LIBSNAPPY_INCLUDES)
set_target_properties(wt::snappy PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES ${HAVE_LIBSNAPPY_INCLUDES}
)
endif()
# Produces target wt::snappy when the library is available.
wt_find_library(NAME snappy
PACKAGE Snappy TARGET Snappy::snappy
PKGCONFIG_MODULE snappy
HEADER snappy.h)

View File

@ -1,22 +1,10 @@
if(NOT HAVE_LIBSODIUM)
# We don't need to construct a sodium library target.
return()
endif()
# libsodium: capability detection + imported target.
#
# Layer 1 (capability): HAVE_LIBSODIUM
# Layer 2 (default policy): cmake/configs/base.cmake (DEFAULT OFF)
# Layer 3 (user toggle): ENABLE_SODIUM / HAVE_BUILTIN_EXTENSION_SODIUM
if(TARGET wt::sodium)
# Avoid redefining the imported library.
return()
endif()
# Define the imported sodium library target that can be subsequently linked across the build system.
# We use the double colons (::) as a convention to tell CMake that the target name is associated
# with an IMPORTED target (which allows CMake to issue a diagnostic message if the library wasn't found).
add_library(wt::sodium SHARED IMPORTED GLOBAL)
set_target_properties(wt::sodium PROPERTIES
IMPORTED_LOCATION ${HAVE_LIBSODIUM}
)
if (HAVE_LIBSODIUM_INCLUDES)
set_target_properties(wt::sodium PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES ${HAVE_LIBSODIUM_INCLUDES}
)
endif()
# Produces target wt::sodium when the library is available.
wt_find_library(NAME sodium
PKGCONFIG_MODULE libsodium
HEADER sodium.h)

View File

@ -2,6 +2,11 @@ if(NOT ENABLE_PALITE)
return() # PALite is disabled, skip the rest of this file
endif()
if(TARGET wt::sqlite3)
# Avoid redefining the imported library.
return()
endif()
if(USE_SYSTEM_SQLITE3)
find_package(SQLite3 ${SQLITE3_REQUIRED_VERSION} REQUIRED)
add_library(wt::sqlite3 ALIAS SQLite::SQLite3)
@ -66,8 +71,8 @@ endif()
# Needed for SQLite3 on some platforms
target_link_libraries(sqlite3_lib PUBLIC
$<$<BOOL:${WT_LINUX}>:${HAVE_LIBPTHREAD}>
$<$<BOOL:${WT_LINUX}>:${HAVE_LIBDL}>
$<$<BOOL:${WT_LINUX}>:Threads::Threads>
$<$<BOOL:${WT_LINUX}>:${CMAKE_DL_LIBS}>
$<$<BOOL:${WT_LINUX}>:m>
)

View File

@ -1,23 +1,11 @@
if(NOT HAVE_LIBZ)
# We don't need to construct a zlib library target.
return()
endif()
# zlib: capability detection + imported target.
#
# Layer 1 (capability): HAVE_LIBZ
# Layer 2 (default policy): cmake/configs/base.cmake
# Layer 3 (user toggle): ENABLE_ZLIB / HAVE_BUILTIN_EXTENSION_ZLIB
if(TARGET wt::zlib)
# Avoid redefining the imported library.
return()
endif()
# Define the imported zlib library target that can be subsequently linked across the build system.
# We use the double colons (::) as a convention to tell CMake that the target name is associated
# with an IMPORTED target (which allows CMake to issue a diagnostic message if the library wasn't found).
add_library(wt::zlib SHARED IMPORTED GLOBAL)
set_target_properties(wt::zlib PROPERTIES
IMPORTED_LOCATION ${HAVE_LIBZ}
IMPORTED_IMPLIB ${HAVE_LIBZ}
)
if (HAVE_LIBZ_INCLUDES)
set_target_properties(wt::zlib PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES ${HAVE_LIBZ_INCLUDES}
)
endif()
# Produces target wt::zlib when the library is available.
wt_find_library(NAME z CMAKE_TARGET zlib
PACKAGE ZLIB TARGET ZLIB::ZLIB
PKGCONFIG_MODULE zlib
HEADER zlib.h)

View File

@ -1,23 +1,11 @@
if(NOT HAVE_LIBZSTD)
# We don't need to construct a zstd library target.
return()
endif()
# zstd: capability detection + imported target.
#
# Layer 1 (capability): HAVE_LIBZSTD
# Layer 2 (default policy): cmake/configs/base.cmake
# Layer 3 (user toggle): ENABLE_ZSTD / HAVE_BUILTIN_EXTENSION_ZSTD
if(TARGET wt::zstd)
# Avoid redefining the imported library.
return()
endif()
# Define the imported zstd library target that can be subsequently linked across the build system.
# We use the double colons (::) as a convention to tell CMake that the target name is associated
# with an IMPORTED target (which allows CMake to issue a diagnostic message if the library wasn't found).
add_library(wt::zstd SHARED IMPORTED GLOBAL)
set_target_properties(wt::zstd PROPERTIES
IMPORTED_LOCATION ${HAVE_LIBZSTD}
IMPORTED_IMPLIB ${HAVE_LIBZSTD}
)
if (HAVE_LIBZSTD_INCLUDES)
set_target_properties(wt::zstd PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES ${HAVE_LIBZSTD_INCLUDES}
)
endif()
# Produces target wt::zstd when the library is available.
wt_find_library(NAME zstd
PACKAGE zstd TARGET zstd::libzstd_shared
PKGCONFIG_MODULE libzstd
HEADER zstd.h)

View File

@ -108,6 +108,11 @@ arch_doc_pages = [
ArchDocPage('arch-metadata',
[],
['src/include/meta.h', 'src/meta/']),
ArchDocPage('arch-page-delta',
['WTI_DELTA_LEAF_MERGE_STATE', 'WT_PAGE_BLOCK_META', 'WT_PAGE_DELTA_CONFIG'],
['src/include/btmem.h', 'src/include/btree.h', 'src/include/connection.h',
'src/btree/bt_page.c', 'src/btree/bt_read.c', 'src/reconcile/rec_row.c',
'src/reconcile/rec_write.c']),
ArchDocPage('arch-prefetch',
['WT_PREFETCH', 'WT_PREFETCH_QUEUE_ENTRY', 'WT_REF'],
['src/btree/bt_prefetch.c', 'src/conn/conn_prefetch.c',

View File

@ -104,8 +104,6 @@ WT_SESSION_LOCKED_LIVE_RESTORE_STATE
WT_SESSION_LOCKED_TABLE_READ
WT_SESSION_LOCKED_TABLE_WRITE
WT_SESSION_LOCKED_TURTLE
WT_SINGLE_THREAD_CHECK_START
WT_SINGLE_THREAD_CHECK_STOP
WT_SIZEOF_FIELD
WT_SPLIT_SAVE_STATE_MAX
WT_STATS_FIELD_TO_OFFSET

View File

@ -71,6 +71,7 @@ perf_hist_leaf_reconstruct_latency_total_usecs
perf_hist_opread_latency_total_usecs
perf_hist_opwrite_latency_total_usecs
txn_rts_upd_aborted_dryrun
read_reject_count
write_reject_count
UNUSED_STAT_FIELDS

View File

@ -658,6 +658,7 @@ conn_stats = [
# Load Control statistics
##########################################
LoadControlStat('read_load', 'read load at the system level'),
LoadControlStat('read_reject_count', 'number of read operations rejected due to load control'),
LoadControlStat('write_load', 'write load at the system level'),
LoadControlStat('write_reject_count', 'number of write operations rejected due to load control'),

View File

@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger",
"branch": "mongodb-master",
"commit": "46bf841062af800d92786a96abe10c02c0d9f17f"
"commit": "3c6ec61a01a4dbdb3351121c0a624d3996e28c03"
}

View File

@ -8,10 +8,12 @@
#include "wt_internal.h"
static int __ckpt_delete_and_merge(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *);
static int __ckpt_process(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *);
static int __ckpt_read_deletion_extlists(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, bool *);
static int __ckpt_reinit_extlists(WT_SESSION_IMPL *, WT_BLOCK_CKPT *);
static int __ckpt_update(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_CKPT *, WT_BLOCK_CKPT *);
static int __ckpt_update_live(WT_SESSION_IMPL *, WT_BLOCK *, WT_CKPT *, WT_BLOCK_CKPT *, uint64_t);
static int __ckpt_validate_state(WT_SESSION_IMPL *, WT_BLOCK *);
/*
@ -686,6 +688,227 @@ __ckpt_reinit_extlists(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
return (0);
}
/*
* __ckpt_update_live --
* Update the live checkpoint: truncate the file, calculate the checkpoint size, and call
* __ckpt_update for the ADD checkpoint. Also resets the live system's alloc and discard extent
* lists so that extents freed by the checkpoint are reclaimed outside of the lock.
*
* The ckpt_size argument is recorded as the size of the ADD checkpoint; the live system's rolling
* checkpoint size is then derived from it by subtracting the root page size and bounding the
* result at the file size.
*
* The caller must hold block->live_lock.
*/
static int
__ckpt_update_live(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, WT_BLOCK_CKPT *ci,
uint64_t ckpt_size)
{
WT_CKPT *ckpt;
WT_ASSERT_SPINLOCK_OWNED(session, &block->live_lock);
/* Truncate the file if that's possible. */
WT_RET(__wti_block_extlist_truncate(session, block, &ci->avail));
/* Update the final, added checkpoint based on the live system. */
WT_CKPT_FOREACH (ckptbase, ckpt)
if (F_ISSET(ckpt, WT_CKPT_ADD)) {
/*
* !!!
* Our caller wants the final checkpoint size. Setting the size here violates layering,
* but the alternative is a call for the btree layer to crack the checkpoint cookie into
* its components, and that's a fair amount of work.
*/
ckpt->size = ckpt_size;
/*
* Set the rolling checkpoint size for the live system. The current size includes the
* current checkpoint's root page size (root pages are on the checkpoint's block
* allocation list as root pages are allocated with the usual block allocation
* functions). That's correct, but we don't want to include it in the size for the next
* checkpoint.
*/
ckpt_size -= ci->root_size;
/*
* Additionally, we had a bug for a while where the live checkpoint size grew without
* bound. We can't sanity check the value, that would require walking the tree as part
* of the checkpoint. Bound any bug at the size of the file. It isn't practical to
* assert that the value is within bounds since databases created with older versions of
* WiredTiger (2.8.0) would likely see an error.
*/
ci->ckpt_size = WT_MIN(ckpt_size, (uint64_t)block->size);
WT_RET_MSG_CHK(session, __ckpt_update(session, block, ckptbase, ckpt, ci),
"updating the live (ADD) checkpoint %s", ckpt->name);
}
/*
* Reset the live system's alloc and discard extent lists, leave the avail list alone. This
* includes freeing a lot of extents, so do it outside of the system's lock by copying and
* resetting the original, then doing the work later.
*/
ci->ckpt_alloc = ci->alloc;
WT_RET_MSG_CHK(session, __wti_block_extlist_init(session, &ci->alloc, "live", "alloc", false),
"resetting the live alloc extent list after copying it to ckpt_alloc");
ci->ckpt_discard = ci->discard;
WT_RET_MSG_CHK(session,
__wti_block_extlist_init(session, &ci->discard, "live", "discard", false),
"resetting the live discard extent list after copying it to ckpt_discard");
#ifdef HAVE_DIAGNOSTIC
/*
* The first checkpoint in the system should always have an empty discard list. If we've read
* that checkpoint and/or created it, check.
*/
WT_BLOCK_CKPT *a;
/* Stop at the first checkpoint in the list that isn't marked for deletion. */
WT_CKPT_FOREACH (ckptbase, ckpt)
if (!F_ISSET(ckpt, WT_CKPT_DELETE))
break;
/* If that checkpoint has no private block information, check the live system instead. */
if ((a = ckpt->bpriv) == NULL)
a = &block->live;
if (a->discard.entries != 0)
WT_RET_MSG(
session, WT_ERROR, "first checkpoint incorrectly has blocks on the discard list");
#endif
return (0);
}
/*
* __ckpt_delete_and_merge --
* Delete checkpoints no longer needed and merge their extent lists into the subsequent
* checkpoint. Handles freeing root pages, freeing extent list blocks, merging alloc and discard
* lists, finding overlapping blocks for reuse, updating checkpoints, and clearing block
* modification tracking. Must be called while the live lock is held.
*
* The ci argument is only passed through to __ckpt_live_blkmods, and only when a deleted
* checkpoint is merged into the live system (the ADD slot).
*/
static int
__ckpt_delete_and_merge(
WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase, WT_BLOCK_CKPT *ci)
{
WT_BLOCK_CKPT *a, *b;
WT_CKPT *ckpt, *next_ckpt;
const char *next_name;
WT_ASSERT_SPINLOCK_OWNED(session, &block->live_lock);
/*
* Delete any no-longer-needed checkpoints: we do this first as it frees blocks to the live
* lists, and the freed blocks will then be included when writing the live extent lists.
*/
WT_CKPT_FOREACH (ckptbase, ckpt) {
/* Only real (non-fake) checkpoints marked for deletion are processed. */
if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE))
continue;
/*
* Set the "from" checkpoint structure. If it applies to a previous object, there's nothing
* more to do.
*/
a = ckpt->bpriv;
if (a->root_objectid != block->objectid)
continue;
if (WT_VERBOSE_LEVEL_ISSET(session, WT_VERB_CHECKPOINT, WT_VERBOSE_DEBUG_2))
__wti_ckpt_verbose(
session, block, "delete", ckpt->name, ckpt->raw.data, ckpt->raw.size);
/*
* Find the checkpoint into which we'll roll this checkpoint's blocks: it's the next real
* checkpoint in the list, and it better have been read in (if it's not the add slot).
*
* NOTE(review): the scan has no explicit bound; it stops at the first entry without
* WT_CKPT_FAKE set. Presumably a non-fake entry (such as the add slot) always follows a
* deleted checkpoint -- confirm against the caller.
*/
for (next_ckpt = ckpt + 1;; ++next_ckpt)
if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
break;
/*
* Set the "to" checkpoint structure, it may be the live tree.
*/
if (F_ISSET(next_ckpt, WT_CKPT_ADD))
b = &block->live;
else
b = next_ckpt->bpriv;
/* Name of the "to" checkpoint for error messages; use "live" if it has no name. */
next_name = next_ckpt->name != NULL ? next_ckpt->name : "live";
/*
* Free the root page: there's nothing special about this free, the root page is allocated
* using normal rules, that is, it may have been taken from the avail list, and was entered
* on the live system's alloc list at that time. We free it into the checkpoint's discard
* list, however, not the live system's list because it appears on the checkpoint's alloc
* list and so must be paired in the checkpoint.
*/
if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
WT_RET_MSG_CHK(session,
__wti_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size),
"inserting root page into discard list for deleted checkpoint %s", ckpt->name);
/*
* Free the blocks used to hold the "from" checkpoint's extent lists, including the avail
* list.
*/
WT_RET_MSG_CHK(session, __ckpt_extlist_fblocks(session, block, &a->alloc),
"freeing alloc extent list blocks for deleted checkpoint %s", ckpt->name);
WT_RET_MSG_CHK(session, __ckpt_extlist_fblocks(session, block, &a->avail),
"freeing avail extent list blocks for deleted checkpoint %s", ckpt->name);
WT_RET_MSG_CHK(session, __ckpt_extlist_fblocks(session, block, &a->discard),
"freeing discard extent list blocks for deleted checkpoint %s", ckpt->name);
/*
* Roll the "from" alloc and discard extent lists into the "to" checkpoint's lists.
*/
if (a->alloc.entries != 0)
WT_RET_MSG_CHK(session, __wti_block_extlist_merge(session, block, &a->alloc, &b->alloc),
"merging alloc extent list from deleted checkpoint %s", ckpt->name);
if (a->discard.entries != 0)
WT_RET_MSG_CHK(session,
__wti_block_extlist_merge(session, block, &a->discard, &b->discard),
"merging discard extent list from deleted checkpoint %s", ckpt->name);
/*
* If the "to" checkpoint is also being deleted, we're done with it, it's merged into some
* other checkpoint in the next loop. This means the extent lists may aggregate over a
* number of checkpoints, but that's OK, they're disjoint sets of ranges.
*/
if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
continue;
/*
* Find blocks for re-use: wherever the "to" checkpoint's allocate and discard lists
* overlap, move the range to the live system's checkpoint available list.
*/
WT_RET_MSG_CHK(session, __wti_block_extlist_overlap(session, block, b),
"finding reusable blocks in checkpoint %s", next_name);
/*
* If we're updating the live system's information, we're done.
*/
if (F_ISSET(next_ckpt, WT_CKPT_ADD)) {
/* Clear any possible blocks that are now available after merging. */
WT_RET_MSG_CHK(session, __ckpt_live_blkmods(session, ckptbase, ci, block, false),
"clearing live block modifications after merging into %s", next_name);
continue;
}
/*
* We have to write the "to" checkpoint's extent lists out in new blocks, and update its
* cookie.
*
* Free the blocks used to hold the "to" checkpoint's extent lists; don't include the avail
* list, it's not changing.
*/
WT_RET_MSG_CHK(session, __ckpt_extlist_fblocks(session, block, &b->alloc),
"freeing alloc extent list blocks for updated checkpoint %s", next_name);
WT_RET_MSG_CHK(session, __ckpt_extlist_fblocks(session, block, &b->discard),
"freeing discard extent list blocks for updated checkpoint %s", next_name);
/* Mark the "to" checkpoint so the loop below rewrites it. */
F_SET(next_ckpt, WT_CKPT_UPDATE);
}
/* Update checkpoints marked for update. */
WT_CKPT_FOREACH (ckptbase, ckpt)
if (F_ISSET(ckpt, WT_CKPT_UPDATE))
WT_RET_MSG_CHK(session, __ckpt_update(session, block, ckptbase, ckpt, ckpt->bpriv),
"updating checkpoint %s after deletion merge", ckpt->name);
return (0);
}
/*
* __ckpt_process --
* Process the list of checkpoints.
@ -693,8 +916,8 @@ __ckpt_reinit_extlists(WT_SESSION_IMPL *session, WT_BLOCK_CKPT *ci)
static int
__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
WT_BLOCK_CKPT *a, *b, *ci;
WT_CKPT *ckpt, *next_ckpt;
WT_BLOCK_CKPT *ci;
WT_CKPT *ckpt;
WT_DECL_RET;
uint64_t ckpt_size;
bool deleting, fatal;
@ -779,170 +1002,11 @@ __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
if (!deleting)
goto live_update;
/*
* Delete any no-longer-needed checkpoints: we do this first as it frees blocks to the live
* lists, and the freed blocks will then be included when writing the live extent lists.
*/
WT_CKPT_FOREACH (ckptbase, ckpt) {
if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE))
continue;
/*
* Set the "from" checkpoint structure. If it applies to a previous object, there's nothing
* more to do.
*/
a = ckpt->bpriv;
if (a->root_objectid != block->objectid)
continue;
if (WT_VERBOSE_LEVEL_ISSET(session, WT_VERB_CHECKPOINT, WT_VERBOSE_DEBUG_2))
__wti_ckpt_verbose(
session, block, "delete", ckpt->name, ckpt->raw.data, ckpt->raw.size);
/*
* Find the checkpoint into which we'll roll this checkpoint's blocks: it's the next real
* checkpoint in the list, and it better have been read in (if it's not the add slot).
*/
for (next_ckpt = ckpt + 1;; ++next_ckpt)
if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
break;
/*
* Set the "to" checkpoint structure, it may be the live tree.
*/
if (F_ISSET(next_ckpt, WT_CKPT_ADD))
b = &block->live;
else
b = next_ckpt->bpriv;
/*
* Free the root page: there's nothing special about this free, the root page is allocated
* using normal rules, that is, it may have been taken from the avail list, and was entered
* on the live system's alloc list at that time. We free it into the checkpoint's discard
* list, however, not the live system's list because it appears on the checkpoint's alloc
* list and so must be paired in the checkpoint.
*/
if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
WT_ERR(
__wti_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size));
/*
* Free the blocks used to hold the "from" checkpoint's extent lists, including the avail
* list.
*/
WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));
/*
* Roll the "from" alloc and discard extent lists into the "to" checkpoint's lists.
*/
if (a->alloc.entries != 0)
WT_ERR(__wti_block_extlist_merge(session, block, &a->alloc, &b->alloc));
if (a->discard.entries != 0)
WT_ERR(__wti_block_extlist_merge(session, block, &a->discard, &b->discard));
/*
* If the "to" checkpoint is also being deleted, we're done with it, it's merged into some
* other checkpoint in the next loop. This means the extent lists may aggregate over a
* number of checkpoints, but that's OK, they're disjoint sets of ranges.
*/
if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
continue;
/*
* Find blocks for re-use: wherever the "to" checkpoint's allocate and discard lists
* overlap, move the range to the live system's checkpoint available list.
*/
WT_ERR(__wti_block_extlist_overlap(session, block, b));
/*
* If we're updating the live system's information, we're done.
*/
if (F_ISSET(next_ckpt, WT_CKPT_ADD)) {
/* Clear any possible blocks that are now available after merging. */
WT_ERR(__ckpt_live_blkmods(session, ckptbase, ci, block, false));
continue;
}
/*
* We have to write the "to" checkpoint's extent lists out in new blocks, and update its
* cookie.
*
* Free the blocks used to hold the "to" checkpoint's extent lists; don't include the avail
* list, it's not changing.
*/
WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));
F_SET(next_ckpt, WT_CKPT_UPDATE);
}
/* Update checkpoints marked for update. */
WT_CKPT_FOREACH (ckptbase, ckpt)
if (F_ISSET(ckpt, WT_CKPT_UPDATE))
WT_ERR(__ckpt_update(session, block, ckptbase, ckpt, ckpt->bpriv));
/* Delete checkpoints no longer needed and merge their extent lists into subsequent ones. */
WT_ERR(__ckpt_delete_and_merge(session, block, ckptbase, ci));
live_update:
/* Truncate the file if that's possible. */
WT_ERR(__wti_block_extlist_truncate(session, block, &ci->avail));
/* Update the final, added checkpoint based on the live system. */
WT_CKPT_FOREACH (ckptbase, ckpt)
if (F_ISSET(ckpt, WT_CKPT_ADD)) {
/*
* !!!
* Our caller wants the final checkpoint size. Setting the size here violates layering,
* but the alternative is a call for the btree layer to crack the checkpoint cookie into
* its components, and that's a fair amount of work.
*/
ckpt->size = ckpt_size;
/*
* Set the rolling checkpoint size for the live system. The current size includes the
* current checkpoint's root page size (root pages are on the checkpoint's block
* allocation list as root pages are allocated with the usual block allocation
* functions). That's correct, but we don't want to include it in the size for the next
* checkpoint.
*/
ckpt_size -= ci->root_size;
/*
* Additionally, we had a bug for awhile where the live checkpoint size grew without
* bound. We can't sanity check the value, that would require walking the tree as part
* of the checkpoint. Bound any bug at the size of the file. It isn't practical to
* assert that the value is within bounds since databases created with older versions of
* WiredTiger (2.8.0) would likely see an error.
*/
ci->ckpt_size = WT_MIN(ckpt_size, (uint64_t)block->size);
WT_ERR(__ckpt_update(session, block, ckptbase, ckpt, ci));
}
/*
* Reset the live system's alloc and discard extent lists, leave the avail list alone. This
* includes freeing a lot of extents, so do it outside of the system's lock by copying and
* resetting the original, then doing the work later.
*/
ci->ckpt_alloc = ci->alloc;
WT_ERR(__wti_block_extlist_init(session, &ci->alloc, "live", "alloc", false));
ci->ckpt_discard = ci->discard;
WT_ERR(__wti_block_extlist_init(session, &ci->discard, "live", "discard", false));
#ifdef HAVE_DIAGNOSTIC
/*
* The first checkpoint in the system should always have an empty discard list. If we've read
* that checkpoint and/or created it, check.
*/
WT_CKPT_FOREACH (ckptbase, ckpt)
if (!F_ISSET(ckpt, WT_CKPT_DELETE))
break;
if ((a = ckpt->bpriv) == NULL)
a = &block->live;
if (a->discard.entries != 0)
WT_ERR_MSG(
session, WT_ERROR, "first checkpoint incorrectly has blocks on the discard list");
#endif
WT_ERR(__ckpt_update_live(session, block, ckptbase, ci, ckpt_size));
err:
if (ret != 0 && fatal) {

View File

@ -169,12 +169,11 @@ __layered_reset_ingest_table_prune_timestamp(WT_SESSION_IMPL *session, const cha
WT_DECL_RET;
wt_timestamp_t btree_prune_timestamp;
WT_ERR_NOTFOUND_OK(__wt_session_get_dhandle(session, ingest_uri, NULL, NULL, 0), true);
if (ret == WT_NOTFOUND) {
WT_RET_ERROR_OK(ret = __wt_session_get_dhandle(session, ingest_uri, NULL, NULL, 0), ENOENT);
if (ret == ENOENT) {
__wt_verbose_level(session, WT_VERB_LAYERED, WT_VERBOSE_DEBUG_5,
"Handle not found for ingest table uri: %s", ingest_uri);
ret = 0;
goto err;
return (0);
}
btree = (WT_BTREE *)session->dhandle->handle;
@ -186,9 +185,8 @@ __layered_reset_ingest_table_prune_timestamp(WT_SESSION_IMPL *session, const cha
__wt_atomic_store_uint64_relaxed(&btree->prune_timestamp, WT_TS_NONE);
WT_ERR(__wt_session_release_dhandle(session));
WT_RET(__wt_session_release_dhandle(session));
err:
return (ret);
}
@ -746,9 +744,9 @@ __layered_drain_ingest_table_and_truncate_list(WT_SESSION_IMPL *session, const c
WT_RET(__wt_scr_alloc(session, 0, &layered_uri_buf));
WT_ERR(__layered_derive_layered_uri(session, ingest_uri, layered_uri_buf));
WT_ERR_NOTFOUND_OK(
__wt_session_get_dhandle(session, layered_uri_buf->data, NULL, NULL, 0), true);
if (ret == WT_NOTFOUND) {
WT_ERR_ERROR_OK(
__wt_session_get_dhandle(session, layered_uri_buf->data, NULL, NULL, 0), ENOENT, true);
if (ret == ENOENT) {
__wt_verbose_level(session, WT_VERB_LAYERED, WT_VERBOSE_DEBUG_5,
"No layered handle found for ingest table \"%s\", only performing ingest drain",
ingest_uri);
@ -1017,13 +1015,12 @@ __layered_update_ingest_table_prune_timestamp(WT_SESSION_IMPL *session, const ch
layered_table = NULL;
prune_timestamp = WT_TS_NONE;
/*
* Get the layered table from the provided URI. We don't hold any global locks, so it's
* possible that it was already removed.
*/
WT_ERR_NOTFOUND_OK(__wt_session_get_dhandle(session, layered_uri, NULL, NULL, 0), true);
if (ret == WT_NOTFOUND) {
WT_RET_ERROR_OK(ret = __wt_session_get_dhandle(session, layered_uri, NULL, NULL, 0), ENOENT);
if (ret == ENOENT) {
__wt_verbose_level(session, WT_VERB_LAYERED, WT_VERBOSE_DEBUG_5,
"GC %s: Layered table was not found.", layered_uri);
return (0);
@ -1106,9 +1103,9 @@ __layered_update_ingest_table_prune_timestamp(WT_SESSION_IMPL *session, const ch
* that it hasn't been opened yet. In that case, we need to skip updating its timestamp for
* pruning, and we'll get another chance to update the prune timestamp at the next checkpoint.
*/
WT_ERR_NOTFOUND_OK(
__wt_session_get_dhandle(session, layered_table->ingest_uri, NULL, NULL, 0), true);
if (ret == WT_NOTFOUND) {
WT_ERR_ERROR_OK(
__wt_session_get_dhandle(session, layered_table->ingest_uri, NULL, NULL, 0), ENOENT, true);
if (ret == ENOENT) {
__wt_verbose_level(session, WT_VERB_LAYERED, WT_VERBOSE_DEBUG_5,
"GC %s: Handle not found for ingest table uri: %s", layered_table->iface.name,
layered_table->ingest_uri);

View File

@ -174,6 +174,8 @@ __curds_next(WT_CURSOR *cursor)
CURSOR_API_CALL(cursor, session, ret, next, NULL);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_STAT_CONN_DSRC_INCR(session, cursor_next);
F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
@ -197,6 +199,7 @@ __curds_prev(WT_CURSOR *cursor)
source = ((WT_CURSOR_DATA_SOURCE *)cursor)->source;
CURSOR_API_CALL(cursor, session, ret, prev, NULL);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_STAT_CONN_DSRC_INCR(session, cursor_prev);
@ -247,6 +250,8 @@ __curds_search(WT_CURSOR *cursor)
CURSOR_API_CALL(cursor, session, ret, search, NULL);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_STAT_CONN_DSRC_INCR(session, cursor_search);
WT_ERR(__curds_key_set(cursor));
@ -271,6 +276,8 @@ __curds_search_near(WT_CURSOR *cursor, int *exact)
CURSOR_API_CALL(cursor, session, ret, search_near, NULL);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_STAT_CONN_DSRC_INCR(session, cursor_search_near);
WT_ERR(__curds_key_set(cursor));
@ -295,6 +302,8 @@ __curds_insert(WT_CURSOR *cursor)
CURSOR_UPDATE_API_CALL(cursor, session, ret, insert, NULL);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_STAT_CONN_DSRC_INCR(session, cursor_insert);
WT_STAT_DSRC_INCRV(session, cursor_insert_bytes, cursor->key.size + cursor->value.size);
@ -323,6 +332,8 @@ __curds_update(WT_CURSOR *cursor)
CURSOR_UPDATE_API_CALL(cursor, session, ret, update, NULL);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_STAT_CONN_DSRC_INCR(session, cursor_update);
WT_STAT_CONN_DSRC_INCRV(session, cursor_update_bytes, cursor->value.size);
@ -350,6 +361,8 @@ __curds_remove(WT_CURSOR *cursor)
CURSOR_REMOVE_API_CALL(cursor, session, ret, NULL);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_STAT_CONN_DSRC_INCR(session, cursor_remove);
WT_STAT_CONN_DSRC_INCRV(session, cursor_remove_bytes, cursor->key.size);
@ -376,6 +389,8 @@ __curds_reserve(WT_CURSOR *cursor)
CURSOR_UPDATE_API_CALL(cursor, session, ret, reserve, NULL);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_STAT_CONN_DSRC_INCR(session, cursor_reserve);
WT_ERR(__curds_key_set(cursor));

View File

@ -178,6 +178,12 @@ __curfile_next(WT_CURSOR *cursor)
cbt = (WT_CURSOR_BTREE *)cursor;
CURSOR_API_CALL(cursor, session, ret, next, cbt->dhandle);
CURSOR_REPOSITION_ENTER(cursor, session);
/*
* If this is a user cursor call, check for system overload before doing any work.
*/
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__cursor_copy_release(cursor));
WT_ERR(__curfile_check_cbt_txn(session, cbt));
@ -238,6 +244,8 @@ __curfile_prev(WT_CURSOR *cursor)
cbt = (WT_CURSOR_BTREE *)cursor;
CURSOR_API_CALL(cursor, session, ret, prev, cbt->dhandle);
CURSOR_REPOSITION_ENTER(cursor, session);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__cursor_copy_release(cursor));
WT_ERR(__curfile_check_cbt_txn(session, cbt));
@ -305,6 +313,9 @@ __curfile_search(WT_CURSOR *cursor)
CURSOR_API_CALL(cursor, session, ret, search, cbt->dhandle);
API_RETRYABLE(session);
CURSOR_REPOSITION_ENTER(cursor, session);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__cursor_copy_release(cursor));
WT_ERR(__cursor_checkkey(cursor));
@ -343,6 +354,9 @@ __curfile_search_near(WT_CURSOR *cursor, int *exact)
CURSOR_API_CALL(cursor, session, ret, search_near, cbt->dhandle);
API_RETRYABLE(session);
CURSOR_REPOSITION_ENTER(cursor, session);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__cursor_copy_release(cursor));
WT_ERR(__cursor_checkkey(cursor));
@ -379,6 +393,9 @@ __curfile_insert(WT_CURSOR *cursor)
cbt = (WT_CURSOR_BTREE *)cursor;
CURSOR_UPDATE_API_CALL_BTREE(cursor, session, ret, insert);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__cursor_copy_release(cursor));
if (!F_ISSET(cursor, WT_CURSTD_APPEND))
@ -421,6 +438,9 @@ __wt_curfile_insert_check(WT_CURSOR *cursor)
cbt = (WT_CURSOR_BTREE *)cursor;
tret = 0;
CURSOR_UPDATE_API_CALL_BTREE(cursor, session, ret, insert_check);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__cursor_copy_release(cursor));
WT_ERR(__cursor_checkkey(cursor));
@ -449,6 +469,9 @@ __curfile_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
cbt = (WT_CURSOR_BTREE *)cursor;
CURSOR_UPDATE_API_CALL_BTREE(cursor, session, ret, modify);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__cursor_copy_release(cursor));
WT_ERR(__cursor_checkkey(cursor));
@ -487,6 +510,9 @@ __curfile_update(WT_CURSOR *cursor)
cbt = (WT_CURSOR_BTREE *)cursor;
CURSOR_UPDATE_API_CALL_BTREE(cursor, session, ret, update);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__cursor_copy_release(cursor));
WT_ERR(__cursor_checkkey(cursor));
WT_ERR(__cursor_checkvalue(cursor));
@ -530,6 +556,9 @@ __curfile_remove(WT_CURSOR *cursor)
cbt = (WT_CURSOR_BTREE *)cursor;
CURSOR_REMOVE_API_CALL(cursor, session, ret, cbt->dhandle);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__cursor_copy_release(cursor));
WT_ERR(__cursor_checkkey(cursor));
@ -572,6 +601,9 @@ __curfile_reserve(WT_CURSOR *cursor)
cbt = (WT_CURSOR_BTREE *)cursor;
CURSOR_UPDATE_API_CALL_BTREE(cursor, session, ret, reserve);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__cursor_copy_release(cursor));
WT_ERR(__cursor_checkkey(cursor));

View File

@ -1323,6 +1323,9 @@ __clayered_next(WT_CURSOR *cursor)
clayered = (WT_CURSOR_LAYERED *)cursor;
CURSOR_API_CALL(cursor, session, ret, next, clayered->dhandle);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
__cursor_novalue(cursor);
WT_ERR(__cursor_copy_release(cursor));
@ -1354,6 +1357,9 @@ __clayered_prev(WT_CURSOR *cursor)
clayered = (WT_CURSOR_LAYERED *)cursor;
CURSOR_API_CALL(cursor, session, ret, prev, clayered->dhandle);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
__cursor_novalue(cursor);
WT_ERR(__cursor_copy_release(cursor));
@ -1718,7 +1724,7 @@ __clayered_lookup(WT_SESSION_IMPL *session, WT_CURSOR_LAYERED *clayered, WT_ITEM
__clayered_lookup_constituent(clayered->stable_cursor, clayered, value), true);
err:
if (ret != 0 && ret != WT_PREPARE_CONFLICT) {
if (ret != 0) {
WT_TRET(__clayered_reset_cursors(clayered, false));
/* Reset the buffer if the key was deleted on the ingest table. */
value->data = NULL;
@ -1748,6 +1754,8 @@ __clayered_search(WT_CURSOR *cursor)
__cursor_novalue(cursor);
WT_ERR(__clayered_enter(clayered, true, true, false));
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_STAT_CONN_DSRC_INCR(session, layered_curs_search);
WT_ERR(__clayered_lookup(session, clayered, &cursor->value));
WT_ITEM_SET(cursor->key, clayered->current_cursor->key);
@ -2008,7 +2016,7 @@ done:
}
err:
if (ret != 0 && ret != WT_PREPARE_CONFLICT)
if (ret != 0)
WT_TRET(__clayered_reset_cursors(clayered, false));
return (ret);
@ -2034,6 +2042,8 @@ __clayered_search_near(WT_CURSOR *cursor, int *exactp)
__cursor_novalue(cursor);
WT_ERR(__clayered_enter(clayered, true, true, false));
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__clayered_search_near_int(session, cursor, exactp));
WT_ITEM_SET(cursor->key, clayered->current_cursor->key);
@ -2067,6 +2077,8 @@ __clayered_reserve_constituent(WT_SESSION_IMPL *session, WT_CURSOR *constituent)
WT_DECL_RET;
CURSOR_UPDATE_API_CALL_BTREE(constituent, session, ret, reserve);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
/*
* Pass overwrite=true for followers: a follower's ingest table may not contain the key yet (it
* lives only in the stable table), so we need overwrite mode to allow the reserve to succeed
@ -2270,6 +2282,9 @@ __clayered_insert(WT_CURSOR *cursor)
clayered = (WT_CURSOR_LAYERED *)cursor;
CURSOR_UPDATE_API_CALL(cursor, session, ret, insert, clayered->dhandle);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
/* Insert doesn't keep the cursor positioned. Always clear the iteration flags. */
F_CLR(clayered, WT_CLAYERED_ITERATE_NEXT | WT_CLAYERED_ITERATE_PREV);
WT_ERR(__cursor_copy_release(cursor));
@ -2328,6 +2343,9 @@ __clayered_update(WT_CURSOR *cursor)
clayered = (WT_CURSOR_LAYERED *)cursor;
CURSOR_UPDATE_API_CALL(cursor, session, ret, update, clayered->dhandle);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
/*
* Update keeps the cursor positioned. Retain the iteration flags if we are in the middle of a
* cursor traversal.
@ -2400,6 +2418,9 @@ __clayered_remove(WT_CURSOR *cursor)
__cursor_novalue(cursor);
WT_ERR(__clayered_enter(clayered, false, true, false));
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
/*
* Copy the key out, since the insert resets non-primary chunk cursors which our lookup may have
* landed on.
@ -2442,6 +2463,8 @@ __clayered_reserve(WT_CURSOR *cursor)
CURSOR_UPDATE_API_CALL(cursor, session, ret, reserve, clayered->dhandle);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
/*
* Since a search will be performed afterward that clears the iteration flags, no point to
* retain the flags.
@ -2834,6 +2857,9 @@ __clayered_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
WT_CURSOR_LAYERED *clayered = (WT_CURSOR_LAYERED *)cursor;
CURSOR_UPDATE_API_CALL(cursor, session, ret, modify, clayered->dhandle);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
/*
* Modify keeps the cursor positioned. Retain the iteration flags if we are in the middle of a
* cursor traversal.

View File

@ -266,6 +266,9 @@ __curtable_next(WT_CURSOR *cursor)
ctable = (WT_CURSOR_TABLE *)cursor;
CURSOR_API_CALL(cursor, session, ret, next, NULL);
CURSOR_REPOSITION_ENTER(cursor, session);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
APPLY_CG(ctable, next);
err:
@ -321,6 +324,9 @@ __curtable_prev(WT_CURSOR *cursor)
ctable = (WT_CURSOR_TABLE *)cursor;
CURSOR_API_CALL(cursor, session, ret, prev, NULL);
CURSOR_REPOSITION_ENTER(cursor, session);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
APPLY_CG(ctable, prev);
err:
@ -377,6 +383,9 @@ __curtable_search(WT_CURSOR *cursor)
CURSOR_API_CALL(cursor, session, ret, search, NULL);
API_RETRYABLE(session);
CURSOR_REPOSITION_ENTER(cursor, session);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
APPLY_CG(ctable, search);
err:
@ -403,6 +412,8 @@ __curtable_search_near(WT_CURSOR *cursor, int *exact)
API_RETRYABLE(session);
CURSOR_REPOSITION_ENTER(cursor, session);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
cp = ctable->cg_cursors;
primary = *cp;
WT_ERR(primary->search_near(primary, exact));
@ -437,6 +448,9 @@ __curtable_insert(WT_CURSOR *cursor)
ctable = (WT_CURSOR_TABLE *)cursor;
CURSOR_UPDATE_API_CALL(cursor, session, ret, insert, NULL);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__curtable_open_indices(ctable));
cp = ctable->cg_cursors;
@ -515,6 +529,9 @@ __curtable_update(WT_CURSOR *cursor)
ctable = (WT_CURSOR_TABLE *)cursor;
CURSOR_UPDATE_API_CALL(cursor, session, ret, update, NULL);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__curtable_open_indices(ctable));
/*
@ -567,6 +584,9 @@ __curtable_remove(WT_CURSOR *cursor)
ctable = (WT_CURSOR_TABLE *)cursor;
CURSOR_REMOVE_API_CALL(cursor, session, ret, NULL);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
WT_ERR(__curtable_open_indices(ctable));
/* Check if the cursor was positioned. */
@ -613,6 +633,8 @@ __curtable_reserve(WT_CURSOR *cursor)
ctable = (WT_CURSOR_TABLE *)cursor;
CURSOR_UPDATE_API_CALL(cursor, session, ret, reserve, NULL);
CURSOR_API_CHECK_SYSTEM_OVERLOAD(session, ret);
/*
* We don't have to open the indices here, but it makes the code similar to other cursor
* functions, and it's odd for a reserve call to succeed but the subsequent update fail opening

View File

@ -199,6 +199,7 @@ of the documentation.
- @ref arch-fs-os
- @subpage arch-toc-disaggregated-storage
- @ref arch-page-delta
- @ref arch-disagg-layered
- @ref arch-disagg-layered-cursor
- @ref arch-disagg-key-provider
@ -380,6 +381,7 @@ operation.
/*! @page arch-toc-disaggregated-storage Disaggregated Storage
@subpage arch-page-delta
@subpage arch-disagg-layered
@subpage arch-disagg-layered-cursor
@subpage arch-disagg-key-provider

View File

@ -0,0 +1,154 @@
/*! @arch_page arch-page-delta Page Deltas
WiredTiger's page delta feature reduces write amplification and I/O by writing
only the changed entries for a page (a <i>delta</i>) rather than always
rewriting the full page image during reconciliation. This feature is used
exclusively with disaggregated (layered) row-store btrees.
@section pd_goals Goals and high-level model
Instead of always writing a full page image on reconciliation, WiredTiger can write:
- A base page image (full page), and
- A bounded chain of deltas on top of that base.
On disk the block for a page therefore contains either:
- Only a full page image, or
- A full page image plus one or more delta images.
A <i>delta chain</i> is the sequence of deltas that must be applied to the most
recent full page image (the "base image") to reconstruct the latest page state. Its
maximum length is bounded by the \c page_delta.max_consecutive_delta configuration
option.
Page deltas only apply to row-store pages (\c WT_PAGE_ROW_LEAF and
\c WT_PAGE_ROW_INT). Column-store pages do not participate in this framework.
Deltas are also never written for non-disaggregated tables: the btree must have
the \c WT_BTREE_DISAGGREGATED flag set (set automatically for layered btrees).
@section pd_config Configuration
Page delta behavior is controlled via the \c page_delta sub-config at
\c wiredtiger_open or \c reconfigure time:
- \c page_delta.leaf_page_delta (bool): enable leaf page deltas.
- \c page_delta.internal_page_delta (bool): enable internal page deltas.
- \c page_delta.delta_pct (int): maximum allowed size of a delta as a percentage
of the full page size. Candidates that exceed this threshold fall back to a
full page write.
- \c page_delta.max_consecutive_delta (int): maximum number of deltas in a chain
for a single page. When this limit would be exceeded, reconciliation writes a
full page image and resets the chain.
These settings are stored at the connection level in \c WT_PAGE_DELTA_CONFIG
(see \c src/include/connection.h). The flags \c WT_LEAF_PAGE_DELTA and
\c WT_INTERNAL_PAGE_DELTA are derived from them at startup or reconfig time.
Whether a given btree and page type actually use deltas is controlled by the
macros \c WT_DELTA_LEAF_ENABLED and \c WT_DELTA_INT_ENABLED, both of which
additionally require the btree to have the \c WT_BTREE_DISAGGREGATED flag set.
@section pd_generation Generating deltas (reconciliation)
Deltas are generated during reconciliation (\c __wt_reconcile). The two page
types follow different paths:
- For a leaf page: after the full disk image is built, the delta build function
is called if the page has not undergone an in-memory split and the reconciliation
produced a single block result.
- For an internal page: the delta decision and entry packing happen concurrently
with building the full page image. Modified children are packed into the delta buffer
as the child-walk proceeds. Once the full page image is complete, the write generation
field is updated onto the already-populated delta buffer.
After a candidate delta is built, the write path validates whether the delta can
be written or whether a full page image is required, and takes the decision accordingly:
1. If \c block_meta->delta_count has reached \c max_consecutive_delta, the delta
is rejected and a full page image is written instead, resetting the chain.
2. If the built delta's size exceeds \c delta_pct percent of the full page image, it
is also rejected.
3. Several structural reasons can also prevent a delta from being used: the
reconcile produced multiple blocks, the page has no stable LSN yet, the
result is not a single page, the leaf delta is empty, or the delta build
returned no entries.
When a delta is accepted, it is written by \c __rec_write_delta and
\c block_meta->delta_count is incremented.
@subsection pd_leaf Leaf page deltas
\c __rec_build_delta_leaf iterates over the saved-updates list (\c supd) for the
page. For each key whose selected on-page value has changed since the last
reconciliation (\c __rec_selected_key_changed), it packs a key/value entry into
the delta buffer via \c __wti_rec_pack_delta_row_leaf. Only keys that actually
changed are included, making the delta compact for workloads with sparse updates.
Each packed entry carries full MVCC time-window metadata so reconstruction can
respect visibility rules identically to a full page image.
@subsection pd_internal Internal page deltas
Internal page deltas are built during the row-store internal reconcile pass in
\c src/reconcile/rec_row.c. Each child-address change (insert, update, or delete
of a sub-tree pointer) is packed into \c r->delta.
@section pd_key_files Key files
| File | Role |
|------|------|
| \c src/reconcile/rec_write.c | Delta creation decision and writing (\c __rec_build_delta, \c __rec_split_write, \c __rec_write_delta) |
| \c src/reconcile/rec_row.c | Delta packing per row for internal pages (\c __rec_pack_delta_row_int, \c __rec_build_delta_int) and leaf pages (\c __rec_build_delta_leaf entry helpers) |
| \c src/reconcile/rec_visibility.c | Update selection and time-window handling used by leaf delta packing |
| \c src/btree/bt_page.c | Merge helpers on read (\c __wti_page_merge_deltas_with_base_image_leaf, \c __wti_page_merge_deltas_with_base_image_int) |
| \c src/btree/bt_read.c | Page fault-in and delta reconstruction (\c __page_read_build_full_disk_image) |
| \c src/include/btmem.h | Chain-tracking structures (\c WT_PAGE_BLOCK_META, \c WT_PAGE_DISAGG_INFO) |
| \c src/include/connection.h | Connection-level delta config (\c WT_PAGE_DELTA_CONFIG) |
| \c src/include/btree.h | Enable macros (\c WT_DELTA_LEAF_ENABLED, \c WT_DELTA_INT_ENABLED) |
| \c src/reconcile/reconcile_private.h | Reconcile-time macros (\c WT_BUILD_DELTA_LEAF, \c WT_BUILD_DELTA_INT, \c WT_REC_RESULT_SINGLE_PAGE) |
@section pd_on_disk On-disk layout and chain tracking
The per-page delta chain state is kept in \c WT_PAGE_BLOCK_META
(see \c src/include/btmem.h):
- \c delta_count: number of deltas written on top of the current base image
(a \c uint8_t, so the theoretical maximum is 255; keep
\c max_consecutive_delta well below this in practice).
- \c base_lsn: LSN of the base full-image block in the page log.
- \c backlink_lsn: LSN of the previous delta (or base) in the chain.
- \c cumulative_size: total byte size of the base image plus all deltas.
\c WT_PAGE_BLOCK_META lives inside \c WT_PAGE_DISAGG_INFO, which is attached to
the in-memory page via \c page->disagg_info.
@section pd_reconstruction Reconstruction on read
When a page is read from storage and has \c delta_count > 0, the read path in
\c __page_read_build_full_disk_image (see \c src/btree/bt_read.c) merges the
base image with all delta images into a single complete disk image before
building the in-memory page. The rest of the read path then proceeds identically
to the non-delta case.
The merge helpers live in \c src/btree/bt_page.c:
- \c __wti_page_merge_deltas_with_base_image_leaf: merges base leaf image and
all leaf deltas in key order. For each key the newest delta entry wins;
delta deletes remove the key. The result is a valid leaf page disk image
reflecting the latest logical state.
- \c __wti_page_merge_deltas_with_base_image_int: merges base internal image
and all internal deltas. Each child address is either kept from the base or
replaced/removed according to the newest delta entry.
@section pd_tuning Tuning
The primary tuning levers are:
- \c page_delta.delta_pct: raise to allow larger deltas, lower to keep them compact.
- \c page_delta.max_consecutive_delta: raise to extend chains and maximize write
savings, lower to bound read reconstruction cost.
The right balance depends on the read/write ratio and how many keys are updated
per page per checkpoint cycle. Write-heavy workloads with sparse updates benefit
most. Read-heavy workloads or workloads that touch every key on a page should
use short chains (\c max_consecutive_delta of 4 to 8).
*/

View File

@ -127,6 +127,7 @@ NUMA
NoSQL
OOP
OPTYPE
Observability
PALI
PALite
PMU
@ -220,6 +221,7 @@ autoconf
autogen
automake
backend
backlink
basecfg
basho
benchmarking
@ -544,6 +546,7 @@ msec
msg
msgs
multi
multiblock
multiprocess
multithreaded
multithreading
@ -720,6 +723,7 @@ subpage
substring
subsubsection
sudo
supd
superset
svg
sys

View File

@ -8,54 +8,6 @@
#pragma once
#ifdef HAVE_DIAGNOSTIC
/*
* Capture cases where a single session handle is used by multiple threads in parallel. The check
* isn't trivial because some API calls re-enter via public API entry points and the session with ID
* 0 is the default session in the connection handle which can be used across multiple threads.
*/
#define WT_SINGLE_THREAD_CHECK_START(s) \
{ \
uintmax_t __tmp_api_tid; \
__wt_thread_id(&__tmp_api_tid); \
\
/* \
* Only a single thread should use this session at a time. It's ok \
* (but unexpected) if different threads use the session consecutively, \
* but concurrent access is not allowed. Verify this by having the thread \
* take a lock on first API access. Failing to take the lock implies \
* another thread holds it and we're attempting concurrent access of the \
* session. \
* \
* The default session (ID == 0) is an exception where concurrent access \
* is allowed. We can also skip taking the lock if we're re-entrant and \
* already hold it. \
*/ \
if (!WT_SESSION_IS_DEFAULT(s) && (s)->thread_check.owning_thread != __tmp_api_tid) { \
bool lock_success = __wt_spin_trylock((s), &(s)->thread_check.lock); \
WT_ASSERT((s), lock_success == 0); \
(s)->thread_check.owning_thread = __tmp_api_tid; \
} \
\
++(s)->thread_check.entry_count; \
}
#define WT_SINGLE_THREAD_CHECK_STOP(s) \
{ \
uintmax_t __tmp_api_tid; \
__wt_thread_id(&__tmp_api_tid); \
if (--((s)->thread_check.entry_count) == 0) { \
if ((s)->id != 0) { \
(s)->thread_check.owning_thread = 0; \
__wt_spin_unlock((s), &(s)->thread_check.lock); \
} \
} \
}
#else
#define WT_SINGLE_THREAD_CHECK_START(s)
#define WT_SINGLE_THREAD_CHECK_STOP(s)
#endif
#define API_SESSION_PUSH(s, struct_name, func_name, dh) \
WT_DATA_HANDLE *__olddh = (s)->dhandle; \
const char *__oldname; \
@ -80,7 +32,7 @@
* correct. \
*/ \
WT_ERR(WT_SESSION_CHECK_PANIC(s)); \
WT_SINGLE_THREAD_CHECK_START(s); \
__wt_single_thread_check_start(s); \
WT_TRACK_OP_INIT(s); \
if ((s)->api_call_counter == 1 && !F_ISSET(s, WT_SESSION_INTERNAL)) \
__wt_op_timer_start(s); \
@ -128,7 +80,7 @@
#define API_END(s, ret) \
if ((s) != NULL) { \
WT_TRACK_OP_END(s); \
WT_SINGLE_THREAD_CHECK_STOP(s); \
__wt_single_thread_check_stop(s); \
if ((ret) != 0 && __set_err) \
__wt_txn_err_set(s, (ret)); \
if ((s)->api_call_counter == 1) { \
@ -316,6 +268,19 @@
TXN_API_CALL(s, WT_SESSION, func_name, NULL, config, cfg); \
SESSION_API_PREPARE_CHECK(s, ret, WT_SESSION, func_name)
/*
 * CURSOR_API_CHECK_SYSTEM_OVERLOAD --
 *     Reject a cursor operation with WT_ROLLBACK when the connection reports read overload. The
 *     check only applies when API_USER_ENTRY(s) holds and the session has none of the internal,
 *     checkpoint or ignore-cache-size flags set. On rejection the read_reject_count statistic is
 *     incremented, "ret" is set and control jumps to the caller's "err" label, so the macro can
 *     only be used in functions that define that label.
 */
#define CURSOR_API_CHECK_SYSTEM_OVERLOAD(s, ret) \
    do { \
        if (API_USER_ENTRY(s) && \
          (!F_ISSET(s, \
            WT_SESSION_INTERNAL | WT_SESSION_CHECKPOINT | WT_SESSION_IGNORE_CACHE_SIZE)) && \
          __wt_conn_load_control_read_overload(s)) { \
            WT_STAT_CONN_INCR(s, read_reject_count); \
            (ret) = WT_ROLLBACK; \
            goto err; \
        } \
    } while (0)
#define CURSOR_API_CALL(cur, s, ret, func_name, dh) \
(s) = CUR2S(cur); \
API_CALL_NOCONF(s, WT_CURSOR, func_name, dh, true); \
@ -337,10 +302,10 @@
WT_ERR(__wt_cursor_cached(cur))
/*
* API_RETRYABLE and API_RETRYABLE_END are used to wrap API calls so that they are silently
* retried on rollback errors. Generally, these only need to be used with readonly APIs, as
* writable APIs have their own retry code via TXN_API_CALL. These macros may be used with
* *API_CALL and API_END* provided they are ordered in a balanced way.
* API_RETRYABLE and API_RETRYABLE_END are used to wrap API calls so that they are silently retried
* on rollback errors. Generally, these only need to be used with readonly APIs, as writable APIs
* have their own retry code via TXN_API_CALL. These macros may be used with *API_CALL and API_END*
* provided they are ordered in a balanced way.
*/
#define API_RETRYABLE(s) do {

View File

@ -2000,6 +2000,8 @@ static WT_INLINE bool __wt_conf_get_compiled(WT_CONNECTION_IMPL *conn, const cha
WT_CONF **confp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static WT_INLINE bool __wt_conf_is_compiled(WT_CONNECTION_IMPL *conn, const char *config)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static WT_INLINE bool __wt_conn_load_control_read_overload(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static WT_INLINE bool __wt_conn_load_control_write_overload(WT_SESSION_IMPL *session)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
static WT_INLINE bool __wt_cursor_has_cached_memory(WT_CURSOR *cursor)
@ -2580,6 +2582,8 @@ static WT_INLINE void __wt_row_leaf_value_set(WT_ROW *rip, WT_CELL_UNPACK_KV *un
static WT_INLINE void __wt_scr_free(WT_SESSION_IMPL *session, WT_ITEM **bufp);
static WT_INLINE void __wt_seconds(WT_SESSION_IMPL *session, uint64_t *secondsp);
static WT_INLINE void __wt_seconds32(WT_SESSION_IMPL *session, uint32_t *secondsp);
static WT_INLINE void __wt_single_thread_check_start(WT_SESSION_IMPL *s);
static WT_INLINE void __wt_single_thread_check_stop(WT_SESSION_IMPL *s);
static WT_INLINE void __wt_spin_backoff(uint64_t *yield_count, uint64_t *sleep_usecs);
static WT_INLINE void __wt_spin_destroy(WT_SESSION_IMPL *session, WT_SPINLOCK *t);
static WT_INLINE void __wt_spin_lock(WT_SESSION_IMPL *session, WT_SPINLOCK *t);

View File

@ -12,6 +12,23 @@
extern "C" {
#endif
/*
 * __wt_conn_load_control_read_overload --
 *     Return true when load control is enabled on the connection and the current read load has
 *     reached the control threshold; return false otherwise.
 */
static WT_INLINE bool
__wt_conn_load_control_read_overload(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_LOAD_CONTROL *lc;

    lc = &S2C(session)->load_control;

    /* With load control switched off the system is never reported as overloaded. */
    if (!F_ISSET(lc, WT_CONN_LOAD_CONTROL))
        return (false);

    /* The load is read with a relaxed atomic load; reaching the threshold counts as overload. */
    return (__wt_atomic_load_uint8_relaxed(&lc->read_load) >= lc->control_threshold);
}
/*
* __wt_conn_load_control_write_overload --
* check if the system is write overloaded.

View File

@ -292,13 +292,12 @@ struct __wt_session_impl {
#endif
#ifdef HAVE_UNITTEST_ASSERTS
/*
* Unit testing assertions requires overriding abort logic and instead capturing this information to
* be checked by the unit test.
*/
#define WT_SESSION_UNITTEST_BUF_LEN 100
/*
* Unit testing assertions requires overriding abort logic and instead capturing this
* information to be checked by the unit test.
*/
bool unittest_assert_hit;
char unittest_assert_msg[WT_SESSION_UNITTEST_BUF_LEN];
char unittest_assert_msg[WT_ERR_MSG_BUF_LEN];
#endif
/* AUTOMATIC FLAG VALUE GENERATION START 0 */

View File

@ -0,0 +1,66 @@
/*-
* Copyright (c) 2014-present MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
* See the file LICENSE for redistribution information.
*/
#pragma once
/*
* __wt_single_thread_check_start --
* Only a single thread should use this session at a time. It's ok (but unexpected) if different
* threads use the session consecutively, but concurrent access is not allowed. Verify this by
* having the thread take a lock on first API access. Failing to take the lock implies another
* thread holds it and we're attempting concurrent access of the session.
*
* The default session (ID == 0) is an exception where concurrent access is allowed. We can also
* skip taking the lock if we're re-entrant and already hold it.
*
* Every call must be paired with __wt_single_thread_check_stop, which undoes the entry count
* increment and releases the lock on the outermost exit.
*/
static WT_INLINE void
__wt_single_thread_check_start(WT_SESSION_IMPL *s)
{
/* Without HAVE_DIAGNOSTIC the check does nothing. */
#if !defined(HAVE_DIAGNOSTIC)
WT_UNUSED(s);
return;
#else
uintmax_t current_tid;
WT_DECL_RET;
__wt_thread_id(&current_tid);
/*
* Only try the lock for a non-default session whose recorded owner is not the calling thread;
* a re-entrant call by the owner falls straight through to the entry count.
*/
if (!WT_SESSION_IS_DEFAULT(s) && s->thread_check.owning_thread != current_tid) {
ret = __wt_spin_trylock(s, &s->thread_check.lock);
/*
* The asserted expression ("ret == 0"), the message layout and the "none" fallbacks used for
* NULL fields are matched verbatim by the single-thread-check unit test: keep them in sync.
*/
WT_ASSERT_ALWAYS(s, ret == 0,
"Session %" PRIu32
" is accessed concurrently by multiple threads: "
"current thread %" PRIuMAX ", owning thread %" PRIuMAX
" (active op: %s, last op: %s, api depth: %u, dhandle: %s)",
s->id, current_tid, s->thread_check.owning_thread, s->name != NULL ? s->name : "none",
s->lastop != NULL ? s->lastop : "none", s->api_call_counter,
s->dhandle != NULL ? s->dhandle->name : "none");
/* Record the new owner so nested calls from this thread skip the trylock. */
s->thread_check.owning_thread = current_tid;
}
/* Track nesting depth; the stop function releases ownership when this drops back to zero. */
++s->thread_check.entry_count;
#endif
}
/*
 * __wt_single_thread_check_stop --
 *     Leave one level of API nesting for this session. When the outermost level exits on a
 *     non-default session, clear the recorded owner and release the single-thread lock.
 */
static WT_INLINE void
__wt_single_thread_check_stop(WT_SESSION_IMPL *s)
{
#ifdef HAVE_DIAGNOSTIC
    /* Still nested inside an outer API call: the outermost exit performs the release. */
    if (--s->thread_check.entry_count != 0)
        return;

    /* The default session is exempt from the lock, so there is nothing to give back. */
    if (WT_SESSION_IS_DEFAULT(s))
        return;

    s->thread_check.owning_thread = 0;
    __wt_spin_unlock(s, &s->thread_check.lock);
#else
    WT_UNUSED(s);
    return;
#endif
}

View File

@ -1036,6 +1036,7 @@ struct __wt_connection_stats {
int64_t live_restore_hist_source_read_latency_gt1000;
int64_t live_restore_hist_source_read_latency_total_msecs;
int64_t live_restore_state;
int64_t read_reject_count;
int64_t write_reject_count;
int64_t read_load;
int64_t write_load;

File diff suppressed because it is too large Load Diff

View File

@ -671,6 +671,7 @@ typedef uint64_t wt_timestamp_t;
#include "timestamp_inline.h" /* required by btree_inline.h */
#include "cell_inline.h" /* required by btree_inline.h */
#include "mutex_inline.h" /* required by btree_inline.h */
#include "session_inline.h" /* required by api.h macros */
#include "txn_inline.h" /* required by btree_inline.h */
#include "bitstring_inline.h"

View File

@ -91,7 +91,7 @@ int
__wt_thread_str(char *buf, size_t buflen)
{
return (__wt_snprintf(buf, buflen, "%" PRIu64 ":%" PRIu64, (uint64_t)GetCurrentProcessId(),
(uint64_t)GetCurrentThreadId));
(uint64_t)GetCurrentThreadId()));
}
/*

View File

@ -463,6 +463,8 @@ __wt_prepared_discover_filter_apply_handles(WT_SESSION_IMPL *session)
const char *checkpoint_name, *uri, *config;
bool has_prepare;
checkpoint_name = NULL;
WT_RET(__wt_metadata_cursor(session, &cursor));
while ((ret = cursor->next(cursor)) == 0) {
@ -489,12 +491,14 @@ __wt_prepared_discover_filter_apply_handles(WT_SESSION_IMPL *session)
*/
WT_ERR(__wt_buf_fmt(session, stable_uri_buf, "%s/%s", uri, checkpoint_name));
uri = stable_uri_buf->data;
__wt_free(session, checkpoint_name);
}
WT_ERR(__prepared_discover_walk_one_tree(session, uri));
}
if (ret == WT_NOTFOUND)
ret = 0;
err:
__wt_free(session, checkpoint_name);
WT_TRET(__wt_metadata_cursor_release(session, &cursor));
__wt_scr_free(session, &stable_uri_buf);
return (ret);

View File

@ -2636,7 +2636,7 @@ __open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const
#ifdef HAVE_UNITTEST_ASSERTS
session_ret->unittest_assert_hit = false;
memset(session->unittest_assert_msg, 0, WT_SESSION_UNITTEST_BUF_LEN);
memset(session->unittest_assert_msg, 0, sizeof(session->unittest_assert_msg));
#endif
#ifdef HAVE_DIAGNOSTIC

View File

@ -2507,6 +2507,7 @@ static const char *const __stats_connection_desc[] = {
"live-restore: source read latency histogram (bucket 9) - 1000ms+",
"live-restore: source read latency histogram total (msecs)",
"live-restore: state",
"load-control: number of read operations rejected due to load control",
"load-control: number of write operations rejected due to load control",
"load-control: read load at the system level",
"load-control: write load at the system level",
@ -3566,6 +3567,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats)
stats->live_restore_hist_source_read_latency_gt1000 = 0;
stats->live_restore_hist_source_read_latency_total_msecs = 0;
/* not clearing live_restore_state */
stats->read_reject_count = 0;
stats->write_reject_count = 0;
stats->read_load = 0;
stats->write_load = 0;
@ -4759,6 +4761,7 @@ __wt_stat_connection_aggregate(WT_CONNECTION_STATS **from, WT_CONNECTION_STATS *
to->live_restore_hist_source_read_latency_total_msecs +=
WT_STAT_CONN_READ(from, live_restore_hist_source_read_latency_total_msecs);
to->live_restore_state += WT_STAT_CONN_READ(from, live_restore_state);
to->read_reject_count += WT_STAT_CONN_READ(from, read_reject_count);
to->write_reject_count += WT_STAT_CONN_READ(from, write_reject_count);
to->read_load += WT_STAT_CONN_READ(from, read_load);
to->write_load += WT_STAT_CONN_READ(from, write_load);

View File

@ -46,7 +46,7 @@ check_assertion_fired(WT_SESSION_IMPL *session)
if (ret == ASSERT_FIRED) {
// Clear the assertion flag and message for the next test step.
session->unittest_assert_hit = false;
memset(session->unittest_assert_msg, 0, WT_SESSION_UNITTEST_BUF_LEN);
memset(session->unittest_assert_msg, 0, sizeof(session->unittest_assert_msg));
}
return ret;

View File

@ -0,0 +1,50 @@
/*-
* Copyright (c) 2014-present MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
* See the file LICENSE for redistribution information.
*/
#include <catch2/catch.hpp>
#include "wt_internal.h"
#include "wrappers/mock_session.h"
/*
 * Checks that the concurrent-access assertion raised by __wt_single_thread_check_start still
 * produces a well-formed message when the session's name, last op and dhandle are all NULL.
 */
#if defined(HAVE_DIAGNOSTIC) && defined(HAVE_UNITTEST_ASSERTS)
TEST_CASE(
  "Single thread check: concurrent access with NULL fields produces valid assertion message",
  "[single_thread_check]")
{
    std::shared_ptr<mock_session> mock = mock_session::build_test_mock_session();
    WT_SESSION_IMPL *session = mock->get_wt_session_impl();
    session->id = 1;

    /* The calling thread's ID appears in the message, so fetch it for the expected text. */
    uintmax_t tid;
    __wt_thread_id(&tid);

    /*
     * The session's name, last op and dhandle are left NULL and owning_thread is 0, so every
     * optional field is expected to print as "none" and the owner as 0.
     */
    std::string expected = "WiredTiger assertion failed: 'ret == 0'. ";
    expected += "Session 1 is accessed concurrently by multiple threads: ";
    expected += "current thread " + std::to_string(tid);
    expected += ", owning thread 0 (active op: none, last op: none, api depth: 0, dhandle: none)";

    /*
     * Pretend another thread owns the session by holding its lock ourselves: the trylock inside
     * check_start then fails with EBUSY and trips the assertion. Real thread IDs are never 0, so
     * the recorded owner (0) cannot match the calling thread.
     */
    __wt_spin_lock(session, &session->thread_check.lock);
    __wt_single_thread_check_start(session);

    REQUIRE(session->unittest_assert_hit);
    REQUIRE(std::string(session->unittest_assert_msg) == expected);

    __wt_spin_unlock(session, &session->thread_check.lock);
}
#endif

View File

@ -78,7 +78,8 @@ typedef struct key_values {
*/
#define COPY_MESSAGE_CONTENT(dest, src) \
do { \
char *_dest, *_src, *_trailing; \
const char *_src; \
char *_dest, *_trailing; \
\
_src = strchr(src, ':') + 1; \
while (*_src == ' ') \
@ -310,7 +311,8 @@ static int
handle_wiredtiger_message(WT_EVENT_HANDLER *handler, WT_SESSION *session, const char *message)
{
CUSTOM_EVENT_HANDLER *custom;
char *output, *p;
const char *p;
char *output;
(void)session;

View File

@ -0,0 +1,168 @@
#!/usr/bin/env python3
#
# Public Domain 2014-present MongoDB, Inc.
# Public Domain 2008-2014 WiredTiger, Inc.
#
# This is free and unencumbered software released into the public domain.
#
# Anyone is free to copy, modify, publish, use, compile, sell, or
# distribute this software, either in source code form or as a compiled
# binary, for any purpose, commercial or non-commercial, and by any
# means.
#
# In jurisdictions that recognize copyright laws, the author or authors
# of this software dedicate any and all copyright interest in the
# software to the public domain. We make this dedication for the benefit
# of the public at large and to the detriment of our heirs and
# successors. We intend this dedication to be an overt act of
# relinquishment in perpetuity of all present and future rights to this
# software under copyright law.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
import wiredtiger, wttest
from helper_disagg import disagg_test_class
from wtscenario import make_scenarios
# test_layered_prepare02.py
# Forward iteration after a search() or search_near() that returns
# WT_PREPARE_CONFLICT must yield correct results on a layered cursor.
@disagg_test_class
class test_layered_prepare02(wttest.WiredTigerTestCase):
    '''
    Check that a layered cursor iterates correctly after a positioning call
    (search or search_near, chosen by scenario) fails with WT_PREPARE_CONFLICT.
    '''
    scenarios = make_scenarios([
        ('search', dict(use_search_near=False)),
        ('search_near', dict(use_search_near=True)),
    ])

    conn_base_config = 'precise_checkpoint=true,'
    conn_config = conn_base_config + 'disaggregated=(role="leader")'

    def _guard_prepare_conflict(self, op, discard_result=False):
        '''
        Run op() and translate a prepare conflict into a return code.

        Returns wiredtiger.WT_PREPARE_CONFLICT if op() raises a WiredTigerError
        whose text mentions WT_PREPARE_CONFLICT; any other exception propagates.
        On success returns op()'s own result, or 0 when discard_result is set.
        '''
        try:
            ret = op()
        except wiredtiger.WiredTigerError as e:
            if 'WT_PREPARE_CONFLICT' in str(e):
                return wiredtiger.WT_PREPARE_CONFLICT
            raise
        return 0 if discard_result else ret

    def safe_next(self, cursor):
        '''Call cursor.next(), returning its result or WT_PREPARE_CONFLICT.'''
        return self._guard_prepare_conflict(cursor.next)

    def safe_search(self, cursor, key):
        '''Search for key; return 0 on completion or WT_PREPARE_CONFLICT.'''
        # The key is set outside the guard so only the search itself is translated.
        cursor.set_key(key)
        return self._guard_prepare_conflict(cursor.search, discard_result=True)

    def safe_search_near(self, cursor, key):
        '''Search near key; return 0 on completion or WT_PREPARE_CONFLICT.'''
        cursor.set_key(key)
        return self._guard_prepare_conflict(cursor.search_near, discard_result=True)

    def conflict_search(self, cursor, key):
        '''Dispatch to search or search_near according to the scenario.'''
        if self.use_search_near:
            return self.safe_search_near(cursor, key)
        return self.safe_search(cursor, key)

    def _setup_follower(self, prepared_key):
        '''
        Open a follower with keys '1','2','3' checkpointed as stable and one
        prepared ingest update on prepared_key. Return (stable_keys,
        conn_follow, prep_session, iter_session, iter_cursor).
        '''
        uri = 'table:test_layered_prepare02'
        conn_follow = self.wiredtiger_open('follower', self.extensionsConfig() +
            ',create,' + self.conn_base_config + 'disaggregated=(role="follower")')

        stable_keys = ['1', '2', '3']
        self.session.create(uri, 'key_format=S,value_format=S,block_manager=disagg,type=layered')
        with self.transaction(session=self.session, commit_timestamp=100):
            c = self.session.open_cursor(uri)
            for k in stable_keys:
                c[k] = 'stable_' + k
            c.close()

        self.conn.set_timestamp(f'stable_timestamp={self.timestamp_str(200)}')
        self.session.checkpoint()
        self.disagg_advance_checkpoint(conn_follow)

        prep_session = conn_follow.open_session('')
        prep_cursor = prep_session.open_cursor(uri)
        prep_session.begin_transaction()
        prep_cursor[prepared_key] = 'prepared_update'
        prep_cursor.close()
        prep_session.prepare_transaction(
            f'prepare_timestamp={self.timestamp_str(300)}'
            + f',prepared_id={self.prepared_id_str(1)}')

        # Reader whose read_timestamp covers the prepare so it sees the conflict.
        iter_session = conn_follow.open_session('')
        iter_session.begin_transaction(f'read_timestamp={self.timestamp_str(400)}')
        iter_cursor = iter_session.open_cursor(uri)
        return stable_keys, conn_follow, prep_session, iter_session, iter_cursor

    def _verify_full_scan_and_close(self, stable_keys, conn_follow, prep_session,
                                    iter_session, cursor):
        '''
        Roll back the prepared transaction, check that next() walks every
        stable key from the beginning and ends with WT_NOTFOUND, then release
        the cursor, the reader transaction and the follower connection.
        '''
        # After rolling back the prepare, iteration must return all stable keys.
        prep_session.rollback_transaction()

        got = []
        ret = cursor.next()
        while ret == 0:
            got.append(cursor.get_key())
            ret = cursor.next()
        self.assertEqual(ret, wiredtiger.WT_NOTFOUND)
        self.assertEqual(got, stable_keys)

        cursor.close()
        iter_session.rollback_transaction()
        conn_follow.close()

    def test_next_after_prepare_conflict(self):
        # A search that succeeds followed by a search/search_near that returns
        # WT_PREPARE_CONFLICT must leave the cursor fully reset so that subsequent
        # next() calls iterate all keys from the beginning.
        stable_keys, conn_follow, prep_session, iter_session, cursor = \
            self._setup_follower('2')

        # Position cursor at '3', then trigger WT_PREPARE_CONFLICT on '2'.
        # The cursor must reset regardless of where it was previously positioned.
        self.assertEqual(self.safe_search(cursor, '3'), 0)
        self.assertEqual(self.conflict_search(cursor, '2'), wiredtiger.WT_PREPARE_CONFLICT)

        self._verify_full_scan_and_close(
            stable_keys, conn_follow, prep_session, iter_session, cursor)

    def test_next_after_conflicting_next_then_search(self):
        # A next() that returns WT_PREPARE_CONFLICT followed by a search/search_near
        # on the same prepared key must leave the cursor fully reset so that
        # subsequent next() calls iterate all keys from the beginning.
        stable_keys, conn_follow, prep_session, iter_session, cursor = \
            self._setup_follower('1')

        # next() conflicts on the first key; search/search_near on the same key
        # also conflicts. Both must leave the cursor in a clean state.
        self.assertEqual(self.safe_next(cursor), wiredtiger.WT_PREPARE_CONFLICT)
        self.assertEqual(self.conflict_search(cursor, '1'), wiredtiger.WT_PREPARE_CONFLICT)

        self._verify_full_scan_and_close(
            stable_keys, conn_follow, prep_session, iter_session, cursor)