Import wiredtiger: c629c898d0290037c0c8b3af653e6354f7424199 from branch mongodb-master (#48351)

Co-authored-by: wt-vendoring-bot <wt-vendoring-bot@mongodb.com>
GitOrigin-RevId: c19a68f7b4fb802242058eb753ac71b7043f81ad
This commit is contained in:
wt-vendoring-bot[bot] 2026-02-20 00:02:34 +00:00 committed by MongoDB Bot
parent 9234727de5
commit 4d4b95e18e
20 changed files with 468 additions and 185 deletions

View File

@ -1434,8 +1434,10 @@ wiredtiger_open_common =\
Enable automatic detection of scans by applications, and attempt to pre-fetch future
content into the cache''',
type='category', subconfig=[
Config('available', 'false', r'''
whether the thread pool for the pre-fetch functionality is started''',
Config('available', 'true', r'''
whether the thread pool for the pre-fetch functionality is started, this does not mean
that pre-fetch is enabled for sessions by default, see the \c default setting at the
connection level and the \c prefetch setting at the session level.''',
type='boolean'),
Config('default', 'false', r'''
whether pre-fetch is enabled for all sessions by default''',

View File

@ -40,7 +40,7 @@ def flag_declare(name):
print("Flag stop value of " + str(max_flags) \
+ " is larger than field size " + str(fld_size))
sys.exit(1)
if line.find('AUTOMATIC FLAG VALUE GENERATION START') != -1:
if re.search(r"/\*[A-Z ]*AUTOMATIC[A-Z ]+FLAG[A-Z ]+GENERATION START", line) != None:
m = re.search(r"\d+", line)
if m == None:
print(name + ": automatic flag generation start at line " +
@ -51,7 +51,7 @@ def flag_declare(name):
header = line
defines = []
parsing = True
elif line.find('AUTOMATIC FLAG VALUE GENERATION STOP') != -1:
elif re.search(r"/\*[A-Z ]*AUTOMATIC[A-Z ]+FLAG[A-Z ]+GENERATION STOP", line) != None:
m = re.search(r"\d+", line)
if m == None:
print(name + ": automatic flag generation stop at line " +

View File

@ -2,5 +2,5 @@
"vendor": "wiredtiger",
"github": "wiredtiger/wiredtiger",
"branch": "mongodb-master",
"commit": "57ba095dca3bf1d8628cb10d6cbf8dd3aeea3b4c"
"commit": "c629c898d0290037c0c8b3af653e6354f7424199"
}

View File

@ -643,6 +643,114 @@ err:
return (ret);
}
/*
* __wt_blkcache_compress --
* Optionally compress a buffer for writing.
*
* If compression is performed, the compressed buffer is returned via the output parameter. If
* compression is not performed (not configured, block too small, or compression didn't help),
* the output parameter is set to NULL. The caller is responsible for freeing the output buffer.
*/
int
__wt_blkcache_compress(WT_SESSION_IMPL *session, WT_ITEM *buf, bool already_compressed,
WT_ITEM **compressed_bufp, size_t *compressed_sizep, bool *compressedp)
{
WT_BM *bm;
WT_BTREE *btree;
WT_DECL_ITEM(ctmp);
WT_DECL_RET;
WT_PAGE_HEADER *dsk;
size_t compression_ratio, dst_len, len, result_len, size, src_len;
uint8_t *dst, *src;
int compression_failed; /* Extension API, so not a bool. */
btree = S2BT(session);
bm = btree->bm;
*compressed_bufp = NULL;
if (compressed_sizep != NULL)
*compressed_sizep = 0;
*compressedp = already_compressed;
/*
* Optionally stream-compress the data, but don't compress blocks that are already as small as
* they're going to get.
*/
if (btree->compressor == NULL || btree->compressor->compress == NULL || already_compressed)
return (0);
if (buf->size <= btree->allocsize) {
WT_STAT_DSRC_INCR(session, compress_write_too_small);
return (0);
}
/* Skip the header bytes of the source data. */
src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;
/*
* Compute the size needed for the destination buffer. We only allocate enough memory for a copy
* of the original by default, if any compressed version is bigger than the original, we won't
* use it. However, some compression engines (snappy is one example), may need more memory
* because they don't stop just because there's no more memory into which to compress.
*/
if (btree->compressor->pre_size == NULL)
len = src_len;
else
WT_RET(btree->compressor->pre_size(btree->compressor, &session->iface, src, src_len, &len));
size = len + WT_BLOCK_COMPRESS_SKIP;
WT_RET(bm->write_size(bm, session, &size));
WT_RET(__wt_scr_alloc(session, size, &ctmp));
/* Skip the header bytes of the destination data. */
dst = (uint8_t *)ctmp->mem + WT_BLOCK_COMPRESS_SKIP;
dst_len = len;
compression_failed = 0;
WT_ERR(btree->compressor->compress(btree->compressor, &session->iface, src, src_len, dst,
dst_len, &result_len, &compression_failed));
result_len += WT_BLOCK_COMPRESS_SKIP;
/*
* If compression fails, or doesn't gain us at least one unit of allocation, fallback to the
* original version. This isn't unexpected: if compression doesn't work for some chunk of data
* for some reason (noting likely additional format/header information which compressed output
* requires), it just means the uncompressed version is as good as it gets, and that's what we
* use.
*/
if (compression_failed || buf->size / btree->allocsize <= result_len / btree->allocsize) {
__wt_scr_free(session, &ctmp);
WT_STAT_DSRC_INCR(session, compress_write_fail);
return (0);
}
*compressedp = true;
WT_STAT_DSRC_INCR(session, compress_write);
compression_ratio = src_len / (result_len - WT_BLOCK_COMPRESS_SKIP);
__wt_stat_compr_ratio_write_hist_incr(session, compression_ratio);
/* Copy in the skipped header bytes and set the final data size. */
memcpy(ctmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
ctmp->size = result_len;
/* Set the disk header flags. */
dsk = ctmp->mem;
F_SET(dsk, WT_PAGE_COMPRESSED);
/* Return the compressed buffer and optionally the compressed size. */
*compressed_bufp = ctmp;
if (compressed_sizep != NULL)
*compressed_sizep = result_len;
return (0);
err:
__wt_scr_free(session, &ctmp);
return (ret);
}
/*
* __wt_blkcache_write --
* Write a buffer into a block, returning the block's address cookie.
@ -655,100 +763,27 @@ __wt_blkcache_write(WT_SESSION_IMPL *session, WT_ITEM *buf, WT_PAGE_BLOCK_META *
WT_BLKCACHE *blkcache;
WT_BM *bm;
WT_BTREE *btree;
WT_DECL_ITEM(ctmp);
WT_DECL_ITEM(etmp);
WT_DECL_RET;
WT_ITEM *ip;
WT_ITEM *ctmp, *ip;
WT_KEYED_ENCRYPTOR *kencryptor;
WT_PAGE_HEADER *dsk;
size_t compression_ratio, dst_len, len, result_len, size, src_len;
size_t size;
uint64_t time_diff, time_start, time_stop;
uint32_t delta_count, mem_size;
uint8_t *dst, *src;
int compression_failed; /* Extension API, so not a bool. */
bool data_checksum, encrypted, timer;
if (compressed_sizep != NULL)
*compressed_sizep = 0;
blkcache = &S2C(session)->blkcache;
btree = S2BT(session);
bm = btree->bm;
ctmp = NULL;
delta_count = (block_meta == NULL) ? 0 : block_meta->delta_count;
dsk = NULL;
encrypted = false;
/*
* Optionally stream-compress the data, but don't compress blocks that are already as small as
* they're going to get.
*/
if (btree->compressor == NULL || btree->compressor->compress == NULL || compressed)
ip = buf;
else if (buf->size <= btree->allocsize) {
ip = buf;
WT_STAT_DSRC_INCR(session, compress_write_too_small);
} else {
/* Skip the header bytes of the source data. */
src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;
/*
* Compute the size needed for the destination buffer. We only allocate enough memory for a
* copy of the original by default, if any compressed version is bigger than the original,
* we won't use it. However, some compression engines (snappy is one example), may need more
* memory because they don't stop just because there's no more memory into which to
* compress.
*/
if (btree->compressor->pre_size == NULL)
len = src_len;
else
WT_ERR(
btree->compressor->pre_size(btree->compressor, &session->iface, src, src_len, &len));
size = len + WT_BLOCK_COMPRESS_SKIP;
WT_ERR(bm->write_size(bm, session, &size));
WT_ERR(__wt_scr_alloc(session, size, &ctmp));
/* Skip the header bytes of the destination data. */
dst = (uint8_t *)ctmp->mem + WT_BLOCK_COMPRESS_SKIP;
dst_len = len;
compression_failed = 0;
WT_ERR(btree->compressor->compress(btree->compressor, &session->iface, src, src_len, dst,
dst_len, &result_len, &compression_failed));
result_len += WT_BLOCK_COMPRESS_SKIP;
/*
* If compression fails, or doesn't gain us at least one unit of allocation, fallback to the
* original version. This isn't unexpected: if compression doesn't work for some chunk of
* data for some reason (noting likely additional format/header information which compressed
* output requires), it just means the uncompressed version is as good as it gets, and
* that's what we use.
*/
if (compression_failed || buf->size / btree->allocsize <= result_len / btree->allocsize) {
ip = buf;
WT_STAT_DSRC_INCR(session, compress_write_fail);
} else {
compressed = true;
WT_STAT_DSRC_INCR(session, compress_write);
compression_ratio = src_len / (result_len - WT_BLOCK_COMPRESS_SKIP);
__wt_stat_compr_ratio_write_hist_incr(session, compression_ratio);
/* Copy in the skipped header bytes and set the final data size. */
memcpy(ctmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
ctmp->size = result_len;
ip = ctmp;
/* Set the disk header flags. */
dsk = ip->mem;
F_SET(dsk, WT_PAGE_COMPRESSED);
/* Optionally return the compressed size. */
if (compressed_sizep != NULL)
*compressed_sizep = result_len;
}
}
/* Optionally compress the data. */
WT_ERR(__wt_blkcache_compress(session, buf, compressed, &ctmp, compressed_sizep, &compressed));
ip = (ctmp != NULL) ? ctmp : buf;
/*
* Optionally encrypt the data. We need to add in the original length, in case both compression

View File

@ -195,11 +195,15 @@ __block_disagg_read_multiple(WT_SESSION_IMPL *session, WT_BLOCK_DISAGG *block_di
blk = WT_BLOCK_HEADER_REF(current->data);
__wti_block_disagg_header_byteswap_copy(blk, &swap);
if (swap.checksum == checksum) {
/*
* TODO(WT-16511): When we have the original checksum stored in the page, we should check
* that instead of skipping the check entirely for cached pages.
*/
if (F_ISSET(&swap, WT_BLOCK_DISAGG_MODIFIED) || swap.checksum == checksum) {
blk->checksum = 0;
if (__wt_checksum_match(current->data,
F_ISSET(&swap, WT_BLOCK_DATA_CKSUM) ? size : WT_MIN(size, WT_BLOCK_COMPRESS_SKIP),
checksum)) {
swap.checksum)) {
expected_magic =
(is_delta ? WT_BLOCK_DISAGG_MAGIC_DELTA : WT_BLOCK_DISAGG_MAGIC_BASE);
if (swap.magic != expected_magic) {
@ -218,27 +222,33 @@ __block_disagg_read_multiple(WT_SESSION_IMPL *session, WT_BLOCK_DISAGG *block_di
}
if (result == last) {
WT_ASSERT(session, get_args.lsn > 0);
WT_ASSERT(session,
(*results_count > 1) == FLD_ISSET(flags, WT_BLOCK_DISAGG_ADDR_FLAG_DELTA));
/* The server is allowed to set base LSN to 0 for full page images. */
WT_ASSERT(session,
(get_args.base_lsn == 0 && *results_count == 1) ||
get_args.base_lsn == base_lsn);
/* Set the other metadata returned by the Page Service. */
block_meta->page_id = page_id;
block_meta->backlink_lsn = get_args.backlink_lsn;
block_meta->base_lsn = get_args.base_lsn;
block_meta->disagg_lsn = get_args.lsn;
block_meta->delta_count = (uint8_t)(*results_count - 1);
block_meta->delta_count = F_ISSET(&swap, WT_BLOCK_DISAGG_MODIFIED) ?
(uint8_t)get_args.delta_count :
(uint8_t)(*results_count - 1);
block_meta->checksum = checksum;
if (block_meta->delta_count > 0)
WT_ASSERT(session, get_args.base_lsn > 0);
else
WT_ASSERT(
session, get_args.base_lsn == 0 && get_args.base_checkpoint_id == 0);
WT_ASSERT(session, get_args.lsn > 0);
if (!F_ISSET(&swap, WT_BLOCK_DISAGG_MODIFIED)) {
WT_ASSERT(session,
(*results_count > 1) ==
FLD_ISSET(flags, WT_BLOCK_DISAGG_ADDR_FLAG_DELTA));
/* The server is allowed to set base LSN to 0 for full page images. */
WT_ASSERT(session,
(get_args.base_lsn == 0 && *results_count == 1) ||
get_args.base_lsn == base_lsn);
if (block_meta->delta_count > 0)
WT_ASSERT(session, get_args.base_lsn > 0);
else
WT_ASSERT(
session, get_args.base_lsn == 0 && get_args.base_checkpoint_id == 0);
}
}
/*

View File

@ -1774,31 +1774,30 @@ __split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_MULTI *mult
WT_UPDATE *upd;
uint32_t i, slot;
if (!F_ISSET(S2BT(session), WT_BTREE_IN_MEMORY))
/* Append the onpage values back to the original update chains. */
for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) {
/*
* We don't need to do anything for update chains that are not restored, or restored
* without an onpage value.
*/
if (!supd->restore || supd->onpage_upd == NULL)
continue;
/* Append the onpage values back to the original update chains. */
for (i = 0, supd = multi->supd; i < multi->supd_entries; ++i, ++supd) {
/*
* We don't need to do anything for update chains that are not restored, or restored without
* an onpage value.
*/
if (!supd->restore || supd->onpage_upd == NULL)
continue;
if (supd->free_upds == NULL)
continue;
if (supd->free_upds == NULL)
continue;
if (supd->ins == NULL) {
/* Note: supd->ins is never null for column-store. */
slot = WT_ROW_SLOT(orig, supd->rip);
upd = orig->modify->mod_row_update[slot];
} else
upd = supd->ins->upd;
if (supd->ins == NULL) {
/* Note: supd->ins is never null for column-store. */
slot = WT_ROW_SLOT(orig, supd->rip);
upd = orig->modify->mod_row_update[slot];
} else
upd = supd->ins->upd;
WT_ASSERT(session, upd != NULL);
for (; upd->next != NULL; upd = upd->next)
;
upd->next = supd->free_upds;
}
WT_ASSERT(session, upd != NULL);
for (; upd->next != NULL; upd = upd->next)
;
upd->next = supd->free_upds;
}
/*
* We failed creating new in-memory pages. For error-handling reasons, we've left the update

View File

@ -4436,8 +4436,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"operation_tracking=(enabled=false,path=\".\"),"
"page_delta=(delta_pct=20,internal_page_delta=true,"
"leaf_page_delta=true,max_consecutive_delta=32),"
"precise_checkpoint=false,prefetch=(available=false,"
"default=false),preserve_prepared=false,readonly=false,"
"precise_checkpoint=false,prefetch=(available=true,default=false)"
",preserve_prepared=false,readonly=false,"
"rollback_to_stable=(threads=4),salvage=false,session_max=100,"
"session_scratch_max=2MB,session_table_cache=true,"
"shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
@ -4505,8 +4505,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"operation_tracking=(enabled=false,path=\".\"),"
"page_delta=(delta_pct=20,internal_page_delta=true,"
"leaf_page_delta=true,max_consecutive_delta=32),"
"precise_checkpoint=false,prefetch=(available=false,"
"default=false),preserve_prepared=false,readonly=false,"
"precise_checkpoint=false,prefetch=(available=true,default=false)"
",preserve_prepared=false,readonly=false,"
"rollback_to_stable=(threads=4),salvage=false,session_max=100,"
"session_scratch_max=2MB,session_table_cache=true,"
"shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
@ -4574,8 +4574,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"operation_timeout_ms=0,operation_tracking=(enabled=false,"
"path=\".\"),page_delta=(delta_pct=20,internal_page_delta=true,"
"leaf_page_delta=true,max_consecutive_delta=32),"
"precise_checkpoint=false,prefetch=(available=false,"
"default=false),preserve_prepared=false,readonly=false,"
"precise_checkpoint=false,prefetch=(available=true,default=false)"
",preserve_prepared=false,readonly=false,"
"rollback_to_stable=(threads=4),salvage=false,session_max=100,"
"session_scratch_max=2MB,session_table_cache=true,"
"shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"
@ -4642,8 +4642,8 @@ static const WT_CONFIG_ENTRY config_entries[] = {
"operation_timeout_ms=0,operation_tracking=(enabled=false,"
"path=\".\"),page_delta=(delta_pct=20,internal_page_delta=true,"
"leaf_page_delta=true,max_consecutive_delta=32),"
"precise_checkpoint=false,prefetch=(available=false,"
"default=false),preserve_prepared=false,readonly=false,"
"precise_checkpoint=false,prefetch=(available=true,default=false)"
",preserve_prepared=false,readonly=false,"
"rollback_to_stable=(threads=4),salvage=false,session_max=100,"
"session_scratch_max=2MB,session_table_cache=true,"
"shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB),"

View File

@ -3346,11 +3346,17 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, const char *c
WT_ERR(__wt_config_gets(session, cfg, "prefetch.available", &cval));
conn->prefetch_available = cval.val != 0;
if (F_ISSET(conn, WT_CONN_IN_MEMORY) && conn->prefetch_available)
WT_ERR_MSG(
session, EINVAL, "prefetch configuration is incompatible with in-memory configuration");
WT_ERR(__wt_config_gets(session, cfg, "prefetch.default", &cval));
conn->prefetch_auto_on = cval.val != 0;
if (F_ISSET(conn, WT_CONN_IN_MEMORY) && (conn->prefetch_available || conn->prefetch_auto_on)) {
__wt_verbose(session, WT_VERB_PREFETCH, "%s",
"prefetch configuration is incompatible with in-memory configuration");
WT_CONFIG_DEBUG(session, "%s", "setting prefetch.available and prefetch.default to false");
conn->prefetch_auto_on = false;
conn->prefetch_available = false;
}
if (conn->prefetch_auto_on && !conn->prefetch_available)
WT_ERR_MSG(session, EINVAL,
"pre-fetching cannot be enabled if pre-fetching is configured as unavailable");

View File

@ -8,10 +8,6 @@
#include "wt_internal.h"
#define WT_CONFIG_DEBUG(session, fmt, ...) \
if (FLD_ISSET(S2C(session)->debug_flags, WT_CONN_DEBUG_CONFIGURATION)) \
__wt_verbose_warning(session, WT_VERB_CONFIGURATION, fmt, __VA_ARGS__);
/*
* __evict_config_abs_to_pct --
* Evict configuration values can be either a percentage or an absolute size, this function

View File

@ -1905,7 +1905,13 @@ retry:
continue;
}
if (F_ISSET(btree, WT_BTREE_IN_MEMORY) && !F_ISSET(evict, WT_EVICT_CACHE_DIRTY)) {
/*
* For in-memory btrees, if we are not evicting dirty pages or pages with active updates,
* walking them serves no purpose. Such pages are not eligible for clean eviction, making
* the operation unnecessary.
*/
if (F_ISSET(btree, WT_BTREE_IN_MEMORY) &&
!F_ISSET(evict, WT_EVICT_CACHE_DIRTY | WT_EVICT_CACHE_UPDATES)) {
__evict_disagg_btree_skip_count(session, btree);
continue;
}
@ -2303,8 +2309,8 @@ __evict_get_min_pages(WT_SESSION_IMPL *session, uint32_t target_pages)
/*
* __evict_try_restore_walk_position --
* Try to restore the walk position from saved soft pos. Returns true if the walk position is
* restored.
* Try to restore the eviction walk position from saved soft pos. If we can't restore a saved
* position, clear the eviction walk position instead.
*/
static WT_INLINE int
__evict_try_restore_walk_position(WT_SESSION_IMPL *session, WT_BTREE *btree, uint32_t walk_flags)
@ -2557,8 +2563,7 @@ __evict_try_queue_page(WT_SESSION_IMPL *session, WTI_EVICT_QUEUE *queue, WT_REF
want_page =
(F_ISSET(evict, WT_EVICT_CACHE_CLEAN) && !F_ISSET(btree, WT_BTREE_IN_MEMORY) && !modified) ||
(F_ISSET(evict, WT_EVICT_CACHE_DIRTY) && modified) ||
(F_ISSET(evict, WT_EVICT_CACHE_UPDATES) && !F_ISSET(btree, WT_BTREE_IN_MEMORY) &&
page->modify != NULL);
(F_ISSET(evict, WT_EVICT_CACHE_UPDATES) && page->modify != NULL);
if (!want_page) {
WT_STAT_CONN_INCR(session, eviction_server_skip_unwanted_pages);
return;

View File

@ -50,6 +50,174 @@ __evict_exclusive(WT_SESSION_IMPL *session, WT_REF *ref)
#define WT_EVICT_STATS_SUCCESS 0x04
#define WT_EVICT_STATS_URGENT 0x08
/*
* Victim Cache Overview
* ---------------------
* The Victim Cache is an LRU cache designed to avoid data duplication with WiredTiger's
* in-memory page cache. Unlike a traditional transparent read-write cache, data enters the
* Victim Cache only when pages are evicted from WiredTiger's cache. Conversely, when a page
* is read into or written from WiredTiger's cache, it is removed from the Victim Cache, since
* it is already present in the main cache.
*
* WiredTiger only evicts clean pages from memory. If a page has unwritten data (dirty),
* it must first be reconciled to produce a clean version before it can be evicted.
*
* Only leaf pages are cached; internal pages are not stored in the Victim Cache.
*
* Pages are not cached during shutdown, since they will not be needed again.
*
* Pages are compressed before being stored in the Victim Cache to reduce memory usage,
* though this incurs CPU cost.
*
* Implementation Note:
* A page's in-memory representation may differ from its on-disk format. To handle this, we
* store additional metadata alongside each cached page:
* - The original delta length.
* - A flag indicating whether to skip address cookie checksum validation when the page is
* retrieved from the cache (since the checksum may no longer match the in-memory state).
*/
/*
* __evict_page_victim_cache --
* Check eligibility and put page in victim cache if applicable.
*/
static void
__evict_page_victim_cache(WT_SESSION_IMPL *session, WT_REF *ref)
{
if (!F_ISSET(S2BT(session), WT_BTREE_DISAGGREGATED))
return;
WT_BM *bm = S2BT(session)->bm;
if (bm == NULL)
return;
WT_BLOCK_DISAGG *block_disagg = (WT_BLOCK_DISAGG *)bm->block;
if (block_disagg == NULL)
return;
WT_PAGE_LOG_HANDLE *plh = block_disagg->plhandle;
if (plh == NULL)
return;
if (plh->plh_cache_put == NULL || plh->plh_cache_available == NULL ||
!plh->plh_cache_available(plh, &session->iface))
return;
WT_PAGE *page = ref->page;
/* Only cache clean pages without modify. */
if (__wt_page_is_modified(page))
return;
/* Must be a leaf page with disagg info and disk image. */
if (!F_ISSET(ref, WT_REF_FLAG_LEAF) || page->disagg_info == NULL || page->dsk == NULL)
return;
if (page->disagg_info->block_meta.page_id == WT_BLOCK_INVALID_PAGE_ID)
return;
/* Cannot cache root pages. */
if (__wt_ref_is_root(ref))
return;
/*
* Victim cache: store evicted pages in disagg cache. The format must match what disagg read
* path expects: WT_PAGE_HEADER + WT_BLOCK_DISAGG_HEADER + data
*/
WT_ITEM buf_orig = {
.data = page->dsk,
.size = page->dsk->mem_size,
.mem = (void *)page->dsk,
.memsize = page->dsk->mem_size,
.flags = 0,
};
WT_ITEM *cache_buf = &buf_orig;
WT_ITEM *compressed_buf = NULL;
WT_PAGE_HEADER *dsk;
bool compressed = false;
bool data_checksum = true;
/* Optionally compress the data before caching. */
WT_IGNORE_RET(
__wt_blkcache_compress(session, &buf_orig, false, &compressed_buf, NULL, &compressed));
if (compressed_buf != NULL)
cache_buf = compressed_buf;
/* Point dsk to the cache buffer's page header. */
dsk = (WT_PAGE_HEADER *)cache_buf->mem;
/*
* Determine if full data checksum is needed based on btree config. This follows the same logic
* as __wt_blkcache_write.
*/
switch (S2BT(session)->checksum) {
case CKSUM_ON:
break;
case CKSUM_OFF:
data_checksum = false;
break;
case CKSUM_UNCOMPRESSED:
data_checksum = !compressed;
break;
case CKSUM_UNENCRYPTED:
/* Not encrypted in this path. */
break;
}
/*
* Fill in the disagg block header following the pattern from
* __wti_block_disagg_write_internal. The disagg block header
* is at WT_BLOCK_HEADER_REF (after the page header).
*/
WT_BLOCK_DISAGG_HEADER *blk = WT_BLOCK_HEADER_REF(cache_buf->data);
memset(blk, 0, sizeof(*blk));
/* Set disagg header fields. */
blk->magic = WT_BLOCK_DISAGG_MAGIC_BASE;
blk->version = WT_BLOCK_DISAGG_VERSION;
blk->compatible_version = WT_BLOCK_DISAGG_COMPATIBLE_VERSION;
blk->header_size = WT_BLOCK_DISAGG_HEADER_BYTE_SIZE;
blk->previous_checksum = page->disagg_info->block_meta.checksum;
blk->flags = 0;
if (data_checksum)
F_SET(blk, WT_BLOCK_DISAGG_DATA_CKSUM);
if (compressed)
F_SET(blk, WT_BLOCK_DISAGG_COMPRESSED);
/* Mark as cached so read path skips address cookie checksum match. */
F_SET(blk, WT_BLOCK_DISAGG_MODIFIED);
/* Not encrypted in this path. */
/* Calculate checksum following __wti_block_disagg_write_internal. */
blk->checksum = 0;
blk->checksum = __wt_checksum(cache_buf->data,
data_checksum ? cache_buf->size : WT_MIN(cache_buf->size, WT_BLOCK_COMPRESS_SKIP));
/*
* Swap page header to little-endian for on-disk format.
*/
__wt_page_header_byteswap(dsk);
WT_PAGE_LOG_PUT_ARGS args = {
.backlink_lsn = page->disagg_info->block_meta.backlink_lsn,
.base_lsn = page->disagg_info->block_meta.base_lsn,
.backlink_checkpoint_id = 0,
.base_checkpoint_id = 0,
.delta_count = page->disagg_info->block_meta.delta_count,
.image_size = page->dsk->mem_size,
.flags = compressed ? WT_PAGE_LOG_COMPRESSED : 0,
.lsn = page->disagg_info->block_meta.disagg_lsn,
};
WT_IGNORE_RET(plh->plh_cache_put(
plh, &session->iface, page->disagg_info->block_meta.page_id, 0, &args, cache_buf));
if (compressed_buf != NULL)
__wt_scr_free(session, &compressed_buf);
else
/* Swap page header back to native order. */
__wt_page_header_byteswap(dsk);
}
/*
* __evict_stats_update --
* Update the stats of eviction.
@ -399,7 +567,10 @@ static int
__evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
{
WT_DECL_RET;
bool instantiated;
bool closing, instantiated, tree_dead;
closing = FLD_ISSET(flags, WT_EVICT_CALL_CLOSING);
tree_dead = F_ISSET(session->dhandle, WT_DHANDLE_DEAD);
/*
* We might discard an instantiated deleted page, because instantiated pages are not marked
@ -412,6 +583,10 @@ __evict_page_clean_update(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags)
instantiated = false;
}
if (!instantiated && !tree_dead && !F_ISSET(S2C(session), WT_CONN_IN_MEMORY) &&
!F_ISSET(S2BT(session), WT_BTREE_IN_MEMORY) && !closing)
__evict_page_victim_cache(session, ref);
/*
* Discard the page and update the reference structure. A leaf page without a disk address is a
* deleted page that either was created empty and never written out, or had its on-disk page
@ -847,8 +1022,8 @@ __evict_review(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags, bool
modified = __wt_page_is_modified(page);
/*
* Clean pages can't be evicted when running in memory only. This should be uncommon - we don't
* add clean pages to the queue.
* Clean pages can't be evicted from in memory btrees. This should be uncommon - we don't add
* clean pages to the queue.
*/
if (F_ISSET(btree, WT_BTREE_IN_MEMORY) && !modified && !closing)
return (__wt_set_return(session, EBUSY));
@ -955,7 +1130,7 @@ __evict_reconcile(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t evict_flags)
*/
else if (F_ISSET(ref, WT_REF_FLAG_INTERNAL) || WT_IS_HS(btree->dhandle))
;
/* Always do update restore for in-memory database. */
/* Always do update restore for in-memory btrees. */
else if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
LF_SET(WT_REC_IN_MEMORY | WT_REC_SCRUB);
/* For data store leaf pages, write the history to history store except for metadata. */

View File

@ -518,6 +518,7 @@ struct __wt_block_disagg_header {
#define WT_BLOCK_DISAGG_DATA_CKSUM 0x1u /* Block data is part of the checksum */
#define WT_BLOCK_DISAGG_ENCRYPTED 0x2u /* Data following header is encrypted */
#define WT_BLOCK_DISAGG_COMPRESSED 0x4u /* Data following header is compressed */
#define WT_BLOCK_DISAGG_MODIFIED 0x08u /* The page is modified "offline" */
uint8_t flags; /* 12: flags */
/*

View File

@ -116,6 +116,9 @@ extern int __wt_backup_set_blkincr(WT_SESSION_IMPL *session, uint64_t i, uint64_
const char *id, uint64_t id_len) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_bad_object_type(WT_SESSION_IMPL *session, const char *uri)
WT_GCC_FUNC_DECL_ATTRIBUTE((cold)) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_blkcache_compress(WT_SESSION_IMPL *session, WT_ITEM *buf, bool already_compressed,
WT_ITEM **compressed_bufp, size_t *compressed_sizep, bool *compressedp)
WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_blkcache_get_handle(WT_SESSION_IMPL *session, WT_BM *bm, uint32_t objectid,
bool reading, WT_BLOCK **blockp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result));
extern int __wt_blkcache_open(WT_SESSION_IMPL *session, const char *uri, const char *cfg[],

View File

@ -314,3 +314,14 @@ struct __wt_verbose_multi_category {
} \
} \
} while (0)
/*
* WT_CONFIG_DEBUG --
* Emit a verbose warning message when debug_mode.configuration is enabled. This macro is used
* to warn users when configuration values are adjusted automatically to valid/safe values.
*/
#define WT_CONFIG_DEBUG(session, fmt, ...) \
do { \
if (FLD_ISSET(S2C(session)->debug_flags, WT_CONN_DEBUG_CONFIGURATION)) \
__wt_verbose_warning(session, WT_VERB_CONFIGURATION, fmt, __VA_ARGS__); \
} while (0)

View File

@ -3443,11 +3443,13 @@ struct __wt_connection {
* pre-fetch future content into the cache., a set of related configuration options defined as
* follows.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;available, whether the thread pool for the pre-fetch
* functionality is started., a boolean flag; default \c false.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;
* default, whether pre-fetch is enabled for all sessions by default., a boolean flag; default \c
* false.}
* @config{ ),,}
* functionality is started\, this does not mean that pre-fetch is enabled for sessions by default\,
* see the \c default setting at the connection level and the \c prefetch setting at the session
* level., a boolean flag; default \c true.}
* @config{&nbsp;&nbsp;&nbsp;&nbsp;default, whether
* pre-fetch is enabled for all sessions by default., a boolean flag; default \c false.}
* @config{
* ),,}
* @config{preserve_prepared, open connection in preserve prepare mode. All the prepared
* transactions that are not yet committed or rolled back will be preserved in the database. This
* is useful for applications that want to preserve prepared transactions across restarts., a
@ -5866,6 +5868,11 @@ struct __wt_page_log_put_args {
* Output arguments, returned by the call
*/
uint64_t lsn;
/*
* Arguments used by the block cache
*/
uint64_t delta_count;
};
/*!
@ -5889,6 +5896,11 @@ struct __wt_page_log_get_args {
uint64_t base_lsn;
uint64_t backlink_checkpoint_id;
uint64_t base_checkpoint_id;
/*
* Arguments used by the block cache
*/
uint64_t delta_count;
};
/*!
@ -6007,6 +6019,30 @@ struct __wt_page_log_handle {
* @param session the current WiredTiger session
*/
int (*plh_close)(WT_PAGE_LOG_HANDLE *plh, WT_SESSION *session);
/*!
* Put an object into cache.
*/
int (*plh_cache_put)(WT_PAGE_LOG_HANDLE *plh, WT_SESSION *session,
uint64_t page_id, uint64_t checkpoint_id, WT_PAGE_LOG_PUT_ARGS *args,
const WT_ITEM *buf);
/*!
* Query object in cache.
*/
int (*plh_cache_has)(WT_PAGE_LOG_HANDLE *plh, WT_SESSION *session,
uint64_t page_id, uint64_t checkpoint_id, WT_PAGE_LOG_PUT_ARGS *args);
/*!
* Delete object from cache.
*/
int (*plh_cache_del)(WT_PAGE_LOG_HANDLE *plh, WT_SESSION *session,
uint64_t page_id, uint64_t checkpoint_id, WT_PAGE_LOG_PUT_ARGS *args);
/*!
* Query if block cache is available.
*/
bool (*plh_cache_available)(WT_PAGE_LOG_HANDLE *plh, WT_SESSION *session);
};
#endif

View File

@ -1452,7 +1452,7 @@ __wti_rec_upd_select(WT_SESSION_IMPL *session, WTI_RECONCILE *r, WT_INSERT *ins,
}
if (F_ISSET(S2BT(session), WT_BTREE_IN_MEMORY)) {
/* Never write prepared updates for in-memory btree */
/* Never write prepared updates for in-memory btrees. */
write_prepare = false;
WT_RET(__rec_upd_select_inmem(session, r, vpack, first_upd, upd_select, &first_txn_upd,
&has_newer_updates, &upd_memsize));

View File

@ -82,30 +82,29 @@ __wt_session_dump(WT_SESSION_IMPL *session, WT_SESSION_IMPL *dump_session, bool
session, "Session: ID: %" PRIu32 " @: 0x%p", dump_session->id, (void *)dump_session));
WT_ERR(
__wt_msg(session, " Name: %s", dump_session->name == NULL ? "EMPTY" : dump_session->name));
if (!show_cursors) {
WT_ERR(__wt_msg(session, " Last operation: %s",
dump_session->lastop == NULL ? "NONE" : dump_session->lastop));
WT_ERR(__wt_msg(session, " Current dhandle: %s",
dump_session->dhandle == NULL ? "NONE" : dump_session->dhandle->name));
WT_ERR(__wt_msg(
session, " Backup in progress: %s", dump_session->bkp_cursor == NULL ? "no" : "yes"));
WT_ERR(__wt_msg(session, " Compact state: %s",
dump_session->compact_state == WT_COMPACT_NONE ?
"none" :
(dump_session->compact_state == WT_COMPACT_RUNNING ? "running" : "success")));
WT_ERR(__wt_msg(session, " Flags: 0x%" PRIx32, dump_session->flags));
WT_ERR(__wt_msg(session, " Isolation level: %s",
dump_session->isolation == WT_ISO_READ_COMMITTED ?
"read-committed" :
(dump_session->isolation == WT_ISO_READ_UNCOMMITTED ? "read-uncommitted" :
"snapshot")));
WT_ERR(__wt_msg(session, " last saved error code: %d", dump_session->err_info.err));
WT_ERR(__wt_msg(
session, " last saved sub-level error code: %d", dump_session->err_info.sub_level_err));
WT_ERR(__wt_msg(session, " last saved error message: %s", dump_session->err_info.err_msg));
WT_ERR(__wt_msg(session, " Transaction:"));
WT_ERR(__wt_verbose_dump_txn_one(session, dump_session, 0, NULL));
} else {
WT_ERR(__wt_msg(session, " Last operation: %s",
dump_session->lastop == NULL ? "NONE" : dump_session->lastop));
WT_ERR(__wt_msg(session, " Current dhandle: %s",
dump_session->dhandle == NULL ? "NONE" : dump_session->dhandle->name));
WT_ERR(__wt_msg(
session, " Backup in progress: %s", dump_session->bkp_cursor == NULL ? "no" : "yes"));
WT_ERR(__wt_msg(session, " Compact state: %s",
dump_session->compact_state == WT_COMPACT_NONE ?
"none" :
(dump_session->compact_state == WT_COMPACT_RUNNING ? "running" : "success")));
WT_ERR(__wt_msg(session, " Flags: 0x%" PRIx32, dump_session->flags));
WT_ERR(__wt_msg(session, " Isolation level: %s",
dump_session->isolation == WT_ISO_READ_COMMITTED ?
"read-committed" :
(dump_session->isolation == WT_ISO_READ_UNCOMMITTED ? "read-uncommitted" : "snapshot")));
WT_ERR(__wt_msg(session, " last saved error code: %d", dump_session->err_info.err));
WT_ERR(__wt_msg(
session, " last saved sub-level error code: %d", dump_session->err_info.sub_level_err));
WT_ERR(__wt_msg(session, " last saved error message: %s", dump_session->err_info.err_msg));
WT_ERR(__wt_msg(session, " Transaction:"));
WT_ERR(__wt_verbose_dump_txn_one(session, dump_session, 0, NULL));
if (show_cursors) {
WT_ERR(__wt_msg(session, " Number of positioned cursors: %u", dump_session->ncursors));
TAILQ_FOREACH (cursor, &dump_session->cursors, q) {
WT_ERR(__wt_msg(session, "Cursor @ %p:", (void *)cursor));

View File

@ -38,6 +38,7 @@ class test_layered29(wttest.WiredTigerTestCase):
scenarios = gen_disagg_storages('test_layered29', disagg_only = True)
@wttest.longtest('lots of tables')
def test_create_tables(self):
for i in (0, 10000):
for i in range(0, 10000):
self.assertEqual(self.session.create("layered:test_table" + str(i), "key_format=S,value_format=S"), 0)

View File

@ -106,7 +106,7 @@ class test_layered37(wttest.WiredTigerTestCase):
# Trigger eviction on the ingest table
evict_cursor = session_follow.open_cursor("file:test_layered37.wt_ingest", None, "debug=(release_evict)")
for i in (1, self.nitems):
for i in range(1, self.nitems):
session_follow.begin_transaction(f'read_timestamp={self.timestamp_str(ts)}')
evict_cursor.set_key(str(i))
self.assertEqual(evict_cursor.search(), 0)

View File

@ -26,7 +26,7 @@
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
import wiredtiger, wttest
import wttest
from wtscenario import make_scenarios
# test_prefetch03.py
@ -34,17 +34,21 @@ from wtscenario import make_scenarios
class test_prefetch03(wttest.WiredTigerTestCase):
uri = 'file:test_prefetch03'
prefetch_config = 'prefetch=(available=true,default=true)'
verbose_config = 'verbose=(prefetch:1)'
config_options = [
('default', dict(conn_cfg='prefetch=(available=true,default=true)')),
('inmem', dict(conn_cfg='in_memory=true,prefetch=(available=true,default=true)')),
('default', dict(conn_cfg=f'{prefetch_config},{verbose_config}')),
('inmem', dict(conn_cfg=f'in_memory=true,{prefetch_config},{verbose_config}')),
]
scenarios = make_scenarios(config_options)
def test_prefetch03(self):
# The in-memory configuration is incompatible with prefetch, so prefetch should be disabled
# and a message should be logged if the in-memory configuration is set.
if 'in_memory=true' not in self.conn_cfg:
self.reopen_conn(".", self.conn_cfg)
else:
with self.expectedStderrPattern('prefetch configuration is incompatible with in-memory configuration'):
self.assertRaisesException(wiredtiger.WiredTigerError,
lambda: self.reopen_conn(".", self.conn_cfg))
with self.expectedStdoutPattern('prefetch configuration is incompatible with in-memory configuration'):
self.reopen_conn(".", self.conn_cfg)