Merge pull request #2415 from wiredtiger/wt-2307-fix

(cherry picked from commit 12aaeb6)

WT-2307: Fix for cursor iteration bug when pages are splitting
This commit is contained in:
Alex Gorrod 2015-12-28 13:31:00 -05:00 committed by Alex Gorrod
parent b1768d0d9f
commit 3c2ad56b50
3 changed files with 249 additions and 172 deletions

1
dist/s_string.ok vendored
View File

@ -420,6 +420,7 @@ checkpointer
checkpointing
checksum
checksums
children's
chk
chongo
cip

View File

@ -190,6 +190,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
case WT_PAGE_COL_INT:
recno = 0; /* Less than any valid record number. */
WT_INTL_FOREACH_BEGIN(session, page, ref) {
WT_ASSERT(session, ref->home == page);
WT_ASSERT(session, ref->key.recno > recno);
recno = ref->key.recno;
} WT_INTL_FOREACH_END;
@ -202,6 +204,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page)
first = true;
WT_INTL_FOREACH_BEGIN(session, page, ref) {
WT_ASSERT(session, ref->home == page);
__wt_ref_key(page, ref, &next->data, &next->size);
if (last->size == 0) {
if (first)
@ -328,7 +332,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
/*
* If there's no address (the page has never been written), or the
* address has been instantiated, there's no work to do. Otherwise,
* get the address from the on-page cell.
* instantiate the address in-memory, from the on-page cell.
*/
addr = ref->addr;
if (addr != NULL && !__wt_off_page(from_home, addr)) {
@ -363,65 +367,101 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home,
}
/*
* __split_child_block_evict_and_split --
* Ensure the newly created child isn't evicted or split for now.
* __split_ref_step1 --
* Prepare a set of WT_REFs for a move.
*/
static void
__split_child_block_evict_and_split(WT_PAGE *child)
__split_ref_step1(
WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
WT_PAGE *child;
WT_REF *child_ref, *ref;
uint32_t i, j;
/* The newly created subtree is complete. */
WT_WRITE_BARRIER();
/*
* Once the split is live, newly created internal pages might be evicted
* and their WT_REF structures freed. If that happens before all threads
* exit the index of the page which previously "owned" the WT_REF, a
* thread might see a freed WT_REF. To ensure that doesn't happen, the
* newly created page's modify structure has a field with a transaction
* ID that's checked before any internal page is evicted. Unfortunately,
* we don't know the correct value until we update the original page's
* index (we need a transaction ID from after that update), but the act
* of updating the original page's index is what allows the eviction to
* happen.
*
* Once the split is live, newly created internal pages might themselves
* split. The split itself is not the problem: if a page splits before
* we fix up its WT_REF (in other words, a WT_REF we move is then moved
* again, before we reset the underlying page's parent reference), it's
* OK because the test we use to find a WT_REF and WT_PAGE that require
* fixing up is only that the WT_REF points to the wrong parent, not that
* it points to a specific wrong parent. The problem is our fix up of the
* WT_REFs in the created page could race with the subsequent fix of the
* same WT_REFs (in a different created page), we'd have to acquire some
* lock to prevent that race, and that's going to be difficult at best.
*
* For now, block eviction and splits in newly created pages until they
* have been fixed up.
* Update the moved WT_REFs so threads moving through them start looking
* at the created children's page index information. Because we've not
* yet updated the page index of the parent page into which we are going
* to split this subtree, a cursor moving through these WT_REFs will
* ascend into the created children, but eventually fail as that parent
* page won't yet know about the created children pages. That's OK, we
* spin there until the parent's page index is updated.
*/
F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
ref = pindex->index[i];
child = ref->page;
/*
* Block eviction and splits in newly created pages.
*
* Once the split is live, newly created internal pages might be
* evicted and their WT_REF structures freed. If that happened
* before all threads exit the index of the page that previously
* "owned" the WT_REF, a thread might see a freed WT_REF. To
* ensure that doesn't happen, the newly created page's modify
* structure has a field with a transaction ID that's checked
* before any internal page is evicted. Unfortunately, we don't
* know the correct value until we update the original page's
* index (we need a transaction ID from after that update), but
* the act of updating the original page's index is what allows
* the eviction to happen.
*
* Splits were blocked because historic versions of the split
* code didn't update the WT_REF.home field until after the
* split was live, so a page whose WT_REF.home fields were being
* updated could split again before the update, and there was a
* race between splits as to which would update them first. The
* current code updates the WT_REF.home fields before going live
* (in this function), so this shouldn't be an issue, but for now
* splits remain turned off.
*/
F_SET_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
/*
* We use a page flag to prevent the child from splitting from
* underneath us, but the split-generation error checks don't
* know about that flag; use the standard macros to ensure that
* reading the child's page index structure is safe.
*/
j = 0;
WT_ENTER_PAGE_INDEX(session);
WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
child_ref->home = child;
child_ref->pindex_hint = j++;
} WT_INTL_FOREACH_END;
WT_LEAVE_PAGE_INDEX(session);
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, child));
#endif
}
}
/*
* __split_ref_move_final --
* Finalize the moved WT_REF structures after the split succeeds.
* __split_ref_step2 --
* Allow the newly created children to be evicted or split.
*/
static int
__split_ref_move_final(
WT_SESSION_IMPL *session, WT_REF **refp, uint32_t entries)
__split_ref_step2(
WT_SESSION_IMPL *session, WT_PAGE_INDEX *pindex, bool skip_first)
{
WT_DECL_RET;
WT_PAGE *child;
WT_REF *ref, *child_ref;
WT_REF *ref;
uint32_t i;
/*
* The WT_REF structures moved to newly allocated child pages reference
* the wrong parent page and we have to fix that up. The problem is
* revealed when a thread of control searches for the child page's
* reference structure slot, and fails to find it because the parent
* page being searched no longer references the child. When that failure
* happens the thread waits for the reference's home page to be updated,
* which we do here: walk the children and fix them up.
* The split has gone live, enable eviction and splits on the newly
* created internal pages.
*/
for (i = 0; i < entries; ++i, ++refp) {
ref = *refp;
WT_WRITE_BARRIER();
for (i = skip_first ? 1 : 0; i < pindex->entries; ++i) {
ref = pindex->index[i];
/*
* We don't hold hazard pointers on created pages, they cannot
@ -441,42 +481,18 @@ __split_ref_move_final(
WT_ERR(ret);
child = ref->page;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, child));
#endif
/*
* We use a page flag to prevent the child from splitting from
* underneath us, but the split-generation error checks don't
* know about that flag; use the standard macros to ensure that
* reading the child's page index structure is safe.
*/
WT_ENTER_PAGE_INDEX(session);
WT_INTL_FOREACH_BEGIN(session, child, child_ref) {
/*
* The page's home reference may not be wrong, as we
* opened up access from the top of the tree already,
* disk pages may have been read in since then, and
* those pages would have correct parent references.
*/
if (child_ref->home != child) {
child_ref->home = child;
child_ref->pindex_hint = 0;
}
} WT_INTL_FOREACH_END;
WT_LEAVE_PAGE_INDEX(session);
/* The child can now be evicted or split. */
F_CLR_ATOMIC(child, WT_PAGE_SPLIT_BLOCK);
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, child));
#endif
WT_ERR(__wt_hazard_clear(session, child));
}
/*
* Push out the changes: not required for correctness, but don't let
* threads spin on incorrect page references longer than necessary.
*/
WT_FULL_BARRIER();
return (0);
err: /* Something really bad just happened. */
@ -500,9 +516,21 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
uint64_t split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
bool complete;
void *p;
/*
* A note on error handling: this function first allocates/initializes
* new structures; failures during that period are handled by discarding
* the memory and returning an error code, our caller knows the split
* didn't happen and proceeds accordingly. Second, this function updates
* the tree, and a failure in that period is catastrophic, any partial
* update to the tree requires a panic, we can't recover. Third, once
* the split is complete and the tree has been fully updated, we have to
* ignore most errors because the split is complete and correct, callers
* have to proceed accordingly.
*/
enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete;
WT_STAT_FAST_CONN_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_deepen);
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
@ -511,7 +539,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
btree = S2BT(session);
alloc_index = NULL;
root_decr = root_incr = 0;
complete = false;
complete = ERR_RETURN;
/* The root page will be marked dirty, make sure that will succeed. */
WT_RET(__wt_page_modify_init(session, root));
@ -589,9 +617,6 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
/* Ensure the page isn't evicted or split for now. */
__split_child_block_evict_and_split(child);
/*
* The newly allocated child's page index references the same
* structures as the root. (We cannot move WT_REF structures,
@ -615,31 +640,28 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
WT_ASSERT(session,
root_refp - pindex->index == (ptrdiff_t)pindex->entries);
/* Start making real changes to the tree, errors are fatal. */
complete = ERR_PANIC;
/* Prepare the WT_REFs for the move. */
__split_ref_step1(session, alloc_index, false);
/*
* Confirm the root page's index hasn't moved, then update it, which
* makes the split visible to threads descending the tree. From this
* point on, we're committed to the split.
*
* A note on error handling: until this point, there's no problem with
* unwinding on error. We allocated a new page index, a new set of
* WT_REFs and a new set of child pages -- if an error occurred, the
* root remained unchanged, although it may have an incorrect memory
* footprint. From now on we've modified the root page, attention
* needs to be paid. However, subsequent failures are relatively benign,
* the split is OK and complete. For that reason, we ignore errors past
* this point unless there's a panic.
* makes the split visible to threads descending the tree.
*/
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(root) == pindex);
WT_INTL_INDEX_SET(root, alloc_index);
complete = true;
#ifdef HAVE_DIAGNOSTIC
WT_WITH_PAGE_INDEX(session,
__split_verify_intl_key_order(session, root));
#endif
/* Fix up the moved WT_REF structures. */
WT_ERR(__split_ref_move_final(
session, alloc_index->index, alloc_index->entries));
/* Finalize the WT_REFs we moved. */
WT_ERR(__split_ref_step2(session, alloc_index, false));
/* The split is complete and correct, ignore benign errors. */
complete = ERR_IGNORE;
/* We've installed the allocated page-index, ensure error handling. */
alloc_index = NULL;
@ -664,24 +686,25 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root)
__wt_cache_page_inmem_decr(session, root, root_decr);
__wt_page_modify_set(session, root);
err: /*
* If complete is true, we saw an error after opening up the tree to
* descent through the root page's new index. There is nothing we
* can do, there are threads potentially active in both versions of
* the tree.
*
* A note on error handling: if we completed the split, return success,
* nothing really bad can have happened, and our caller has to proceed
* with the split.
*/
if (!complete)
err: switch (complete) {
case ERR_RETURN:
__wt_free_ref_index(session, root, alloc_index, true);
if (ret != 0 && ret != WT_PANIC)
break;
case ERR_PANIC:
__wt_err(session, ret,
"ignoring not-fatal error during root page split to "
"deepen the tree");
return (ret == WT_PANIC || !complete ? ret : 0);
"fatal error during root page split to deepen the tree");
ret = WT_PANIC;
break;
case ERR_IGNORE:
if (ret != 0 && ret != WT_PANIC) {
__wt_err(session, ret,
"ignoring not-fatal error during root page split "
"to deepen the tree");
ret = 0;
}
break;
}
return (ret);
}
/*
@ -964,9 +987,21 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
uint64_t split_gen;
uint32_t children, chunk, i, j, remain;
uint32_t slots;
bool complete;
void *p;
/*
* A note on error handling: this function first allocates/initializes
* new structures; failures during that period are handled by discarding
* the memory and returning an error code, our caller knows the split
* didn't happen and proceeds accordingly. Second, this function updates
* the tree, and a failure in that period is catastrophic, any partial
* update to the tree requires a panic, we can't recover. Third, once
* the split is complete and the tree has been fully updated, we have to
* ignore most errors because the split is complete and correct, callers
* have to proceed accordingly.
*/
enum { ERR_RETURN, ERR_PANIC, ERR_IGNORE } complete;
WT_STAT_FAST_CONN_INCR(session, cache_eviction_split_internal);
WT_STAT_FAST_DATA_INCR(session, cache_eviction_split_internal);
@ -977,7 +1012,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
alloc_index = replace_index = NULL;
page_ref = page->pg_intl_parent_ref;
page_decr = page_incr = parent_incr = 0;
complete = false;
complete = ERR_RETURN;
/*
* Our caller is holding the page locked to single-thread splits, which
@ -1074,9 +1109,6 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ERR(__wt_page_modify_init(session, child));
__wt_page_modify_set(session, child);
/* Ensure the page isn't evicted or split for now. */
__split_child_block_evict_and_split(child);
/*
* The newly allocated child's page index references the same
* structures as the parent. (We cannot move WT_REF structures,
@ -1100,22 +1132,16 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
WT_ASSERT(session,
page_refp - pindex->index == (ptrdiff_t)pindex->entries);
/* Start making real changes to the tree, errors are fatal. */
complete = ERR_PANIC;
/* Prepare the WT_REFs for the move. */
__split_ref_step1(session, alloc_index, true);
/* Split into the parent. */
WT_ERR(__split_parent(session, page_ref, alloc_index->index,
alloc_index->entries, parent_incr, false, false));
/*
* A note on error handling: until this point, there's no problem with
* unwinding on error. We allocated a new page index, a new set of
* WT_REFs and a new set of child pages -- if an error occurred, the
* page remained unchanged, although it may have an incorrect memory
* footprint. From now on we've modified the parent page, attention
* needs to be paid. However, subsequent failures are relatively benign,
* the split is OK and complete. For that reason, we ignore errors past
* this point unless there's a panic.
*/
complete = true;
/* Confirm the page's index hasn't moved, then update it. */
WT_ASSERT(session, WT_INTL_INDEX_GET_SAFE(page) == pindex);
WT_INTL_INDEX_SET(page, replace_index);
@ -1127,9 +1153,17 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__split_verify_intl_key_order(session, page));
#endif
/* Fix up the moved WT_REF structures. */
WT_ERR(__split_ref_move_final(
session, alloc_index->index + 1, alloc_index->entries - 1));
/* Finalize the WT_REFs we moved. */
WT_ERR(__split_ref_step2(session, alloc_index, true));
/* The split is complete and correct, ignore benign errors. */
complete = ERR_IGNORE;
/*
* Push out the changes: not required for correctness, but no reason
* to wait.
*/
WT_FULL_BARRIER();
/*
* We don't care about the page-index we allocated, all we needed was
@ -1158,24 +1192,26 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page)
__wt_cache_page_inmem_decr(session, page, page_decr);
__wt_page_modify_set(session, page);
err: /*
* If complete is true, we saw an error after opening up the tree to
* descent through the page's new index. There is nothing we can do,
* there are threads potentially active in both versions of the tree.
*
* A note on error handling: if we completed the split, return success,
* nothing really bad can have happened, and our caller has to proceed
* with the split.
*/
if (!complete) {
err: switch (complete) {
case ERR_RETURN:
__wt_free_ref_index(session, page, alloc_index, true);
__wt_free_ref_index(session, page, replace_index, false);
}
if (ret != 0 && ret != WT_PANIC)
break;
case ERR_PANIC:
__wt_err(session, ret,
"ignoring not-fatal error during internal page split");
return (ret == WT_PANIC || !complete ? ret : 0);
"fatal error during internal page split");
ret = WT_PANIC;
break;
case ERR_IGNORE:
if (ret != 0 && ret != WT_PANIC) {
__wt_err(session, ret,
"ignoring not-fatal error during internal page "
"split");
ret = 0;
}
break;
}
return (ret);
}
/*

View File

@ -89,6 +89,48 @@ __ref_is_leaf(WT_REF *ref)
false : type == WT_CELL_ADDR_LEAF || type == WT_CELL_ADDR_LEAF_NO);
}
/*
* __page_ascend --
* Ascend the tree one level.
*
* On entry, *refp is a WT_REF on the internal page being left. On return,
* *refp is that page's parent WT_REF (ref->home->pg_intl_parent_ref).
* Unless that parent WT_REF is the root, *pindexp and *slotp are filled in
* by __page_refp() for the parent WT_REF; on the iteration that finds the
* root they are not updated. There is no error return and no retry limit:
* the loop repeats until it sees a consistent parent/child view.
*/
static void
__page_ascend(WT_SESSION_IMPL *session,
WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
{
WT_REF *parent_ref, *ref;
/*
* Ref points to the first/last slot on an internal page from which we
* are ascending the tree, moving to the parent page. This is tricky
* because the internal page we're on may be splitting into its parent.
* Find a stable configuration where the page we start from and the
* page we're moving to are connected. The tree eventually stabilizes
* into that configuration, keep trying until we succeed.
*/
/*
* Ref itself is never reassigned inside the loop; each pass re-reads
* ref->home, which is the field a concurrent split changes.
*/
for (ref = *refp;;) {
/*
* Find our parent slot on the next higher internal page, the
* slot from which we move to a next/prev slot, checking that
* we haven't reached the root.
*/
parent_ref = ref->home->pg_intl_parent_ref;
if (__wt_ref_is_root(parent_ref))
break;
__page_refp(session, parent_ref, pindexp, slotp);
/*
* When internal pages split, the WT_REF structures being moved
* are updated first. If the WT_REF we started with references
* the same page as we found on our search of the parent, there
* is a consistent view.
*
* Otherwise ref->home changed after parent_ref was read, so the
* page index and slot just computed may describe a stale parent:
* go around again and recompute them from the current ref->home.
*/
if (ref->home == parent_ref->page)
break;
}
/* Hand the parent WT_REF (possibly the root's) back to the caller. */
*refp = parent_ref;
}
/*
* __tree_walk_internal --
* Move to the next/previous page in the tree.
@ -173,7 +215,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session,
goto descend;
}
ascend: /*
/*
* If the active page was the root, we've reached the walk's end.
* Release any hazard-pointer we're holding.
*/
@ -187,13 +229,14 @@ ascend: /*
for (;;) {
/*
* If we're at the last/first slot on the page, return this page
* in post-order traversal. Otherwise we move to the next/prev
* slot and left/right-most element in its subtree.
* If we're at the last/first slot on the internal page, return
* it in post-order traversal. Otherwise move to the next/prev
* slot and left/right-most element in that subtree.
*/
if ((prev && slot == 0) ||
while ((prev && slot == 0) ||
(!prev && slot == pindex->entries - 1)) {
ref = ref->home->pg_intl_parent_ref;
/* Ascend to the parent. */
__page_ascend(session, &ref, &pindex, &slot);
/*
* If we got all the way through an internal page and
@ -205,40 +248,37 @@ ascend: /*
empty_internal = false;
}
/* Optionally skip internal pages. */
if (LF_ISSET(WT_READ_SKIP_INTL))
goto ascend;
/*
* We've ascended the tree and are returning an internal
* page. If it's the root, discard our hazard pointer,
* otherwise, swap our hazard pointer for the page we'll
* return.
* If at the root and returning internal pages, return
* the root page, otherwise we're done. Regardless, no
* hazard pointer is required, release the one we hold.
*/
if (__wt_ref_is_root(ref))
if (__wt_ref_is_root(ref)) {
WT_ERR(__wt_page_release(
session, couple, flags));
else {
/*
* Locate the reference to our parent page then
* swap our child hazard pointer for the parent.
* We don't handle restart or not-found returns.
* It would require additional complexity and is
* not a possible return: we're moving to the
* parent of the current child page, our parent
* reference can't have split or been evicted.
*/
__page_refp(session, ref, &pindex, &slot);
if (!LF_ISSET(WT_READ_SKIP_INTL))
*refp = ref;
goto done;
}
/*
* Optionally return internal pages. Swap our previous
* hazard pointer for the page we'll return. We don't
* handle restart or not-found returns, it would require
* additional complexity and is not a possible return:
* we're moving to the parent of the current child page,
* the parent can't have been evicted.
*/
if (!LF_ISSET(WT_READ_SKIP_INTL)) {
if ((ret = __wt_page_swap(
session, couple, ref, flags)) != 0) {
WT_TRET(__wt_page_release(
session, couple, flags));
WT_ERR(ret);
}
*refp = ref;
goto done;
}
*refp = ref;
goto done;
}
if (prev)