diff --git a/src/mongo/bson/column/BUILD.bazel b/src/mongo/bson/column/BUILD.bazel index d44618713fa..f1e47f1f1b3 100644 --- a/src/mongo/bson/column/BUILD.bazel +++ b/src/mongo/bson/column/BUILD.bazel @@ -18,6 +18,7 @@ mongo_cc_library( "simple8b_type_util.cpp", ], hdrs = [ + "binary_reopen.h", "bsoncolumn.h", "bsoncolumn.inl", "bsoncolumn_helpers.h", @@ -117,6 +118,20 @@ mongo_cc_unit_test( ], ) +mongo_cc_unit_test( + name = "binary_reopen_test", + srcs = [ + "binary_reopen_test.cpp", + ], + tags = [ + "mongo_unittest_seventh_group", + "server-bsoncolumn", + ], + deps = [ + ":column", + ], +) + mongo_cc_benchmark( name = "simple8b_bm", srcs = [ diff --git a/src/mongo/bson/column/binary_reopen.h b/src/mongo/bson/column/binary_reopen.h new file mode 100644 index 00000000000..29dfcc5939a --- /dev/null +++ b/src/mongo/bson/column/binary_reopen.h @@ -0,0 +1,763 @@ +/** + * Copyright (C) 2025-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#pragma once + +#include "mongo/bson/column/bsoncolumn_util.h" +#include "mongo/bson/column/simple8b.h" +#include "mongo/bson/column/simple8b_builder.h" + +#include + +namespace mongo::bsoncolumn::internal { + +/** + * Constant to indicate invalid index for overflow or pending RLE. + */ +static constexpr int kInvalidIndex = -1; + +/** + * Helper struct for a scanned control block. + * + * lastAtEndOfBlock and scaleIndex are only set for control blocks containing double data. + */ +struct ControlBlock { + const char* control = nullptr; + double lastAtEndOfBlock = 0.0; + uint8_t scaleIndex = Simple8bTypeUtil::kMemoryAsInteger; +}; + +using ControlBlockContainer = std::vector; + +/** + * Helper range of control blocks to allow for range based for loops. + */ +class ControlBlockRange { +public: + ControlBlockRange() = default; + ControlBlockRange(ControlBlockContainer::const_iterator b, + ControlBlockContainer::const_iterator e) + : _begin(b), _end(e) {} + + ControlBlockContainer::const_iterator begin() const { + return _begin; + } + + ControlBlockContainer::const_iterator end() const { + return _end; + } + +private: + ControlBlockContainer::const_iterator _begin; + ControlBlockContainer::const_iterator _end; +}; + +/** + * Calculated overflow point used to initialize the BSONColumnBuilder in the binary reopen + * operation. + */ +template +struct OverflowPoint { + explicit OverflowPoint(boost::optional val) : _last(val) {} + + /** + * Calculated control byte for overflow/no-overflow. + * + * If overflow occurred, overflow() will return true and index() the position in this + * control byte where we overflowd. Data before and including the overflow point needs to be + * written to the buffer and data after this overflow point needs to be appended to the + * pending state. + * + * If no overflow, overflow() will return false and index() return -1. No data should be + * written to the buffer and all data in this control should be appended to the pending + * state. + */ + const char* control() const { + return _control; + } + + /** + * Scale index for the control byte returned by control(). + */ + uint8_t scaleIndex() const { + return _scaleIndex; + } + + /** + * Returns true if overflow occurred, false otherwise. + */ + bool overflow() const { + return overflowIndex != bsoncolumn::internal::kInvalidIndex; + } + /** + * Returns index of the simple8b block where the overflow occurred, -1 if no overflow. + */ + int index() const { + return overflowIndex; + } + /** + * Returns true if all values after the overflow point are identical to the value returned + * by last(). + */ + bool allValuesIdentical() const { + return _allIdentical; + } + + /** + * Last control byte of the binary prior to reopen. + */ + uint8_t lastControl() const { + return _lastControl; + } + /** + * Offset to the last control byte of the binary prior to reopen. + */ + uint16_t lastControlOffset() const { + return _lastControlOffset; + } + + /** + * Range of control blocks after the overflow point that needs to be appended to the pending + * state. + */ + const ControlBlockRange& remaining() const { + return _remaining; + } + + /** + * Last value at the overflow point. Used to decode values after overflow when they start + * with RLE and to setup the pending state with RLE if necessary. + */ + const boost::optional& last() const { + return _last; + } + + /** + * Internal helper used by OverflowState to set final OverflowPoint result. + */ + void setControl(const char* ctrl, + uint8_t scale, + ControlBlockRange remain, + uint8_t lastControl, + uint16_t lastControlOffset) { + _control = ctrl; + _scaleIndex = scale; + _remaining = remain; + _lastControl = lastControl; + _lastControlOffset = lastControlOffset; + } + /** + * Internal helper to set last. + */ + void setLast(boost::optional value) { + _last = value; + } + + /** + * Internal helper to mark that overflow occurred, at which position and if all values after + * the overflow are identical or not. + */ + void markOverflow(int index, bool allIdentical) { + invariant(index != bsoncolumn::internal::kInvalidIndex); + overflowIndex = index; + _allIdentical = allIdentical; + } + /** + * Internal helper to override overflow to set it back to no overflow. + */ + void markNoOverflow() { + overflowIndex = bsoncolumn::internal::kInvalidIndex; + } + /** + * Internal helper to explicitly set that all values are identical. + */ + void setAllIdentical() { + _allIdentical = true; + } + +private: + boost::optional _last; + const char* _control = nullptr; + int overflowIndex = bsoncolumn::internal::kInvalidIndex; + uint8_t _scaleIndex = kInvalidScaleIndex; + uint8_t _lastControl = 0; + uint16_t _lastControlOffset = 0; + ControlBlockRange _remaining; + bool _allIdentical = false; +}; + +/** + * Helper to calculate how to re-initialize the compressor from a compressed binary. + * + * The main difficulty with re-initializing the compressor from a compressed binary is how to + * undo the 'finalize()/intermediate()' call where pending values are flushed out to simple8b + * blocks in the binary. We need to undo this operation by putting back these values back into + * the pending state. The point in the binary where we need to do this undo is called the + * overflow point. + * + * For this to be efficient we need to calculate this from the end of the binary rather than the + * beginning. In the typical case we will use a dummy Simple8bBuilder where values are added in + * the reverse order to observe when we can no longer add values without needing to write full + * simple8b blocks. + * + * The two main sources of complexity in this algorithm is how to deal with RLE and double + * values that have been rescaled. RLE in BSONColumn is defined as a repeat of the prior value + * so we must be able to travserse past RLE to determine if the overflow point happens before or + * after. For rescale we have a similar problem, either the rescale can be undone and put back + * into pending or it is required due to an incompatible value. + */ +template +class OverflowState { +public: + // Initialize the overflow state from the last control block. + OverflowState(ControlBlock cb); + + // Perform the overflow detection. + const OverflowPoint& detect(const ControlBlockContainer& controls); + +private: + // Helper function to handle the special case where the binary ends with RLE + void _detectEndsRLE(ControlBlock cb); + // Helper function to perform the regular detection logic. When overflow is detected it + // returns a number of control blocks already processed that the overflow point refers to. + // This can happen when we are processing RLE but discover that the RLE data belong prior to + // the overflow point. + int _detectRegular(ControlBlock cb); + // Helper function when a rescale is detected. This automatically ends the overflow + // detection but we need to calculate if the rescaled data should be included in the + // overflow or not. Returns a binary offset to the last control byte in the case where the + // rescaled data was only output because of the finalize/intermediate call. + uint16_t _detectRescale(ControlBlockRange before, ControlBlockRange after); + + OverflowPoint _op; + Simple8bBuilder _overflowDetector; + int _pendingRle = bsoncolumn::internal::kInvalidIndex; + int _pendingRleBlocks = 0; +}; + +/** + * Result from the 'findOverflow' call. + * + * lastValue is the last value in the simple8b causing overflow. If no overflow was detected it is + * set to the previously known last value. + * + * overflowIndex is the index position of the simple8b block in the control that caused overflow. + * Invalid if no overflow was detected. + * + * pendingRLEindex is the index position of the first non-RLE simple8b block when the control begins + * with RLE and no overflow was detected. + */ +template +struct OverflowResult { + boost::optional lastValue; + int overflowIndex; + int pendingRLEindex; +}; + +/** + * Result from the 'findLastNonRLE' call. + * + * lastValue is the last value in the last non-RLE simple8b. Only applicable when + * 'index' is set to a non-invalid index. + * + * index is the index position of the last non-RLE simple8b in this control + */ +template +struct LastNonRLEResult { + boost::optional lastValue; + int index; +}; + +/** + * Helper to get a simple8b block at index from a control block + */ +const char* s8b(const char* control, int index); + +/** + * Helper to determine if the provided simple8b block is an RLE block. + */ +bool isRLE(const char* s8b); + +/** + * Estimates the last non-skip value in a control block. + * + * If the last block is RLE, 0 is returned. + * If no non-skip value can be found within what could fit in a non-RLE block, 'none' is returned. + */ +template +boost::optional estimateLastValue(const char* control); + +/** + * Finds the last non-skip value in a control block. The last block must NOT be RLE. + * + * If no non-skip value can be found within what could fit in a non-RLE block, 'none' is returned. + */ +template +boost::optional findLastNonSkip(const char* control, int numBlocks); + +/** + * Finds which simple8b block in the provided control block causes overflow, searches in reverse + * order. + * + * 'lastValForRLE' indicates how any encountered RLE blocks should be interpreted. + * + * 'overflowDetector' is appended to internally, overflow is detected when it needs to write a + * simple8b block. The same detector may be used in multiple calls for finding overflow. + */ +template +OverflowResult findOverflow(const char* control, + boost::optional lastValForRLE, + Simple8bBuilder& overflowDetector); + +/** + * Finds the last non-RLE simple8b block in the provided control, returns its index position and + * last value. + * + * 'index' indicates position to start search that is performed in reverse order. + */ +template +LastNonRLEResult findLastNonRLE(const char* control); +template +LastNonRLEResult findLastNonRLE(const char* control, int index); + + +template +OverflowState::OverflowState(ControlBlock cb) + : _op(bsoncolumn::internal::estimateLastValue(cb.control)), + _overflowDetector(_op.last(), 0) {} + +template +const OverflowPoint& OverflowState::detect(const ControlBlockContainer& controls) { + using namespace bsoncolumn::internal; + + // Setup reverse iteration. + auto begin = controls.rbegin(); + auto it = begin; + auto end = controls.rend(); + + // Setup some internal state, the algorithm is different if the last block is RLE. + uint16_t lastControlOffset = 0; + bool endsWithRLE = isRLE(s8b(controls.back().control, + numSimple8bBlocksForControlByte(*controls.back().control) - 1)); + + // Search backwards for the overflow point. + for (; it != end; ++it) { + if (it->scaleIndex == controls.back().scaleIndex) { + if (endsWithRLE) { + // If we end with RLE, we simply search backwards for the first non-RLE value. This + // will be the last non-RLE value in the binary and that will be our overflow point. + _detectEndsRLE(*it); + } else { + // Regular case where we don't end with RLE. If RLE is encountered during the + // iteration we need to continue to search until the next non-RLE value is + // encountered. Depending on its value we might have to go back to where we were + // prior to the RLE and assign that as our overflow point. _detectRegular will + // return how many blocks we need to undo if this is the case. + it = std::prev(it, _detectRegular(*it)); + } + + // _detectEndsRLE or _detectRegular will internally mark for overflow if it happened. If + // this is the case, break out of the iteration as we are done. + if (_op.overflow()) { + break; + } + } else { + // Special case for the double type when a control block of a different scale was + // detected. We have a special algorithm to determine if the overflow happened in this + // rescaled control or prior. _detectRescale will return an offset to the last + // (rescaled) control if it can be undone and all values put back to pending. + if constexpr (std::is_same_v) { + lastControlOffset = + _detectRescale({controls.begin(), it.base()}, {it.base(), controls.end()}); + break; + } else { + // This cannot happen as scan() has already verified this. + MONGO_UNREACHABLE; + } + } + } + + // Check if we've finished the iteration without finding an overflow + if (it == end) { + if (_pendingRle != bsoncolumn::internal::kInvalidIndex) { + // We are in pending RLE without finding an overflow. We can put everything back in + // pending if the pending RLE value is 0 which is the only allowed form of RLE in the + // beginning of the binary. + if (_op.last() == T{0}) { + _pendingRleBlocks = 0; + _op.setAllIdentical(); + } else { + // Our pending RLE value is non-zero which means that the RLE cannot be put in + // pending and the overflow happened after the RLE. Restore the state to this point. + _op.markOverflow(_pendingRle, false); + it = std::prev(it, _pendingRleBlocks + 1); + _pendingRleBlocks = 0; + } + } + // As we got to the beginning, set last to 0 which is how RLE in the beginning of the binary + // must be interpreted. + _op.setLast(T{0}); + // If we end with RLE but never detect overflow, all values are identical to 0. + if (endsWithRLE) { + _op.setAllIdentical(); + } + } + + // If we have found an overflow that happened in the last simple8b block of a control, we can + // transform this to a non-overflow at the beginning of the control after (moving the iterator + // will be done in the next if-statement). + if (_op.overflow() && it != begin && _op.index() == kMaxNumSimple8bPerControl - 1) { + _op.markNoOverflow(); + } + + // If no overflow occurred, go back to the previous control as we should not add data from the + // current control. + if (!_op.overflow() && it != begin && lastControlOffset == 0) { + it = std::prev(it, _pendingRleBlocks + 1); + } + + // Record final calculatetion + _op.setControl(it->control, + it->scaleIndex, + {it.base(), controls.end()}, + // If lastControlOffset is non-zero we're in the special rescale case where we + // need to report the final control byte from the binary. + lastControlOffset == 0 ? *it->control : *controls.back().control, + lastControlOffset); + return _op; +} + +template +void OverflowState::_detectEndsRLE(ControlBlock cb) { + // If the last block ends with RLE we just need to look for the last non-RLE block to + // discover the overflow point. + using namespace bsoncolumn::internal; + LastNonRLEResult res = findLastNonRLE(cb.control); + _op.setLast(res.lastValue); + if (res.index != kInvalidIndex) { + _op.markOverflow(res.index, true); + } +} + +template +int OverflowState::_detectRegular(ControlBlock cb) { + using namespace bsoncolumn::internal; + if (_pendingRle == kInvalidIndex) { + // If we haven't encountered an RLE block in the beginning of a control block yet then + // continue with the regular overflow detection. + OverflowResult res = findOverflow(cb.control, _op.last(), _overflowDetector); + _op.setLast(res.lastValue); + // If this block begins with RLE we need to remember the index position after this RLE. + _pendingRle = res.pendingRLEindex; + if (res.overflowIndex != kInvalidIndex) { + _op.markOverflow(res.overflowIndex, /* allIdentical= */ _pendingRle != kInvalidIndex); + } + + } else { + // When we've encountered RLE in the beginning of a control block we need to continue to + // search for the next non-RLE block to determine where the overflow point is. + LastNonRLEResult res = findLastNonRLE(cb.control); + if (res.index == kInvalidIndex) { + // Still no overflow, increment how many control blocks we've consumed in this state. + ++_pendingRleBlocks; + } else if (res.lastValue == _op.last()) { + // Last value prior to RLE matches our RLE state after RLE. We then overflow in + // the block prior to RLE. Reset pending blocks and mark the overflow with all identical + // values. + _pendingRleBlocks = 0; + _op.markOverflow(res.index, /* allIdentical= */ true); + } else { + // Values to not match, so the overflow happened in the pending block after the RLE, + // we've saved this position in _pendingRle. + _op.markOverflow(_pendingRle, /* allIdentical= */ false); + _op.setLast(res.lastValue); + // Return how many control blocks ago the overflow position refers to. + auto ret = _pendingRleBlocks + 1; + _pendingRleBlocks = 0; + return ret; + } + } + + + return 0; +} + +template +uint16_t OverflowState::_detectRescale(ControlBlockRange before, ControlBlockRange after) { + using namespace bsoncolumn::internal; + + // Calculate last value before the rescaling event. Search backwards for the last non-RLE block + // and get the last value from it. + auto it = std::make_reverse_iterator(before.end()); + auto end = std::make_reverse_iterator(before.begin()); + auto blockWithOldScale = *it; + auto blocks = numSimple8bBlocksForControlByte(*blockWithOldScale.control); + + for (; it != end; ++it) { + LastNonRLEResult res = findLastNonRLE(it->control); + // kInvalidIndex index means that all blocks were RLE and we need to continue to next block. + if (res.index != kInvalidIndex) { + _op.setLast(res.lastValue); + break; + } + } + // Nothing found, 0 is used as last when the stream begins with RLE. + if (it == end) { + _op.setLast(T{0}); + } + + + // If this rescaled block is full, we know that we can treat this as a no-overflow in the next + // control as nothing more can fit in this one anyway. + if (blocks == kMaxNumSimple8bPerControl) { + // If we're in pending RLE, we can additionally mark all values as identical. + if (_pendingRle != kInvalidIndex) { + _op.setAllIdentical(); + _pendingRleBlocks = 0; + } + return 0; + } + + // Based on this actual last value, re-calculate if we will overflow with the data in + // the control blocks we've already processed. Previously we used an estimated last. + Simple8bBuilder s8bBuilder(_op.last(), 0); + for (auto&& cb : after) { + OverflowResult res = findOverflow(cb.control, _op.last(), s8bBuilder); + // If overflow is detected, we treat this as a non-overflow in the next control block. This + // is signalled by not marking for overflow and returning 0 offset to the final control + // block. Everything remaining will be put back into pending. + if (res.overflowIndex != kInvalidIndex) { + return 0; + } + + // RLE detected, we then know that all values are identical. + if (res.pendingRLEindex != kInvalidIndex) { + _op.setAllIdentical(); + break; + } + } + + // Next we need to see if the first value stored in the future control blocks (with a different + // scale) can be scaled using this scale factor that we've now encountered. First, take the next + // control block (the range is guaranteed to be non-empty). + const auto& next = *after.begin(); + + // Encode the last value using next scale factor, this is needed to expand future deltas. This + // is guaranteed to succeed as the scan() function has already validated this. + auto encoded = + Simple8bTypeUtil::encodeDouble(blockWithOldScale.lastAtEndOfBlock, next.scaleIndex); + + // Extract the first value from the next control block. Simple8b cannot be empty, so we can + // dereference the begin iterator without further checking. + boost::optional nextVal = + *Simple8b(next.control + 1, sizeof(uint64_t), _op.last()).begin(); + + // Skipped values can be always be scaled with any scale factor + if (nextVal) { + // Calculate the encoded delta of the next value and then try to encode it using our new + // scale factor + encoded = expandDelta(*encoded, Simple8bTypeUtil::decodeInt64(*nextVal)); + if (!Simple8bTypeUtil::encodeDouble( + Simple8bTypeUtil::decodeDouble(*encoded, next.scaleIndex), + blockWithOldScale.scaleIndex)) { + // Not possible to scale this value using the last scale factor. We return 0 to signal + // this as non-overflow in the block after the rescale, which effectively discards + // everything before the rescale as they will never be needed. + return 0; + } + } + + // Rescaling was possible, all the rescaled values will then need to be written back as pending + // values. This is signalled as an overflow in the last position of this control block. We also + // return an offset to the last control byte of the actual rescaled control block in the binary. + _op.markOverflow(blocks - 1, false); + return blocks * sizeof(uint64_t) + 1 + + std::distance(after.begin(), after.end() - 1) * + (kMaxNumSimple8bPerControl * sizeof(uint64_t) + 1); +} + +inline const char* s8b(const char* control, int index) { + return control + + /* offset to block at index */ index * /* simple8b block size */ sizeof(uint64_t) + + /* skip control byte*/ 1; +} + +inline bool isRLE(const char* s8b) { + // Read simple8b block and mask out the selector + return (ConstDataView(s8b).read>() & + simple8b_internal::kBaseSelectorMask) == simple8b_internal::kRleSelector; +} + +template +boost::optional estimateLastValue(const char* control) { + auto numBlocks = numSimple8bBlocksForControlByte(*control); + if (isRLE(s8b(control, numBlocks - 1))) { + return T{0}; + } + + // Assume that the last value in Simple8b blocks is the same as the one before the + // first. This assumption will hold if all values are equal and RLE is eligible. If it + // turns out to be incorrect the Simple8bBuilder will internally reset and disregard + // RLE. + return findLastNonSkip(control, numBlocks); +} + +template +boost::optional findLastNonSkip(const char* control, int numBlocks) { + // Limit the search for a non-skip value. If we go above 60 without overflow then we consider + // skip to be the last value for RLE as it would be the only one eligible for RLE. + constexpr int kMaxNumSkipInNonRLEBlock = 60; + for (int index = numBlocks - 1, numSkips = 0; index >= 0 && numSkips < kMaxNumSkipInNonRLEBlock; + --index) { + const char* block = s8b(control, index); + // Abort this operation when an RLE block is found, they are handled in a separate code + // path. + if (isRLE(block)) { + break; + } + Simple8b s8b(block, sizeof(uint64_t)); + for (auto it = s8b.begin(), end = s8b.end(); + it != end && numSkips < kMaxNumSkipInNonRLEBlock; + ++it) { + const auto& elem = *it; + if (elem) { + // We do not need to use the actual last value for RLE when determining overflow + // point later. We can use the first value we discover when performing this + // iteration. For a RLE block to be undone and put back into the pending state all + // values need to be the same. So if a value later in this Simple8b block is + // different from this value we cannot undo all these containing a RLE. If the + // values are not all the same we will not fit 120 zeros in pending and the RLE + // block will be left as-is. + return elem; + } + ++numSkips; + } + } + // We did not find any value, so use skip as RLE. It is important that we use 'none' to + // interpret RLE blocks going forward so we can properly undo simple8b blocks containing all + // skip and RLE blocks. + return boost::none; +} + +template +OverflowResult findOverflow(const char* control, + boost::optional lastValForRLE, + Simple8bBuilder& overflowDetector) { + // Search is performed in reverse order + int index = numSimple8bBlocksForControlByte(*control) - 1; + for (; index >= 0; --index) { + // Get pointer to Simple8b block at this index position + const char* block = s8b(control, index); + + // If this is an RLE block and if the overflow detector is in RLE mode, we need to skip to + // the next non-RLE block and compare its last value against the values after RLE. + if (isRLE(block)) { + // If we are not in RLE mode then we know that overflow occurred in this RLE block, + // return its position. + if (!overflowDetector.rlePossible()) { + return {lastValForRLE, index, kInvalidIndex}; + } + + // Search for the next non-RLE block and get the last value from it. + LastNonRLEResult res = findLastNonRLE(control, index - 1); + if (res.index == kInvalidIndex) { + // We exhausted this control block without determining where the overflow point + // is. Return pending RLE index so we can continue this operation in the prior + // control block. If the value we find prior to the RLE is different, then the + // overflow happened at this 'pending RLE' index. + return {lastValForRLE, kInvalidIndex, index}; + } else if (res.lastValue == lastValForRLE) { + // Last value prior to RLE matches our RLE state after RLE. We then overflow in + // the block prior to RLE. + return {lastValForRLE, res.index, kInvalidIndex}; + } + + // Last value prior to RLE does not match our RLE state after RLE. We then overflow in + // the RLE block with the previous value set to the actual RLE value from the block + // prior to RLE. + return {res.lastValue, index, kInvalidIndex}; + } + + // Regular non-RLE block. We extract all values and append it to our overflow detector to + // see if they cause overflow. + Simple8b s8b(block, + /* one block at a time */ sizeof(uint64_t), + lastValForRLE); + boost::optional last; + + bool overflow = false; + auto writeFn = [&overflow](uint64_t block) mutable { + overflow = true; + }; + for (auto&& elem : s8b) { + last = elem; + if (elem) { + overflowDetector.append(*last, writeFn); + } else { + overflowDetector.skip(writeFn); + } + } + + // If overflow point detected, we return this index position and its calculated last value. + if (overflow) { + return {last, index, kInvalidIndex}; + } + } + + // We have depleated this control block without finding an overflow position, return invalid + // index positions. + return {lastValForRLE, kInvalidIndex, kInvalidIndex}; +} + +template +LastNonRLEResult findLastNonRLE(const char* control) { + return findLastNonRLE(control, numSimple8bBlocksForControlByte(*control) - 1); +} + +template +LastNonRLEResult findLastNonRLE(const char* control, int index) { + // Search is performed in reverse order + for (; index >= 0; --index) { + const char* block = s8b(control, index); + if (isRLE(block)) { + continue; + } + + // Non-RLE block found, calculate its last value and return. Last value for RLE is unused as + // we already know that this is not an RLE block. + uint64_t unused = simple8b::kInvalidSimple8b; + boost::optional last = simple8b::last(block, sizeof(uint64_t), unused); + + return {last, index}; + } + + return {T{}, index}; +} + +} // namespace mongo::bsoncolumn::internal diff --git a/src/mongo/bson/column/binary_reopen_test.cpp b/src/mongo/bson/column/binary_reopen_test.cpp new file mode 100644 index 00000000000..77a74eec572 --- /dev/null +++ b/src/mongo/bson/column/binary_reopen_test.cpp @@ -0,0 +1,942 @@ +/** + * Copyright (C) 2025-present MongoDB, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the Server Side Public License, version 1, + * as published by MongoDB, Inc. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * Server Side Public License for more details. + * + * You should have received a copy of the Server Side Public License + * along with this program. If not, see + * . + * + * As a special exception, the copyright holders give permission to link the + * code of portions of this program with the OpenSSL library under certain + * conditions as described in each individual source file and distribute + * linked combinations including the program with the OpenSSL library. You + * must comply with the Server Side Public License in all respects for + * all of the code used other than as permitted herein. If you modify file(s) + * with this exception, you may extend this exception to your version of the + * file(s), but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. If you delete this + * exception statement from all source files in the program, then also delete + * it in the license file. + */ + +#include "mongo/bson/column/binary_reopen.h" + +#include "mongo/bson/column/simple8b.h" +#include "mongo/bson/column/simple8b_builder.h" +#include "mongo/unittest/unittest.h" +#include "mongo/util/assert_util.h" + +namespace mongo::bsoncolumn::internal { +using V = boost::optional; + +class BinaryReopenTest : public unittest::Test { +public: + BinaryReopenTest(); + + // Generates a BSONColumn control block for a set of simple8b blocks and optionally provided + // scale factor + const char* control(std::vector blocks, + uint8_t scaleIndex = Simple8bTypeUtil::kMemoryAsInteger); + + // Simple8b block constants to be used in the tests + uint64_t block1Skip; + uint64_t block1Zero; + uint64_t block1One; + uint64_t block5Two; + uint64_t block6Skip; + uint64_t block60Skip; + uint64_t block60Zero; + uint64_t block2Zero1Skip; + uint64_t block3One1Skip; + uint64_t block3Skip1One; + uint64_t block6Skip1Two; + uint64_t blockFullOne; + uint64_t block1RLE; + uint64_t block16RLE; + +private: + // Memory for generated control blocks + std::forward_list> _ownedControls; +}; + +BinaryReopenTest::BinaryReopenTest() { + // Helper to generate a single simple8b block with the provided values. + auto generateSimple8b = [](boost::optional value, + int count, + boost::optional value2 = boost::none, + int count2 = 0) { + boost::optional block; + auto writeFn = [&](uint64_t b) mutable { + if (block) { + FAIL("Should only write one block"); + } + block = b; + }; + Simple8bBuilder builder; + for (int i = 0; i < count; i++) { + if (value) { + builder.append(*value, writeFn); + } else { + builder.skip(writeFn); + } + } + for (int i = 0; i < count2; i++) { + if (value2) { + builder.append(*value2, writeFn); + } else { + builder.skip(writeFn); + } + } + builder.flush(writeFn); + ASSERT_TRUE(block.has_value()); + return *block; + }; + + // Helper to generate a simple8b block that can fit the maximum amount of a particular value + auto generateFullSimple8b = [](boost::optional value) { + boost::optional block; + bool written = false; + auto writeFn = [&](uint64_t b) mutable { + block = b; + written = true; + }; + // We need to disable RLE, so we generate a previous value that is different from the value + // we're appending. + boost::optional different = value ? V{*value + 1} : V{0}; + // Initialize RLE with this value + Simple8bBuilder builder(different, 0); + // Append until a simple8b block has been full and written out + while (!written) { + if (value) { + builder.append(*value, writeFn); + } else { + builder.skip(writeFn); + } + } + return *block; + }; + + // Some constants used in the tests below + block1Skip = generateSimple8b(boost::none, 1); + block1Zero = generateSimple8b(0, 1); + block1One = generateSimple8b(1, 1); + block5Two = generateSimple8b(2, 5); + block6Skip = generateSimple8b(boost::none, 6); + block60Skip = generateSimple8b(boost::none, 60); + block60Zero = generateSimple8b(0, 60); + block2Zero1Skip = generateSimple8b(0, 2, boost::none, 1); + block3One1Skip = generateSimple8b(1, 3, boost::none, 1); + block3Skip1One = generateSimple8b(boost::none, 3, 1, 1); + block6Skip1Two = generateSimple8b(boost::none, 6, 2, 1); + blockFullOne = generateFullSimple8b(1); + block1RLE = simple8b_internal::kRleSelector; + block16RLE = simple8b_internal::kRleSelector | 0xF0; +} + +const char* BinaryReopenTest::control(std::vector blocks, uint8_t scaleIndex) { + // A control block contains between 1 and 16 simple8b blocks. + ASSERT_GT(blocks.size(), 0); + ASSERT_LTE(blocks.size(), 16); + + // Allocate enough memory to also fit the control byte preceding the simple8b blocks. + auto c = std::make_unique(blocks.size() * sizeof(uint64_t) + 1); + // Write control byte with out scale factor and number of simple8b blocks. + *c.get() = kControlByteForScaleIndex[scaleIndex] | (blocks.size() - 1); + // Copy simple8b data + memcpy(c.get() + 1, blocks.data(), blocks.size() * sizeof(uint64_t)); + auto ptr = c.get(); + // Store internally to simplify memory management in the tests + _ownedControls.push_front(std::move(c)); + return ptr; +} + +TEST_F(BinaryReopenTest, EstimateLastValue) { + // Block with zeros return zero + ASSERT_EQ(estimateLastValue(control({block1Zero})), V{0}); + + // Skips before a value does not affect the last value + ASSERT_EQ(estimateLastValue(control({block6Skip, block6Skip1Two})), V{2}); + + // Block ending with skips returns last non-skip value + ASSERT_EQ(estimateLastValue(control({block1Zero, block6Skip})), V{0}); + ASSERT_EQ(estimateLastValue(control({block5Two, block6Skip})), V{2}); + ASSERT_EQ(estimateLastValue(control({block3One1Skip, block6Skip})), V{1}); + + // Block ending with 60 or more skips return none even if value exists before the skips + ASSERT_EQ(estimateLastValue(control({block5Two, block60Skip})), V{boost::none}); + ASSERT_EQ(estimateLastValue(control({block5Two, block60Skip, block6Skip})), + V{boost::none}); + ASSERT_EQ(estimateLastValue(control({block5Two, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip})), + V{boost::none}); + + // Block ending with 59 or fewer skips returns last non-skip value + ASSERT_EQ(estimateLastValue(control({block5Two, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block1Skip, + block1Skip, + block1Skip, + block1Skip, + block1Skip})), + V{2}); + + // Block with skips only returns none + ASSERT_EQ(estimateLastValue(control({block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip})), + V{boost::none}); + ASSERT_EQ(estimateLastValue(control({block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block6Skip, + block1Skip, + block1Skip, + block1Skip, + block1Skip, + block1Skip})), + V{boost::none}); + ASSERT_EQ(estimateLastValue(control({block1Skip})), V{boost::none}); + + // Block with RLE returns zero regardless of what's before it + ASSERT_EQ(estimateLastValue(control({block1RLE})), V{0}); + ASSERT_EQ(estimateLastValue(control({block1One, block1RLE})), V{0}); + ASSERT_EQ(estimateLastValue(control({block1Zero, block1RLE})), V{0}); + ASSERT_EQ(estimateLastValue(control({block1Skip, block1RLE})), V{0}); +} + +TEST_F(BinaryReopenTest, FindOverflow) { + OverflowResult res; + auto findOverflowHelper = [](const char* control, V lastVal) { + Simple8bBuilder detector(lastVal, 0); + return findOverflow(control, lastVal, detector); + }; + + // Basic case of a single simple8b block with skip does not overflow + res = findOverflowHelper(control({block1Skip}), V{boost::none}); + ASSERT_EQ(res.overflowIndex, kInvalidIndex); + ASSERT_EQ(res.lastValue, V{boost::none}); // last value is unchanged when there is no overflow + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); + + // Basic case of a single simple8b block with values does not overflow + res = findOverflowHelper(control({block5Two}), V{0}); + ASSERT_EQ(res.overflowIndex, kInvalidIndex); + ASSERT_EQ(res.lastValue, V{0}); // last value is unchanged when there is no overflow + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); + + // Two blocks with identical values does not overflow if there is a block that could have all + // fit in + res = findOverflowHelper(control({block5Two, block5Two}), + V{0}); // Different value for RLE disables RLE mode + ASSERT_EQ(res.overflowIndex, kInvalidIndex); + ASSERT_EQ(res.lastValue, V{0}); // last value is unchanged when there is no overflow + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); + + // Two blocks with over 60 values that are different cannot fit in a single block so we overflow + // at index 0. + res = findOverflowHelper(control({block5Two, block60Zero}), V{0}); + ASSERT_EQ(res.overflowIndex, 0); + ASSERT_EQ(res.lastValue, V{2}); // last value in block that overflowed + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); + + // Three blocks with over 60 values that are different cannot fit in a single block so we + // overflow at index 1. + res = findOverflowHelper(control({block5Two, block5Two, block60Zero}), V{0}); + ASSERT_EQ(res.overflowIndex, 1); + ASSERT_EQ(res.lastValue, V{2}); // last value in block that overflowed + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); + + // Changing the last value does not affect the overflow point as RLE is not in play + res = findOverflowHelper(control({block5Two, block5Two, block60Zero}), V{2}); + ASSERT_EQ(res.overflowIndex, 1); + ASSERT_EQ(res.lastValue, V{2}); // last value in block that overflowed + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); + + // Without RLE we can only fit 30 '1' values in a single block, so overflow happens at index + // 1 even though values are identical + res = findOverflowHelper(control({block5Two, blockFullOne, blockFullOne}), V{0}); + ASSERT_EQ(res.overflowIndex, 1); + ASSERT_EQ(res.lastValue, V{1}); // last value in block that overflowed + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); + + // With RLE overflow happens in first block with a different value + res = findOverflowHelper(control({block5Two, blockFullOne, blockFullOne}), V{1}); + ASSERT_EQ(res.overflowIndex, 0); + ASSERT_EQ(res.lastValue, V{2}); // last value in block that overflowed + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); + + // No overflow if the values are all identical and RLE is in play + res = findOverflowHelper(control({blockFullOne, blockFullOne}), V{1}); + ASSERT_EQ(res.overflowIndex, kInvalidIndex); + ASSERT_EQ(res.lastValue, V{1}); // last value in block that overflowed + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); + + // With RLE block and all values are identical the overflow happens before the RLE block + res = findOverflowHelper(control({block5Two, block1RLE, block5Two}), V{2}); + ASSERT_EQ(res.overflowIndex, 0); + ASSERT_EQ(res.lastValue, V{2}); // last value in block that overflowed + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); + + // With RLE block and values are different before and after RLE the overflow happens at the RLE + // block + res = findOverflowHelper(control({blockFullOne, block1RLE, block5Two}), V{2}); + ASSERT_EQ(res.overflowIndex, 1); + ASSERT_EQ(res.lastValue, V{1}); // last value in block that overflowed + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); + + // With RLE block and values are different before and after RLE the overflow happens at the last + // RLE block + res = findOverflowHelper(control({blockFullOne, block16RLE, block1RLE, block5Two}), V{2}); + ASSERT_EQ(res.overflowIndex, 2); + ASSERT_EQ(res.lastValue, V{1}); // last value in block that overflowed + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); + + // Only RLE returns no overflow but pending RLE at the last RLE block + res = findOverflowHelper(control({block16RLE, block1RLE}), V{2}); + ASSERT_EQ(res.overflowIndex, kInvalidIndex); + ASSERT_EQ(res.lastValue, V{2}); // last value in block that overflowed + ASSERT_EQ(res.pendingRLEindex, 1); + + // RLE followed by non-RLE compatible with last value returns no overflow but pending RLE at + // the last RLE block + res = findOverflowHelper(control({block16RLE, block1RLE, block5Two}), V{2}); + ASSERT_EQ(res.overflowIndex, kInvalidIndex); + ASSERT_EQ(res.lastValue, V{2}); // last value in block that overflowed + ASSERT_EQ(res.pendingRLEindex, 1); + + // RLE followed by non-RLE not compatible with last value returns overflow at the + // non-RLE block + res = findOverflowHelper(control({block16RLE, block1RLE, block5Two}), V{1}); + ASSERT_EQ(res.overflowIndex, 1); + ASSERT_EQ(res.lastValue, + V{1}); // last value is left unchanged when it cannot be determined due to RLE + ASSERT_EQ(res.pendingRLEindex, kInvalidIndex); +} + +TEST_F(BinaryReopenTest, FindLastNonRLE) { + LastNonRLEResult res; + + // Single non-RLE returns index 0 and the last value in the block + res = findLastNonRLE(control({block1Zero})); + ASSERT_EQ(res.index, 0); + ASSERT_EQ(res.lastValue, V{0}); + + // Single non-RLE returns index 0 and the last value in the block + res = findLastNonRLE(control({block2Zero1Skip})); + ASSERT_EQ(res.index, 0); + ASSERT_EQ(res.lastValue, V{boost::none}); + + // Single non-RLE returns index 0 and the last value in the block + res = findLastNonRLE(control({block6Skip1Two})); + ASSERT_EQ(res.index, 0); + ASSERT_EQ(res.lastValue, V{2}); + + // Multiple non-RLE blocks returns index to last block and the last value in that block + res = findLastNonRLE(control({blockFullOne, block6Skip1Two})); + ASSERT_EQ(res.index, 1); + ASSERT_EQ(res.lastValue, V{2}); + + // RLE at the end is skipped. Position and last value to prior non-RLE block is returned + res = findLastNonRLE(control({blockFullOne, block1RLE})); + ASSERT_EQ(res.index, 0); + ASSERT_EQ(res.lastValue, V{1}); + + // RLE at the end is skipped. Position and last value to prior non-RLE block is returned + res = findLastNonRLE(control({block1RLE, blockFullOne, block16RLE, block1RLE})); + ASSERT_EQ(res.index, 1); + ASSERT_EQ(res.lastValue, V{1}); + + // Only RLE blocks returns invalid index and last value of 0 + res = findLastNonRLE(control({block1RLE})); + ASSERT_EQ(res.index, kInvalidIndex); + ASSERT_EQ(res.lastValue, V{0}); + + // Only RLE blocks returns invalid index and last value of 0 + res = findLastNonRLE(control({block16RLE, block1RLE})); + ASSERT_EQ(res.index, kInvalidIndex); + ASSERT_EQ(res.lastValue, V{0}); + + // Index parameter limits the search to before that index + res = + findLastNonRLE(control({block6Skip1Two, blockFullOne, block16RLE, block1RLE}), 0); + ASSERT_EQ(res.index, 0); + ASSERT_EQ(res.lastValue, V{2}); + + // Index parameter limits the search to before that index + res = findLastNonRLE(control({blockFullOne, block6Skip1Two, blockFullOne}), 1); + ASSERT_EQ(res.index, 1); + ASSERT_EQ(res.lastValue, V{2}); + + // Index parameter limits the search to before that index + res = findLastNonRLE( + control({blockFullOne, block6Skip1Two, block16RLE, block1RLE, blockFullOne}), 3); + ASSERT_EQ(res.index, 1); + ASSERT_EQ(res.lastValue, V{2}); +} + +TEST_F(BinaryReopenTest, Overflow) { + // Helper to run the overflow detection on the OverflowState class + auto overflowHelper = [](std::vector controls) -> OverflowPoint { + ControlBlockContainer cbs; + // Generate the control block container. We can ignore the data used for the double type + // (checked in OverflowScaled below) + for (auto&& c : controls) { + cbs.push_back({c}); + } + OverflowState overflow(cbs.back()); + return overflow.detect(cbs); + }; + + // Helper for a control block full of RLE + auto fullRLEControl = [&]() { + return control({block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE}); + }; + + std::vector controls; + + // Single control without overflow + controls = {control({block5Two})}; + OverflowPoint point = overflowHelper(controls); + ASSERT_EQ(point.control(), controls[0]); + ASSERT_FALSE(point.overflow()); + ASSERT_EQ(point.index(), kInvalidIndex); + ASSERT_EQ(point.last(), V{0}); // Last is defined as 0 when there is no overflow + ASSERT_FALSE(point.allValuesIdentical()); // this is never set unless RLE is involved + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[0]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); + + // Single control with overflow at index 0 + controls = {control({block5Two, block60Zero})}; + point = overflowHelper(controls); + ASSERT_EQ(point.control(), controls[0]); + ASSERT_TRUE(point.overflow()); + ASSERT_EQ(point.index(), 0); + ASSERT_EQ(point.last(), V{2}); // Last value in block that caused overflow + ASSERT_FALSE(point.allValuesIdentical()); // this is never set unless RLE is involved + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[0]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); + + // Two controls with overflow in the first control at the second to last index position + controls = {control({ + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + blockFullOne, + block5Two, + }), + control({block5Two})}; + point = overflowHelper(controls); + ASSERT_EQ(point.control(), controls[0]); + ASSERT_TRUE(point.overflow()); + ASSERT_EQ(point.index(), 14); + ASSERT_EQ(point.last(), V{1}); // Last value in block that caused overflow + ASSERT_FALSE(point.allValuesIdentical()); // this is never set unless RLE is involved + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[0]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 1); + + // Two controls with overflow in the first control at last index position is treated as no + // overflow with the second control returned. + controls = {control({block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + block5Two, + blockFullOne}), + control({block5Two})}; + point = overflowHelper(controls); + ASSERT_EQ(point.control(), controls[1]); + ASSERT_FALSE(point.overflow()); + ASSERT_EQ(point.index(), kInvalidIndex); + ASSERT_EQ(point.last(), V{1}); // Last value in block that caused overflow + ASSERT_FALSE(point.allValuesIdentical()); // this is never set unless RLE is involved + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[1]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); + + // Single control with RLE only returns no overflow with last value of 0 + controls = {control({block1RLE})}; + point = overflowHelper(controls); + ASSERT_EQ(point.control(), controls[0]); + ASSERT_FALSE(point.overflow()); + ASSERT_EQ(point.index(), kInvalidIndex); + ASSERT_EQ(point.last(), V{0}); // Last is defined as 0 when there is no overflow + ASSERT_TRUE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[0]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); + + // Only RLE can span more than one control which yields the same result + controls = {fullRLEControl(), control({block16RLE, block1RLE})}; + point = overflowHelper(controls); + ASSERT_EQ(point.control(), controls[0]); + ASSERT_FALSE(point.overflow()); + ASSERT_EQ(point.index(), kInvalidIndex); + ASSERT_EQ(point.last(), V{0}); // Last is defined as 0 when there is no overflow + ASSERT_TRUE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[0]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 1); + + // RLE spanning more than one control followed by blocks containing only zeros also yields the + // same result + controls = {fullRLEControl(), + fullRLEControl(), + fullRLEControl(), + control({block16RLE, block1RLE, block60Zero})}; + point = overflowHelper(controls); + ASSERT_EQ(point.control(), controls[0]); + ASSERT_FALSE(point.overflow()); + ASSERT_EQ(point.index(), kInvalidIndex); + ASSERT_EQ(point.last(), V{0}); // Last is defined as 0 when there is no overflow + ASSERT_TRUE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[0]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 3); + + // Value followed by RLE spanning more than one control is overflow at the index before the RLE + // starts + controls = {control({block5Two, + block5Two, + blockFullOne, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE}), + control({block16RLE, block1RLE})}; + point = overflowHelper(controls); + ASSERT_EQ(point.control(), controls[0]); + ASSERT_TRUE(point.overflow()); + ASSERT_EQ(point.index(), 2); + ASSERT_EQ(point.last(), V{1}); // Last value in block that caused overflow + ASSERT_TRUE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[0]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 1); + + // Value followed by RLE spanning more than one control is overflow at the index before the + // RLE starts as long as the value after RLE is the same as before RLE + controls = {control({block5Two, + block5Two, + blockFullOne, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE}), + control({block16RLE, block1RLE, block1One})}; + point = overflowHelper(controls); + ASSERT_EQ(point.control(), controls[0]); + ASSERT_TRUE(point.overflow()); + ASSERT_EQ(point.index(), 2); + ASSERT_EQ(point.last(), V{1}); // Last value in block that caused overflow + ASSERT_TRUE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[0]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 1); + + // When value before RLE is different from the value after RLE the overflow happens at the + // last RLE block + controls = {control({block5Two, + block5Two, + blockFullOne, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE}), + fullRLEControl(), + control({block16RLE, block1RLE, block5Two})}; + point = overflowHelper(controls); + ASSERT_EQ(point.control(), controls[2]); + ASSERT_TRUE(point.overflow()); + ASSERT_EQ(point.index(), 1); + ASSERT_EQ(point.last(), V{1}); // Last value in block that caused overflow + ASSERT_FALSE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[2]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); + + // When the stream starts with RLE but the value after RLE is not zero then the overflow happens + // at the last RLE block + controls = {fullRLEControl(), fullRLEControl(), control({block16RLE, block1RLE, block5Two})}; + point = overflowHelper(controls); + ASSERT_EQ(point.control(), controls[2]); + ASSERT_TRUE(point.overflow()); + ASSERT_EQ(point.index(), 1); + ASSERT_EQ(point.last(), V{0}); // Last value in block that caused overflow + ASSERT_FALSE(point.allValuesIdentical()); // this is never set unless RLE is involved + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[2]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); +} + +TEST_F(BinaryReopenTest, OverflowScaled) { + // Helper to run the overflow detection for doubles + auto overflowHelper = [](double base, + std::vector controls) -> OverflowPoint { + ControlBlockContainer cbs; + + // Every control block needs to set lastAtEndOfBlock. We calculate this based on 'base' and + // the control blocks provided. + uint64_t prevNonRLE = simple8b::kSingleZero; + auto ret = + Simple8bTypeUtil::encodeDouble(base, scaleIndexForControlByte(*controls.front())); + ASSERT_TRUE(ret.has_value()); + int64_t encoded = *ret; + + for (auto&& c : controls) { + uint8_t scaleIndex = scaleIndexForControlByte(*c); + // Doubles uses delta encoding, so we can use a sum to get the delta for the last value. + encoded += simple8b::sum( + c + 1, numSimple8bBlocksForControlByte(*c) * sizeof(uint64_t), prevNonRLE); + base = Simple8bTypeUtil::decodeDouble(encoded, scaleIndex); + cbs.push_back({c, base, scaleIndex}); + } + + OverflowState overflow(cbs.back()); + return overflow.detect(cbs); + }; + + std::vector controls; + + // Rescale after full block is reported as no overflow in the first control with a different + // scale + controls = {control({blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne}, + 0), + control({block5Two}, Simple8bTypeUtil::kMemoryAsInteger)}; + OverflowPoint point = overflowHelper(1.0, controls); + ASSERT_EQ(point.control(), controls[1]); + ASSERT_FALSE(point.overflow()); + ASSERT_EQ(point.index(), kInvalidIndex); + ASSERT_EQ(point.last(), V{1}); + ASSERT_FALSE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[1]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); + + // Same but the last value before rescale is skip + controls = {control({blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + blockFullOne, + block3One1Skip}, + 0), + control({block5Two}, Simple8bTypeUtil::kMemoryAsInteger)}; + point = overflowHelper(1.0, controls); + ASSERT_EQ(point.control(), controls[1]); + ASSERT_FALSE(point.overflow()); + ASSERT_EQ(point.index(), kInvalidIndex); + ASSERT_EQ(point.last(), V{boost::none}); + ASSERT_FALSE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[1]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); + + // Same but there are RLE before the rescale + controls = {control({blockFullOne, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE}, + 0), + control({block5Two}, Simple8bTypeUtil::kMemoryAsInteger)}; + point = overflowHelper(1.0, controls); + ASSERT_EQ(point.control(), controls[1]); + ASSERT_FALSE(point.overflow()); + ASSERT_EQ(point.index(), kInvalidIndex); + ASSERT_EQ(point.last(), V{1}); + ASSERT_FALSE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[1]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); + + // Same but there are only RLE before the rescale + controls = {control({block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE}, + 0), + control({block5Two}, Simple8bTypeUtil::kMemoryAsInteger)}; + point = overflowHelper(1.0, controls); + ASSERT_EQ(point.control(), controls[1]); + ASSERT_FALSE(point.overflow()); + ASSERT_EQ(point.index(), kInvalidIndex); + ASSERT_EQ(point.last(), V{0}); + ASSERT_FALSE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[1]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); + + // RLE can be before and after the rescale + controls = {control({blockFullOne, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE}, + 0), + control({block16RLE}, Simple8bTypeUtil::kMemoryAsInteger)}; + point = overflowHelper(1.0, controls); + ASSERT_EQ(point.control(), controls[1]); + ASSERT_FALSE(point.overflow()); + ASSERT_EQ(point.index(), kInvalidIndex); + ASSERT_EQ(point.last(), V{1}); + ASSERT_FALSE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[1]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); + + // Block before rescale is not full but it is not possible to scale the first value with scale + // factor kMemoryAsInteger with scale factor 0 so we also treat this as a no overflow but return + // the first control after the rescale. + controls = {control({blockFullOne, blockFullOne, blockFullOne}, 0), + control({block5Two}, Simple8bTypeUtil::kMemoryAsInteger)}; + point = overflowHelper(1.0, controls); + ASSERT_EQ(point.control(), controls[1]); + ASSERT_FALSE(point.overflow()); + ASSERT_EQ(point.index(), kInvalidIndex); + ASSERT_EQ(point.last(), V{1}); + ASSERT_FALSE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[1]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); + + // Last value before rescale can be scaled with the next scale factor and all values fit in + // pending without causing overflow. We then report the first control with a binary offset to + // the control byte after the scaling. + controls = {control({blockFullOne, blockFullOne, blockFullOne}, 1), control({block5Two}, 0)}; + point = overflowHelper(1.0, controls); + ASSERT_EQ(point.control(), controls[0]); + ASSERT_TRUE(point.overflow()); + ASSERT_EQ(point.index(), 2); + ASSERT_EQ(point.last(), V{1}); + ASSERT_FALSE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[1]); + ASSERT_EQ(point.lastControlOffset(), + numSimple8bBlocksForControlByte(*controls[0]) * sizeof(uint64_t) + 1); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 1); + + // Like above but we have a large amount of RLE after the rescale. The result is basically the + // same, but we report a larger offset and more values remaining. + controls = {control({blockFullOne, blockFullOne, blockFullOne}, 1), + control({block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE, + block16RLE}, + 0), + control({block16RLE, block16RLE, blockFullOne}, 0)}; + point = overflowHelper(1.0, controls); + ASSERT_EQ(point.control(), controls[0]); + ASSERT_TRUE(point.overflow()); + ASSERT_EQ(point.index(), 2); + ASSERT_EQ(point.last(), V{1}); + ASSERT_FALSE(point.allValuesIdentical()); // Values are not identical even if we have a large + // amont of RLE because the scaling is different + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[2]); + ASSERT_EQ(point.lastControlOffset(), + (numSimple8bBlocksForControlByte(*controls[0]) + kMaxNumSimple8bPerControl) * + sizeof(uint64_t) + + 2); // binary offset to the third control byte + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 2); + + // Same but with RLE on both sides of the scaling + controls = {control({blockFullOne, blockFullOne, block16RLE}, 1), control({block1RLE}, 0)}; + point = overflowHelper(1.0, controls); + ASSERT_EQ(point.control(), controls[0]); + ASSERT_TRUE(point.overflow()); + ASSERT_EQ(point.index(), 2); + ASSERT_EQ(point.last(), V{1}); + ASSERT_FALSE(point.allValuesIdentical()); // Values are not identical even if we have a large + // amont of RLE because the scaling is different + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[1]); + ASSERT_EQ(point.lastControlOffset(), + numSimple8bBlocksForControlByte(*controls[0]) * sizeof(uint64_t) + + 1); // binary offset to the second control byte + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 1); + + // Last value before rescale can be scaled with the next scale factor but all values cannot fit + // in pending without causing overflow. This case is also treated as no overflow + controls = {control({blockFullOne, blockFullOne, block5Two}, 1), + control({blockFullOne, block1One}, 0)}; + point = overflowHelper(1.0, controls); + ASSERT_EQ(point.control(), controls[1]); + ASSERT_FALSE(point.overflow()); + ASSERT_EQ(point.index(), kInvalidIndex); + ASSERT_EQ(point.last(), V{2}); + ASSERT_FALSE(point.allValuesIdentical()); + ASSERT_EQ(point.lastControl(), (uint8_t)*controls[1]); + ASSERT_EQ(point.lastControlOffset(), 0); + ASSERT_EQ(std::distance(point.remaining().begin(), point.remaining().end()), 0); +} +} // namespace mongo::bsoncolumn::internal diff --git a/src/mongo/bson/column/bsoncolumn_builder_fuzzer.cpp b/src/mongo/bson/column/bsoncolumn_builder_fuzzer.cpp index 9de45a95d39..11dd01f9a1d 100644 --- a/src/mongo/bson/column/bsoncolumn_builder_fuzzer.cpp +++ b/src/mongo/bson/column/bsoncolumn_builder_fuzzer.cpp @@ -102,13 +102,10 @@ extern "C" int LLVMFuzzerTestOneInput(const char* Data, size_t Size) { << base64::encode(diff.data(), diff.size())); // Verify binary reopen gives identical state as intermediate - // TODO SERVER-100659: Uncomment this after reopen bug is fixed - /* BSONColumnBuilder reopen(diff.data(), diff.size()); invariant(builder.isInternalStateIdentical(reopen), str::stream() << "Binary reopen does not yield equivalent state. Column: " << base64::encode(diff.data(), diff.size())); - */ return 0; } diff --git a/src/mongo/bson/column/bsoncolumn_decompress_fuzzer.cpp b/src/mongo/bson/column/bsoncolumn_decompress_fuzzer.cpp index 3eb10a5212f..56e704ba42a 100644 --- a/src/mongo/bson/column/bsoncolumn_decompress_fuzzer.cpp +++ b/src/mongo/bson/column/bsoncolumn_decompress_fuzzer.cpp @@ -30,42 +30,10 @@ #include "mongo/bson/bson_validate.h" #include "mongo/bson/bsonelement.h" #include "mongo/bson/column/bsoncolumn.h" +#include "mongo/bson/column/bsoncolumnbuilder.h" #include "mongo/bson/util/bsonobj_traversal.h" #include "mongo/util/base64.h" -// Returns true if the binary contains interleaved data. This function just scans the binary for an -// interleaved start control byte, it does no validation nor decompression. -static bool isDataInterleaved(const char* binary, size_t size) { - using namespace mongo; - const char* pos = binary; - const char* end = binary + size; - - while (pos != end) { - uint8_t control = *pos; - if (control == stdx::to_underlying(BSONType::eoo)) { - // Reached the end of the binary. - return false; - } - - if (bsoncolumn::isInterleavedStartControlByte(control)) { - return true; - } - - if (bsoncolumn::isUncompressedLiteralControlByte(control)) { - // Scan over the entire literal. - BSONElement literal(pos, 1, BSONElement::TrustedInitTag{}); - pos += literal.size(); - continue; - } - - // If there are no control bytes, scan over the simple8b block. - uint8_t size = bsoncolumn::numSimple8bBlocksForControlByte(control) * sizeof(uint64_t); - pos += size + 1; - } - - return false; -}; - // There are two decoding APIs. For all data that pass validation, both decoder implementations // must produce the same results. extern "C" int LLVMFuzzerTestOneInput(const char* Data, size_t Size) { @@ -84,6 +52,7 @@ extern "C" int LLVMFuzzerTestOneInput(const char* Data, size_t Size) { std::vector blockBasedElems = {}; std::string blockBasedError; std::string iteratorError; + std::string reopenError; // Attempt to decompress using the block-based API. try { @@ -102,14 +71,23 @@ extern "C" int LLVMFuzzerTestOneInput(const char* Data, size_t Size) { iteratorError = e.toString(); } - // If one API failed, then both APIs must fail. - if (!iteratorError.empty() || !blockBasedError.empty()) { - invariant(!(iteratorError.empty() || blockBasedError.empty()), + // Attempt to reopen using the reopen API. + try { + BSONColumnBuilder(Data, Size); + } catch (const DBException& e) { + reopenError = e.toString(); + } + + // If one API failed, then all APIs must fail. + if (!iteratorError.empty() || !blockBasedError.empty() || !reopenError.empty()) { + invariant(!(iteratorError.empty() || blockBasedError.empty() || reopenError.empty()), str::stream() << "For the input: " << base64::encode(StringData(Data, Size)) << ". Iterator API returned " << (iteratorError.empty() ? "results" : iteratorError) << ". The block based API returned " - << (blockBasedError.empty() ? "results" : blockBasedError)); + << (blockBasedError.empty() ? "results" : blockBasedError) + << ". The reopen API returned " + << (reopenError.empty() ? "results" : reopenError)); return 0; } diff --git a/src/mongo/bson/column/bsoncolumn_test.cpp b/src/mongo/bson/column/bsoncolumn_test.cpp index cde21c2c8b1..4edd0d1197c 100644 --- a/src/mongo/bson/column/bsoncolumn_test.cpp +++ b/src/mongo/bson/column/bsoncolumn_test.cpp @@ -1331,7 +1331,7 @@ TEST_F(BSONColumnTest, BuilderFuzzerReopenDiscoveredEdgeCases) { // std::vector binariesBase64 = { // Pending fix of SERVER-100659 - // "gPz/////////CAAAgP7/////////AQAAAAAAAAAAYI/OxcXFxcXFAQ4AAAAAAAAB7uLi4uLi4gAuHR0dHR2dAI5xcXFxcXEAjnFxcXFxcQCOcXFxcXFxAK6rq6urq2sAzri4uLi4OADOuLi4uLg4AM64uLi4uDgAzri4uLi4OADOuLi4uLg4AM64uLi4uDgAzri4uLi4OADOuLi4uLg4AI9ulpaWlpY2AG5cXFxcXBwAblxcXFxcHABuXFxcXFwcAG5cXFxcXBwAblxcXFxcHABuXFxcXFwcAG5cXFxcXBwAblxcXFxcHABuXFxcXFwcAG5cXFxcXBwAblxcXFxcHABuXFxcXFwcAG5cXFxcXBwAblxcXFxcHABuXFxcXFwcAI9uXFxcXFwcAG5cXFxcXBwA7gsMDAwMHAAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAI8uLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAI8uLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAK6wr6+vrwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAI8uFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAIYuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAAA="_sd, + "gPz/////////CAAAgP7/////////AQAAAAAAAAAAYI/OxcXFxcXFAQ4AAAAAAAAB7uLi4uLi4gAuHR0dHR2dAI5xcXFxcXEAjnFxcXFxcQCOcXFxcXFxAK6rq6urq2sAzri4uLi4OADOuLi4uLg4AM64uLi4uDgAzri4uLi4OADOuLi4uLg4AM64uLi4uDgAzri4uLi4OADOuLi4uLg4AI9ulpaWlpY2AG5cXFxcXBwAblxcXFxcHABuXFxcXFwcAG5cXFxcXBwAblxcXFxcHABuXFxcXFwcAG5cXFxcXBwAblxcXFxcHABuXFxcXFwcAG5cXFxcXBwAblxcXFxcHABuXFxcXFwcAG5cXFxcXBwAblxcXFxcHABuXFxcXFwcAI9uXFxcXFwcAG5cXFxcXBwA7gsMDAwMHAAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAI8uLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAI8uLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAC4uLi4uLg4ALi4uLi4uDgAuLi4uLi4OAK6wr6+vrwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAI8uFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAIYuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAC4XFxcXFwcALhcXFxcXBwAuFxcXFxcHAAA="_sd, }; for (auto&& binaryBase64 : binariesBase64) { diff --git a/src/mongo/bson/column/bsoncolumn_util.h b/src/mongo/bson/column/bsoncolumn_util.h index 8a0f44dc8e6..1ec9c1551da 100644 --- a/src/mongo/bson/column/bsoncolumn_util.h +++ b/src/mongo/bson/column/bsoncolumn_util.h @@ -40,6 +40,9 @@ static constexpr char kInterleavedStartControlByte = (char)0xF1; static constexpr char kInterleavedStartArrayRootControlByte = (char)0xF2; static constexpr uint8_t kInvalidScaleIndex = 0xFF; static constexpr uint8_t kInvalidControlByte = 0xFE; +static constexpr uint8_t kMaxNumSimple8bPerControl = 16; +static constexpr std::array + kControlByteForScaleIndex = {0x90, 0xA0, 0xB0, 0xC0, 0xD0, 0x80}; inline bool isUncompressedLiteralControlByte(uint8_t control) { return (control & 0xE0) == 0 || control == (uint8_t)stdx::to_underlying(BSONType::minKey) || diff --git a/src/mongo/bson/column/bsoncolumnbuilder.cpp b/src/mongo/bson/column/bsoncolumnbuilder.cpp index a8ec45ab816..d9675d67975 100644 --- a/src/mongo/bson/column/bsoncolumnbuilder.cpp +++ b/src/mongo/bson/column/bsoncolumnbuilder.cpp @@ -31,6 +31,7 @@ #include "mongo/base/string_data.h" #include "mongo/bson/bsonobjbuilder.h" +#include "mongo/bson/column/binary_reopen.h" #include "mongo/bson/column/bsoncolumn.h" #include "mongo/bson/column/bsoncolumn_util.h" #include "mongo/bson/column/simple8b.h" @@ -59,16 +60,12 @@ namespace mongo { using namespace bsoncolumn; namespace { -static constexpr uint8_t kMaxCount = 16; static constexpr uint8_t kCountMask = 0x0F; static constexpr uint8_t kControlMask = 0xF0; static constexpr std::ptrdiff_t kNoSimple8bControl = -1; static constexpr int kFinalizedOffset = -1; static constexpr size_t kDefaultBufferSize = 32; -static constexpr std::array - kControlByteForScaleIndex = {0x90, 0xA0, 0xB0, 0xC0, 0xD0, 0x80}; - template ptrdiff_t incrementSimple8bCount(allocator_aware::BufBuilder& buffer, ptrdiff_t& controlByteOffset, @@ -102,7 +99,7 @@ ptrdiff_t incrementSimple8bCount(allocator_aware::BufBuilder& buffer, // Write back new count and clear offset if we have reached max count *byte = control | (count & kCountMask); - if (count + 1 == kMaxCount) { + if (count + 1 == kMaxNumSimple8bPerControl) { auto prevControlByteOffset = controlByteOffset; controlByteOffset = kNoSimple8bControl; return prevControlByteOffset; @@ -386,6 +383,14 @@ void copyObjToBuffer(const BSONObj& obj, allocator_aware::SharedBuffer class BSONColumnBuilder::BinaryReopen { public: @@ -395,10 +400,9 @@ public: * BSONColumn::last(). We need this to leave 'previous' in the compressor correct to be able * to calculate deltas for future values. * - * 2. Remember the last two simple8b control blocks with their additional state from the - * decompressor. This is as far as we need to go back to be able to undo a previous - * 'BSONColumnBuilder::finalize()' call. The goal of this constructor is to leave this - * BSONColumnBuilder in an identical state as-if finalize() had never been called. + * 2. Record all control blocks since the last uncompressed element encountered during the scan. + * This is later used by reopen() to undo a previous 'BSONColumnBuilder::finalize() or + * BSONColumnBuilder::intermediate()' call. * * Returns 'false' if interleaved mode is encountered which is not supported in this * implementation. Full decompression+recompression must be done in this case. @@ -406,9 +410,11 @@ public: bool scan(const char* binary, int size); /* - * Initializes the provided BSONColumnBuilder from the state obtained from a previous scan. - * Effectively undos the 'finalize()' call from the BSONColumnBuilder used to produce this - * binary. + * Initializes the provided BSONColumnBuilder from the state obtained from a previous call to + * scan(). Effectively undos the 'finalize()' call from the BSONColumnBuilder used to produce + * this binary. + * + * scan() must be called before this function. */ void reopen(BSONColumnBuilder& builder, const Allocator&) const; @@ -421,7 +427,7 @@ private: allocator_aware::BufBuilder& buffer, int& offset, uint8_t& lastControl, - uint8_t& lastControlOffset) const; + uint16_t& lastControlOffset) const; void _reopen128BitTypes(EncodingState& regular, Encoder128& encoder, allocator_aware::BufBuilder& buffer, @@ -429,69 +435,50 @@ private: uint8_t& lastControl) const; /* - * Setup RLE state for Simple8bBuilder used to detect overflow. Returns the value needed to use - * as last for any Simple8b decoding while reopening. + * Writes the data to the internal buffer as needed to be maintained based on the calculated + * overflow point */ template - static boost::optional _setupRLEForOverflowDetector(Simple8bBuilder& overflowDetector, - const char* s8bBlock, - int index); - /* - * Appends data into a Simple8bBuilder used to detect overflow. Returns the index of the - * simple8b block that caused the overflow and sets the proper RLE state in the provided main - * Simple8bBuilder to be the last value in the block that caused the overflow. This function - * expects 'overflow' to be set to true when an overflow has occured. - * The second return value is an index to an RLE block if we have not overflowed yet. - */ - template - static std::pair _appendUntilOverflow(Simple8bBuilder& overflowDetector, - Simple8bBuilder& mainBuilder, - bool& overflow, - const boost::optional& lastValForRLE, - const char* s8bBlock, - int index); - /* - * Special case of _appendUntilOverflow when we know that the last simple8b block is RLE. It is - * trivial to calculate the overflow point as it will be inside the first discovered non-RLE - * block and the last value for RLE will be the actual value used for RLE. - */ - template - static std::pair, int> _appendUntilOverflowForRLE( - Simple8bBuilder& mainBuilder, - bool& overflow, - const char* s8bBlock, - int index); + void _writeBuffer(const bsoncolumn::internal::OverflowPoint& op, + allocator_aware::BufBuilder& buffer, + std::ptrdiff_t& controlByteOffset, + uint8_t scaleIndex) const; - struct ControlBlock { - const char* control = nullptr; - double lastAtEndOfBlock = 0.0; - uint8_t scaleIndex = 5; // reinterpret memory as integer - }; + /* + * Writes pending values after the overflow point to the internal encoder. + */ + template + void _writePendingValues(const bsoncolumn::internal::OverflowPoint& op, + allocator_aware::BufBuilder& buffer, + BSONType type, + Encoder& encoder, + std::ptrdiff_t& controlByteOffset) const; + bsoncolumn::internal::ControlBlockContainer controls; + BSONElement lastUncompressed; const char* scannedBinary; BSONColumn::Iterator::DecodingState state; - BSONElement lastUncompressed; int64_t lastUncompressedEncoded64; int128_t lastUncompressedEncoded128; bool lastLiteralUnencodable = false; - ControlBlock current; - ControlBlock last; }; template bool BSONColumnBuilder::BinaryReopen::scan(const char* binary, int size) { - // Attempt to initialize the compressor from the provided binary, we have a fallback of full - // decompress+recompress if anything unsupported is detected. This allows us to "support" the - // full BSONColumn spec. + // Scan the BSONColumn binary and collect all encountered control blocks without performing + // decompression. scannedBinary = binary; const char* pos = binary; const char* end = binary + size; + double lastAtEndOfBlock = 0.0; // Last encountered non-RLE block during binary scan uint64_t lastNonRLE = simple8b::kSingleZero; int128_t lastNonZeroDeltaForUnencodable{0}; + // Scan the entire binary. pos is always positioned at a control block. while (pos != end) { + // Dereference control block to get control byte. uint8_t control = *pos; // Stop at end terminal @@ -514,25 +501,27 @@ bool BSONColumnBuilder::BinaryReopen::scan(const char* binary, int si return false; } - // Remember last control byte - last = current; - + // Uncompressed literal detected. We record its state and clear our collected control blocks + // as we will never need to reopen past this point. if (isUncompressedLiteralControlByte(control)) { BSONElement element(pos, 1, BSONElement::TrustedInitTag{}); + + // Previous control blocks are no longer needed. Reopen only needs to consider data + // after the last uncompressed literal. + controls.clear(); state.loadUncompressed(element); // Uncompressed literal case lastUncompressed = element; lastNonRLE = simple8b::kSingleZero; - current.control = nullptr; - last.control = nullptr; lastLiteralUnencodable = false; + // Calculate the initial state for this literal. if (!uses128bit(lastUncompressed.type())) { auto& d64 = std::get(state.decoder); lastUncompressedEncoded64 = d64.lastEncodedValue; if (element.type() == BSONType::numberDouble) { - current.lastAtEndOfBlock = lastUncompressed._numberDouble(); + lastAtEndOfBlock = lastUncompressed._numberDouble(); } } else { auto& d128 = @@ -549,6 +538,7 @@ bool BSONColumnBuilder::BinaryReopen::scan(const char* binary, int si } } + // Advance iteration past this uncompressed literal to the next control block pos += element.size(); continue; } @@ -570,17 +560,19 @@ bool BSONColumnBuilder::BinaryReopen::scan(const char* binary, int si // For doubles we need to remember the last value from the previous block (as // the scaling can change between blocks). if (lastUncompressed.type() == BSONType::numberDouble) { - auto encoded = - Simple8bTypeUtil::encodeDouble(current.lastAtEndOfBlock, d64.scaleIndex); + auto encoded = Simple8bTypeUtil::encodeDouble(lastAtEndOfBlock, d64.scaleIndex); uassert(8288101, "Invalid double encoding in BSON Column", encoded); d64.lastEncodedValue = *encoded; } if (usesDeltaOfDelta(lastUncompressed.type())) { + // Delta-of-delta is encoded with prefix sum d64.lastEncodedValueForDeltaOfDelta = expandDelta(d64.lastEncodedValueForDeltaOfDelta, simple8b::prefixSum( pos + 1, blocksSize, d64.lastEncodedValue, lastNonRLE)); } else if (onlyZeroDelta(lastUncompressed.type())) { + // For types without value, we make sure that the simple8b blocks only contain 0 or + // skip simple8b::visitAll( pos + 1, blocksSize, @@ -590,16 +582,20 @@ bool BSONColumnBuilder::BinaryReopen::scan(const char* binary, int si }, []() {}); } else { + // Remaining types are using delta, which is calculated using a regular sum d64.lastEncodedValue = expandDelta( d64.lastEncodedValue, simple8b::sum(pos + 1, blocksSize, lastNonRLE)); + // For the double type we also need to decode this last value as the last double of + // the previous block needs to be maintained. if (lastUncompressed.type() == BSONType::numberDouble) { - current.lastAtEndOfBlock = + lastAtEndOfBlock = Simple8bTypeUtil::decodeDouble(d64.lastEncodedValue, d64.scaleIndex); } } - current.scaleIndex = d64.scaleIndex; + // Record this control block + controls.push_back({pos, lastAtEndOfBlock, d64.scaleIndex}); } else { uassert(8827801, "Invalid control byte in BSON Column", @@ -617,6 +613,8 @@ bool BSONColumnBuilder::BinaryReopen::scan(const char* binary, int si }; if (zeroDeltaOnly()) { + // For types without value, we make sure that the simple8b blocks only contain 0 or + // skip simple8b::visitAll( pos + 1, blocksSize, @@ -629,6 +627,8 @@ bool BSONColumnBuilder::BinaryReopen::scan(const char* binary, int si auto& d128 = std::get(state.decoder); if (!lastLiteralUnencodable) { + // Normal case where we can encode our last literal we calculate the delta using + // sum. d128.lastEncodedValue = expandDelta(d128.lastEncodedValue, simple8b::sum(pos + 1, blocksSize, lastNonRLE)); @@ -649,12 +649,14 @@ bool BSONColumnBuilder::BinaryReopen::scan(const char* binary, int si []() {}); } } + // Record this control block + controls.push_back({pos, lastAtEndOfBlock, Simple8bTypeUtil::kMemoryAsInteger}); } - // Remember control block and advance the position to next - current.control = pos; + // Advance the position to next control block pos += blocksSize + 1; } + // Throw if we reach the end without the expected sentinel. uasserted(8288102, "Unexpected end of BSONColumn binary"); } @@ -664,10 +666,12 @@ void BSONColumnBuilder::BinaryReopen::reopen(BSONColumnBuilder& build auto& regular = std::get(builder._is.state); // When the binary ends with an uncompressed element it is simple to re-initialize the // compressor - if (!current.control) { + if (controls.empty()) { auto& encoder = std::get(regular._encoder); // Set last double in previous block (if any). - encoder.lastValueInPrevBlock = last.lastAtEndOfBlock; + encoder.lastValueInPrevBlock = lastUncompressed.type() == BSONType::numberDouble + ? lastUncompressed._numberDouble() + : 0.0; // Append the last element to finish setting up the compressor builder.append(lastUncompressed); @@ -680,8 +684,9 @@ void BSONColumnBuilder::BinaryReopen::reopen(BSONColumnBuilder& build } if (!uses128bit(lastUncompressed.type())) { + // Initialize the internal encoder for 64 bit types and perform the reopen. auto& encoder = std::get(regular._encoder); - encoder.scaleIndex = current.scaleIndex; + encoder.scaleIndex = controls.back().scaleIndex; _reopen64BitTypes(regular, encoder, @@ -690,6 +695,7 @@ void BSONColumnBuilder::BinaryReopen::reopen(BSONColumnBuilder& build builder._is.lastControl, builder._is.lastControlOffset); } else { + // Initialize the internal encoder for 128 bit types and perform the reopen. auto& encoder = regular._encoder.template emplace(allocator); _reopen128BitTypes( regular, encoder, builder._bufBuilder, builder._is.offset, builder._is.lastControl); @@ -705,251 +711,32 @@ void BSONColumnBuilder::BinaryReopen::_reopen64BitTypes( allocator_aware::BufBuilder& buffer, int& offset, uint8_t& lastControl, - uint8_t& lastControlOffset) const { - // The main difficulty with re-initializing the compressor from a compressed binary is - // to undo the 'finalize()' call where pending values are flushed out to simple8b - // blocks. We need to undo this operation by putting values back into the pending state. - // The algorithm to perform this is to start from the end and add the values to a dummy - // Simple8bBuilder and discover when this becomes full and writes out a simple8b block. - // We will call this the 'overflow' point and all values in subsequent blocks in the - // binary can be put back in the pending state. + uint16_t& lastControlOffset) const { + + using namespace bsoncolumn::internal; + + // 1. Calculate overflow point. Anything before the overflow needs to be written to the buffer + // and anything after is put back into the pending state. + OverflowState overflow(controls.back()); + const OverflowPoint& op = overflow.detect(controls); + + // 2. Write buffer based on our overflow point. + offset = op.control() - scannedBinary; + _writeBuffer(op, buffer, regular._controlByteOffset, encoder.scaleIndex); + + // 3. Add remaining values as pending BSONType type = lastUncompressed.type(); - const char* control = current.control; - const char* extraS8b = nullptr; - bool overflow = false; - Simple8bBuilder s8bBuilder; + _writePendingValues(op, buffer, type, encoder, regular._controlByteOffset); - // Calculate how many simple8b blocks this control byte contains - auto currNumBlocks = numSimple8bBlocksForControlByte(*control); - - // First setup RLE state, the implementation for doing this differ if the last block actually - // ends with RLE or not. - const char* lastBlock = control + (sizeof(uint64_t) * (currNumBlocks - 1)) + 1; - bool rle = (ConstDataView(lastBlock).read>() & - simple8b_internal::kBaseSelectorMask) == simple8b_internal::kRleSelector; - - boost::optional lastForS8b; - // Current overflow point - int currIndex; - // Pending RLE block in current control when overflow has not happened yet. - int pendingRle = -1; - if (rle) { - // If the last block ends with RLE we just need to look for the last non-RLE block to - // discover the overflow point. The last value for RLE will be the actual last in this block - // as we know the RLE will follow. - std::tie(lastForS8b, currIndex) = _appendUntilOverflowForRLE( - encoder.simple8bBuilder, overflow, control, currNumBlocks - 2); - - } else { - // Assume that the last value in Simple8b blocks is the same as the one before the first. - // This assumption will hold if all values are equal and RLE is eligible. If it turns out to - // be incorrect the Simple8bBuilder will internally reset and disregard RLE. - lastForS8b = _setupRLEForOverflowDetector(s8bBuilder, control, currNumBlocks - 1); - - // When RLE is setup we append as many values as we can to detect when we overflow - std::tie(currIndex, pendingRle) = _appendUntilOverflow( - s8bBuilder, encoder.simple8bBuilder, overflow, lastForS8b, control, currNumBlocks - 1); - } - - // If we have pending RLE but no more control blocks to consider then set last for RLE to 0 as - // the binary begins with RLE. - if (!overflow && !last.control && pendingRle != -1) { - lastForS8b = 0; - } - - // If we have not yet overflowed then continue the same operation from the previous - // simple8b block - if (!overflow && last.control) { - auto blocks = numSimple8bBlocksForControlByte(*last.control); - // Append values from control block to detect overflow. If the scale indices are - // different we can skip this as we know we will not find a useful overflow point - // here. - int overflowIndex; - // Flag to back out of processing last control if we determined that overflow happened in - // RLE in current. - bool resumeCurrent = false; - if (current.scaleIndex == last.scaleIndex) { - if (rle) { - std::tie(lastForS8b, overflowIndex) = _appendUntilOverflowForRLE( - encoder.simple8bBuilder, overflow, last.control, blocks - 1); - } else if (pendingRle != -1) { - // Pending RLE block from current control we need to find overflow where we had our - // overflow. - auto [lastForRLE, rleIndexOverflow] = _appendUntilOverflowForRLE( - encoder.simple8bBuilder, overflow, last.control, blocks - 1); - if (lastForRLE == lastForS8b) { - // Last value prior to RLE matches our RLE state after RLE. We then overflow in - // the block prior to RLE. - overflowIndex = rleIndexOverflow; - } else { - // Values to not match, so the overflow happened in the pending RLE block. - currIndex = pendingRle; - resumeCurrent = true; - } - } else { - std::tie(overflowIndex, pendingRle) = _appendUntilOverflow(s8bBuilder, - encoder.simple8bBuilder, - overflow, - lastForS8b, - last.control, - blocks - 1); - } - } else { - overflowIndex = blocks - 1; - // Because we did not yet overflow we need to set last value in our simple8b - // builder to the last value in previous block to be able to resume with RLE. - Simple8b s8b(last.control + - /* offset to block at index */ overflowIndex * - sizeof(uint64_t) + - /* control byte*/ 1, - /* one block at a time */ sizeof(uint64_t)); - for (auto&& elem : s8b) { - lastForS8b = elem; - } - - // Re-calculate if we will overflow on the current control block using this lastForRLE. - Simple8bBuilder s8bBuilder; - s8bBuilder.setLastForRLE(lastForS8b); - // We're not intrested where the overflow happens, just if it happens or not. This - // function will set the 'overflow' variable. - _appendUntilOverflow(s8bBuilder, - encoder.simple8bBuilder, - overflow, - lastForS8b, - control, - currNumBlocks - 1); - // Enforce the lastForRle. - encoder.simple8bBuilder.setLastForRLE(lastForS8b); - } - - if (!resumeCurrent) { - // Check if we overflowed in the first simple8b in this second control block. We can - // then disregard this control block and proceed as-if we didn't overflow in the - // first as there's nothing to re-write in the second control block. - if (overflowIndex == blocks - 1) { - // If the previous control block was not full, and we scaled then we need to - // determine if we should consider the overflow happening in this block or not. This - // can happen for the double type where we might not fill the control block with - // values due to scaling. - // - // To determine if we overflowed here due to the new value being incompatible with - // the previous scale factor, we will check if the first value can be re-scaled into - // the new scale factor. If we cannot rescale then we need to proceed as if the last - // control block didn't exist as it will never be possible to undo it. - // - // We could have also scaled down because the types of values have changed where the - // larger scale factor is no longer needed. This can be completely undone if all - // these values still fit in pending, then this entire control block with the new - // scale factor need to be reversed. - if (blocks != 16 && current.scaleIndex != last.scaleIndex) { - // Encode last using new scale factor - auto encoded = - Simple8bTypeUtil::encodeDouble(last.lastAtEndOfBlock, current.scaleIndex); - Simple8b rescale( - control + 1, currNumBlocks * sizeof(uint64_t), lastForS8b); - // Initialize if scaling is possible on the overflow state of the current - // control block. - bool discardCurrent = !overflow; - // See if next value can be scaled using the old scale factor - for (auto&& elem : rescale) { - if (elem) { - // See if this value is possible to scale using the old scale factor - encoded = expandDelta(*encoded, Simple8bTypeUtil::decodeInt64(*elem)); - if (!Simple8bTypeUtil::encodeDouble( - Simple8bTypeUtil::decodeDouble(*encoded, current.scaleIndex), - last.scaleIndex)) { - // Not possible to scale this value using the last scale factor. The - // previous control block will then never be needed and we need to - // make sure that we do not discard the current control block. - discardCurrent = false; - } - } - break; - } - - if (discardCurrent) { - // We need to discard the current control block. Treat this as a special - // overflow where we append the necessary overflow data from the last - // control block but the state is marked as no overflow. We will then append - // all remaining values and the state will be setup accordingly - lastControlOffset = sizeof(uint64_t) * blocks + 1; - - buffer.appendBuf(last.control, lastControlOffset); - - // offset will temporarily set to a negative value to compensate for the - // buffer we wrote above even when there's no overflow. Later on we will add - // a larger value which will make it positive again. - offset -= lastControlOffset; - - regular._controlByteOffset = 0; - } - } - - overflow = false; - } else { - // If overflow happens later, we switch to this control byte as our new - // 'current'. The previous current is remembered so we can add its values to - // pending later. - extraS8b = control; - control = last.control; - currNumBlocks = blocks; - currIndex = overflowIndex; - } - } - } - - if (!overflow) { - // No overflow, discard entire buffer and record the offset up to this control byte. We will - // then add everything in this control as pending which might write a control block again - // because the values are now added in the correct order. - offset += control - scannedBinary; - } else { - // Overflow, copy everything from the control byte up to the overflow point - buffer.appendBuf(control, 1 + (currIndex + 1) * sizeof(uint64_t)); - - // Set binary offset to this control byte (the binary starts with it, see the copy above) - regular._controlByteOffset = 0; - offset = control - scannedBinary; - - // Update count inside last control byte - char* lastControlToUpdate = buffer.buf() + regular._controlByteOffset; - *lastControlToUpdate = - kControlByteForScaleIndex[encoder.scaleIndex] | (currIndex & kCountMask); - } - - // Append remaining values from our current control block and add all from the next - // block if needed - auto appendPending = [&](const Simple8b& s8b) { - for (auto&& elem : s8b) { - if (elem) { - encoder.append( - type, *elem, buffer, regular._controlByteOffset, NoopControlBlockWriter{}); - } else { - encoder.skip(type, buffer, regular._controlByteOffset, NoopControlBlockWriter{}); - } - } - }; - - // Append all our pending values - appendPending(Simple8b(control + sizeof(uint64_t) * (currIndex + 1) + 1, - (currNumBlocks - currIndex - 1) * sizeof(uint64_t), - lastForS8b)); - - if (extraS8b) { - appendPending( - Simple8b(extraS8b + 1, - numSimple8bBlocksForControlByte(*extraS8b) * sizeof(uint64_t), - lastForS8b)); - } - - // Reset last value if RLE is not possible due to the values appended above - encoder.simple8bBuilder.resetLastForRLEIfNeeded(); - - // Finally we need to set the necessary state to calculate deltas for future inserts. We - // can take this from our decompressor state. + // 4. Set up the encoder so we can calculate deltas for future appends into this + // BSONColumnBuilder. Some of this data can be taken from our decoder that was setup in the + // scan() function. auto& d64 = std::get(state.decoder); - // Hacky way to get an allocator to be able to materialize the last value. + // 5. Store last uncompressed value. We need to materialize it from the encoded state, this + // requires an allocator that we can take from a fresh BSONColum instance. This memory is + // temporary and _storePrevious will copy it again using the correct allocator used by this + // BSONColumnBuilder. auto allocator = BSONColumn(nullptr, 1).release(); bool deltaOfDelta = usesDeltaOfDelta(type); regular._storePrevious([&]() { @@ -966,8 +753,8 @@ void BSONColumnBuilder::BinaryReopen::_reopen64BitTypes( return d64.materialize(*allocator, lastUncompressed, ""_sd); }()); - // _prevEncoded64 is just set for a few types. We don't use Encoder64::initialize() as it - // overwrites more members already set by this function. + // 6. Store the previous encoded state, this is typically a copy from the decoder. We cannot use + // Encoder64::initialize() as it overwrites more members already set by this reopen procedure. if (deltaOfDelta) { if (type == BSONType::oid) { encoder.prevEncoded64 = d64.lastEncodedValueForDeltaOfDelta; @@ -977,16 +764,19 @@ void BSONColumnBuilder::BinaryReopen::_reopen64BitTypes( if (type == BSONType::numberDouble) { encoder.prevEncoded64 = d64.lastEncodedValue; - // Calculate last double in previous block by reversing the final pending state and - // final delta. + // 7. For the double type we also have to calculate the last double from the last block + // written to the buffer. We calculate this by going backwards from our last unencoded + // value and subtract deltas from our pending state. This can be skipped if we know that + // all values are identical with all 0 deltas or skip as that doesn't affect the delta. auto current = encoder.prevEncoded64; - for (auto it = encoder.simple8bBuilder.rbegin(), end = encoder.simple8bBuilder.rend(); - it != end; - ++it) { - if (const boost::optional& encoded = *it) { - // As we're going backwards we need to 'expandDelta' backwards which is the same - // as 'calcDelta'. - current = calcDelta(current, Simple8bTypeUtil::decodeInt64(*encoded)); + if (!op.allValuesIdentical() || op.last().value_or(0) != 0) { + for (auto it = encoder.simple8bBuilder.rbegin(), + end = encoder.simple8bBuilder.rend(); + it != end; + ++it) { + if (const boost::optional& encoded = *it) { + current = calcDelta(current, Simple8bTypeUtil::decodeInt64(*encoded)); + } } } @@ -995,16 +785,25 @@ void BSONColumnBuilder::BinaryReopen::_reopen64BitTypes( } } + // 8. Final adjustments to offset and control bytes. When appending values to pending in step + // (3) simple8b blocks can be flushed out because our overflow point wasn't calculated correctly + // because we used an estimate for the last value rather than the actual last. Likewise, the + // scale factor might not be correctly set. if (regular._controlByteOffset == kNoSimple8bControl) { - // Appending pending values can flush out the control byte and leave all remaining values as - // pending. We can discard our buffer in this case as this is equivalent to overflowing in - // the last simple8b of the 'last' control block. + // Simple8b block was flushed out during append of pending and it caused the control block + // to be full. This needs to be treated as a no-overflow which we do by increasing our + // binary offset and discard the buffer. offset += buffer.len(); buffer.setlen(0); } else { + // Ensure that the correct scale index is written to the last control byte. + char* lastControlToUpdate = buffer.buf() + regular._controlByteOffset; + *lastControlToUpdate = + kControlByteForScaleIndex[op.scaleIndex()] | (*lastControlToUpdate & kCountMask); // Set last control to current - lastControl = *control; + lastControl = op.lastControl(); } + lastControlOffset = op.lastControlOffset(); } template @@ -1014,165 +813,30 @@ void BSONColumnBuilder::BinaryReopen::_reopen128BitTypes( allocator_aware::BufBuilder& buffer, int& offset, uint8_t& lastControl) const { - // The main difficulty with re-initializing the compressor from a compressed binary is - // to undo the 'finalize()' call where pending values are flushed out to simple8b - // blocks. We need to undo this operation by putting values back into the pending state. - // The algorithm to perform this is to start from the end and add the values to a dummy - // Simple8bBuilder and discover when this becomes full and writes out a simple8b block. - // We will call this the 'overflow' point and all values in subsequent blocks in the - // binary can be put back in the pending state. - const char* control = current.control; - const char* extraS8b = nullptr; - bool overflow = false; - Simple8bBuilder s8bBuilder; - // Calculate how many simple8b blocks this control byte contains - auto currNumBlocks = numSimple8bBlocksForControlByte(*control); + using namespace bsoncolumn::internal; - // First setup RLE state, the implementation for doing this differ if the last block actually - // ends with RLE or not. - const char* lastBlock = control + (sizeof(uint64_t) * (currNumBlocks - 1)) + 1; - bool rle = (ConstDataView(lastBlock).read>() & - simple8b_internal::kBaseSelectorMask) == simple8b_internal::kRleSelector; + // 1. Calculate overflow point. Anything before the overflow needs to be written to the buffer + // and anything after is put back into the pending state. + OverflowState overflow(controls.back()); + const OverflowPoint& op = overflow.detect(controls); - boost::optional lastForS8b; - int currIndex; - int pendingRle = -1; - if (rle) { - // If the last block ends with RLE we just need to look for the last non-RLE block to - // discover the overflow point. The last value for RLE will be the actual last in this block - // as we know the RLE will follow. - std::tie(lastForS8b, currIndex) = _appendUntilOverflowForRLE( - encoder.simple8bBuilder, overflow, control, currNumBlocks - 2); + // 2. Write buffer based on our overflow point. + offset = op.control() - scannedBinary; + _writeBuffer(op, buffer, regular._controlByteOffset, Simple8bTypeUtil::kMemoryAsInteger); - } else { - // Assume that the last value in Simple8b blocks is the same as the one before the first. - // This assumption will hold if all values are equal and RLE is eligible. If it turns out to - // be incorrect the Simple8bBuilder will internally reset and disregard RLE. - lastForS8b = _setupRLEForOverflowDetector(s8bBuilder, control, currNumBlocks - 1); + // 3. Add remaining values as pending + _writePendingValues(op, buffer, lastUncompressed.type(), encoder, regular._controlByteOffset); - // When RLE is setup we append as many values as we can to detect when we overflow - std::tie(currIndex, pendingRle) = _appendUntilOverflow( - s8bBuilder, encoder.simple8bBuilder, overflow, lastForS8b, control, currNumBlocks - 1); - } - - // If we have pending RLE but no more control blocks to consider then set last for RLE to 0 as - // the binary begins with RLE. - if (!overflow && !last.control && pendingRle != -1) { - lastForS8b = 0; - } - - // If we have not yet overflowed then continue the same operation from the previous - // simple8b block - if (!overflow && last.control) { - - auto blocks = numSimple8bBlocksForControlByte(*last.control); - // Append values from control block to detect overflow. - int overflowIndex; - // Flag to back out of processing last control if we determined that overflow happened in - // RLE in current. - bool resumeCurrent = false; - if (rle) { - std::tie(lastForS8b, overflowIndex) = _appendUntilOverflowForRLE( - encoder.simple8bBuilder, overflow, last.control, blocks - 1); - } else if (pendingRle != -1) { - // Pending RLE block from current control we need to find overflow where we had our - // overflow. - auto [lastForRLE, rleIndexOverflow] = _appendUntilOverflowForRLE( - encoder.simple8bBuilder, overflow, last.control, blocks - 1); - if (lastForRLE == lastForS8b) { - // Last value prior to RLE matches our RLE state after RLE. We then overflow in - // the block prior to RLE. - overflowIndex = rleIndexOverflow; - } else { - // Values to not match, so the overflow happened in the pending RLE block. - currIndex = pendingRle; - resumeCurrent = true; - } - } else { - std::tie(overflowIndex, pendingRle) = _appendUntilOverflow(s8bBuilder, - encoder.simple8bBuilder, - overflow, - lastForS8b, - last.control, - blocks - 1); - } - - if (!resumeCurrent) { - // Check if we overflowed in the first simple8b in this second control block. We can - // then disregard this control block and proceed as-if we didn't overflow in the - // first as there's nothing to re-write in the second control block. - if (overflowIndex == blocks - 1) { - overflow = false; - } else { - // If overflow happens later, we switch to this control byte as our new - // 'current'. The previous current is remembered so we can add its values to - // pending later. - extraS8b = control; - control = last.control; - currNumBlocks = blocks; - currIndex = overflowIndex; - } - } - } - - if (!overflow) { - // No overflow, discard entire buffer and record the offset up to this control byte. We will - // then add everything in this control as pending which might write a control block again - // because the values are now added in the correct order. - offset = control - scannedBinary; - } else { - // Overflow, copy everything from the control byte up to the overflow point - buffer.appendBuf(control, 1 + (currIndex + 1) * sizeof(uint64_t)); - - // Set binary offset to this control byte (the binary starts with it, see the copy above) - regular._controlByteOffset = 0; - offset = control - scannedBinary; - - // Update count inside last control byte - char* lastControlToUpdate = buffer.buf() + regular._controlByteOffset; - *lastControlToUpdate = kControlByteForScaleIndex[Simple8bTypeUtil::kMemoryAsInteger] | - (currIndex & kCountMask); - } - - // Append remaining values from our current control block and add all from the next - // block if needed - auto appendPending = [&](const Simple8b& s8b) { - for (auto&& elem : s8b) { - if (elem) { - encoder.append(lastUncompressed.type(), - *elem, - buffer, - regular._controlByteOffset, - NoopControlBlockWriter{}); - } else { - encoder.skip(lastUncompressed.type(), - buffer, - regular._controlByteOffset, - NoopControlBlockWriter{}); - } - } - }; - - appendPending(Simple8b(control + sizeof(uint64_t) * (currIndex + 1) + 1, - (currNumBlocks - currIndex - 1) * sizeof(uint64_t), - lastForS8b)); - - if (extraS8b) { - appendPending( - Simple8b(extraS8b + 1, - numSimple8bBlocksForControlByte(*extraS8b) * sizeof(uint64_t), - lastForS8b)); - } - - // Reset last value if RLE is not possible due to the values appended above - encoder.simple8bBuilder.resetLastForRLEIfNeeded(); - - // Finally we need to set the necessary state to calculate deltas for future inserts. We - // can take this from our decompressor state. + // 4. Set up the encoder so we can calculate deltas for future appends into this + // BSONColumnBuilder. Some of this data can be taken from our decoder that was setup in the + // scan() function. auto& d128 = std::get(state.decoder); - // Hacky way to get an allocator to be able to materialize the last value. + // 5. Store last uncompressed value. We need to materialize it from the encoded state, this + // requires an allocator that we can take from a fresh BSONColum instance. This memory is + // temporary and _storePrevious will copy it again using the correct allocator used by this + // BSONColumnBuilder. auto allocator = BSONColumn(nullptr, 1).release(); regular._storePrevious([&]() { // Zero delta is repeat of last uncompressed literal, avoid materialization (which might @@ -1185,166 +849,119 @@ void BSONColumnBuilder::BinaryReopen::_reopen128BitTypes( } return d128.materialize(*allocator, lastUncompressed, ""_sd); }()); + // 6. Initialize our encoder with the previous value. encoder.initialize(regular._previous()); + // 7. Final adjustments to offset and control bytes. When appending values to pending in step + // (3) simple8b blocks can be flushed out because our overflow point wasn't calculated correctly + // because we used an estimate for the last value rather than the actual last. Likewise, the + // scale factor might not be correctly set. if (regular._controlByteOffset == kNoSimple8bControl) { - // Appending pending values can flush out the control byte and leave all remaining values as - // pending. We can discard our buffer in this case as this is equivalent to overflowing in - // the last simple8b of the 'last' control block. + // Simple8b block was flushed out during append of pending and it caused the control block + // to be full. This needs to be treated as a no-overflow which we do by increasing our + // binary offset and discard the buffer. offset += buffer.len(); buffer.setlen(0); } else { // Set last control to current - lastControl = *control; + lastControl = op.lastControl(); } } template template -boost::optional BSONColumnBuilder::BinaryReopen::_setupRLEForOverflowDetector( - Simple8bBuilder& overflowDetector, const char* s8bBlock, int index) { - // Limit the search for a non-skip value. If we go above 60 without overflow then we consider - // skip to be the last value for RLE as it would be the only one eligible for RLE. - constexpr int kMaxNumSkipInNonRLEBlock = 60; - for (int numSkips = 0; index >= 0 && numSkips < kMaxNumSkipInNonRLEBlock; --index) { - const char* block = s8bBlock + sizeof(uint64_t) * index + 1; - bool rle = (ConstDataView(block).read>() & - simple8b_internal::kBaseSelectorMask) == simple8b_internal::kRleSelector; - // Abort this operation when an RLE block is found, they are handled in a separate code - // path. - if (rle) { - break; - } - Simple8b s8b(block, sizeof(uint64_t)); - for (auto it = s8b.begin(), end = s8b.end(); - it != end && numSkips < kMaxNumSkipInNonRLEBlock; - ++it) { - const auto& elem = *it; - if (elem) { - // We do not need to use the actual last value for RLE when determining overflow - // point later. We can use the first value we discover when performing this - // iteration. For a RLE block to be undone and put back into the pending state all - // values need to be the same. So if a value later in this Simple8b block is - // different from this value we cannot undo all these containing a RLE. If the - // values are not all the same we will not fit 120 zeros in pending and the RLE - // block will be left as-is. - overflowDetector.setLastForRLE(elem); - return elem; - } - ++numSkips; - } +void BSONColumnBuilder::BinaryReopen::_writeBuffer( + const bsoncolumn::internal::OverflowPoint& op, + allocator_aware::BufBuilder& buffer, + std::ptrdiff_t& controlByteOffset, + uint8_t scaleIndex) const { + + // Nothing to write to the buffer if we don't have overflow + if (!op.overflow()) { + return; } - // We did not find any value, so use skip as RLE. It is important that we use 'none' to - // interpret RLE blocks going forward so we can properly undo simple8b blocks containing all - // skip and RLE blocks. - overflowDetector.setLastForRLE(boost::none); - return boost::none; + + // Copy everything from the control byte up to the overflow point + buffer.appendBuf(op.control(), + /* include simple block with overflow */ (op.index() + 1) * sizeof(uint64_t) + + /* one extra byte for the control */ 1); + + // Set binary offset to this control byte, the binary starts with it. + controlByteOffset = 0; + + // Write count inside the control byte + char* lastControlToUpdate = buffer.buf() + controlByteOffset; + *lastControlToUpdate = kControlByteForScaleIndex[scaleIndex] | (op.index() & kCountMask); } template -template -std::pair BSONColumnBuilder::BinaryReopen::_appendUntilOverflow( - Simple8bBuilder& overflowDetector, - Simple8bBuilder& mainBuilder, - bool& overflow, - const boost::optional& lastValForRLE, - const char* s8bBlock, - int index) { - auto writeFn = [&overflow](uint64_t block) mutable { - overflow = true; - }; - for (; index >= 0; --index) { - const char* block = s8bBlock + - /* offset to block at index */ index * sizeof(uint64_t) + - /* control byte*/ - 1; - bool rle = (ConstDataView(block).read>() & - simple8b_internal::kBaseSelectorMask) == simple8b_internal::kRleSelector; - if (rle) { - // RLE detected, we need to continue to detect overflow if the overflow detector is in a - // state where RLE is possible. Depending on if the last value before the RLE block - // matches or current last we overflowed in this RLE block or in the first non-RLE block - // prior. - if (overflowDetector.rlePossible()) { - auto [lastForRLE, rleIndexOverflow] = - _appendUntilOverflowForRLE(mainBuilder, overflow, s8bBlock, index - 1); - if (lastForRLE == lastValForRLE) { - // Last value prior to RLE matches our RLE state after RLE. We then overflow in - // the block prior to RLE. - return std::pair(rleIndexOverflow, -1); - } else if (rleIndexOverflow == -1) { - // We exhausted this control block without determining where the overflow point - // is. Return pending RLE index so we can continue this operation in the prior - // control block. - return std::pair(-1, index); +template +void BSONColumnBuilder::BinaryReopen::_writePendingValues( + const bsoncolumn::internal::OverflowPoint& op, + allocator_aware::BufBuilder& buffer, + BSONType type, + Encoder& encoder, + std::ptrdiff_t& controlByteOffset) const { + using namespace bsoncolumn::internal; + // First we need to initialize the RLE state for our internal simple8b builder, this is simply + // taken from the calculated last value. If we know that all values are identical, we can invoke + // a fast-path to count number of elements stored and initialize the RLE count with this. + size_t num = 0; + if (op.allValuesIdentical()) { + // Calculate number of elements after the overflow point. + num = simple8b::count(s8b(op.control(), op.index() + 1), + (numSimple8bBlocksForControlByte(*op.control()) - op.index() - 1) * + sizeof(uint64_t)); + // Add all remaining elements to our count. + for (auto&& cb : op.remaining()) { + num += simple8b::count(s8b(cb.control, 0), + numSimple8bBlocksForControlByte(*cb.control) * sizeof(uint64_t)); + } + } + + // Initialize the builder with our RLE state and optional count. + encoder.simple8bBuilder = + Simple8bBuilder(op.last(), num, encoder.simple8bBuilder.allocator()); + + // If all values are not identical, we append them now in order to our simple8b builder. This + // may trigger simple8b blocks to be written out if the overflow point was inaccuratley + // calculated to be too early because we used an estimated last. + if (!op.allValuesIdentical()) { + // Helper function to perform the append. It extracts all simple8b values and appends them + // to our builder. Returning the last observed value. + auto append = [&](const char* block, + size_t size, + const boost::optional& last) -> boost::optional { + Simple8b decompressor(block, size, last); + auto it = decompressor.begin(); + auto end = decompressor.end(); + for (; it != end; ++it) { + auto&& elem = *it; + if (elem) { + encoder.append( + type, *elem, buffer, controlByteOffset, NoopControlBlockWriter{}); + } else { + encoder.skip(type, buffer, controlByteOffset, NoopControlBlockWriter{}); } - } else { - // Got RLE block when we're not in a state to do RLE, then we're guaranteed to - // overflow in that RLE. No need to actually append the data, just set the overflow - // flag and return. - overflow = true; } + // It is safe to dereference iterator when we've reached end. + return *it; + }; - // Overflow inside the RLE block, we're done. - break; - } + auto last = op.last(); + // Append all values after the overflow point + last = append(s8b(op.control(), op.index() + 1), + (numSimple8bBlocksForControlByte(*op.control()) - op.index() - 1) * + sizeof(uint64_t), + last); - Simple8b s8b(block, - /* one block at a time */ sizeof(uint64_t), - lastValForRLE); - boost::optional last; - for (auto&& elem : s8b) { - last = elem; - if (elem) { - overflowDetector.append(*last, writeFn); - } else { - overflowDetector.skip(writeFn); - } - } - - if (overflow) { - // Overflow point detected, record the last value in last Simple8b block - // before our pending values. This is necessary to be able to resume with - // RLE. - mainBuilder.setLastForRLE(last); - break; + // Append all values from the remaining blocks. + for (auto&& cb : op.remaining()) { + last = append(s8b(cb.control, 0), + numSimple8bBlocksForControlByte(*cb.control) * sizeof(uint64_t), + last); } } - return std::pair(index, -1); -} - -template -template -std::pair, int> -BSONColumnBuilder::BinaryReopen::_appendUntilOverflowForRLE( - Simple8bBuilder& mainBuilder, bool& overflow, const char* s8bBlock, int index) { - for (; index >= 0; --index) { - const char* block = s8bBlock + - /* offset to block at index */ index * sizeof(uint64_t) + - /* control byte*/ 1; - bool rle = (ConstDataView(block).read>() & - simple8b_internal::kBaseSelectorMask) == simple8b_internal::kRleSelector; - - if (rle) { - continue; - } - - Simple8b s8b(block, sizeof(uint64_t), T{}); - - boost::optional last; - for (auto&& elem : s8b) { - last = elem; - } - - // Overflow point detected, record the last value in last Simple8b block - // before our pending values. This is necessary to be able to resume with - // RLE. - mainBuilder.setLastForRLE(last); - overflow = true; - return std::make_pair(last, index); - } - - return std::make_pair(T{}, index); } template @@ -1616,9 +1233,29 @@ typename BSONColumnBuilder::BinaryDiff BSONColumnBuilder:: newState.offset += controlOffset; newState.lastBufLength = length - controlOffset; + // Helper to calculate the offset to the last control byte from an existing known control + // byte. This is needed when the stream ends with a completely full control byte as the + // BSONColumnBuilder stops tracking full control bytes. + auto calculateLastControlOffset = [&](ptrdiff_t offset) { + ptrdiff_t next = 0; + while (_bufBuilder.len() > offset + next + /* null terminator */ 1) { + offset += next; + uint8_t controlByte = *(_bufBuilder.buf() + offset); + next = numSimple8bBlocksForControlByte(controlByte) * sizeof(uint64_t) + + /* control byte */ 1; + } + return offset; + }; // Extract new control byte from finalized state ptrdiff_t finalizedControlOffset = - visit(OverloadedVisitor{[](const typename InternalState::Regular& regular) { + visit(OverloadedVisitor{[&](const typename InternalState::Regular& regular) { + if (regular._controlByteOffset == kNoSimple8bControl) { + // Calculate last control byte from last known. We don't + // know if finalize() above just filled this control + // byte or if more control bytes also got created and + // filled. + return calculateLastControlOffset(controlOffset); + } return regular._controlByteOffset; }, [](const typename InternalState::Interleaved&) { @@ -2309,51 +1946,6 @@ void EncodingState::_initializeFromPrevious(const Allocator& allocato } } -template -template -ptrdiff_t EncodingState::_incrementSimple8bCount( - allocator_aware::BufBuilder& buffer, F controlBlockWriter) { - char* byte; - uint8_t count; - uint8_t scaleIndex = Simple8bTypeUtil::kMemoryAsInteger; - if (auto encoder = std::get_if(&_encoder)) { - scaleIndex = encoder->scaleIndex; - } - uint8_t control = kControlByteForScaleIndex[scaleIndex]; - - if (_controlByteOffset == kNoSimple8bControl) { - // Allocate new control byte if we don't already have one. Record its offset so we can find - // it even if the underlying buffer reallocates. - byte = buffer.skip(1); - _controlByteOffset = std::distance(buffer.buf(), byte); - count = 0; - } else { - // Read current count from previous control byte - byte = buffer.buf() + _controlByteOffset; - - // If previous byte was written with a different control byte then we can't re-use and need - // to start a new one - if ((*byte & kControlMask) != control) { - controlBlockWriter(_controlByteOffset, buffer.len() - _controlByteOffset); - - _controlByteOffset = kNoSimple8bControl; - _incrementSimple8bCount(buffer, controlBlockWriter); - return kNoSimple8bControl; - } - count = (*byte & kCountMask) + 1; - } - - // Write back new count and clear offset if we have reached max count - *byte = control | (count & kCountMask); - if (count + 1 == kMaxCount) { - auto prevControlByteOffset = _controlByteOffset; - _controlByteOffset = kNoSimple8bControl; - return prevControlByteOffset; - } - - return kNoSimple8bControl; -} - template template void EncodingState::Simple8bBlockWriter128::operator()(uint64_t block) { diff --git a/src/mongo/bson/column/bsoncolumnbuilder.h b/src/mongo/bson/column/bsoncolumnbuilder.h index e3ad0797b6e..63e5dc4e85e 100644 --- a/src/mongo/bson/column/bsoncolumnbuilder.h +++ b/src/mongo/bson/column/bsoncolumnbuilder.h @@ -269,9 +269,6 @@ struct EncodingState { F controlBlockWriter, const Allocator&); void _initializeFromPrevious(const Allocator&); - template - ptrdiff_t _incrementSimple8bCount(allocator_aware::BufBuilder& buffer, - F controlBlockWriter); // Encoders for 64bit and 128bit types. std::variant _encoder; @@ -521,7 +518,7 @@ private: int lastBufLength = 0; // Finalized state of last control byte written out by the previous intermediate() call. uint8_t lastControl; - uint8_t lastControlOffset = 0; + uint16_t lastControlOffset = 0; }; // Internal helper to perform reopen/initialization of this class from a BSONColumn binary. diff --git a/src/mongo/bson/column/simple8b.h b/src/mongo/bson/column/simple8b.h index d12fbb4be44..daa8e8c143f 100644 --- a/src/mongo/bson/column/simple8b.h +++ b/src/mongo/bson/column/simple8b.h @@ -352,6 +352,9 @@ static constexpr uint64_t kSingleSkip = 0xFFFFFFFFFFFFFFFE; // Constant for a simple8b block containing a single zero value. static constexpr uint64_t kSingleZero = 0xE; +// Constant for an invalid simple8b block, trying to read this will throw. +static constexpr uint64_t kInvalidSimple8b = 0; + /** * Visits all values in sequence with provided callbacks * visit - a callback for receiving all non-missing values (including 0) @@ -382,6 +385,13 @@ MONGO_COMPILER_ALWAYS_INLINE_GCC14 inline size_t visitAll(const char* buffer, */ inline size_t count(const char* buffer, size_t size); +/** + * Returns the last value (can be missing) over multiple Simple8b blocks. If called with unsigned T + * it returns the encoded value in this slot. If called with signed T it returns the decoded value. + */ +template +boost::optional last(const char* buffer, size_t size, uint64_t& prevNonRLE); + /** * Calculates the sum for multiple simple8b blocks in a buffer. 'prevNonRLE' should be initialized * to 'kSingleSkip' when calculating sum for the first buffer. If the caller needs sum from multiple diff --git a/src/mongo/bson/column/simple8b.inl b/src/mongo/bson/column/simple8b.inl index 87ae27a293b..1aaf83eae7b 100644 --- a/src/mongo/bson/column/simple8b.inl +++ b/src/mongo/bson/column/simple8b.inl @@ -103,12 +103,20 @@ struct SimpleDecoder { } // Returns value of last slot. 'kMissing' is returned for missing. - static int64_t last(uint64_t encoded) { + static int64_t lastDecoded(uint64_t encoded) { encoded >>= (bits * (iters - 1)); if (encoded == mask) return kMissing; return Simple8bTypeUtil::decodeInt64(encoded); } + + // Returns value of last slot. 'kMissing' is returned for missing. + static uint64_t lastEncoded(uint64_t encoded) { + encoded >>= (bits * (iters - 1)); + if (encoded == mask) + return kMissing; + return encoded; + } }; // Table-based decoder that uses a lookup table for decoding unsigned integers into signed. Suitable @@ -212,7 +220,7 @@ struct TableDecoder { } // Returns value of last slot. 'kMissing' is returned for missing. - int64_t last(uint64_t encoded) const { + int64_t lastDecoded(uint64_t encoded) const { encoded >>= (bits * (iters - 1)); const auto& entry = table[encoded]; if (!entry.num) { @@ -220,6 +228,15 @@ struct TableDecoder { } return entry.decoded; } + + uint64_t lastEncoded(uint64_t encoded) const { + encoded >>= (bits * (iters - 1)); + const auto& entry = table[encoded]; + if (!entry.num) { + return kMissing; + } + return encoded; + } }; // Table-based decoder that uses a lookup table for decoding multiple unsigned integers into signed @@ -510,7 +527,7 @@ struct ExtendedDecoder { // Returns value of last slot. 'kMissing' is returned for missing. template - T last(uint64_t encoded) const { + T lastDecoded(uint64_t encoded) const { encoded >>= (bits * (iters - 1)); if ((encoded & mask) == mask) return kMissing; @@ -525,6 +542,24 @@ struct ExtendedDecoder { return Simple8bTypeUtil::decodeInt(value << numZeroes); } + + // Returns value of last slot. 'kMissing' is returned for missing. + template + T lastEncoded(uint64_t encoded) const { + encoded >>= (bits * (iters - 1)); + if ((encoded & mask) == mask) + return kMissing; + + uint64_t count = encoded & countMask; + T value = (encoded >> countBits) & valueMask; + auto numZeroes = count * countScale; + // UBSAN will complain if shift values are greater than bit length + if constexpr (std::is_same::value) { + numZeroes %= 64; + } + + return value << numZeroes; + } }; // Storage for all decoders that we need for our various selector types @@ -672,55 +707,57 @@ T decodeLastSlotIgnoreSkip(uint64_t encoded) { case 15: break; default: + uasserted(10065906, "Bad selector"); break; } return 0; } template -T decodeLastSlot(uint64_t encoded) { +T lastDecoded(uint64_t encoded) { auto selector = encoded & simple8b_internal::kBaseSelectorMask; encoded >>= 4; switch (selector) { case 1: + // Encoded and decoded value is the same for the 1 bit case return decoder1.last(encoded); case 2: - return decoder2.last(encoded); + return decoder2.lastDecoded(encoded); case 3: - return decoder3.last(encoded); + return decoder3.lastDecoded(encoded); case 4: - return decoder4.last(encoded); + return decoder4.lastDecoded(encoded); case 5: - return decoder5.last(encoded); + return decoder5.lastDecoded(encoded); case 6: - return decoder6.last(encoded); + return decoder6.lastDecoded(encoded); case 7: { auto extended = encoded & simple8b_internal::kBaseSelectorMask; encoded >>= 4; switch (extended) { case 0: - return decoder7.last(encoded); + return decoder7.lastDecoded(encoded); case 1: - return decoderExtended7_1.last(encoded); + return decoderExtended7_1.lastDecoded(encoded); case 2: - return decoderExtended7_2.last(encoded); + return decoderExtended7_2.lastDecoded(encoded); case 3: - return decoderExtended7_3.last(encoded); + return decoderExtended7_3.lastDecoded(encoded); case 4: - return decoderExtended7_4.last(encoded); + return decoderExtended7_4.lastDecoded(encoded); case 5: - return decoderExtended7_5.last(encoded); + return decoderExtended7_5.lastDecoded(encoded); case 6: - return decoderExtended7_6.last(encoded); + return decoderExtended7_6.lastDecoded(encoded); case 7: - return decoderExtended7_7.last(encoded); + return decoderExtended7_7.lastDecoded(encoded); case 8: - return decoderExtended7_8.last(encoded); + return decoderExtended7_8.lastDecoded(encoded); case 9: - return decoderExtended7_9.last(encoded); + return decoderExtended7_9.lastDecoded(encoded); default: - invariant(false); // invalid encoding + uasserted(10065900, "Bad extended selector"); break; } break; @@ -730,54 +767,163 @@ T decodeLastSlot(uint64_t encoded) { encoded >>= 4; switch (extended) { case 0: - return decoder8.last(encoded); + return decoder8.lastDecoded(encoded); case 1: - return decoderExtended8_1.last(encoded); + return decoderExtended8_1.lastDecoded(encoded); case 2: - return decoderExtended8_2.last(encoded); + return decoderExtended8_2.lastDecoded(encoded); case 3: - return decoderExtended8_3.last(encoded); + return decoderExtended8_3.lastDecoded(encoded); case 4: - return decoderExtended8_4.last(encoded); + return decoderExtended8_4.lastDecoded(encoded); case 5: - return decoderExtended8_5.last(encoded); + return decoderExtended8_5.lastDecoded(encoded); case 6: - return decoderExtended8_6.last(encoded); + return decoderExtended8_6.lastDecoded(encoded); case 7: - return decoderExtended8_7.last(encoded); + return decoderExtended8_7.lastDecoded(encoded); case 8: - return decoderExtended8_8.last(encoded); + return decoderExtended8_8.lastDecoded(encoded); case 9: - return decoderExtended8_9.last(encoded); + return decoderExtended8_9.lastDecoded(encoded); case 10: - return decoderExtended8_10.last(encoded); + return decoderExtended8_10.lastDecoded(encoded); case 11: - return decoderExtended8_11.last(encoded); + return decoderExtended8_11.lastDecoded(encoded); case 12: - return decoderExtended8_12.last(encoded); + return decoderExtended8_12.lastDecoded(encoded); case 13: - return decoderExtended8_13.last(encoded); + return decoderExtended8_13.lastDecoded(encoded); default: - invariant(false); // invalid encoding + uasserted(10065901, "Bad extended selector"); break; } break; } case 9: - return decoder10.last(encoded); + return decoder10.lastDecoded(encoded); case 10: - return decoder12.last(encoded); + return decoder12.lastDecoded(encoded); case 11: - return decoder15.last(encoded); + return decoder15.lastDecoded(encoded); case 12: - return decoder20.last(encoded); + return decoder20.lastDecoded(encoded); case 13: - return decoder30.last(encoded); + return decoder30.lastDecoded(encoded); case 14: - return decoder60.last(encoded); + return decoder60.lastDecoded(encoded); case 15: break; default: + uasserted(10065905, "Bad selector"); + break; + } + return 0; +} + +template +T lastEncoded(uint64_t encoded) { + auto selector = encoded & simple8b_internal::kBaseSelectorMask; + encoded >>= 4; + switch (selector) { + case 1: + // Encoded and decoded value is the same for the 1 bit case + return decoder1.last(encoded); + case 2: + return decoder2.lastEncoded(encoded); + case 3: + return decoder3.lastEncoded(encoded); + case 4: + return decoder4.lastEncoded(encoded); + case 5: + return decoder5.lastEncoded(encoded); + case 6: + return decoder6.lastEncoded(encoded); + case 7: { + + auto extended = encoded & simple8b_internal::kBaseSelectorMask; + encoded >>= 4; + switch (extended) { + case 0: + return decoder7.lastEncoded(encoded); + case 1: + return decoderExtended7_1.lastEncoded(encoded); + case 2: + return decoderExtended7_2.lastEncoded(encoded); + case 3: + return decoderExtended7_3.lastEncoded(encoded); + case 4: + return decoderExtended7_4.lastEncoded(encoded); + case 5: + return decoderExtended7_5.lastEncoded(encoded); + case 6: + return decoderExtended7_6.lastEncoded(encoded); + case 7: + return decoderExtended7_7.lastEncoded(encoded); + case 8: + return decoderExtended7_8.lastEncoded(encoded); + case 9: + return decoderExtended7_9.lastEncoded(encoded); + default: + uasserted(10065902, "Bad extended selector"); + break; + } + break; + } + case 8: { + auto extended = encoded & simple8b_internal::kBaseSelectorMask; + encoded >>= 4; + switch (extended) { + case 0: + return decoder8.lastEncoded(encoded); + case 1: + return decoderExtended8_1.lastEncoded(encoded); + case 2: + return decoderExtended8_2.lastEncoded(encoded); + case 3: + return decoderExtended8_3.lastEncoded(encoded); + case 4: + return decoderExtended8_4.lastEncoded(encoded); + case 5: + return decoderExtended8_5.lastEncoded(encoded); + case 6: + return decoderExtended8_6.lastEncoded(encoded); + case 7: + return decoderExtended8_7.lastEncoded(encoded); + case 8: + return decoderExtended8_8.lastEncoded(encoded); + case 9: + return decoderExtended8_9.lastEncoded(encoded); + case 10: + return decoderExtended8_10.lastEncoded(encoded); + case 11: + return decoderExtended8_11.lastEncoded(encoded); + case 12: + return decoderExtended8_12.lastEncoded(encoded); + case 13: + return decoderExtended8_13.lastEncoded(encoded); + default: + uasserted(10065903, "Bad extended selector"); + break; + } + break; + } + case 9: + return decoder10.lastEncoded(encoded); + case 10: + return decoder12.lastEncoded(encoded); + case 11: + return decoder15.lastEncoded(encoded); + case 12: + return decoder20.lastEncoded(encoded); + case 13: + return decoder30.lastEncoded(encoded); + case 14: + return decoder60.lastEncoded(encoded); + case 15: + break; + default: + uasserted(10065904, "Bad selector"); break; } return 0; @@ -925,7 +1071,7 @@ MONGO_COMPILER_ALWAYS_INLINE_GCC14 inline size_t decodeAndVisit(uint64_t encoded return decoder60.visitAll(encoded, visit, visitZero, visitMissing); break; case simple8b_internal::kRleSelector: { - const T lastValue = decodeLastSlot(*prevNonRLE); + const T lastValue = lastDecoded(*prevNonRLE); size_t count = ((encoded & 0xf) + 1) * simple8b_internal::kRleMultiplier; if (lastValue == kMissing) { for (size_t i = 0; i < count; ++i) { @@ -1168,7 +1314,7 @@ T decodeAndPrefixSum(uint64_t encoded, T& prefix, uint64_t* prevNonRLE) { case 14: return decoder60.prefixSum(encoded, prefix); case simple8b_internal::kRleSelector: { - T last = decodeLastSlot(*prevNonRLE); + T last = lastDecoded(*prevNonRLE); if (last == kMissing) return 0; @@ -1229,6 +1375,29 @@ inline size_t count(const char* buffer, size_t size) { return numElements; } +template +boost::optional last(const char* buffer, size_t size, uint64_t& prevNonRLE) { + invariant(size % 8 == 0); + const char* end = buffer + size; + while (buffer != end) { + uint64_t encoded = ConstDataView(buffer).read>(); + auto selector = encoded & simple8b_internal::kBaseSelectorMask; + if (selector != simple8b_internal::kRleSelector) { + prevNonRLE = encoded; + } + + buffer += sizeof(uint64_t); + } + + if constexpr (std::is_same_v || std::is_same_v) { + T encoded = lastEncoded(prevNonRLE); + return encoded == kMissing ? boost::optional{} : boost::optional{encoded}; + } else { + T decoded = lastDecoded(prevNonRLE); + return decoded == kMissing ? boost::optional{} : boost::optional{decoded}; + } +} + template T sum(const char* buffer, size_t size, uint64_t& prevNonRLE) { invariant(size % 8 == 0); diff --git a/src/mongo/bson/column/simple8b_builder.h b/src/mongo/bson/column/simple8b_builder.h index 9946693bb65..9c121f0e524 100644 --- a/src/mongo/bson/column/simple8b_builder.h +++ b/src/mongo/bson/column/simple8b_builder.h @@ -71,6 +71,7 @@ public: // Callback to handle writing of finalized Simple-8b blocks. Machine Endian byte order, the // value need to be converted to Little Endian before persisting. explicit Simple8bBuilder(Allocator = {}); + Simple8bBuilder(boost::optional val, int64_t num, Allocator = {}); ~Simple8bBuilder(); Simple8bBuilder(const Simple8bBuilder&) = default; @@ -79,6 +80,20 @@ public: Simple8bBuilder& operator=(const Simple8bBuilder&) = default; Simple8bBuilder& operator=(Simple8bBuilder&&) = default; + /** + * Returns the allocator used by this Simple8bBuilder. + */ + Allocator allocator() const { + // There is a bug in the version of MSVC we are using that fails to perform this cast when + // the allocator is std::allocator. It is a stateless allocator so we just return a + // new instance as a workaround. + if constexpr (std::is_same_v>) { + return Allocator{}; + } else { + return _pendingValues.get_allocator(); + } + } + /** * Appends a multiple missing value to Simple8b. Should be called before any other values are * appended. This is intended to be used to initialize a new builder with a large series of @@ -175,17 +190,6 @@ public: */ bool rlePossible() const; - /** - * Forcibly set last value so future append/skip calls may use this to construct RLE. This - * should not be called in normal operation. - */ - void setLastForRLE(boost::optional val); - - /** - * Reset RLE state on the last value, if needed. This should not be called in normal operation. - */ - void resetLastForRLEIfNeeded(); - /** * Initialize RLE state from another builder */ @@ -499,6 +503,20 @@ bool Simple8bBuilder::PendingIterator::operator!=( template Simple8bBuilder::Simple8bBuilder(Allocator allocator) : _pendingValues(allocator) {} +template +Simple8bBuilder::Simple8bBuilder(boost::optional val, + int64_t num, + Allocator allocator) + : _rleCount(num), _lastValueInPrevWord(val), _pendingValues(allocator) { + if (val) { + auto pendingValue = _calculatePendingValue(*val); + invariant(pendingValue); + invariant(_doesIntegerFitInCurrentWord(*pendingValue)); + } + _lastValidExtensionType = 0; + isSelectorPossible.fill(true); +} + template Simple8bBuilder::~Simple8bBuilder() = default; @@ -584,23 +602,6 @@ void Simple8bBuilder::flush(F&& writeFn) { } } -template -void Simple8bBuilder::setLastForRLE(boost::optional val) { - _lastValueInPrevWord = val; - if (val) { - auto pendingValue = _calculatePendingValue(*val); - invariant(pendingValue); - invariant(_doesIntegerFitInCurrentWord(*pendingValue)); - } -} - -template -void Simple8bBuilder::resetLastForRLEIfNeeded() { - if (!rlePossible()) { - _lastValueInPrevWord = 0; - } -} - template void Simple8bBuilder::initializeRLEFrom(const Simple8bBuilder& other) { if (other.rlePossible()) { diff --git a/src/mongo/bson/column/simple8b_fuzzer.cpp b/src/mongo/bson/column/simple8b_fuzzer.cpp index dbdc28edf9f..79bd929bc03 100644 --- a/src/mongo/bson/column/simple8b_fuzzer.cpp +++ b/src/mongo/bson/column/simple8b_fuzzer.cpp @@ -31,6 +31,7 @@ #include "mongo/bson/column/simple8b.h" #include "mongo/bson/column/simple8b_type_util.h" #include "mongo/logv2/log.h" +#include "mongo/platform/int128.h" #include "mongo/util/hex.h" #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kDefault @@ -39,6 +40,11 @@ static constexpr int128_t add(int128_t lhs, int128_t rhs) { return static_cast(static_cast(lhs) + static_cast(rhs)); } +struct LastResult { + boost::optional encoded; + boost::optional decoded; +}; + extern "C" int LLVMFuzzerTestOneInput(const char* Data, size_t Size) { using namespace mongo; @@ -65,16 +71,45 @@ extern "C" int LLVMFuzzerTestOneInput(const char* Data, size_t Size) { } }(); + auto oldLast = [&]() -> boost::optional { + try { + boost::optional last = uint128_t{0}; + Simple8b s8b(Data, bufferSize); + for (auto&& val : s8b) { + last = val; + } + if (last) { + return LastResult{last, Simple8bTypeUtil::decodeInt(*last)}; + } else { + return LastResult{boost::optional(boost::none), + boost::optional(boost::none)}; + } + + } catch (const DBException&) { + return boost::none; + } + }(); + auto sum = [&]() -> boost::optional { try { - uint64_t prev = - 0xE; // Previous value 0, this is one simple8b value containing a zero. + uint64_t prev = simple8b::kSingleZero; return simple8b::sum(Data, bufferSize, prev); } catch (const DBException&) { return boost::none; } }(); + auto last = [&]() -> boost::optional { + try { + uint64_t prev1 = simple8b::kSingleZero; + uint64_t prev2 = simple8b::kSingleZero; + return LastResult{simple8b::last(Data, bufferSize, prev1), + simple8b::last(Data, bufferSize, prev2)}; + } catch (const DBException&) { + return boost::none; + } + }(); + if (sum != oldSum) { LOGV2_DEBUG(8384500, 2, @@ -91,6 +126,11 @@ extern "C" int LLVMFuzzerTestOneInput(const char* Data, size_t Size) { // (as they'd lead to crashes), while using edge cases leading to interesting control flow // paths in both implementations. invariant(sum == oldSum); + // simple8b::last is not required to decode everything so an invalid binary might not throw. + if (last && oldLast) { + invariant(last->encoded == oldLast->encoded); + invariant(last->decoded == oldLast->decoded); + } } diff --git a/src/mongo/bson/column/simple8b_test.cpp b/src/mongo/bson/column/simple8b_test.cpp index bbd8e2b286f..dd0ec900874 100644 --- a/src/mongo/bson/column/simple8b_test.cpp +++ b/src/mongo/bson/column/simple8b_test.cpp @@ -119,18 +119,34 @@ void testSimple8b(const std::vector>& expectedValues, assertValuesEqual(s8b, expectedValues); make_signed_t sum = 0; + boost::optional last = T{0}; for (auto&& val : expectedValues) { if (val) { sum = add(sum, Simple8bTypeUtil::decodeInt(*val)); } + last = val; } - uint64_t prev = 0xE; // Tests in this file assume that the previous value was '0'. This is - // different semantics from BSONColumn. + uint64_t prev = simple8b::kSingleZero; auto s = simple8b::sum>( reinterpret_cast(expectedBinary.data()), expectedBinary.size(), prev); ASSERT_EQ(s, sum); + // Test last + prev = simple8b::kSingleZero; + ASSERT_EQ(last, + simple8b::last(reinterpret_cast(expectedBinary.data()), + expectedBinary.size(), + prev)); + if (last.has_value()) { + prev = simple8b::kSingleZero; + ASSERT_EQ( + Simple8bTypeUtil::decodeInt(*last), + simple8b::last>( + reinterpret_cast(expectedBinary.data()), expectedBinary.size(), prev)); + } + + auto testPrefixSum = [&](auto prefix) { make_signed_t sum = prefix; make_signed_t prefixSum = 0; @@ -142,8 +158,7 @@ void testSimple8b(const std::vector>& expectedValues, } } - uint64_t prev = 0xE; // Tests in this file assume that the previous value was '0'. This is - // different semantics from BSONColumn. + uint64_t prev = simple8b::kSingleZero; auto ps = simple8b::prefixSum>( reinterpret_cast(expectedBinary.data()), expectedBinary.size(),