mongo/src/mongo/s/write_ops/bulk_write_exec.cpp

/**
 *    Copyright (C) 2023-present MongoDB, Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the Server Side Public License, version 1,
 *    as published by MongoDB, Inc.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    Server Side Public License for more details.
 *
 *    You should have received a copy of the Server Side Public License
 *    along with this program. If not, see
 *    <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the Server Side Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
 */

#include "mongo/s/write_ops/bulk_write_exec.h"

// IWYU pragma: no_include "ext/alloc_traits.h"
#include "mongo/base/error_codes.h"
#include "mongo/bson/bsonelement.h"
#include "mongo/bson/bsonobj.h"
#include "mongo/bson/bsonobjbuilder.h"
#include "mongo/bson/timestamp.h"
#include "mongo/client/read_preference.h"
#include "mongo/db/basic_types_gen.h"
#include "mongo/db/commands/query_cmd/bulk_write_common.h"
#include "mongo/db/commands/query_cmd/bulk_write_crud_op.h"
#include "mongo/db/commands/query_cmd/bulk_write_gen.h"
#include "mongo/db/commands/query_cmd/bulk_write_parser.h"
#include "mongo/db/database_name.h"
#include "mongo/db/error_labels.h"
#include "mongo/db/global_catalog/chunk_manager.h"
#include "mongo/db/global_catalog/ddl/cannot_implicitly_create_collection_info.h"
#include "mongo/db/namespace_string.h"
#include "mongo/db/query/client_cursor/cursor_server_params_gen.h"
#include "mongo/db/query/write_ops/write_ops.h"
#include "mongo/db/query/write_ops/write_ops_parsers.h"
#include "mongo/db/session/logical_session_id_helpers.h"
#include "mongo/db/shard_role/shard_catalog/raw_data_operation.h"
#include "mongo/db/sharding_environment/client/shard.h"
#include "mongo/db/sharding_environment/grid.h"
#include "mongo/db/stats/counters.h"
#include "mongo/db/tenant_id.h"
#include "mongo/db/versioning_protocol/chunk_version.h"
#include "mongo/db/versioning_protocol/database_version.h"
#include "mongo/db/versioning_protocol/shard_version.h"
#include "mongo/db/versioning_protocol/shard_version_factory.h"
#include "mongo/db/versioning_protocol/stale_exception.h"
#include "mongo/db/write_concern_options.h"
#include "mongo/executor/remote_command_response.h"
#include "mongo/executor/task_executor_pool.h"
#include "mongo/idl/idl_parser.h"
#include "mongo/logv2/log.h"
#include "mongo/s/commands/query_cmd/populate_cursor.h"
#include "mongo/s/multi_statement_transaction_requests_sender.h"
#include "mongo/s/transaction_router.h"
#include "mongo/s/write_ops/batch_write_op.h"
#include "mongo/s/write_ops/batched_command_request.h"
#include "mongo/s/write_ops/coordinate_multi_update_util.h"
#include "mongo/s/write_ops/write_op.h"
#include "mongo/s/write_ops/write_without_shard_key_util.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/exit.h"
#include "mongo/util/str.h"
#include "mongo/util/uuid.h"

#include <cstddef>
#include <cstdint>
#include <numeric>
#include <string>
#include <utility>
#include <variant>

#include <absl/container/node_hash_map.h>
#include <absl/meta/type_traits.h>
#include <boost/move/utility_core.hpp>
#include <boost/optional.hpp>
#include <boost/optional/optional.hpp>

#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kSharding

namespace mongo {
namespace bulk_write_exec {
namespace {

// Upper bound on consecutive rounds in which no additional write op completed and no targeter
// reported a change on refresh. Once exceeded, the remaining writes are failed with
// NoProgressMade instead of being retried again.
constexpr int kMaxRoundsWithoutProgress = 5;

/**
 * Dispatches every child batch to its shard and feeds the responses back into 'bulkWriteOp' as
 * they arrive. Each child batch is targeted at a unique shard, so a shard receives at most one
 * batch per call.
 *
 * 'allowShardKeyUpdatesWithoutFullShardKeyInQuery' is forwarded unchanged to
 * BulkWriteOp::buildBulkCommandRequest() for every child batch.
 */
void executeChildBatches(
    OperationContext* opCtx,
    const std::vector<std::unique_ptr<NSTargeter>>& targeters,
    TargetedBatchMap& childBatches,
    BulkWriteOp& bulkWriteOp,
    stdx::unordered_map<NamespaceString, TrackedErrors>& errorsPerNamespace,
    boost::optional<bool> allowShardKeyUpdatesWithoutFullShardKeyInQuery = boost::none) {
    // A fresh round of execution is starting, so the stop flag must already have been cleared.
    invariant(!bulkWriteOp.shouldStopCurrentRound());

    std::vector<AsyncRequestsSender::Request> requests;
    for (auto& entry : childBatches) {
        const auto& shardId = entry.first;
        auto& batch = *entry.second;

        bulkWriteOp.noteTargetedShard(batch);

        auto shardRequest = bulkWriteOp.buildBulkCommandRequest(
            targeters, batch, allowShardKeyUpdatesWithoutFullShardKeyInQuery);

        // Turn the request into sendable BSON: command body first, then session information and,
        // only when the operation has no TransactionRouter, the write concern.
        BSONObjBuilder cmdBuilder;
        shardRequest.serialize(&cmdBuilder);
        logical_session_id_helpers::serializeLsidAndTxnNumber(opCtx, &cmdBuilder);
        if (!TransactionRouter::get(opCtx)) {
            cmdBuilder.append(WriteConcernOptions::kWriteConcernField,
                              opCtx->getWriteConcern().toBSON());
        }
        auto cmdObj = cmdBuilder.obj();

        // Debug builds verify that the batch's estimated size is at least the real BSON size.
        //
        // The estimate does not account for the internal
        // '_allowShardKeyUpdatesWithoutFullShardKeyInQuery' field for updates. When
        // allowShardKeyUpdatesWithoutFullShardKeyInQuery is set we are running a single updateOne
        // without shard key in its own child batch, so the estimate is irrelevant and the check
        // is skipped.
        dassert(allowShardKeyUpdatesWithoutFullShardKeyInQuery ||
                batch.getEstimatedSizeBytes() >= cmdObj.objsize());

        requests.emplace_back(shardId, cmdObj);
    }

    // Deliberately not `isRetryableWrite()`: commands inside retryable internal transactions must
    // not be retried here.
    const bool shouldRetry = opCtx->getTxnNumber() && !TransactionRouter::get(opCtx);
    const auto retryPolicy =
        shouldRetry ? Shard::RetryPolicy::kIdempotent : Shard::RetryPolicy::kStrictlyNotIdempotent;

    // The sender dispatches all requests to the targeted shard endpoints on construction.
    MultiStatementTransactionRequestsSender ars(
        opCtx,
        Grid::get(opCtx)->getExecutorPool()->getArbitraryExecutor(),
        DatabaseName::kAdmin,
        requests,
        ReadPreferenceSetting(ReadPreference::PrimaryOnly),
        retryPolicy);

    while (!ars.done()) {
        // Blocks until the next response is available.
        auto response = ars.next();

        // Because of SERVER-85857 we keep draining the responses from pending shards but skip
        // processing them - see 'BulkWriteOp._shouldStopCurrentRound' for more details.
        if (bulkWriteOp.shouldStopCurrentRound()) {
            ars.stopRetrying();
            continue;
        }

        // The BulkWriteOp may be marked finished early if we are in a transaction and encounter
        // an error, which aborts the transaction. Those responses are not worth processing.
        if (bulkWriteOp.isFinished()) {
            break;
        }

        auto batchIt = childBatches.find(response.shardId);
        tassert(8971901,
                "Unexpectedly could not find batches sent to a shard",
                batchIt != childBatches.end());

        TargetedWriteBatch* writeBatch = batchIt->second.get();
        tassert(8048101, "Unexpectedly could not find write batch for shard", writeBatch);

        // An OK status only means a response was received from the shard; it does not mean that
        // every operation in the bulkWrite succeeded, nor that the shard replied with ok:1.
        // A non-OK status means mongos could not get a response from the shard or hit a local
        // error (for example, mongos was shutting down).
        if (response.swResponse.getStatus().isOK()) {
            bulkWriteOp.processChildBatchResponseFromRemote(
                *writeBatch, response, errorsPerNamespace);
        } else {
            bulkWriteOp.processLocalChildBatchError(*writeBatch, response);
        }
    }
}

}  // namespace

void BulkWriteExecStats::noteTargetedShard(const BulkWriteCommandRequest& clientRequest,
                                           const TargetedWriteBatch& targetedBatch) {
    const ShardId& shardId = targetedBatch.getShardId();
    _targetedShards.insert(shardId);
    for (const auto& write : targetedBatch.getWrites()) {
        BulkWriteCRUDOp bulkWriteOp(clientRequest.getOps().at(write->writeOpRef.first));
        auto nsIdx = bulkWriteOp.getNsInfoIdx();
        auto batchType = convertOpType(bulkWriteOp.getType());
        _targetedShardsPerNsAndBatchType[nsIdx][batchType].insert(shardId);
    }
}

/**
 * Stores 'nShardsOwningChunks' for the namespace at 'nsIdx', replacing any previously recorded
 * value for that namespace.
 */
void BulkWriteExecStats::noteNumShardsOwningChunks(size_t nsIdx, int nShardsOwningChunks) {
    _numShardsOwningChunks.insert_or_assign(nsIdx, nShardsOwningChunks);
}

void BulkWriteExecStats::noteTwoPhaseWriteProtocol(const BulkWriteCommandRequest& clientRequest,
                                                   const TargetedWriteBatch& targetedBatch,
                                                   size_t nsIdx,
                                                   int nShardsOwningChunks) {
    for (const auto& write : targetedBatch.getWrites()) {
        BulkWriteCRUDOp bulkWriteOp(clientRequest.getOps().at(write->writeOpRef.first));
        auto nsIdx = bulkWriteOp.getNsInfoIdx();
        auto batchType = convertOpType(bulkWriteOp.getType());
        // In this case, we aren't really targetting targetedBatch.getShardId, so only create the
        // batchType entry in the map. updateHostsTargetedMetrics reports kManyShards in the case no
        // shards is targeted overall.
        _targetedShardsPerNsAndBatchType[nsIdx][batchType];
    }

    noteNumShardsOwningChunks(nsIdx, nShardsOwningChunks);
}

/**
 * Publishes the collected targeting statistics: sets the number of targeted shards on the
 * operation's CurOp debug info and calls updateHostsTargetedMetrics() once per recorded
 * (namespace, batch type) pair for which the number of shards owning chunks is known.
 */
void BulkWriteExecStats::updateMetrics(OperationContext* opCtx,
                                       const std::vector<std::unique_ptr<NSTargeter>>& targeters,
                                       bool updatedShardKey) {
    // Record the number of shards targeted by this bulkWrite.
    CurOp::get(opCtx)->debug().nShards = _targetedShards.size();

    for (size_t nsIdx = 0; nsIdx < targeters.size(); ++nsIdx) {
        const auto perBatchTypeIt = _targetedShardsPerNsAndBatchType.find(nsIdx);
        if (perBatchTypeIt == _targetedShardsPerNsAndBatchType.end()) {
            continue;
        }

        // Nothing is reported for a namespace whose owning-shard count was never recorded.
        const auto nShardsOwningChunks = getNumShardsOwningChunks(nsIdx);
        if (!nShardsOwningChunks.has_value()) {
            continue;
        }

        const auto& targeter = targeters[nsIdx];
        for (const auto& [batchType, shards] : perBatchTypeIt->second) {
            const int nTargeted = shards.size();

            // If we have no information on the shards targeted, ignore updatedShardKey;
            // updateHostsTargetedMetrics will report this as TargetType::kManyShards.
            const bool countExtraShard = updatedShardKey && nTargeted != 0;
            const int nShards = countExtraShard ? nTargeted + 1 : nTargeted;

            updateHostsTargetedMetrics(opCtx,
                                       batchType,
                                       nShardsOwningChunks.value(),
                                       nShards,
                                       targeter->isTargetedCollectionSharded());
        }
    }
}

/**
 * Returns the value recorded for the namespace at 'nsIdx' via noteNumShardsOwningChunks(), or
 * boost::none when nothing has been recorded for it.
 */
boost::optional<int> BulkWriteExecStats::getNumShardsOwningChunks(size_t nsIdx) const {
    if (auto found = _numShardsOwningChunks.find(nsIdx); found != _numShardsOwningChunks.end()) {
        return found->second;
    }
    return boost::none;
}

/**
 * Runs the first write of the first child batch as a standalone single-op bulkWrite inside an
 * internal transaction, then reports the outcome to 'bulkWriteOp' as the final response for that
 * op. 'childBatches' must not be empty.
 */
void executeRetryableTimeseriesUpdate(OperationContext* opCtx,
                                      TargetedBatchMap& childBatches,
                                      BulkWriteOp& bulkWriteOp) {
    invariant(!childBatches.empty());

    // Index of the targeted operation within the client bulkWrite request.
    const auto opIdx = childBatches.begin()->second->getWrites()[0]->writeOpRef.first;

    // Build a bulkWrite request that contains only the update operation at opIdx.
    auto& clientRequest = bulkWriteOp.getClientRequest();
    BulkWriteCommandRequest singleUpdateRequest =
        bulk_write_common::makeSingleOpBulkWriteCommandRequest(clientRequest, opIdx);

    auto inlineExecutor = std::make_shared<executor::InlineExecutor>();
    txn_api::SyncTransactionWithRetries txn(opCtx,
                                            Grid::get(opCtx)->getExecutorPool()->getFixedExecutor(),
                                            nullptr /* resourceYielder */,
                                            inlineExecutor);
    BulkWriteCommandReply bulkWriteResponse;

    // The singleUpdateRequest (a bulkWrite command) is executed in an internal transaction to
    // perform the retryable timeseries update. That separate bulkWrite command goes through the
    // bulkWrite execute() logic again as a transaction, which handles retries of all kinds. This
    // function is merely a client of the spawned internal transaction, so it must only ever
    // receive a single final (non-retryable) response for the timeseries update operation.
    auto swResult =
        txn.runNoThrow(opCtx,
                       [&singleUpdateRequest, &bulkWriteResponse](
                           const txn_api::TransactionClient& txnClient, ExecutorPtr txnExec) {
                           bulkWriteResponse = txnClient.runCRUDOpSync(singleUpdateRequest);
                           return SemiFuture<void>::makeReady();
                       });

    // A failure of the transaction machinery takes precedence; otherwise a failed command status
    // of the committed result is used. The write concern error is only available when the
    // transaction itself returned a result.
    Status responseStatus = swResult.getStatus();
    WriteConcernErrorDetail wcError;
    if (responseStatus.isOK()) {
        const auto& commitResult = swResult.getValue();
        if (!commitResult.cmdStatus.isOK()) {
            responseStatus = commitResult.cmdStatus;
        }
        wcError = commitResult.wcError;
    }
    if (!responseStatus.isOK()) {
        // Replace whatever was captured with a single emulated error reply for the operation.
        bulkWriteResponse = createEmulatedErrorReply(responseStatus, 1, boost::none);
    }

    // Exactly one reply item is expected for the single update, except in errorsOnly mode where
    // there may be none.
    const auto& replyItems = bulkWriteResponse.getCursor().getFirstBatch();
    tassert(7934203, "unexpected reply for retryable timeseries update", replyItems.size() <= 1);
    boost::optional<BulkWriteReplyItem> replyItem;
    if (!replyItems.empty()) {
        replyItem = replyItems.front();
    }

    LOGV2_DEBUG(7934204,
                4,
                "Processing bulk write response for retryable timeseries update",
                "opIdx"_attr = opIdx,
                "singleUpdateRequest"_attr = redact(singleUpdateRequest.toBSON()),
                "replyItem"_attr = replyItem,
                "wcError"_attr = wcError.toString());

    bulkWriteOp.noteWriteOpFinalResponse(opIdx,
                                         replyItem,
                                         bulkWriteResponse,
                                         ShardWCError(childBatches.begin()->first, wcError),
                                         bulkWriteResponse.getRetriedStmtIds());
}

/**
 * Executes an updateOne/deleteOne that has neither a shard key nor an _id in its query by
 * running the two phase write protocol, then reports the outcome to 'bulkWriteOp' as the final
 * response for that op.
 *
 * 'childBatches' must not be empty and every batch in it must contain exactly one targeted write.
 * 'errorsPerNamespace' is not used by this function.
 */
void executeWriteWithoutShardKey(
    OperationContext* opCtx,
    const std::vector<std::unique_ptr<NSTargeter>>& targeters,
    TargetedBatchMap& childBatches,
    BulkWriteOp& bulkWriteOp,
    stdx::unordered_map<NamespaceString, TrackedErrors>& errorsPerNamespace) {
    // If the targetStatus value is 'WithoutShardKeyOrId', then we have detected an
    // updateOne/deleteOne request without a shard key or _id. We will use a two
    // phase protocol to apply the write.
    tassert(7298300, "Executing empty write batch without shard key", !childBatches.empty());

    // Get the index of the targeted operation in the client bulkWrite request.
    const auto opIdx = childBatches.begin()->second->getWrites()[0]->writeOpRef.first;
    auto op = BulkWriteCRUDOp(bulkWriteOp.getClientRequest().getOps()[opIdx]);
    const auto nsIdx = op.getNsInfoIdx();
    auto& targeter = targeters[nsIdx];

    auto allowShardKeyUpdatesWithoutFullShardKeyInQuery =
        opCtx->isRetryableWrite() || opCtx->inMultiDocumentTransaction();

    // If there is a targeted write with a sampleId, use that write instead in order to pass the
    // sampleId to the two phase write protocol. Otherwise, just choose the first targeted write.
    const auto targetedWriteBatch = [&] {
        for (auto&& [_ /* shardId */, childBatch] : childBatches) {
            auto nextBatch = childBatch.get();

            // For a write without shard key, we expect each TargetedWriteBatch in childBatches
            // to contain only one TargetedWrite directed to each shard.
            tassert(7787100,
                    "There must be only 1 targeted write in this targeted write batch.",
                    nextBatch->getWrites().size() == 1);

            auto targetedWrite = nextBatch->getWrites().begin()->get();
            if (targetedWrite->sampleId) {
                return nextBatch;
            }
        }
        return childBatches.begin()->second.get();
    }();

    // Note: It is fine to use 'getAproxNShardsOwningChunks' here because the result is only
    // used to update stats.
    bulkWriteOp.noteTwoPhaseWriteProtocol(
        *targetedWriteBatch, nsIdx, targeter->getAproxNShardsOwningChunks());

    auto cmdObj = bulkWriteOp
                      .buildBulkCommandRequest(targeters,
                                               *targetedWriteBatch,
                                               allowShardKeyUpdatesWithoutFullShardKeyInQuery)
                      .toBSON();

    boost::optional<WriteConcernErrorDetail> wce;
    auto swRes = write_without_shard_key::runTwoPhaseWriteProtocol(
        opCtx, targeter->getNS(), std::move(cmdObj), wce);

    BulkWriteCommandReply bulkWriteResponse;
    WriteConcernErrorDetail wcError;
    if (wce.has_value()) {
        wcError = std::move(*wce);
    }

    Status responseStatus = swRes.getStatus();
    if (swRes.isOK()) {
        if (swRes.getValue().getResponse().isEmpty()) {
            // When we get an empty response, it means that the predicate didn't match anything
            // and no write was done. So we can just set a trivial ok response. Unless we are
            // running errors only in which case we set an empty vector.
            auto items = std::vector<mongo::BulkWriteReplyItem>{};
            if (!bulkWriteOp.getClientRequest().getErrorsOnly()) {
                BulkWriteReplyItem item(0);
                if (op.getType() == BulkWriteCRUDOp::OpType::kUpdate) {
                    item.setNModified(0);
                }
                items.push_back(std::move(item));
            }
            // 'items' is not used again, so hand it over instead of copying it.
            bulkWriteResponse.setCursor(
                BulkWriteCommandResponseCursor(0,  // cursorId
                                               std::move(items),
                                               NamespaceString::makeBulkWriteNSS(boost::none)));
            bulkWriteResponse.setNErrors(0);
            bulkWriteResponse.setNInserted(0);
            bulkWriteResponse.setNMatched(0);
            bulkWriteResponse.setNModified(0);
            bulkWriteResponse.setNUpserted(0);
            bulkWriteResponse.setNDeleted(0);
        } else {
            try {
                bulkWriteResponse = BulkWriteCommandReply::parse(
                    swRes.getValue().getResponse(),
                    IDLParserContext("BulkWriteCommandReplyForWriteWithoutShardKey"));
            } catch (const DBException& ex) {
                // A reply that cannot be parsed is reported as an error for the operation below.
                responseStatus = ex.toStatus().withContext(
                    "Failed to parse response from writes without shard key");
            }
        }
    }

    if (!responseStatus.isOK()) {
        // Set an error for the operation.
        bulkWriteResponse = createEmulatedErrorReply(responseStatus, 1, boost::none);
    }

    // We should get back at most one reply item for the single write we are running: exactly one
    // in the usual case, or none (e.g. the empty vector built above for errorsOnly requests).
    const auto& replyItems = bulkWriteResponse.getCursor().getFirstBatch();
    tassert(7298301,
            "unexpected bulkWrite reply for writes without shard key",
            replyItems.size() == 1 || replyItems.size() == 0);
    boost::optional<BulkWriteReplyItem> replyItem = boost::none;
    if (replyItems.size() == 1) {
        replyItem = replyItems[0];
    }

    LOGV2_DEBUG(7298302,
                4,
                "Processing bulk write response for writes without shard key",
                "opIdx"_attr = opIdx,
                "replyItem"_attr = replyItem,
                "wcError"_attr = wcError.toString());

    bulkWriteOp.noteWriteOpFinalResponse(opIdx,
                                         replyItem,
                                         bulkWriteResponse,
                                         ShardWCError(childBatches.begin()->first, wcError),
                                         bulkWriteResponse.getRetriedStmtIds());
}

/**
 * Sends the child batches of a WithoutShardKeyWithId write to their shards, then lets
 * 'bulkWriteOp' evaluate stale responses and finish the write.
 */
void executeNonTargetedWriteWithoutShardKeyWithId(
    OperationContext* opCtx,
    const std::vector<std::unique_ptr<NSTargeter>>& targeters,
    TargetedBatchMap& childBatches,
    BulkWriteOp& bulkWriteOp,
    stdx::unordered_map<NamespaceString, TrackedErrors>& errorsPerNamespace) {
    const boost::optional<bool> allowShardKeyUpdatesWithoutFullShardKeyInQuery = boost::none;
    executeChildBatches(opCtx,
                        targeters,
                        childBatches,
                        bulkWriteOp,
                        errorsPerNamespace,
                        allowShardKeyUpdatesWithoutFullShardKeyInQuery);

    // Whether the targeters saw a Stale Shard Response has to be known before deciding whether
    // to parse the _deferredResponses, so stale responses are noted first.
    bulkWriteOp.noteStaleResponses(targeters, errorsPerNamespace);
    bulkWriteOp.finishExecutingWriteWithoutShardKeyWithId();
}

/**
 * Runs the child batches through coordinate_multi_update_util::executeCoordinateMultiUpdate() and
 * reports the reply to 'bulkWriteOp' as the final response for the targeted op. The reply is
 * reported with an empty write concern error.
 */
void coordinateMultiUpdate(OperationContext* opCtx,
                           TargetedBatchMap& childBatches,
                           BulkWriteOp& bulkWriteOp) {
    auto reply = coordinate_multi_update_util::executeCoordinateMultiUpdate(
        opCtx, childBatches, bulkWriteOp);

    // The reply carries at most one item for the op.
    const auto& firstBatch = reply.getCursor().getFirstBatch();
    tassert(8127600, "Unexpected reply for coordinateMultiUpdate", firstBatch.size() <= 1);

    boost::optional<BulkWriteReplyItem> replyItem;
    if (!firstBatch.empty()) {
        replyItem = firstBatch.front();
    }

    bulkWriteOp.noteWriteOpFinalResponse(
        coordinate_multi_update_util::getWriteOpIndex(childBatches),
        replyItem,
        reply,
        ShardWCError(childBatches.begin()->first, WriteConcernErrorDetail{}),
        reply.getRetriedStmtIds());
}

/**
 * Executes 'clientRequest' in rounds until every write op is finished or execution is aborted.
 * Each round targets the remaining ops, runs the resulting child batches with the strategy that
 * matches the targeted write type, and then refreshes the targeters.
 *
 * On return 'execStats' holds the statistics gathered during execution and the returned value is
 * the reply information generated by the underlying BulkWriteOp.
 */
BulkWriteReplyInfo execute(OperationContext* opCtx,
                           const std::vector<std::unique_ptr<NSTargeter>>& targeters,
                           const BulkWriteCommandRequest& clientRequest,
                           BulkWriteExecStats& execStats) {
    LOGV2_DEBUG(7263700,
                4,
                "Starting execution of a bulkWrite",
                "clientRequest"_attr = redact(clientRequest.toBSON()));

    BulkWriteOp bulkWriteOp(opCtx, clientRequest);

    bool refreshedTargeter = false;
    int rounds = 0;
    int numCompletedOps = 0;
    int numRoundsWithoutProgress = 0;
    Backoff backoff(Seconds(1), Seconds(2));

    while (!bulkWriteOp.isFinished()) {
        // If we are over our maximum memory allocation, mark the next write op with an error and
        // abort the operation.
        if (bulkWriteOp.aboveBulkWriteRepliesMaxSize()) {
            bulkWriteOp.abortDueToMaxSizeError();
        }

        // Divide and group ("target") the remaining operations with the targeter that matches
        // their namespace index. Some operations may be split up (such as an update that needs
        // to go to more than one shard), while others may be grouped together if they need to go
        // to the same shard. The result is grouped by shardId in 'childBatches'.
        TargetedBatchMap childBatches;
        const bool recordTargetErrors = refreshedTargeter;
        auto targetStatus = bulkWriteOp.target(targeters, recordTargetErrors, childBatches);

        if (targetStatus.isOK()) {
            stdx::unordered_map<NamespaceString, TrackedErrors> errorsPerNamespace;
            const auto writeType = targetStatus.getValue();

            try {
                switch (writeType) {
                    case WriteType::TimeseriesRetryableUpdate:
                        executeRetryableTimeseriesUpdate(opCtx, childBatches, bulkWriteOp);
                        break;
                    case WriteType::WithoutShardKeyOrId:
                        executeWriteWithoutShardKey(
                            opCtx, targeters, childBatches, bulkWriteOp, errorsPerNamespace);
                        break;
                    case WriteType::WithoutShardKeyWithId:
                        executeNonTargetedWriteWithoutShardKeyWithId(
                            opCtx, targeters, childBatches, bulkWriteOp, errorsPerNamespace);
                        break;
                    case WriteType::MultiWriteBlockingMigrations:
                        coordinateMultiUpdate(opCtx, childBatches, bulkWriteOp);
                        break;
                    default:
                        // Send the child batches and wait for responses.
                        executeChildBatches(
                            opCtx,
                            targeters,
                            childBatches,
                            bulkWriteOp,
                            errorsPerNamespace,
                            /*allowShardKeyUpdatesWithoutFullShardKeyInQuery=*/boost::none);
                        break;
                }
            } catch (const DBException&) {
                // Let the targeters learn about the staleness errors seen so far before the
                // exception propagates.
                bulkWriteOp.noteStaleResponses(targeters, errorsPerNamespace);
                throw;
            }

            // If we saw any staleness errors, tell the targeters to invalidate their cache so
            // that they may be refreshed. For WithoutShardKeyWithId writes the staleness errors
            // have already been evaluated earlier.
            if (writeType != WriteType::WithoutShardKeyWithId) {
                bulkWriteOp.noteStaleResponses(targeters, errorsPerNamespace);
            }
        } else {
            bulkWriteOp.processTargetingError(targetStatus);

            dassert(childBatches.size() == 0u);
            // The target error comes from one of the targeters. To avoid getting another target
            // error from a different targeter on retry, simply refresh all targeters and only
            // retry once for target errors. The performance hit should be negligible as target
            // errors should be rare.
            for (auto& targeter : targeters) {
                targeter->noteCouldNotTarget();
            }
            refreshedTargeter = true;
        }

        ++rounds;

        if (bulkWriteOp.isFinished()) {
            // No need to refresh the targeters if we are done.
            break;
        }

        // Refresh the targeter(s) if we received a target error or a stale config/db error.
        bool targeterChanged = false;
        try {
            for (auto& targeter : targeters) {
                targeterChanged |= targeter->createCollectionIfNeeded(opCtx);
            }
            LOGV2_DEBUG(7298200, 2, "Refreshing all targeters for bulkWrite");
            for (auto& targeter : targeters) {
                targeterChanged |= targeter->refreshIfNeeded(opCtx);
            }
            LOGV2_DEBUG(7298201,
                        2,
                        "Successfully refreshed all targeters for bulkWrite",
                        "targeterChanged"_attr = targeterChanged);
        } catch (const ExceptionFor<ErrorCodes::StaleEpoch>& ex) {
            LOGV2_DEBUG(
                7298203,
                2,
                "Failed to refresh all targeters for bulkWrite because collection was dropped",
                "error"_attr = redact(ex));

            bulkWriteOp.noteErrorForRemainingWrites(
                ex.toStatus("collection was dropped in the middle of the operation"));
            break;
        } catch (const DBException& ex) {
            // Any other refresh failure is only logged; execution continues with the next round.
            LOGV2_WARNING(7298204,
                          "Failed to refresh all targeters for bulkWrite",
                          "error"_attr = redact(ex));
        }

        // A round made progress if more ops are completed than before or a targeter changed.
        const int currCompletedOps = bulkWriteOp.numWriteOpsIn(WriteOpState_Completed);
        const bool madeProgress = targeterChanged || currCompletedOps != numCompletedOps;
        numRoundsWithoutProgress = madeProgress ? 0 : numRoundsWithoutProgress + 1;
        numCompletedOps = currCompletedOps;
        bulkWriteOp.setTargeterHasStaleShardResponse(false);
        LOGV2_DEBUG(7934202,
                    2,
                    "Completed a round of bulkWrite execution",
                    "rounds"_attr = rounds,
                    "numCompletedOps"_attr = numCompletedOps,
                    "targeterChanged"_attr = targeterChanged,
                    "numRoundsWithoutProgress"_attr = numRoundsWithoutProgress);

        if (numRoundsWithoutProgress > kMaxRoundsWithoutProgress) {
            bulkWriteOp.noteErrorForRemainingWrites(
                {ErrorCodes::NoProgressMade,
                 str::stream() << "no progress was made executing bulkWrite ops in after "
                               << kMaxRoundsWithoutProgress << " rounds (" << numCompletedOps
                               << " ops completed in " << rounds << " rounds total)"});
            break;
        }
        // Back off before the next attempt whenever the round that just ended made no progress.
        if (numRoundsWithoutProgress > 0) {
            sleepFor(backoff.nextSleep());
        }
    }

    // Note: It is fine to use 'getAproxNShardsOwningChunks' here because the result is only
    // used to update stats.
    for (size_t nsIdx = 0; nsIdx < targeters.size(); ++nsIdx) {
        const auto& targeter = targeters[nsIdx];
        bulkWriteOp.noteNumShardsOwningChunks(nsIdx, targeter->getAproxNShardsOwningChunks());
    }

    LOGV2_DEBUG(7263701, 4, "Finished execution of bulkWrite");
    execStats = bulkWriteOp.getExecStats();
    return bulkWriteOp.generateReplyInfo();
}

/**
 * Builds a bulkWrite reply that looks as if 'errorCount' operations (indexes 0 through
 * errorCount - 1) each failed with 'error'. Every summary counter other than nErrors is zero and
 * the cursor id is 0.
 */
BulkWriteCommandReply createEmulatedErrorReply(const Status& error,
                                               int errorCount,
                                               const boost::optional<TenantId>& tenantId) {
    // One reply item per emulated failure, all sharing the same error status.
    std::vector<BulkWriteReplyItem> errorItems;
    errorItems.reserve(errorCount);
    for (int opIdx = 0; opIdx < errorCount; ++opIdx) {
        errorItems.emplace_back(opIdx, error);
    }

    BulkWriteCommandResponseCursor cursor(
        0, errorItems, NamespaceString::makeBulkWriteNSS(tenantId));

    BulkWriteCommandReply reply;
    reply.setCursor(cursor);
    reply.setNErrors(errorCount);
    reply.setNInserted(0);
    reply.setNMatched(0);
    reply.setNModified(0);
    reply.setNUpserted(0);
    reply.setNDeleted(0);
    return reply;
}

// Captures the per-operation state (transaction number, write concern, whether we are in a
// transaction / retryable write) from 'opCtx' and creates one WriteOp for every op in
// 'clientRequest', in request order.
BulkWriteOp::BulkWriteOp(OperationContext* opCtx, const BulkWriteCommandRequest& clientRequest)
    : _opCtx(opCtx),
      _clientRequest(clientRequest),
      _txnNum(_opCtx->getTxnNumber()),
      _writeConcern(opCtx->getWriteConcern()),
      _inTransaction(static_cast<bool>(TransactionRouter::get(opCtx))),
      _isRetryableWrite(opCtx->isRetryableWrite()) {
    // Start with an empty set of retried statement ids.
    _retriedStmtIds = stdx::unordered_set<StmtId>();

    const size_t numOps = _clientRequest.getOps().size();
    _writeOps.reserve(numOps);
    for (size_t opIdx = 0; opIdx != numOps; ++opIdx) {
        _writeOps.emplace_back(BatchItemRef(&_clientRequest, opIdx), _inTransaction);
    }
}

// Targets the write ops of this bulkWrite by delegating to targetWriteOps(), filling
// 'targetedBatches' and returning whatever targetWriteOps() returns. 'targeters' is indexed by the
// nsInfo index of the client request.
StatusWith<WriteType> BulkWriteOp::target(const std::vector<std::unique_ptr<NSTargeter>>& targeters,
                                          bool recordTargetErrors,
                                          TargetedBatchMap& targetedBatches) {
    auto cmdRef = WriteCommandRef{_clientRequest};
    write_op_helpers::BulkCommandSizeEstimator sizeEstimator(_opCtx, cmdRef);

    // Resolves a write op to the targeter of the namespace the op refers to.
    auto targeterForWriteOp = [&](const WriteOp& writeOp) -> const NSTargeter& {
        const auto itemIdx = writeOp.getWriteItem().getItemIndex();
        const BulkWriteCRUDOp crudOp(_clientRequest.getOps()[itemIdx]);
        return *targeters[crudOp.getNsInfoIdx()];
    };

    const bool isOrdered = _clientRequest.getOrdered();
    return targetWriteOps(_opCtx,
                          _writeOps,
                          isOrdered,
                          recordTargetErrors,
                          _pauseMigrationsDuringMultiUpdatesParameter,
                          targeterForWriteOp,
                          sizeEstimator,
                          targetedBatches);
}

// Builds the bulkWrite request for the child batch 'targetedBatch'.
//
// The returned request contains a copy of every client op referenced by the batch (in batch
// order), and only the nsInfo entries those ops use, with each op's nsInfo index rewritten to
// point into the reduced nsInfo array. Each nsInfo entry is stamped with the shard version and
// database version of the endpoint the writes were targeted to.
//
// 'targeters' is indexed by the nsInfo index of the client request. When
// 'allowShardKeyUpdatesWithoutFullShardKeyInQuery' has a value it is copied onto every update op.
BulkWriteCommandRequest BulkWriteOp::buildBulkCommandRequest(
    const std::vector<std::unique_ptr<NSTargeter>>& targeters,
    const TargetedWriteBatch& targetedBatch,
    boost::optional<bool> allowShardKeyUpdatesWithoutFullShardKeyInQuery) const {
    BulkWriteCommandRequest request;

    // A single bulk command request batch may contain operations of different
    // types, i.e. they may be inserts, updates or deletes.
    std::vector<BulkWriteOpVariant> ops;
    ops.reserve(targetedBatch.getWrites().size());

    // The client's nsInfo array is only read here (entries are copied into 'batchNsInfo' before
    // being modified), so bind to it by reference instead of copying the whole array per batch.
    const auto& nsInfo = _clientRequest.getNsInfo();
    std::vector<NamespaceInfoEntry> batchNsInfo;

    // So we don't send unnecessary nsInfos in the subbatch we need to keep track of a mapping of
    // the original nsInfoIdx to the new nsInfoidx. If we don't do this then document sequenced
    // nsInfo array can cause child batches to be over the max BSON size.
    absl::flat_hash_map<NamespaceString, int> nsInfoIndexMap;

    std::vector<int> stmtIds;
    if (_isRetryableWrite)
        stmtIds.reserve(targetedBatch.getNumOps());

    for (const auto& targetedWrite : targetedBatch.getWrites()) {
        const ItemIndexChildIndexPair& writeOpRef = targetedWrite->writeOpRef;
        ops.push_back(_clientRequest.getOps().at(writeOpRef.first));

        // Propagate the sample id to the copied op. Inserts are left untouched.
        if (targetedWrite->sampleId.has_value()) {
            visit(
                OverloadedVisitor{
                    [&](mongo::BulkWriteInsertOp& op) { return; },
                    [&](mongo::BulkWriteUpdateOp& op) { op.setSampleId(targetedWrite->sampleId); },
                    [&](mongo::BulkWriteDeleteOp& op) { op.setSampleId(targetedWrite->sampleId); },
                },
                ops.back());
        }

        // Set the nsInfo's shardVersion & databaseVersion fields based on the endpoint
        // of each operation. Since some operations may be on the same namespace, this
        // might result in the same nsInfo entry being written to multiple times. This
        // is OK, since we know that in a single batch, all operations on the same
        // namespace MUST have the same shardVersion & databaseVersion.
        // Invariant checks that either the shardVersion & databaseVersion in nsInfo are
        // null OR the new versions in the targetedWrite match the existing version in
        // nsInfo.
        const auto& bulkWriteOp = BulkWriteCRUDOp(ops.back());
        auto nsIdx = bulkWriteOp.getNsInfoIdx();
        // References the namespace stored in the client request; the map below keeps its own copy
        // of the key, so no per-op copy is needed here.
        const auto& nss = nsInfo[nsIdx].getNs();

        // See if we have already added this index to the childBatch. If not then we need to add it
        // here.
        auto iter = nsInfoIndexMap.find(nss);
        if (iter == nsInfoIndexMap.end()) {
            batchNsInfo.push_back(nsInfo.at(nsIdx));
            iter = nsInfoIndexMap.insert({nss, batchNsInfo.size() - 1}).first;
        }

        // Set the new nsInfoIdx on the op for the childBatch.
        visit(
            OverloadedVisitor{
                [&](auto& op) { op.setNsInfoIdx(iter->second); },
            },
            ops.back());

        auto& nsInfoEntry = batchNsInfo.at(iter->second);
        auto& targeter = targeters.at(nsIdx);

        auto isClientRequestOnTimeseriesBucketCollection =
            nsInfoEntry.getNs().isTimeseriesBucketsCollection();
        if (targeter->isTrackedTimeSeriesBucketsNamespace() &&
            !isClientRequestOnTimeseriesBucketCollection) {
            // For tracked timeseries collections, only the bucket collections are tracked. This
            // sets the namespace to the namespace of the tracked bucket collection.
            nsInfoEntry.setNs(targeter->getNS());
            if (!isRawDataOperation(_opCtx)) {
                nsInfoEntry.setIsTimeseriesNamespace(true);
            }
        }

        // If we are using the two phase write protocol introduced in PM-1632, we allow shard key
        // updates without specifying the full shard key in the query if we execute the update in a
        // retryable write/transaction.
        if (bulkWriteOp.getType() == BulkWriteCRUDOp::OpType::kUpdate &&
            allowShardKeyUpdatesWithoutFullShardKeyInQuery.has_value()) {
            // The op type was just checked to be an update, so the variant holds an update op.
            auto mutableUpdateOp = get_if<BulkWriteUpdateOp>(&ops.back());
            mutableUpdateOp->setAllowShardKeyUpdatesWithoutFullShardKeyInQuery(
                allowShardKeyUpdatesWithoutFullShardKeyInQuery);
        }

        invariant((!nsInfoEntry.getShardVersion() ||
                   nsInfoEntry.getShardVersion() == targetedWrite->endpoint.shardVersion) &&
                  (!nsInfoEntry.getDatabaseVersion() ||
                   nsInfoEntry.getDatabaseVersion() == targetedWrite->endpoint.databaseVersion));

        nsInfoEntry.setShardVersion(targetedWrite->endpoint.shardVersion);
        nsInfoEntry.setDatabaseVersion(targetedWrite->endpoint.databaseVersion);

        if (_isRetryableWrite) {
            stmtIds.push_back(bulk_write_common::getStatementId(_clientRequest, writeOpRef.first));
        }
    }

    // The local vectors are not used again, so hand them to the request without copying.
    request.setOps(std::move(ops));
    request.setNsInfo(std::move(batchNsInfo));

    // It isn't necessary to copy the cursor options over, because the cursor options
    // are for use in the interaction between the mongos and the client and not
    // internally between the mongos and the mongods.
    request.setOrdered(_clientRequest.getOrdered());
    request.setBypassDocumentValidation(_clientRequest.getBypassDocumentValidation());
    request.setLet(_clientRequest.getLet());
    request.setErrorsOnly(_clientRequest.getErrorsOnly());
    request.setComment(_clientRequest.getComment());

    if (_isRetryableWrite) {
        request.setStmtIds(std::move(stmtIds));
    }

    request.setBypassEmptyTsReplacement(_clientRequest.getBypassEmptyTsReplacement());

    request.setDbName(DatabaseName::kAdmin);

    return request;
}

bool BulkWriteOp::isFinished() const {
    // We encountered some error requiring us to abort execution. Note this may mean that some ops
    // are left in state pending.
    if (_aborted) {
        return true;
    }

    // TODO: Track ops lifetime.
    const bool ordered = _clientRequest.getOrdered();
    for (auto& writeOp : _writeOps) {
        if (writeOp.getWriteState() < WriteOpState_Completed) {
            return false;
        } else if (writeOp.getWriteState() == WriteOpState_Error) {
            // If the WriteOp's state is "WriteOpState_Error" and if '_inTransaction || ordered'
            // is true, then normally isFinished() will return true. Doing this allows the operation
            // to be aborted quickly, without having to wait for any remaining child ops.
            //
            // However, the logic in "cluster_write_cmd.cpp" and "cluster_find_and_modify_cmd.cpp"
            // that handles WouldChangeOwningShard errors requires that the current transaction
            // (if any) not be aborted when a WouldChangeOwningShard error occurs.
            //
            // Thus, if the WriteOp's state is "WriteOpState_Error" with a WouldChangeOwningShard
            // error and '_inTransaction || ordered' is true, and if the WriteOp still has pending
            // child ops, then isFinished() must return false to allow the pending child ops to
            // finish cleanly (to avoid causing the transaction to abort).
            if (writeOp.getOpError().getStatus().code() == ErrorCodes::WouldChangeOwningShard &&
                writeOp.hasPendingChildOps() && _inTransaction) {
                return false;
            }

            // If the BulkWriteOp is ordered or we're in a transaction -AND- if this WriteOp
            // encountered an error (excluding the WouldChangeOwningShard case handled above),
            // then return true to indicate that this BulkWriteOp is finished.
            if (_inTransaction || ordered) {
                return true;
            }
        }
    }
    return true;
}

// Returns true when the approximate size accumulated for the replies has reached (or passed) the
// current value of 'gBulkWriteMaxRepliesSize'.
bool BulkWriteOp::aboveBulkWriteRepliesMaxSize() const {
    const auto maxRepliesSize = gBulkWriteMaxRepliesSize.loadRelaxed();
    return _approximateSize >= maxRepliesSize;
}

// Aborts the bulkWrite because the accumulated reply size went over the limit. The first write op
// that has not completed yet (if any) is marked as failed with ExceededMemoryLimit, and the
// operation is flagged as aborted.
void BulkWriteOp::abortDueToMaxSizeError() {
    // Need to find the next writeOp so we can store an error in it.
    for (auto& writeOp : _writeOps) {
        if (writeOp.getWriteState() < WriteOpState_Completed) {
            // Record the error under the index of the op that receives it, as is done everywhere
            // else an error is attached to a write op, rather than always under index 0.
            const auto opIdx = writeOp.getWriteItem().getItemIndex();
            writeOp.setOpError(write_ops::WriteError(
                opIdx,
                Status{ErrorCodes::ExceededMemoryLimit,
                       fmt::format("BulkWrite response size exceeded limit ({} bytes)",
                                   _approximateSize)}));
            break;
        }
    }
    _aborted = true;
}

// Test-only accessor for the write op at position 'i'. The index is not range checked.
const WriteOp& BulkWriteOp::getWriteOp_forTest(int i) const {
    const WriteOp& requestedOp = _writeOps[i];
    return requestedOp;
}

// Counts the write ops whose current state is exactly 'opState'.
int BulkWriteOp::numWriteOpsIn(WriteOpState opState) const {
    int numMatching = 0;
    for (const WriteOp& writeOp : _writeOps) {
        if (writeOp.getWriteState() == opState) {
            ++numMatching;
        }
    }
    return numMatching;
}

void BulkWriteOp::noteErrorForRemainingWrites(const Status& status) {
    dassert(!isFinished());
    dassert(numWriteOpsIn(WriteOpState_Pending) == 0);

    const auto ordered = _clientRequest.getOrdered();
    for (auto& writeOp : _writeOps) {
        if (writeOp.getWriteState() < WriteOpState_Completed) {
            const auto opIdx = writeOp.getWriteItem().getItemIndex();
            writeOp.setOpError(write_ops::WriteError(opIdx, status));

            // Only return the first error if we are ordered or are within a transaction.
            if (ordered || _inTransaction)
                break;
        }
    }

    dassert(isFinished());
}

// Processes a response that was successfully received from a shard for 'writeBatch'. The response
// status itself must be OK; the command result it carries may still be an error.
//
// A successful command result is parsed and its reply items (and write concern error, if any) are
// recorded. A failed command result is recorded as an error of the batch; inside a transaction
// this additionally aborts execution (except for WouldChangeOwningShard) and throws if the error
// reply carries the transient transaction error label.
void BulkWriteOp::processChildBatchResponseFromRemote(
    const TargetedWriteBatch& writeBatch,
    const AsyncRequestsSender::Response& response,
    boost::optional<stdx::unordered_map<NamespaceString, TrackedErrors>&> errorsPerNamespace) {
    invariant(response.swResponse.getStatus(), "Response status was unexpectedly not OK");

    auto shardResponse = response.swResponse.getValue();
    LOGV2_DEBUG(7279200,
                4,
                "Processing bulk write response from shard.",
                "shard"_attr = response.shardId,
                "response"_attr = shardResponse.data);

    const auto commandStatus = getStatusFromCommandResult(shardResponse.data);
    if (commandStatus.isOK()) {
        auto parsedReply = BulkWriteCommandReply::parse(shardResponse.data);

        const auto& wcError = parsedReply.getWriteConcernError();
        if (wcError) {
            saveWriteConcernError(response.shardId, wcError.value(), writeBatch);
        }

        // Record any errors contained in the reply and mark the writes of the batch so that they
        // can be re-targeted if needed.
        noteChildBatchResponse(writeBatch, parsedReply, errorsPerNamespace);
        return;
    }

    noteChildBatchError(writeBatch, commandStatus, errorsPerNamespace);

    // Within a transaction every error aborts execution, with the exception of
    // WouldChangeOwningShard: that error is only recorded here as a placeholder, because the
    // update is later processed (as a delete + insert on the corresponding shards in a txn) at
    // the level of ClusterBulkWriteCmd.
    if (!TransactionRouter::get(_opCtx) || commandStatus == ErrorCodes::WouldChangeOwningShard) {
        return;
    }
    _aborted = true;

    const auto errorReply = ErrorReply::parse(shardResponse.data);
    if (!hasTransientTransactionErrorLabel(errorReply)) {
        return;
    }

    // Transient transaction errors are surfaced directly as top level errors so that the client
    // can retry.
    const auto shardInfo = response.shardHostAndPort ? response.shardHostAndPort->toString()
                                                     : writeBatch.getShardId();
    uassertStatusOK(commandStatus.withContext(str::stream() << "Encountered error from "
                                                            << shardInfo
                                                            << " during a transaction"));
}

// Records on 'op' the outcome of 'targetedWrite'. Writes of type WithoutShardKeyWithId go through
// their dedicated response handling; every other write is simply noted as complete.
void BulkWriteOp::noteWriteOpResponse(const std::unique_ptr<TargetedWrite>& targetedWrite,
                                      WriteOp& op,
                                      const BulkWriteCommandReply& commandReply,
                                      size_t numOps,
                                      const boost::optional<const BulkWriteReplyItem&> replyItem) {
    if (op.getWriteType() != WriteType::WithoutShardKeyWithId) {
        op.noteWriteComplete(_opCtx, *targetedWrite, replyItem);
        return;
    }

    // Compute the equivalent of the 'n' field of 'update' and 'delete' command replies:
    // - For an update this is 'nMatched' and not the number of modified documents, since an
    //   update can match a document while its modification is a no-op (e.g. it sets a field to
    //   its current value), and such a write still counts as done.
    // - For a delete this is 'nDeleted'.
    // The write is either an update or a delete, so the sum of both values is the right 'n'.
    const auto numAffected = commandReply.getNMatched() + commandReply.getNDeleted();
    op.noteWriteWithoutShardKeyWithIdResponse(
        _opCtx, *targetedWrite, numAffected, numOps, replyItem);

    // Once the op completed, the current round should be stopped.
    if (op.getWriteState() == WriteOpState_Completed) {
        _shouldStopCurrentRound = true;
    }
}

// Applies the reply 'commandReply' received for 'targetedBatch' to the write ops of the batch.
//
// The summary counters of the reply are added to the running totals, the reply items are matched
// against the writes of the batch (see the comments in the loop for the errorsOnly and staleness
// special cases), and every write op is marked as complete, failed, or reset to ready.
//
// When 'errorsPerNamespace' is provided, errors whose code is being tracked (StaleConfig,
// StaleDbVersion, CannotImplicitlyCreateCollection) are added to the entry of the namespace of
// the failed op, creating the entry on first use. Retried statement ids reported by the reply are
// remembered as well.
void BulkWriteOp::noteChildBatchResponse(
    const TargetedWriteBatch& targetedBatch,
    const BulkWriteCommandReply& commandReply,
    boost::optional<stdx::unordered_map<NamespaceString, TrackedErrors>&> errorsPerNamespace) {
    // The write type of the first write in the batch decides whether the batch is treated as a
    // WithoutShardKeyWithId batch. Responses of such a batch are deferred only when the batch
    // holds more than one op.
    int firstTargetedWriteOpIdx = targetedBatch.getWrites().front()->writeOpRef.first;
    bool isWithoutShardKeyWithIdWrite =
        (_writeOps[firstTargetedWriteOpIdx].getWriteType() == WriteType::WithoutShardKeyWithId);
    bool shouldDeferWriteWithoutShardKeyReponse =
        isWithoutShardKeyWithIdWrite && targetedBatch.getNumOps() > 1;

    const auto replyItems =
        exhaustCursorForReplyItems(_opCtx, targetedBatch.getShardId(), commandReply);

    // Accumulate the summary counters reported for this batch.
    _nInserted += commandReply.getNInserted();
    _nDeleted += commandReply.getNDeleted();
    _nMatched += commandReply.getNMatched();
    _nUpserted += commandReply.getNUpserted();
    _nModified += commandReply.getNModified();

    // To support errorsOnly:true we need to keep separate track of the index in the replyItems
    // array and the index of the write ops we need to mark. This is because with errorsOnly we do
    // not guarantee that writeOps.size == replyItems.size, successful writes do not return a reply.
    // We need to be able to check if the write
    // op has the same index as the next reply we received, which is why we need to track 2
    // different indexes in this loop. Our goal is to iterate the arrays as such
    // writes:  [0, 1, 2, 3] -> [0, 1, 2, 3] -> [0, 1, 2, 3] -> [0, 1, 2, 3]
    //           ^                  ^                  ^                  ^
    // replies: [1, 3]       -> [1, 3]       -> [1, 3]       -> [1, 3]
    //           ^               ^                  ^               ^
    // Only moving forward in replies when we see a matching write op.
    int replyIndex = -1;
    // A batch will fail on an error if the request was sent with ordered:true or we are executing
    // the request within a transaction.
    bool batchWillContinue = !_clientRequest.getOrdered() && !_inTransaction;
    // The most recent error seen while walking the replies, if any.
    boost::optional<write_ops::WriteError> lastError;
    for (int writeOpIdx = 0; writeOpIdx < (int)targetedBatch.getWrites().size(); ++writeOpIdx) {
        const auto& write = targetedBatch.getWrites()[writeOpIdx];
        WriteOp& writeOp = _writeOps[write->writeOpRef.first];

        // This is only possible if we ran an errorsOnly:true command and succeeded all writes.
        if (replyItems.size() == 0) {
            tassert(8266001,
                    "bulkWrite should always get replies when not in errorsOnly",
                    _clientRequest.getErrorsOnly());
            // Note that this runs once per write of the batch, so a deferred entry (without a
            // reply item) is queued for every write.
            if (shouldDeferWriteWithoutShardKeyReponse) {
                if (!_deferredResponses) {
                    _deferredResponses.emplace();
                }
                _deferredResponses->push_back(
                    std::make_tuple(&targetedBatch, commandReply, boost::none));
            }
            noteWriteOpResponse(
                write, writeOp, commandReply, targetedBatch.getNumOps(), boost::none);
            continue;
        }

        replyIndex++;

        // When an error is encountered on an ordered bulk write, it is impossible for any of the
        // remaining operations to have been executed. For that reason we reset them here so they
        // may be retargeted and retried if the error we saw is one we can retry after (e.g.
        // StaleConfig.).
        if (!batchWillContinue && lastError) {
            tassert(8266002,
                    "bulkWrite should not see replies after an error when ordered:true",
                    replyIndex >= (int)replyItems.size());
            writeOp.resetWriteToReady(_opCtx);
            continue;
        }

        // On most errors (for example, a DuplicateKeyError) unordered bulkWrite on a shard attempts
        // to execute following operations even if a preceding operation errored. This isn't true
        // for StaleConfig, StaleDbVersion or ShardCannotRefreshDueToLocksHeld errors. On these
        // errors, since the shard knows that following operations will fail for the same reason, it
        // stops right away (except for unordered timeseries inserts, see SERVER-80796).
        // As a consequence, although typically we can expect the size of replyItems to match the
        // size of the number of operations sent (even in the case of errors), when a
        // staleness/cache busy error is received the size of replyItems will be <= the size of the
        // number of operations. When this is the case, we treat all the remaining operations which
        // may not have a replyItem as having failed due to the same cause.
        // CannotImplicitlyCreateCollection is handled the same way by the check below.
        bool isStaleError = lastError &&
            (lastError->getStatus().code() == ErrorCodes::StaleDbVersion ||
             ErrorCodes::isStaleShardVersionError(lastError->getStatus()) ||
             lastError->getStatus().code() == ErrorCodes::ShardCannotRefreshDueToLocksHeld ||
             lastError->getStatus() == ErrorCodes::CannotImplicitlyCreateCollection);

        if (batchWillContinue && isStaleError && (replyIndex == (int)replyItems.size())) {
            // Decrement the replyIndex so it keeps pointing to the same error (i.e. the
            // last error, which is a staleness error).
            LOGV2_DEBUG(7695304,
                        4,
                        "Duplicating the error for op",
                        "opIdx"_attr = write->writeOpRef.first,
                        "error"_attr = lastError->getStatus());
            replyIndex--;
        }

        // If we are out of replyItems but have more write ops then we must be in an ordered:false
        // errorsOnly:true bulkWrite where we have successful results after the last error.
        // NOTE(review): the tassert message below reads as the opposite of the situation checked
        // here (we ran out of replies, i.e. there are fewer replies than writes) - confirm the
        // intended wording.
        if (replyIndex >= (int)replyItems.size()) {
            tassert(8516601,
                    "bulkWrite received more replies than writes",
                    _clientRequest.getErrorsOnly());
            noteWriteOpResponse(
                write, writeOp, commandReply, targetedBatch.getNumOps(), boost::none);
            continue;
        }

        auto& reply = replyItems[replyIndex];

        // This can only happen when running an errorsOnly:true bulkWrite. We will only receive a
        // bulkWriteReplyItem for an error response when this flag is enabled. This means that
        // any writeOp which does not have a reply must have succeeded.
        // Since both the writeOps and the replies are stored in ascending index order this is
        // a safe assumption.
        // writeOpIdx can be > than reply.getIdx when we are duplicating the last error
        // as described in the block above.
        if (writeOpIdx < reply.getIdx()) {
            tassert(8266003,
                    "bulkWrite should get a reply for every write op when not in errorsOnly mode",
                    _clientRequest.getErrorsOnly());

            noteWriteOpResponse(
                write, writeOp, commandReply, targetedBatch.getNumOps(), boost::none);
            // We need to keep the replyIndex where it is until we see the op matching its index.
            replyIndex--;
            continue;
        }

        // A staleness error will end up being retried by mongos so we should not consider this
        // result final and count towards the maximum size limit for a response.
        if (!isStaleError) {
            _approximateSize += reply.getApproximateSize();
        }

        // Queue a copy of the reply item (re-parsed from its BSON form) together with the batch
        // and the command reply.
        if (shouldDeferWriteWithoutShardKeyReponse) {
            if (!_deferredResponses) {
                _deferredResponses.emplace();
            }
            auto newReply = BulkWriteReplyItem::parse(reply.toBSON());
            _deferredResponses->push_back(std::make_tuple(&targetedBatch, commandReply, newReply));
        }

        if (reply.getStatus().isOK()) {
            noteWriteOpResponse(write, writeOp, commandReply, targetedBatch.getNumOps(), reply);
        } else {
            // Remember the error under the index of the op in the client request and record it on
            // the write op.
            lastError.emplace(write->writeOpRef.first, reply.getStatus());
            writeOp.noteWriteError(_opCtx, *write, *lastError);

            // Look up the namespace of the failed op in the original client request.
            auto origWrite = BulkWriteCRUDOp(_clientRequest.getOps()[write->writeOpRef.first]);
            auto nss = _clientRequest.getNsInfo()[origWrite.getNsInfoIdx()].getNs();

            // We don't always want to track errors per-namespace, e.g. when we encounter errors
            // local to mongos.
            if (errorsPerNamespace) {
                if (errorsPerNamespace->find(nss) == errorsPerNamespace->end()) {
                    TrackedErrors trackedErrors;
                    // Stale routing info errors need to be tracked in order to trigger a refresh of
                    // the targeter. On the other hand, errors caused by the catalog cache being
                    // temporarily unavailable (such as ShardCannotRefreshDueToLocksHeld) are
                    // ignored in this context, since no deduction can be made around possible
                    // placement changes.
                    trackedErrors.startTracking(ErrorCodes::StaleConfig);
                    trackedErrors.startTracking(ErrorCodes::StaleDbVersion);
                    trackedErrors.startTracking(ErrorCodes::CannotImplicitlyCreateCollection);
                    errorsPerNamespace->emplace(nss, trackedErrors);
                }

                auto trackedErrors = errorsPerNamespace->find(nss);
                invariant(trackedErrors != errorsPerNamespace->end());
                if (trackedErrors->second.isTracking(reply.getStatus().code())) {
                    trackedErrors->second.addError(ShardError(write->endpoint, *lastError));
                }
            }
        }
    }

    // Remember the statement ids the reply reported as retried.
    if (auto retriedStmtIds = commandReply.getRetriedStmtIds();
        retriedStmtIds && !retriedStmtIds->empty()) {
        for (auto retriedStmtId : *retriedStmtIds) {
            _retriedStmtIds->insert(retriedStmtId);
        }
    }
}

// Updates the BulkWriteOp state after targeting failed with 'targetStatus' (which must not be
// OK). Recording the error on the affected WriteOp is already done by the targeting logic.
void BulkWriteOp::processTargetingError(const StatusWith<WriteType>& targetStatus) {
    invariant(!targetStatus.isOK());

    // Outside of a transaction there is no BulkWriteOp level state to update.
    if (!_inTransaction) {
        return;
    }
    _aborted = true;

    // A transient transaction error has to be thrown so that it becomes a top level error
    // instead of a mere write error.
    const bool isTransient = isTransientTransactionError(targetStatus.getStatus().code(),
                                                         false /* hasWriteConcernError */,
                                                         false /* isCommitOrAbort */);
    if (isTransient) {
        uassertStatusOK(targetStatus);
    }
}

// Decides whether 'error' (which must not be OK) requires aborting execution, and throws it as a
// top level error where the client needs to see it directly.
void BulkWriteOp::abortIfNeeded(const mongo::Status& error) {
    invariant(!error.isOK());

    // A local shutdown error means mongos itself is shutting down; a remote shutdown error would
    // have arrived with response.swResponse.getStatus() being OK. A local CallbackCanceled error
    // is most likely caused by the same thing, as shutdown stops the executor thread pools and
    // cancels the work scheduled on them. We know of no other source of CallbackCanceled here,
    // but still consult the shutdown flag to make sure the cancellation is due to shutdown. The
    // flag check is deprecated only because modules shouldn't use it to coordinate their own
    // shutdown; merely checking whether a shutdown has started is fine.
    const bool isLocalShutdown = ErrorCodes::isShutdownError(error) ||
        (error == ErrorCodes::CallbackCanceled && globalInShutdownDeprecated());
    if (isLocalShutdown) {
        // Further batches would fail as well, so stop executing even if unordered.
        _aborted = true;

        // Throw at the top level so that the error reaches the client directly with the
        // appropriate error labels,  allowing them to retry it.
        uassertStatusOK(error);
    }

    if (!_inTransaction) {
        return;
    }

    // In a transaction we stop immediately (even for unordered): the transaction is aborted, so
    // outstanding writes are not processed any further, whether or not we throw below.
    _aborted = true;

    // Transient transaction errors must be returned to the client at the top level so that they
    // can retry.
    if (isTransientTransactionError(error.code(), false, false)) {
        uassertStatusOK(error);
    }
}

// Handles a response for 'batch' whose status is not OK, i.e. no results could be obtained from
// the shard. The failure is recorded for the writes of the batch (without per-namespace error
// tracking), logged, and execution is aborted if the error calls for it.
void BulkWriteOp::processLocalChildBatchError(const TargetedWriteBatch& batch,
                                              const AsyncRequestsSender::Response& response) {
    const auto& originalStatus = response.swResponse.getStatus();
    invariant(!originalStatus.isOK(), "Response status was unexpectedly OK");

    // Identify the source by host when one was targeted, otherwise by shard id.
    const auto shardInfo =
        response.shardHostAndPort ? response.shardHostAndPort->toString() : batch.getShardId();
    const char* sourcePhrase =
        response.shardHostAndPort ? "from " : "from failing to target a host in the shard ";

    const Status contextualStatus = originalStatus.withContext(
        str::stream() << "bulkWrite results unavailable " << sourcePhrase << shardInfo);

    noteChildBatchError(batch, contextualStatus, boost::none);

    LOGV2_DEBUG(8048100,
                4,
                "Unable to receive bulkWrite results from shard",
                "shardInfo"_attr = shardInfo,
                "error"_attr = redact(contextualStatus));

    // The abort decision is based on the original status, not the one with added context.
    abortIfNeeded(originalStatus);
}

// Treats the failure 'status' to obtain a response for 'targetedBatch' as a failure of the
// write(s) it contains, by feeding an emulated error reply through noteChildBatchResponse().
// 'errorsPerNamespace' is forwarded unchanged.
void BulkWriteOp::noteChildBatchError(
    const TargetedWriteBatch& targetedBatch,
    const Status& status,
    boost::optional<stdx::unordered_map<NamespaceString, TrackedErrors>&> errorsPerNamespace) {
    // When ordered or in a transaction only a single error is emulated; otherwise one error is
    // emulated for every write of the batch.
    const bool emulateSingleError = _clientRequest.getOrdered() || _inTransaction;
    int numErrors = 1;
    if (!emulateSingleError) {
        numErrors = targetedBatch.getWrites().size();
    }

    const auto failedReply =
        createEmulatedErrorReply(status, numErrors, _clientRequest.getDbName().tenantId());
    noteChildBatchResponse(targetedBatch, failedReply, errorsPerNamespace);
}

// Records the final outcome for the write op at index 'opIdx'.
//
// - Resets the op to ready (cancelling any child ops), saves 'shardWCError' if it is not OK, and
//   adds the size of 'reply' (when present) to the approximate reply size.
// - If 'response' reports no errors, the summary counters matching the op type are bumped from
//   'response' and the op is marked complete with 'reply'; otherwise the op is marked as failed
//   with the status carried by 'reply' and abortIfNeeded() is consulted.
// - Any 'retriedStmtIds' are merged into the accumulated set of retried statement ids.
void BulkWriteOp::noteWriteOpFinalResponse(
    size_t opIdx,
    const boost::optional<BulkWriteReplyItem>& reply,
    const BulkWriteCommandReply& response,
    const ShardWCError& shardWCError,
    const boost::optional<std::vector<StmtId>>& retriedStmtIds) {
    WriteOp& writeOp = _writeOps[opIdx];

    // Cancel all childOps if any.
    writeOp.resetWriteToReady(_opCtx);

    if (!shardWCError.error.toStatus().isOK()) {
        saveWriteConcernError(shardWCError);
    }

    if (reply) {
        _approximateSize += reply->getApproximateSize();
    }

    if (response.getNErrors() == 0) {
        if (writeOp.getWriteItem().getOpType() == BatchedCommandRequest::BatchType_Insert) {
            _nInserted += response.getNInserted();
        } else if (writeOp.getWriteItem().getOpType() == BatchedCommandRequest::BatchType_Delete) {
            _nDeleted += response.getNDeleted();
        } else {
            _nModified += response.getNModified();
            _nMatched += response.getNMatched();
            _nUpserted += response.getNUpserted();
        }
        writeOp.setOpComplete(reply);
    } else {
        // NOTE(review): this branch dereferences 'reply' unconditionally, i.e. it assumes a reply
        // item is always supplied when 'response' reports errors - confirm against callers.
        auto writeError = write_ops::WriteError(opIdx, reply->getStatus());
        writeOp.setOpError(writeError);
        abortIfNeeded(reply->getStatus());
    }

    if (retriedStmtIds && !retriedStmtIds->empty()) {
        // '_retriedStmtIds' is optional and may still be disengaged the first time retried
        // statement ids are reported, so engage it before inserting into it.
        if (!_retriedStmtIds) {
            _retriedStmtIds.emplace();
        }
        for (auto retriedStmtId : *retriedStmtIds) {
            _retriedStmtIds->insert(retriedStmtId);
        }
    }
}

// Builds the final reply for the bulkWrite: the per-op reply items, the summary counters, the
// merged write concern error (if any) and the retried statement ids.
//
// While walking the write ops this also records router op counters / update metrics for every op
// that completed or errored, and counts 'nErrors'. For an ordered request or a transaction the
// walk stops at the first errored op.
BulkWriteReplyInfo BulkWriteOp::generateReplyInfo() {
    dassert(isFinished());
    std::vector<BulkWriteReplyItem> replyItems;
    SummaryFields summary;
    summary.nInserted = _nInserted;
    summary.nDeleted = _nDeleted;
    summary.nMatched = _nMatched;
    summary.nModified = _nModified;
    summary.nUpserted = _nUpserted;
    replyItems.reserve(_writeOps.size());

    // Per-namespace scratch state handed to populateCollectionUUIDMismatch() below; one slot per
    // entry in the request's nsInfo array.
    std::vector<boost::optional<std::string>> actualCollections(_clientRequest.getNsInfo().size(),
                                                                boost::none);
    std::deque<bool> hasContactedPrimaryShard(_clientRequest.getNsInfo().size(), false);

    const auto ordered = _clientRequest.getOrdered();
    for (auto& writeOp : _writeOps) {
        // If we encountered an error causing us to abort execution we may not have waited for
        // responses to all outstanding requests.
        dassert(writeOp.getWriteState() != WriteOpState_Pending || _aborted);
        auto writeOpState = writeOp.getWriteState();

        // Metrics: only ops that reached a terminal state (completed or error) are counted.
        if (writeOpState == WriteOpState_Completed || writeOpState == WriteOpState_Error) {
            switch (writeOp.getWriteItem().getOpType()) {
                case BatchedCommandRequest::BatchType_Insert:
                    serviceOpCounters(ClusterRole::RouterServer).gotInsert();
                    break;
                case BatchedCommandRequest::BatchType_Update: {
                    // It is easier to handle the metric in handleWouldChangeOwningShardError for
                    // WouldChangeOwningShard. See getWouldChangeOwningShardErrorInfo for the batch
                    // size check. In the case of a WouldChangeOwningShard outside of a transaction,
                    // we will re-run cluster::bulkWrite so generateReplyInfo() gets called twice.
                    if (writeOpState != WriteOpState_Error ||
                        writeOp.getOpError().getStatus() != ErrorCodes::WouldChangeOwningShard ||
                        _writeOps.size() > 1) {
                        serviceOpCounters(ClusterRole::RouterServer).gotUpdate();
                    }

                    UpdateOpRef updateRef = writeOp.getWriteItem().getUpdateOp();

                    // Resolve the namespace the update targets from the original client request.
                    const auto opIdx = writeOp.getWriteItem().getItemIndex();
                    const auto& bulkWriteOp = BulkWriteCRUDOp(_clientRequest.getOps()[opIdx]);
                    const auto& ns = _clientRequest.getNsInfo()[bulkWriteOp.getNsInfoIdx()].getNs();
                    // 'isMulti' is set to false as the metrics for multi updates were registered
                    // for each operation individually.
                    bulk_write_common::incrementBulkWriteUpdateMetrics(getQueryCounters(_opCtx),
                                                                       ClusterRole::RouterServer,
                                                                       updateRef.getUpdateMods(),
                                                                       ns,
                                                                       updateRef.getArrayFilters(),
                                                                       false /* isMulti */);
                    break;
                }
                case BatchedCommandRequest::BatchType_Delete:
                    serviceOpCounters(ClusterRole::RouterServer).gotDelete();
                    break;
                default:
                    MONGO_UNREACHABLE
            }
        }

        if (writeOpState == WriteOpState_Completed) {
            // A completed op contributes a reply item only if one was stored on it.
            if (writeOp.hasBulkWriteReplyItem()) {
                replyItems.push_back(writeOp.takeBulkWriteReplyItem());
            }
        } else if (writeOpState == WriteOpState_Error) {
            auto nsInfoIdx =
                BulkWriteCRUDOp(_clientRequest.getOps()[writeOp.getWriteItem().getItemIndex()])
                    .getNsInfoIdx();

            // Need to make a modifiable copy of the error.
            auto error = writeOp.getOpError();

            // If the error is not a collection UUID error then this function will not modify the
            // error, so we can call this on every iteration without checks.
            populateCollectionUUIDMismatch(_opCtx,
                                           &error,
                                           &actualCollections[nsInfoIdx],
                                           &hasContactedPrimaryShard[nsInfoIdx]);

            auto replyItem =
                BulkWriteReplyItem(writeOp.getWriteItem().getItemIndex(), error.getStatus());

            if (writeOp.hasBulkWriteReplyItem()) {
                // Carry over the counts from the reply item already stored on the op into the
                // error reply item.
                auto successesReplyItem = writeOp.takeBulkWriteReplyItem();

                replyItem.setN(successesReplyItem.getN());
                replyItem.setNModified(successesReplyItem.getNModified());
                replyItem.setUpserted(successesReplyItem.getUpserted());
            } else {
                // If there was no previous successful response we still need to set nModified=0
                // for an update op since we lose that information in the BulkWriteReplyItem ->
                // WriteError transformation.
                if (writeOp.getWriteItem().getOpType() == BatchedCommandRequest::BatchType_Update) {
                    replyItem.setNModified(0);
                }
            }

            replyItems.emplace_back(replyItem);

            // We only count nErrors at the end of the command because it is simpler and less error
            // prone. If we counted errors as we encountered them we could hit edge cases where we
            // accidentally count the same error multiple times. At this point in the execution we
            // have already resolved any repeat errors.
            summary.nErrors++;
            // Only return the first error if we are ordered or are in a transaction.
            if (ordered || _inTransaction)
                break;
        }
    }
    // Flatten the accumulated retried statement ids (if any were recorded) into a vector for the
    // reply.
    std::vector<StmtId> retriedStmtIds;
    if (_retriedStmtIds.has_value()) {
        for (auto stmtId : *_retriedStmtIds) {
            retriedStmtIds.emplace_back(stmtId);
        }
    }
    return {std::move(replyItems), summary, generateWriteConcernError(), retriedStmtIds};
}

// Stores the write concern error 'wcError' reported by 'shardId' for 'writeBatch'. Errors for
// WriteType::WithoutShardKeyWithId writes are parked in '_deferredWCErrors' keyed by op index;
// all other errors go straight into '_wcErrors'.
void BulkWriteOp::saveWriteConcernError(ShardId shardId,
                                        BulkWriteWriteConcernError wcError,
                                        const TargetedWriteBatch& writeBatch) {
    WriteConcernErrorDetail detail;
    detail.setStatus(Status(ErrorCodes::Error(wcError.getCode()), wcError.getErrmsg()));

    // A WriteType::WithoutShardKeyWithId write always travels in a batch of its own, so looking
    // at the first write of the batch is enough to classify the whole batch.
    auto firstOpIdx = writeBatch.getWrites().front()->writeOpRef.first;
    const bool isWithoutShardKeyWithId =
        _writeOps[firstOpIdx].getWriteType() == WriteType::WithoutShardKeyWithId;

    if (!isWithoutShardKeyWithId) {
        _wcErrors.push_back(ShardWCError(shardId, detail));
        return;
    }

    // Lazily create the deferred-error container on first use.
    if (!_deferredWCErrors) {
        _deferredWCErrors.emplace();
    }
    (*_deferredWCErrors)[firstOpIdx].push_back(ShardWCError(shardId, detail));
}

// Appends an already-built per-shard write concern error to '_wcErrors'.
void BulkWriteOp::saveWriteConcernError(ShardWCError shardWCError) {
    _wcErrors.emplace_back(std::move(shardWCError));
}

// Merges the write concern errors collected in '_wcErrors' into a single
// BulkWriteWriteConcernError. Returns boost::none when the merge yields nothing.
boost::optional<BulkWriteWriteConcernError> BulkWriteOp::generateWriteConcernError() const {
    auto mergedWce = mergeWriteConcernErrors(_wcErrors);
    if (!mergedWce) {
        return boost::none;
    }

    // Copy the code and reason of the merged error into the reply-facing type.
    const auto mergedStatus = mergedWce->toStatus();
    auto totalWcError = BulkWriteWriteConcernError();
    totalWcError.setCode(mergedStatus.code());
    totalWcError.setErrmsg(mergedStatus.reason());

    return boost::optional<BulkWriteWriteConcernError>(totalWcError);
}

// For every namespace in the client request, relays the tracked StaleConfig, StaleDbVersion and
// CannotImplicitlyCreateCollection errors to the targeter at the same index. Stale config / db
// version errors additionally flag that a targeter has seen a stale shard response.
void BulkWriteOp::noteStaleResponses(
    const std::vector<std::unique_ptr<NSTargeter>>& targeters,
    const stdx::unordered_map<NamespaceString, TrackedErrors>& errorsPerNamespace) {
    const auto& nsInfoEntries = _clientRequest.getNsInfo();
    for (size_t nsIdx = 0; nsIdx < nsInfoEntries.size(); ++nsIdx) {
        const auto& nsEntry = nsInfoEntries.at(nsIdx);
        const auto& targeter = targeters.at(nsIdx);

        // The lookup key has to be the namespace from the original client request rather than
        // the targeter's namespace, because the latter could be pointing to the bucket collection
        // for tracked timeseries collections.
        const auto trackedIt = errorsPerNamespace.find(nsEntry.getNs());
        if (trackedIt == errorsPerNamespace.cend()) {
            continue;
        }
        const auto& trackedErrors = trackedIt->second;

        for (const auto& staleConfigErr : trackedErrors.getErrors(ErrorCodes::StaleConfig)) {
            LOGV2_DEBUG(7279201,
                        4,
                        "Noting stale config response.",
                        "shardId"_attr = staleConfigErr.endpoint.shardName,
                        "status"_attr = staleConfigErr.error.getStatus());
            targeter->noteStaleCollVersionResponse(
                _opCtx, *staleConfigErr.error.getStatus().extraInfo<StaleConfigInfo>());
            setTargeterHasStaleShardResponse(true);
        }

        for (const auto& staleDbErr : trackedErrors.getErrors(ErrorCodes::StaleDbVersion)) {
            LOGV2_DEBUG(7279202,
                        4,
                        "Noting stale database response.",
                        "shardId"_attr = staleDbErr.endpoint.shardName,
                        "status"_attr = staleDbErr.error.getStatus());
            targeter->noteStaleDbVersionResponse(
                _opCtx, *staleDbErr.error.getStatus().extraInfo<StaleDbRoutingVersion>());
            setTargeterHasStaleShardResponse(true);
        }

        // Unlike the two loops above, this one does not touch the stale-shard-response flag.
        for (const auto& createErr :
             trackedErrors.getErrors(ErrorCodes::CannotImplicitlyCreateCollection)) {
            LOGV2_DEBUG(8037203,
                        0,
                        "Noting cannotImplicitlyCreateCollection response.",
                        "status"_attr = createErr.error.getStatus());
            targeter->noteCannotImplicitlyCreateCollectionResponse(
                _opCtx,
                *createErr.error.getStatus().extraInfo<CannotImplicitlyCreateCollectionInfo>());
        }
    }
}

// Wraps up a round of WithoutShardKeyWithId writes: applies the deferred shard responses to
// their write ops, promotes deferred write concern errors of finished ops into '_wcErrors', and
// clears '_shouldStopCurrentRound'.
void BulkWriteOp::finishExecutingWriteWithoutShardKeyWithId() {
    if (_deferredResponses) {
        for (unsigned long idx = 0; idx < _deferredResponses->size(); idx++) {
            auto [targetedWriteBatch, response, replyItem] = _deferredResponses->at(idx);
            LOGV2_DEBUG(864041,
                        4,
                        "Processing deferred response for WithoutShardKeyWithId: ",
                        "idx"_attr = idx,
                        "shardId"_attr = targetedWriteBatch->getShardId().toString(),
                        "responseN"_attr = response.getNModified());

            const auto& batchWrites = targetedWriteBatch->getWrites();
            const auto& write = batchWrites[idx % batchWrites.size()];
            WriteOp& writeOp = _writeOps[write->writeOpRef.first];

            // A stale shard response was seen: put the op back to ready (unless it already is)
            // and skip applying the response.
            if (targeterHasStaleShardResponse()) {
                if (writeOp.getWriteState() != WriteOpState_Ready) {
                    writeOp.resetWriteToReady(_opCtx);
                }
                continue;
            }

            // Ops that already errored are left untouched.
            if (writeOp.getWriteState() == WriteOpState_Error) {
                continue;
            }

            auto nVal = response.getNModified() + response.getNDeleted();
            auto repl = replyItem.has_value()
                ? boost::optional<const BulkWriteReplyItem&>(replyItem.value())
                : boost::optional<const BulkWriteReplyItem&>(boost::none);
            writeOp.noteWriteWithoutShardKeyWithIdResponse(
                _opCtx, *write, nVal, targetedWriteBatch->getNumOps(), repl);
        }
        _deferredResponses = boost::none;
    }

    // See _deferredWCErrors for details.
    if (_deferredWCErrors) {
        for (auto& [deferredOpIdx, deferredErrors] : *_deferredWCErrors) {
            auto& op = _writeOps[deferredOpIdx];
            invariant(op.getWriteType() == WriteType::WithoutShardKeyWithId);

            // If we are here for any op it means that the whole batch is retried.
            if (op.getWriteState() < WriteOpState_Completed) {
                break;
            }
            _wcErrors.insert(_wcErrors.end(), deferredErrors.begin(), deferredErrors.end());
        }
        _deferredWCErrors = boost::none;
    }

    // Clearing _shouldStopCurrentRound here lets any pending writeOps be processed. Whether to
    // stop retrying the current writeOp has already been decided before this point.
    _shouldStopCurrentRound = false;
}

// Forwards 'targetedBatch', together with the client request, to the stats tracker '_stats'.
void BulkWriteOp::noteTargetedShard(const TargetedWriteBatch& targetedBatch) {
    _stats.noteTargetedShard(_clientRequest, targetedBatch);
}

// Forwards the number of shards owning chunks for the namespace at index 'nsIdx' to the stats
// tracker '_stats'.
void BulkWriteOp::noteNumShardsOwningChunks(size_t nsIdx, int nShardsOwningChunks) {
    _stats.noteNumShardsOwningChunks(nsIdx, nShardsOwningChunks);
}

// Forwards 'targetedBatch', the namespace index 'nsIdx' and 'nShardsOwningChunks', together with
// the client request, to the stats tracker '_stats'.
void BulkWriteOp::noteTwoPhaseWriteProtocol(const TargetedWriteBatch& targetedBatch,
                                            size_t nsIdx,
                                            int nShardsOwningChunks) {
    _stats.noteTwoPhaseWriteProtocol(_clientRequest, targetedBatch, nsIdx, nShardsOwningChunks);
}

// Rewrites the ops of 'origCmdRequest' so that every insert whose document has no '_id' field
// gets a freshly generated OID as its first field. All other ops are carried over unchanged and
// the relative order of the ops is preserved.
void addIdsForInserts(BulkWriteCommandRequest& origCmdRequest) {
    std::vector<BulkWriteOpVariant> newOps;
    newOps.reserve(origCmdRequest.getOps().size());

    for (const auto& op : origCmdRequest.getOps()) {
        auto crudOp = BulkWriteCRUDOp(op);
        if (crudOp.getType() == BulkWriteCRUDOp::kInsert &&
            crudOp.getInsert()->getDocument()["_id"].eoo()) {
            auto insert = crudOp.getInsert();
            auto doc = insert->getDocument();
            // Put the generated '_id' first, followed by the original fields.
            BSONObjBuilder idInsertB;
            idInsertB.append("_id", OID::gen());
            idInsertB.appendElements(doc);
            auto newDoc = idInsertB.obj();
            auto newOp = BulkWriteInsertOp(insert->getNsInfoIdx(), std::move(newDoc));
            newOps.push_back(std::move(newOp));
        } else {
            // 'op' refers to a const element of the request, so it can only be copied; a
            // std::move here would silently degrade to a copy anyway.
            newOps.push_back(op);
        }
    }

    // Hand the rebuilt vector over instead of copying it a second time.
    origCmdRequest.setOps(std::move(newOps));
}


}  // namespace bulk_write_exec

}  // namespace mongo