PYTHON-5731 - Server selection deprioritization only for overload errors on replica sets

This commit is contained in:
Noah Stapp 2026-02-19 13:34:25 -05:00
parent b60d266ad7
commit 003796fdbe
4 changed files with 154 additions and 2 deletions

View File

@ -2825,7 +2825,11 @@ class _ClientConnectionRetryable(Generic[T]):
if self._last_error is None:
self._last_error = exc
if self._server is not None:
# Deprioritize the server for the retry only when a server was actually
# selected AND either the topology is sharded or the error carries the
# SystemOverloadedError label.
# The inner parentheses are required: `and` binds tighter than `or`, so
# without them an overload-labelled error would append `self._server`
# even when it is None.
if self._server is not None and (
    self._client.topology_description.topology_type_name == "Sharded"
    or exc.has_error_label("SystemOverloadedError")
):
    self._deprioritized_servers.append(self._server)
def _is_not_eligible_for_retry(self) -> bool:

View File

@ -2815,7 +2815,11 @@ class _ClientConnectionRetryable(Generic[T]):
if self._last_error is None:
self._last_error = exc
if self._server is not None:
# Deprioritize the server for the retry only when a server was actually
# selected AND either the topology is sharded or the error carries the
# SystemOverloadedError label.
# The inner parentheses are required: `and` binds tighter than `or`, so
# without them an overload-labelled error would append `self._server`
# even when it is None.
if self._server is not None and (
    self._client.topology_description.topology_type_name == "Sharded"
    or exc.has_error_label("SystemOverloadedError")
):
    self._deprioritized_servers.append(self._server)
def _is_not_eligible_for_retry(self) -> bool:

View File

@ -261,6 +261,78 @@ class TestRetryableReads(AsyncIntegrationTest):
self.assertEqual(command_docs[0]["lsid"], command_docs[1]["lsid"])
self.assertIsNot(command_docs[0], command_docs[1])
@async_client_context.require_replica_set
@async_client_context.require_failCommand_fail_point
async def test_retryable_reads_caused_by_overload_errors_are_retried_on_a_different_replicaset_server_when_one_is_available(
    self
):
    """Check that a `find` failing with an overload error is retried elsewhere.

    A one-shot fail point labels the first `find` with RetryableError and
    SystemOverloadedError; the test then expects exactly one failed and one
    succeeded command event, reported for different connection ids.
    """
    # NOTE(review): presumably needs more than one readable replica set member
    # for a "different server" to exist - confirm the decorator guarantees it.
    listener = OvertCommandListener()
    # Create a client `client` with `retryReads=true`,
    # `readPreference=primaryPreferred`, and command event monitoring enabled.
    client = await self.async_rs_or_single_client(
        event_listeners=[listener], retryReads=True, readPreference="primaryPreferred"
    )
    # Configure a fail point with the RetryableError and SystemOverloadedError error labels.
    command_args = {
        "configureFailPoint": "failCommand",
        "mode": {"times": 1},
        "data": {
            "failCommands": ["find"],
            "errorLabels": ["RetryableError", "SystemOverloadedError"],
            "errorCode": 6,
        },
    }
    await async_set_fail_point(client, command_args)
    # Reset the command event monitor to clear the fail point command from its stored events.
    listener.reset()
    # Execute a `find` command with `client`.
    await client.t.t.find_one({})
    # Assert that one failed command event and one successful command event occurred.
    self.assertEqual(len(listener.failed_events), 1)
    self.assertEqual(len(listener.succeeded_events), 1)
    # Assert that both events occurred on different servers. A unittest
    # assertion is used instead of a bare `assert` so the check is not
    # stripped under `python -O` and both values are reported on failure.
    self.assertNotEqual(
        listener.failed_events[0].connection_id,
        listener.succeeded_events[0].connection_id,
    )
@async_client_context.require_replica_set
@async_client_context.require_failCommand_fail_point
async def test_retryable_reads_error_are_retried_on_same_replicaset_server(self):
    """Check that a `find` failing without an overload label is retried in place.

    A one-shot fail point labels the first `find` with RetryableError only;
    the test then expects exactly one failed and one succeeded command event,
    both reported for the same connection id.
    """
    listener = OvertCommandListener()
    # Create a client `client` with `retryReads=true`,
    # `readPreference=primaryPreferred`, and command event monitoring enabled.
    client = await self.async_rs_or_single_client(
        event_listeners=[listener], retryReads=True, readPreference="primaryPreferred"
    )
    # Configure a fail point with the RetryableError error label.
    command_args = {
        "configureFailPoint": "failCommand",
        "mode": {"times": 1},
        "data": {
            "failCommands": ["find"],
            "errorLabels": ["RetryableError"],
            "errorCode": 6,
        },
    }
    await async_set_fail_point(client, command_args)
    # Reset the command event monitor to clear the fail point command from its stored events.
    listener.reset()
    # Execute a `find` command with `client`.
    await client.t.t.find_one({})
    # Assert that one failed command event and one successful command event occurred.
    self.assertEqual(len(listener.failed_events), 1)
    self.assertEqual(len(listener.succeeded_events), 1)
    # Assert that both events occurred on the same server. A unittest
    # assertion is used instead of a bare `assert` so the check is not
    # stripped under `python -O` and both values are reported on failure.
    self.assertEqual(
        listener.failed_events[0].connection_id,
        listener.succeeded_events[0].connection_id,
    )
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
unittest.main()

View File

@ -259,6 +259,78 @@ class TestRetryableReads(IntegrationTest):
self.assertEqual(command_docs[0]["lsid"], command_docs[1]["lsid"])
self.assertIsNot(command_docs[0], command_docs[1])
@client_context.require_replica_set
@client_context.require_failCommand_fail_point
def test_retryable_reads_caused_by_overload_errors_are_retried_on_a_different_replicaset_server_when_one_is_available(
    self
):
    """Check that a `find` failing with an overload error is retried elsewhere.

    A one-shot fail point labels the first `find` with RetryableError and
    SystemOverloadedError; the test then expects exactly one failed and one
    succeeded command event, reported for different connection ids.
    """
    # NOTE(review): presumably needs more than one readable replica set member
    # for a "different server" to exist - confirm the decorator guarantees it.
    listener = OvertCommandListener()
    # Create a client `client` with `retryReads=true`,
    # `readPreference=primaryPreferred`, and command event monitoring enabled.
    client = self.rs_or_single_client(
        event_listeners=[listener], retryReads=True, readPreference="primaryPreferred"
    )
    # Configure a fail point with the RetryableError and SystemOverloadedError error labels.
    command_args = {
        "configureFailPoint": "failCommand",
        "mode": {"times": 1},
        "data": {
            "failCommands": ["find"],
            "errorLabels": ["RetryableError", "SystemOverloadedError"],
            "errorCode": 6,
        },
    }
    set_fail_point(client, command_args)
    # Reset the command event monitor to clear the fail point command from its stored events.
    listener.reset()
    # Execute a `find` command with `client`.
    client.t.t.find_one({})
    # Assert that one failed command event and one successful command event occurred.
    self.assertEqual(len(listener.failed_events), 1)
    self.assertEqual(len(listener.succeeded_events), 1)
    # Assert that both events occurred on different servers. A unittest
    # assertion is used instead of a bare `assert` so the check is not
    # stripped under `python -O` and both values are reported on failure.
    self.assertNotEqual(
        listener.failed_events[0].connection_id,
        listener.succeeded_events[0].connection_id,
    )
@client_context.require_replica_set
@client_context.require_failCommand_fail_point
def test_retryable_reads_error_are_retried_on_same_replicaset_server(self):
    """Check that a `find` failing without an overload label is retried in place.

    A one-shot fail point labels the first `find` with RetryableError only;
    the test then expects exactly one failed and one succeeded command event,
    both reported for the same connection id.
    """
    listener = OvertCommandListener()
    # Create a client `client` with `retryReads=true`,
    # `readPreference=primaryPreferred`, and command event monitoring enabled.
    client = self.rs_or_single_client(
        event_listeners=[listener], retryReads=True, readPreference="primaryPreferred"
    )
    # Configure a fail point with the RetryableError error label.
    command_args = {
        "configureFailPoint": "failCommand",
        "mode": {"times": 1},
        "data": {
            "failCommands": ["find"],
            "errorLabels": ["RetryableError"],
            "errorCode": 6,
        },
    }
    set_fail_point(client, command_args)
    # Reset the command event monitor to clear the fail point command from its stored events.
    listener.reset()
    # Execute a `find` command with `client`.
    client.t.t.find_one({})
    # Assert that one failed command event and one successful command event occurred.
    self.assertEqual(len(listener.failed_events), 1)
    self.assertEqual(len(listener.succeeded_events), 1)
    # Assert that both events occurred on the same server. A unittest
    # assertion is used instead of a bare `assert` so the check is not
    # stripped under `python -O` and both values are reported on failure.
    self.assertEqual(
        listener.failed_events[0].connection_id,
        listener.succeeded_events[0].connection_id,
    )
# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
unittest.main()