From b92a12c682e91734d01e9a1c406ae862037af0a0 Mon Sep 17 00:00:00 2001 From: Daniel Moody Date: Wed, 6 Aug 2025 13:32:52 -0500 Subject: [PATCH] SERVER-108845 switch back to rapidyaml (#39670) GitOrigin-RevId: 90eb45aaa0f42fe62b7ae8180e82a9c6e17f9ce5 --- buildscripts/ciconfig/BUILD.bazel | 1 + buildscripts/ciconfig/evergreen.py | 8 ++-- buildscripts/ciconfig/yaml_load.py | 46 +++++++++++++++++++ .../tests/resmoke_end2end/test_resmoke.py | 2 +- poetry.lock | 40 +++++++++++++++- pyproject.toml | 5 ++ 6 files changed, 96 insertions(+), 6 deletions(-) create mode 100644 buildscripts/ciconfig/yaml_load.py diff --git a/buildscripts/ciconfig/BUILD.bazel b/buildscripts/ciconfig/BUILD.bazel index b80c6e14e2b..7488fb682e1 100644 --- a/buildscripts/ciconfig/BUILD.bazel +++ b/buildscripts/ciconfig/BUILD.bazel @@ -5,6 +5,7 @@ py_library( srcs = [ "__init__.py", "evergreen.py", + "yaml_load.py", ], visibility = ["//visibility:public"], deps = [ diff --git a/buildscripts/ciconfig/evergreen.py b/buildscripts/ciconfig/evergreen.py index b624671b9ae..47b1e48df23 100644 --- a/buildscripts/ciconfig/evergreen.py +++ b/buildscripts/ciconfig/evergreen.py @@ -15,7 +15,8 @@ import sys from typing import Any, Dict, List, Optional, Set import structlog -import yaml + +from buildscripts.ciconfig.yaml_load import yaml_load ENTERPRISE_MODULE_NAME = "enterprise" ASAN_SIGNATURE = "detect_leaks=1" @@ -80,10 +81,11 @@ def parse_evergreen_file(path, evergreen_binary="evergreen"): path, result.stdout, result.stderr ) ) - config = yaml.safe_load(result.stdout) + config: dict = yaml_load(result.stdout) else: with open(path, "r", encoding="utf8") as fstream: - config = yaml.safe_load(fstream) + data = fstream.read() + config: dict = yaml_load(data) return EvergreenProjectConfig(config) diff --git a/buildscripts/ciconfig/yaml_load.py b/buildscripts/ciconfig/yaml_load.py new file mode 100644 index 00000000000..505ae21af99 --- /dev/null +++ b/buildscripts/ciconfig/yaml_load.py @@ -0,0 +1,46 @@ +from typing 
import Any + +# PyYaml is very easy to use, but it is very slow. This is a problem for us since the main evergreen.yml file is quite large. +# PyYaml was taking over 10s to just load the file, which needed to be done every single task and so was a significant bottleneck. +# We use the rapidyaml library instead, which is much more low level but much faster (sub 1s to load the same file). This is not a +# full drop in replacement for PyYaml and does not fully satisfy the yaml spec, but it is sufficient for our needs. + + +try: + import ryml + + def ryml_to_dict(tree: ryml.Tree, index: int = 0) -> Any: + """Walk through the ryml tree and convert nodes.""" + if tree.is_map(index): + return { + str(tree.key(child_index), "utf8"): ryml_to_dict(tree, child_index) + for child_index in ryml.children(tree, index) + } + elif tree.is_seq(index): + return [ryml_to_dict(tree, child_index) for child_index in ryml.children(tree, index)] + else: + decoded_value = tree.val(index).tobytes().decode("utf8") + if decoded_value == "true": + return True + elif decoded_value == "false": + return False + elif decoded_value == "null" or decoded_value == "~": + return None + try: + int_value = int(decoded_value) + return int_value + except ValueError: + pass + try: + float_value = float(decoded_value) + return float_value + except ValueError: + pass + return decoded_value + + def yaml_load(data: str) -> dict: + """Safely load YAML data.""" + return ryml_to_dict(ryml.parse_in_arena(data)) + +except ImportError: + from yaml import safe_load as yaml_load # noqa diff --git a/buildscripts/tests/resmoke_end2end/test_resmoke.py b/buildscripts/tests/resmoke_end2end/test_resmoke.py index b997486941b..3335c8f028a 100644 --- a/buildscripts/tests/resmoke_end2end/test_resmoke.py +++ b/buildscripts/tests/resmoke_end2end/test_resmoke.py @@ -808,7 +808,7 @@ class TestEvergreenYML(unittest.TestCase): generate_func = task.find_func_command("generate resmoke tasks") if ( generate_func is None - or 
get_dict_value(generate_func, ["vars", "is_jstestfuzz"]) != "true" + or get_dict_value(generate_func, ["vars", "is_jstestfuzz"]) is not True ): continue diff --git a/poetry.lock b/poetry.lock index d337c0b7813..06a506f5181 100644 --- a/poetry.lock +++ b/poetry.lock @@ -714,6 +714,22 @@ wrapt = ">=1.10,<2" [package.extras] dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "setuptools", "tox"] +[[package]] +name = "deprecation" +version = "2.1.0" +description = "A library to handle automated deprecations" +optional = false +python-versions = "*" +groups = ["powercycle-incompatible"] +markers = "(platform_machine != \"s390x\" and platform_machine != \"ppc64le\" or platform_machine == \"s390x\" or platform_machine == \"ppc64le\") and platform_system != \"Windows\"" +files = [ + {file = "deprecation-2.1.0-py2.py3-none-any.whl", hash = "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a"}, + {file = "deprecation-2.1.0.tar.gz", hash = "sha256:72b3bde64e5d778694b0cf68178aed03d15e15477116add3fb773e581f9518ff"}, +] + +[package.dependencies] +packaging = "*" + [[package]] name = "distlib" version = "0.3.9" @@ -2563,7 +2579,7 @@ version = "24.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" -groups = ["compile", "export", "idl", "testing"] +groups = ["compile", "export", "idl", "powercycle-incompatible", "testing"] markers = "platform_machine != \"s390x\" and platform_machine != \"ppc64le\" or platform_machine == \"s390x\" or platform_machine == \"ppc64le\"" files = [ {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, @@ -3601,6 +3617,26 @@ files = [ [package.extras] all = ["numpy"] +[[package]] +name = "rapidyaml" +version = "0.0.post1671" +description = "Rapid YAML - a library to parse and emit YAML, and do it fast" +optional = false +python-versions = ">=3.6" +groups = ["powercycle-incompatible"] +markers = "(platform_machine != 
\"s390x\" and platform_machine != \"ppc64le\" or platform_machine == \"s390x\" or platform_machine == \"ppc64le\") and platform_system != \"Windows\"" +files = [] +develop = false + +[package.dependencies] +deprecation = "*" + +[package.source] +type = "git" +url = "https://github.com/mongodb-forks/rapidyaml.git" +reference = "a5d485fd44719e1c03e059177fc1f695fc462b66" +resolved_reference = "a5d485fd44719e1c03e059177fc1f695fc462b66" + [[package]] name = "referencing" version = "0.36.2" @@ -5527,4 +5563,4 @@ libdeps = ["cxxfilt", "eventlet", "flask", "flask-cors", "gevent", "lxml", "prog [metadata] lock-version = "2.1" python-versions = ">=3.10,<4.0" -content-hash = "5dedf21a2566f81a279d675c3aecc911150080b38f1ae9213ad0773f6c29ff97" +content-hash = "50627e8fc8d530805753cda3bea5c4585817f04364ae29fe4f7c23fdf14060fb" diff --git a/pyproject.toml b/pyproject.toml index 52cdd5240b1..9e11e6aca22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,6 +75,11 @@ typing-extensions = "^4.12.2" typer = "^0.12.3" tenacity = "^9.0.0" +# specifically rapidyaml is broken on atlas distros with powercycle. +# currently we exclude this when running poetry install in powercycle. +[tool.poetry.group.powercycle-incompatible.dependencies] +rapidyaml = {git = "https://github.com/mongodb-forks/rapidyaml.git@master", rev = "a5d485fd44719e1c03e059177fc1f695fc462b66", markers = "platform_system != 'Windows'"} + [tool.poetry.group.export.dependencies] pipx = "1.6.0" # TODO: Add in pex as we move forward with this