mongo/buildscripts/bazel_custom_formatter.py
Nick Jefferies a4130ea9ab SERVER-119906: Enable UP006 ruff rule (#48284)
GitOrigin-RevId: 2069fa7bda111a89d4a9a43a56e71f06cc2e9a7c
2026-02-20 18:55:17 +00:00

646 lines
21 KiB
Python

import argparse
import hashlib
import json
import os
import subprocess
import sys
import time
from collections import deque
from pathlib import Path
sys.path.append(".")
from buildscripts.install_bazel import install_bazel, install_buildozer
from buildscripts.simple_report import make_report, put_report, try_combine_reports
# Maps each unittest group name to its 1-based rank, so groups can be sorted
# in their natural order ("first" < "second" < ... < "eighth").
groups_sort_keys = {
    name: rank
    for rank, name in enumerate(
        ("first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth"),
        start=1,
    )
}
def find_group(unittest_paths):
    """Bucket unittest paths into eight named groups and return the result as JSON.

    Each path is normalized (":" and "\\" become "/", a leading "//" is
    dropped), a leading "lib" and everything from the first "." onward are
    removed from the file name, and the resulting "dir/name" string is hashed
    with SHA-256. The first hex digit of the digest selects the group, two
    digits per group ("0"/"1" -> "first", ..., "e"/"f" -> "eighth").

    Returns a JSON object (indent=4) mapping group name to the list of the
    original, un-normalized paths, in input order.

    Prints an error and exits with status 1 if a normalized path does not
    start with "src/".
    """
    group_names = ("first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth")
    # Consecutive pairs of hex digits share one group name.
    digit_to_group = {
        digit: group_names[index // 2] for index, digit in enumerate("0123456789abcdef")
    }

    buckets: dict[str, list[str]] = {}
    for label in unittest_paths:
        normalized = label.replace(":", "/").replace("\\", "/")
        if normalized.startswith("//"):
            normalized = normalized[2:]
        if not normalized.startswith("src/"):
            print(f"ERROR: {label} not relative to mongo repo root")
            sys.exit(1)

        directory, filename = os.path.split(normalized)
        if filename.startswith("lib"):
            filename = filename[3:]
        # Keep only the part before the first dot (drops every extension).
        stem = filename.split(".", 1)[0]

        hash_input = os.path.join(directory, stem).replace("\\", "/")
        leading_digit = hashlib.sha256(hash_input.encode()).hexdigest()[0]
        buckets.setdefault(digit_to_group[leading_digit], []).append(label)

    return json.dumps(buckets, indent=4)
def find_multiple_groups(test, groups):
    """Return the names of all groups in *groups* whose members include *test*.

    *groups* maps a group name to a collection of tests; the result keeps the
    mapping's iteration order.
    """
    return [name for name, members in groups.items() if test in members]
def iter_clang_tidy_files(root: str | Path) -> list[Path]:
"""Return a list of repo-relative Paths to '.clang-tidy' files.
- Uses os.scandir for speed
- Does NOT follow symlinks
"""
root = Path(root).resolve()
results: list[Path] = []
stack = deque([root])
while stack:
current = stack.pop()
try:
with os.scandir(current) as it:
for entry in it:
name = entry.name
if entry.is_dir(follow_symlinks=False):
stack.append(Path(entry.path))
elif entry.is_file(follow_symlinks=False) and name == ".clang-tidy":
# repo-relative path
results.append(Path(entry.path).resolve().relative_to(root))
except PermissionError:
continue
return results
def validate_clang_tidy_configs(generate_report, fix):
    """Check that //:clang_tidy_config_files lists every '.clang-tidy' under src/mongo.

    The expected list is built from the '.clang-tidy' files found on disk and
    compared with the `srcs` that buildozer prints for the target. The
    comparison ignores ordering, because directory traversal order is
    arbitrary.

    Args:
        generate_report: when true, record a mismatch through the report helpers.
        fix: when true, rewrite the target's `srcs` with buildozer on a mismatch.

    Raises:
        Exception: if the current `srcs` cannot be parsed from buildozer's output.
    """
    buildozer = install_buildozer()
    mongo_dir = "src/mongo"
    config_target = "//:clang_tidy_config_files"

    p = subprocess.run(
        [buildozer, "print label srcs", config_target],
        capture_output=True,
        text=True,
    )

    # Expected output line shape: "//<label> [item item ...]".
    tidy_targets = None
    for line in p.stdout.splitlines():
        if line.startswith("//") and line.endswith("]"):
            tokens = line.split("[")
            tidy_targets = tokens[1][:-1].split(" ")
            break
    if tidy_targets is None:
        print(p.stderr)
        raise Exception(f"could not parse tidy config targets from '{p.stdout}'")
    if tidy_targets == [""]:
        # An empty "[]" splits into a single empty string.
        tidy_targets = []

    # os.scandir yields entries in arbitrary order, so sort both sides to make
    # the check (and any rewritten srcs list) independent of traversal order.
    all_targets = sorted(
        "//" + os.path.dirname(os.path.join(mongo_dir, tidy_file)) + ":clang_tidy_config"
        for tidy_file in iter_clang_tidy_files(mongo_dir)
    )
    if all_targets == sorted(tidy_targets):
        return

    msg = f"Incorrect clang tidy config targets: {all_targets} != {tidy_targets}"
    print(msg)
    if generate_report:
        report = make_report(config_target, msg, 1)
        try_combine_reports(report)
        put_report(report)

    if fix:
        subprocess.run([buildozer, f"set srcs {' '.join(all_targets)}", config_target])
def _timed_bazel_query(bazel_bin, description, expression, query_opts):
    """Run one `bazel query` and return its stdout lines.

    Writes "<description>... " before the query and the elapsed seconds after
    it. If bazel fails, prints its stdout/stderr and exits with its return code.
    """
    start = time.time()
    sys.stdout.write(f"{description}... ")
    sys.stdout.flush()
    try:
        query_proc = subprocess.run(
            [bazel_bin, "query", expression] + query_opts,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        print("BAZEL ERROR:")
        print(exc.stdout)
        print(exc.stderr)
        sys.exit(exc.returncode)
    sys.stdout.write("{:0.2f}s\n".format(time.time() - start))
    return query_proc.stdout.splitlines()


def validate_bazel_groups(generate_report, fix):
    """Check that every mongo_unittest target carries the group tag find_group assigns it.

    All targets tagged `mongo_unittest` are queried and bucketed with
    find_group. For each resulting group, the targets currently tagged
    `mongo_unittest_<group>_group` are queried and compared with the bucket.
    Three kinds of violation are printed and recorded: a group tag on a target
    that is not a mongo_unittest, a missing group tag, and a tag for the wrong
    group.

    Args:
        generate_report: when true, each violation is sent through the report helpers.
        fix: when true, buildozer is run to add/remove the offending tags.
    """
    buildozer = install_buildozer()
    bazel_bin = install_bazel(".")
    query_opts = [
        "--implicit_deps=False",
        "--tool_deps=False",
        "--include_aspects=False",
        "--bes_backend=",
        "--bes_results_url=",
    ]

    bazel_unittests = _timed_bazel_query(
        bazel_bin,
        "Query all unittest binaries",
        r'kind(extract_debug, attr(tags, "[\[ ]mongo_unittest[,\]]", //src/...))',
        query_opts,
    )
    # Set copy for O(1) membership tests inside the per-group loops.
    unittest_set = set(bazel_unittests)

    buildozer_update_cmds = []
    groups = json.loads(find_group(bazel_unittests))
    failures = []

    def _flag(test, message, buildozer_cmd):
        """Record and print one violation; queue its buildozer fix when fixing."""
        failures.append([test + " tag", message])
        print(message)
        if fix:
            buildozer_update_cmds.append([buildozer_cmd, test])

    for group in sorted(groups, key=lambda x: groups_sort_keys[x]):
        tag = f"mongo_unittest_{group}_group"
        group_tests = _timed_bazel_query(
            bazel_bin,
            f"Query all {tag} unittests",
            rf'kind(extract_debug, attr(tags, "[\[ ]{tag}[,\]]", //src/...))',
            query_opts,
        )

        expected_tests = groups[group]
        if expected_tests == group_tests:
            continue

        expected_set = set(expected_tests)
        tagged_set = set(group_tests)

        for test in group_tests:
            if test not in unittest_set:
                _flag(
                    test,
                    f"{test} not a 'mongo_unittest' but has '{tag}' tag.",
                    f"remove tags {tag}",
                )
        for test in expected_tests:
            if test not in tagged_set:
                _flag(test, f"{test} missing '{tag}'", f"add tags {tag}")
        for test in group_tests:
            if test not in expected_set:
                _flag(test, f"{test} is tagged in the wrong group.", f"remove tags {tag}")

    if fix:
        for cmd in buildozer_update_cmds:
            subprocess.run([buildozer] + cmd)

    if generate_report:
        for failure in failures:
            report = make_report(failure[0], failure[1], 1)
            try_combine_reports(report)
            put_report(report)
def validate_idl_naming(generate_report: bool, fix: bool) -> None:
    """Enforce the naming convention for idl_generator targets.

    Required shape:

        idl_generator(
            name = "<stem>_gen",
            src = "<stem>.idl" | ":gen_target",  # gen_target produces exactly one .idl
        )

    Runs a single `bazel query --output=xml` and parses the result in-process.
    A `src` that is a label of another rule is resolved through that rule's
    outputs as they appear in the same query result.

    Args:
        generate_report: when true, each violation is sent through the report helpers.
        fix: violations are never auto-fixed; when true and violations exist,
            the process exits with status 1.
    """
    import xml.etree.ElementTree as ET

    bazel_bin = install_bazel(".")
    qopts = [
        "--implicit_deps=False",
        "--tool_deps=False",
        "--include_aspects=False",
        "--bes_backend=",
        "--bes_results_url=",
    ]

    # One narrowed query: only rules created by the idl_generator macro.
    try:
        proc = subprocess.run(
            [
                bazel_bin,
                "query",
                "attr(generator_function, idl_generator, //src/...)",
                "--output=xml",
            ]
            + qopts,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as exc:
        print("BAZEL ERROR (narrowed xml):")
        print(exc.stdout)
        print(exc.stderr)
        sys.exit(exc.returncode)

    root = ET.fromstring(proc.stdout)
    failures: list[tuple[str, str]] = []

    def _val(rule, kind, attr):
        """Return the `value` of the <kind name=attr> child of *rule*, or None."""
        n = rule.find(f'./{kind}[@name="{attr}"]')
        return n.get("value") if n is not None else None

    # Prepass: map rule label -> outputs so src labels that generate an .idl
    # can be resolved.
    outputs_by_rule: dict[str, list[str]] = {}
    for r in root.findall(".//rule"):
        rname = r.get("name")
        if not rname:
            continue
        outputs_by_rule[rname] = [
            n.get("name") for n in r.findall("./rule-output") if n.get("name")
        ]

    def _generated_idl_basename(rlabel: str, src: str, producer: str) -> str | None:
        """Return the basename of the single .idl output of rule *producer*.

        Records a failure against *rlabel* and returns None when the rule does
        not have exactly one .idl output.
        """
        outs = outputs_by_rule.get(producer, [])
        idl_outs = [o for o in outs if o.endswith(".idl")]
        if len(idl_outs) != 1:
            failures.append(
                (rlabel, f"'src' '{src}' must produce exactly one .idl, got: {idl_outs or outs}")
            )
            return None
        # Output names are labels; keep the part after the package separator.
        return os.path.basename(idl_outs[0].rpartition(":")[2])

    for rule in root.findall(".//rule"):
        # Already narrowed by the query, but keep the cheap sentinel check.
        if _val(rule, "string", "generator_function") != "idl_generator":
            continue

        rlabel = rule.get("name") or ""
        if not (rlabel.startswith("//") and ":" in rlabel):
            failures.append((rlabel or "<unknown>", "Malformed idl_generator rule label"))
            continue
        pkg, name = rlabel[2:].split(":", 1)

        # Resolve src from the `src` label/string attribute, else from a
        # single-entry `srcs` list.
        src_val = _val(rule, "label", "src") or _val(rule, "string", "src")
        if not src_val:
            srcs_vals = []
            for lst in rule.findall('./list[@name="srcs"]'):
                srcs_vals += [n.get("value") for n in lst.findall("./label") if n.get("value")]
                srcs_vals += [n.get("value") for n in lst.findall("./string") if n.get("value")]
            if len(srcs_vals) != 1:
                failures.append(
                    (rlabel, f"'src'/'srcs' must have exactly one entry, got: {srcs_vals}")
                )
                continue
            src_val = srcs_vals[0]

        src = src_val.replace("\\", "/")
        src_base: str | None = None
        if src.startswith("//"):
            # Absolute label: //pkg:name
            spkg, sep, sname = src[2:].partition(":")
            if not sep:
                # Record a violation instead of crashing on an unpackable label.
                failures.append((rlabel, f"Malformed 'src' label '{src}'"))
                continue
            if spkg != pkg:
                failures.append((rlabel, f"'src' must be in same package '{pkg}', got '{src}'"))
            if sname.endswith(".idl"):
                src_base = os.path.basename(sname)
            else:
                src_base = _generated_idl_basename(rlabel, src, src)
                if src_base is None:
                    continue
        elif src.startswith(":"):
            # Package-relative label: :name
            sname = src[1:]
            if sname.endswith(".idl"):
                src_base = os.path.basename(sname)
            else:
                src_base = _generated_idl_basename(rlabel, src, f"//{pkg}:{sname}")
                if src_base is None:
                    continue
        else:
            # Plain file path.
            if src.startswith("../") or "/../" in src:
                failures.append((rlabel, f"'src' must be within package '{pkg}', got '{src}'"))
            src_base = os.path.basename(src)

        if not (src_base and src_base.endswith(".idl")):
            failures.append((rlabel, f"'src' must resolve to a .idl file, got: {src_base or src}"))
            continue

        if not name.endswith("_gen"):
            failures.append((rlabel, "Target name must end with '_gen'"))
        stem_from_name = name[:-4] if name.endswith("_gen") else name
        stem_from_src = src_base[:-4]  # strip ".idl"
        if stem_from_name != stem_from_src:
            failures.append(
                (
                    rlabel,
                    f"Stem mismatch: name '{name}' vs src '{src_base}'. "
                    f"Expected src basename '{stem_from_name}.idl'.",
                )
            )

    for lbl, msg in failures:
        print(f"IDL naming violation: {lbl}: {msg}")
        if generate_report:
            report = make_report(lbl, msg, 1)
            try_combine_reports(report)
            put_report(report)

    # NOTE(review): the run only fails under --fix; presumably because these
    # violations cannot be auto-fixed -- confirm this is intended.
    if fix and failures:
        sys.exit(1)
def validate_private_headers(generate_report: bool, fix: bool) -> None:
    """Lint (and optionally fix) header files listed in `srcs` of mongo cc targets.

    For each macro selector, runs
        buildozer print label srcs //src/mongo/...:%<macro>
    concurrently and inspects the printed block of every target.

    - A target is flagged when a header extension appears anywhere in its
      block, other than mentions accounted for by the PUBLIC_KEEP allow-list.
    - Only concrete items in the first [...] (top-level list) are auto-fixed:
      each header is added to `private_hdrs` and removed from `srcs`.
    - When headers are mentioned but none is concrete, they are assumed to
      come from select()/glob(); these are reported but never auto-fixed.

    Args:
        generate_report: when true, each violation is sent through the report helpers.
        fix: when true, apply the buildozer fixes for concrete headers.

    Exits with status 1 if there are violations and *fix* is false, or if any
    select()/glob() header was detected (regardless of *fix*).
    """
    import re
    from concurrent.futures import ThreadPoolExecutor
    from shlex import split as shlex_split

    # ---- Config ----
    HEADER_EXTS = (".h", ".hh", ".hpp", ".hxx")
    HEADER_RE = re.compile(r"\.(h|hh|hpp|hxx)\b")
    PUBLIC_KEEP = {
        "//src/mongo/platform:basic.h",
        "//src/mongo/platform:windows_basic.h",
    }
    SCOPE = "//src/mongo/..."  # limit to this subtree
    MACRO_SELECTORS = [
        "%mongo_cc_library",
        "%mongo_cc_binary",
        "%mongo_cc_unit_test",
        "%mongo_cc_benchmark",
        "%mongo_cc_integration_test",
        "%mongo_cc_fuzzer_test",
        "%mongo_cc_extension_shared_library",
    ]
    SKIP_SUFFIXES = ("_shared_archive", "_hdrs_wrap")
    SKIP_PKG_SUBSTR = "/third_party/"
    # If True, exit(1) whenever a header is found only via select()/glob()
    FAIL_ON_STRUCTURED = True

    buildozer = install_buildozer()

    def _run_print(selector: str) -> str:
        """Run one buildozer print invocation; return its stdout ('' on failure)."""
        try:
            return subprocess.run(
                [buildozer, "print label srcs", f"{SCOPE}:{selector}"],
                capture_output=True,
                text=True,
                check=True,
            ).stdout
        except subprocess.CalledProcessError as exc:
            # surface error and keep going (treated as empty output)
            print(f"BUILDOZER ERROR (print label srcs) for selector {selector}:", file=sys.stderr)
            print(exc.stdout, file=sys.stderr)
            print(exc.stderr, file=sys.stderr)
            return ""

    # 1) Run all macro prints concurrently. Executor.map yields results in
    #    submission order, so the report order is the same on every run.
    with ThreadPoolExecutor(max_workers=min(4, max(1, len(MACRO_SELECTORS)))) as ex:
        outputs = [stdout for stdout in ex.map(_run_print, MACRO_SELECTORS) if stdout]
    if not outputs:
        return
    combined = "\n".join(outputs)

    # 2) Parse into target blocks: a block starts at a line beginning with a
    #    //src/mongo/... label and runs until the next such line.
    target_line_re = re.compile(r"^//src/mongo/[^:\s\[]+:[^\s\[]+")
    blocks: list[tuple[str, list[str]]] = []
    cur_target: str | None = None
    cur_buf: list[str] = []
    for line in combined.splitlines():
        if target_line_re.match(line):
            if cur_target is not None:
                blocks.append((cur_target, cur_buf))
            cur_target = line.split()[0]
            cur_buf = [line]
        elif cur_target is not None:
            cur_buf.append(line)
    if cur_target is not None:
        blocks.append((cur_target, cur_buf))

    failures: list[tuple[str, str]] = []
    fixes: list[tuple[str, str]] = []  # (cmd, target)
    structured_fail_found = False  # to enforce FAIL_ON_STRUCTURED

    def pkg_of(label: str) -> str:
        """Return the package part of a '//pkg:name' label."""
        return label[2:].split(":", 1)[0]

    def normalize_token(pkg: str, tok: str) -> str | None:
        """Turn a srcs token into an absolute label, or None if it is not a plain item."""
        t = tok.strip().strip(",")
        if not t:
            return None
        if t.startswith(("select(", "glob(")):
            return None
        if t.startswith("//"):
            return t
        if t.startswith(":"):
            return f"//{pkg}:{t[1:]}"
        # bare filename/path -> pkg-local
        if not any(ch in t for ch in " []{}:\t\n"):
            return f"//{pkg}:{t}"
        return None

    for target, buf in blocks:
        if target.endswith(SKIP_SUFFIXES) or SKIP_PKG_SUBSTR in target:
            continue
        text = "\n".join(buf)

        # quick lint: any .h* anywhere?
        header_mentions = len(HEADER_RE.findall(text))
        if not header_mentions:
            continue

        # first [...] only (top-level list)
        m = re.search(r"\[(.*?)\]", text, flags=re.DOTALL)
        top_tokens: list[str] = []
        if m:
            inner = m.group(1).replace("\n", " ").strip()
            if inner:
                try:
                    top_tokens = shlex_split(inner)
                except ValueError:
                    # Unbalanced quotes: fall back to whitespace splitting.
                    top_tokens = inner.split()

        pkg = pkg_of(target)
        concrete_headers: list[str] = []
        public_kept = 0  # allow-listed headers seen in the top-level list
        for tok in top_tokens:
            norm = normalize_token(pkg, tok)
            if not norm:
                continue
            if norm in PUBLIC_KEEP:
                public_kept += 1
                continue
            if norm.split(":", 1)[1].endswith(HEADER_EXTS):
                concrete_headers.append(norm)

        # Header mentions not explained by the allow-list, with none concrete
        # in the first [...], are assumed to come from select()/glob().
        structured_has_hdr = not concrete_headers and header_mentions > public_kept
        if not concrete_headers and not structured_has_hdr:
            # Every header mention was an allow-listed public header.
            continue

        canon_target = target.replace("_with_debug", "")
        parts = []
        if concrete_headers:
            parts.append(f"concrete headers: {concrete_headers}")
        if structured_has_hdr:
            parts.append("headers via select()/glob() (not auto-fixed)")
            structured_fail_found = True
        msg = f"{canon_target} has headers in srcs: " + "; ".join(parts)
        print(msg)
        failures.append((canon_target, msg))

        if fix and concrete_headers:
            for h in concrete_headers:
                fixes.append((f"add private_hdrs {h}", canon_target))
                fixes.append((f"remove srcs {h}", canon_target))

    # 3) Apply fixes (dict.fromkeys dedupes while keeping first-seen order)
    if fix and fixes:
        for cmd, tgt in dict.fromkeys(fixes):
            subprocess.run([buildozer, cmd, tgt])

    # 4) CI reports
    if failures and generate_report:
        for tlabel, msg in failures:
            report = make_report(tlabel, msg, 1)
            try_combine_reports(report)
            put_report(report)

    # 5) Failing rules
    #    - Always fail if there is any violation and we are not fixing.
    #    - Also fail if non-concrete (structured) headers were seen anywhere.
    if (failures and not fix) or (structured_fail_found and FAIL_ON_STRUCTURED):
        sys.exit(1)
def main():
    """Parse --generate-report / --fix and run every validator in sequence."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--generate-report", default=False, action="store_true")
    cli.add_argument("--fix", default=False, action="store_true")
    opts = cli.parse_args()

    # Order matters only in that it matches the original call sequence.
    validators = (
        validate_clang_tidy_configs,
        validate_bazel_groups,
        validate_idl_naming,
        validate_private_headers,
    )
    for validate in validators:
        validate(opts.generate_report, opts.fix)


if __name__ == "__main__":
    main()