mongo/buildscripts/large_file_check.py
Nick Jefferies a4130ea9ab SERVER-119906: Enable UP006 ruff rule (#48284)
GitOrigin-RevId: 2069fa7bda111a89d4a9a43a56e71f06cc2e9a7c
2026-02-20 18:55:17 +00:00

153 lines
4.7 KiB
Python
Executable File

#!/usr/bin/env python3
"""Check files in git diff to ensure they are within a given size limit."""
import argparse
import fnmatch
import logging
import os
import pathlib
import sys
import textwrap
from typing import Any, Optional
import structlog
from git import Repo
mongo_dir = os.path.dirname(os.path.dirname(os.path.abspath(os.path.realpath(__file__))))
# Get relative imports to work when the package is not installed on the PYTHONPATH.
if __name__ == "__main__" and __package__ is None:
sys.path.append(mongo_dir)
from buildscripts.linter import git
from buildscripts.patch_builds.change_data import (
RevisionMap,
find_changed_files_in_repos,
generate_revision_map,
)
# Console renderer for structured logging
def renderer(_logger: logging.Logger, _name: str, eventdict: dict[Any, Any]) -> str:
    """
    Render a structlog event dict as a single console line.

    The output shape is chosen from the keys present, checked in this order:
    ``files``, ``repo``, ``file`` (optionally with ``bytes``), and finally the bare event.

    :param _logger: Logger handed in by structlog; unused.
    :param _name: Name of the log method that was called; unused.
    :param eventdict: Event dictionary; must contain ``event``.
    :return: The formatted log line.
    """
    if "files" in eventdict:
        template = "{event}: {files}"
    elif "repo" in eventdict:
        template = "{event}: {repo}"
    elif "file" in eventdict:
        # The byte count is only appended when the caller supplied one.
        template = "{event}: {file} {bytes} bytes" if "bytes" in eventdict else "{event}: {file}"
    else:
        template = "{event}"
    return template.format(**eventdict)
# Configure the logger so it doesn't spam output on huge diffs
structlog.configure(
    # Level filtering is listed ahead of the renderer, so it runs before any formatting.
    processors=[structlog.stdlib.filter_by_level, renderer],
    wrapper_class=structlog.stdlib.BoundLogger,
    logger_factory=structlog.stdlib.LoggerFactory(),
    cache_logger_on_first_use=True,
)

# Module-wide structured logger used by every function in this script.
LOGGER = structlog.get_logger(__name__)

# Environment variable whose value is supplied as the "mongo" revision to compare against.
MONGO_REVISION_ENV_VAR = "REVISION"
def _get_repos_and_revisions() -> tuple[list[Repo], RevisionMap]:
    """
    Build the repo list and the revision map used for diffing.

    The repository is opened at ``BUILD_WORKSPACE_DIRECTORY`` when that environment variable
    is set to a non-empty value, otherwise at ``git.get_base_dir()``.

    :return: A ``(repos, revision_map)`` pair.
    """
    workspace_dir = os.environ.get("BUILD_WORKSPACE_DIRECTORY")
    # `or` keeps the fallback lazy: the base dir is only looked up when no workspace is set.
    repos = [Repo(workspace_dir or git.get_base_dir())]
    requested_revisions = {"mongo": os.environ.get(MONGO_REVISION_ENV_VAR)}
    return repos, generate_revision_map(repos, requested_revisions)
def git_changed_files(excludes: list[pathlib.Path]) -> list[pathlib.Path]:
    """
    Return the changed files that exist on disk and match no exclude pattern.

    :param excludes: Paths or glob patterns; a changed file matching any of them is skipped.
    :return: List of changed files to check.
    """
    repos, revision_map = _get_repos_and_revisions()
    LOGGER.debug("revisions", revision=revision_map)

    def _should_check(candidate: pathlib.Path) -> bool:
        # Paths that no longer exist are dropped before any pattern matching is attempted.
        return candidate.exists() and not any(
            fnmatch.fnmatch(candidate, pattern) for pattern in excludes
        )

    # Convert every reported path first, then filter, so the changed-file lookup is fully
    # consumed before the filesystem is touched.
    candidates = [pathlib.Path(raw) for raw in find_changed_files_in_repos(repos, revision_map)]
    files = [candidate for candidate in candidates if _should_check(candidate)]

    LOGGER.debug("Found files to check", files=[str(path) for path in files])
    return files
def diff_file_sizes(size_limit: int, excludes: Optional[list[str]] = None) -> list[pathlib.Path]:
    """
    Find changed files whose on-disk size exceeds ``size_limit``.

    :param size_limit: Maximum allowed size in bytes; a file of exactly this size passes.
    :param excludes: Patterns forwarded to ``git_changed_files``; ``None`` means no excludes.
    :return: The files larger than ``size_limit``, in the order they were checked.
    """
    oversized: list[pathlib.Path] = []
    for candidate in git_changed_files([] if excludes is None else excludes):
        display_name = str(candidate)
        LOGGER.debug("Checking file size", file=display_name)
        num_bytes = candidate.stat().st_size
        if num_bytes <= size_limit:
            continue
        LOGGER.error("File too large", file=display_name, bytes=num_bytes)
        oversized.append(candidate)
    return oversized
def main(*args: str) -> int:
    """
    Execute Main entry point.

    :param args: Full argument vector, program name first (as in ``sys.argv``); the first
        element is skipped before parsing.
    :return: 0 when every checked file is within the size limit, 1 otherwise.
    :raises SystemExit: Raised by argparse for ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(
        description="Git commit large file checker.",
        epilog=textwrap.dedent("""\
            NOTE: The --exclude argument is an exact match but can accept glob patterns. If * is used,
            it matches *all* characters, including path separators.
            """),
    )
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
    parser.add_argument(
        "--exclude",
        help="Paths to exclude from check",
        nargs="+",
        type=pathlib.Path,
        required=False,
    )
    parser.add_argument("--size-mb", help="File size limit (MiB)", type=int, default=10)
    parsed_args = parser.parse_args(args[1:])

    # Setting the stdlib level is all that is needed: the structlog processor chain configured
    # at module import already includes filter_by_level. Invoking that processor by hand here
    # would configure nothing, and would raise DropEvent if logging had been set up earlier
    # at a stricter level.
    logging.basicConfig(level=logging.DEBUG if parsed_args.verbose else logging.INFO)

    # --size-mb is expressed in MiB; the size comparison is done in bytes.
    large_files = diff_file_sizes(parsed_args.size_mb * 1024 * 1024, parsed_args.exclude)
    if not large_files:
        LOGGER.info("All files passed size check")
        return 0

    LOGGER.error("Some files failed size check", files=list(map(str, large_files)))
    return 1
if __name__ == "__main__":
    # Pass the whole argv through; main() skips the program name itself.
    exit_code = main(*sys.argv)
    sys.exit(exit_code)