mongo/buildscripts/resmokelib/bazel_suite_parser.py

"""Parser for BUILD.bazel files to extract resmoke_suite_test configuration.

This module parses BUILD.bazel files without invoking bazel, supporting a simplified
subset of Bazel syntax:
- Simple lists of targets (no select() expressions)
- Direct file targets (e.g., "//jstests/foo:bar.js")
- all_javascript_files targets (globs *.js in directory)
- all_subpackage_javascript_files targets (recursively includes all JS from subpackages)
"""

import functools
import os
import re


class BazelParseError(Exception):
    """Error signalling that a Bazel label or BUILD.bazel file could not be parsed.

    Raised by the helpers in this module for malformed labels, missing BUILD.bazel
    files, missing targets or identifiers, and unsupported target types.
    """


@functools.cache
def parse_resmoke_suite_test(target_label: str) -> dict[str, list[str]]:
    """Parse a resmoke_suite_test target from BUILD.bazel.

    The BUILD.bazel file is located from the label's package path (relative to the
    current working directory) and read as plain text; bazel is never invoked. The rule
    is located with a regex that expects ``name`` to be the first argument of
    ``resmoke_suite_test(``. Results are memoized per ``target_label`` by
    ``functools.cache``, so repeated calls return the same dictionary object.

    Args:
        target_label: Bazel target label like "//buildscripts/resmokeconfig:core"
    Returns:
        Dictionary with extracted attributes:
            - srcs: List of test file labels
            - exclude_files: List of test file labels to exclude
            - exclude_with_any_tags: List of tag strings
            - include_with_any_tags: List of tag strings
            - group_size: Integer or None for number of tests per group (for test_kind: parallel_fsm_workload_test)
            - group_count_multiplier: String for group count multiplier (for test_kind: parallel_fsm_workload_test),
              empty string when absent
        NOTE: the return annotation only describes the list-valued entries; group_size
        and group_count_multiplier are scalars.
    Raises:
        BazelParseError: If the label is malformed, the BUILD.bazel file is not found,
            the target is not found, or the rule's parentheses never balance
    """
    package, target_name = _parse_label(target_label)
    build_file = os.path.join(package, "BUILD.bazel")

    if not os.path.exists(build_file):
        raise BazelParseError(
            f"BUILD.bazel file not found at '{build_file}' for target '{target_label}'"
        )
    with open(build_file, "r") as f:
        content = f.read()

    # Parse load statements to build identifier -> .bzl file mapping
    identifier_to_bzl_file = _parse_load_statements(content, package)

    # Find the resmoke_suite_test block
    # Pattern matches: resmoke_suite_test(name = "target_name", ...)
    pattern = r'resmoke_suite_test\s*\(\s*name\s*=\s*["\']' + re.escape(target_name) + r'["\']'
    match = re.search(pattern, content)
    if not match:
        raise BazelParseError(
            f"Target '{target_name}' not found in '{build_file}'. "
            f'Expected a resmoke_suite_test rule with name = "{target_name}"'
        )

    # Extract the rule block by finding the parenthesis that closes the rule's opening one.
    # String literals and comments are matched by the tokenizer only so that they are
    # consumed as a unit: a "(" or ")" inside them (e.g. "# 1) flaky on windows") must
    # not affect the nesting depth, otherwise the block would be cut short or overrun.
    rule_start = match.start()
    paren_start = content.index("(", rule_start)
    token_pattern = re.compile(r'"[^"\n]*"|\'[^\'\n]*\'|#[^\n]*|[()]')
    depth = 0
    rule_end = None
    for token in token_pattern.finditer(content, paren_start):
        text = token.group()
        if text == "(":
            depth += 1
        elif text == ")":
            depth -= 1
            if depth == 0:
                rule_end = token.end()
                break

    # The scan ran off the end of the file without closing the rule.
    if rule_end is None:
        raise BazelParseError(
            f"Unbalanced parentheses in resmoke_suite_test definition for '{target_label}'"
        )
    rule_block = content[rule_start:rule_end]

    return {
        "srcs": _extract_attribute(rule_block, "srcs", identifier_to_bzl_file, build_file),
        "exclude_files": _extract_attribute(
            rule_block, "exclude_files", identifier_to_bzl_file, build_file
        ),
        "exclude_with_any_tags": _extract_attribute(
            rule_block, "exclude_with_any_tags", identifier_to_bzl_file, build_file
        ),
        "include_with_any_tags": _extract_attribute(
            rule_block, "include_with_any_tags", identifier_to_bzl_file, build_file
        ),
        "group_size": _extract_int_attribute(rule_block, "group_size"),
        "group_count_multiplier": _extract_scalar_attribute(rule_block, "group_count_multiplier"),
    }


def _parse_label(target_label: str) -> tuple[str, str]:
    """Split a "//package/path:target_name" label into package path and target name.

    Only the first ":" after the leading "//" acts as the separator; any further ":"
    characters stay in the target name.

    Args:
        target_label: A Bazel target label like "//package/path:target_name"
    Returns:
        Tuple of (package_path, target_name)
    Raises:
        BazelParseError: If the label does not start with "//" or has no ":" separator
    """
    if target_label[:2] != "//":
        raise BazelParseError(
            f"Unsupported Bazel target label '{target_label}': must start with '//'"
        )

    # Drop the leading "//" and cut at the first ":"; an empty separator means no ":".
    package, separator, target_name = target_label[2:].partition(":")
    if not separator:
        raise BazelParseError(
            f"Unsupported Bazel target label '{target_label}': must contain ':' separator"
        )

    return package, target_name


def _parse_load_statements(content: str, package: str) -> dict[str, str]:
    """Map each symbol named in a load() statement to the .bzl file it is loaded from.

    Load statements may be written on one line or spread over several lines:

        load("//path/to:file.bzl", "identifier1", "identifier2")
        load(
            "//path/to:file.bzl",
            "identifier1",
        )

    Every quoted string after the .bzl label is recorded as an identifier. For an
    aliased load (alias = "symbol") this means the quoted symbol name is recorded, not
    the alias.

    Args:
        content: The BUILD.bazel file content
        package: The package path of the BUILD.bazel file, used to resolve labels that
            do not start with "//"

    Returns:
        Dictionary mapping identifier names to .bzl file paths. Paths are derived from
        the label (joined onto ``package`` for relative labels) and use the platform's
        path separator; they are not made absolute.
    """
    identifier_to_bzl_file = {}

    # Match over the whole content rather than line by line so that a load() whose
    # arguments are wrapped onto following lines is still recognised. The statement must
    # start its line (optionally indented), which keeps commented-out loads ignored.
    load_pattern = re.compile(
        r'^[ \t]*load\s*\(\s*["\']([^"\']+)["\'](.*?)\)',
        re.MULTILINE | re.DOTALL,
    )
    identifier_pattern = re.compile(r'["\']([^"\']+)["\']')

    for match in load_pattern.finditer(content):
        bzl_label = match.group(1)
        # Drop trailing comments so quoted text inside them is not taken as a symbol.
        identifiers_str = re.sub(r"#[^\n]*", "", match.group(2))

        # Convert the .bzl label to a file path
        # Example: "//jstests/suites:selectors.bzl"
        #       -> "jstests/suites/selectors.bzl"
        if bzl_label.startswith("//"):
            bzl_path = bzl_label[2:].replace(":", "/")
        else:
            # Relative label (e.g. ":file.bzl") - resolve relative to current package
            bzl_path = os.path.join(package, bzl_label.replace(":", ""))
        # Re-join on the platform separator.
        bzl_path = os.path.join(*bzl_path.split("/"))

        # Map each identifier to the .bzl file
        for identifier in identifier_pattern.findall(identifiers_str):
            identifier_to_bzl_file[identifier] = bzl_path

    return identifier_to_bzl_file


def _extract_attribute(
    block: str,
    attribute_name: str,
    identifier_to_bzl_file: dict[str, str] = None,
    build_file: str = None,
) -> list[str]:
    """Extract an attribute from a BUILD.bazel rule block.

    Supports simple lists and list concatenation with identifiers:
    - Simple list: srcs = ["file1.js", "file2.js"]
    - List concatenation: srcs = ["file1.js"] + some_identifier

    Args:
        block: The text content of a BUILD.bazel rule block
        attribute_name: The name of the attribute to extract (e.g., "srcs")
        identifier_to_bzl_file: Dictionary mapping identifier names to .bzl file paths
        build_file: Path to the BUILD.bazel file for resolving local identifiers
    Returns:
        List of string values from the attribute. Returns empty list if attribute not found.
    """
    if identifier_to_bzl_file is None:
        identifier_to_bzl_file = {}

    # Pattern to match: attribute_name = <expression>
    # Captures everything until comma + newline + next attribute/paren, or end of string
    pattern = rf"{attribute_name}\s*=\s*(.+?)(?=,\s*\n\s*(?:\w+\s*=|\))|\Z)"
    match = re.search(pattern, block, re.DOTALL | re.MULTILINE)
    if not match:
        return []

    expression = match.group(1).strip()

    # Split expression by '+' operator to handle concatenation
    items = []
    parts = re.split(r"\+", expression)

    for part in parts:
        part = part.strip()

        # Check if this part is a list literal
        if part.startswith("[") and part.endswith("]"):
            # Extract the content between brackets
            list_content = part[1:-1]

            # Extract quoted strings, handling both single and double quotes
            for line in list_content.split("\n"):
                # Remove inline comments
                line = re.sub(r"#.*$", "", line)

                # Find all quoted strings in the line
                string_pattern = r'["\']([^"\']+)["\']'
                items.extend(re.findall(string_pattern, line))

        # Check if this part is an identifier (not a list literal)
        elif re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", part):
            # Resolve the identifier to a list of labels
            resolved_items = _resolve_identifier_to_labels(part, identifier_to_bzl_file, build_file)
            items.extend(resolved_items)

    return items


def _resolve_identifier_to_labels(
    identifier: str, identifier_to_bzl_file: dict[str, str], build_file: str = None
) -> list[str]:
    """Convert a Bazel identifier to a list of labels.

    This function resolves identifiers used in list concatenation expressions.
    For example, in: srcs = ["file.js"] + sharding_jscore_passthrough_srcs
    The identifier 'sharding_jscore_passthrough_srcs' would be resolved to its
    corresponding list of labels by reading its definition from the .bzl file.

    If the identifier is not found in load statements, this function will attempt
    to find it defined in the BUILD.bazel file itself.

    Args:
        identifier: The identifier name to resolve (e.g., "sharding_jscore_passthrough_srcs")
        identifier_to_bzl_file: Dictionary mapping identifier names to .bzl file paths
        build_file: Path to the BUILD.bazel file for resolving local identifiers

    Returns:
        List of resolved label strings
    """
    identifier_pattern = rf"^{re.escape(identifier)}\s*=\s*\[(.+?)\]"

    if identifier in identifier_to_bzl_file:
        bzl_file_path = identifier_to_bzl_file[identifier]
        with open(bzl_file_path, "r") as f:
            bzl_content = f.read()

        # Find the identifier definition in the .bzl file
        match = re.search(identifier_pattern, bzl_content, re.MULTILINE | re.DOTALL)
        if not match:
            raise BazelParseError(
                f"Could not find definition of identifier '{identifier}' in '{bzl_file_path}'"
            )
    else:
        # Try to find the identifier in the BUILD.bazel file itself
        with open(build_file, "r") as f:
            build_content = f.read()

        # Look for identifier definition in BUILD.bazel
        match = re.search(identifier_pattern, build_content, re.MULTILINE | re.DOTALL)
        if not match:
            raise BazelParseError(
                f"Identifier '{identifier}' referenced but not found in load statements "
                f"or in BUILD.bazel file."
            )

    # Extract all quoted strings from the list
    items = []
    for line in match.group(1).split("\n"):
        # Remove inline comments
        line = re.sub(r"#.*$", "", line)

        # Find all quoted strings in the line
        string_pattern = r'["\']([^"\']+)["\']'
        items.extend(re.findall(string_pattern, line))

    return items


def _extract_int_attribute(block: str, attribute_name: str) -> int | None:
    """Extract an integer attribute from a BUILD.bazel rule block.
    Args:
        block: The text content of a BUILD.bazel rule block
        attribute_name: The name of the attribute to extract (e.g., "group_size")
    Returns:
        Integer value of the attribute. Returns None if attribute not found.
    """
    # Pattern to match: attribute_name = <integer>
    pattern = rf"{attribute_name}\s*=\s*(\d+)"
    match = re.search(pattern, block)
    if not match:
        return None
    return int(match.group(1))


def _extract_scalar_attribute(block: str, attribute_name: str) -> str:
    """Extract a quoted string attribute from a BUILD.bazel rule block.

    The attribute name is matched as a whole word, so a longer attribute that merely
    ends with ``attribute_name`` is skipped. Only non-empty quoted values are matched.

    Args:
        block: The text content of a BUILD.bazel rule block
        attribute_name: The name of the attribute to extract (e.g., "group_count_multiplier")
    Returns:
        String value of the first matching attribute. Returns empty string if attribute
        not found.
    """
    # Pattern to match: attribute_name = "<value>", with the name escaped so it is
    # treated literally and guarded so it cannot match inside a longer identifier.
    quoted_pattern = rf'(?<!\w){re.escape(attribute_name)}\s*=\s*["\']([^"\']+)["\']'
    match = re.search(quoted_pattern, block)
    if match:
        return match.group(1)

    return ""


def resolve_target_to_files(target_label: str) -> str:
    """Translate a Bazel target label into a file path or glob pattern.

    Recognised targets:
    - A name ending in ".js" is a direct file: "//jstests/foo:bar.js" gives "jstests/foo/bar.js"
    - "all_javascript_files" gives the glob "<package>/*.js"
    - "all_subpackage_javascript_files" gives the recursive glob "<package>/**/*.js"

    Args:
        target_label: Bazel target label to resolve
    Returns:
        File path or glob pattern, built by joining the package path and the file
        name or glob
    Raises:
        BazelParseError: If the label is malformed or the target type is unsupported
    """
    package, target_name = _parse_label(target_label)

    # Aggregate targets and the glob each one stands for inside its package.
    glob_by_target = {
        "all_javascript_files": "*.js",
        "all_subpackage_javascript_files": "**/*.js",
    }

    if target_name.endswith(".js"):
        # Direct file reference: the target name is the file name itself.
        relative = target_name
    elif target_name in glob_by_target:
        relative = glob_by_target[target_name]
    else:
        raise BazelParseError(
            f"Unsupported target type '{target_label}'. "
            f"Supported types: direct .js files, all_javascript_files, all_subpackage_javascript_files"
        )

    return os.path.join(package, relative)