mongo/bazel/repository_rules/bolt_data.bzl

load("//bazel/repository_rules:pgo_data.bzl", "get_all_files")
load("//bazel/repository_rules:profiling_data.bzl", "DEFAULT_BOLT_DATA_CHECKSUM", "DEFAULT_BOLT_DATA_URL")

# This is used so we can tell when the build created new bolt files vs. using ones from stored url
CREATED_FILEGROUP = """
filegroup(
    name = "created_bolt_fdata",
    srcs = glob(["**/*.fdata"]),
)
"""

EMPTY_CREATED_FILEGROUP = """
filegroup(
    name = "created_bolt_fdata",
    srcs = [],
    target_compatible_with = ["@platforms//:incompatible"],
)
"""

def _setup_bolt_data(repository_ctx):
    bolt_fdata_filename = "bolt.fdata"

    # This potentially contains multiple urls, separated by a | eg. url1|url2|url3
    bolt_profile_urls_env = repository_ctx.os.environ.get("bolt_profile_url", None)

    # This is the binary the bolt data came from
    bolt_binary_env = repository_ctx.os.environ.get("bolt_binary_url", None)

    # Incase you want to bolt a binary instead of the main binary mongod
    bolt_binary_name = repository_ctx.os.environ.get("bolt_binary_name", None)

    created_files = EMPTY_CREATED_FILEGROUP

    # Perf2bolt will use the path to call the perf tool
    path_env = repository_ctx.os.environ.get("PATH", None)
    perf_path_env = str(repository_ctx.path(repository_ctx.attr._perf_binary).dirname) + ":" + path_env

    if bolt_binary_name == None:
        bolt_binary_name = "mongod"

    # We should be using the default bolt data because we are not being passed bolt data
    if bolt_profile_urls_env == None and bolt_binary_env == None:
        repository_ctx.download(DEFAULT_BOLT_DATA_URL, bolt_fdata_filename, sha256 = DEFAULT_BOLT_DATA_CHECKSUM)

        # This is mainly used for the bolt training pipeline
    else:
        # 2 main scenarios
        # 1. They are passing us a single bolt .fdata file, just download the file
        # 2. They are passing us one or more unprocessed .tgz files with .data files inside
        # the .data files need to be turned into fdata files using perf2bolt, which then can be merged
        # into a single fdata file using merge-fdata

        bolt_urls = bolt_profile_urls_env.split("|")
        created_files = CREATED_FILEGROUP

        # They are passing us a single bolt fdata file, just download the file
        if len(bolt_urls) == 1 and bolt_urls[0].endswith(".fdata"):
            print("Downloading single fdata file for bolt: " + bolt_urls[0])
            repository_ctx.download(bolt_profile_urls_env, bolt_fdata_filename)
        else:
            url_num = 0
            print("Downloading multiple bolt files.")
            need_perf2bolt = False
            for url in bolt_urls:
                if url.endswith(".fdata"):
                    print("Downloading fdata: " + url)
                    repository_ctx.download(url, str(url_num) + "/bolt.fdata")
                else:
                    print("Downloading and extracting: " + url)
                    repository_ctx.download_and_extract(url, str(url_num))
                    need_perf2bolt = True
                url_num += 1

            files = get_all_files(repository_ctx.path("."), 20)
            data_files = [file for file in files if file.basename.endswith(".data")]
            fdata_files = [file for file in files if file.basename.endswith(".fdata")]

            # This is scenario 2, we need to turn these data files into fdata files using perf2bolt
            if need_perf2bolt and len(data_files) > 0:
                # Download the mongod binary for perf2bolt
                print("Download the mongo binaries from: " + bolt_binary_env)
                repository_ctx.download_and_extract(bolt_binary_env, "binaries")

                files = get_all_files(repository_ctx.path("."), 20)
                binary = [file for file in files if file.basename.endswith(bolt_binary_name)][0]

                processed_fdata_files = 0
                print("Found data files, turning them into fdata files with perf2bolt.")
                for file in data_files:
                    fdata_file_name = "bolt" + str(processed_fdata_files) + ".fdata"
                    arguments = [repository_ctx.attr._perf2bolt_binary, "-nl", "-p", file, "-o", fdata_file_name, binary]

                    result = repository_ctx.execute(arguments, environment = {"PATH": perf_path_env})
                    print(result.stdout)
                    if result.return_code != 0:
                        print(result.stderr)
                        fail("Failed to run perf2bolt.")
                    processed_fdata_files += 1
                    fdata_files.append(fdata_file_name)

            # If we have multiple fdata files we need to merge them together using merge-fdata
            if len(fdata_files) > 1:
                print("Merging fdata files with merge-fdata.")
                arguments = [repository_ctx.attr._merge_fdata_binary, "-o", bolt_fdata_filename] + fdata_files
                result = repository_ctx.execute(arguments)
                print(result.stdout)
                if result.return_code != 0:
                    print(result.stderr)
                    fail("Failed to run merge-fdata.")

                # clean up the pre-merged fdata files
                for file in fdata_files:
                    repository_ctx.delete(file)

    repository_ctx.file(
        "BUILD.bazel",
        """
package(default_visibility = ["//visibility:public"])

filegroup(
    name = "bolt_fdata",
    srcs = glob(["**/*.fdata"]),
)
""" + created_files,
    )

setup_bolt_data = repository_rule(
    implementation = _setup_bolt_data,
    environ = ["bolt_profile_url", "bolt_binary_url", "bolt_binary_name", "PATH"],
    attrs = {
        # There is a bug where the repo rule does not properly evaluate these labels so we have to list the full path to the binaries
        "_merge_fdata_binary": attr.label(allow_single_file = True, default = "@bolt_binaries//:bolt/bin/merge-fdata", executable = True, cfg = "host"),
        "_perf2bolt_binary": attr.label(allow_single_file = True, default = "@bolt_binaries//:bolt/bin/perf2bolt", executable = True, cfg = "host"),
        "_perf_binary": attr.label(allow_single_file = True, default = "@bolt_binaries//:bolt/bin/perf", executable = True, cfg = "host"),
    },
)