mongo/buildscripts/sbom/sbom_utils.py
mongo-pr-bot[bot] 650d9f458c SERVER-111072 Auto-generated SBOM files [master] (#54314)
Co-authored-by: mongo-pr-bot[bot] <230616009+mongo-pr-bot[bot]@users.noreply.github.com>
Co-authored-by: Jason Hills <jason.hills@mongodb.com>
GitOrigin-RevId: b8ba75da724800391249ed4266928c96bd537875
2026-05-22 18:11:12 +00:00

374 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Utility functions for processing CycloneDX SBOMs
"""
import json
import logging
import os
import re
import urllib.parse
# Module-level logger used by every helper in this file. It is looked up by the fixed
# name "generate_sbom" rather than by this module's __name__
# (presumably so records share the logger of the generating script — confirm against caller).
logger = logging.getLogger("generate_sbom")
# NOTSET means this logger imposes no threshold of its own; the effective level is
# taken from its ancestor loggers.
logger.setLevel(logging.NOTSET)
# ################ PURL Validation ################
# Shared tail appended to every per-type pattern below: the optional version,
# qualifiers and subpath (in that order), followed by the end-of-string anchor.
REGEX_STR_PURL_OPTIONAL = (
    # Optional Version (any chars except ? @ #)
    r"(?:@[^?@#]*)?"
    # Optional Qualifiers (any chars except @ #)
    r"(?:\?[^@#]*)?"
    # Optional Subpath (any chars)
    r"(?:#.*)?$"
)

# Compiled validators keyed by PURL type. Each pattern is anchored at both ends
# ("^" here, "$" via REGEX_STR_PURL_OPTIONAL), so `.match()` validates the whole string.
REGEX_PURL = {
    # deb PURL. https://github.com/package-url/purl-spec/blob/main/types-doc/deb-definition.md
    "deb": re.compile(
        r"^pkg:deb/"  # Scheme and type
        # Namespace (distribution): exactly one of the accepted vendors. No repetition
        # quantifier, otherwise "debianubuntu" or "debiandebian" would be accepted.
        r"(debian|ubuntu)"
        r"/"
        r"[a-z0-9._-]+" + REGEX_STR_PURL_OPTIONAL  # Name
    ),
    # Generic PURL. https://github.com/package-url/purl-spec/blob/main/types-doc/generic-definition.md
    "generic": re.compile(
        r"^pkg:generic/"  # Scheme and type
        r"([a-zA-Z0-9._-]+/)?"  # Optional namespace segment
        r"[a-zA-Z0-9._-]+" + REGEX_STR_PURL_OPTIONAL  # Name (required)
    ),
    # GitHub PURL. https://github.com/package-url/purl-spec/blob/main/types-doc/github-definition.md
    "github": re.compile(
        r"^pkg:github/"  # Scheme and type
        # Namespace (organization/user), letters must be lowercase
        r"[a-z0-9-]+"
        r"/"
        r"[a-z0-9._-]+" + REGEX_STR_PURL_OPTIONAL  # Name (repository)
    ),
    # PyPI PURL. https://github.com/package-url/purl-spec/blob/main/types-doc/pypi-definition.md
    "pypi": re.compile(
        r"^pkg:pypi/"  # Scheme and type
        r"[a-z0-9_-]+"  # Name, letters must be lowercase, dashes, underscore
        + REGEX_STR_PURL_OPTIONAL
    ),
    # Cargo PURL. https://github.com/package-url/purl-spec/blob/main/types-doc/cargo-definition.md
    "cargo": re.compile(
        r"^pkg:cargo/"  # Scheme and type
        r"[a-zA-Z0-9_-]+" + REGEX_STR_PURL_OPTIONAL  # Name (no namespace)
    ),
    # Maven PURL. https://github.com/package-url/purl-spec/blob/main/types-doc/maven-definition.md
    "maven": re.compile(
        r"^pkg:maven/"  # Scheme and type
        r"[a-zA-Z0-9._-]+"  # Namespace (group ID, required)
        r"/"
        r"[a-zA-Z0-9._-]+" + REGEX_STR_PURL_OPTIONAL  # Name (artifact ID)
    ),
}
# Metadata SBOM requirements
# Field names that check_metadata_sbom() looks for on every component; it logs one
# warning per field that is absent.
METADATA_FIELDS_REQUIRED = [
    "type",
    "bom-ref",
    "group",
    "name",
    "version",
    "description",
    "licenses",
    "copyright",
    "externalReferences",
    "scope",
]
# Groups of alternative field names: check_metadata_sbom() logs a warning when a
# component contains none of the names in a group.
METADATA_FIELDS_ONE_OF = [
    ["author", "supplier"],
    ["purl", "cpe"],
]
def add_component_property(component: dict, name: str, value: str) -> None:
    """Append a ``{"name": ..., "value": ...}`` entry to the component's 'properties' list.

    The 'properties' list is created on the component if it does not exist yet;
    an existing list is extended in place.
    """
    entry = {"name": name, "value": value}
    component.setdefault("properties", []).append(entry)
def check_metadata_sbom(meta_bom: dict) -> None:
    """Log a warning for every expected field missing from a metadata SBOM component.

    Each entry of ``meta_bom["components"]`` is checked against
    METADATA_FIELDS_REQUIRED (every name must be present) and against
    METADATA_FIELDS_ONE_OF (at least one name per group must be present).
    Nothing is modified and nothing is raised for a missing field.
    """
    for entry in meta_bom["components"]:
        # Identify the component in log output by bom-ref, falling back to its name.
        label = entry.get("bom-ref") or entry.get("name")

        absent = [required for required in METADATA_FIELDS_REQUIRED if required not in entry]
        for required in absent:
            logger.warning(
                "METADATA: %s is missing required field '%s'.",
                label,
                required,
            )

        for alternatives in METADATA_FIELDS_ONE_OF:
            if any(option in entry for option in alternatives):
                continue
            logger.warning(
                "METADATA: %s is missing one of fields '%s'.",
                label,
                alternatives,
            )
def check_components_and_dependencies(sbom: dict, label: str = "") -> None:
    """Warn when component bom-refs and dependency refs do not line up one-to-one.

    The set of ``.components[].bom-ref`` values (plus the bom-ref of
    ``.metadata.component`` when it has one) is compared with the set of
    ``.dependencies[].ref`` values. A warning is logged for each direction of
    mismatch; ``label``, when given, is prepended to the message. The SBOM is
    not modified.
    """
    log_prefix = "{}: ".format(label) if label else ""

    known_refs = set()
    for comp in sbom.get("components", []):
        known_refs.add(comp["bom-ref"])
    root_ref = sbom.get("metadata", {}).get("component", {}).get("bom-ref")
    if root_ref:
        known_refs.add(root_ref)

    declared_refs = set()
    for dep in sbom.get("dependencies", []):
        declared_refs.add(dep["ref"])

    without_dependency = known_refs.difference(declared_refs)
    without_component = declared_refs.difference(known_refs)

    if without_dependency:
        logger.warning(
            "%sCOMPONENTS/DEPENDENCIES MISMATCH: components with no matching dependency ref: %s",
            log_prefix,
            sorted(without_dependency),
        )
    if without_component:
        logger.warning(
            "%sCOMPONENTS/DEPENDENCIES MISMATCH: dependency refs with no matching component: %s",
            log_prefix,
            sorted(without_component),
        )
def reconcile_dependency_refs(sbom: dict) -> None:
    """Make ``.dependencies[]`` agree with the set of known component refs, in place.

    Known refs are every ``.components[].bom-ref``, the bom-ref of
    ``.metadata.component`` and the bom-refs of its nested ``components``.
    For each known ref without a dependency entry, a stub
    ``{"ref": ref, "dependsOn": []}`` is appended. Dependency entries whose ref
    is not known are dropped, with a warning listing them.
    """
    known_refs = {comp["bom-ref"] for comp in sbom.get("components", [])}

    root = sbom.get("metadata", {}).get("component", {})
    root_ref = root.get("bom-ref")
    if root_ref:
        known_refs.add(root_ref)
    for nested in root.get("components", []):
        nested_ref = nested.get("bom-ref")
        if nested_ref:
            known_refs.add(nested_ref)

    declared_refs = {dep["ref"] for dep in sbom.get("dependencies", [])}

    # Sorted so stubs are appended in a deterministic order.
    for ref in sorted(known_refs - declared_refs):
        sbom.setdefault("dependencies", []).append({"ref": ref, "dependsOn": []})
        logger.debug("reconcile_dependency_refs: added missing dependency ref '%s'", ref)

    orphaned = declared_refs - known_refs
    if not orphaned:
        return
    logger.warning(
        "COMPONENTS/DEPENDENCIES MISMATCH: removing orphaned dependency refs with no matching component: %s",
        sorted(orphaned),
    )
    sbom["dependencies"] = [dep for dep in sbom["dependencies"] if dep["ref"] not in orphaned]
def _is_internal_component(component: dict) -> bool:
    """Return True when a component is marked internal by evidence location or by property."""
    occurrences = component.get("evidence", {}).get("occurrences", [])
    if any(
        occurrence.get("location", "").startswith("src/third_party/private")
        for occurrence in occurrences
    ):
        return True
    return any(
        prop.get("name", "") in ("internal:as-is_component", "internal:private")
        and prop.get("value") == "true"
        for prop in component.get("properties", [])
    )


def convert_sbom_to_public(sbom_dict: dict):
    """Remove internal-only components, dependency refs and properties from an SBOM, in place.

    A component is treated as internal when any of its evidence occurrences is
    located under ``src/third_party/private`` or when it carries an
    ``internal:as-is_component`` / ``internal:private`` property whose value is
    the string ``"true"``.

    Steps, each reported through an info log line:
      1. drop internal components and every dependency entry / dependsOn item
         that refers to them;
      2. drop dependency entries and dependsOn items whose ref is neither the
         metadata component nor a remaining component;
      3. drop every property whose name starts with ``internal:`` from the
         remaining components.

    Raises:
        KeyError: if ``components``, ``dependencies``, ``metadata.component.bom-ref``,
            a component's ``bom-ref``, or a dependency's ``ref`` / ``dependsOn``
            is missing.
    """
    original_components_len = len(sbom_dict["components"])

    # Identify internal components. A set keeps the membership tests below O(1)
    # instead of rescanning a list for every component and dependency.
    internal_refs = {
        c["bom-ref"] for c in sbom_dict["components"] if _is_internal_component(c)
    }

    # Remove internal components and any dependencies on them from the SBOM
    sbom_dict["components"] = [
        c for c in sbom_dict["components"] if c["bom-ref"] not in internal_refs
    ]
    sbom_dict["dependencies"] = [
        d for d in sbom_dict["dependencies"] if d["ref"] not in internal_refs
    ]
    for dependency in sbom_dict["dependencies"]:
        dependency["dependsOn"] = [
            ref for ref in dependency["dependsOn"] if ref not in internal_refs
        ]

    logger.info(
        "PUBLIC SBOM: Removed %d internal components",
        original_components_len - len(sbom_dict["components"]),
    )

    # Remove orphaned dependency entries — refs that are not present in the final component set.
    # This covers entries added from Endor Labs that reference components which were filtered out
    # at earlier stages (e.g. sub-packages, removed components) and are not internal-flagged.
    valid_refs = {sbom_dict["metadata"]["component"]["bom-ref"]} | {
        c["bom-ref"] for c in sbom_dict["components"]
    }
    original_deps_len = len(sbom_dict["dependencies"])
    sbom_dict["dependencies"] = [d for d in sbom_dict["dependencies"] if d["ref"] in valid_refs]
    for dependency in sbom_dict["dependencies"]:
        dependency["dependsOn"] = [ref for ref in dependency["dependsOn"] if ref in valid_refs]
    removed_deps = original_deps_len - len(sbom_dict["dependencies"])
    if removed_deps:
        logger.info("PUBLIC SBOM: Removed %d orphaned dependency entries", removed_deps)

    # Remove internal properties from public components
    original_properties_len = sum(len(c.get("properties", [])) for c in sbom_dict["components"])
    for component in sbom_dict["components"]:
        if "properties" in component:
            component["properties"] = [
                p for p in component["properties"] if not p.get("name", "").startswith("internal:")
            ]

    logger.info(
        "PUBLIC SBOM: Removed %d internal properties from public components",
        original_properties_len
        - sum(len(c.get("properties", [])) for c in sbom_dict["components"]),
    )
def is_valid_purl(purl: str) -> bool:
    """Return True when ``purl`` matches any of the per-type patterns in REGEX_PURL.

    The first matching PURL type is reported at debug level.
    """
    hit = next(
        ((kind, pattern) for kind, pattern in REGEX_PURL.items() if pattern.match(purl)),
        None,
    )
    if hit is None:
        return False
    kind, pattern = hit
    logger.debug("PURL: %s matched PURL type '%s' regex '%s'", purl, kind, pattern.pattern)
    return True
def read_sbom_json_file(file_path: str) -> dict:
    """Load a JSON SBOM file (schema is not validated).

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        The parsed JSON document. When the file cannot be opened/read or does not
        contain valid JSON, the error is logged and ``None`` is returned instead.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as input_json:
            result = json.load(input_json)
    except OSError as e:
        logger.error("Error loading SBOM file from %s", file_path)
        logger.error(e)
        return None
    except json.JSONDecodeError as e:
        logger.error("Error decoding JSON SBOM file from %s", file_path)
        logger.error(e)
        return None

    # "components" is optional in a CycloneDX document, so a status log line must
    # not raise KeyError when it is absent.
    logger.info(
        "SBOM loaded from %s with %d components", file_path, len(result.get("components", []))
    )
    return result
def remove_sbom_component(sbom_dict: dict, component_key: str) -> None:
    """Remove every component whose bom-ref starts with ``component_key``.

    Matching is by prefix, not equality. The same prefix test is applied to
    ``.dependencies[].ref`` (matching entries are dropped) and to the items of
    each remaining entry's ``dependsOn`` list.
    """

    def survives(ref: str) -> bool:
        return not ref.startswith(component_key)

    sbom_dict["components"] = [
        comp for comp in sbom_dict["components"] if survives(comp["bom-ref"])
    ]
    kept_dependencies = [dep for dep in sbom_dict["dependencies"] if survives(dep["ref"])]
    sbom_dict["dependencies"] = kept_dependencies
    for dep in kept_dependencies:
        dep["dependsOn"] = list(filter(survives, dep["dependsOn"]))

    logger.debug("Removed component '%s' from SBOM", component_key)
def set_component_version(
    component: dict, version: str, purl_version: str = None, cpe_version: str = None
) -> None:
    """Substitute the ``{{VERSION}}`` placeholder in a metadata SBOM component, in place.

    ``bom-ref`` and ``purl`` receive ``purl_version`` (URL-quoted for ``purl``),
    ``version`` receives ``version`` and ``cpe`` receives ``cpe_version``.
    ``purl_version`` and ``cpe_version`` fall back to ``version`` when empty or
    omitted. A warning is logged if the resulting purl fails is_valid_purl().
    """
    placeholder = "{{VERSION}}"
    purl_version = purl_version or version
    cpe_version = cpe_version or version

    component["bom-ref"] = component["bom-ref"].replace(placeholder, purl_version)
    component["version"] = component["version"].replace(placeholder, version)

    purl = component.get("purl")
    if purl:
        purl = purl.replace(placeholder, urllib.parse.quote(purl_version))
        component["purl"] = purl
        if not is_valid_purl(purl):
            logger.warning("PURL: Invalid PURL (%s)", purl)

    cpe = component.get("cpe")
    if cpe:
        component["cpe"] = cpe.replace(placeholder, cpe_version)
def set_dependency_version(dependencies: list, meta_bom_ref: str, purl_version: str) -> None:
    """Substitute ``{{VERSION}}`` in dependency entries that refer to ``meta_bom_ref``, in place.

    An entry's ``ref`` is rewritten when it equals ``meta_bom_ref`` and contains
    the placeholder; every ``dependsOn`` item equal to ``meta_bom_ref`` is
    rewritten as well. The number of rewrites of each kind is logged at debug level.
    """
    placeholder = "{{VERSION}}"
    refs_updated = 0
    depends_on_updated = 0

    for entry in dependencies:
        ref = entry["ref"]
        if placeholder in ref and ref == meta_bom_ref:
            entry["ref"] = ref.replace(placeholder, purl_version)
            refs_updated += 1

        # Assign by index so the existing dependsOn list object is updated in place.
        targets = entry["dependsOn"]
        for idx, target in enumerate(targets):
            if target == meta_bom_ref:
                targets[idx] = target.replace(placeholder, purl_version)
                depends_on_updated += 1

    logger.debug(
        "set_dependency_version: '%s' updated %d refs and %d dependsOn",
        meta_bom_ref,
        refs_updated,
        depends_on_updated,
    )
def add_component_dependsOn(dependencies: list, component_ref: str, depends_on_ref: str) -> None:
    """Record that ``component_ref`` depends on ``depends_on_ref`` in an SBOM dependency list.

    The first entry whose ``ref`` equals ``component_ref`` gets ``depends_on_ref``
    appended to its ``dependsOn`` list unless it is already there. When no such
    entry exists, a new one is appended to ``dependencies``.
    """
    entry = next((dep for dep in dependencies if dep["ref"] == component_ref), None)

    if entry is None:
        # ref missing from .dependencies[]
        dependencies.append({"ref": component_ref, "dependsOn": [depends_on_ref]})
        logger.debug(
            "Added new dependency ref for component '%s' with dependsOn reference '%s'",
            component_ref,
            depends_on_ref,
        )
        return

    if depends_on_ref in entry["dependsOn"]:
        logger.debug(
            "Component '%s' already has dependsOn reference '%s'",
            component_ref,
            depends_on_ref,
        )
        return

    entry["dependsOn"].append(depends_on_ref)
    logger.debug(
        "Added dependsOn reference '%s' to component '%s'",
        depends_on_ref,
        component_ref,
    )
def sbom_components_to_dict(sbom: dict, with_version: bool = False) -> dict:
    """Index the SBOM's components by their percent-decoded bom-ref.

    Args:
        sbom: SBOM document; ``sbom["components"]`` must exist and every
            component must have a ``bom-ref``.
        with_version: When True the full decoded bom-ref is the key. When False
            (default) everything from the first ``@`` onward is dropped first,
            giving a version-less key.

    Returns:
        Dict mapping key -> component. Components that produce the same key
        overwrite earlier ones.
    """
    components = sbom["components"]
    if with_version:
        return {urllib.parse.unquote(component["bom-ref"]): component for component in components}

    # Cut the version off BEFORE percent-decoding. In an encoded PURL a literal "@"
    # only introduces the version, while an "@" inside a namespace or name is written
    # as "%40" (e.g. "pkg:npm/%40angular/core@1.0"). Decoding first would turn that
    # "%40" into "@" and truncate every scoped package to the same key "pkg:npm/".
    return {
        urllib.parse.unquote(component["bom-ref"].split("@", 1)[0]): component
        for component in components
    }
def write_sbom_json_file(sbom_dict: dict, file_path: str) -> None:
    """Save a JSON SBOM file (schema is not validated).

    The SBOM is serialized with a 2-space indent and written as UTF-8 to the
    absolute form of ``file_path``. I/O errors (OSError) and serialization
    errors (TypeError) are logged rather than raised.
    """
    try:
        file_path = os.path.abspath(file_path)
        # Serialize before opening the target: opening with "w" truncates the file,
        # so a serialization failure after that point would replace an existing
        # SBOM with an empty file.
        formatted_sbom = json.dumps(sbom_dict, indent=2)
        with open(file_path, "w", encoding="utf-8") as output_json:
            output_json.write(formatted_sbom)
    except OSError as e:
        logger.error("Error writing SBOM file to %s", file_path)
        logger.error(e)
    except TypeError as e:
        logger.error("Error serializing SBOM to JSON for file %s", file_path)
        logger.error(e)
    else:
        logger.info("SBOM file saved to %s", file_path)