mongo/buildscripts/monitor_build_status/cli.py
Jonas Bergler 31bfafd041 SERVER-127124: block-on-red, group level overrides for dta (#54065)
GitOrigin-RevId: d72cd2fe25fd31a620ad7fb5bc9564b1b241ea25
2026-05-21 14:35:23 +00:00

344 lines
13 KiB
Python

from __future__ import annotations
import os
import sys
from enum import Enum
import requests
import structlog
import typer
from tabulate import tabulate
from typing_extensions import Annotated
# Get relative imports to work when the package is not installed on the PYTHONPATH.
if __name__ == "__main__" and __package__ is None:
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from buildscripts.client.jiraclient import JiraAuth, JiraClient
from buildscripts.monitor_build_status.code_lockdown_config import (
CodeLockdownConfig,
IssueThresholds,
NotificationsConfig,
ScopesConfig,
)
from buildscripts.monitor_build_status.issue_report import IssueCategory, IssueReport
from buildscripts.monitor_build_status.jira_service import JiraService
from buildscripts.util.cmdutils import enable_logging
from buildscripts.util.expansions import get_expansion
LOGGER = structlog.get_logger(__name__)
CODE_LOCKDOWN_CONFIG = "etc/code_lockdown.yml"
JIRA_SERVER = "https://jira.mongodb.org"
DEFAULT_REPO = "10gen/mongo"
ORG_OVERALL = "[Org] Overall"
class CodeMergeStatus(Enum):
RED = "RED"
GREEN = "GREEN"
@classmethod
def from_threshold_percentages(cls, threshold_percentages: list[float]) -> CodeMergeStatus:
if any(percentage > 100 for percentage in threshold_percentages):
return cls.RED
return cls.GREEN
class SummaryMsg(Enum):
BELOW_THRESHOLDS = "All metrics are within 100% of their thresholds. All merges are allowed."
THRESHOLD_EXCEEDED = (
"At least one team exceeds 100% of its threshold. "
"Approve only changes that fix BFs, Bugs, and Performance Regressions in the following scopes:"
)
ZERO_QUOTA_EXCEEDED = (
"The following triage queues should be kept empty but currently have issues. "
"Route or resolve these immediately:"
)
class MonitorBuildStatusOrchestrator:
def __init__(
self,
jira_service: JiraService,
code_lockdown_config: CodeLockdownConfig,
slack_webhook_url: str = None,
) -> None:
self.jira_service = jira_service
self.code_lockdown_config = code_lockdown_config
self.slack_webhook_url = slack_webhook_url
def evaluate_build_redness(self, notify: bool) -> None:
for notification_config in self.code_lockdown_config.notifications:
status_message = f"\n`[STATUS]` Issue count for '{DEFAULT_REPO}' repo\n"
summaries = ""
for scopes_config in notification_config.scopes:
scope_percentages: dict[str, list[float]] = {}
issue_report = self._make_report(scopes_config)
issue_count_status_msg, issue_count_percentages, zero_quota_labels = (
self._get_issue_counts_status(
scopes_config.name, issue_report, notification_config
)
)
status_message = f"{status_message}{issue_count_status_msg}\n"
scope_percentages.update(issue_count_percentages)
summary = self._summarize(scopes_config.name, scope_percentages, zero_quota_labels)
summaries = f"{summaries}{summary}\n"
status_message = f"{status_message}{summaries}"
status_message = f"{status_message}\n{notification_config.slack.message_footer}"
for line in status_message.split("\n"):
LOGGER.info(line)
if notify:
LOGGER.info("Sending Slack webhook notification")
self._send_slack_webhook(status_message.strip())
def _send_slack_webhook(self, message: str) -> None:
"""Send a message to Slack using a webhook URL from the Devprod Correctness Slack app."""
if not self.slack_webhook_url:
raise ValueError(
"Slack webhook URL is required for notifications. "
"Please provide --slack-webhook-url parameter."
)
payload = {"text": message}
try:
response = requests.post(self.slack_webhook_url, json=payload, timeout=30)
response.raise_for_status()
LOGGER.info("Successfully sent Slack webhook notification")
except requests.exceptions.RequestException as e:
LOGGER.error("Failed to send Slack webhook notification", error=str(e))
raise
def _make_report(self, scopes_config: ScopesConfig) -> IssueReport:
LOGGER.info("Processing scope", name=scopes_config.name)
hot_query = scopes_config.jira_queries.hot
LOGGER.info("Getting hot issues from Jira", query=hot_query)
hot_issues = self.jira_service.fetch_issues(hot_query)
cold_query = scopes_config.jira_queries.cold
LOGGER.info("Getting cold issues from Jira", query=cold_query)
cold_issues = self.jira_service.fetch_issues(cold_query)
LOGGER.info("Got active Issues", count_hot=len(hot_issues), count_cold=len(cold_issues))
report = IssueReport.empty()
report.add_issues(hot=hot_issues, cold=cold_issues)
return report
def _get_issue_counts_status(
self, scope_name: str, issue_report: IssueReport, notification_config: NotificationsConfig
) -> tuple[str, dict[str, list[float]], set[str]]:
percentages: dict[str, list[float]] = {}
zero_quota_labels: set[str] = set()
headers = [scope_name, "Hot Issues", "Cold Issues"]
table_data = []
def _process_thresholds(
sub_scope_name: str,
hot_issue_count: int,
cold_issue_count: int,
thresholds: IssueThresholds,
slack_tags: list[str],
) -> None:
if all(count == 0 for count in [hot_issue_count, cold_issue_count]):
return
try:
hot_bf_percentage = hot_issue_count / thresholds.hot.count * 100
hot_bf_percentage_str = f"{hot_bf_percentage:.0f}"
except ZeroDivisionError:
if hot_issue_count > 0:
hot_bf_percentage = 9999
hot_bf_percentage_str = ""
else:
hot_bf_percentage = 0
hot_bf_percentage_str = "0"
try:
cold_bf_percentage = cold_issue_count / thresholds.cold.count * 100
cold_bf_percentage_str = f"{cold_bf_percentage:.0f}"
except ZeroDivisionError:
if cold_issue_count > 0:
cold_bf_percentage = 9999
cold_bf_percentage_str = ""
else:
cold_bf_percentage = 0
cold_bf_percentage_str = "0"
label = f"{sub_scope_name} {' '.join(slack_tags)}"
if (thresholds.hot.count == 0 and hot_issue_count > 0) or (
thresholds.cold.count == 0 and cold_issue_count > 0
):
zero_quota_labels.add(label)
percentages[label] = [hot_bf_percentage, cold_bf_percentage]
if (
sub_scope_name != ORG_OVERALL
and notification_config.slack.short_issue_data_table
and CodeMergeStatus.from_threshold_percentages(
[hot_bf_percentage, cold_bf_percentage]
)
== CodeMergeStatus.GREEN
):
return
table_data.append(
[
sub_scope_name,
f"{hot_bf_percentage_str}% ({hot_issue_count} / {thresholds.hot.count})",
f"{cold_bf_percentage_str}% ({cold_issue_count} / {thresholds.cold.count})",
]
)
overall_thresholds = notification_config.thresholds.overall
overall_slack_tags = notification_config.slack.overall_scope_tags
_process_thresholds(
ORG_OVERALL,
issue_report.get_issue_count(
IssueCategory.HOT,
overall_thresholds.hot.grace_period_days * 24,
),
issue_report.get_issue_count(
IssueCategory.COLD,
overall_thresholds.cold.grace_period_days * 24,
),
overall_thresholds,
overall_slack_tags,
)
for group_name in sorted(self.code_lockdown_config.get_all_group_names()):
group_teams = self.code_lockdown_config.get_group_teams(group_name)
group_thresholds = self.code_lockdown_config.get_group_thresholds(
group_name, notification_config.thresholds.group
)
group_slack_tags = self.code_lockdown_config.get_group_slack_tags(group_name)
_process_thresholds(
f"[Group] {group_name}",
issue_report.get_issue_count(
IssueCategory.HOT,
group_thresholds.hot.grace_period_days * 24,
group_teams,
),
issue_report.get_issue_count(
IssueCategory.COLD,
group_thresholds.cold.grace_period_days * 24,
group_teams,
),
group_thresholds,
group_slack_tags,
)
for assigned_team in sorted(list(issue_report.team_reports.keys())):
team_thresholds = self.code_lockdown_config.get_team_thresholds(
assigned_team, notification_config.thresholds.team
)
team_slack_tags = self.code_lockdown_config.get_team_slack_tags(assigned_team)
_process_thresholds(
f"[Team] {assigned_team}",
issue_report.get_issue_count(
IssueCategory.HOT,
team_thresholds.hot.grace_period_days * 24,
[assigned_team],
),
issue_report.get_issue_count(
IssueCategory.COLD,
team_thresholds.cold.grace_period_days * 24,
[assigned_team],
),
team_thresholds,
team_slack_tags,
)
table_str = tabulate(
table_data, headers, tablefmt="outline", colalign=("left", "right", "right")
)
message = f"```\n{table_str}\n```"
return message, percentages, zero_quota_labels
@staticmethod
def _summarize(
scope_name: str,
scope_percentages: dict[str, list[float]],
zero_quota_labels: set[str],
) -> str:
summary = f"`SUMMARY [{scope_name}]`"
normal_red = []
zero_quota_red = []
for sub_scope, percentages in scope_percentages.items():
if CodeMergeStatus.from_threshold_percentages(percentages) == CodeMergeStatus.RED:
if sub_scope in zero_quota_labels:
zero_quota_red.append(sub_scope)
else:
normal_red.append(sub_scope)
if not normal_red and not zero_quota_red:
summary = f"{summary} {SummaryMsg.BELOW_THRESHOLDS.value}"
else:
if normal_red:
summary = f"{summary} {SummaryMsg.THRESHOLD_EXCEEDED.value}"
for sub_scope in normal_red:
summary = f"{summary}\n\t- {sub_scope}"
if zero_quota_red:
summary = f"{summary}\n{SummaryMsg.ZERO_QUOTA_EXCEEDED.value}"
for sub_scope in zero_quota_red:
summary = f"{summary}\n\t- {sub_scope}"
return summary
def main(
notify: Annotated[
bool, typer.Option(help="Whether to send slack notification with the results")
] = False, # default to the more "quiet" setting
webhook_expansion_name: Annotated[
str, typer.Option(help="Evergreen expansion name for the Slack webhook URL")
] = "mongo-code-lockdown-webhook",
) -> None:
"""
Analyze Jira BFs count for redness reports.
For Jira API authentication, use `JIRA_AUTH_PAT` env variable.
More about Jira Personal Access Tokens (PATs) here:
- https://wiki.corp.mongodb.com/pages/viewpage.action?pageId=218995581
Example:
JIRA_AUTH_PAT=<auth-token> python buildscripts/monitor_build_status/cli.py --help
"""
enable_logging(verbose=False)
# Get webhook URL from Evergreen expansion
slack_webhook_url = get_expansion(webhook_expansion_name)
jira_client = JiraClient(JIRA_SERVER, JiraAuth())
jira_service = JiraService(jira_client=jira_client)
code_lockdown_config = CodeLockdownConfig.from_yaml_config(CODE_LOCKDOWN_CONFIG)
orchestrator = MonitorBuildStatusOrchestrator(
jira_service=jira_service,
code_lockdown_config=code_lockdown_config,
slack_webhook_url=slack_webhook_url,
)
orchestrator.evaluate_build_redness(notify)
app = typer.Typer(pretty_exceptions_show_locals=False)
app.command()(main)
if __name__ == "__main__":
app()