From 04c22e97149047687e83b68f01196b6506ff3bdd Mon Sep 17 00:00:00 2001
From: Steve McClure <steve.mcclure@mongodb.com>
Date: Fri, 17 Apr 2026 10:24:10 -0400
Subject: [PATCH] SERVER-124192: Make lint_markdown_links.py more robust
 (#51892)

GitOrigin-RevId: cc7170db4cbabbb75c3cb980e9c3619e6821011c
---
 buildscripts/lint_markdown_links.py           | 268 +++++++++++-------
 src/mongo/db/exec/sbe/README.md               |   2 +-
 src/mongo/db/index_builds/README.md           |   2 +-
 src/mongo/db/repl/README.md                   |   2 +-
 .../db/shard_role/lock_manager/README.md      |   2 +-
 .../db/shard_role/shard_catalog/README.md     |   2 +-
 6 files changed, 170 insertions(+), 108 deletions(-)
diff --git a/buildscripts/lint_markdown_links.py b/buildscripts/lint_markdown_links.py
index b3eb39b6ced..dacfa5e8ab0 100755
--- a/buildscripts/lint_markdown_links.py
+++ b/buildscripts/lint_markdown_links.py
@@ -124,14 +124,31 @@ from typing import Iterable, Optional
 
 HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$")
 HTML_ANCHOR_RE = re.compile(r'<a\s+(?:name|id)=["\']([^"\']+)["\']\s*>\s*</a>?', re.IGNORECASE)
-LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
+# Matches [text](url) where url is either an angle-bracket form <url> (CommonMark §6.3,
+# allows parentheses inside the URL) or a plain URL (stops at the first ')', escaped or not).
+LINK_RE = re.compile(r"\[([^\]]+)\]\((<[^>]*>|[^)]+)\)")
 # Inline link references: [text]: url
 REF_DEF_RE = re.compile(r"^\s*\[([^\]]+)\]:\s+(\S+)")
+# Multi-line ref def: label on one line, URL indented on the next (valid CommonMark/Prettier output)
+REF_DEF_LABEL_ONLY_RE = re.compile(r"^\s*\[([^\]]+)\]:\s*$")
 # Reference-style links: [text][label] or [text][] but NOT [[double brackets]]
 # Negative lookbehind (?<!\[) ensures first [ is not preceded by [
 # Negative lookahead (?!\[) ensures first [ is not followed by another [
 REF_USE_RE = re.compile(r"(?<!\[)\[([^\]]+)\](?!\])\[(?:(?:[^\]]+))?\]")
 
+
+def _has_dangling_open_bracket(text: str) -> bool:
+    """Return True if *text* still has a '[' once code spans and closed brackets are removed.
+
+    Used to decide whether to join the next line when scanning for multi-line inline links.
+    """
+    t = re.sub(r"`[^`]*`", "", text)  # strip inline code spans
+    t = re.sub(r"\[[^\]]*\]\([^)]*\)", "", t)  # strip complete [text](url) links
+    t = re.sub(r"\[[^\]]*\]\[[^\]]*\]", "", t)  # strip complete [text][label] links
+    t = re.sub(r"\[[^\]]*\]", "", t)  # strip any remaining closed [...] (shortcut refs)
+    return "[" in t  # NOTE(review): a lone backslash-escaped "\[" also counts as dangling here
+
+
 # Characters removed for anchor IDs (GitHub rules simplified). We strip most punctuation except hyphen and underscore.
 PUNCT_TO_STRIP = "\"'!#$%&()*+,./:;<=>?@[]^`{|}~"  # punctuation characters to remove
 ANCHOR_CACHE: dict[str, set[str]] = {}
@@ -251,18 +268,37 @@ def collect_headings(path: str) -> set[str]:
 
 
 def collect_reference_definitions(path: str) -> dict[str, str]:
-    """Parse all reference-style link definitions [label]: url from a markdown file."""
+    """Parse all reference-style link definitions [label]: url from a markdown file.
+
+    Handles both single-line and Prettier-style multi-line forms:
+      [label]: https://...          (single-line)
+      [label]:                      (multi-line — CommonMark allows URL on next line)
+        https://...
+    """
     if path in REFERENCE_CACHE:
         return REFERENCE_CACHE[path]
     references: dict[str, str] = {}
     try:
         with open(path, "r", encoding="utf-8") as f:
-            for line in f:
-                m = REF_DEF_RE.match(line)
-                if m:
-                    label = m.group(1).strip().lower()  # case-insensitive matching
-                    target = m.group(2).strip()
-                    references[label] = target
+            lines = f.readlines()
+        i = 0
+        while i < len(lines):
+            line = lines[i]
+            m = REF_DEF_RE.match(line)
+            if m:
+                label = m.group(1).strip().lower()  # case-insensitive matching
+                target = m.group(2).strip()
+                references[label] = target
+            else:
+                m2 = REF_DEF_LABEL_ONLY_RE.match(line)
+                if m2 and i + 1 < len(lines):
+                    next_line = lines[i + 1]
+                    m3 = re.match(r"^\s+(\S+)", next_line)
+                    if m3:
+                        label = m2.group(1).strip().lower()
+                        references[label] = m3.group(1).strip()
+                        i += 1  # consume the URL line
+            i += 1
     except Exception:
         pass
     REFERENCE_CACHE[path] = references
@@ -286,111 +322,133 @@ def parse_links(file_path: str) -> list[tuple[int, str, str]]:
     links: list[tuple[int, str, str]] = []
     try:
         with open(file_path, "r", encoding="utf-8") as f:
-            in_fence = False
-            in_blockquote = False
-            fence_delim = None  # track ``` or ~~~
-            for idx, raw_line in enumerate(f, start=1):
-                line = raw_line.rstrip("\n")
-                # Detect start/end of fenced code blocks. Accept ``` or ~~~ with optional language and leading whitespace.
-                fence_match = re.match(r"^\s*(?P<delim>`{3,}|~{3,})(.*)$", line)
-                if fence_match:
-                    full = fence_match.group("delim")
-                    # Toggle if same delimiter starts/ends
-                    if not in_fence:
-                        in_fence = True
-                        fence_delim = full
-                        continue
-                    else:
-                        # Only close if same delimiter length & char
-                        if fence_delim == full:
-                            in_fence = False
-                            fence_delim = None
-                            continue
-                if in_fence:
-                    continue  # skip link detection inside code fences
-                # Blockquote handling: if line starts with '>' treat entire following wrapped paragraph as quoted until blank line
-                if re.match(r"^\s*>", line):
-                    in_blockquote = True
+            raw_lines = f.readlines()
+        in_fence = False
+        in_blockquote = False
+        fence_delim = None  # track ``` or ~~~
+        i = 0
+        while i < len(raw_lines):
+            idx = i + 1  # 1-based line number
+            line = raw_lines[i].rstrip("\n")
+            i += 1
+            # Detect start/end of fenced code blocks. Accept ``` or ~~~ with optional language and leading whitespace.
+            fence_match = re.match(r"^\s*(?P<delim>`{3,}|~{3,})(.*)$", line)
+            if fence_match:
+                full = fence_match.group("delim")
+                # Toggle if same delimiter starts/ends
+                if not in_fence:
+                    in_fence = True
+                    fence_delim = full
                     continue
-                if in_blockquote:
-                    if line.strip() == "":
-                        in_blockquote = False
-                    else:
+                else:
+                    # Only close if same delimiter length & char
+                    if fence_delim == full:
+                        in_fence = False
+                        fence_delim = None
                         continue
-                # Skip lines that are reference definitions themselves
-                if REF_DEF_RE.match(line):
+            if in_fence:
+                continue  # skip link detection inside code fences
+            # Blockquote handling: if line starts with '>' treat entire following wrapped paragraph as quoted until blank line
+            if re.match(r"^\s*>", line):
+                in_blockquote = True
+                continue
+            if in_blockquote:
+                if line.strip() == "":
+                    in_blockquote = False
+                else:
                     continue
+            # Skip lines that are reference definitions themselves (single-line or multi-line forms)
+            if REF_DEF_RE.match(line):
+                continue
+            if REF_DEF_LABEL_ONLY_RE.match(line):
+                # Multi-line ref def: skip this line and the indented URL on the next line
+                if i < len(raw_lines) and re.match(r"^\s+\S", raw_lines[i]):
+                    i += 1
+                continue
 
-                # Find all backtick regions to exclude from link detection
-                # Build a set of character positions that are inside backticks
-                backtick_positions = set()
-                in_code = False
-                for i, char in enumerate(line):
-                    if char == "`":
-                        in_code = not in_code
-                    elif in_code:
-                        backtick_positions.add(i)
+            # Join continuation lines for multi-line inline links.
+            # CommonMark allows link text (and destination) to span lines, e.g.:
+            #   [Sync Source
+            #   Selection](https://...)
+            # Consume following lines until the bracket is closed or a blank line is hit.
+            scan_line = line
+            while _has_dangling_open_bracket(scan_line) and i < len(raw_lines):
+                next_raw = raw_lines[i].rstrip("\n")
+                if not next_raw.strip():
+                    break  # blank line ends a paragraph; inline links can't span it
+                scan_line = scan_line + " " + next_raw
+                i += 1
 
-                # Helper function to check if the opening bracket of a link is inside backticks
-                # We only check the start position because if the [ is in code, the whole link should be skipped
-                def is_in_code_span(match_start):
-                    return match_start in backtick_positions
+            # Find all backtick regions to exclude from link detection
+            # Build a set of character positions that are inside backticks
+            backtick_positions = set()
+            in_code = False
+            for j, char in enumerate(scan_line):
+                if char == "`":
+                    in_code = not in_code
+                elif in_code:
+                    backtick_positions.add(j)
 
-                # Track character ranges of all matched links to avoid double-processing
-                matched_ranges = []
+            # Helper function to check if the opening bracket of a link is inside backticks
+            # We only check the start position because if the [ is in code, the whole link should be skipped
+            def is_in_code_span(match_start):
+                return match_start in backtick_positions
 
-                def overlaps_matched_range(start, end):
-                    """Check if a position range overlaps with any previously matched range."""
-                    for m_start, m_end in matched_ranges:
-                        # Check for any overlap
-                        if start < m_end and end > m_start:
-                            return True
-                    return False
+            # Track character ranges of all matched links to avoid double-processing
+            matched_ranges = []
 
-                # Inline links [text](url)
-                for m in LINK_RE.finditer(line):
-                    if is_in_code_span(m.start()):
-                        continue  # Skip links inside backticks
-                    text, target = m.group(1), m.group(2).strip()
-                    links.append((idx, text, target))
-                    matched_ranges.append((m.start(), m.end()))
+            def overlaps_matched_range(start, end):
+                """Check if a position range overlaps with any previously matched range."""
+                for m_start, m_end in matched_ranges:
+                    # Check for any overlap
+                    if start < m_end and end > m_start:
+                        return True
+                return False
 
-                # Reference-style links [text][label] or [text][]
-                for m in REF_USE_RE.finditer(line):
-                    if is_in_code_span(m.start()):
-                        continue  # Skip links inside backticks
-                    full_match = m.group(0)
-                    text = m.group(1).strip()
-                    # Extract label from [text][label] - if empty brackets [], use text as label
-                    label_part = full_match[len(text) + 2 :]  # skip [text]
-                    if label_part == "[]":
-                        label = text  # implicit reference: [text][] uses "text" as label
-                    else:
-                        # Explicit label: [text][label]
-                        label = label_part.strip("[]").strip()
+            # Inline links [text](url)
+            for m in LINK_RE.finditer(scan_line):
+                if is_in_code_span(m.start()):
+                    continue  # Skip links inside backticks
+                text, target = m.group(1), m.group(2).strip()
+                links.append((idx, text, target))
+                matched_ranges.append((m.start(), m.end()))
+
+            # Reference-style links [text][label] or [text][]
+            for m in REF_USE_RE.finditer(scan_line):
+                if is_in_code_span(m.start()):
+                    continue  # Skip links inside backticks
+                full_match = m.group(0)
+                text = m.group(1).strip()
+                # Extract label from [text][label] - if empty brackets [], use text as label
+                label_part = full_match[len(text) + 2 :]  # skip [text]
+                if label_part == "[]":
+                    label = text  # implicit reference: [text][] uses "text" as label
+                else:
+                    # Explicit label: [text][label]
+                    label = label_part.strip("[]").strip()
+                # Use special marker to indicate this is a reference link
+                links.append((idx, text, f"__REF__{label}"))
+                matched_ranges.append((m.start(), m.end()))
+
+            # Shortcut reference links [text] - single bracket that references a definition
+            # Only match if not already matched by inline or reference-style patterns
+            # Pattern: single bracket pair not preceded by [ and not followed by ( or [
+            for m in re.finditer(r"(?<!\[)\[([^\]]+)\](?![(\[])", scan_line):
+                if is_in_code_span(m.start()):
+                    continue  # Skip links inside backticks
+                # Skip if overlaps with already matched ranges
+                if overlaps_matched_range(m.start(), m.end()):
+                    continue
+                # Skip if this is part of a double bracket pattern [[...]]
+                if m.end() < len(scan_line) and scan_line[m.end()] == "]":
+                    continue
+                text = m.group(1).strip()
+                # Only treat as reference link if it could plausibly be one
+                # (contains text, not just punctuation or numbers)
+                if text and not text.isdigit():
                     # Use special marker to indicate this is a reference link
-                    links.append((idx, text, f"__REF__{label}"))
-                    matched_ranges.append((m.start(), m.end()))
-
-                # Shortcut reference links [text] - single bracket that references a definition
-                # Only match if not already matched by inline or reference-style patterns
-                # Pattern: single bracket pair not preceded by [ and not followed by ( or [
-                for m in re.finditer(r"(?<!\[)\[([^\]]+)\](?![(\[])", line):
-                    if is_in_code_span(m.start()):
-                        continue  # Skip links inside backticks
-                    # Skip if overlaps with already matched ranges
-                    if overlaps_matched_range(m.start(), m.end()):
-                        continue
-                    # Skip if this is part of a double bracket pattern [[...]]
-                    if m.end() < len(line) and line[m.end()] == "]":
-                        continue
-                    text = m.group(1).strip()
-                    # Only treat as reference link if it could plausibly be one
-                    # (contains text, not just punctuation or numbers)
-                    if text and not text.isdigit():
-                        # Use special marker to indicate this is a reference link
-                        # For shortcut references, the label is the text itself
-                        links.append((idx, text, f"__REF__{text}"))
+                    # For shortcut references, the label is the text itself
+                    links.append((idx, text, f"__REF__{text}"))
     except Exception:
         pass
     return links
@@ -467,6 +525,10 @@ def validate_link(current_file: str, line: int, text: str, target: str) -> Optio
             return LinkIssue(current_file, line, text, target, "anchor not found in this file")
         return None
 
+    # Strip CommonMark angle-bracket URL quoting: <https://...> -> https://...
+    if target.startswith("<") and target.endswith(">"):
+        target = target[1:-1]
+
     # Split fragment if present
     file_part, frag_part = target.split("#", 1) if "#" in target else (target, None)
 
diff --git a/src/mongo/db/exec/sbe/README.md b/src/mongo/db/exec/sbe/README.md
index 8f4013233ae..9c0eaed3ea2 100644
--- a/src/mongo/db/exec/sbe/README.md
+++ b/src/mongo/db/exec/sbe/README.md
@@ -461,7 +461,7 @@ for more details.
 ## Runtime Planners
 
 See [Classic Runtime Planners for SBE
-README](https://github.com/mongodb/mongo/blob/master/src/mongo/db/exec/runtime_planners/classic_runtime_planner_for_sbe/README.md).
+README](/src/mongo/db/exec/runtime_planners/classic_runtime_planner_for_sbe/README.md).
 
 ## Incomplete Sections Below (TODO)
 
diff --git a/src/mongo/db/index_builds/README.md b/src/mongo/db/index_builds/README.md
index 954b0c6028e..5cc841bb710 100644
--- a/src/mongo/db/index_builds/README.md
+++ b/src/mongo/db/index_builds/README.md
@@ -270,7 +270,7 @@ using the startup recovery logic that RTT uses to bring the node back to a writa
 For improved rollback semantics, resumable index builds require a majority read cursor during
 collection scan phase. Index builds wait for the majority commit point to advance before starting
 the collection scan. The majority wait happens after installing the [side table for intercepting new
-writes](#internal-side-table-for-new-writes).
+writes](#internal-table-for-side-writes).
 
 See
 [MultiIndexBlock::\_constructStateObject()](https://github.com/mongodb/mongo/blob/0d45dd9d7ba9d3a1557217a998ad31c68a897d47/src/mongo/db/catalog/multi_index_block.cpp#L900)
diff --git a/src/mongo/db/repl/README.md b/src/mongo/db/repl/README.md
index c6202165c59..0269d3731e1 100644
--- a/src/mongo/db/repl/README.md
+++ b/src/mongo/db/repl/README.md
@@ -2025,7 +2025,7 @@ candidate if `(vc, tc) >= (v, t)`. For a description of the complete voting beha
 ### Formal Specification
 
 For more details on the safe reconfig protocol and its behaviors, refer to the [TLA+
-specification](https://github.com/mongodb/mongo/tree/master/src/mongo/tla_plus/MongoReplReconfig)
+specification](/src/mongo/tla_plus/Replication/MongoReplReconfig)
 or the [paper on MongoDB's reconfig protocol](https://arxiv.org/abs/2102.11960), written in part by replication engineers.
 It defines two main invariants of the protocol, ElectionSafety and NeverRollbackCommitted,
 which assert, respectively, that no two leaders are elected in the same term and that majority
diff --git a/src/mongo/db/shard_role/lock_manager/README.md b/src/mongo/db/shard_role/lock_manager/README.md
index 798a9d2c0f5..870976a5e5c 100644
--- a/src/mongo/db/shard_role/lock_manager/README.md
+++ b/src/mongo/db/shard_role/lock_manager/README.md
@@ -106,7 +106,7 @@ ResourceType, as locking at this level is done in the storage engine itself for
 ### Document Level Concurrency Control
 
 Each storage engine is responsible for locking at the document level. The [WiredTiger storage
-engine](../storage/wiredtiger/README.md) uses MVCC [multi-version concurrency control][Multiversion concurrency control] along with optimistic locking in order to provide concurrency guarantees.
+engine](../../storage/wiredtiger/README.md) uses MVCC [multi-version concurrency control][Multiversion concurrency control] along with optimistic locking in order to provide concurrency guarantees.
 
 ## Two-Phase Locking
 
diff --git a/src/mongo/db/shard_role/shard_catalog/README.md b/src/mongo/db/shard_role/shard_catalog/README.md
index 356c6e20db0..03ebd0f9544 100644
--- a/src/mongo/db/shard_role/shard_catalog/README.md
+++ b/src/mongo/db/shard_role/shard_catalog/README.md
@@ -758,7 +758,7 @@ increasing cluster key values.
 Because unlike regular capped collections, clustered collections do not need to preserve insertion
 order, they allow non-serialized concurrent writes. In order to avoid missing documents while
 tailing a clustered collection, the user is required to enforce visibility rules similar to the ['no
-holes' point](../storage/README.md#oplog-visibility). Majority read concern is similarly required.
+holes' point](../../storage/README.md#oplog-visibility). Majority read concern is similarly required.
 
 ## Clustered RecordIds