From 04c22e97149047687e83b68f01196b6506ff3bdd Mon Sep 17 00:00:00 2001 From: Steve McClure Date: Fri, 17 Apr 2026 10:24:10 -0400 Subject: [PATCH] SERVER-124192: Make lint_markdown_links.py more robust (#51892) GitOrigin-RevId: cc7170db4cbabbb75c3cb980e9c3619e6821011c --- buildscripts/lint_markdown_links.py | 268 +++++++++++------- src/mongo/db/exec/sbe/README.md | 2 +- src/mongo/db/index_builds/README.md | 2 +- src/mongo/db/repl/README.md | 2 +- .../db/shard_role/lock_manager/README.md | 2 +- .../db/shard_role/shard_catalog/README.md | 2 +- 6 files changed, 170 insertions(+), 108 deletions(-) diff --git a/buildscripts/lint_markdown_links.py b/buildscripts/lint_markdown_links.py index b3eb39b6ced..dacfa5e8ab0 100755 --- a/buildscripts/lint_markdown_links.py +++ b/buildscripts/lint_markdown_links.py @@ -124,14 +124,31 @@ from typing import Iterable, Optional HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$") HTML_ANCHOR_RE = re.compile(r'\s*?', re.IGNORECASE) -LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)") +# Matches [text](url) where url is either an angle-bracket form (CommonMark §6.3, +# allows parentheses inside the URL) or a plain URL (stops at the first unescaped ')'). +LINK_RE = re.compile(r"\[([^\]]+)\]\((<[^>]*>|[^)]+)\)") # Inline link references: [text]: url REF_DEF_RE = re.compile(r"^\s*\[([^\]]+)\]:\s+(\S+)") +# Multi-line ref def: label on one line, URL indented on the next (valid CommonMark/Prettier output) +REF_DEF_LABEL_ONLY_RE = re.compile(r"^\s*\[([^\]]+)\]:\s*$") # Reference-style links: [text][label] or [text][] but NOT [[double brackets]] # Negative lookbehind (? bool: + """Return True when *text* contains a '[' that opens a potential inline link not yet closed. + + Used to decide whether to join the next line when scanning for multi-line inline links. + """ + t = re.sub(r"`[^`]*`", "", text) # strip inline code spans + t = re.sub(r"\[[^\]]*\]\([^)]*\)", "", t) # strip complete [text](url) links + t = re.sub(r"\[[^\]]*\]\[[^\]]*\]", "", t) # strip complete [text][label] links + t = re.sub(r"\[[^\]]*\]", "", t) # strip any remaining closed [...] (shortcut refs) + return "[" in t + + # Characters removed for anchor IDs (GitHub rules simplified). We strip most punctuation except hyphen and underscore. PUNCT_TO_STRIP = "\"'!#$%&()*+,./:;<=>?@[]^`{|}~" # punctuation characters to remove ANCHOR_CACHE: dict[str, set[str]] = {} @@ -251,18 +268,37 @@ def collect_headings(path: str) -> set[str]: def collect_reference_definitions(path: str) -> dict[str, str]: - """Parse all reference-style link definitions [label]: url from a markdown file.""" + """Parse all reference-style link definitions [label]: url from a markdown file. + + Handles both single-line and Prettier-style multi-line forms: + [label]: https://... (single-line) + [label]: (multi-line — CommonMark allows URL on next line) + https://... + """ if path in REFERENCE_CACHE: return REFERENCE_CACHE[path] references: dict[str, str] = {} try: with open(path, "r", encoding="utf-8") as f: - for line in f: - m = REF_DEF_RE.match(line) - if m: - label = m.group(1).strip().lower() # case-insensitive matching - target = m.group(2).strip() - references[label] = target + lines = f.readlines() + i = 0 + while i < len(lines): + line = lines[i] + m = REF_DEF_RE.match(line) + if m: + label = m.group(1).strip().lower() # case-insensitive matching + target = m.group(2).strip() + references[label] = target + else: + m2 = REF_DEF_LABEL_ONLY_RE.match(line) + if m2 and i + 1 < len(lines): + next_line = lines[i + 1] + m3 = re.match(r"^\s+(\S+)", next_line) + if m3: + label = m2.group(1).strip().lower() + references[label] = m3.group(1).strip() + i += 1 # consume the URL line + i += 1 except Exception: pass REFERENCE_CACHE[path] = references @@ -286,111 +322,133 @@ def parse_links(file_path: str) -> list[tuple[int, str, str]]: links: list[tuple[int, str, str]] = [] try: with open(file_path, "r", encoding="utf-8") as f: - in_fence = False - in_blockquote = False - fence_delim = None # track ``` or ~~~ - for idx, raw_line in enumerate(f, start=1): - line = raw_line.rstrip("\n") - # Detect start/end of fenced code blocks. Accept ``` or ~~~ with optional language and leading whitespace. - fence_match = re.match(r"^\s*(?P`{3,}|~{3,})(.*)$", line) - if fence_match: - full = fence_match.group("delim") - # Toggle if same delimiter starts/ends - if not in_fence: - in_fence = True - fence_delim = full - continue - else: - # Only close if same delimiter length & char - if fence_delim == full: - in_fence = False - fence_delim = None - continue - if in_fence: - continue # skip link detection inside code fences - # Blockquote handling: if line starts with '>' treat entire following wrapped paragraph as quoted until blank line - if re.match(r"^\s*>", line): - in_blockquote = True + raw_lines = f.readlines() + in_fence = False + in_blockquote = False + fence_delim = None # track ``` or ~~~ + i = 0 + while i < len(raw_lines): + idx = i + 1 # 1-based line number + line = raw_lines[i].rstrip("\n") + i += 1 + # Detect start/end of fenced code blocks. Accept ``` or ~~~ with optional language and leading whitespace. + fence_match = re.match(r"^\s*(?P`{3,}|~{3,})(.*)$", line) + if fence_match: + full = fence_match.group("delim") + # Toggle if same delimiter starts/ends + if not in_fence: + in_fence = True + fence_delim = full continue - if in_blockquote: - if line.strip() == "": - in_blockquote = False - else: + else: + # Only close if same delimiter length & char + if fence_delim == full: + in_fence = False + fence_delim = None continue - # Skip lines that are reference definitions themselves - if REF_DEF_RE.match(line): + if in_fence: + continue # skip link detection inside code fences + # Blockquote handling: if line starts with '>' treat entire following wrapped paragraph as quoted until blank line + if re.match(r"^\s*>", line): + in_blockquote = True + continue + if in_blockquote: + if line.strip() == "": + in_blockquote = False + else: continue + # Skip lines that are reference definitions themselves (single-line or multi-line forms) + if REF_DEF_RE.match(line): + continue + if REF_DEF_LABEL_ONLY_RE.match(line): + # Multi-line ref def: skip this line and the indented URL on the next line + if i < len(raw_lines) and re.match(r"^\s+\S", raw_lines[i]): + i += 1 + continue - # Find all backtick regions to exclude from link detection - # Build a set of character positions that are inside backticks - backtick_positions = set() - in_code = False - for i, char in enumerate(line): - if char == "`": - in_code = not in_code - elif in_code: - backtick_positions.add(i) + # Join continuation lines for multi-line inline links. + # CommonMark allows link text (and destination) to span lines, e.g.: + # [Sync Source + # Selection](https://...) + # Consume following lines until the bracket is closed or a blank line is hit. + scan_line = line + while _has_dangling_open_bracket(scan_line) and i < len(raw_lines): + next_raw = raw_lines[i].rstrip("\n") + if not next_raw.strip(): + break # blank line ends a paragraph; inline links can't span it + scan_line = scan_line + " " + next_raw + i += 1 - # Helper function to check if the opening bracket of a link is inside backticks - # We only check the start position because if the [ is in code, the whole link should be skipped - def is_in_code_span(match_start): - return match_start in backtick_positions + # Find all backtick regions to exclude from link detection + # Build a set of character positions that are inside backticks + backtick_positions = set() + in_code = False + for j, char in enumerate(scan_line): + if char == "`": + in_code = not in_code + elif in_code: + backtick_positions.add(j) - # Track character ranges of all matched links to avoid double-processing - matched_ranges = [] + # Helper function to check if the opening bracket of a link is inside backticks + # We only check the start position because if the [ is in code, the whole link should be skipped + def is_in_code_span(match_start): + return match_start in backtick_positions - def overlaps_matched_range(start, end): - """Check if a position range overlaps with any previously matched range.""" - for m_start, m_end in matched_ranges: - # Check for any overlap - if start < m_end and end > m_start: - return True - return False + # Track character ranges of all matched links to avoid double-processing + matched_ranges = [] - # Inline links [text](url) - for m in LINK_RE.finditer(line): - if is_in_code_span(m.start()): - continue # Skip links inside backticks - text, target = m.group(1), m.group(2).strip() - links.append((idx, text, target)) - matched_ranges.append((m.start(), m.end())) + def overlaps_matched_range(start, end): + """Check if a position range overlaps with any previously matched range.""" + for m_start, m_end in matched_ranges: + # Check for any overlap + if start < m_end and end > m_start: + return True + return False - # Reference-style links [text][label] or [text][] - for m in REF_USE_RE.finditer(line): - if is_in_code_span(m.start()): - continue # Skip links inside backticks - full_match = m.group(0) - text = m.group(1).strip() - # Extract label from [text][label] - if empty brackets [], use text as label - label_part = full_match[len(text) + 2 :] # skip [text] - if label_part == "[]": - label = text # implicit reference: [text][] uses "text" as label - else: - # Explicit label: [text][label] - label = label_part.strip("[]").strip() + # Inline links [text](url) + for m in LINK_RE.finditer(scan_line): + if is_in_code_span(m.start()): + continue # Skip links inside backticks + text, target = m.group(1), m.group(2).strip() + links.append((idx, text, target)) + matched_ranges.append((m.start(), m.end())) + + # Reference-style links [text][label] or [text][] + for m in REF_USE_RE.finditer(scan_line): + if is_in_code_span(m.start()): + continue # Skip links inside backticks + full_match = m.group(0) + text = m.group(1).strip() + # Extract label from [text][label] - if empty brackets [], use text as label + label_part = full_match[len(text) + 2 :] # skip [text] + if label_part == "[]": + label = text # implicit reference: [text][] uses "text" as label + else: + # Explicit label: [text][label] + label = label_part.strip("[]").strip() + # Use special marker to indicate this is a reference link + links.append((idx, text, f"__REF__{label}")) + matched_ranges.append((m.start(), m.end())) + + # Shortcut reference links [text] - single bracket that references a definition + # Only match if not already matched by inline or reference-style patterns + # Pattern: single bracket pair not preceded by [ and not followed by ( or [ + for m in re.finditer(r"(? Optio return LinkIssue(current_file, line, text, target, "anchor not found in this file") return None + # Strip CommonMark angle-bracket URL quoting: -> https://... + if target.startswith("<") and target.endswith(">"): + target = target[1:-1] + # Split fragment if present file_part, frag_part = target.split("#", 1) if "#" in target else (target, None) diff --git a/src/mongo/db/exec/sbe/README.md b/src/mongo/db/exec/sbe/README.md index 8f4013233ae..9c0eaed3ea2 100644 --- a/src/mongo/db/exec/sbe/README.md +++ b/src/mongo/db/exec/sbe/README.md @@ -461,7 +461,7 @@ for more details. ## Runtime Planners See [Classic Runtime Planners for SBE -README](https://github.com/mongodb/mongo/blob/master/src/mongo/db/exec/runtime_planners/classic_runtime_planner_for_sbe/README.md). +README](/src/mongo/db/exec/runtime_planners/classic_runtime_planner_for_sbe/README.md). ## Incomplete Sections Below (TODO) diff --git a/src/mongo/db/index_builds/README.md b/src/mongo/db/index_builds/README.md index 954b0c6028e..5cc841bb710 100644 --- a/src/mongo/db/index_builds/README.md +++ b/src/mongo/db/index_builds/README.md @@ -270,7 +270,7 @@ using the startup recovery logic that RTT uses to bring the node back to a writa For improved rollback semantics, resumable index builds require a majority read cursor during collection scan phase. Index builds wait for the majority commit point to advance before starting the collection scan. The majority wait happens after installing the [side table for intercepting new -writes](#internal-side-table-for-new-writes). +writes](#internal-table-for-side-writes). See [MultiIndexBlock::\_constructStateObject()](https://github.com/mongodb/mongo/blob/0d45dd9d7ba9d3a1557217a998ad31c68a897d47/src/mongo/db/catalog/multi_index_block.cpp#L900) diff --git a/src/mongo/db/repl/README.md b/src/mongo/db/repl/README.md index c6202165c59..0269d3731e1 100644 --- a/src/mongo/db/repl/README.md +++ b/src/mongo/db/repl/README.md @@ -2025,7 +2025,7 @@ candidate if `(vc, tc) >= (v, t)`. For a description of the complete voting beha ### Formal Specification For more details on the safe reconfig protocol and its behaviors, refer to the [TLA+ -specification](https://github.com/mongodb/mongo/tree/master/src/mongo/tla_plus/MongoReplReconfig) +specification](/src/mongo/tla_plus/Replication/MongoReplReconfig) or the [paper on MongoDB's reconfig protocol](https://arxiv.org/abs/2102.11960), written in part by replication engineers. It defines two main invariants of the protocol, ElectionSafety and NeverRollbackCommitted, which assert, respectively, that no two leaders are elected in the same term and that majority diff --git a/src/mongo/db/shard_role/lock_manager/README.md b/src/mongo/db/shard_role/lock_manager/README.md index 798a9d2c0f5..870976a5e5c 100644 --- a/src/mongo/db/shard_role/lock_manager/README.md +++ b/src/mongo/db/shard_role/lock_manager/README.md @@ -106,7 +106,7 @@ ResourceType, as locking at this level is done in the storage engine itself for ### Document Level Concurrency Control Each storage engine is responsible for locking at the document level. The [WiredTiger storage -engine](../storage/wiredtiger/README.md) uses MVCC [multi-version concurrency control][Multiversion concurrency control] along with optimistic locking in order to provide concurrency guarantees. +engine](../../storage/wiredtiger/README.md) uses MVCC [multi-version concurrency control][Multiversion concurrency control] along with optimistic locking in order to provide concurrency guarantees. ## Two-Phase Locking diff --git a/src/mongo/db/shard_role/shard_catalog/README.md b/src/mongo/db/shard_role/shard_catalog/README.md index 356c6e20db0..03ebd0f9544 100644 --- a/src/mongo/db/shard_role/shard_catalog/README.md +++ b/src/mongo/db/shard_role/shard_catalog/README.md @@ -758,7 +758,7 @@ increasing cluster key values. Because unlike regular capped collections, clustered collections do not need to preserve insertion order, they allow non-serialized concurrent writes. In order to avoid missing documents while tailing a clustered collection, the user is required to enforce visibility rules similar to the ['no -holes' point](../storage/README.md#oplog-visibility). Majority read concern is similarly required. +holes' point](../../storage/README.md#oplog-visibility). Majority read concern is similarly required. ## Clustered RecordIds