SERVER-124192: Make lint_markdown_links.py more robust (#51892)

GitOrigin-RevId: cc7170db4cbabbb75c3cb980e9c3619e6821011c
This commit is contained in:
Steve McClure 2026-04-17 10:24:10 -04:00 committed by MongoDB Bot
parent c58ea3fee0
commit 04c22e9714
6 changed files with 170 additions and 108 deletions

View File

@ -124,14 +124,31 @@ from typing import Iterable, Optional
HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$")
HTML_ANCHOR_RE = re.compile(r'<a\s+(?:name|id)=["\']([^"\']+)["\']\s*>\s*</a>?', re.IGNORECASE)
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
# Matches [text](url) where url is either an angle-bracket form <url> (CommonMark §6.3,
# allows parentheses inside the URL) or a plain URL (stops at the first unescaped ')').
LINK_RE = re.compile(r"\[([^\]]+)\]\((<[^>]*>|[^)]+)\)")
# Inline link references: [text]: url
REF_DEF_RE = re.compile(r"^\s*\[([^\]]+)\]:\s+(\S+)")
# Multi-line ref def: label on one line, URL indented on the next (valid CommonMark/Prettier output)
REF_DEF_LABEL_ONLY_RE = re.compile(r"^\s*\[([^\]]+)\]:\s*$")
# Reference-style links: [text][label] or [text][] but NOT [[double brackets]]
# Negative lookbehind (?<!\[) ensures first [ is not preceded by [
# Negative lookahead (?!\[) ensures first [ is not followed by another [
REF_USE_RE = re.compile(r"(?<!\[)\[([^\]]+)\](?!\])\[(?:(?:[^\]]+))?\]")
def _has_dangling_open_bracket(text: str) -> bool:
"""Return True when *text* contains a '[' that opens a potential inline link not yet closed.
Used to decide whether to join the next line when scanning for multi-line inline links.
"""
t = re.sub(r"`[^`]*`", "", text) # strip inline code spans
t = re.sub(r"\[[^\]]*\]\([^)]*\)", "", t) # strip complete [text](url) links
t = re.sub(r"\[[^\]]*\]\[[^\]]*\]", "", t) # strip complete [text][label] links
t = re.sub(r"\[[^\]]*\]", "", t) # strip any remaining closed [...] (shortcut refs)
return "[" in t
# Characters removed for anchor IDs (GitHub rules simplified). We strip most punctuation except hyphen and underscore.
PUNCT_TO_STRIP = "\"'!#$%&()*+,./:;<=>?@[]^`{|}~" # punctuation characters to remove
ANCHOR_CACHE: dict[str, set[str]] = {}
@ -251,18 +268,37 @@ def collect_headings(path: str) -> set[str]:
def collect_reference_definitions(path: str) -> dict[str, str]:
"""Parse all reference-style link definitions [label]: url from a markdown file."""
"""Parse all reference-style link definitions [label]: url from a markdown file.
Handles both single-line and Prettier-style multi-line forms:
[label]: https://... (single-line)
[label]: (multi-line CommonMark allows URL on next line)
https://...
"""
if path in REFERENCE_CACHE:
return REFERENCE_CACHE[path]
references: dict[str, str] = {}
try:
with open(path, "r", encoding="utf-8") as f:
for line in f:
m = REF_DEF_RE.match(line)
if m:
label = m.group(1).strip().lower() # case-insensitive matching
target = m.group(2).strip()
references[label] = target
lines = f.readlines()
i = 0
while i < len(lines):
line = lines[i]
m = REF_DEF_RE.match(line)
if m:
label = m.group(1).strip().lower() # case-insensitive matching
target = m.group(2).strip()
references[label] = target
else:
m2 = REF_DEF_LABEL_ONLY_RE.match(line)
if m2 and i + 1 < len(lines):
next_line = lines[i + 1]
m3 = re.match(r"^\s+(\S+)", next_line)
if m3:
label = m2.group(1).strip().lower()
references[label] = m3.group(1).strip()
i += 1 # consume the URL line
i += 1
except Exception:
pass
REFERENCE_CACHE[path] = references
@ -286,111 +322,133 @@ def parse_links(file_path: str) -> list[tuple[int, str, str]]:
links: list[tuple[int, str, str]] = []
try:
with open(file_path, "r", encoding="utf-8") as f:
in_fence = False
in_blockquote = False
fence_delim = None # track ``` or ~~~
for idx, raw_line in enumerate(f, start=1):
line = raw_line.rstrip("\n")
# Detect start/end of fenced code blocks. Accept ``` or ~~~ with optional language and leading whitespace.
fence_match = re.match(r"^\s*(?P<delim>`{3,}|~{3,})(.*)$", line)
if fence_match:
full = fence_match.group("delim")
# Toggle if same delimiter starts/ends
if not in_fence:
in_fence = True
fence_delim = full
continue
else:
# Only close if same delimiter length & char
if fence_delim == full:
in_fence = False
fence_delim = None
continue
if in_fence:
continue # skip link detection inside code fences
# Blockquote handling: if line starts with '>' treat entire following wrapped paragraph as quoted until blank line
if re.match(r"^\s*>", line):
in_blockquote = True
raw_lines = f.readlines()
in_fence = False
in_blockquote = False
fence_delim = None # track ``` or ~~~
i = 0
while i < len(raw_lines):
idx = i + 1 # 1-based line number
line = raw_lines[i].rstrip("\n")
i += 1
# Detect start/end of fenced code blocks. Accept ``` or ~~~ with optional language and leading whitespace.
fence_match = re.match(r"^\s*(?P<delim>`{3,}|~{3,})(.*)$", line)
if fence_match:
full = fence_match.group("delim")
# Toggle if same delimiter starts/ends
if not in_fence:
in_fence = True
fence_delim = full
continue
if in_blockquote:
if line.strip() == "":
in_blockquote = False
else:
else:
# Only close if same delimiter length & char
if fence_delim == full:
in_fence = False
fence_delim = None
continue
# Skip lines that are reference definitions themselves
if REF_DEF_RE.match(line):
if in_fence:
continue # skip link detection inside code fences
# Blockquote handling: if line starts with '>' treat entire following wrapped paragraph as quoted until blank line
if re.match(r"^\s*>", line):
in_blockquote = True
continue
if in_blockquote:
if line.strip() == "":
in_blockquote = False
else:
continue
# Skip lines that are reference definitions themselves (single-line or multi-line forms)
if REF_DEF_RE.match(line):
continue
if REF_DEF_LABEL_ONLY_RE.match(line):
# Multi-line ref def: skip this line and the indented URL on the next line
if i < len(raw_lines) and re.match(r"^\s+\S", raw_lines[i]):
i += 1
continue
# Find all backtick regions to exclude from link detection
# Build a set of character positions that are inside backticks
backtick_positions = set()
in_code = False
for i, char in enumerate(line):
if char == "`":
in_code = not in_code
elif in_code:
backtick_positions.add(i)
# Join continuation lines for multi-line inline links.
# CommonMark allows link text (and destination) to span lines, e.g.:
# [Sync Source
# Selection](https://...)
# Consume following lines until the bracket is closed or a blank line is hit.
scan_line = line
while _has_dangling_open_bracket(scan_line) and i < len(raw_lines):
next_raw = raw_lines[i].rstrip("\n")
if not next_raw.strip():
break # blank line ends a paragraph; inline links can't span it
scan_line = scan_line + " " + next_raw
i += 1
# Helper function to check if the opening bracket of a link is inside backticks
# We only check the start position because if the [ is in code, the whole link should be skipped
def is_in_code_span(match_start):
return match_start in backtick_positions
# Find all backtick regions to exclude from link detection
# Build a set of character positions that are inside backticks
backtick_positions = set()
in_code = False
for j, char in enumerate(scan_line):
if char == "`":
in_code = not in_code
elif in_code:
backtick_positions.add(j)
# Track character ranges of all matched links to avoid double-processing
matched_ranges = []
# Helper function to check if the opening bracket of a link is inside backticks
# We only check the start position because if the [ is in code, the whole link should be skipped
def is_in_code_span(match_start):
return match_start in backtick_positions
def overlaps_matched_range(start, end):
"""Check if a position range overlaps with any previously matched range."""
for m_start, m_end in matched_ranges:
# Check for any overlap
if start < m_end and end > m_start:
return True
return False
# Track character ranges of all matched links to avoid double-processing
matched_ranges = []
# Inline links [text](url)
for m in LINK_RE.finditer(line):
if is_in_code_span(m.start()):
continue # Skip links inside backticks
text, target = m.group(1), m.group(2).strip()
links.append((idx, text, target))
matched_ranges.append((m.start(), m.end()))
def overlaps_matched_range(start, end):
"""Check if a position range overlaps with any previously matched range."""
for m_start, m_end in matched_ranges:
# Check for any overlap
if start < m_end and end > m_start:
return True
return False
# Reference-style links [text][label] or [text][]
for m in REF_USE_RE.finditer(line):
if is_in_code_span(m.start()):
continue # Skip links inside backticks
full_match = m.group(0)
text = m.group(1).strip()
# Extract label from [text][label] - if empty brackets [], use text as label
label_part = full_match[len(text) + 2 :] # skip [text]
if label_part == "[]":
label = text # implicit reference: [text][] uses "text" as label
else:
# Explicit label: [text][label]
label = label_part.strip("[]").strip()
# Inline links [text](url)
for m in LINK_RE.finditer(scan_line):
if is_in_code_span(m.start()):
continue # Skip links inside backticks
text, target = m.group(1), m.group(2).strip()
links.append((idx, text, target))
matched_ranges.append((m.start(), m.end()))
# Reference-style links [text][label] or [text][]
for m in REF_USE_RE.finditer(scan_line):
if is_in_code_span(m.start()):
continue # Skip links inside backticks
full_match = m.group(0)
text = m.group(1).strip()
# Extract label from [text][label] - if empty brackets [], use text as label
label_part = full_match[len(text) + 2 :] # skip [text]
if label_part == "[]":
label = text # implicit reference: [text][] uses "text" as label
else:
# Explicit label: [text][label]
label = label_part.strip("[]").strip()
# Use special marker to indicate this is a reference link
links.append((idx, text, f"__REF__{label}"))
matched_ranges.append((m.start(), m.end()))
# Shortcut reference links [text] - single bracket that references a definition
# Only match if not already matched by inline or reference-style patterns
# Pattern: single bracket pair not preceded by [ and not followed by ( or [
for m in re.finditer(r"(?<!\[)\[([^\]]+)\](?![(\[])", scan_line):
if is_in_code_span(m.start()):
continue # Skip links inside backticks
# Skip if overlaps with already matched ranges
if overlaps_matched_range(m.start(), m.end()):
continue
# Skip if this is part of a double bracket pattern [[...]]
if m.end() < len(scan_line) and scan_line[m.end()] == "]":
continue
text = m.group(1).strip()
# Only treat as reference link if it could plausibly be one
# (contains text, not just punctuation or numbers)
if text and not text.isdigit():
# Use special marker to indicate this is a reference link
links.append((idx, text, f"__REF__{label}"))
matched_ranges.append((m.start(), m.end()))
# Shortcut reference links [text] - single bracket that references a definition
# Only match if not already matched by inline or reference-style patterns
# Pattern: single bracket pair not preceded by [ and not followed by ( or [
for m in re.finditer(r"(?<!\[)\[([^\]]+)\](?![(\[])", line):
if is_in_code_span(m.start()):
continue # Skip links inside backticks
# Skip if overlaps with already matched ranges
if overlaps_matched_range(m.start(), m.end()):
continue
# Skip if this is part of a double bracket pattern [[...]]
if m.end() < len(line) and line[m.end()] == "]":
continue
text = m.group(1).strip()
# Only treat as reference link if it could plausibly be one
# (contains text, not just punctuation or numbers)
if text and not text.isdigit():
# Use special marker to indicate this is a reference link
# For shortcut references, the label is the text itself
links.append((idx, text, f"__REF__{text}"))
# For shortcut references, the label is the text itself
links.append((idx, text, f"__REF__{text}"))
except Exception:
pass
return links
@ -467,6 +525,10 @@ def validate_link(current_file: str, line: int, text: str, target: str) -> Optio
return LinkIssue(current_file, line, text, target, "anchor not found in this file")
return None
# Strip CommonMark angle-bracket URL quoting: <https://...> -> https://...
if target.startswith("<") and target.endswith(">"):
target = target[1:-1]
# Split fragment if present
file_part, frag_part = target.split("#", 1) if "#" in target else (target, None)

View File

@ -461,7 +461,7 @@ for more details.
## Runtime Planners
See [Classic Runtime Planners for SBE
README](https://github.com/mongodb/mongo/blob/master/src/mongo/db/exec/runtime_planners/classic_runtime_planner_for_sbe/README.md).
README](/src/mongo/db/exec/runtime_planners/classic_runtime_planner_for_sbe/README.md).
## Incomplete Sections Below (TODO)

View File

@ -270,7 +270,7 @@ using the startup recovery logic that RTT uses to bring the node back to a writa
For improved rollback semantics, resumable index builds require a majority read cursor during
collection scan phase. Index builds wait for the majority commit point to advance before starting
the collection scan. The majority wait happens after installing the [side table for intercepting new
writes](#internal-side-table-for-new-writes).
writes](#internal-table-for-side-writes).
See
[MultiIndexBlock::\_constructStateObject()](https://github.com/mongodb/mongo/blob/0d45dd9d7ba9d3a1557217a998ad31c68a897d47/src/mongo/db/catalog/multi_index_block.cpp#L900)

View File

@ -2025,7 +2025,7 @@ candidate if `(vc, tc) >= (v, t)`. For a description of the complete voting beha
### Formal Specification
For more details on the safe reconfig protocol and its behaviors, refer to the [TLA+
specification](https://github.com/mongodb/mongo/tree/master/src/mongo/tla_plus/MongoReplReconfig)
specification](/src/mongo/tla_plus/Replication/MongoReplReconfig)
or the [paper on MongoDB's reconfig protocol](https://arxiv.org/abs/2102.11960), written in part by replication engineers.
It defines two main invariants of the protocol, ElectionSafety and NeverRollbackCommitted,
which assert, respectively, that no two leaders are elected in the same term and that majority

View File

@ -106,7 +106,7 @@ ResourceType, as locking at this level is done in the storage engine itself for
### Document Level Concurrency Control
Each storage engine is responsible for locking at the document level. The [WiredTiger storage
engine](../storage/wiredtiger/README.md) uses MVCC [multi-version concurrency control][Multiversion concurrency control] along with optimistic locking in order to provide concurrency guarantees.
engine](../../storage/wiredtiger/README.md) uses MVCC [multi-version concurrency control][Multiversion concurrency control] along with optimistic locking in order to provide concurrency guarantees.
## Two-Phase Locking

View File

@ -758,7 +758,7 @@ increasing cluster key values.
Because unlike regular capped collections, clustered collections do not need to preserve insertion
order, they allow non-serialized concurrent writes. In order to avoid missing documents while
tailing a clustered collection, the user is required to enforce visibility rules similar to the ['no
holes' point](../storage/README.md#oplog-visibility). Majority read concern is similarly required.
holes' point](../../storage/README.md#oplog-visibility). Majority read concern is similarly required.
## Clustered RecordIds