-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathcheck-internal-links.py
More file actions
110 lines (88 loc) · 3.52 KB
/
check-internal-links.py
File metadata and controls
110 lines (88 loc) · 3.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
import re
import sys
from pathlib import Path
# Regex for inline Markdown links of the form [text](target).
# Group 1 captures the link text (one or more characters other than "]") and
# group 2 captures the target (one or more characters other than ")").
# Because the pattern is not anchored, it also matches the "[alt](src)" part
# of an image link "![alt](src)", and a target is cut at its first ")".
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
def slugify_heading(text: str) -> str:
    """Convert Markdown heading text to MkDocs anchor format.

    Follows the order of operations of Python-Markdown's default ``toc``
    slugify: drop punctuation, trim, lowercase, then collapse every run of
    whitespace and/or hyphens into a single hyphen.

    NOTE: non-ASCII word characters are kept as-is (``\\w`` matches them);
    no ASCII folding is performed here.

    Args:
        text: Heading text with the leading ``#`` markers already removed.

    Returns:
        The anchor slug, e.g. ``"Foo - Bar!"`` becomes ``"foo-bar"``.
    """
    # Remove punctuation *before* trimming, so whitespace left behind by a
    # removed character ("What now ?") cannot turn into a trailing "-".
    slug = re.sub(r"[^\w\s-]", "", text).strip().lower()
    # Collapse mixed runs of spaces and dashes: "a - b" -> "a-b", not "a---b".
    return re.sub(r"[-\s]+", "-", slug)
def extract_headings(md_file: Path):
    """Return the set of anchor slugs for the headings of a Markdown file.

    A heading is a line starting with ``#`` that is not inside a fenced code
    block (``` or ~~~), so shell/Python comments in code samples are not
    mistaken for headings. Optional closing ``#`` characters are removed
    before slugifying. When the same slug occurs more than once, the later
    occurrences are also registered with the ``_1``, ``_2``, ... suffixes
    that Python-Markdown's ``toc`` extension assigns to duplicate ids.

    Args:
        md_file: Path of the Markdown file to scan.

    Returns:
        A set of anchor slugs; empty when ``md_file`` is not a regular file.
    """
    headings = set()
    if not md_file.is_file():
        return headings

    fences = ("```", "~~~")
    open_fence = None  # marker of the fenced code block we are currently in
    for line in md_file.read_text(encoding="utf-8").splitlines():
        stripped = line.strip()
        marker = stripped[:3]
        if marker in fences:
            if open_fence is None:
                open_fence = marker
                continue
            # Only a bare fence of the same kind closes the block; a fence
            # carrying an info string is just content of the open block.
            if marker == open_fence and not stripped.strip(marker[0]).strip():
                open_fence = None
                continue
        if open_fence is not None:
            continue

        if line.startswith("#"):
            # Drop the leading markers and any closing "#" sequence.
            heading_text = line.lstrip("#").strip().rstrip("#").strip()
            slug = slugify_heading(heading_text)
            # Repeated headings get a numeric suffix instead of colliding.
            candidate = slug
            suffix = 0
            while candidate in headings:
                suffix += 1
                candidate = f"{slug}_{suffix}"
            headings.add(candidate)
    return headings
def check_md_file(md_file: Path):
    """Check the internal links of a single Markdown file.

    Every ``[text](target)`` match on a line is inspected unless the line
    starts with ``#`` or is a single-line HTML comment. Links starting with
    ``http://``, ``https://`` or ``mailto:`` are ignored. For the remaining
    links the target file must exist, and if the link carries a
    ``#anchor`` and points at a file, the anchor must be among that file's
    heading slugs.

    NOTE(review): a leading "/" is simply stripped and the rest is resolved
    relative to ``md_file``'s directory, not to a docs root -- confirm this
    is the intended handling of site-root links.

    Args:
        md_file: Path of the Markdown file to check.

    Returns:
        A ``(errors, count)`` tuple: the list of error messages and the
        number of internal links inspected. ``([], 0)`` when ``md_file`` is
        not a regular file.
    """
    errors = []
    count = 0
    if not md_file.is_file():
        # Same tuple shape as the normal path so callers can always unpack.
        return errors, count

    heading_cache = {}  # resolved target file -> its heading slugs
    lines = md_file.read_text(encoding="utf-8").splitlines()
    for line_number, line in enumerate(lines, start=1):
        stripped = line.strip()
        # Skip lines starting with "#" and single-line HTML comments.
        if stripped.startswith("#"):
            continue
        if stripped.startswith("<!--") and stripped.endswith("-->"):
            continue

        for match in LINK_RE.finditer(line):
            target = match.group(2)
            # Skip external links.
            if target.startswith(("http://", "https://", "mailto:")):
                continue
            # Remove leading slash for site-root relative links.
            if target.startswith("/"):
                target = target[1:]
            count += 1

            if target.startswith("#"):
                # Same-page anchor: the target is this very file. Joining
                # md_file onto its own parent would double the directory
                # part whenever md_file is a relative path.
                anchor = target[1:]
                target_file = md_file.resolve()
            else:
                file_part, _, anchor = target.partition("#")
                target_path = Path(file_part)
                target_file = (md_file.parent / target_path).resolve()
                if not target_file.exists() and not target_file.suffix:
                    # Extensionless link: try the matching ".md" file,
                    # keeping any sub-directory part of the link.
                    md_name = target_path.name + ".md"
                    target_file = (md_file.parent / target_path.parent / md_name).resolve()

            if not target_file.exists():
                errors.append(f"{md_file}:{line_number}: File not found -> {target}")
                continue

            # Anchors are only verifiable against files, not directories.
            if anchor and target_file.is_file():
                if target_file not in heading_cache:
                    heading_cache[target_file] = extract_headings(target_file)
                if anchor not in heading_cache[target_file]:
                    errors.append(f"{md_file}:{line_number}: Anchor not found -> {target}")
    return errors, count
def main(md_dir):
    """Check every ``*.md`` file below ``md_dir`` and print a report.

    Args:
        md_dir: Directory (string or path-like) searched recursively.

    Returns:
        ``0`` when at least one internal link was checked and no errors were
        found; ``1`` when no internal links were found at all or when any
        link error was reported.
    """
    md_dir = Path(md_dir)
    all_errors = []
    total_count = 0
    # sorted() keeps the report order stable across platforms and runs.
    for md_file in sorted(md_dir.rglob("*.md")):
        # rglob matches by name only, so a directory called "x.md" can show
        # up here; there is nothing to check in it.
        if not md_file.is_file():
            continue
        errors, count = check_md_file(md_file)
        all_errors.extend(errors)
        total_count += count

    print(f"Checked {total_count} internal links in {md_dir}.")
    if total_count == 0:
        # Zero links most likely means the wrong directory was given, so
        # this is reported as a failure rather than a silent pass.
        print("Expected internal links. None were found.")
        return 1
    if all_errors:
        print(f'Found {len(all_errors)} internal link errors in md files:')
        for e in all_errors:
            print(e)
        return 1
    print("No internal link errors found.")
    return 0
if __name__ == "__main__":
    # Fail with a usage hint instead of a bare IndexError traceback when the
    # directory argument is missing.
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} MARKDOWN_DIR", file=sys.stderr)
        sys.exit(2)
    # sys.exit is always available; the bare exit() helper is injected by
    # the site module and is meant for interactive use.
    sys.exit(main(sys.argv[1]))