docs/check-internal-links.py at master · waterlinked/docs · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python3
import re
import sys
from pathlib import Path

# Regex for inline Markdown links: [text](target).
# Group 1 is the link text (one or more characters other than "]"); group 2 is
# the target (one or more characters other than ")"). Nothing in the pattern
# excludes a leading "!", so the bracket part of image syntax ![alt](src)
# matches as well.
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")

def slugify_heading(text: str) -> str:
    """Convert Markdown heading text to a MkDocs-style anchor slug.

    Applies the same steps as the default Python-Markdown ``toc`` slugify,
    except that non-ASCII word characters are kept as they are:

    1. drop every character that is not a word character, whitespace or "-";
    2. trim surrounding whitespace and lowercase;
    3. collapse each run of whitespace and/or hyphens into a single "-".

    Args:
        text: Heading text without the leading ``#`` markers.

    Returns:
        The anchor slug, e.g. ``"Foo - Bar?"`` -> ``"foo-bar"``.
    """
    # Punctuation is removed *before* trimming so that a heading such as
    # "Why ?" does not end up with a trailing dash.
    slug = re.sub(r"[^\w\s-]", "", text)
    slug = slug.strip().lower()
    # "a - b" must become "a-b", not "a---b": hyphens and whitespace are
    # collapsed together.
    return re.sub(r"[-\s]+", "-", slug)


def extract_headings(md_file: Path):
    """Return the set of anchor slugs produced by the headings of a Markdown file.

    A heading is any line that starts with "#" and is not inside a fenced code
    block (``` or ~~~). Each heading is slugified with ``slugify_heading``.
    When the same slug occurs more than once, the later occurrences are stored
    as ``slug_1``, ``slug_2``, ... following the Python-Markdown ``toc``
    convention for duplicate ids.

    Args:
        md_file: Path of the Markdown file to scan.

    Returns:
        A set of slug strings; empty when ``md_file`` is not an existing file.
    """
    headings = set()
    if not md_file.is_file():
        return headings

    # Marker ("```" or "~~~") of the fenced code block currently open, if any.
    open_fence = None
    for line in md_file.read_text(encoding="utf-8").splitlines():
        marker = line.lstrip()[:3]
        if open_fence is not None:
            # Inside a code block nothing is a heading (e.g. "# shell comment");
            # only look for the matching closing fence.
            if marker == open_fence:
                open_fence = None
            continue
        if marker in ("```", "~~~"):
            open_fence = marker
            continue
        if not line.startswith("#"):
            continue

        # Remove the opening hashes and any closing hashes ("## Title ##").
        heading_text = line.lstrip("#").strip().rstrip("#").strip()
        slug = slugify_heading(heading_text)

        # Repeated headings get a numeric suffix so each one stays linkable.
        unique_slug = slug
        duplicate_index = 0
        while unique_slug in headings:
            duplicate_index += 1
            unique_slug = f"{slug}_{duplicate_index}"
        headings.add(unique_slug)
    return headings

# Matches a URI scheme prefix such as "https:", "mailto:" or "tel:".
_URL_SCHEME_RE = re.compile(r"[A-Za-z][A-Za-z0-9+.-]*:")


def _link_destination(raw_target: str) -> str:
    """Return the destination of a link target, without title or angle brackets."""
    raw_target = raw_target.strip()
    if raw_target.startswith("<") and ">" in raw_target:
        # Form: <destination with spaces> "optional title"
        return raw_target[1:raw_target.index(">")]
    # Form: destination "optional title"
    pieces = raw_target.split(maxsplit=1)
    return pieces[0] if pieces else ""


def _resolve_link_file(md_file: Path, file_part: str, from_site_root: bool):
    """Return the existing path that a link's file part refers to, or None."""
    link_path = Path(file_part)
    here = md_file.parent

    # Relative links are resolved against the linking file's directory. The
    # docs root is not known here, so as a heuristic a site-root link ("/x/y")
    # is accepted when it resolves against any ancestor directory of the file.
    bases = [here]
    if from_site_root:
        bases.extend(md_file.resolve().parent.parents)

    candidates = []
    for base in bases:
        candidates.append(base / link_path)
        if link_path.name:
            # Extension-less page link ("sub/page" or "sub/page/") -> sub/page.md
            candidates.append(base / link_path.parent / (link_path.name + ".md"))
    if link_path.name:
        # Original fallback, kept for compatibility: a page with that name
        # located next to the linking file, ignoring the link's directories.
        candidates.append(here / (link_path.name + ".md"))

    for candidate in candidates:
        resolved = candidate.resolve()
        if resolved.exists():
            return resolved
    return None


def check_md_file(md_file: Path):
    """Check the internal links of one Markdown file.

    Every ``[text](target)`` match of ``LINK_RE`` is examined, except on lines
    that start with "#" or that are a single-line HTML comment. Links with a
    URI scheme (``http:``, ``mailto:``, ...) or starting with "//" are treated
    as external and ignored. For the remaining links the target file must
    exist, and when the link carries an anchor and points at a ``.md`` file,
    the anchor must match one of that file's heading slugs.

    Args:
        md_file: Path of the Markdown file to scan.

    Returns:
        A ``(errors, count)`` tuple. ``errors`` is a list of
        ``"<file>:<line>: <problem> -> <target>"`` strings and ``count`` is the
        number of internal links that were examined. ``([], 0)`` is returned
        when ``md_file`` is not an existing file.
    """
    errors = []
    count = 0
    if not md_file.is_file():
        # Same shape as the normal return so callers can always unpack it.
        return errors, count

    lines = md_file.read_text(encoding="utf-8").splitlines()
    for line_number, line in enumerate(lines, start=1):
        stripped = line.strip()
        # Skip lines starting with "#" (Markdown headings, or comment lines in
        # code samples) and single-line HTML comments.
        if stripped.startswith("#"):
            continue
        if stripped.startswith("<!--") and stripped.endswith("-->"):
            continue

        for match in LINK_RE.finditer(line):
            target = _link_destination(match.group(2))

            # Skip external links.
            if target.startswith("//") or _URL_SCHEME_RE.match(target):
                continue

            # Remove the leading slash of site-root relative links.
            from_site_root = target.startswith("/")
            if from_site_root:
                target = target[1:]

            count += 1

            # Split the anchor from the file part; both may be empty.
            file_part, _, anchor = target.partition("#")

            if file_part:
                target_file = _resolve_link_file(md_file, file_part, from_site_root)
                if target_file is None:
                    errors.append(f"{md_file}:{line_number}: File not found -> {target}")
                    continue
            else:
                # Pure in-page link ("#section"): the target is this very file.
                target_file = md_file

            # Anchors are only verifiable for Markdown sources; reading e.g. a
            # PDF as UTF-8 text to look for headings would fail.
            if anchor and target_file.suffix.lower() == ".md" and target_file.is_file():
                if anchor not in extract_headings(target_file):
                    errors.append(f"{md_file}:{line_number}: Anchor not found -> {target}")

    return errors, count


def main(md_dir):
    """Check all Markdown files below *md_dir* and print a report.

    Every ``*.md`` file found recursively under the directory is passed to
    ``check_md_file``; the reported errors and link counts are accumulated.

    Returns:
        0 when at least one internal link was checked and no error was
        reported, 1 when no internal link was found at all or when any error
        was reported.
    """
    root = Path(md_dir)
    problems = []
    links_checked = 0

    for page in root.rglob("*.md"):
        page_problems, page_links = check_md_file(page)
        problems += page_problems
        links_checked += page_links

    print(f"Checked {links_checked} internal links in {root}.")

    # A run that sees no internal link at all is reported as a failure.
    if not links_checked:
        print("Expected internal links. None were found.")
        return 1

    if not problems:
        print("No internal link errors found.")
        return 0

    print(f'Found {len(problems)} internal link errors in md files:')
    print(*problems, sep="\n")
    return 1

if __name__ == "__main__":
    # A missing argument gets a usage message instead of an IndexError
    # traceback; sys.exit() with a string prints it to stderr and exits with 1.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <markdown-directory>")
    # sys.exit() is used rather than the interactive helper exit(), which is
    # injected by the site module and is absent when it is disabled.
    sys.exit(main(sys.argv[1]))