diff --git a/code/400_find_terms_without_example.py b/code/400_find_terms_without_example.py
new file mode 100755
index 000000000..c1b4fa039
--- /dev/null
+++ b/code/400_find_terms_without_example.py
@@ -0,0 +1,492 @@
+#!/usr/bin/env python3
+
+# SPDX-FileContributor: Arthit Suriyawongkul
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: W3C-20150513
+
+"""
+Find defined terms without an example and terms used in examples but not defined.
+"""
+
+import argparse
+import os
+import re
+from collections import Counter, defaultdict
+from pathlib import Path
+from typing import Iterator
+
+from rdflib import RDF, RDFS, SKOS, Graph
+
+# Assuming consistent prefixes in vocabulary files
+SKIP_PREFIXES = {
+ "ex", # Example namespaces
+ "exA",
+ "exB",
+ "exC",
+ "exD",
+ "exE",
+ "exF",
+ "exG",
+ "_", # Blank nodes
+ "bibo",
+ "dcat",
+ "dct",
+ "foaf",
+ "odrl",
+ "org",
+ "owl",
+ "profile",
+ "rdf",
+ "rdfs",
+ "role",
+ "schema",
+ "scoro",
+ "skos",
+ "sw",
+ "time",
+ "vann",
+ "xsd",
+}
+
+DEFAULT_VOCAB_DIR = "../2.2"
+DEFAULT_EXAMPLES_DIR = "../examples"
+
+
+def parse_args() -> argparse.Namespace:
+ """Parse command line arguments"""
+ parser = argparse.ArgumentParser(description="Find terms without an example")
+ parser.add_argument(
+ "-d",
+ "--vocab-dir",
+ default=DEFAULT_VOCAB_DIR,
+ help="Directory containing vocabulary TTL files",
+ )
+ parser.add_argument(
+ "-e",
+ "--examples-dir",
+ default=DEFAULT_EXAMPLES_DIR,
+ help="Directory containing example TTL files",
+ )
+ parser.add_argument("-v", "--verbose", action="store_true", help="Print everything")
+ parser.add_argument(
+ "-l",
+ "--list-unused-terms",
+ action="store_true",
+ help="Print terms without examples",
+ )
+ parser.add_argument(
+ "-t",
+ "--top-unused-parents",
+ action="store_true",
+ help="Print top parents of terms without examples",
+ )
+ parser.add_argument(
+ "-c",
+ "--count-used-terms",
+ action="store_true",
+ help="Print term counts and example counts per namespace",
+ )
+ parser.add_argument(
+ "-u",
+ "--list-undefined-terms",
+ action="store_true",
+ help="Print terms used in examples but not defined in vocabulary files",
+ )
+ parser.add_argument(
+ "-x",
+ "--list-undefined-html-terms",
+ action="store_true",
+ help="Print terms used in HTMLs but not defined in vocabulary files",
+ )
+ return parser.parse_args()
+
+
+def get_ttl_files(root: str) -> Iterator[str]:
+ """Yield .ttl files, excluding files ending with -owl.ttl"""
+ base = Path(root)
+ if not base.exists():
+ return
+ for p in base.rglob("*.ttl"):
+ name = p.name.lower()
+ if not name.endswith("-owl.ttl"):
+ yield str(p)
+
+
+def get_html_files(root: str) -> Iterator[str]:
+ """Yield .html files, excluding files ending with -en.html"""
+ base = Path(root)
+ if not base.exists():
+ return
+ for p in base.rglob("*.html"):
+ name = p.name.lower()
+ if not name.endswith("-en.html"):
+ yield str(p)
+
+
+def collect_terms_in_vocabs(
+ files: list[str],
+) -> tuple[set[str], set[str], dict[str, set[str]], Counter[str], Counter[str]]:
+ """Collect terms defined in vocabulary files"""
+ classes: set[str] = set()
+ properties: set[str] = set()
+ parents: dict[str, set[str]] = {}
+ classes_ns_count: Counter[str] = Counter()
+ properties_ns_count: Counter[str] = Counter()
+
+ for f in files:
+ g = Graph()
+ g.parse(f, format="turtle")
+ for s in g.subjects(RDF.type, RDFS.Class):
+ name = g.qname(str(s))
+ ns, sep, _ = name.partition(":")
+ if ns in SKIP_PREFIXES:
+ continue
+ if name not in classes:
+ classes.add(name)
+ if sep:
+ classes_ns_count[ns] += 1
+ for o in g.objects(s, RDFS.subClassOf):
+ parents.setdefault(name, set()).add(g.qname(str(o)))
+ for o in g.objects(s, SKOS.broader):
+ parents.setdefault(name, set()).add(g.qname(str(o)))
+ for s in g.subjects(RDF.type, RDF.Property):
+ name = g.qname(str(s))
+ ns, sep, _ = name.partition(":")
+ if ns in SKIP_PREFIXES:
+ continue
+ if name not in properties:
+ properties.add(name)
+ if sep:
+ properties_ns_count[ns] += 1
+ for o in g.objects(s, RDFS.subPropertyOf):
+ parents.setdefault(name, set()).add(g.qname(str(o)))
+ for o in g.objects(s, SKOS.broader):
+ parents.setdefault(name, set()).add(g.qname(str(o)))
+
+ return classes, properties, parents, classes_ns_count, properties_ns_count
+
+
+def collect_terms_in_examples(files: list[str]) -> dict[str, set[str]]:
+ """
+ Collect terms used in example files
+
+ Since TTLs in examples directory does not have namespaces defined,
+ we cannot use RDFLib to parse them.
+ Instead, we use regex to find terms.
+ """
+ used: dict[str, set[str]] = defaultdict(set)
+ pattern = re.compile(r"(? dict[str, dict[str, set[int]]]:
+ """Collect terms mentioned in description section of HTML files"""
+ # { term: { filepath: set of line numbers } } }
+ used: dict[str, dict[str, set[int]]] = defaultdict(lambda: defaultdict(set))
+
+ # respecConfig.shortName
+ global_prefix_pat = re.compile(
+ r"respecConfig\s*=\s*{[\s\S]*?shortName\s*[:=]\s*['\"]([^'\"]+)['\"]",
+ re.IGNORECASE,
+ )
+ # [=term=]
+ term_bracket_pat = re.compile(r"\[=\s*?([a-zA-Z0-9_\-]+?)\s*?=\]")
+ # term -- too many false positives but can be useful if you have time to comb through
+ # term_code_pat = re.compile(r"\s*?([a-zA-Z0-9_\-]+?)\s*?")
+ # prefix:term
+ term_code_prefix_pat = re.compile(
+ r"\s*?([a-zA-Z0-9_\-]+?:[a-zA-Z0-9_\-]+?)\s*?"
+ )
+ # `prefix:term`
+ term_backtick_prefix_pat = re.compile(r"`([a-zA-Z0-9_\-]+?:[a-zA-Z0-9_\-]+?)`")
+
+ for f in files:
+ try:
+ with open(f, encoding="utf-8", errors="ignore") as fh:
+ html = fh.read()
+ except OSError:
+ continue
+
+ match = global_prefix_pat.search(html)
+ prefix = match.group(1).lower() if match else ""
+
+ for n, line in enumerate(html.splitlines(), start=1):
+ for m in term_bracket_pat.finditer(line):
+ term = m.group(1)
+ term = f"{prefix}:{term}" if prefix else term
+ used[term][f].add(n)
+
+ # for m in term_code_pat.finditer(line):
+ # term = m.group(1)
+ # term = f"{prefix}:{term}" if prefix else term
+ # used[term][f+"**"].add(n)
+
+ for m in term_code_prefix_pat.finditer(line):
+ term = m.group(1) # full term (prefix:term)
+ used[term][f].add(n)
+
+ for m in term_backtick_prefix_pat.finditer(line):
+ term = m.group(1) # full term (prefix:term)
+ used[term][f].add(n)
+
+ return used
+
+
+def is_used_or_parent_used(
+ term: str, used_keys: set[str], parents: dict[str, set[str]]
+) -> bool:
+ """Check if a term is used or any of its parents is used."""
+ if term in used_keys:
+ return True
+ return bool(parents.get(term, set()) & used_keys)
+
+
+def count_parents(terms: set[str], parents: dict[str, set[str]]) -> Counter[str]:
+ """Count parents of terms"""
+ counter: Counter[str] = Counter()
+ for term in terms:
+ for parent in parents.get(term, set()):
+ counter[parent] += 1
+ return counter
+
+
+def print_terms_without_examples(
+ terms: set[str], parents: dict[str, set[str]], label: str = "Terms"
+) -> None:
+ """Print terms without examples with their parent info."""
+ print(f"\n{label} without examples ({len(terms)})")
+ print("-------------------------------")
+
+ if not terms:
+ print("Not found.")
+ return
+
+ for term in sorted(terms):
+ parent = parents.get(term)
+ if parent:
+ print(f"{term:<40} ⊂ {', '.join(parent)}")
+ else:
+ print(term)
+
+
+def print_undefined_example_terms(
+ used: dict[str, set[str]],
+ classes: set[str],
+ properties: set[str],
+ base_dir: str,
+) -> None:
+ """
+ Print terms that appear in example files but are not defined in the vocabulary files.
+ """
+ defined = classes | properties
+ undefined = sorted(
+ term
+ for term in used.keys()
+ if ":" in term
+ and term.split(":", 1)[0] not in SKIP_PREFIXES
+ and term not in defined
+ )
+
+ print(
+ f"\nTerms used in examples but NOT defined in vocabulary files ({len(undefined)})"
+ )
+ print("----------------------------------------------------------")
+
+ if not undefined:
+ print("Not found.")
+ return
+
+ for term in undefined:
+ files = used.get(term, ())
+ found_in = [
+ (os.path.relpath(f, start=base_dir) if base_dir else f)
+ for f in sorted(files)
+ ]
+ print(f"{term:<40} in: {', '.join(found_in)}")
+
+
+def print_undefined_html_terms(
+ used: dict[str, dict[str, set[int]]],
+ classes: set[str],
+ properties: set[str],
+ base_dir: str,
+) -> None:
+ """Print terms referenced in HTML but not defined in vocabulary files.
+
+ html_used: term -> { filepath -> set(line_numbers) }
+ """
+ undefined = {
+ term: filename_linenum
+ for term, filename_linenum in used.items()
+ if term not in classes
+ and term not in properties
+ and term.split(":", 1)[0] not in SKIP_PREFIXES
+ }
+
+ print(
+ f"\nTerms referenced in HTML but NOT defined in vocabulary files ({len(undefined)})"
+ )
+ print("------------------------------------------------------------")
+ if not undefined:
+ print("Not found.")
+ return
+
+ for term, filename_linenum in sorted(undefined.items()):
+ found_in: list[str] = []
+ for path in sorted(filename_linenum):
+ rel = os.path.relpath(path, start=base_dir)
+ lines = sorted(filename_linenum[path])
+ found_in.append(f"{rel}:{','.join(map(str, lines))}")
+ print(f"{term:<40} in: {', '.join(found_in)}")
+
+
+def print_summary_terms_with_examples(
+ classes_ns_count: Counter[str],
+ properties_ns_count: Counter[str],
+ used_classes_ns_count: Counter[str],
+ used_properties_ns_count: Counter[str],
+ print_complete_ns: bool = False,
+) -> None:
+ """Print per-namespace summary of classes and properties with examples.
+
+ If print_complete_ns is False,
+ do not print namespaces where all classes and properties have examples.
+ """
+ print()
+ print("Namespace Class w/ Examples Prop. w/ Examples")
+ print("-------------------------- ----------------- -----------------")
+ for ns in sorted(classes_ns_count.keys() | properties_ns_count.keys()):
+ if ns in SKIP_PREFIXES:
+ continue
+ c_used = used_classes_ns_count.get(ns, 0)
+ c_total = classes_ns_count.get(ns, 0)
+ p_used = used_properties_ns_count.get(ns, 0)
+ p_total = properties_ns_count.get(ns, 0)
+ if not print_complete_ns and c_used == c_total and p_used == p_total:
+ continue
+ print(f"{ns:<26} {c_used:>7} / {c_total:<7} {p_used:>7} / {p_total:<7}")
+
+
+def print_top_unused_parents(
+ unused: set[str],
+ parents: dict[str, set[str]],
+ exclude_prefixes: set[str],
+ top_n: int = 10,
+ label: str = "Terms",
+) -> None:
+ """
+ Print top parent terms among unused terms
+
+ exclude_prefixes: prefixes to filter out
+ top_n: number of top parents to show
+ """
+ filtered_terms = {
+ c
+ for c in unused
+ if not any(c.startswith(f"{prefix}:") for prefix in exclude_prefixes)
+ }
+ top_parents = count_parents(filtered_terms, parents).most_common(top_n)
+
+ print(f"\nTop parents among {label} without examples")
+ if exclude_prefixes:
+ print(f"(excluding child with prefixes: {', '.join(sorted(exclude_prefixes))})")
+ print("-------------------------------------------------------------")
+ for parent, count in top_parents:
+ print(f"{count:>7} {parent}")
+
+
+def main() -> None:
+ """Main function"""
+ args = parse_args()
+ vocab_dir = args.vocab_dir
+ examples_dir = args.examples_dir
+
+ vocab_files = list(get_ttl_files(vocab_dir))
+ ex_files = list(get_ttl_files(examples_dir))
+ if args.verbose:
+ print(f"Vocabulary directory: {vocab_dir}")
+ print(f"Example directory: {examples_dir}")
+ print(f"Vocabulary TTL files found: {len(vocab_files)}")
+ print(f"Example TTL files found: {len(ex_files)}")
+
+ classes, properties, parents, classes_ns_count, properties_ns_count = (
+ collect_terms_in_vocabs(vocab_files)
+ )
+
+ used = collect_terms_in_examples(ex_files)
+ used_keys = set(used) # cache keys for faster lookup
+
+ used_classes: set[str] = set()
+ used_properties: set[str] = set()
+ used_classes_ns_count: Counter[str] = Counter()
+ used_properties_ns_count: Counter[str] = Counter()
+
+ for c in classes:
+ if is_used_or_parent_used(c, used_keys, parents):
+ used_classes.add(c)
+ ns, sep, _ = c.partition(":")
+ if sep:
+ used_classes_ns_count[ns] += 1
+
+ for p in properties:
+ if is_used_or_parent_used(p, used_keys, parents):
+ used_properties.add(p)
+ ns, sep, _ = p.partition(":")
+ if sep:
+ used_properties_ns_count[ns] += 1
+
+ print(f"Classes (inc. parents) with examples: {len(used_classes)} / {len(classes)}")
+ print(
+ f"Properties (inc. parents) with examples: {len(used_properties)} / {len(properties)}"
+ )
+
+ unused_classes = classes - used_classes
+ unused_properties = properties - used_properties
+
+ if args.list_unused_terms or args.verbose:
+ print_terms_without_examples(unused_classes, parents, "Classes")
+ print_terms_without_examples(unused_properties, parents, "Properties")
+
+ print_summary_terms_with_examples(
+ classes_ns_count,
+ properties_ns_count,
+ used_classes_ns_count,
+ used_properties_ns_count,
+ print_complete_ns=args.verbose,
+ )
+
+ if args.top_unused_parents or args.verbose:
+ print_top_unused_parents(
+ unused_classes,
+ parents,
+ exclude_prefixes={"loc"}, # There are many unused loc: terms
+ top_n=10,
+ label="classes",
+ )
+ print_top_unused_parents(
+ unused_properties,
+ parents,
+ exclude_prefixes=set(),
+ top_n=10,
+ label="properties",
+ )
+
+ if args.list_undefined_terms or args.verbose:
+ print_undefined_example_terms(used, classes, properties, examples_dir)
+
+ if args.list_undefined_html_terms or args.verbose:
+ html_files = list(get_html_files(vocab_dir))
+ html_used = collect_terms_in_htmls(html_files)
+ print_undefined_html_terms(html_used, classes, properties, vocab_dir)
+
+
+if __name__ == "__main__":
+ main()