From 9e1c66d725ed353892bd10eebf32cd24d8094620 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 14 Aug 2025 08:31:53 +0100 Subject: [PATCH 01/14] Create 295_find_terms_without_example.py Find terms without an example in examples directory Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 174 +++++++++++++++++++++++++ 1 file changed, 174 insertions(+) create mode 100755 code/295_find_terms_without_example.py diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py new file mode 100755 index 000000000..692ba2500 --- /dev/null +++ b/code/295_find_terms_without_example.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 + +# SPDX-FileContributor: Arthit Suriyawongkul +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: W3C-20150513 + +""" +Find terms without an example +""" + +import os +import re +import sys +from collections import Counter + +from rdflib import RDF, RDFS, SKOS, Graph + +SKIP_PREFIXES = { + "ex", + "_", + "bibo", + "dcat", + "dct", + "foaf", + "org", + "owl", + "profile", + "rdf", + "rdfs", + "role", + "schema", + "scoro", + "skos", + "sw", + "time", + "vann", + "xsd", +} + + +vocab_dir = "../2.2" +examples_dir = "../examples" + +verbose = "--verbose" in sys.argv or "-v" in sys.argv + + +def get_ttl_files(root: str): + for dirpath, _, filenames in os.walk(root): + for f in filenames: + if f.endswith(".ttl"): + yield os.path.join(dirpath, f) + + +def collect_terms(files: list[str]) -> tuple[set[str], set[str], dict[str, set[str]]]: + classes: set[str] = set() + properties: set[str] = set() + parents: dict[str, set[str]] = {} + for f in files: + g = Graph() + g.parse(f, format="turtle") + for s in g.subjects(RDF.type, RDFS.Class): + name = g.qname(s) + if name.split(":")[0] in SKIP_PREFIXES: + continue + classes.add(name) + for o in g.objects(s, RDFS.subClassOf): + parents.setdefault(name, set()).add(g.qname(o)) + for o in g.objects(s, SKOS.broader): + parents.setdefault(name, set()).add(g.qname(o)) + for s in g.subjects(RDF.type, RDF.Property): + name = g.qname(s) + if name.split(":")[0] in SKIP_PREFIXES: + continue + properties.add(name) + for o in g.objects(s, RDFS.subPropertyOf): + parents.setdefault(name, set()).add(g.qname(o)) + for o in g.objects(s, SKOS.broader): + parents.setdefault(name, set()).add(g.qname(o)) + + return classes, properties, parents + + +def collect_used_terms(files: list[str]): + used: set[str] = set() + pattern = re.compile(r"\b([a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+)\b") + for f in files: + with open(f, encoding="utf-8") as fh: + for line in fh: + for match in pattern.findall(line): + prefix = match.split(":")[0] + if prefix not in SKIP_PREFIXES: + used.add(match) + return used + + +def is_used_or_parent_used( + term: str, used_terms: set[str], parents: dict[str, set[str]] +): + if term in used_terms: + return True + + for parent in parents.get(term, set()): + if parent in used_terms: + return True + + return False + + +def count_per_namespace(terms: set[str]) -> dict[str, int]: + ns_counter: dict[str, int] = Counter() + for item in terms: + if ":" in item: + ns = item.split(":")[0] + ns_counter[ns] += 1 + return ns_counter + + +vocab_files = list(get_ttl_files(vocab_dir)) +ex_files = list(get_ttl_files(examples_dir)) +if verbose: + print(f"Vocabulary directory: {vocab_dir}") + print(f"Example directory: {examples_dir}") + print(f"Vocabulary TTL files found: {len(vocab_files)}") + print(f"Example TTL files found: {len(ex_files)}") + +classes, properties, parents = collect_terms(vocab_files) +if verbose: + print(f"Classes defined in vocabulary files: {len(classes)}") + print(f"Properties defined in vocabulary files: {len(properties)}") + +used = collect_used_terms(ex_files) +used_classes = {c for c in classes if is_used_or_parent_used(c, used, parents)} +used_properties = {p for p in properties if is_used_or_parent_used(p, used, parents)} +if verbose: + print(f"Classes with examples: {len(used_classes)} / {len(classes)}") + print(f"Properties with examples: {len(used_properties)} / {len(properties)}") + +# Print all terms without examples +if verbose: + unused_classes = sorted(classes - set(used_classes)) + unused_properties = sorted(properties - set(used_properties)) + print("\n==== Classes without examples ====\n") + for c in unused_classes: + parent = parents.get(c) + if parent: + print(f"{c} (broader/subClassOf {', '.join(parent)})") + else: + print(c) + print("\n==== Properties without examples ====\n") + for p in unused_properties: + parent = parents.get(p) + if parent: + print(f"{p} (broader/subPropertyOf {', '.join(parent)})") + else: + print(p) + +all_classes_ns = count_per_namespace(classes) +all_properties_ns = count_per_namespace(properties) +used_classes_ns = count_per_namespace(used_classes) +used_properties_ns = count_per_namespace(used_properties) + +print() +print("Namespace Class w/ Examples Prop. w/ Examples") +print("---------------------- ----------------- -----------------") +for ns in sorted(set(all_classes_ns) | set(all_properties_ns)): + if ns in SKIP_PREFIXES: + continue + c_used = used_classes_ns.get(ns, 0) + c_total = all_classes_ns.get(ns, 0) + p_used = used_properties_ns.get(ns, 0) + p_total = all_properties_ns.get(ns, 0) + if not verbose and c_used == c_total and p_used == p_total: + continue + print(f"{ns:<22} {c_used:>7} / {c_total:<7} {p_used:>7} / {p_total:<7}") From 2131f3ccb9dbac9160f16dfbab34338db2d51edc Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 14 Aug 2025 11:37:55 +0100 Subject: [PATCH 02/14] Add comments Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 136 ++++++++++++++----------- 1 file changed, 77 insertions(+), 59 deletions(-) diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py index 692ba2500..e633cac3e 100755 --- a/code/295_find_terms_without_example.py +++ b/code/295_find_terms_without_example.py @@ -15,9 +15,10 @@ from rdflib import RDF, RDFS, SKOS, Graph +# Assuming consistent prefixes in vocabulary files SKIP_PREFIXES = { - "ex", - "_", + "ex", # Example namespace + "_", # Blank nodes "bibo", "dcat", "dct", @@ -52,6 +53,7 @@ def get_ttl_files(root: str): def collect_terms(files: list[str]) -> tuple[set[str], set[str], dict[str, set[str]]]: + """Collect terms defined in vocabulary files""" classes: set[str] = set() properties: set[str] = set() parents: dict[str, set[str]] = {} @@ -81,6 +83,13 @@ def collect_terms(files: list[str]) -> tuple[set[str], set[str], dict[str, set[s def collect_used_terms(files: list[str]): + """ + Collect terms used in TTL files + + Since TTLs in examples directory is not in full Turtle format, + we cannot use RDFLib to parse them. + Instead, we use regex to find terms. + """ used: set[str] = set() pattern = re.compile(r"\b([a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+)\b") for f in files: @@ -96,6 +105,7 @@ def collect_used_terms(files: list[str]): def is_used_or_parent_used( term: str, used_terms: set[str], parents: dict[str, set[str]] ): + """Check if a term is used or any of its parents is used.""" if term in used_terms: return True @@ -107,6 +117,7 @@ def is_used_or_parent_used( def count_per_namespace(terms: set[str]) -> dict[str, int]: + """Count terms per namespace""" ns_counter: dict[str, int] = Counter() for item in terms: if ":" in item: @@ -115,60 +126,67 @@ def count_per_namespace(terms: set[str]) -> dict[str, int]: return ns_counter -vocab_files = list(get_ttl_files(vocab_dir)) -ex_files = list(get_ttl_files(examples_dir)) -if verbose: - print(f"Vocabulary directory: {vocab_dir}") - print(f"Example directory: {examples_dir}") - print(f"Vocabulary TTL files found: {len(vocab_files)}") - print(f"Example TTL files found: {len(ex_files)}") - -classes, properties, parents = collect_terms(vocab_files) -if verbose: - print(f"Classes defined in vocabulary files: {len(classes)}") - print(f"Properties defined in vocabulary files: {len(properties)}") - -used = collect_used_terms(ex_files) -used_classes = {c for c in classes if is_used_or_parent_used(c, used, parents)} -used_properties = {p for p in properties if is_used_or_parent_used(p, used, parents)} -if verbose: - print(f"Classes with examples: {len(used_classes)} / {len(classes)}") - print(f"Properties with examples: {len(used_properties)} / {len(properties)}") - -# Print all terms without examples -if verbose: - unused_classes = sorted(classes - set(used_classes)) - unused_properties = sorted(properties - set(used_properties)) - print("\n==== Classes without examples ====\n") - for c in unused_classes: - parent = parents.get(c) - if parent: - print(f"{c} (broader/subClassOf {', '.join(parent)})") - else: - print(c) - print("\n==== Properties without examples ====\n") - for p in unused_properties: - parent = parents.get(p) - if parent: - print(f"{p} (broader/subPropertyOf {', '.join(parent)})") - else: - print(p) - -all_classes_ns = count_per_namespace(classes) -all_properties_ns = count_per_namespace(properties) -used_classes_ns = count_per_namespace(used_classes) -used_properties_ns = count_per_namespace(used_properties) - -print() -print("Namespace Class w/ Examples Prop. w/ Examples") -print("---------------------- ----------------- -----------------") -for ns in sorted(set(all_classes_ns) | set(all_properties_ns)): - if ns in SKIP_PREFIXES: - continue - c_used = used_classes_ns.get(ns, 0) - c_total = all_classes_ns.get(ns, 0) - p_used = used_properties_ns.get(ns, 0) - p_total = all_properties_ns.get(ns, 0) - if not verbose and c_used == c_total and p_used == p_total: - continue - print(f"{ns:<22} {c_used:>7} / {c_total:<7} {p_used:>7} / {p_total:<7}") +def main(): + vocab_files = list(get_ttl_files(vocab_dir)) + ex_files = list(get_ttl_files(examples_dir)) + if verbose: + print(f"Vocabulary directory: {vocab_dir}") + print(f"Example directory: {examples_dir}") + print(f"Vocabulary TTL files found: {len(vocab_files)}") + print(f"Example TTL files found: {len(ex_files)}") + + classes, properties, parents = collect_terms(vocab_files) + if verbose: + print(f"Classes defined in vocabulary files: {len(classes)}") + print(f"Properties defined in vocabulary files: {len(properties)}") + + used = collect_used_terms(ex_files) + used_classes = {c for c in classes if is_used_or_parent_used(c, used, parents)} + used_properties = { + p for p in properties if is_used_or_parent_used(p, used, parents) + } + if verbose: + print(f"Classes with examples: {len(used_classes)} / {len(classes)}") + print(f"Properties with examples: {len(used_properties)} / {len(properties)}") + + # Print all terms without examples + if verbose: + unused_classes = sorted(classes - set(used_classes)) + unused_properties = sorted(properties - set(used_properties)) + print("\n==== Classes without examples ====\n") + for c in unused_classes: + parent = parents.get(c) + if parent: + print(f"{c} (broader/subClassOf {', '.join(parent)})") + else: + print(c) + print("\n==== Properties without examples ====\n") + for p in unused_properties: + parent = parents.get(p) + if parent: + print(f"{p} (broader/subPropertyOf {', '.join(parent)})") + else: + print(p) + + all_classes_ns = count_per_namespace(classes) + all_properties_ns = count_per_namespace(properties) + used_classes_ns = count_per_namespace(used_classes) + used_properties_ns = count_per_namespace(used_properties) + + print() + print("Namespace Class w/ Examples Prop. w/ Examples") + print("---------------------- ----------------- -----------------") + for ns in sorted(set(all_classes_ns) | set(all_properties_ns)): + if ns in SKIP_PREFIXES: + continue + c_used = used_classes_ns.get(ns, 0) + c_total = all_classes_ns.get(ns, 0) + p_used = used_properties_ns.get(ns, 0) + p_total = all_properties_ns.get(ns, 0) + if not verbose and c_used == c_total and p_used == p_total: + continue + print(f"{ns:<22} {c_used:>7} / {c_total:<7} {p_used:>7} / {p_total:<7}") + + +if __name__ == "__main__": + main() From 11a9903b538463fedbfc9036ef2990640201daed Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 14 Aug 2025 22:36:23 +0100 Subject: [PATCH 03/14] Allow to set vocab_dir and examples_dir from command line Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 40 ++++++++++++++++++++------ 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py index e633cac3e..f6f05adb4 100755 --- a/code/295_find_terms_without_example.py +++ b/code/295_find_terms_without_example.py @@ -8,6 +8,7 @@ Find terms without an example """ +import argparse import os import re import sys @@ -38,14 +39,31 @@ "xsd", } - -vocab_dir = "../2.2" -examples_dir = "../examples" - -verbose = "--verbose" in sys.argv or "-v" in sys.argv +DEFAULT_VOCAB_DIR = "../2.2" +DEFAULT_EXAMPLES_DIR = "../examples" + + +def parse_args() -> argparse.Namespace: + """Parse command line arguments""" + parser = argparse.ArgumentParser(description="Find terms without an example") + parser.add_argument( + "--vocab-dir", + default=DEFAULT_VOCAB_DIR, + help="Directory containing vocabulary TTL files", + ) + parser.add_argument( + "--examples-dir", + default=DEFAULT_EXAMPLES_DIR, + help="Directory containing example TTL files", + ) + parser.add_argument( + "-v", "--verbose", action="store_true", help="Enable verbose output" + ) + return parser.parse_args() def get_ttl_files(root: str): + """Yield all TTL files in the given directory and its subdirectories""" for dirpath, _, filenames in os.walk(root): for f in filenames: if f.endswith(".ttl"): @@ -82,7 +100,7 @@ def collect_terms(files: list[str]) -> tuple[set[str], set[str], dict[str, set[s return classes, properties, parents -def collect_used_terms(files: list[str]): +def collect_used_terms(files: list[str]) -> set[str]: """ Collect terms used in TTL files @@ -104,7 +122,7 @@ def collect_used_terms(files: list[str]): def is_used_or_parent_used( term: str, used_terms: set[str], parents: dict[str, set[str]] -): +) -> bool: """Check if a term is used or any of its parents is used.""" if term in used_terms: return True @@ -126,7 +144,13 @@ def count_per_namespace(terms: set[str]) -> dict[str, int]: return ns_counter -def main(): +def main() -> None: + """Main function""" + args = parse_args() + vocab_dir = args.vocab_dir + examples_dir = args.examples_dir + verbose = args.verbose + vocab_files = list(get_ttl_files(vocab_dir)) ex_files = list(get_ttl_files(examples_dir)) if verbose: From 483be9e9d57b01a77997ec32ac4b4ed8e3108788 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 15 Aug 2025 09:23:57 +0100 Subject: [PATCH 04/14] Print top parents Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 126 ++++++++++++++++--------- 1 file changed, 82 insertions(+), 44 deletions(-) diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py index f6f05adb4..f2a0f134a 100755 --- a/code/295_find_terms_without_example.py +++ b/code/295_find_terms_without_example.py @@ -11,8 +11,8 @@ import argparse import os import re -import sys from collections import Counter +from typing import Iterator from rdflib import RDF, RDFS, SKOS, Graph @@ -62,7 +62,7 @@ def parse_args() -> argparse.Namespace: return parser.parse_args() -def get_ttl_files(root: str): +def get_ttl_files(root: str) -> Iterator[str]: """Yield all TTL files in the given directory and its subdirectories""" for dirpath, _, filenames in os.walk(root): for f in filenames: @@ -104,7 +104,7 @@ def collect_used_terms(files: list[str]) -> set[str]: """ Collect terms used in TTL files - Since TTLs in examples directory is not in full Turtle format, + Since TTLs in examples directory does not have namespaces defined, we cannot use RDFLib to parse them. Instead, we use regex to find terms. """ @@ -134,9 +134,9 @@ def is_used_or_parent_used( return False -def count_per_namespace(terms: set[str]) -> dict[str, int]: +def count_per_namespace(terms: set[str]) -> Counter[str]: """Count terms per namespace""" - ns_counter: dict[str, int] = Counter() + ns_counter: Counter[str] = Counter() for item in terms: if ":" in item: ns = item.split(":")[0] @@ -144,6 +144,60 @@ def count_per_namespace(terms: set[str]) -> dict[str, int]: return ns_counter +def count_parents(terms: set[str], parents: dict[str, set[str]]) -> Counter[str]: + """Count parents of terms""" + counter: Counter[str] = Counter() + for term in terms: + for parent in parents.get(term, []): + counter[parent] += 1 + return counter + + +def print_terms_without_examples( + terms: set[str], parents: dict[str, set[str]], label: str = "Terms" +) -> None: + """Print terms without examples with their parent info.""" + print(f"\n==== {label} without examples ====\n") + for term in sorted(terms): + parent = parents.get(term) + if parent: + print(f"{term:<40} ⊂ {', '.join(parent)}") + else: + print(term) + + +def print_summary( + classes: set[str], + properties: set[str], + used_classes: set[str], + used_properties: set[str], + print_complete_ns: bool = False, +) -> None: + """Print per-namespace summary of classes and properties with examples. + + If print_complete_ns is False, + do not print namespaces where all classes and properties have examples. + """ + classes_ns_count = count_per_namespace(classes) + properties_ns_count = count_per_namespace(properties) + used_classes_ns_count = count_per_namespace(used_classes) + used_properties_ns_count = count_per_namespace(used_properties) + + print() + print("Namespace Class w/ Examples Prop. w/ Examples") + print("-------------------------- ----------------- -----------------") + for ns in sorted(set(classes_ns_count) | set(properties_ns_count)): + if ns in SKIP_PREFIXES: + continue + c_used = used_classes_ns_count.get(ns, 0) + c_total = classes_ns_count.get(ns, 0) + p_used = used_properties_ns_count.get(ns, 0) + p_total = properties_ns_count.get(ns, 0) + if not print_complete_ns and c_used == c_total and p_used == p_total: + continue + print(f"{ns:<26} {c_used:>7} / {c_total:<7} {p_used:>7} / {p_total:<7}") + + def main() -> None: """Main function""" args = parse_args() @@ -161,56 +215,40 @@ def main() -> None: classes, properties, parents = collect_terms(vocab_files) if verbose: - print(f"Classes defined in vocabulary files: {len(classes)}") - print(f"Properties defined in vocabulary files: {len(properties)}") + print(f"Classes defined: {len(classes)}") + print(f"Properties defined: {len(properties)}") used = collect_used_terms(ex_files) used_classes = {c for c in classes if is_used_or_parent_used(c, used, parents)} used_properties = { p for p in properties if is_used_or_parent_used(p, used, parents) } + unused_classes = classes - used_classes + unused_properties = properties - used_properties if verbose: print(f"Classes with examples: {len(used_classes)} / {len(classes)}") print(f"Properties with examples: {len(used_properties)} / {len(properties)}") + print_terms_without_examples(unused_classes, parents, "Classes") + print_terms_without_examples(unused_properties, parents, "Properties") + + print_summary( + classes, + properties, + used_classes, + used_properties, + print_complete_ns=verbose, + ) - # Print all terms without examples - if verbose: - unused_classes = sorted(classes - set(used_classes)) - unused_properties = sorted(properties - set(used_properties)) - print("\n==== Classes without examples ====\n") - for c in unused_classes: - parent = parents.get(c) - if parent: - print(f"{c} (broader/subClassOf {', '.join(parent)})") - else: - print(c) - print("\n==== Properties without examples ====\n") - for p in unused_properties: - parent = parents.get(p) - if parent: - print(f"{p} (broader/subPropertyOf {', '.join(parent)})") - else: - print(p) - - all_classes_ns = count_per_namespace(classes) - all_properties_ns = count_per_namespace(properties) - used_classes_ns = count_per_namespace(used_classes) - used_properties_ns = count_per_namespace(used_properties) - - print() - print("Namespace Class w/ Examples Prop. w/ Examples") - print("---------------------- ----------------- -----------------") - for ns in sorted(set(all_classes_ns) | set(all_properties_ns)): - if ns in SKIP_PREFIXES: - continue - c_used = used_classes_ns.get(ns, 0) - c_total = all_classes_ns.get(ns, 0) - p_used = used_properties_ns.get(ns, 0) - p_total = all_properties_ns.get(ns, 0) - if not verbose and c_used == c_total and p_used == p_total: - continue - print(f"{ns:<22} {c_used:>7} / {c_total:<7} {p_used:>7} / {p_total:<7}") + unused_classes_no_loc = {c for c in unused_classes if not c.startswith("loc:")} + top_classes_parents = count_parents(unused_classes_no_loc, parents).most_common(10) + top_properties_parents = count_parents(unused_properties, parents).most_common(10) + print("\nTop parents among classes without examples (excluding 'loc:'):") + for parent, count in top_classes_parents: + print(f"{count:>7} {parent}") + print("\nTop parents among properties without examples:") + for parent, count in top_properties_parents: + print(f"{count:>7} {parent}") if __name__ == "__main__": main() From e6f3743ed2d8485f0b9bb3fe98c9ff68ccbce697 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 19 Aug 2025 17:03:58 +0100 Subject: [PATCH 05/14] Print terms found in examples but not defined Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 48 ++++++++++++++++++++------ 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py index f2a0f134a..fc3eb8564 100755 --- a/code/295_find_terms_without_example.py +++ b/code/295_find_terms_without_example.py @@ -5,7 +5,7 @@ # SPDX-License-Identifier: W3C-20150513 """ -Find terms without an example +Find defined terms without an example and terms used in examples but not defined. """ import argparse @@ -70,7 +70,7 @@ def get_ttl_files(root: str) -> Iterator[str]: yield os.path.join(dirpath, f) -def collect_terms(files: list[str]) -> tuple[set[str], set[str], dict[str, set[str]]]: +def collect_terms_in_vocabs(files: list[str]) -> tuple[set[str], set[str], dict[str, set[str]]]: """Collect terms defined in vocabulary files""" classes: set[str] = set() properties: set[str] = set() @@ -100,9 +100,9 @@ def collect_terms(files: list[str]) -> tuple[set[str], set[str], dict[str, set[s return classes, properties, parents -def collect_used_terms(files: list[str]) -> set[str]: +def collect_terms_in_examples(files: list[str]) -> set[str]: """ - Collect terms used in TTL files + Collect terms used in example files Since TTLs in examples directory does not have namespaces defined, we cannot use RDFLib to parse them. @@ -110,6 +110,8 @@ def collect_used_terms(files: list[str]) -> set[str]: """ used: set[str] = set() pattern = re.compile(r"\b([a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+)\b") + # Match terms (prefix:term) not surrounded by quotes (since it can be literal) + # pattern = re.compile(r'(? None: + """ + Print terms that appear in example files but are not defined in the vocabulary files. + + used_terms: set of 'prefix:Term' found in examples + classes/properties: sets of 'prefix:Term' defined in vocab files + """ + defined = classes | properties + undefined = sorted( + t + for t in used_terms + if t not in defined and (":" in t and t.split(":")[0] not in SKIP_PREFIXES) + ) -def print_summary( + if not undefined: + print("\nNo undefined terms found in examples (all used terms are defined).") + return + + print("\n==== Terms used in examples but NOT defined in vocabulary files ====\n") + for t in undefined: + print(t) + + +def print_summary_terms_with_examples( classes: set[str], properties: set[str], used_classes: set[str], @@ -213,12 +239,12 @@ def main() -> None: print(f"Vocabulary TTL files found: {len(vocab_files)}") print(f"Example TTL files found: {len(ex_files)}") - classes, properties, parents = collect_terms(vocab_files) + classes, properties, parents = collect_terms_in_vocabs(vocab_files) if verbose: print(f"Classes defined: {len(classes)}") print(f"Properties defined: {len(properties)}") - used = collect_used_terms(ex_files) + used = collect_terms_in_examples(ex_files) used_classes = {c for c in classes if is_used_or_parent_used(c, used, parents)} used_properties = { p for p in properties if is_used_or_parent_used(p, used, parents) @@ -231,7 +257,7 @@ def main() -> None: print_terms_without_examples(unused_classes, parents, "Classes") print_terms_without_examples(unused_properties, parents, "Properties") - print_summary( + print_summary_terms_with_examples( classes, properties, used_classes, @@ -243,12 +269,14 @@ def main() -> None: top_classes_parents = count_parents(unused_classes_no_loc, parents).most_common(10) top_properties_parents = count_parents(unused_properties, parents).most_common(10) - print("\nTop parents among classes without examples (excluding 'loc:'):") + print("\n==== Top parents among classes without examples (excluding 'loc:')====") for parent, count in top_classes_parents: print(f"{count:>7} {parent}") - print("\nTop parents among properties without examples:") + print("\n==== Top parents among properties without examples ====") for parent, count in top_properties_parents: print(f"{count:>7} {parent}") + print_terms_used_but_undefined(used, classes, properties) + if __name__ == "__main__": main() From 06c85b671d3970c1047c044457c5ed53da5e9e44 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 19 Aug 2025 17:28:16 +0100 Subject: [PATCH 06/14] Print filename of undefined terms in examples Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 54 ++++++++++++++++---------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py index fc3eb8564..bae092228 100755 --- a/code/295_find_terms_without_example.py +++ b/code/295_find_terms_without_example.py @@ -11,7 +11,7 @@ import argparse import os import re -from collections import Counter +from collections import Counter, defaultdict from typing import Iterator from rdflib import RDF, RDFS, SKOS, Graph @@ -70,7 +70,9 @@ def get_ttl_files(root: str) -> Iterator[str]: yield os.path.join(dirpath, f) -def collect_terms_in_vocabs(files: list[str]) -> tuple[set[str], set[str], dict[str, set[str]]]: +def collect_terms_in_vocabs( + files: list[str], +) -> tuple[set[str], set[str], dict[str, set[str]]]: """Collect terms defined in vocabulary files""" classes: set[str] = set() properties: set[str] = set() @@ -100,7 +102,7 @@ def collect_terms_in_vocabs(files: list[str]) -> tuple[set[str], set[str], dict[ return classes, properties, parents -def collect_terms_in_examples(files: list[str]) -> set[str]: +def collect_terms_in_examples(files: list[str]) -> dict[str, set[str]]: """ Collect terms used in example files @@ -108,7 +110,7 @@ def collect_terms_in_examples(files: list[str]) -> set[str]: we cannot use RDFLib to parse them. Instead, we use regex to find terms. """ - used: set[str] = set() + used: dict[str, set[str]] = defaultdict(set) pattern = re.compile(r"\b([a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+)\b") # Match terms (prefix:term) not surrounded by quotes (since it can be literal) # pattern = re.compile(r'(? set[str]: with open(f, encoding="utf-8") as fh: for line in fh: for match in pattern.findall(line): - prefix = match.split(":")[0] + prefix = match.split(":", 1)[0] if prefix not in SKIP_PREFIXES: - used.add(match) + used[match].add(f) return used def is_used_or_parent_used( - term: str, used_terms: set[str], parents: dict[str, set[str]] + term: str, used_terms: dict[str, set[str]], parents: dict[str, set[str]] ) -> bool: """Check if a term is used or any of its parents is used.""" - if term in used_terms: + if term in used_terms.keys(): return True for parent in parents.get(term, set()): - if parent in used_terms: + if parent in used_terms.keys(): return True return False @@ -167,29 +169,40 @@ def print_terms_without_examples( else: print(term) + def print_terms_used_but_undefined( - used_terms: set[str], classes: set[str], properties: set[str] + used_terms: dict[str, set[str]], + classes: set[str], + properties: set[str], + examples_dir: str, ) -> None: """ Print terms that appear in example files but are not defined in the vocabulary files. - - used_terms: set of 'prefix:Term' found in examples - classes/properties: sets of 'prefix:Term' defined in vocab files """ defined = classes | properties undefined = sorted( - t - for t in used_terms - if t not in defined and (":" in t and t.split(":")[0] not in SKIP_PREFIXES) + term + for term in used_terms.keys() + if term not in defined + and (":" in term and term.split(":")[0] not in SKIP_PREFIXES) ) if not undefined: print("\nNo undefined terms found in examples (all used terms are defined).") return - print("\n==== Terms used in examples but NOT defined in vocabulary files ====\n") - for t in undefined: - print(t) + print("\n==== Terms used in examples but NOT defined in vocabulary files ====") + for term in undefined: + files = sorted(used_terms.get(term, [])) + rel_files: list[str] = [] + for f in files: + try: + rel = os.path.relpath(f, start=examples_dir) + except Exception: + rel = f + rel_files.append(rel) + files_str = ", ".join(rel_files) + print(f"{term:<40} in: {files_str}") def print_summary_terms_with_examples( @@ -276,7 +289,8 @@ def main() -> None: for parent, count in top_properties_parents: print(f"{count:>7} {parent}") - print_terms_used_but_undefined(used, classes, properties) + print_terms_used_but_undefined(used, classes, properties, examples_dir) + if __name__ == "__main__": main() From 2d88b5a98c10be75f8a4b46214626d45a61f237a Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 19 Aug 2025 17:34:42 +0100 Subject: [PATCH 07/14] Add odrl prefix to skip list Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 1 + 1 file changed, 1 insertion(+) diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py index bae092228..2ff17f7ba 100755 --- a/code/295_find_terms_without_example.py +++ b/code/295_find_terms_without_example.py @@ -24,6 +24,7 @@ "dcat", "dct", "foaf", + "odrl", "org", "owl", "profile", From 70a769295cc2eec3aae7bc25721103b787184e3f Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 19 Aug 2025 23:02:57 +0100 Subject: [PATCH 08/14] Search faster Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 223 +++++++++++++++---------- 1 file changed, 133 insertions(+), 90 deletions(-) diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py index 2ff17f7ba..30a8b619e 100755 --- a/code/295_find_terms_without_example.py +++ b/code/295_find_terms_without_example.py @@ -57,8 +57,30 @@ def parse_args() -> argparse.Namespace: default=DEFAULT_EXAMPLES_DIR, help="Directory containing example TTL files", ) + parser.add_argument("-v", "--verbose", action="store_true", help="Print everything") parser.add_argument( - "-v", "--verbose", action="store_true", help="Enable verbose output" + "-l", + "--list-unused-terms", + action="store_true", + help="Print terms without examples", + ) + parser.add_argument( + "-t", + "--top-unused-parents", + action="store_true", + help="Print top parents of terms without examples", + ) + parser.add_argument( + "-c", + "--count-used-terms", + action="store_true", + help="Print term counts and example counts per namespace", + ) + parser.add_argument( + "-u", + "--list-undefined-terms", + action="store_true", + help="Print terms used in examples but not defined in vocabulary files", ) return parser.parse_args() @@ -73,34 +95,45 @@ def get_ttl_files(root: str) -> Iterator[str]: def collect_terms_in_vocabs( files: list[str], -) -> tuple[set[str], set[str], dict[str, set[str]]]: +) -> tuple[set[str], set[str], dict[str, set[str]], Counter[str], Counter[str]]: """Collect terms defined in vocabulary files""" classes: set[str] = set() properties: set[str] = set() parents: dict[str, set[str]] = {} + classes_ns_count: Counter[str] = Counter() + properties_ns_count: Counter[str] = Counter() + for f in files: g = Graph() g.parse(f, format="turtle") for s in g.subjects(RDF.type, RDFS.Class): - name = g.qname(s) - if name.split(":")[0] in SKIP_PREFIXES: + name = g.qname(str(s)) + ns, sep, _ = name.partition(":") + if ns in SKIP_PREFIXES: continue - classes.add(name) + if name not in classes: + classes.add(name) + if sep: + classes_ns_count[ns] += 1 for o in g.objects(s, RDFS.subClassOf): - parents.setdefault(name, set()).add(g.qname(o)) + parents.setdefault(name, set()).add(g.qname(str(o))) for o in g.objects(s, SKOS.broader): - parents.setdefault(name, set()).add(g.qname(o)) + parents.setdefault(name, set()).add(g.qname(str(o))) for s in g.subjects(RDF.type, RDF.Property): - name = g.qname(s) - if name.split(":")[0] in SKIP_PREFIXES: + name = g.qname(str(s)) + ns, sep, _ = name.partition(":") + if ns in SKIP_PREFIXES: continue - properties.add(name) + if name not in properties: + properties.add(name) + if sep: + properties_ns_count[ns] += 1 for o in g.objects(s, RDFS.subPropertyOf): - parents.setdefault(name, set()).add(g.qname(o)) + parents.setdefault(name, set()).add(g.qname(str(o))) for o in g.objects(s, SKOS.broader): - parents.setdefault(name, set()).add(g.qname(o)) + parents.setdefault(name, set()).add(g.qname(str(o))) - return classes, properties, parents + return classes, properties, parents, classes_ns_count, properties_ns_count def collect_terms_in_examples(files: list[str]) -> dict[str, set[str]]: @@ -112,48 +145,34 @@ def collect_terms_in_examples(files: list[str]) -> dict[str, set[str]]: Instead, we use regex to find terms. """ used: dict[str, set[str]] = defaultdict(set) - pattern = re.compile(r"\b([a-zA-Z0-9_-]+:[a-zA-Z0-9_-]+)\b") + pattern = re.compile(r"\b([a-zA-Z0-9_-]+):([a-zA-Z0-9_-]+)\b") # Match terms (prefix:term) not surrounded by quotes (since it can be literal) # pattern = re.compile(r'(? bool: """Check if a term is used or any of its parents is used.""" - if term in used_terms.keys(): + if term in used_keys: return True - - for parent in parents.get(term, set()): - if parent in used_terms.keys(): - return True - - return False - - -def count_per_namespace(terms: set[str]) -> Counter[str]: - """Count terms per namespace""" - ns_counter: Counter[str] = Counter() - for item in terms: - if ":" in item: - ns = item.split(":")[0] - ns_counter[ns] += 1 - return ns_counter + return bool(parents.get(term, set()) & used_keys) def count_parents(terms: set[str], parents: dict[str, set[str]]) -> Counter[str]: """Count parents of terms""" counter: Counter[str] = Counter() for term in terms: - for parent in parents.get(term, []): + for parent in parents.get(term, set()): counter[parent] += 1 return counter @@ -162,7 +181,12 @@ def print_terms_without_examples( terms: set[str], parents: dict[str, set[str]], label: str = "Terms" ) -> None: """Print terms without examples with their parent info.""" - print(f"\n==== {label} without examples ====\n") + print(f"\n{label} without examples") + print("-------------------------------") + if len(terms) == 0: + print("Not found.") + return + for term in sorted(terms): parent = parents.get(term) if parent: @@ -184,33 +208,31 @@ def print_terms_used_but_undefined( undefined = sorted( term for term in used_terms.keys() - if term not in defined - and (":" in term and term.split(":")[0] not in SKIP_PREFIXES) + if ":" in term + and term.split(":", 1)[0] not in SKIP_PREFIXES + and term not in defined ) if not undefined: print("\nNo undefined terms found in examples (all used terms are defined).") return - print("\n==== Terms used in examples but NOT defined in vocabulary files ====") + print("\nTerms used in examples but NOT defined in vocabulary files") + print("----------------------------------------------------------") for term in undefined: - files = sorted(used_terms.get(term, [])) - rel_files: list[str] = [] - for f in files: - try: - rel = os.path.relpath(f, start=examples_dir) - except Exception: - rel = f - rel_files.append(rel) - files_str = ", ".join(rel_files) - print(f"{term:<40} in: {files_str}") + files = used_terms.get(term, ()) + rel_files = [ + (os.path.relpath(f, start=examples_dir) if examples_dir else f) + for f in sorted(files) + ] + print(f"{term:<40} in: {", ".join(rel_files)}") def print_summary_terms_with_examples( - classes: set[str], - properties: set[str], - used_classes: set[str], - used_properties: set[str], + classes_ns_count: Counter[str], + properties_ns_count: Counter[str], + used_classes_ns_count: Counter[str], + used_properties_ns_count: Counter[str], print_complete_ns: bool = False, ) -> None: """Print per-namespace summary of classes and properties with examples. @@ -218,15 +240,10 @@ def print_summary_terms_with_examples( If print_complete_ns is False, do not print namespaces where all classes and properties have examples. """ - classes_ns_count = count_per_namespace(classes) - properties_ns_count = count_per_namespace(properties) - used_classes_ns_count = count_per_namespace(used_classes) - used_properties_ns_count = count_per_namespace(used_properties) - print() print("Namespace Class w/ Examples Prop. w/ Examples") print("-------------------------- ----------------- -----------------") - for ns in sorted(set(classes_ns_count) | set(properties_ns_count)): + for ns in sorted(classes_ns_count.keys() | properties_ns_count.keys()): if ns in SKIP_PREFIXES: continue c_used = used_classes_ns_count.get(ns, 0) @@ -243,54 +260,80 @@ def main() -> None: args = parse_args() vocab_dir = args.vocab_dir examples_dir = args.examples_dir - verbose = args.verbose vocab_files = list(get_ttl_files(vocab_dir)) ex_files = list(get_ttl_files(examples_dir)) - if verbose: + if args.verbose: print(f"Vocabulary directory: {vocab_dir}") print(f"Example directory: {examples_dir}") print(f"Vocabulary TTL files found: {len(vocab_files)}") print(f"Example TTL files found: {len(ex_files)}") - classes, properties, parents = collect_terms_in_vocabs(vocab_files) - if verbose: - print(f"Classes defined: {len(classes)}") - print(f"Properties defined: {len(properties)}") + classes, properties, parents, classes_ns_count, properties_ns_count = ( + collect_terms_in_vocabs(vocab_files) + ) used = collect_terms_in_examples(ex_files) - used_classes = {c for c in classes if is_used_or_parent_used(c, used, parents)} - used_properties = { - p for p in properties if is_used_or_parent_used(p, used, parents) - } + used_keys = set(used) # cache keys for faster lookup + + used_classes: set[str] = set() + used_properties: set[str] = set() + used_classes_ns_count: Counter[str] = Counter() + used_properties_ns_count: Counter[str] = Counter() + + for c in classes: + if is_used_or_parent_used(c, used_keys, parents): + used_classes.add(c) + ns, sep, _ = c.partition(":") + if sep: + used_classes_ns_count[ns] += 1 + + for p in properties: + if is_used_or_parent_used(p, used_keys, parents): + used_properties.add(p) + ns, sep, _ = p.partition(":") + if sep: + used_properties_ns_count[ns] += 1 + + print(f"Classes (inc. parents) with examples: {len(used_classes)} / {len(classes)}") + print( + f"Properties (inc. parents) with examples: {len(used_properties)} / {len(properties)}" + ) + unused_classes = classes - used_classes unused_properties = properties - used_properties - if verbose: - print(f"Classes with examples: {len(used_classes)} / {len(classes)}") - print(f"Properties with examples: {len(used_properties)} / {len(properties)}") + + if args.list_unused_terms or args.verbose: print_terms_without_examples(unused_classes, parents, "Classes") print_terms_without_examples(unused_properties, parents, "Properties") print_summary_terms_with_examples( - classes, - properties, - used_classes, - used_properties, - print_complete_ns=verbose, + classes_ns_count, + properties_ns_count, + used_classes_ns_count, + used_properties_ns_count, + print_complete_ns=args.verbose, ) - unused_classes_no_loc = {c for c in unused_classes if not c.startswith("loc:")} - top_classes_parents = count_parents(unused_classes_no_loc, parents).most_common(10) - top_properties_parents = count_parents(unused_properties, parents).most_common(10) - - print("\n==== Top parents among classes without examples (excluding 'loc:')====") - for parent, count in top_classes_parents: - print(f"{count:>7} {parent}") - print("\n==== Top parents among properties without examples ====") - for parent, count in top_properties_parents: - print(f"{count:>7} {parent}") - - print_terms_used_but_undefined(used, classes, properties, examples_dir) + if args.top_unused_parents or args.verbose: + unused_classes_no_loc = {c for c in unused_classes if not c.startswith("loc:")} + top_classes_parents = count_parents(unused_classes_no_loc, parents).most_common( + 10 + ) + top_properties_parents = count_parents(unused_properties, parents).most_common( + 10 + ) + print("\nTop parents among classes without examples (excluding 'loc:')") + print("-------------------------------------------------------------") + for parent, count in top_classes_parents: + print(f"{count:>7} {parent}") + print("\nTop parents among properties without examples") + print("---------------------------------------------") + for parent, count in top_properties_parents: + print(f"{count:>7} {parent}") + + if args.list_undefined_terms or args.verbose: + print_terms_used_but_undefined(used, classes, properties, examples_dir) if __name__ == "__main__": From dd9621f1e4694dcba464377467dd0117e19b314d Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 21 Aug 2025 09:54:41 +0100 Subject: [PATCH 09/14] Exclude -owl.ttl Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py index 30a8b619e..b39f45687 100755 --- a/code/295_find_terms_without_example.py +++ b/code/295_find_terms_without_example.py @@ -12,6 +12,7 @@ import os import re from collections import Counter, defaultdict +from pathlib import Path from typing import Iterator from rdflib import RDF, RDFS, SKOS, Graph @@ -86,11 +87,14 @@ def parse_args() -> argparse.Namespace: def get_ttl_files(root: str) -> Iterator[str]: - """Yield all TTL files in the given directory and its subdirectories""" - for dirpath, _, filenames in os.walk(root): - for f in filenames: - if f.endswith(".ttl"): - yield os.path.join(dirpath, f) + """Yield .ttl files, excluding files ending with '-owl.ttl'""" + base = Path(root) + if not base.exists(): + return + for p in base.rglob("*.ttl"): + name = p.name.lower() + if not name.endswith("-owl.ttl"): + yield str(p) def collect_terms_in_vocabs( From d363ddab6ec63e259312f509369dbeaefb4291d8 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 21 Aug 2025 13:23:36 +0100 Subject: [PATCH 10/14] Find terms referenced in HTML but undefined Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 188 ++++++++++++++++++++----- 1 file changed, 156 insertions(+), 32 deletions(-) diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py index b39f45687..40d24bcf4 100755 --- a/code/295_find_terms_without_example.py +++ b/code/295_find_terms_without_example.py @@ -49,11 +49,13 @@ def parse_args() -> argparse.Namespace: """Parse command line arguments""" parser = argparse.ArgumentParser(description="Find terms without an example") parser.add_argument( + "-d", "--vocab-dir", default=DEFAULT_VOCAB_DIR, help="Directory containing vocabulary TTL files", ) parser.add_argument( + "-e", "--examples-dir", default=DEFAULT_EXAMPLES_DIR, help="Directory containing example TTL files", @@ -83,11 +85,17 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Print terms used in examples but not defined in vocabulary files", ) + parser.add_argument( + "-x", + "--list-undefined-html-terms", + action="store_true", + help="Print terms used in HTMLs but not defined in vocabulary files", + ) return parser.parse_args() def get_ttl_files(root: str) -> Iterator[str]: - """Yield .ttl files, excluding files ending with '-owl.ttl'""" + """Yield .ttl files, excluding files ending with -owl.ttl""" base = Path(root) if not base.exists(): return @@ -97,6 +105,17 @@ def get_ttl_files(root: str) -> Iterator[str]: yield str(p) +def get_html_files(root: str) -> Iterator[str]: + """Yield .html files, excluding files ending with -en.html""" + base = Path(root) + if not base.exists(): + return + for p in base.rglob("*.html"): + name = p.name.lower() + if not name.endswith("-en.html"): + yield str(p) + + def collect_terms_in_vocabs( files: list[str], ) -> tuple[set[str], set[str], dict[str, set[str]], Counter[str], Counter[str]]: @@ -149,20 +168,54 @@ def collect_terms_in_examples(files: list[str]) -> dict[str, set[str]]: Instead, we use regex to find terms. """ used: dict[str, set[str]] = defaultdict(set) - pattern = re.compile(r"\b([a-zA-Z0-9_-]+):([a-zA-Z0-9_-]+)\b") - # Match terms (prefix:term) not surrounded by quotes (since it can be literal) - # pattern = re.compile(r'(? dict[str, dict[str, set[int]]]: + """Collect terms mentioned in description section of HTML files""" + # { term: { filepath: set of line numbers } } } + used: dict[str, dict[str, set[int]]] = defaultdict(lambda: defaultdict(set)) + + # [=term=] + bracket_pat = re.compile(r"\[=\s*?([a-zA-Z0-9_\-]+?)\s*?=\]") + # prefix:term + code_pat = re.compile(r"\s*?([a-zA-Z0-9_\-]+?:[a-zA-Z0-9_\-]+?)\s*?") + prefix_pat = re.compile( + r"respecConfig\s*=\s*{[\s\S]*?shortName\s*[:=]\s*['\"]([^'\"]+)['\"]", + re.IGNORECASE, + ) + for f in files: + try: + with open(f, encoding="utf-8", errors="ignore") as fh: + html = fh.read() + except OSError: + continue + + match = prefix_pat.search(html) + prefix = match.group(1).lower() if match else "" + + for n, line in enumerate(html.splitlines(), start=1): + for m in bracket_pat.finditer(line): + term = m.group(1) + term = f"{prefix}:{term}" if prefix else term + used[term][f].add(n) + + for m in code_pat.finditer(line): + term = m.group(1) # full term (prefix:term) + used[term][f].add(n) + + return used + + def is_used_or_parent_used( term: str, used_keys: set[str], parents: dict[str, set[str]] ) -> bool: @@ -185,9 +238,10 @@ def print_terms_without_examples( terms: set[str], parents: dict[str, set[str]], label: str = "Terms" ) -> None: """Print terms without examples with their parent info.""" - print(f"\n{label} without examples") + print(f"\n{label} without examples ({len(terms)})") print("-------------------------------") - if len(terms) == 0: + + if not terms: print("Not found.") return @@ -199,11 +253,11 @@ def print_terms_without_examples( print(term) -def print_terms_used_but_undefined( - used_terms: dict[str, set[str]], +def print_undefined_example_terms( + used: dict[str, set[str]], classes: set[str], properties: set[str], - examples_dir: str, + base_dir: str, ) -> None: """ Print terms that appear in example files but are not defined in the vocabulary files. @@ -211,25 +265,63 @@ def print_terms_used_but_undefined( defined = classes | properties undefined = sorted( term - for term in used_terms.keys() + for term in used.keys() if ":" in term and term.split(":", 1)[0] not in SKIP_PREFIXES and term not in defined ) + print( + f"\nTerms used in examples but NOT defined in vocabulary files ({len(undefined)})" + ) + print("----------------------------------------------------------") + if not undefined: - print("\nNo undefined terms found in examples (all used terms are defined).") + print("Not found.") return - print("\nTerms used in examples but NOT defined in vocabulary files") - print("----------------------------------------------------------") for term in undefined: - files = used_terms.get(term, ()) - rel_files = [ - (os.path.relpath(f, start=examples_dir) if examples_dir else f) + files = used.get(term, ()) + found_in = [ + (os.path.relpath(f, start=base_dir) if base_dir else f) for f in sorted(files) ] - print(f"{term:<40} in: {", ".join(rel_files)}") + print(f"{term:<40} in: {", ".join(found_in)}") + + +def print_undefined_html_terms( + used: dict[str, dict[str, set[int]]], + classes: set[str], + properties: set[str], + base_dir: str, +) -> None: + """Print terms referenced in HTML but not defined in vocabulary files. + + html_used: term -> { filepath -> set(line_numbers) } + """ + undefined = { + term: filename_linenum + for term, filename_linenum in used.items() + if term not in classes + and term not in properties + and term.split(":", 1)[0] not in SKIP_PREFIXES + } + + print( + f"\nTerms referenced in HTML but NOT defined in vocabulary files ({len(undefined)})" + ) + print("------------------------------------------------------------") + if not undefined: + print("Not found.") + return + + for term, filename_linenum in sorted(undefined.items()): + found_in = [] + for path in sorted(filename_linenum): + rel = os.path.relpath(path, start=base_dir) + lines = sorted(filename_linenum[path]) + found_in.append(f"{rel}:{','.join(map(str, lines))}") + print(f"{term:<40} in: {', '.join(found_in)}") def print_summary_terms_with_examples( @@ -259,6 +351,34 @@ def print_summary_terms_with_examples( print(f"{ns:<26} {c_used:>7} / {c_total:<7} {p_used:>7} / {p_total:<7}") +def print_top_unused_parents( + unused: set[str], + parents: dict[str, set[str]], + exclude_prefixes: set[str], + top_n: int = 10, + label: str = "Terms", +) -> None: + """ + Print top parent terms among unused terms + + exclude_prefixes: prefixes to filter out + top_n: number of top parents to show + """ + filtered_terms = { + c + for c in unused + if not any(c.startswith(f"{prefix}:") for prefix in exclude_prefixes) + } + top_parents = count_parents(filtered_terms, parents).most_common(top_n) + + print(f"\nTop parents among {label} without examples") + if exclude_prefixes: + print(f"(excluding child with prefixes: {", ".join(sorted(exclude_prefixes))})") + print("-------------------------------------------------------------") + for parent, count in top_parents: + print(f"{count:>7} {parent}") + + def main() -> None: """Main function""" args = parse_args() @@ -320,24 +440,28 @@ def main() -> None: ) if args.top_unused_parents or args.verbose: - unused_classes_no_loc = {c for c in unused_classes if not c.startswith("loc:")} - top_classes_parents = count_parents(unused_classes_no_loc, parents).most_common( - 10 + print_top_unused_parents( + unused_classes, + parents, + exclude_prefixes={"loc"}, # There are many unused loc: terms + top_n=10, + label="classes", ) - top_properties_parents = count_parents(unused_properties, parents).most_common( - 10 + print_top_unused_parents( + unused_properties, + parents, + exclude_prefixes=set(), + top_n=10, + label="properties", ) - print("\nTop parents among classes without examples (excluding 'loc:')") - print("-------------------------------------------------------------") - for parent, count in top_classes_parents: - print(f"{count:>7} {parent}") - print("\nTop parents among properties without examples") - print("---------------------------------------------") - for parent, count in top_properties_parents: - print(f"{count:>7} {parent}") if args.list_undefined_terms or args.verbose: - print_terms_used_but_undefined(used, classes, properties, examples_dir) + print_undefined_example_terms(used, classes, properties, examples_dir) + + if args.list_undefined_html_terms or args.verbose: + html_files = list(get_html_files(vocab_dir)) + html_used = collect_terms_in_htmls(html_files) + print_undefined_html_terms(html_used, classes, properties, vocab_dir) if __name__ == "__main__": From b6bd1b2e0bfb14ed7c6d9f172c85509ee2e7df91 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Thu, 21 Aug 2025 13:44:41 +0100 Subject: [PATCH 11/14] Fix quote in quote Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py index 40d24bcf4..5eedeb020 100755 --- a/code/295_find_terms_without_example.py +++ b/code/295_find_terms_without_example.py @@ -286,7 +286,7 @@ def print_undefined_example_terms( (os.path.relpath(f, start=base_dir) if base_dir else f) for f in sorted(files) ] - print(f"{term:<40} in: {", ".join(found_in)}") + print(f"{term:<40} in: {', '.join(found_in)}") def print_undefined_html_terms( @@ -373,7 +373,7 @@ def print_top_unused_parents( print(f"\nTop parents among {label} without examples") if exclude_prefixes: - print(f"(excluding child with prefixes: {", ".join(sorted(exclude_prefixes))})") + print(f"(excluding child with prefixes: {', '.join(sorted(exclude_prefixes))})") print("-------------------------------------------------------------") for parent, count in top_parents: print(f"{count:>7} {parent}") From 2f43618a72e60cdc45aa66aac549c6318f4b86e5 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 22 Aug 2025 00:01:05 +0100 Subject: [PATCH 12/14] Add extract term in backtick Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 28 ++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py index 5eedeb020..4d8b603e4 100755 --- a/code/295_find_terms_without_example.py +++ b/code/295_find_terms_without_example.py @@ -185,14 +185,22 @@ def collect_terms_in_htmls(files: list[str]) -> dict[str, dict[str, set[int]]]: # { term: { filepath: set of line numbers } } } used: dict[str, dict[str, set[int]]] = defaultdict(lambda: defaultdict(set)) - # [=term=] - bracket_pat = re.compile(r"\[=\s*?([a-zA-Z0-9_\-]+?)\s*?=\]") - # prefix:term - code_pat = re.compile(r"\s*?([a-zA-Z0-9_\-]+?:[a-zA-Z0-9_\-]+?)\s*?") - prefix_pat = re.compile( + # respecConfig.shortName + global_prefix_pat = re.compile( r"respecConfig\s*=\s*{[\s\S]*?shortName\s*[:=]\s*['\"]([^'\"]+)['\"]", re.IGNORECASE, ) + # [=term=] + term_bracket_pat = re.compile(r"\[=\s*?([a-zA-Z0-9_\-]+?)\s*?=\]") + # prefix:term + term_code_prefix_pat = re.compile( + r"\s*?([a-zA-Z0-9_\-]+?:[a-zA-Z0-9_\-]+?)\s*?" + ) + # term + # term_code_pat = re.compile(r"\s*?([^:\s/]+?)\s*?") + # `prefix:term`` + term_backtick_prefix_pat = re.compile(r"`([a-zA-Z0-9_\-]+?:[a-zA-Z0-9_\-]+?)`") + for f in files: try: with open(f, encoding="utf-8", errors="ignore") as fh: @@ -200,16 +208,20 @@ def collect_terms_in_htmls(files: list[str]) -> dict[str, dict[str, set[int]]]: except OSError: continue - match = prefix_pat.search(html) + match = global_prefix_pat.search(html) prefix = match.group(1).lower() if match else "" for n, line in enumerate(html.splitlines(), start=1): - for m in bracket_pat.finditer(line): + for m in term_bracket_pat.finditer(line): term = m.group(1) term = f"{prefix}:{term}" if prefix else term used[term][f].add(n) - for m in code_pat.finditer(line): + for m in term_code_prefix_pat.finditer(line): + term = m.group(1) # full term (prefix:term) + used[term][f].add(n) + + for m in term_backtick_prefix_pat.finditer(line): term = m.group(1) # full term (prefix:term) used[term][f].add(n) From 6349c65791ab020705a146e8d3fef4bc4941e762 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 22 Aug 2025 16:48:19 +0100 Subject: [PATCH 13/14] Add comment Signed-off-by: Arthit Suriyawongkul --- code/295_find_terms_without_example.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/code/295_find_terms_without_example.py b/code/295_find_terms_without_example.py index 4d8b603e4..793aa0952 100755 --- a/code/295_find_terms_without_example.py +++ b/code/295_find_terms_without_example.py @@ -192,13 +192,13 @@ def collect_terms_in_htmls(files: list[str]) -> dict[str, dict[str, set[int]]]: ) # [=term=] term_bracket_pat = re.compile(r"\[=\s*?([a-zA-Z0-9_\-]+?)\s*?=\]") + # term -- too many false positives but can be useful if you have time to comb through + # term_code_pat = re.compile(r"\s*?([a-zA-Z0-9_\-]+?)\s*?") # prefix:term term_code_prefix_pat = re.compile( r"\s*?([a-zA-Z0-9_\-]+?:[a-zA-Z0-9_\-]+?)\s*?" ) - # term - # term_code_pat = re.compile(r"\s*?([^:\s/]+?)\s*?") - # `prefix:term`` + # `prefix:term` term_backtick_prefix_pat = re.compile(r"`([a-zA-Z0-9_\-]+?:[a-zA-Z0-9_\-]+?)`") for f in files: @@ -217,6 +217,11 @@ def collect_terms_in_htmls(files: list[str]) -> dict[str, dict[str, set[int]]]: term = f"{prefix}:{term}" if prefix else term used[term][f].add(n) + # for m in term_code_pat.finditer(line): + # term = m.group(1) + # term = f"{prefix}:{term}" if prefix else term + # used[term][f+"**"].add(n) + for m in term_code_prefix_pat.finditer(line): term = m.group(1) # full term (prefix:term) used[term][f].add(n) @@ -328,7 +333,7 @@ def print_undefined_html_terms( return for term, filename_linenum in sorted(undefined.items()): - found_in = [] + found_in: list[str] = [] for path in sorted(filename_linenum): rel = os.path.relpath(path, start=base_dir) lines = sorted(filename_linenum[path]) From a6fd0c471b037f9ed3a524040d61f4217a169280 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Tue, 9 Sep 2025 23:50:45 +0100 Subject: [PATCH 14/14] Fix example term regex Do not match strings (surrounded with quotes) and datetime-lookalike Co-Authored-By: Harshvardhan Pandit --- ...t_example.py => 400_find_terms_without_example.py} | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) rename code/{295_find_terms_without_example.py => 400_find_terms_without_example.py} (98%) diff --git a/code/295_find_terms_without_example.py b/code/400_find_terms_without_example.py similarity index 98% rename from code/295_find_terms_without_example.py rename to code/400_find_terms_without_example.py index 793aa0952..c1b4fa039 100755 --- a/code/295_find_terms_without_example.py +++ b/code/400_find_terms_without_example.py @@ -19,7 +19,14 @@ # Assuming consistent prefixes in vocabulary files SKIP_PREFIXES = { - "ex", # Example namespace + "ex", # Example namespaces + "exA", + "exB", + "exC", + "exD", + "exE", + "exF", + "exG", "_", # Blank nodes "bibo", "dcat", @@ -168,7 +175,7 @@ def collect_terms_in_examples(files: list[str]) -> dict[str, set[str]]: Instead, we use regex to find terms. """ used: dict[str, set[str]] = defaultdict(set) - pattern = re.compile(r"\b([a-zA-Z0-9_\-]+?):([a-zA-Z0-9_\-]+?)\b") + pattern = re.compile(r"(?