From 64c6e99ea73dd659b840e94b48803d3f5cd54f72 Mon Sep 17 00:00:00 2001
From: Mircea Filip Lungu
Date: Tue, 20 Jan 2026 13:25:39 +0100
Subject: [PATCH] Fix example generator producing compound words with underscores

The LLM was generating examples like "Bolig_forhold_" instead of using
"forhold" as a standalone word. This made examples unsuitable for
language learning exercises.

Changes:
- Update v3 prompt to require standalone word placement
- Add word_appears_standalone() validation function
- Add tools to find and fix existing bad examples

Run on server:

    python -m tools.fix_underscore_examples --fix

Co-Authored-By: Claude Opus 4.5
---
 tools/fix_compound_examples.py                | 367 ++++++++++++++++++
 tools/fix_underscore_examples.py              | 249 ++++++++++++
 .../prompts/example_generation.py             |   1 +
 .../core/tokenization/word_position_finder.py |  77 +++-
 4 files changed, 691 insertions(+), 3 deletions(-)
 create mode 100644 tools/fix_compound_examples.py
 create mode 100644 tools/fix_underscore_examples.py

diff --git a/tools/fix_compound_examples.py b/tools/fix_compound_examples.py
new file mode 100644
index 000000000..8c418e09d
--- /dev/null
+++ b/tools/fix_compound_examples.py
@@ -0,0 +1,367 @@
+#!/usr/bin/env python3
+"""
+Script to find and fix example sentences where the target word only appears
+inside compound words (e.g., "forhold" in "Boligforhold" instead of standalone).
+
+These examples are problematic for language learning because:
+1. The word is hard to recognize inside a compound
+2. Fill-in-the-blank exercises don't work well with compound words
+3. Learners can't easily practice the word in isolation
+
+Usage:
+    # Dry run - just report bad examples
+    python -m tools.fix_compound_examples --dry-run
+
+    # Delete bad examples without regenerating
+    python -m tools.fix_compound_examples --delete
+
+    # Delete and regenerate new examples
+    python -m tools.fix_compound_examples --delete --regenerate
+
+    # Process specific meaning IDs
+    python -m tools.fix_compound_examples --meaning-ids 123,456,789
+
+    # Limit processing (for testing)
+    python -m tools.fix_compound_examples --max-examples 100
+"""
+
+import argparse
+import re
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from zeeguu.api.app import create_app
+from zeeguu.core.model import db
+
+app = create_app()
+app.app_context().push()
+
+from zeeguu.core.model import ExampleSentence, Meaning, Phrase, Language
+from zeeguu.core.model.example_sentence_context import ExampleSentenceContext
+from zeeguu.core.tokenization.word_position_finder import word_appears_standalone
+from zeeguu.logging import log
+
+
+def has_underscore_formatting(word, sentence):
+    """
+    Check if a sentence has underscore formatting around a word.
+    This catches LLM-generated examples like "Bolig_forhold_" instead of "Boligforhold".
+    """
+    # Look for patterns like: Word_word_, _word_, word_Word_
+    # The word should be surrounded by or adjacent to underscores
+    patterns = [
+        rf'\w+_{re.escape(word)}_',  # Prefix_word_
+        rf'_{re.escape(word)}_\w*',  # _word_ or _word_suffix
+        rf'{re.escape(word)}_',  # word followed by an underscore
+        rf'_{re.escape(word)}',  # word preceded by an underscore
+    ]
+
+    for pattern in patterns:
+        if re.search(pattern, sentence, re.IGNORECASE):
+            return True
+    return False
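+
+
+# Illustration of what has_underscore_formatting() flags (hypothetical
+# sentences, not real data); the first call shows the exact bug from the
+# commit message, the second a clean standalone use:
+#
+#   has_underscore_formatting("forhold", "Vores Bolig_forhold_ er gode.")
+#   # -> True ("Bolig_forhold_" matches r"\w+_forhold_")
+#
+#   has_underscore_formatting("forhold", "Vi har et godt forhold.")
+#   # -> False (no underscores anywhere near the word)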
+
+
+def is_valid_contraction(word, compound):
+    """
+    Check if a "compound" is actually a valid contraction (like French l'été).
+    Valid contractions have the target word at a clear boundary.
+    """
+    word_lower = word.lower()
+    compound_lower = compound.lower()
+
+    # Check if word is at the end after an apostrophe (e.g., l'été);
+    # accept both the straight and the typographic apostrophe
+    if compound_lower.endswith(word_lower):
+        prefix = compound_lower[:-len(word_lower)]
+        if prefix.endswith("'") or prefix.endswith("’"):
+            return True
+
+    # Check if word is at the start before an apostrophe (e.g., c'est)
+    if compound_lower.startswith(word_lower):
+        suffix = compound_lower[len(word_lower):]
+        if suffix.startswith("'") or suffix.startswith("’"):
+            return True
+
+    return False
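+
+
+# Expected behaviour on a few hypothetical inputs: elisions pass, real
+# compounds do not.
+#
+#   is_valid_contraction("été", "l'été")             # -> True
+#   is_valid_contraction("c", "c'est")               # -> True
+#   is_valid_contraction("forhold", "Boligforhold")  # -> False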
+
+
+def find_compound_only_examples(max_examples=None, meaning_ids=None):
+    """
+    Find examples where the target word only appears inside compound words
+    or has underscore formatting issues.
+
+    Returns:
+        List of dicts with example info and issue type
+    """
+    query = (
+        db.session.query(
+            ExampleSentence.id,
+            ExampleSentence.sentence,
+            ExampleSentence.meaning_id,
+            Phrase.content.label("word"),
+            Language.code.label("lang_code"),
+            Language.id.label("lang_id"),
+        )
+        .join(Meaning, ExampleSentence.meaning_id == Meaning.id)
+        .join(Phrase, Meaning.origin_id == Phrase.id)
+        .join(Language, Phrase.language_id == Language.id)
+    )
+
+    if meaning_ids:
+        query = query.filter(ExampleSentence.meaning_id.in_(meaning_ids))
+
+    if max_examples:
+        query = query.limit(max_examples)
+
+    examples = query.all()
+    print(f"Checking {len(examples)} examples for compound-only words...")
+
+    bad_examples = []
+
+    for i, ex in enumerate(examples):
+        if (i + 1) % 100 == 0:
+            print(f"  Processed {i + 1}/{len(examples)} examples...")
+
+        # First check for underscore formatting (the main bug)
+        if has_underscore_formatting(ex.word, ex.sentence):
+            bad_examples.append({
+                'id': ex.id,
+                'sentence': ex.sentence,
+                'word': ex.word,
+                'meaning_id': ex.meaning_id,
+                'lang_code': ex.lang_code,
+                'issue_type': 'underscore_formatting',
+                'compounds': []
+            })
+            continue
+
+        # Get language object for tokenization
+        language = Language.find_by_id(ex.lang_id)
+
+        # Check if word appears standalone
+        result = word_appears_standalone(ex.word, ex.sentence, language)
+
+        if result['only_in_compounds']:
+            # Filter out valid contractions
+            real_compounds = [
+                c for c in result['compound_examples']
+                if c and len(c) > 1 and not is_valid_contraction(ex.word, c)
+            ]
+
+            if real_compounds:
+                bad_examples.append({
+                    'id': ex.id,
+                    'sentence': ex.sentence,
+                    'word': ex.word,
+                    'meaning_id': ex.meaning_id,
+                    'lang_code': ex.lang_code,
+                    'issue_type': 'compound_only',
+                    'compounds': real_compounds
+                })
+
+    return bad_examples
+
+
+def delete_examples(example_ids):
+    """Delete examples by ID, skipping those linked to bookmarks."""
+    if not example_ids:
+        return 0, 0
+
+    # Check which ones are linked to bookmarks
+    linked_ids = set(
+        row[0]
+        for row in db.session.query(ExampleSentenceContext.example_sentence_id)
+        .filter(ExampleSentenceContext.example_sentence_id.in_(example_ids))
+        .all()
+    )
+
+    deletable_ids = set(example_ids) - linked_ids
+
+    if deletable_ids:
+        deleted_count = (
+            db.session.query(ExampleSentence)
+            .filter(ExampleSentence.id.in_(deletable_ids))
+            .delete(synchronize_session=False)
+        )
+        db.session.commit()
+    else:
+        deleted_count = 0
+
+    return deleted_count, len(linked_ids)
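+
+
+# Shape of one entry in the list returned by find_compound_only_examples()
+# (illustrative values, not a real database row):
+#
+#   {
+#       'id': 4711,
+#       'sentence': 'Boligforholdene i byen er gode.',
+#       'word': 'forhold',
+#       'meaning_id': 123,
+#       'lang_code': 'da',
+#       'issue_type': 'compound_only',
+#       'compounds': ['Boligforholdene'],
+#   }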
+
+
+def regenerate_examples_for_meanings(meaning_ids, target_count=5):
+    """Regenerate examples for specific meanings."""
+    from zeeguu.core.llm_services import get_llm_service
+    from zeeguu.core.model.ai_generator import AIGenerator
+
+    llm_service = get_llm_service()
+    regenerated_count = 0
+
+    for meaning_id in meaning_ids:
+        meaning = Meaning.query.get(meaning_id)
+        if not meaning:
+            print(f"  Warning: Meaning {meaning_id} not found, skipping")
+            continue
+
+        # Check how many examples already exist
+        existing_count = ExampleSentence.query.filter(
+            ExampleSentence.meaning_id == meaning_id
+        ).count()
+
+        examples_to_generate = max(0, target_count - existing_count)
+        if examples_to_generate == 0:
+            print(f"  Meaning {meaning_id} already has {existing_count} examples, skipping")
+            continue
+
+        origin_word = meaning.origin.content
+        translation = meaning.translation.content
+        origin_lang = meaning.origin.language.code
+        translation_lang = meaning.translation.language.code
+
+        print(f"  Generating {examples_to_generate} examples for '{origin_word}' -> '{translation}'...")
+
+        try:
+            examples = llm_service.generate_examples(
+                word=origin_word,
+                translation=translation,
+                source_lang=origin_lang,
+                target_lang=translation_lang,
+                cefr_level="B1",  # Default level
+                prompt_version="v3",
+                count=examples_to_generate,
+            )
+
+            # Get or create AIGenerator record
+            llm_model = examples[0]["llm_model"] if examples else "unknown"
+            prompt_version = examples[0]["prompt_version"] if examples else "v3"
+
+            ai_generator = AIGenerator.find_or_create(
+                db.session,
+                llm_model,
+                prompt_version,
+                description="Regenerated after compound word fix",
+            )
+
+            for example in examples:
+                ExampleSentence.create_ai_generated(
+                    db.session,
+                    sentence=example["sentence"],
+                    language=meaning.origin.language,
+                    meaning=meaning,
+                    ai_generator=ai_generator,
+                    translation=example.get("translation"),
+                    cefr_level=example.get("cefr_level", "B1"),
+                    commit=False,
+                )
+
+            db.session.commit()
+            regenerated_count += len(examples)
+            print(f"  Created {len(examples)} new examples")
+
+        except Exception as e:
+            print(f"  Error generating examples: {e}")
+            db.session.rollback()
+
+    return regenerated_count
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Find and fix examples where words only appear in compounds"
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Only report bad examples, don't delete or regenerate",
+    )
+    parser.add_argument(
+        "--delete",
+        action="store_true",
+        help="Delete bad examples",
+    )
+    parser.add_argument(
+        "--regenerate",
+        action="store_true",
+        help="Regenerate examples for affected meanings (requires --delete)",
+    )
+    parser.add_argument(
+        "--max-examples",
+        type=int,
+        help="Maximum number of examples to check (for testing)",
+    )
+    parser.add_argument(
+        "--meaning-ids",
+        type=str,
+        help="Comma-separated list of meaning IDs to process",
+    )
+
+    args = parser.parse_args()
+
+    if args.regenerate and not args.delete:
+        print("Error: --regenerate requires --delete")
+        sys.exit(1)
+
+    meaning_ids = None
+    if args.meaning_ids:
+        meaning_ids = [int(x.strip()) for x in args.meaning_ids.split(",")]
+
+    print("=" * 60)
+    print("Compound Word Example Fixer")
+    print("=" * 60)
+
+    # Find bad examples
+    bad_examples = find_compound_only_examples(
+        max_examples=args.max_examples,
+        meaning_ids=meaning_ids,
+    )
+
+    if not bad_examples:
+        print("\nNo compound-only examples found!")
+        return
+
+    print(f"\nFound {len(bad_examples)} examples where word only appears in compounds:\n")
+
+    # Group by meaning for clearer output
+    by_meaning = {}
+    for ex in bad_examples:
+        if ex['meaning_id'] not in by_meaning:
+            by_meaning[ex['meaning_id']] = []
+        by_meaning[ex['meaning_id']].append(ex)
+
+    for meaning_id, examples in by_meaning.items():
+        word = examples[0]['word']
+        print(f"Word: '{word}' (meaning_id: {meaning_id})")
+        for ex in examples:
+            issue = ex.get('issue_type', 'unknown')
+            print(f"  - ID {ex['id']} [{issue}]: \"{ex['sentence']}\"")
+            if ex.get('compounds'):
+                print(f"    Compounds found: {ex['compounds']}")
+        print()
+
+    affected_meaning_ids = list(by_meaning.keys())
+
+    if args.dry_run:
+        print("Dry run complete. Use --delete to remove these examples.")
+        return
+
+    if args.delete:
+        print("\nDeleting bad examples...")
+        example_ids = [ex['id'] for ex in bad_examples]
+        deleted, linked = delete_examples(example_ids)
+        print(f"  Deleted: {deleted} examples")
+        if linked:
+            print(f"  Preserved (linked to bookmarks): {linked} examples")
+
+    if args.regenerate:
+        print("\nRegenerating examples for affected meanings...")
+        regenerated = regenerate_examples_for_meanings(affected_meaning_ids)
+        print(f"\nRegenerated {regenerated} new examples")
+
+    print("\nDone!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/fix_underscore_examples.py b/tools/fix_underscore_examples.py
new file mode 100644
index 000000000..d2bdfe78b
--- /dev/null
+++ b/tools/fix_underscore_examples.py
@@ -0,0 +1,249 @@
+#!/usr/bin/env python3
+"""
+Script to fix example sentences with underscore formatting issues.
+
+This fixes examples where the LLM generated compound words with underscores
+like "Bolig_forhold_" instead of "Boligforhold" or used markdown-style
+emphasis like "__word__".
+
+Usage:
+    # Dry run - show what would be fixed
+    python -m tools.fix_underscore_examples --dry-run
+
+    # Delete bad examples and regenerate new ones
+    python -m tools.fix_underscore_examples --fix
+"""
+
+import argparse
+import sys
+import os
+import re
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from zeeguu.api.app import create_app
+from zeeguu.core.model import db
+
+app = create_app()
+app.app_context().push()
+
+from zeeguu.core.model import ExampleSentence, Meaning, Phrase, Language
+from zeeguu.core.model.example_sentence_context import ExampleSentenceContext
+from zeeguu.core.model.ai_generator import AIGenerator
+from zeeguu.logging import log
+
+
+def find_underscore_examples():
+    """
+    Find examples with underscore formatting issues.
+
+    Patterns detected:
+    - Compound underscores: Prefix_word_ (like Bolig_forhold_)
+    - Markdown emphasis: __word__
+    """
+    query = db.session.query(
+        ExampleSentence.id,
+        ExampleSentence.sentence,
+        ExampleSentence.meaning_id,
+        Phrase.content.label('word'),
+        Language.code.label('lang_code'),
+    ).join(Meaning, ExampleSentence.meaning_id == Meaning.id
+    ).join(Phrase, Meaning.origin_id == Phrase.id
+    ).join(Language, Phrase.language_id == Language.id)
+
+    results = query.all()
+    bad_examples = []
+
+    for r in results:
+        word = r.word
+        if len(word) < 2:
+            continue
+
+        # Compound underscores like Prefix_word_ or _word_Suffix
+        pattern1 = rf'\w+_{re.escape(word)}_'
+        pattern2 = rf'_{re.escape(word)}_\w*'
+
+        # Markdown emphasis like __word__
+        pattern3 = rf'__{re.escape(word)}__'
+
+        # Test the markdown pattern first: \w also matches "_", so the
+        # compound patterns would match "__word__" too and mislabel it.
+        issue_type = None
+        if re.search(pattern3, r.sentence, re.IGNORECASE):
+            issue_type = 'markdown_emphasis'
+        elif re.search(pattern1, r.sentence, re.IGNORECASE):
+            issue_type = 'compound_underscore'
+        elif re.search(pattern2, r.sentence, re.IGNORECASE):
+            issue_type = 'compound_underscore'
+
+        if issue_type:
+            bad_examples.append({
+                'id': r.id,
+                'sentence': r.sentence,
+                'word': r.word,
+                'meaning_id': r.meaning_id,
+                'lang_code': r.lang_code,
+                'issue_type': issue_type,
+            })
+
+    return bad_examples
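+
+
+# Classification on hypothetical sentences; the markdown pattern must win
+# before the compound patterns get a chance:
+#
+#   word="hund",    "Det var en __hund__ i haven."   -> markdown_emphasis
+#   word="forhold", "Vores Bolig_forhold_ er gode."  -> compound_underscore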
+
+
+def delete_examples(example_ids):
+    """Delete examples by ID, returning count of deleted and skipped (linked to bookmarks)."""
+    if not example_ids:
+        return 0, 0
+
+    # Check which ones are linked to bookmarks
+    linked_ids = set(
+        row[0]
+        for row in db.session.query(ExampleSentenceContext.example_sentence_id)
+        .filter(ExampleSentenceContext.example_sentence_id.in_(example_ids))
+        .all()
+    )
+
+    deletable_ids = set(example_ids) - linked_ids
+
+    deleted_count = 0
+    if deletable_ids:
+        deleted_count = (
+            db.session.query(ExampleSentence)
+            .filter(ExampleSentence.id.in_(deletable_ids))
+            .delete(synchronize_session=False)
+        )
+        db.session.commit()
+
+    return deleted_count, len(linked_ids)
+
+
+def regenerate_examples_for_meaning(meaning_id, target_count=5):
+    """Regenerate examples for a specific meaning."""
+    from zeeguu.core.llm_services import get_llm_service
+
+    meaning = Meaning.query.get(meaning_id)
+    if not meaning:
+        print(f"  Warning: Meaning {meaning_id} not found")
+        return 0
+
+    # Check current count
+    existing_count = ExampleSentence.query.filter(
+        ExampleSentence.meaning_id == meaning_id
+    ).count()
+
+    examples_to_generate = max(0, target_count - existing_count)
+    if examples_to_generate == 0:
+        print(f"  Meaning {meaning_id} already has {existing_count} examples")
+        return 0
+
+    origin_word = meaning.origin.content
+    translation = meaning.translation.content
+    origin_lang = meaning.origin.language.code
+    translation_lang = meaning.translation.language.code
+
+    print(f"  Generating {examples_to_generate} examples for '{origin_word}' -> '{translation}'...")
+
+    try:
+        llm_service = get_llm_service()
+        examples = llm_service.generate_examples(
+            word=origin_word,
+            translation=translation,
+            source_lang=origin_lang,
+            target_lang=translation_lang,
+            cefr_level="B1",
+            prompt_version="v3",
+            count=examples_to_generate,
+        )
+
+        llm_model = examples[0]["llm_model"] if examples else "unknown"
+        prompt_version = examples[0]["prompt_version"] if examples else "v3"
+
+        ai_generator = AIGenerator.find_or_create(
+            db.session,
+            llm_model,
+            prompt_version,
+            description="Regenerated after underscore fix",
+        )
+
+        for example in examples:
+            ExampleSentence.create_ai_generated(
+                db.session,
+                sentence=example["sentence"],
+                language=meaning.origin.language,
+                meaning=meaning,
+                ai_generator=ai_generator,
+                translation=example.get("translation"),
+                cefr_level=example.get("cefr_level", "B1"),
+                commit=False,
+            )
+
+        db.session.commit()
+        print(f"  Created {len(examples)} new examples")
+        return len(examples)
+
+    except Exception as e:
+        print(f"  Error: {e}")
+        db.session.rollback()
+        return 0
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Fix underscore formatting in examples")
+    parser.add_argument("--dry-run", action="store_true", help="Only show what would be fixed")
+    parser.add_argument("--fix", action="store_true", help="Delete bad examples and regenerate")
+    args = parser.parse_args()
+
+    if not args.dry_run and not args.fix:
+        print("Error: Specify --dry-run or --fix")
+        sys.exit(1)
+
+    print("=" * 60)
+    print("Underscore Example Fixer")
+    print("=" * 60)
+
+    # Find bad examples
+    bad_examples = find_underscore_examples()
+
+    if not bad_examples:
+        print("\nNo underscore-formatted examples found!")
+        return
+
+    print(f"\nFound {len(bad_examples)} examples with underscore issues:\n")
+
+    # Group by meaning
+    by_meaning = {}
+    for ex in bad_examples:
+        if ex['meaning_id'] not in by_meaning:
+            by_meaning[ex['meaning_id']] = []
+        by_meaning[ex['meaning_id']].append(ex)
+
+    for meaning_id, examples in by_meaning.items():
+        word = examples[0]['word']
+        lang = examples[0]['lang_code']
+        print(f"Word: '{word}' [{lang}] (meaning_id: {meaning_id})")
+        for ex in examples:
+            print(f"  - ID {ex['id']} [{ex['issue_type']}]:")
+            print(f"    \"{ex['sentence']}\"")
+        print()
+
+    if args.dry_run:
+        print("Dry run complete. Use --fix to delete and regenerate.")
+        return
+
+    # Delete bad examples
+    print("Deleting bad examples...")
+    example_ids = [ex['id'] for ex in bad_examples]
+    deleted, linked = delete_examples(example_ids)
+    print(f"  Deleted: {deleted}")
+    if linked:
+        print(f"  Skipped (linked to bookmarks): {linked}")
+
+    # Regenerate for affected meanings
+    print("\nRegenerating examples...")
+    total_regenerated = 0
+    for meaning_id in by_meaning.keys():
+        regenerated = regenerate_examples_for_meaning(meaning_id)
+        total_regenerated += regenerated
+
+    print(f"\nDone! Regenerated {total_regenerated} new examples.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/zeeguu/core/llm_services/prompts/example_generation.py b/zeeguu/core/llm_services/prompts/example_generation.py
index b5a0bbd28..92649a0e2 100644
--- a/zeeguu/core/llm_services/prompts/example_generation.py
+++ b/zeeguu/core/llm_services/prompts/example_generation.py
@@ -98,6 +98,7 @@
 6. Include specific details, actions, or situations that uniquely point to this word with this specific meaning
 7. Each sentence should be appropriate for {cefr_level} level learners
 8. Sentences should be practical and relatable to everyday situations
+9. WORD PLACEMENT: The target word "{word}" MUST appear as a standalone, clearly identifiable word in the sentence - NOT embedded inside compound words. For example, if the word is "forhold", use it standalone like "et godt forhold" (a good relationship), NOT inside compounds like "Boligforhold" where it's hard to recognize
 
 EXAMPLE OF VIOLATION TO AVOID:
 If word="virker" and translation="seem", do NOT create sentences where "virker" means "work/function" like:
diff --git a/zeeguu/core/tokenization/word_position_finder.py b/zeeguu/core/tokenization/word_position_finder.py
index a253c40fb..4c1742778 100644
--- a/zeeguu/core/tokenization/word_position_finder.py
+++ b/zeeguu/core/tokenization/word_position_finder.py
@@ -142,17 +142,88 @@ def validate_single_occurrence(target_word, context_text, from_lang):
     }
 
 
+def word_appears_standalone(target_word, context_text, from_lang):
+    """
+    Check if a word appears as a standalone token (not embedded in compounds).
+
+    Args:
+        target_word (str): The word to check
+        context_text (str): The context text to search
+        from_lang (Language): The language object
+
+    Returns:
+        dict: {
+            'standalone': bool - True if word appears as standalone token,
+            'only_in_compounds': bool - True if word only appears inside compounds,
+            'compound_examples': list - Examples of compounds containing the word,
+            'error_message': str or None
+        }
+    """
+    try:
+        # Check strict matching (standalone tokens)
+        strict_result = find_word_positions_in_text(target_word, context_text, from_lang, strict_matching=True)
+        strict_positions = strict_result['found_positions']
+
+        # Check fuzzy matching (includes compounds)
+        fuzzy_result = find_word_positions_in_text(target_word, context_text, from_lang, strict_matching=False)
+        fuzzy_positions = fuzzy_result['found_positions']
+
+        # If strict finds it, word appears standalone
+        if len(strict_positions) > 0:
+            return {
+                'standalone': True,
+                'only_in_compounds': False,
+                'compound_examples': [],
+                'error_message': None
+            }
+
+        # If fuzzy finds it but strict doesn't, word only appears in compounds
+        if len(fuzzy_positions) > 0:
+            # Extract the compound words
+            tokens_list = fuzzy_result['tokens_list']
+            compound_examples = []
+            for pos in fuzzy_positions:
+                # Find the token at this position
+                for token in tokens_list:
+                    if token.sent_i == pos['sentence_i'] and token.token_i == pos['token_i']:
+                        compound_examples.append(token.text)
+                        break
+
+            return {
+                'standalone': False,
+                'only_in_compounds': True,
+                'compound_examples': compound_examples,
+                'error_message': f"Word '{target_word}' only appears inside compound words: {compound_examples}"
+            }
+
+        # Word not found at all
+        return {
+            'standalone': False,
+            'only_in_compounds': False,
+            'compound_examples': [],
+            'error_message': f"Word '{target_word}' not found in text"
+        }
+
+    except Exception as e:
+        return {
+            'standalone': False,
+            'only_in_compounds': False,
+            'compound_examples': [],
+            'error_message': f"Error checking word: {str(e)}"
+        }
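+
+
+# Sketch of the expected results (illustrative; `da` stands for a hypothetical
+# Danish Language object, and the exact classification depends on the
+# tokenizer behind find_word_positions_in_text):
+#
+#   word_appears_standalone("forhold", "Vi har et godt forhold.", da)
+#   # -> {'standalone': True, 'only_in_compounds': False, ...}
+#
+#   word_appears_standalone("forhold", "Boligforholdene er gode.", da)
+#   # -> {'standalone': False, 'only_in_compounds': True,
+#   #     'compound_examples': ['Boligforholdene'], ...}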
+
+
 def find_first_occurrence(target_word, context_text, from_lang):
     """
     Find the first occurrence of a word in context text (fuzzy matching).
-    
+
     Used for generated examples where we're more lenient about matching.
-    
+
     Args:
         target_word (str): The word or phrase to find
         context_text (str): The context text to search
         from_lang (Language): The language object
-    
+
     Returns:
         dict: {
             'found': bool,