Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ jobs:
uv sync
pnpm install
- name: Check Python formatting (black)
run: uv run black --check webapp/ tests/
- name: Check Python formatting & linting (ruff)
run: |
uv run ruff check webapp/ tests/
uv run ruff format --check webapp/ tests/
- name: Check TypeScript formatting (prettier)
run: pnpm format:check
Expand Down
15 changes: 15 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.15.6
hooks:
- id: ruff
args: [--fix]
- id: ruff-format

- repo: https://github.com/pre-commit/mirrors-prettier
rev: v3.5.3
hooks:
- id: prettier
types_or: [javascript, ts, json, vue]
additional_dependencies:
- prettier@3.8.1
14 changes: 8 additions & 6 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,11 @@ pnpm dev # Starts both Flask server + Vite watcher

### Before Committing
```bash
pnpm format # Format TypeScript
uv run black webapp/ tests/ # Format Python
pnpm test # Run TS tests
uv run pytest tests/ # Run Python tests
pnpm format # Format TypeScript
uv run ruff format webapp/ tests/ # Format Python
uv run ruff check webapp/ tests/ # Lint Python
pnpm test # Run TS tests
uv run pytest tests/ # Run Python tests
```

## Important Notes
Expand All @@ -107,8 +108,9 @@ These are tracked by pytest but not blocking - they're data issues, not code iss

### Python

- Black formatter, 100 char line length
- Run `uv run black webapp/ tests/` before committing
- Ruff formatter + linter, 100 char line length
- Run `uv run ruff format webapp/ tests/` and `uv run ruff check webapp/ tests/` before committing
- Pre-commit hooks run both automatically

## Don't

Expand Down
43 changes: 29 additions & 14 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,43 @@ dependencies = [
"flask>=3.1.0",
"flask-cors>=6.0.0",
"gunicorn>=24.0.0",
"Flask-FlatPages>=0.9.0",
"openai>=2.21.0",
"pillow>=12.1.1",
]

[dependency-groups]
dev = [
"pytest>=8.0.0",
"black>=24.0.0",
"ruff>=0.11.0",
"wordfreq>=3.1.1",
]

[tool.black]
[tool.ruff]
line-length = 100
target-version = ['py314']
include = '\.pyi?$'
extend-exclude = '''
/(
\.git
| \.venv
| venv
| node_modules
| webapp/static/dist
)/
'''
target-version = "py311"
exclude = [
".git",
".venv",
"venv",
"node_modules",
"webapp/static/dist",
"webapp/deprecated",
"tests/deprecated",
]

[tool.ruff.lint]
select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"I", # isort
"UP", # pyupgrade
"B", # flake8-bugbear
"SIM", # flake8-simplify
]
ignore = [
"E501", # line too long (formatter handles this)
]

[tool.ruff.lint.isort]
known-first-party = ["webapp"]
31 changes: 15 additions & 16 deletions scripts/analyze_word_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
import argparse
import sys
from collections import defaultdict
from math import log
from pathlib import Path

SCRIPT_DIR = Path(__file__).parent
Expand Down Expand Up @@ -133,7 +132,7 @@ def cmd_char_freq(args):
for char, freq in sorted_chars:
count = int(freq * len(words))
bar = "#" * int(freq * 100)
print(f" {char} {count:>6} {freq*100:>7.1f}% {bar}")
print(f" {char} {count:>6} {freq * 100:>7.1f}% {bar}")

# Threshold analysis
print(f"\n{'Threshold analysis':}")
Expand All @@ -144,7 +143,7 @@ def cmd_char_freq(args):
filtered = [w for w in words if any(c in rare_chars for c in w)]
remaining = len(words) - len(filtered)
print(
f" {threshold*100:>4.0f}% {len(rare_chars):>6} {len(filtered):>8} {remaining:>6}"
f" {threshold * 100:>4.0f}% {len(rare_chars):>6} {len(filtered):>8} {remaining:>6}"
)


Expand Down Expand Up @@ -181,7 +180,7 @@ def cmd_difficult_words(args):
# Filter by threshold if specified
if threshold is not None:
scored = [(w, f, c) for w, f, c in scored if f < threshold]
print(f"Words in {lang} daily list with rarest character below {threshold*100:.0f}%:")
print(f"Words in {lang} daily list with rarest character below {threshold * 100:.0f}%:")
else:
print(f"All words in {lang} daily list sorted by difficulty (hardest first):")

Expand All @@ -191,7 +190,7 @@ def cmd_difficult_words(args):
print(f"{'Word':<12} {'Rarest Char':>12} {'Char Freq %':>12}")
print("-" * 38)
for word, freq, char in scored:
print(f" {word:<10} {char:>8} {freq*100:>7.1f}%")
print(f" {word:<10} {char:>8} {freq * 100:>7.1f}%")

print(f"\nTotal: {len(scored)} words")

Expand Down Expand Up @@ -267,10 +266,10 @@ def cmd_hebrew_suffixes(args):
print(f" → Keep: {keep}, blocklist: {to_block}")
print()

print(f"{'='*50}")
print(f"{'=' * 50}")
print(f"Total groups: {len(groups)}")
print(f"Total words to blocklist: {total_to_blocklist}")
print(f"\nBlocklist additions (copy-paste ready):")
print("\nBlocklist additions (copy-paste ready):")
for w in sorted(blocklist_words):
print(w)

Expand All @@ -296,7 +295,7 @@ def cmd_hebrew_quality(args):
sys.exit(1)

print(f"Hebrew daily word quality analysis ({len(words)} words)")
print(f"Cross-referencing with wordfreq (Wikipedia, Reddit, Google Books, etc.)\n")
print("Cross-referencing with wordfreq (Wikipedia, Reddit, Google Books, etc.)\n")

# Score each word
not_in_wordfreq = []
Expand All @@ -313,16 +312,16 @@ def cmd_hebrew_quality(args):
normal.append((word, zf))

# Report
print(f"Category breakdown:")
print("Category breakdown:")
print(f" Normal (zipf >= 2.0): {len(normal):>5} words")
print(f" Low frequency (zipf < 2.0): {len(low_wordfreq):>5} words")
print(f" Not in wordfreq at all: {len(not_in_wordfreq):>5} words")

if not_in_wordfreq:
print(f"\n{'='*50}")
print(f"\n{'=' * 50}")
print(f"Words NOT found in wordfreq ({len(not_in_wordfreq)} words)")
print(f"These may be proper nouns, obscure, or malformed:")
print(f"{'='*50}")
print("These may be proper nouns, obscure, or malformed:")
print(f"{'=' * 50}")
# Show first N
limit = args.limit or 100
for word, zf in sorted(not_in_wordfreq)[:limit]:
Expand All @@ -331,10 +330,10 @@ def cmd_hebrew_quality(args):
print(f" ... and {len(not_in_wordfreq) - limit} more")

if low_wordfreq:
print(f"\n{'='*50}")
print(f"\n{'=' * 50}")
print(f"Low-frequency words (zipf < 2.0, {len(low_wordfreq)} words)")
print(f"These may be uncommon or domain-specific:")
print(f"{'='*50}")
print("These may be uncommon or domain-specific:")
print(f"{'=' * 50}")
low_wordfreq.sort(key=lambda x: x[1])
limit = args.limit or 50
for word, zf in low_wordfreq[:limit]:
Expand All @@ -348,7 +347,7 @@ def cmd_hebrew_quality(args):
if has_freq:
avg = sum(has_freq) / len(has_freq)
print(
f"\nWordfreq coverage: {len(has_freq)}/{len(words)} words ({100*len(has_freq)/len(words):.1f}%)"
f"\nWordfreq coverage: {len(has_freq)}/{len(words)} words ({100 * len(has_freq) / len(words):.1f}%)"
)
print(f"Average zipf frequency (of found words): {avg:.2f}")

Expand Down
19 changes: 6 additions & 13 deletions scripts/configs.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
"data_dir = \"../webapp/data/\"\n",
"language_codes = [f.split(\"/\")[-1] for f in glob.glob(f\"{data_dir}/languages/*\")]\n",
"\n",
"with open(f\"{data_dir}/languages.json\", \"r\") as f:\n",
"with open(f\"{data_dir}/languages.json\") as f:\n",
" languages = json.load(f)\n",
"\n",
"print(languages[\"en\"])"
Expand Down Expand Up @@ -107,7 +107,7 @@
],
"source": [
"# load english language config\n",
"with open(f\"{data_dir}/languages/en/language_config.json\", \"r\") as f:\n",
"with open(f\"{data_dir}/languages/en/language_config.json\") as f:\n",
" en_config = json.load(f)\n",
"\n",
"f\"{data_dir}/languages/en/language_config.json\""
Expand All @@ -131,7 +131,6 @@
" # load from_language config\n",
" with open(\n",
" f\"{data_dir}languages/{from_language}/language_config.json\",\n",
" \"r\",\n",
" encoding=\"utf-8\",\n",
" ) as f:\n",
" from_language_config = json.load(f)\n",
Expand All @@ -141,7 +140,6 @@
" if os.path.exists(f\"{data_dir}languages/{to_language}/language_config.json\"):\n",
" with open(\n",
" f\"{data_dir}languages/{to_language}/language_config.json\",\n",
" \"r\",\n",
" encoding=\"utf-8\",\n",
" ) as f:\n",
" to_language_config = json.load(f)\n",
Expand All @@ -158,9 +156,7 @@
" language_config[\"meta\"] = {}\n",
" language_config[\"meta\"][\"locale\"] = to_language\n",
" language_config[\"text\"] = {}\n",
" language_config[\"text\"][\"subheader\"] = languages[to_language][\n",
" \"language_name_native\"\n",
" ]\n",
" language_config[\"text\"][\"subheader\"] = languages[to_language][\"language_name_native\"]\n",
"\n",
" # defaults\n",
" language_config[\"language_code_3\"] = \"\"\n",
Expand Down Expand Up @@ -202,9 +198,7 @@
"\n",
" language_config[\"help\"] = {}\n",
" for key in from_language_config[\"help\"]:\n",
" translated_text = translate_text(\n",
" from_language_config[\"help\"][key], to_language\n",
" )\n",
" translated_text = translate_text(from_language_config[\"help\"][key], to_language)\n",
" if key in [\n",
" \"text_2_1\",\n",
" \"text_2_2\",\n",
Expand Down Expand Up @@ -553,14 +547,13 @@
"\n",
"import glob\n",
"import json\n",
"import os\n",
"\n",
"data_dir = \"../webapp/data/\"\n",
"\n",
"for lang in glob.glob(f\"{data_dir}languages/*\"):\n",
" lang = lang.split(\"/\")[-1]\n",
" try:\n",
" with open(f\"{data_dir}/languages/{lang}/language_config.json\", \"r\") as f:\n",
" with open(f\"{data_dir}/languages/{lang}/language_config.json\") as f:\n",
" language_config = json.load(f)\n",
"\n",
" with open(f\"{data_dir}/languages/{lang}/keyboard.json\", \"w\") as f:\n",
Expand Down Expand Up @@ -601,7 +594,7 @@
"for lang in glob.glob(f\"{data_dir}languages/*\"):\n",
" lang = lang.split(\"/\")[-1]\n",
" try:\n",
" with open(f\"{data_dir}/languages/{lang}/{lang}_keyboard.json\", \"r\") as f:\n",
" with open(f\"{data_dir}/languages/{lang}/{lang}_keyboard.json\") as f:\n",
" keyboard = json.load(f)\n",
"\n",
" keyboard = []\n",
Expand Down
4 changes: 1 addition & 3 deletions scripts/curate_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@

import argparse
import datetime
import os
import shutil
from pathlib import Path

# Paths
Expand Down Expand Up @@ -79,7 +77,7 @@ def extract_next_words(lang: str, num_days: int = 365) -> None:
f.write(f"{start_idx + i}: {word}\n")

print(f"Wrote {num_days} words to {output_file}")
print(f"Review the file and identify words to remove.")
print("Review the file and identify words to remove.")


def remove_words(lang: str, words_to_remove: list[str]) -> None:
Expand Down
5 changes: 2 additions & 3 deletions scripts/deprecated/capture_wiktionary_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
"""

import json
import os
import sys
import time
import urllib.parse
Expand All @@ -21,7 +20,7 @@
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT / "webapp"))

from wiktionary import parse_wikt_definition, WIKT_LANG_MAP
from wiktionary import WIKT_LANG_MAP, parse_wikt_definition

LANGUAGES_DIR = PROJECT_ROOT / "webapp" / "data" / "languages"
FIXTURES_DIR = PROJECT_ROOT / "tests" / "fixtures" / "wiktionary"
Expand All @@ -32,7 +31,7 @@ def load_word_list(lang_code):
word_file = LANGUAGES_DIR / lang_code / f"{lang_code}_5words.txt"
if not word_file.exists():
return []
with open(word_file, "r", encoding="utf-8") as f:
with open(word_file, encoding="utf-8") as f:
return [line.strip() for line in f if line.strip()]


Expand Down
15 changes: 9 additions & 6 deletions scripts/hunspellToJSON.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#!/usr/bin/python3.3
import re, argparse, os, gzip, json
import argparse
import gzip
import json
import os
import re


def file_to_list(in_file):
Expand Down Expand Up @@ -393,15 +397,15 @@ def main():

# Open AFF file
try:
aff_file = open(aff_path, "r", encoding="ISO8859-1")
aff_file = open(aff_path, encoding="ISO8859-1")
aff_rules = AFF(aff_file)
aff_file.close()
except IOError:
except OSError:
print(aff_path + " not found")

# Open DIC file
try:
dict_file = open(dict_path, "r", encoding="ISO8859-1")
dict_file = open(dict_path, encoding="ISO8859-1")
dictionary = DICT(
dict_file,
aff_rules,
Expand All @@ -414,7 +418,6 @@ def main():

# Open output file
if args.output:

if args.gzip:
out_file = gzip.open(args.output, "wb")
else:
Expand All @@ -433,7 +436,7 @@ def main():
out_file.close()

dict_file.close()
except IOError:
except OSError:
print(dict_path + " not found")


Expand Down
Loading
Loading