|
| 1 | +"""Admin duplicate detection endpoints. |
| 2 | +
|
| 3 | +Provides three detection strategies: |
| 4 | +1. Same content_hash (exact file duplicates) |
| 5 | +2. Same ISBN |
| 6 | +3. Similar title+author (SequenceMatcher ratio > 0.85) |
| 7 | +
|
| 8 | +Dismissed pairs are persisted in the duplicate_dismissals table and excluded |
| 9 | +from subsequent GET results. |
| 10 | +""" |
| 11 | +import difflib |
| 12 | +import itertools |
| 13 | +import uuid |
| 14 | +from typing import Optional |
| 15 | + |
| 16 | +from fastapi import APIRouter, Depends, HTTPException |
| 17 | +from pydantic import BaseModel |
| 18 | +from sqlalchemy import func |
| 19 | +from sqlalchemy.orm import Session |
| 20 | + |
| 21 | +from backend.core.database import get_db |
| 22 | +from backend.core.security import get_current_user |
| 23 | +from backend.models.book import Book, BookTag |
| 24 | +from backend.models.duplicate_dismissal import DuplicateDismissal |
| 25 | +from backend.models.user import User |
| 26 | +from backend.models.user_book_status import UserBookStatus |
| 27 | +from backend.services.audit import audit |
| 28 | + |
# All endpoints below mount on this router; each depends on _require_admin.
router = APIRouter()
| 30 | + |
| 31 | + |
| 32 | +# ── Schemas ────────────────────────────────────────────────────────────────── |
| 33 | + |
class BookFileOut(BaseModel):
    # One physical file belonging to a book (a book may have several formats).
    id: int
    format: str
    file_size: Optional[int]  # size in bytes; None when unknown

    model_config = {"from_attributes": True}  # allow construction from ORM rows
| 40 | + |
| 41 | + |
class DuplicateBookOut(BaseModel):
    # Flattened view of a Book for the duplicates UI, including its files,
    # tag names, and the ids of the libraries it belongs to.
    id: int
    title: str
    subtitle: Optional[str]
    author: Optional[str]
    isbn: Optional[str]
    cover_path: Optional[str]
    series: Optional[str]
    year: Optional[int]
    files: list[BookFileOut]
    tags: list[str]          # tag names only (source dropped)
    library_ids: list[int]

    model_config = {"from_attributes": True}  # allow construction from ORM rows
| 56 | + |
| 57 | + |
class DuplicateGroup(BaseModel):
    # One cluster of suspected duplicates found by a single strategy.
    group_id: str  # random UUID, regenerated per request (not persisted)
    match_reason: str  # "content_hash" | "isbn" | "similar_title"
    books: list[DuplicateBookOut]
| 62 | + |
| 63 | + |
class DuplicatesResponse(BaseModel):
    # Top-level payload for GET /admin/duplicates.
    groups: list[DuplicateGroup]
| 66 | + |
| 67 | + |
class MergeBody(BaseModel):
    # Request body for POST /admin/duplicates/merge.
    keep_id: int          # surviving book
    remove_ids: list[int] # books folded into keep_id and then deleted
| 71 | + |
| 72 | + |
class DismissBody(BaseModel):
    # Request body for POST /admin/duplicates/dismiss; all pairwise
    # combinations of these ids are marked as "not duplicates".
    book_ids: list[int]
| 75 | + |
| 76 | + |
| 77 | +# ── Helpers ─────────────────────────────────────────────────────────────────── |
| 78 | + |
def _require_admin(current_user: User = Depends(get_current_user)) -> User:
    """FastAPI dependency: pass through the authenticated user, 403 otherwise."""
    if current_user.is_admin:
        return current_user
    raise HTTPException(status_code=403, detail="Admin access required")
| 83 | + |
| 84 | + |
def _book_to_out(book: Book) -> DuplicateBookOut:
    """Serialize an ORM Book — with its files, tag names and library ids —
    into the API schema used by the duplicates endpoints."""
    file_entries = [
        BookFileOut(id=f.id, format=f.format, file_size=f.file_size)
        for f in book.files
    ]
    payload = {
        "id": book.id,
        "title": book.title,
        "subtitle": book.subtitle,
        "author": book.author,
        "isbn": book.isbn,
        "cover_path": book.cover_path,
        "series": book.series,
        "year": book.year,
        "files": file_entries,
        "tags": [t.tag for t in book.tags],
        "library_ids": book.library_ids,
    }
    return DuplicateBookOut(**payload)
| 99 | + |
| 100 | + |
def _dismissed_set(db: Session) -> set[frozenset[int]]:
    """Return every dismissed pair as a frozenset, for O(1) membership tests."""
    pairs: set[frozenset[int]] = set()
    for row in db.query(DuplicateDismissal):
        pairs.add(frozenset((row.book_id_a, row.book_id_b)))
    return pairs
| 105 | + |
| 106 | + |
| 107 | +def _deduplicate_groups( |
| 108 | + groups: list[list[Book]], |
| 109 | + dismissed: set[frozenset[int]], |
| 110 | +) -> list[list[Book]]: |
| 111 | + """Remove groups where every pair of books has been dismissed.""" |
| 112 | + result = [] |
| 113 | + for group in groups: |
| 114 | + ids = [b.id for b in group] |
| 115 | + all_dismissed = all( |
| 116 | + frozenset([a, b]) in dismissed |
| 117 | + for a, b in itertools.combinations(ids, 2) |
| 118 | + ) |
| 119 | + if not all_dismissed: |
| 120 | + result.append(group) |
| 121 | + return result |
| 122 | + |
| 123 | + |
| 124 | +# ── GET /admin/duplicates ───────────────────────────────────────────────────── |
| 125 | + |
def _exact_dupe_groups(db: Session, column, dismissed: set[frozenset[int]]) -> list[list[Book]]:
    """Group books sharing the same non-null value of *column* (content_hash or isbn).

    Returns only groups of two or more books, with fully-dismissed groups
    filtered out. Factored out of get_duplicates, which previously duplicated
    this query logic for both exact strategies.
    """
    groups: list[list[Book]] = []
    dupe_values = (
        db.query(column)
        .filter(column.isnot(None))
        .group_by(column)
        .having(func.count(Book.id) > 1)
        .all()
    )
    for (value,) in dupe_values:
        books = db.query(Book).filter(column == value).all()
        if len(books) > 1:
            groups.append(books)
    return _deduplicate_groups(groups, dismissed)


def _similar_title_groups(
    db: Session,
    dismissed: set[frozenset[int]],
    already_paired: set[frozenset[int]],
) -> list[list[Book]]:
    """Cluster books with the same author and near-identical titles.

    Books are bucketed by lower-cased author so the pairwise SequenceMatcher
    comparison stays tractable; pairs scoring > 0.85 are unioned into
    clusters. Pairs already dismissed, or already reported by an exact
    strategy, are skipped.
    """
    title_dupes: list[list[Book]] = []

    author_buckets: dict[str, list[Book]] = {}
    for book in db.query(Book).all():
        author_buckets.setdefault((book.author or "").strip().lower(), []).append(book)

    for bucket in author_buckets.values():
        if len(bucket) < 2:
            continue

        clusters: list[set[int]] = []
        for a, b in itertools.combinations(bucket, 2):
            pair = frozenset([a.id, b.id])
            if pair in dismissed or pair in already_paired:
                continue
            ratio = difflib.SequenceMatcher(
                None,
                a.title.lower().strip(),
                b.title.lower().strip(),
            ).ratio()
            if ratio <= 0.85:  # similarity threshold
                continue
            # Union every cluster the pair touches. This fixes a bug where a
            # pair bridging two existing clusters was merged into only the
            # first one found, leaving the same book in overlapping groups.
            merged = {a.id, b.id}
            remaining: list[set[int]] = []
            for cluster in clusters:
                if cluster & merged:
                    merged |= cluster
                else:
                    remaining.append(cluster)
            remaining.append(merged)
            clusters = remaining

        by_id = {b.id: b for b in bucket}
        for cluster in clusters:
            # sorted() makes the response ordering deterministic.
            cluster_books = [by_id[bid] for bid in sorted(cluster)]
            if len(cluster_books) > 1:
                title_dupes.append(cluster_books)

    return title_dupes


@router.get("/admin/duplicates", response_model=DuplicatesResponse)
def get_duplicates(
    db: Session = Depends(get_db),
    current_user: User = Depends(_require_admin),
):
    """Return duplicate-book groups found by the three detection strategies.

    Strategies run in order (content_hash, isbn, similar_title); pairs
    already reported by an exact strategy are excluded from the fuzzy title
    pass, and dismissed pairs are excluded everywhere.
    """
    dismissed = _dismissed_set(db)

    hash_dupes = _exact_dupe_groups(db, Book.content_hash, dismissed)
    isbn_dupes = _exact_dupe_groups(db, Book.isbn, dismissed)

    # Pairs covered by the exact strategies, so the fuzzy pass skips them.
    already_paired: set[frozenset[int]] = set()
    for group in hash_dupes + isbn_dupes:
        ids = [b.id for b in group]
        already_paired.update(frozenset(p) for p in itertools.combinations(ids, 2))

    title_dupes = _similar_title_groups(db, dismissed, already_paired)

    groups: list[DuplicateGroup] = []
    for reason, dupes in (
        ("content_hash", hash_dupes),
        ("isbn", isbn_dupes),
        ("similar_title", title_dupes),
    ):
        for group in dupes:
            groups.append(DuplicateGroup(
                group_id=str(uuid.uuid4()),
                match_reason=reason,
                books=[_book_to_out(b) for b in group],
            ))

    return DuplicatesResponse(groups=groups)
| 241 | + |
| 242 | + |
| 243 | +# ── POST /admin/duplicates/merge ────────────────────────────────────────────── |
| 244 | + |
@router.post("/admin/duplicates/merge")
def merge_duplicates(
    body: MergeBody,
    db: Session = Depends(get_db),
    current_user: User = Depends(_require_admin),
):
    """Merge duplicate books into the surviving ``keep_id`` record.

    For each book in ``remove_ids``: its files are reassigned to the kept
    book, missing tags and library memberships are copied over, per-user
    reading statuses are reconciled (highest progress wins), and the book row
    is deleted. Ids in ``remove_ids`` that don't exist are skipped silently.

    Returns ``{"merged": <count>, "kept_id": <keep_id>}``.

    Raises:
        HTTPException: 404 if ``keep_id`` does not exist; 400 if
            ``remove_ids`` is empty or contains ``keep_id``.
    """
    keep = db.get(Book, body.keep_id)
    if not keep:
        raise HTTPException(status_code=404, detail=f"Book {body.keep_id} not found")

    if not body.remove_ids:
        raise HTTPException(status_code=400, detail="remove_ids must not be empty")

    if body.keep_id in body.remove_ids:
        raise HTTPException(status_code=400, detail="keep_id must not appear in remove_ids")

    merged_count = 0

    for remove_id in body.remove_ids:
        remove = db.get(Book, remove_id)
        if not remove:
            # Best-effort batch: unknown ids are skipped, not treated as errors.
            continue

        # Move BookFile rows to keep
        for bf in list(remove.files):
            bf.book_id = keep.id
        db.flush()

        # Copy tags not already present on keep
        existing_tags = {t.tag for t in keep.tags}
        for tag in list(remove.tags):
            if tag.tag not in existing_tags:
                new_tag = BookTag(book_id=keep.id, tag=tag.tag, source=tag.source)
                db.add(new_tag)
                existing_tags.add(tag.tag)

        # Copy library memberships not already present
        keep_lib_ids = {lib.id for lib in keep.libraries}
        for lib in list(remove.libraries):
            if lib.id not in keep_lib_ids:
                keep.libraries.append(lib)
                keep_lib_ids.add(lib.id)

        # Handle UserBookStatus: keep the entry with higher progress_pct
        remove_statuses = db.query(UserBookStatus).filter(UserBookStatus.book_id == remove_id).all()
        for remove_status in remove_statuses:
            keep_status = (
                db.query(UserBookStatus)
                .filter_by(user_id=remove_status.user_id, book_id=body.keep_id)
                .first()
            )
            if keep_status is None:
                # No existing status for this user on keep — reassign
                remove_status.book_id = body.keep_id
            else:
                # Both exist — keep the one with higher progress_pct
                remove_pct = remove_status.progress_pct or 0.0
                keep_pct = keep_status.progress_pct or 0.0
                if remove_pct > keep_pct:
                    keep_status.progress_pct = remove_pct
                    keep_status.status = remove_status.status
                    keep_status.cfi = remove_status.cfi
                # Remove the duplicate status row
                db.delete(remove_status)

        # Flush the reassignments above before deleting the row they pointed at.
        db.flush()
        db.delete(remove)
        db.flush()
        merged_count += 1

    db.commit()
    db.refresh(keep)

    # NOTE(review): audit() is called after commit — presumably it manages its
    # own transaction; confirm. Also, duplicate_dismissals rows referencing the
    # removed ids are left behind; they no longer match any live pair, but
    # could be pruned here.
    audit(
        db,
        "books.duplicates_merged",
        user_id=current_user.id,
        username=current_user.username,
        resource_type="book",
        resource_id=keep.id,
        resource_title=keep.title,
        details={"kept_id": body.keep_id, "removed_ids": body.remove_ids},
    )

    return {"merged": merged_count, "kept_id": body.keep_id}
| 330 | + |
| 331 | + |
| 332 | +# ── POST /admin/duplicates/dismiss ──────────────────────────────────────────── |
| 333 | + |
@router.post("/admin/duplicates/dismiss")
def dismiss_duplicates(
    body: DismissBody,
    db: Session = Depends(get_db),
    current_user: User = Depends(_require_admin),
):
    """Persist every pair within the given group as a dismissed duplicate.

    Pairs are stored with ``book_id_a < book_id_b`` so each unordered pair has
    one canonical row; pairs already stored are skipped.

    Returns ``{"dismissed": <number of new rows created>}``.

    Raises:
        HTTPException: 400 if fewer than two *distinct* book ids are supplied.
    """
    # De-duplicate first: repeated ids (e.g. [5, 5]) previously passed the
    # length check and stored a degenerate self-pair (book_id_a == book_id_b).
    unique_ids = sorted(set(body.book_ids))
    if len(unique_ids) < 2:
        raise HTTPException(status_code=400, detail="At least two book_ids required to dismiss a group")

    # Load existing dismissals among these ids in one query instead of one
    # round-trip per pair. Stored rows are ordered (a < b), matching the
    # sorted combinations below.
    existing_pairs = {
        (row.book_id_a, row.book_id_b)
        for row in (
            db.query(DuplicateDismissal)
            .filter(DuplicateDismissal.book_id_a.in_(unique_ids))
            .filter(DuplicateDismissal.book_id_b.in_(unique_ids))
            .all()
        )
    }

    stored = 0
    for a, b in itertools.combinations(unique_ids, 2):
        if (a, b) in existing_pairs:
            continue
        db.add(DuplicateDismissal(book_id_a=a, book_id_b=b))
        stored += 1

    db.commit()
    return {"dismissed": stored}
0 commit comments