Skip to content

Commit 32ccddc

Browse files
bndct-devops and claude committed
feat: stats fix, duplicate detection, keyboard shortcuts, web reading sessions
- Fix books-finished chart grouping by date with correct tooltip - Add admin Duplicates tab (content hash, ISBN, fuzzy title matching + merge/dismiss) - Add keyboard navigation (j/k, Enter, Escape, ? help modal) on dashboard and detail page - Track web reader sessions in ReadingSession table for stats Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 3473c0a commit 32ccddc

11 files changed

Lines changed: 1073 additions & 79 deletions

File tree

backend/api/admin_duplicates.py

Lines changed: 357 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,357 @@
1+
"""Admin duplicate detection endpoints.
2+
3+
Provides three detection strategies:
4+
1. Same content_hash (exact file duplicates)
5+
2. Same ISBN
6+
3. Similar title+author (SequenceMatcher ratio > 0.85)
7+
8+
Dismissed pairs are persisted in the duplicate_dismissals table and excluded
9+
from subsequent GET results.
10+
"""
11+
import difflib
12+
import itertools
13+
import uuid
14+
from typing import Optional
15+
16+
from fastapi import APIRouter, Depends, HTTPException
17+
from pydantic import BaseModel
18+
from sqlalchemy import func
19+
from sqlalchemy.orm import Session
20+
21+
from backend.core.database import get_db
22+
from backend.core.security import get_current_user
23+
from backend.models.book import Book, BookTag
24+
from backend.models.duplicate_dismissal import DuplicateDismissal
25+
from backend.models.user import User
26+
from backend.models.user_book_status import UserBookStatus
27+
from backend.services.audit import audit
28+
29+
router = APIRouter()
30+
31+
32+
# ── Schemas ──────────────────────────────────────────────────────────────────
33+
34+
class BookFileOut(BaseModel):
    """One file attached to a book, as shown in the duplicates UI."""

    id: int
    format: str
    # File size; None when unknown. Presumably bytes — TODO confirm against BookFile model.
    file_size: Optional[int]

    # Allow construction directly from ORM BookFile rows.
    model_config = {"from_attributes": True}
class DuplicateBookOut(BaseModel):
    """Full book card rendered inside a duplicate group in the admin UI."""

    id: int
    title: str
    subtitle: Optional[str]
    author: Optional[str]
    isbn: Optional[str]
    cover_path: Optional[str]
    series: Optional[str]
    year: Optional[int]
    files: list[BookFileOut]  # attached files (format + size)
    tags: list[str]  # flattened tag strings
    library_ids: list[int]  # ids of libraries this book belongs to

    # Allow construction directly from ORM Book rows.
    model_config = {"from_attributes": True}
class DuplicateGroup(BaseModel):
    """One cluster of books suspected to be duplicates of each other."""

    # Transient UUID — regenerated on every GET, not persisted anywhere.
    group_id: str
    match_reason: str  # "content_hash" | "isbn" | "similar_title"
    books: list[DuplicateBookOut]
class DuplicatesResponse(BaseModel):
    """Response envelope for GET /admin/duplicates."""

    groups: list[DuplicateGroup]
class MergeBody(BaseModel):
    """Request body for POST /admin/duplicates/merge."""

    keep_id: int  # the surviving book
    remove_ids: list[int]  # books folded into keep_id and then deleted
class DismissBody(BaseModel):
    """Request body for POST /admin/duplicates/dismiss."""

    # Every pair within this group is persisted as "not a duplicate".
    book_ids: list[int]
# ── Helpers ───────────────────────────────────────────────────────────────────
78+
79+
def _require_admin(current_user: User = Depends(get_current_user)) -> User:
    """FastAPI dependency: pass admins through, reject everyone else with 403."""
    if current_user.is_admin:
        return current_user
    raise HTTPException(status_code=403, detail="Admin access required")
def _book_to_out(book: Book) -> DuplicateBookOut:
    """Serialize a Book ORM row into the duplicate-listing schema."""
    file_rows = [
        BookFileOut(id=f.id, format=f.format, file_size=f.file_size)
        for f in book.files
    ]
    return DuplicateBookOut(
        id=book.id,
        title=book.title,
        subtitle=book.subtitle,
        author=book.author,
        isbn=book.isbn,
        cover_path=book.cover_path,
        series=book.series,
        year=book.year,
        files=file_rows,
        tags=[t.tag for t in book.tags],
        library_ids=book.library_ids,
    )
def _dismissed_set(db: Session) -> set[frozenset[int]]:
    """Return every dismissed pair as a frozenset for O(1) membership tests."""
    return {
        frozenset((row.book_id_a, row.book_id_b))
        for row in db.query(DuplicateDismissal).all()
    }
def _deduplicate_groups(
108+
groups: list[list[Book]],
109+
dismissed: set[frozenset[int]],
110+
) -> list[list[Book]]:
111+
"""Remove groups where every pair of books has been dismissed."""
112+
result = []
113+
for group in groups:
114+
ids = [b.id for b in group]
115+
all_dismissed = all(
116+
frozenset([a, b]) in dismissed
117+
for a, b in itertools.combinations(ids, 2)
118+
)
119+
if not all_dismissed:
120+
result.append(group)
121+
return result
122+
123+
124+
# ── GET /admin/duplicates ─────────────────────────────────────────────────────
125+
126+
@router.get("/admin/duplicates", response_model=DuplicatesResponse)
def get_duplicates(
    db: Session = Depends(get_db),
    current_user: User = Depends(_require_admin),
):
    """Return groups of likely-duplicate books.

    Three detection strategies run in order:

    1. ``content_hash`` equality (exact file duplicates).
    2. ISBN equality.
    3. Fuzzy title match (``difflib.SequenceMatcher`` ratio > 0.85) between
       books sharing the same case-insensitive author string.

    Groups in which every pair has been dismissed are filtered out, and the
    fuzzy strategy skips pairs already reported by an exact strategy.
    """
    dismissed = _dismissed_set(db)

    # ── Strategy 1: same content_hash ────────────────────────────────────
    hash_dupes: list[list[Book]] = []
    hash_counts = (
        db.query(Book.content_hash, func.count(Book.id).label("cnt"))
        .filter(Book.content_hash.isnot(None))
        .group_by(Book.content_hash)
        .having(func.count(Book.id) > 1)
        .all()
    )
    for row in hash_counts:
        books = db.query(Book).filter(Book.content_hash == row.content_hash).all()
        if len(books) > 1:
            hash_dupes.append(books)

    hash_dupes = _deduplicate_groups(hash_dupes, dismissed)

    # ── Strategy 2: same ISBN ────────────────────────────────────────────
    isbn_dupes: list[list[Book]] = []
    isbn_counts = (
        db.query(Book.isbn, func.count(Book.id).label("cnt"))
        .filter(Book.isbn.isnot(None))
        .group_by(Book.isbn)
        .having(func.count(Book.id) > 1)
        .all()
    )
    for row in isbn_counts:
        books = db.query(Book).filter(Book.isbn == row.isbn).all()
        if len(books) > 1:
            isbn_dupes.append(books)

    isbn_dupes = _deduplicate_groups(isbn_dupes, dismissed)

    # Pairs already reported by an exact strategy are skipped by strategy 3.
    already_paired: set[frozenset[int]] = set()
    for group in hash_dupes + isbn_dupes:
        ids = [b.id for b in group]
        for a, b in itertools.combinations(ids, 2):
            already_paired.add(frozenset([a, b]))

    # ── Strategy 3: similar title + same author ──────────────────────────
    title_dupes: list[list[Book]] = []
    all_books = db.query(Book).all()

    # Bucket by author (case-insensitive) so the O(n²) fuzzy comparison runs
    # only within each author's books, not across the whole library.
    author_groups: dict[str, list[Book]] = {}
    for book in all_books:
        key = (book.author or "").strip().lower()
        author_groups.setdefault(key, []).append(book)

    # FIX: iterate .values() — the original used .items() and discarded the key.
    for group in author_groups.values():
        if len(group) < 2:
            continue
        # FIX: the original also kept a `matched` set that was written but
        # never read; it has been removed as dead code.
        pair_clusters: list[set[int]] = []

        for i, a in enumerate(group):
            for b in group[i + 1:]:
                pair = frozenset([a.id, b.id])
                if pair in dismissed or pair in already_paired:
                    continue
                ratio = difflib.SequenceMatcher(
                    None,
                    a.title.lower().strip(),
                    b.title.lower().strip(),
                ).ratio()
                if ratio <= 0.85:
                    continue
                # FIX: union every cluster this pair touches. The original
                # added the pair to only the *first* cluster containing a or
                # b, so two clusters bridged by a pair were never merged and
                # a book could be reported in two separate groups.
                touching = [c for c in pair_clusters if a.id in c or b.id in c]
                merged_cluster = {a.id, b.id}
                for cluster in touching:
                    merged_cluster |= cluster
                    pair_clusters.remove(cluster)
                pair_clusters.append(merged_cluster)

        # Materialize Book objects for each id cluster.
        book_index = {b.id: b for b in group}
        for cluster in pair_clusters:
            cluster_books = [book_index[bid] for bid in cluster if bid in book_index]
            if len(cluster_books) > 1:
                title_dupes.append(cluster_books)

    # ── Build response ───────────────────────────────────────────────────
    groups: list[DuplicateGroup] = []
    for reason, found in (
        ("content_hash", hash_dupes),
        ("isbn", isbn_dupes),
        ("similar_title", title_dupes),
    ):
        for group in found:
            groups.append(DuplicateGroup(
                group_id=str(uuid.uuid4()),
                match_reason=reason,
                books=[_book_to_out(b) for b in group],
            ))

    return DuplicatesResponse(groups=groups)
# ── POST /admin/duplicates/merge ──────────────────────────────────────────────
244+
245+
@router.post("/admin/duplicates/merge")
def merge_duplicates(
    body: MergeBody,
    db: Session = Depends(get_db),
    current_user: User = Depends(_require_admin),
):
    """Merge duplicate books into one surviving record.

    For each book in ``body.remove_ids``: its files are re-pointed at
    ``body.keep_id``; tags and library memberships not already present on the
    kept book are copied; per-user reading status is moved (keeping the entry
    with the higher ``progress_pct`` when both books have one); the source
    book is then deleted. Unknown ``remove_ids`` are silently skipped
    (best-effort merge).

    Returns ``{"merged": <count>, "kept_id": <id>}``.

    Raises:
        HTTPException 404: ``keep_id`` does not exist.
        HTTPException 400: ``remove_ids`` is empty or contains ``keep_id``.
    """
    keep = db.get(Book, body.keep_id)
    if not keep:
        raise HTTPException(status_code=404, detail=f"Book {body.keep_id} not found")

    if not body.remove_ids:
        raise HTTPException(status_code=400, detail="remove_ids must not be empty")

    if body.keep_id in body.remove_ids:
        raise HTTPException(status_code=400, detail="keep_id must not appear in remove_ids")

    merged_count = 0

    for remove_id in body.remove_ids:
        remove = db.get(Book, remove_id)
        if not remove:
            # Best-effort: skip ids that were already deleted or never existed.
            continue

        # Move BookFile rows to keep
        for bf in list(remove.files):
            bf.book_id = keep.id
        db.flush()

        # Copy tags not already present on keep
        existing_tags = {t.tag for t in keep.tags}
        for tag in list(remove.tags):
            if tag.tag not in existing_tags:
                new_tag = BookTag(book_id=keep.id, tag=tag.tag, source=tag.source)
                db.add(new_tag)
                existing_tags.add(tag.tag)

        # Copy library memberships not already present
        keep_lib_ids = {lib.id for lib in keep.libraries}
        for lib in list(remove.libraries):
            if lib.id not in keep_lib_ids:
                keep.libraries.append(lib)
                keep_lib_ids.add(lib.id)

        # Handle UserBookStatus: keep the entry with higher progress_pct
        remove_statuses = db.query(UserBookStatus).filter(UserBookStatus.book_id == remove_id).all()
        for remove_status in remove_statuses:
            keep_status = (
                db.query(UserBookStatus)
                .filter_by(user_id=remove_status.user_id, book_id=body.keep_id)
                .first()
            )
            if keep_status is None:
                # No existing status for this user on keep — reassign
                remove_status.book_id = body.keep_id
            else:
                # Both exist — keep the one with higher progress_pct
                remove_pct = remove_status.progress_pct or 0.0
                keep_pct = keep_status.progress_pct or 0.0
                if remove_pct > keep_pct:
                    keep_status.progress_pct = remove_pct
                    keep_status.status = remove_status.status
                    keep_status.cfi = remove_status.cfi
                # Remove the duplicate status row
                db.delete(remove_status)

        # Flush moved rows before deleting the source book so FK references
        # to it are already re-pointed when the DELETE is emitted.
        db.flush()
        db.delete(remove)
        db.flush()
        merged_count += 1

    db.commit()
    db.refresh(keep)

    # NOTE(review): the audit record is written after the commit above —
    # presumably audit() commits on its own; verify, otherwise a failure here
    # would lose the audit trail while the merge itself persists.
    audit(
        db,
        "books.duplicates_merged",
        user_id=current_user.id,
        username=current_user.username,
        resource_type="book",
        resource_id=keep.id,
        resource_title=keep.title,
        details={"kept_id": body.keep_id, "removed_ids": body.remove_ids},
    )

    return {"merged": merged_count, "kept_id": body.keep_id}
# ── POST /admin/duplicates/dismiss ────────────────────────────────────────────
333+
334+
@router.post("/admin/duplicates/dismiss")
def dismiss_duplicates(
    body: DismissBody,
    db: Session = Depends(get_db),
    current_user: User = Depends(_require_admin),
):
    """Persist every pair within the given group as "not a duplicate".

    Pairs are stored with ``book_id_a < book_id_b`` (ids are sorted before
    pairing) so later lookups are order-independent; pairs already on file
    are left untouched. Returns ``{"dismissed": <new pairs stored>}``.
    """
    if len(body.book_ids) < 2:
        raise HTTPException(status_code=400, detail="At least two book_ids required to dismiss a group")

    stored = 0
    for low, high in itertools.combinations(sorted(body.book_ids), 2):
        prior = (
            db.query(DuplicateDismissal)
            .filter_by(book_id_a=low, book_id_b=high)
            .first()
        )
        if prior is not None:
            # This pair was dismissed in an earlier request.
            continue
        db.add(DuplicateDismissal(book_id_a=low, book_id_b=high))
        stored += 1

    db.commit()
    return {"dismissed": stored}

0 commit comments

Comments
 (0)