diff --git a/reader3.py b/reader3.py
index d0b9d3f..f9bee9d 100644
--- a/reader3.py
+++ b/reader3.py
@@ -192,7 +192,7 @@ def process_epub(epub_path: str, output_dir: str) -> Book:
     image_map = {} # Key: internal_path, Value: local_relative_path
     for item in book.get_items():
-        if item.get_type() == ebooklib.ITEM_IMAGE:
+        if item.get_type() in (ebooklib.ITEM_IMAGE, ebooklib.ITEM_COVER):
             # Normalize filename
             original_fname = os.path.basename(item.get_name())
             # Sanitize filename for OS
@@ -216,61 +216,140 @@ def process_epub(epub_path: str, output_dir: str) -> Book:
         print("Warning: Empty TOC, building fallback from Spine...")
         toc_structure = get_fallback_toc(book)

-    # 6. Process Content (Spine-based to preserve HTML validity)
+    # 6. Determine reading order (Spine, TOC, or manifest fallback)
+    def normalize_href(href: str) -> str:
+        if not href:
+            return ""
+        value = href.strip()
+        if value.startswith("./"):
+            value = value[2:]
+        try:
+            value = unquote(value)
+        except Exception:
+            pass
+        return value
+
+    def is_document_item(item) -> bool:
+        name = (item.get_name() or "").lower()
+        media_type = (getattr(item, "media_type", None) or "").lower()
+        if item.get_type() == ebooklib.ITEM_DOCUMENT:
+            return True
+        if media_type in ("text/html", "application/xhtml+xml"):
+            return True
+        return name.endswith((".html", ".xhtml", ".htm"))
+
+    doc_items = [item for item in book.get_items() if is_document_item(item)]
+    doc_by_full = {item.get_name(): item for item in doc_items}
+    doc_by_base = {}
+    for item in doc_items:
+        base = os.path.basename(item.get_name())
+        if base not in doc_by_base:
+            doc_by_base[base] = item
+        else:
+            # Avoid ambiguous basename matches.
+            doc_by_base[base] = None
+
+    def resolve_doc_item(href: str):
+        cleaned = normalize_href(href)
+        if not cleaned:
+            return None
+        if cleaned in doc_by_full:
+            return doc_by_full[cleaned]
+        base = os.path.basename(cleaned)
+        if base in doc_by_full:
+            return doc_by_full[base]
+        if base in doc_by_base and doc_by_base[base] is not None:
+            return doc_by_base[base]
+        return None
+
+    toc_files = []
+
+    def collect_toc_files(entries):
+        for entry in entries:
+            if entry.file_href:
+                toc_files.append(entry.file_href)
+            if entry.children:
+                collect_toc_files(entry.children)
+
+    collect_toc_files(toc_structure)
+
+    ordered_items = []
+    seen_ids = set()
+
+    if toc_files:
+        for href in toc_files:
+            item = resolve_doc_item(href)
+            if item and item.get_id() not in seen_ids:
+                ordered_items.append(item)
+                seen_ids.add(item.get_id())
+
+    if not ordered_items:
+        # Use spine order if present.
+        for item_id, _linear in book.spine:
+            item = book.get_item_with_id(item_id)
+            if item and item.get_type() == ebooklib.ITEM_DOCUMENT:
+                if item.get_id() not in seen_ids:
+                    ordered_items.append(item)
+                    seen_ids.add(item.get_id())
+
+    if not ordered_items:
+        ordered_items = doc_items[:]
+        seen_ids = {item.get_id() for item in ordered_items}
+
+    # Append any remaining document items not referenced in TOC/spine.
+    for item in doc_items:
+        if item.get_id() not in seen_ids:
+            ordered_items.append(item)
+            seen_ids.add(item.get_id())
+
+    # 7. Process Content (Ordered documents to preserve HTML validity)
     print("Processing chapters...")
     spine_chapters = []

-    # We iterate over the spine (linear reading order)
-    for i, spine_item in enumerate(book.spine):
-        item_id, linear = spine_item
-        item = book.get_item_with_id(item_id)
-
-        if not item:
-            continue
-
-        if item.get_type() == ebooklib.ITEM_DOCUMENT:
-            # Raw content
-            raw_content = item.get_content().decode('utf-8', errors='ignore')
-            soup = BeautifulSoup(raw_content, 'html.parser')
-
-            # A. Fix Images
-            for img in soup.find_all('img'):
-                src = img.get('src', '')
-                if not src: continue
-
-                # Decode URL (part01/image%201.jpg -> part01/image 1.jpg)
-                src_decoded = unquote(src)
-                filename = os.path.basename(src_decoded)
-
-                # Try to find in map
-                if src_decoded in image_map:
-                    img['src'] = image_map[src_decoded]
-                elif filename in image_map:
-                    img['src'] = image_map[filename]
-
-            # B. Clean HTML
-            soup = clean_html_content(soup)
-
-            # C. Extract Body Content only
-            body = soup.find('body')
-            if body:
-                # Extract inner HTML of body
-                final_html = "".join([str(x) for x in body.contents])
-            else:
-                final_html = str(soup)
-
-            # D. Create Object
-            chapter = ChapterContent(
-                id=item_id,
-                href=item.get_name(), # Important: This links TOC to Content
-                title=f"Section {i+1}", # Fallback, real titles come from TOC
-                content=final_html,
-                text=extract_plain_text(soup),
-                order=i
-            )
-            spine_chapters.append(chapter)
-
-    # 7. Final Assembly
+    for i, item in enumerate(ordered_items):
+        # Raw content
+        raw_content = item.get_content().decode('utf-8', errors='ignore')
+        soup = BeautifulSoup(raw_content, 'html.parser')
+
+        # A. Fix Images
+        for img in soup.find_all('img'):
+            src = img.get('src', '')
+            if not src:
+                continue
+
+            # Decode URL (part01/image%201.jpg -> part01/image 1.jpg)
+            src_decoded = unquote(src)
+            filename = os.path.basename(src_decoded)
+
+            # Try to find in map
+            if src_decoded in image_map:
+                img['src'] = image_map[src_decoded]
+            elif filename in image_map:
+                img['src'] = image_map[filename]
+
+        # B. Clean HTML
+        soup = clean_html_content(soup)
+
+        # C. Extract Body Content only
+        body = soup.find('body')
+        if body:
+            # Extract inner HTML of body
+            final_html = "".join([str(x) for x in body.contents])
+        else:
+            final_html = str(soup)
+
+        # D. Create Object
+        chapter = ChapterContent(
+            id=item.get_id(),
+            href=item.get_name(), # Important: This links TOC to Content
+            title=f"Section {i+1}", # Fallback, real titles come from TOC
+            content=final_html,
+            text=extract_plain_text(soup),
+            order=i
+        )
+        spine_chapters.append(chapter)
+
+    # 8. Final Assembly
     final_book = Book(
         metadata=metadata,
         spine=spine_chapters,