Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion reader3.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
import os
import pickle
import shutil
import tarfile
import zipfile
import io
from dataclasses import dataclass, field
from typing import List, Dict, Optional, Any
from datetime import datetime
Expand Down Expand Up @@ -69,6 +72,31 @@ class Book:

# --- Utilities ---

def _convert_tar_to_zip_in_memory(tar_path: str) -> io.BytesIO:
"""
Converts a .tar (or compressed tar) file to a .zip file in memory.
Ebooklib requires a ZIP-format EPUB.
"""
zip_buffer = io.BytesIO()

with tarfile.open(tar_path, "r:*") as tar:
members = tar.getmembers()
# Find the root directory by locating the 'mimetype' file
mimetype_member = next((m for m in members if os.path.basename(m.name) == 'mimetype'), None)
root = os.path.dirname(mimetype_member.name) + '/' if mimetype_member and os.path.dirname(mimetype_member.name) else ""

with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
for member in members:
if member.isfile():
f = tar.extractfile(member)
if f:
arcname = member.name[len(root):] if member.name.startswith(root) else member.name
zip_file.writestr(arcname, f.read())

zip_buffer.seek(0)
return zip_buffer


def clean_html_content(soup: BeautifulSoup) -> BeautifulSoup:

# Remove dangerous/useless tags
Expand Down Expand Up @@ -176,7 +204,15 @@ def process_epub(epub_path: str, output_dir: str) -> Book:

# 1. Load Book
print(f"Loading {epub_path}...")
book = epub.read_epub(epub_path)

# Handle TAR files by converting them to ZIP in memory
if tarfile.is_tarfile(epub_path):
print(f"Detected TAR format for {epub_path}, converting to ZIP in memory...")
epub_resource = _convert_tar_to_zip_in_memory(epub_path)
else:
epub_resource = epub_path

book = epub.read_epub(epub_resource)

# 2. Extract Metadata
metadata = extract_metadata_robust(book)
Expand Down