digitalmethodsinitiative · sal-uva · Mar 12, 2026 · Mar 9, 2026 · Mar 9, 2026 · Mar 9, 2026
diff --git a/common/assets/llms.json b/common/assets/llms.json
@@ -10,9 +10,9 @@
         "model_card": "",
         "provider": ""
     },
-    "gpt-5.2": {
-        "name": "[OpenAI] GPT-5.2",
-        "model_card": "https://platform.openai.com/docs/models/gpt-5.2",
+    "gpt-5.4": {
+        "name": "[OpenAI] GPT-5.4",
+        "model_card": "https://platform.openai.com/docs/models/gpt-5.4",
         "provider": "openai"
     },
     "gpt-5-mini": {
@@ -25,9 +25,9 @@
         "model_card": "https://platform.openai.com/docs/models/gpt-5-nano",
         "provider": "openai"
     },
-    "gpt-5.2-pro": {
-        "name": "[OpenAI] GPT-5.2 Pro",
-        "model_card": "https://platform.openai.com/docs/models/gpt-5.2-pro",
+    "gpt-5.4-pro": {
+        "name": "[OpenAI] GPT-5.4 Pro",
+        "model_card": "https://platform.openai.com/docs/models/gpt-5.4-pro",
         "provider": "openai"
     },
     "gpt-4.1-mini": {
@@ -65,7 +65,7 @@
         "model_card": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/3-flash",
         "provider": "google"
     },
-    "gemini-3.1-flash-lite": {
+    "gemini-3.1-flash-lite-preview": {
         "name": "[Google] Gemini 3.1 Flash Lite",
         "provider": "google",
         "model_card": "https://docs.cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/3-1-flash-lite"

diff --git a/common/lib/llm.py b/common/lib/llm.py
@@ -1,4 +1,6 @@
 import json
+import base64
+import mimetypes
 import requests
 from pathlib import Path
 from typing import List, Optional, Union
@@ -127,15 +129,17 @@ def generate_text(
             messages: Union[str, List[BaseMessage]],
             system_prompt: Optional[str] = None,
             temperature: float = 0.1,
-            files: Optional[List[Union[str, Path, dict]]] = None,
+            files: Optional[List[str]] = None,
+            media_files: Optional[List[Union[str, Path]]] = None,
     ) -> BaseMessage:
         """
         Supports string input or LangChain message list, with optional multimodal files.
 
         :param messages: Text prompt or list of LangChain messages
         :param system_prompt: Optional system prompt
         :param temperature: Temperature for generation
-        :param files: Optional list of file paths or content dicts for multimodal input
+        :param files: Optional list of media URLs for multimodal input
+        :param media_files: Optional list of local file paths for multimodal input (base64-encoded)
         :returns: Generated response message
         """
         if isinstance(messages, str):
@@ -144,8 +148,12 @@ def generate_text(
                 lc_messages.append(SystemMessage(content=system_prompt))
 
             # Create multimodal content if files are provided
-            if files:
-                multimodal_content = self.create_multimodal_content(messages, files)
+            if files or media_files:
+                multimodal_content = self.create_multimodal_content(
+                    messages,
+                    media_urls=files,
+                    media_files=media_files,
+                )
                 lc_messages.append(HumanMessage(content=multimodal_content))
             else:
                 lc_messages.append(HumanMessage(content=messages))
@@ -166,39 +174,105 @@ def generate_text(
     def create_multimodal_content(
         self,
         text: str,
-        image_urls: Optional[List[str]] = None,
+        media_urls: Optional[List[str]] = None,
+        media_files: Optional[List[Union[str, Path]]] = None,
     ) -> List[dict]:
         """
-        Create multimodal content structure for LangChain messages with media URLs.
-        Only supports image URLs for now.
+        Create multimodal content structure for LangChain messages with media URLs
+        and/or local media files (base64-encoded). Supports images, video, and audio
+        depending on the provider and model; URLs of unknown type are sent as images.
+        Raises ValueError if a URL is not a string or a local path is not a file.
 
         :param text: Text content
-        :param image_urls: List of media URLs (http/https)
+        :param media_urls: List of media URLs (http/https)
+        :param media_files: List of local file paths to encode as base64
         :returns: List of content blocks
         """
         content = []
 
-        # Add image URLs first
-        if image_urls:
-            for url in image_urls:
+        # Add media URLs; a URL without a recognisable extension is treated as an image
+        if media_urls:
+            for url in media_urls:
                 if not isinstance(url, str):
-                    raise ValueError(f"Image URL must be a string, got {type(url)}")
+                    raise ValueError(f"Media URL must be a string, got {type(url)}")
 
-                # Format based on provider
-                if self.provider == "anthropic":
-                    content.append(
-                        {"type": "image", "source": {"type": "url", "url": url}}
-                    )
-                else:
-                    # OpenAI-style format
-                    content.append({"type": "image_url", "image_url": {"url": url}})
+                mime_type = mimetypes.guess_type(url.split("?")[0].split("#")[0])[0] or "image/jpeg"
+                media_category = mime_type.split("/")[0]  # "image", "video", or "audio"
+                content.append(self._format_media_block(url=url, mime_type=mime_type, media_category=media_category))
+
+        # Add base64-encoded local files
+        if media_files:
+            for file_path in media_files:
+                file_path = Path(file_path)
+                if not file_path.is_file():
+                    raise ValueError(f"Media file not found: {file_path}")
+
+                mime_type = mimetypes.guess_type(str(file_path))[0] or "application/octet-stream"
+                media_category = mime_type.split("/")[0]
+
+                with file_path.open("rb") as f:
+                    b64_data = base64.b64encode(f.read()).decode("utf-8")
+
+                content.append(self._format_media_block(
+                    b64_data=b64_data, mime_type=mime_type, media_category=media_category
+                ))
 
         # Add text content
         if text:
             content.append({"type": "text", "text": text})
 
         return content
 
+    def _format_media_block(
+        self,
+        url: Optional[str] = None,
+        b64_data: Optional[str] = None,
+        mime_type: str = "image/jpeg",
+        media_category: str = "image",
+    ) -> dict:
+        """
+        Format a single media block for the appropriate provider.
+
+        Pass either `url` or `b64_data`; `url` takes precedence if both are given.
+
+        :param url: Media URL (if URL-based)
+        :param b64_data: Base64-encoded data (if file-based)
+        :param mime_type: MIME type of the media
+        :param media_category: "image", "video", or "audio"
+        :returns: Provider-formatted content block
+        :raises ValueError: If neither `url` nor `b64_data` is provided
+        """
+        if not url and b64_data is None:
+            raise ValueError("Either url or b64_data is required to format a media block")
+
+        if self.provider == "anthropic":
+            # Images use image blocks; every other category is sent as a document block.
+            # NOTE(review): confirm Anthropic accepts video/audio in document blocks.
+            block_type = "image" if media_category == "image" else "document"
+            if url:
+                source = {"type": "url", "url": url}
+            else:
+                source = {"type": "base64", "media_type": mime_type, "data": b64_data}
+            return {"type": block_type, "source": source}
+
+        # Google and OpenAI-style providers (OpenAI, Mistral, DeepSeek, Ollama, LM Studio,
+        # vLLM) all take an image_url block, holding either the URL or a data URI
+        if url:
+            return {"type": "image_url", "image_url": {"url": url}}
+
+        if media_category == "audio" and self.provider == "openai":
+            # OpenAI expects a short format name ("mp3", "wav"), whereas mimetypes
+            # reports subtypes such as "mpeg" or "x-wav"; translate the known ones
+            subtype = mime_type.split("/")[-1].lower()
+            audio_formats = {"mpeg": "mp3", "x-wav": "wav", "wave": "wav", "vnd.wave": "wav"}
+            return {"type": "input_audio", "input_audio": {
+                "data": b64_data, "format": audio_formats.get(subtype, subtype)
+            }}
+
+        # Everything else is embedded inline as a base64 data URI
+        data_uri = f"data:{mime_type};base64,{b64_data}"
+        return {"type": "image_url", "image_url": {"url": data_uri}}
+
     def set_structure(self, json_schema):
         if not json_schema:
             raise ValueError("json_schema is None")

diff --git a/processors/audio/audio_extractor.py b/processors/audio/audio_extractor.py
@@ -5,6 +5,8 @@
 https://ffmpeg.org/
 """
 import shutil
+import zipfile
+from pathlib import Path
 import oslex
 
 from backend.lib.processor import BasicProcessor
@@ -29,6 +31,7 @@ class AudioExtractor(BasicProcessor):
     title = "Extract audio from videos"  # title displayed in UI
     description = "Create audio files per video"  # description displayed in UI
     extension = "zip"  # extension of result file, used internally and in UI
+    media_type = "audio"
 
     followups = ["audio-to-text"]
 
@@ -72,20 +75,31 @@ def process(self):
         # Prepare staging areas for videos and video tracking
         output_dir = self.dataset.get_staging_area()
 
-        total_possible_videos = max_files if max_files != 0 and max_files < self.source_dataset.num_rows - 1 \
-            else self.source_dataset.num_rows
+        # Estimate how many actual video files we will attempt, excluding archive metadata.
+        total_possible_videos = self.source_dataset.num_rows
+        source_archive = self.source_dataset.get_results_path()
+        if source_archive.exists() and source_archive.suffix.lower() == ".zip":
+            with zipfile.ZipFile(source_archive, "r") as archive_file:
+                total_possible_videos = sum(
+                    1
+                    for archived_file in archive_file.infolist()
+                    if not archived_file.is_dir() and Path(archived_file.filename).name != ".metadata.json"
+                )
+
+        if max_files != 0:
+            total_possible_videos = min(total_possible_videos, max_files)
 
         processed_videos = 0
         written = 0
 
         self.dataset.update_status("Extracting video audio")
-        for item in self.source_dataset.iterate_items():
+        for item in self.source_dataset.iterate_items(processor=self, get_annotations=False):
             if self.interrupted:
                 raise ProcessorInterruptedException("Interrupted while determining image wall order")
 
             # Check for 4CAT's metadata JSON and copy it
             if item.file.name == '.metadata.json':
-                shutil.copy(item.file, output_dir.joinpath(".video_metadata.json"))
+                shutil.copy(item.file, output_dir.joinpath(".metadata.json"))
                 continue
 
             if max_files != 0 and processed_videos >= max_files:
@@ -102,6 +116,9 @@ def process(self):
 
             result = self.run_interruptable_process(command, cleanup_paths=(output_dir,))
 
+            # Count attempted conversions separately from successful outputs.
+            processed_videos += 1
+
             # Capture logs
             ffmpeg_output = result.stdout.decode("utf-8")
             ffmpeg_error = result.stderr.decode("utf-8")
@@ -123,11 +140,10 @@ def process(self):
                 error = 'Error Return Code with video %s: %s' % (vid_name, str(result.returncode))
                 self.dataset.log(error)
 
-            processed_videos += 1
-            self.dataset.update_status(f"Extracted audio from {processed_videos} of {total_possible_videos} videos")
-            self.dataset.update_progress(processed_videos / total_possible_videos)
+            self.dataset.update_status(f"Extracted audio from {written} of {processed_videos} attempted videos")
+            self.dataset.update_progress(min(1, processed_videos / max(total_possible_videos, 1)))
 
         # Finish up
-        warning = f"Extracted {written}/{total_possible_videos} audio files, check the logs for errors." \
-            if written < total_possible_videos else None
-        self.write_archive_and_finish(output_dir, num_items=processed_videos, warning=warning)
+        warning = f"Extracted {written}/{processed_videos} audio files, check the logs for errors." \
+            if written < processed_videos else None
+        self.write_archive_and_finish(output_dir, num_items=written, warning=warning)