Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions common/assets/llms.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
"model_card": "",
"provider": ""
},
"gpt-5.2": {
"name": "[OpenAI] GPT-5.2",
"model_card": "https://platform.openai.com/docs/models/gpt-5.2",
"gpt-5.4": {
"name": "[OpenAI] GPT-5.4",
"model_card": "https://platform.openai.com/docs/models/gpt-5.4",
"provider": "openai"
},
"gpt-5-mini": {
Expand All @@ -25,9 +25,9 @@
"model_card": "https://platform.openai.com/docs/models/gpt-5-nano",
"provider": "openai"
},
"gpt-5.2-pro": {
"name": "[OpenAI] GPT-5.2 Pro",
"model_card": "https://platform.openai.com/docs/models/gpt-5.2-pro",
"gpt-5.4-pro": {
"name": "[OpenAI] GPT-5.4 Pro",
"model_card": "https://platform.openai.com/docs/models/gpt-5.4-pro",
"provider": "openai"
},
"gpt-4.1-mini": {
Expand Down Expand Up @@ -65,7 +65,7 @@
"model_card": "https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/3-flash",
"provider": "google"
},
"gemini-3.1-flash-lite": {
"gemini-3.1-flash-lite-preview": {
"name": "[Google] Gemini 3.1 Flash Lite",
"provider": "google",
"model_card": "https://docs.cloud.google.com/vertex-ai/generative-ai/docs/models/gemini/3-1-flash-lite"
Expand Down
114 changes: 94 additions & 20 deletions common/lib/llm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import json
import base64
import mimetypes
import requests
from pathlib import Path
from typing import List, Optional, Union
Expand Down Expand Up @@ -127,15 +129,17 @@ def generate_text(
messages: Union[str, List[BaseMessage]],
system_prompt: Optional[str] = None,
temperature: float = 0.1,
files: Optional[List[Union[str, Path, dict]]] = None,
files: Optional[List[str]] = None,
media_files: Optional[List[Union[str, Path]]] = None,
) -> BaseMessage:
"""
Supports string input or LangChain message list, with optional multimodal files.

:param messages: Text prompt or list of LangChain messages
:param system_prompt: Optional system prompt
:param temperature: Temperature for generation
:param files: Optional list of file paths or content dicts for multimodal input
:param files: Optional list of media URLs for multimodal input
:param media_files: Optional list of local file paths for multimodal input (base64-encoded)
:returns: Generated response message
"""
if isinstance(messages, str):
Expand All @@ -144,8 +148,12 @@ def generate_text(
lc_messages.append(SystemMessage(content=system_prompt))

# Create multimodal content if files are provided
if files:
multimodal_content = self.create_multimodal_content(messages, files)
if files or media_files:
multimodal_content = self.create_multimodal_content(
messages,
media_urls=files,
media_files=media_files,
)
lc_messages.append(HumanMessage(content=multimodal_content))
else:
lc_messages.append(HumanMessage(content=messages))
Expand All @@ -166,39 +174,105 @@ def generate_text(
def create_multimodal_content(
self,
text: str,
image_urls: Optional[List[str]] = None,
media_urls: Optional[List[str]] = None,
media_files: Optional[List[Union[str, Path]]] = None,
) -> List[dict]:
"""
Create multimodal content structure for LangChain messages with media URLs.
Only supports image URLs for now.
Create multimodal content structure for LangChain messages with media URLs
and/or local media files (base64-encoded).

Supports images, video, and audio depending on the provider and model.

:param text: Text content
:param image_urls: List of media URLs (http/https)
:param media_urls: List of media URLs (http/https)
:param media_files: List of local file paths to encode as base64
:returns: List of content blocks
"""
content = []

# Add image URLs first
if image_urls:
for url in image_urls:
# Add media URLs
if media_urls:
for url in media_urls:
if not isinstance(url, str):
raise ValueError(f"Image URL must be a string, got {type(url)}")
raise ValueError(f"Media URL must be a string, got {type(url)}")

# Format based on provider
if self.provider == "anthropic":
content.append(
{"type": "image", "source": {"type": "url", "url": url}}
)
else:
# OpenAI-style format
content.append({"type": "image_url", "image_url": {"url": url}})
mime_type = mimetypes.guess_type(url.split("?")[0])[0] or "application/octet-stream"
media_category = mime_type.split("/")[0] # "image", "video", or "audio"
content.append(self._format_media_block(url=url, mime_type=mime_type, media_category=media_category))

# Add base64-encoded local files
if media_files:
for file_path in media_files:
file_path = Path(file_path)
if not file_path.exists():
raise ValueError(f"Media file not found: {file_path}")

mime_type = mimetypes.guess_type(str(file_path))[0] or "application/octet-stream"
media_category = mime_type.split("/")[0]

with file_path.open("rb") as f:
b64_data = base64.b64encode(f.read()).decode("utf-8")

content.append(self._format_media_block(
b64_data=b64_data, mime_type=mime_type, media_category=media_category
))

# Add text content
if text:
content.append({"type": "text", "text": text})

return content

def _format_media_block(
    self,
    url: Optional[str] = None,
    b64_data: Optional[str] = None,
    mime_type: str = "image/jpeg",
    media_category: str = "image",
) -> dict:
    """
    Format a single media block for the appropriate provider.

    Exactly one of `url` or `b64_data` is expected; when both are given the
    URL takes precedence.

    :param url: Media URL (if URL-based)
    :param b64_data: Base64-encoded data (if file-based)
    :param mime_type: MIME type of the media
    :param media_category: "image", "video", or "audio"
    :returns: Provider-formatted content block
    :raises ValueError: If neither `url` nor `b64_data` is provided
    """
    # Without a source we would otherwise emit a block whose data is the
    # literal `None` (e.g. "data:image/jpeg;base64,None").
    if not url and b64_data is None:
        raise ValueError("Either url or b64_data must be provided")

    if self.provider == "anthropic":
        # Images get an image block; any other media category is sent as a
        # document block.
        block_type = "image" if media_category == "image" else "document"
        if url:
            source = {"type": "url", "url": url}
        else:
            source = {"type": "base64", "media_type": mime_type, "data": b64_data}
        return {"type": block_type, "source": source}

    # Google and the OpenAI-style providers (OpenAI, Mistral, DeepSeek,
    # Ollama, LM Studio, vLLM) share the image_url block shape for URLs.
    if url:
        return {"type": "image_url", "image_url": {"url": url}}

    if self.provider == "openai" and media_category == "audio":
        # OpenAI's input_audio block wants a short format name ("mp3",
        # "wav"), whereas mimetypes reports subtypes such as "mpeg" or
        # "x-wav". Normalise the known ones; pass anything else through.
        audio_formats = {
            "mpeg": "mp3",
            "x-mpeg": "mp3",
            "mp3": "mp3",
            "wav": "wav",
            "x-wav": "wav",
            "wave": "wav",
            "vnd.wave": "wav",
        }
        subtype = mime_type.split("/")[-1].lower()
        return {
            "type": "input_audio",
            "input_audio": {
                "data": b64_data,
                "format": audio_formats.get(subtype, subtype),
            },
        }

    # Local files for every other case are inlined as a data URI.
    data_uri = f"data:{mime_type};base64,{b64_data}"
    return {"type": "image_url", "image_url": {"url": data_uri}}

def set_structure(self, json_schema):
if not json_schema:
raise ValueError("json_schema is None")
Expand Down
36 changes: 26 additions & 10 deletions processors/audio/audio_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
https://ffmpeg.org/
"""
import shutil
import zipfile
from pathlib import Path
import oslex

from backend.lib.processor import BasicProcessor
Expand All @@ -29,6 +31,7 @@ class AudioExtractor(BasicProcessor):
title = "Extract audio from videos" # title displayed in UI
description = "Create audio files per video" # description displayed in UI
extension = "zip" # extension of result file, used internally and in UI
media_type = "audio"

followups = ["audio-to-text"]

Expand Down Expand Up @@ -72,20 +75,31 @@ def process(self):
# Prepare staging areas for videos and video tracking
output_dir = self.dataset.get_staging_area()

total_possible_videos = max_files if max_files != 0 and max_files < self.source_dataset.num_rows - 1 \
else self.source_dataset.num_rows
# Estimate how many actual video files we will attempt, excluding archive metadata.
total_possible_videos = self.source_dataset.num_rows
source_archive = self.source_dataset.get_results_path()
if source_archive.exists() and source_archive.suffix.lower() == ".zip":
with zipfile.ZipFile(source_archive, "r") as archive_file:
total_possible_videos = sum(
1
for archived_file in archive_file.infolist()
if not archived_file.is_dir() and Path(archived_file.filename).name != ".metadata.json"
)

if max_files != 0:
total_possible_videos = min(total_possible_videos, max_files)

processed_videos = 0
written = 0

self.dataset.update_status("Extracting video audio")
for item in self.source_dataset.iterate_items():
for item in self.source_dataset.iterate_items(processor=self, get_annotations=False):
if self.interrupted:
raise ProcessorInterruptedException("Interrupted while determining image wall order")

# Check for 4CAT's metadata JSON and copy it
if item.file.name == '.metadata.json':
shutil.copy(item.file, output_dir.joinpath(".video_metadata.json"))
shutil.copy(item.file, output_dir.joinpath(".metadata.json"))
continue

if max_files != 0 and processed_videos >= max_files:
Expand All @@ -102,6 +116,9 @@ def process(self):

result = self.run_interruptable_process(command, cleanup_paths=(output_dir,))

# Count attempted conversions separately from successful outputs.
processed_videos += 1

# Capture logs
ffmpeg_output = result.stdout.decode("utf-8")
ffmpeg_error = result.stderr.decode("utf-8")
Expand All @@ -123,11 +140,10 @@ def process(self):
error = 'Error Return Code with video %s: %s' % (vid_name, str(result.returncode))
self.dataset.log(error)

processed_videos += 1
self.dataset.update_status(f"Extracted audio from {processed_videos} of {total_possible_videos} videos")
self.dataset.update_progress(processed_videos / total_possible_videos)
self.dataset.update_status(f"Extracted audio from {written} of {processed_videos} attempted videos")
self.dataset.update_progress(min(1, processed_videos / max(total_possible_videos, 1)))

# Finish up
warning = f"Extracted {written}/{total_possible_videos} audio files, check the logs for errors." \
if written < total_possible_videos else None
self.write_archive_and_finish(output_dir, num_items=processed_videos, warning=warning)
warning = f"Extracted {written}/{processed_videos} audio files, check the logs for errors." \
if written < processed_videos else None
self.write_archive_and_finish(output_dir, num_items=written, warning=warning)
Loading
Loading