22from PIL import Image
33from transformers import AutoProcessor , LlavaForConditionalGeneration , GenerationConfig
44from pathlib import Path
5+ from config .image_formats import SUPPORTED_INPUT_EXTENSIONS
# Disable PIL's decompression-bomb guard so arbitrarily large images load.
# NOTE(review): this removes a DoS protection — only safe for trusted,
# locally curated datasets; confirm inputs are never user-supplied.
Image.MAX_IMAGE_PIXELS = None
57
class ImageCaptioningService:
    """Batch image captioner backed by a LLaVA vision-language model."""

    def __init__(self, model: Path, load_path: Path, save_path: Path,
                 prompt: str, max_new_tokens=512, temperature=0.6,
                 top_p=0.9, top_k=None):
        """Load the model and record captioning configuration.

        Args:
            model: checkpoint path (or hub id) handed to ``from_pretrained``.
            load_path: directory scanned for input images.
            save_path: directory where caption files are written.
            prompt: user instruction sent alongside every image.
            max_new_tokens, temperature, top_p, top_k: sampling knobs
                forwarded verbatim to ``GenerationConfig``.
        """
        self._model, self._processor = self._load_model(model)
        self.load_path = load_path
        self.save_path = save_path
        self.prompt = prompt
        # do_sample=True so temperature/top_p/top_k actually take effect;
        # greedy decoding would silently ignore them.
        self._generation_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
        )
1521
1622 def _load_model (self , model_path ):
17- model = LlavaForConditionalGeneration .from_pretrained (model_path , torch_dtype = torch .bfloat16 , device_map = "auto" )
23+ model = LlavaForConditionalGeneration .from_pretrained (model_path , dtype = torch .bfloat16 , device_map = "auto" )
1824 processor = AutoProcessor .from_pretrained (model_path )
1925
2026 model .eval ()
2127 return model , processor
2228
23- def _image_inputs (self , image_path : Path , prompt : str ):
29+ def _image_inputs (self , image_path : Path ):
2430 image = Image .open (image_path ).convert ('RGB' )
2531
2632 messages = [
2733 {
2834 "role" : "system" ,
29- "content" : "You are a helpful image captioner."
35+ "content" :
36+ """
37+ You are a professional image captioner for machine learning datasets.
38+ Provide accurate, detailed descriptions focusing on visual elements, composition, and relevant details.
39+ Follow user instructions carefully.
40+ """
3041 },
3142 {
3243 "role" : "user" ,
33- "content" : prompt
44+ "content" : self . prompt
3445 }
3546 ]
3647
3748 formatted_prompt = self ._processor .apply_chat_template (
3849 messages ,
39- tokenize = False , #return string, not tokens yet
50+ tokenize = False ,
4051 add_generation_prompt = True
4152 )
4253
@@ -47,20 +58,17 @@ def _image_inputs(self, image_path: Path, prompt: str):
4758 return_tensors = "pt"
4859 )
4960
50- #move inputs to GPU if available
5161 if torch .cuda .is_available ():
5262 moved_inputs = {}
5363
5464 for key , value in inputs .items ():
55- #check if value can be moved to GPU (has a 'to' method)
5665 if hasattr (value , 'to' ):
5766 moved_inputs [key ] = value .to ('cuda' )
5867 else :
5968 moved_inputs [key ] = value
6069
6170 inputs = moved_inputs
6271
63- #convert pixel values to match model dtype (bfloat16)
6472 if 'pixel_values' in inputs :
6573 inputs ['pixel_values' ] = inputs ['pixel_values' ].to (torch .bfloat16 )
6674
@@ -70,7 +78,7 @@ def _generate_caption(self, inputs: dict):
7078 print ("Generation Captions" )
7179
7280 generated_ids = self ._model .generate (
73- ** inputs , #unpack tensors
81+ ** inputs ,
7482 generation_config = self ._generation_config ,
7583 )[0 ]
7684
@@ -85,9 +93,14 @@ def _generate_caption(self, inputs: dict):
8593
8694 return caption .strip ()
8795
88- def caption_image (self , image_path : Path , prompt : str ):
89- inputs = self ._image_inputs (image_path , prompt )
90- return self ._generate_caption (inputs )
96+ def caption_image (self ):
97+ files = [f for f in self .load_path .iterdir ()
98+ if f .is_file () and f .suffix .lower () in SUPPORTED_INPUT_EXTENSIONS ]
99+
100+ for img_file in files :
101+ inputs = self ._image_inputs (img_file )
102+ caption = self ._generate_caption (inputs )
103+ self .save_caption (caption , img_file , self .save_path )
91104
92105 def save_caption (self , caption , image_path : Path , output_dir :Path = None ):
93106 if output_dir :
0 commit comments