describe("buildFfmpegArgs", () => {
  /** Caption layout reused by every scenario: one short line anchored at the bottom. */
  const baseCaptionLayout = {
    lines: ["hello world"],
    fontSize: 42,
    lineHeight: 52,
    position: "bottom" as const,
  };

  /** Options that never vary between scenarios; each test adds only what it changes. */
  const sharedOptions = {
    videoPath: "/tmp/video.mp4",
    audioPath: "/tmp/audio.mp3",
    captionLayout: baseCaptionLayout,
    outputPath: "/tmp/out.mp4",
  };

  it("includes -map 0:a when hasAudio and hasOverlays", () => {
    const args = buildFfmpegArgs({
      ...sharedOptions,
      audioStartSeconds: 0,
      audioDurationSeconds: 10,
      hasAudio: true,
      overlayImagePaths: ["/tmp/overlay-0.png"],
    });

    // The composited video stream and the video's own audio must both be mapped.
    for (const token of ["-map", "[out]", "0:a"]) {
      expect(args).toContain(token);
    }
  });

  it("uses -vf filter when no overlays", () => {
    const args = buildFfmpegArgs({
      ...sharedOptions,
      audioStartSeconds: 0,
      audioDurationSeconds: 10,
      hasAudio: false,
      overlayImagePaths: [],
    });

    expect(args).toContain("-vf");
    expect(args).not.toContain("-filter_complex");
  });

  it("uses -filter_complex when overlays are present", () => {
    const args = buildFfmpegArgs({
      ...sharedOptions,
      audioStartSeconds: 5,
      audioDurationSeconds: 15,
      hasAudio: false,
      overlayImagePaths: ["/tmp/overlay-0.png"],
    });

    expect(args).toContain("-filter_complex");
    expect(args).not.toContain("-vf");
  });
});
describe("buildFilterComplex", () => {
  it("builds crop + scale + overlay + caption pipeline", () => {
    const graph = buildFilterComplex({
      overlayCount: 1,
      captionFilters: ["drawtext=text='hello':fontsize=42"],
    });

    // Every stage of the pipeline must show up in the generated graph.
    const expectedFragments = [
      "[0:v]crop=ih*9/16:ih,scale=720:1280[video_base]",
      "[1:v]scale=150:150[ovr_0]",
      "overlay=30:30",
      "[out]",
    ];
    for (const fragment of expectedFragments) {
      expect(graph).toContain(fragment);
    }
  });

  it("positions overlays in top-left stacked vertically", () => {
    const graph = buildFilterComplex({
      overlayCount: 2,
      captionFilters: ["drawtext=text='hi':fontsize=42"],
    });

    // Second overlay sits one overlay height plus the gap below the first.
    expect(graph).toMatch(/overlay=30:30/);
    expect(graph).toMatch(/overlay=30:200/);
  });

  it("chains multiple overlays correctly", () => {
    const graph = buildFilterComplex({
      overlayCount: 3,
      captionFilters: [],
    });

    const overlayStages = graph.match(/overlay=/g) || [];
    expect(overlayStages.length).toBe(3);
  });

  it("emits [out] label when captionFilters is empty", () => {
    const graph = buildFilterComplex({
      overlayCount: 1,
      captionFilters: [],
    });

    expect(graph).toContain("[out]");
  });
});
it("returns true when a person label is detected", async () => { + mockFalSubscribe.mockResolvedValue({ + data: { + results: { + bboxes: [[10, 20, 100, 200]], + labels: ["person"], + }, + }, + }); + + const result = await detectFace("https://example.com/headshot.png"); + + expect(result).toBe(true); + expect(mockFalSubscribe).toHaveBeenCalledWith( + "fal-ai/florence-2-large/object-detection", + { image_url: "https://example.com/headshot.png" }, + ); + }); + + it("returns true when a face label is detected among other objects", async () => { + mockFalSubscribe.mockResolvedValue({ + data: { + results: { + bboxes: [[0, 0, 50, 50], [10, 20, 100, 200]], + labels: ["chair", "human face"], + }, + }, + }); + + const result = await detectFace("https://example.com/photo.png"); + + expect(result).toBe(true); + }); + + it("returns false when no person or face labels are detected", async () => { + mockFalSubscribe.mockResolvedValue({ + data: { + results: { + bboxes: [[0, 0, 300, 300]], + labels: ["album cover"], + }, + }, + }); + + const result = await detectFace("https://example.com/album-cover.png"); + + expect(result).toBe(false); + }); + + it("returns false when results are empty", async () => { + mockFalSubscribe.mockResolvedValue({ + data: { + results: { + bboxes: [], + labels: [], + }, + }, + }); + + const result = await detectFace("https://example.com/blank.png"); + + expect(result).toBe(false); + }); + + it("returns false when detection fails", async () => { + mockFalSubscribe.mockRejectedValue(new Error("Detection failed")); + + const result = await detectFace("https://example.com/broken.png"); + + expect(result).toBe(false); + }); + + it("does not false-positive on labels containing face words as substrings", async () => { + mockFalSubscribe.mockResolvedValue({ + data: { + results: { + bboxes: [[0, 0, 200, 200]], + labels: ["ottoman", "mannequin", "womanizer"], + }, + }, + }); + + const result = await detectFace("https://example.com/furniture.png"); + + 
describe("downloadOverlayImages", () => {
  beforeEach(() => {
    vi.clearAllMocks();
    vi.restoreAllMocks();
  });

  it("skips failed downloads without aborting remaining images", async () => {
    const failingUrl = "https://example.com/fail.png";
    const workingUrl = "https://example.com/ok.png";

    // First request rejects with a network error, the second one succeeds.
    vi.spyOn(globalThis, "fetch")
      .mockRejectedValueOnce(new Error("DNS resolution failed"))
      .mockResolvedValueOnce(new Response(new ArrayBuffer(8), { status: 200 }));

    const savedPaths = await downloadOverlayImages([failingUrl, workingUrl], "/tmp/test");

    // Exactly one path must survive; zero would mean the rejection aborted the loop.
    expect(savedPaths).toHaveLength(1);
    expect(savedPaths[0]).toContain("overlay-1.png");
  });
});
describe("escapeDrawtext", () => {
  /** U+2018 LEFT SINGLE QUOTATION MARK */
  const LEFT_QUOTE = "\u2018";
  /** U+2019 RIGHT SINGLE QUOTATION MARK — the replacement character */
  const RIGHT_QUOTE = "\u2019";
  /** U+02BC MODIFIER LETTER APOSTROPHE — must never be produced (most fonts lack this glyph) */
  const MODIFIER_APOSTROPHE = "\u02BC";

  it("replaces straight apostrophes with escaped quote safe for drawtext", () => {
    const escaped = escapeDrawtext("didn't");

    expect(escaped).not.toContain("'");
    expect(escaped).not.toContain(MODIFIER_APOSTROPHE);
    expect(escaped).toContain("didn");
  });

  it("preserves curly right single quotation marks as-is", () => {
    const escaped = escapeDrawtext(`didn${RIGHT_QUOTE}t`);

    expect(escaped).toContain(RIGHT_QUOTE);
    expect(escaped).not.toContain(MODIFIER_APOSTROPHE);
  });

  it("replaces curly left single quotation marks with right single quotation mark", () => {
    const escaped = escapeDrawtext(`${LEFT_QUOTE}hello${RIGHT_QUOTE}`);

    expect(escaped).not.toContain(LEFT_QUOTE);
    expect(escaped).not.toContain(MODIFIER_APOSTROPHE);
    // Both quotes end up as U+2019.
    expect(escaped).toBe(`${RIGHT_QUOTE}hello${RIGHT_QUOTE}`);
  });

  it("escapes colons for ffmpeg", () => {
    expect(escapeDrawtext("caption: hello")).toContain("\\\\:");
  });

  it("escapes percent to %% for a single literal % in ffmpeg drawtext", () => {
    // ffmpeg drawtext renders %% as one literal %, so "100%" must become "100%%".
    expect(escapeDrawtext("100%")).toBe("100%%");
  });

  it("escapes backslashes", () => {
    expect(escapeDrawtext("back\\slash")).toContain("\\\\\\\\");
  });

  it("strips newlines and carriage returns", () => {
    const cases: Array<[input: string, expected: string]> = [
      ["line1\nline2", "line1 line2"],
      ["line1\r\nline2", "line1 line2"],
      ["line1\rline2", "line1line2"],
    ];

    for (const [input, expected] of cases) {
      expect(escapeDrawtext(input)).toBe(expected);
    }
  });

  it("handles a real caption with apostrophes and special chars", () => {
    const escaped = escapeDrawtext("didn't think anyone would hear this: it's real");

    // No raw single quote, left curly quote, prime, or U+02BC may remain.
    expect(escaped).not.toMatch(/['\u2018\u2032\u02BC]/);
    // The colon must be escaped.
    expect(escaped).toContain("\\\\:");
  });
});
vi.mocked(fs.readFile).mockResolvedValue(Buffer.from("ref-image")); + vi.mocked(fal.storage.upload).mockResolvedValue("https://fal.ai/ref.png"); + + await generateContentImage({ + faceGuideUrl: "https://fal.ai/face.png", + referenceImagePath: "/path/to/ref-01.png", + prompt: "test prompt", + }); + + const callArgs = mockFalSubscribe.mock.calls[0][1] as Record; + expect(callArgs.image_urls).toEqual([ + "https://fal.ai/face.png", + "https://fal.ai/ref.png", + ]); + }); + + it("includes additionalImageUrls in image_urls after face guide and reference", async () => { + vi.mocked(fs.readFile).mockResolvedValue(Buffer.from("ref-image")); + vi.mocked(fal.storage.upload).mockResolvedValue("https://fal.ai/ref.png"); + + await generateContentImage({ + faceGuideUrl: "https://fal.ai/face.png", + referenceImagePath: "/path/to/ref-01.png", + prompt: "test prompt", + additionalImageUrls: [ + "https://example.com/album-cover.png", + "https://example.com/playlist-cover.png", + ], + }); + + const callArgs = mockFalSubscribe.mock.calls[0][1] as Record; + expect(callArgs.image_urls).toEqual([ + "https://fal.ai/face.png", + "https://fal.ai/ref.png", + "https://example.com/album-cover.png", + "https://example.com/playlist-cover.png", + ]); + }); + + it("works with additionalImageUrls but no face guide or reference", async () => { + await generateContentImage({ + referenceImagePath: null, + prompt: "test prompt", + additionalImageUrls: ["https://example.com/cover.png"], + }); + + const callArgs = mockFalSubscribe.mock.calls[0][1] as Record; + expect(callArgs.image_urls).toEqual(["https://example.com/cover.png"]); + }); + + it("deduplicates additionalImageUrls that match faceGuideUrl", async () => { + await generateContentImage({ + faceGuideUrl: "https://fal.ai/face.png", + referenceImagePath: null, + prompt: "test prompt", + additionalImageUrls: ["https://fal.ai/face.png", "https://example.com/cover.png"], + }); + + const callArgs = mockFalSubscribe.mock.calls[0][1] as Record; + 
describe("loadTemplate artist-release-editorial", () => {
  /** Template under test; loaded fresh in every case so the cases stay independent. */
  const TEMPLATE_NAME = "artist-release-editorial";

  it("loads the artist-release-editorial template", async () => {
    const template = await loadTemplate(TEMPLATE_NAME);

    expect(template.name).toBe("artist-release-editorial");
    expect(template.imagePrompt).toBeTruthy();
    expect(template.usesFaceGuide).toBe(true);
    expect(template.styleGuide).not.toBeNull();
    expect(template.captionGuide).not.toBeNull();
    expect(template.videoMoods.length).toBeGreaterThan(0);
    expect(template.videoMovements.length).toBeGreaterThan(0);
    expect(template.captionExamples.length).toBeGreaterThan(0);
  });

  it("has usesImageOverlay set to true", async () => {
    const template = await loadTemplate(TEMPLATE_NAME);

    expect(template.usesImageOverlay).toBe(true);
  });

  it("has a customInstruction in the style guide", async () => {
    const template = await loadTemplate(TEMPLATE_NAME);
    // `Record` needs explicit type arguments; `unknown` values force the checks below.
    const sg = template.styleGuide as Record<string, unknown>;

    expect(sg.customInstruction).toBeTruthy();
    expect(typeof sg.customInstruction).toBe("string");
  });

  it("customInstruction generates only the artist portrait with no composited elements", async () => {
    const template = await loadTemplate(TEMPLATE_NAME);
    const sg = template.styleGuide as Record<string, unknown>;
    const instruction = (sg.customInstruction as string).toLowerCase();

    // Composited elements are added later as overlays, so the prompt must not ask for them.
    for (const forbidden of ["playlist cover", "streaming", "logo", "composite"]) {
      expect(instruction).not.toContain(forbidden);
    }
  });

  it("imagePrompt describes only the editorial portrait with no composited elements", async () => {
    const template = await loadTemplate(TEMPLATE_NAME);
    const prompt = template.imagePrompt.toLowerCase();

    for (const forbidden of ["playlist", "logo", "composite"]) {
      expect(prompt).not.toContain(forbidden);
    }
  });
});
vi.mock("../fetchGithubFile", () => ({ fetchGithubFile: vi.fn(), })); +vi.mock("../detectFace", () => ({ + detectFace: vi.fn(), +})); + vi.mock("@fal-ai/client", () => ({ fal: { storage: { upload: vi.fn() } }, })); const { fetchImageFromUrl } = await import("../fetchImageFromUrl"); const { fetchGithubFile } = await import("../fetchGithubFile"); +const { detectFace } = await import("../detectFace"); const { fal } = await import("@fal-ai/client"); describe("resolveFaceGuide", () => { @@ -26,7 +31,7 @@ describe("resolveFaceGuide", () => { vi.clearAllMocks(); }); - it("returns null when usesFaceGuide is false and no images provided", async () => { + it("returns null faceGuideUrl and empty additionalImageUrls when no images and usesFaceGuide is false", async () => { const result = await resolveFaceGuide({ usesFaceGuide: false, images: undefined, @@ -34,41 +39,73 @@ describe("resolveFaceGuide", () => { artistSlug: "artist", }); - expect(result).toBeNull(); - expect(fetchImageFromUrl).not.toHaveBeenCalled(); + expect(result).toEqual({ faceGuideUrl: null, additionalImageUrls: [] }); }); - it("passes attached images through even when usesFaceGuide is false", async () => { - vi.mocked(fetchImageFromUrl).mockResolvedValue("https://fal.ai/uploaded.png"); + it("uses face image as faceGuideUrl and non-face images as additionalImageUrls", async () => { + vi.mocked(fetchImageFromUrl) + .mockResolvedValueOnce("https://fal.ai/face.png") + .mockResolvedValueOnce("https://fal.ai/cover.png"); + vi.mocked(detectFace) + .mockResolvedValueOnce(true) + .mockResolvedValueOnce(false); const result = await resolveFaceGuide({ - usesFaceGuide: false, - images: ["https://example.com/album-cover.png"], + usesFaceGuide: true, + images: [ + "https://example.com/headshot.png", + "https://example.com/album-cover.png", + ], githubRepo: "https://github.com/test/repo", artistSlug: "artist", }); - expect(fetchImageFromUrl).toHaveBeenCalledWith("https://example.com/album-cover.png"); - 
expect(result).toBe("https://fal.ai/uploaded.png"); + expect(result).toEqual({ + faceGuideUrl: "https://fal.ai/face.png", + additionalImageUrls: ["https://fal.ai/cover.png"], + }); }); - it("uses fetchImageFromUrl when images array has entries", async () => { - vi.mocked(fetchImageFromUrl).mockResolvedValue("https://fal.ai/uploaded.png"); + it("falls back to GitHub face-guide when no images contain a face", async () => { + vi.mocked(fetchImageFromUrl).mockResolvedValue("https://fal.ai/cover.png"); + vi.mocked(detectFace).mockResolvedValue(false); + vi.mocked(fetchGithubFile).mockResolvedValue(Buffer.from("face-data")); + vi.mocked(fal.storage.upload).mockResolvedValue("https://fal.ai/github-face.png"); const result = await resolveFaceGuide({ usesFaceGuide: true, - images: ["https://example.com/face.png"], + images: ["https://example.com/album-cover.png"], githubRepo: "https://github.com/test/repo", artistSlug: "artist", }); - expect(fetchImageFromUrl).toHaveBeenCalledWith("https://example.com/face.png"); - expect(result).toBe("https://fal.ai/uploaded.png"); + expect(result).toEqual({ + faceGuideUrl: "https://fal.ai/github-face.png", + additionalImageUrls: ["https://fal.ai/cover.png"], + }); }); - it("fetches from GitHub when no images provided", async () => { - const buffer = Buffer.from("image-data"); - vi.mocked(fetchGithubFile).mockResolvedValue(buffer); + it("puts all images in additionalImageUrls when usesFaceGuide is false", async () => { + vi.mocked(fetchImageFromUrl) + .mockResolvedValueOnce("https://fal.ai/img1.png") + .mockResolvedValueOnce("https://fal.ai/img2.png"); + + const result = await resolveFaceGuide({ + usesFaceGuide: false, + images: ["https://example.com/a.png", "https://example.com/b.png"], + githubRepo: "https://github.com/test/repo", + artistSlug: "artist", + }); + + expect(result).toEqual({ + faceGuideUrl: null, + additionalImageUrls: ["https://fal.ai/img1.png", "https://fal.ai/img2.png"], + }); + expect(detectFace).not.toHaveBeenCalled(); 
+ }); + + it("fetches face-guide from GitHub when no images provided", async () => { + vi.mocked(fetchGithubFile).mockResolvedValue(Buffer.from("face-data")); vi.mocked(fal.storage.upload).mockResolvedValue("https://fal.ai/github.png"); const result = await resolveFaceGuide({ @@ -78,14 +115,13 @@ describe("resolveFaceGuide", () => { artistSlug: "artist", }); - expect(fetchGithubFile).toHaveBeenCalledWith( - "https://github.com/test/repo", - "artists/artist/context/images/face-guide.png", - ); - expect(result).toBe("https://fal.ai/github.png"); + expect(result).toEqual({ + faceGuideUrl: "https://fal.ai/github.png", + additionalImageUrls: [], + }); }); - it("throws when GitHub face-guide is not found", async () => { + it("throws when usesFaceGuide is true and GitHub face-guide is not found", async () => { vi.mocked(fetchGithubFile).mockResolvedValue(null); await expect( @@ -97,4 +133,28 @@ describe("resolveFaceGuide", () => { }), ).rejects.toThrow("face-guide.png not found"); }); + + it("uses first face image when multiple faces are provided", async () => { + vi.mocked(fetchImageFromUrl) + .mockResolvedValueOnce("https://fal.ai/face1.png") + .mockResolvedValueOnce("https://fal.ai/face2.png"); + vi.mocked(detectFace) + .mockResolvedValueOnce(true) + .mockResolvedValueOnce(true); + + const result = await resolveFaceGuide({ + usesFaceGuide: true, + images: [ + "https://example.com/headshot1.png", + "https://example.com/headshot2.png", + ], + githubRepo: "https://github.com/test/repo", + artistSlug: "artist", + }); + + expect(result).toEqual({ + faceGuideUrl: "https://fal.ai/face1.png", + additionalImageUrls: ["https://fal.ai/face2.png"], + }); + }); }); diff --git a/src/content/buildFfmpegArgs.ts b/src/content/buildFfmpegArgs.ts new file mode 100644 index 0000000..8e553a4 --- /dev/null +++ b/src/content/buildFfmpegArgs.ts @@ -0,0 +1,142 @@ +import { buildFilterComplex } from "./buildFilterComplex"; +import { escapeDrawtext } from "./escapeDrawtext"; + +/** Video frame 
/** Height of the rendered portrait frame in pixels. */
const FRAME_HEIGHT = 1280;
/** Gap kept between a bottom-anchored caption block and the frame edge. */
const BOTTOM_MARGIN = 120;

/**
 * Assembles the ffmpeg command line for the final portrait render.
 *
 * The resulting command:
 *  1. center-crops the source to 9:16 and scales it to 720x1280,
 *  2. draws every caption line (white text, black outline, horizontally centered),
 *  3. composites overlay images via `buildFilterComplex` when any are supplied,
 *  4. keeps the video's own audio when `hasAudio` is true, otherwise muxes in
 *     `audioPath`, trimmed with `-ss audioStartSeconds -t audioDurationSeconds`.
 *
 * @param videoPath - Source video; always ffmpeg input 0.
 * @param audioPath - External audio file; only added as an input when `hasAudio` is false.
 * @param captionLayout - Wrapped caption lines plus font size, line height and anchor position.
 * @param outputPath - Destination file; always the last argument.
 * @param audioStartSeconds - Seek offset applied to the external audio input.
 * @param audioDurationSeconds - Duration limit applied to the external audio input.
 * @param hasAudio - True when the video already carries the audio track to keep.
 * @param overlayImagePaths - Local overlay images; these become inputs 1..n.
 * @returns The argument vector (without the `ffmpeg` executable name).
 */
export function buildFfmpegArgs({
  videoPath,
  audioPath,
  captionLayout,
  outputPath,
  audioStartSeconds,
  audioDurationSeconds,
  hasAudio,
  overlayImagePaths,
}: {
  videoPath: string;
  audioPath: string;
  captionLayout: { lines: string[]; fontSize: number; lineHeight: number; position: "bottom" | "center" | "top" };
  outputPath: string;
  audioStartSeconds: number;
  audioDurationSeconds: number;
  hasAudio: boolean;
  overlayImagePaths: string[];
}): string[] {
  const { lines, fontSize, lineHeight, position } = captionLayout;

  const captionBlockHeight = lines.length * lineHeight;
  const strokeWidth = Math.max(2, Math.round(fontSize / 14));

  // y coordinate of the first caption line for each anchor position
  const firstLineY =
    position === "bottom"
      ? FRAME_HEIGHT - BOTTOM_MARGIN - captionBlockHeight
      : position === "center"
        ? Math.round((FRAME_HEIGHT - captionBlockHeight) / 2)
        : 180;

  // One drawtext filter per caption line, each shifted down by one line height
  const captionFilters = lines.map((line, index) =>
    [
      `drawtext=text='${escapeDrawtext(line)}'`,
      `fontsize=${fontSize}`,
      "fontcolor=white",
      `borderw=${strokeWidth}`,
      "bordercolor=black",
      "x=(w-tw)/2",
      `y=${firstLineY + index * lineHeight}`,
    ].join(":"),
  );

  // Trimmed external audio input, used only when the video has no audio of its own
  const externalAudioInput = [
    "-ss", String(audioStartSeconds),
    "-t", String(audioDurationSeconds),
    "-i", audioPath,
  ];
  const outputTail = [
    "-pix_fmt", "yuv420p",
    "-movflags", "+faststart",
    "-shortest",
    outputPath,
  ];

  const overlayCount = overlayImagePaths.length;

  if (overlayCount === 0) {
    // Single video input: a plain -vf chain is enough
    const videoFilter = ["crop=ih*9/16:ih", "scale=720:1280", ...captionFilters].join(",");

    if (hasAudio) {
      return [
        "-y",
        "-i", videoPath,
        "-vf", videoFilter,
        "-c:v", "libx264",
        "-c:a", "aac",
        ...outputTail,
      ];
    }

    return [
      "-y",
      "-i", videoPath,
      ...externalAudioInput,
      "-vf", videoFilter,
      "-c:v", "libx264",
      "-c:a", "aac",
      "-map", "0:v:0",
      "-map", "1:a:0",
      ...outputTail,
    ];
  }

  // Overlays need several inputs, hence -filter_complex. The external audio,
  // when present, is added after the overlay images and so is input 1 + overlayCount.
  const audioStream = hasAudio ? "0:a" : `${1 + overlayCount}:a:0`;

  return [
    "-y",
    "-i", videoPath,
    ...overlayImagePaths.flatMap((overlayPath) => ["-i", overlayPath]),
    ...(hasAudio ? [] : externalAudioInput),
    "-filter_complex", buildFilterComplex({ overlayCount, captionFilters }),
    "-map", "[out]",
    "-map", audioStream,
    "-c:a", "aac",
    "-c:v", "libx264",
    ...outputTail,
  ];
}
/**
 * Builds the ffmpeg `-filter_complex` graph for a video with optional overlay
 * images and captions.
 *
 * Pipeline: crop → scale → overlay images → drawtext captions. The final stage
 * always carries the `[out]` label. With no overlays the captions (or a `copy`
 * pass-through) are attached directly to the cropped/scaled video, so the graph
 * never references a label that was not produced.
 *
 * @param overlayCount - Number of overlay image inputs; they are read from ffmpeg inputs 1..overlayCount
 * @param captionFilters - drawtext filter strings, applied in order after the overlays
 * @param overlaySize - Width and height each overlay is scaled to, in pixels (default 150)
 * @param edgePadding - Offset of the overlay column from the left and top edges (default 30)
 * @param overlayGap - Vertical gap between stacked overlays (default 20)
 * @returns The complete filter_complex string, stages separated by `;`
 */
export function buildFilterComplex({
  overlayCount,
  captionFilters,
  overlaySize = 150,
  edgePadding = 30,
  overlayGap = 20,
}: {
  overlayCount: number;
  captionFilters: string[];
  overlaySize?: number;
  edgePadding?: number;
  overlayGap?: number;
}): string {
  // Crop + scale the main video
  const stages: string[] = ["[0:v]crop=ih*9/16:ih,scale=720:1280[video_base]"];

  // Scale every overlay input to a square thumbnail
  for (let i = 0; i < overlayCount; i++) {
    stages.push(`[${i + 1}:v]scale=${overlaySize}:${overlaySize}[ovr_${i}]`);
  }

  // Chain the overlays, stacked vertically from the top-left corner.
  // `lastLabel` always names the most recent stage output, so the final stage
  // below stays valid even when no overlay stage was emitted.
  let lastLabel = "video_base";
  for (let i = 0; i < overlayCount; i++) {
    const y = edgePadding + i * (overlaySize + overlayGap);
    const outLabel = i < overlayCount - 1 ? `ovr_out_${i}` : "ovr_final";
    stages.push(`[${lastLabel}][ovr_${i}]overlay=${edgePadding}:${y}[${outLabel}]`);
    lastLabel = outLabel;
  }

  // Captions go on top of everything; without captions `copy` just renames the stream to [out]
  const finalFilter = captionFilters.length > 0 ? captionFilters.join(",") : "copy";
  stages.push(`[${lastLabel}]${finalFilter}[out]`);

  return stages.join(";");
}
/**
 * Picks the font size, wrapped lines and vertical anchor for a caption.
 *
 * Font sizes are tried from MAX_FONT_SIZE down to MIN_FONT_SIZE in steps of 2.
 * The first size whose wrapped text fits within MAX_CAPTION_HEIGHT_RATIO of the
 * frame height wins. If none fits, the layout of the last size tried is used.
 *
 * Anchor by resulting line count:
 * - up to 3 lines: "bottom"
 * - 4 to 6 lines: "center"
 * - 7 or more lines: "top"
 *
 * @param text - Caption text to lay out
 * @returns The wrapped lines with the chosen font size, line height and position
 */
export function calculateCaptionLayout(text: string): {
  lines: string[];
  fontSize: number;
  lineHeight: number;
  position: "bottom" | "center" | "top";
} {
  const heightBudget = FRAME_HEIGHT * MAX_CAPTION_HEIGHT_RATIO;

  let lines: string[] = [];
  let fontSize = MIN_FONT_SIZE;
  let lineHeight = MIN_FONT_SIZE + 10;

  for (let candidate = MAX_FONT_SIZE; candidate >= MIN_FONT_SIZE; candidate -= 2) {
    // 85% of the frame width is usable; a glyph is estimated at 0.55 × font size wide
    const charsPerLine = Math.floor(FRAME_WIDTH * 0.85 / (candidate * 0.55));

    // Always remember the latest attempt so the smallest size is the fallback
    lines = wrapText(text, charsPerLine);
    fontSize = candidate;
    lineHeight = candidate + 10;

    if (lines.length * lineHeight <= heightBudget) {
      break;
    }
  }

  const position = lines.length <= 3 ? "bottom" : lines.length <= 6 ? "center" : "top";

  return { lines, fontSize, lineHeight, position };
}
/**
 * Splits caller-supplied images into one face guide and the remaining images.
 *
 * Images are handled strictly in order. Each one is first passed through
 * `fetchImageFromUrl`, and the URL it returns is what gets classified
 * (presumably a re-hosted copy — confirm against that helper). While
 * `usesFaceGuide` is set and no face guide has been chosen yet, the image is
 * run through `detectFace`; the first hit becomes the face guide. Every other
 * image, including later ones that contain faces, is collected as an
 * additional image.
 *
 * @param images - Source image URLs, in priority order
 * @param usesFaceGuide - When false, face detection is skipped entirely
 * @returns The face guide URL (or null) and the remaining URLs in input order
 */
export async function classifyImages({
  images,
  usesFaceGuide,
}: {
  images: string[];
  usesFaceGuide: boolean;
}): Promise<{
  faceGuideUrl: string | null;
  additionalImageUrls: string[];
}> {
  const additionalImageUrls: string[] = [];
  let faceGuideUrl: string | null = null;

  for (const sourceUrl of images) {
    const hostedUrl = await fetchImageFromUrl(sourceUrl);

    // Detection only runs while a face guide is still wanted and missing
    const stillLookingForFace = usesFaceGuide && !faceGuideUrl;
    if (stillLookingForFace && (await detectFace(hostedUrl))) {
      faceGuideUrl = hostedUrl;
    } else {
      additionalImageUrls.push(hostedUrl);
    }
  }

  return { faceGuideUrl, additionalImageUrls };
}
/**
 * Fetches an image over HTTP and returns its raw bytes together with its MIME type.
 *
 * @param imageUrl - URL of the image to fetch
 * @returns The response body as a Buffer and the `content-type` header value,
 *   or "image/png" when that header is missing or empty
 * @throws Error carrying the HTTP status when the response is not OK
 */
export async function downloadImageBuffer(imageUrl: string): Promise<{
  buffer: Buffer;
  contentType: string;
}> {
  const res = await fetch(imageUrl);

  if (res.ok) {
    const bytes = await res.arrayBuffer();
    const reportedType = res.headers.get("content-type");
    return {
      buffer: Buffer.from(bytes),
      contentType: reportedType || "image/png",
    };
  }

  throw new Error(`Failed to download image: ${res.status}`);
}
/**
 * Downloads overlay image URLs to local temp files, best-effort.
 *
 * URLs are processed sequentially in input order. A URL whose download or file
 * write fails is logged (including the failure reason) and skipped, so the
 * returned array may be shorter than `urls`.
 *
 * @param urls - Image URLs to download
 * @param tempDir - Directory to write temp files into
 * @returns Local file paths of the images that were written successfully, in input order
 */
export async function downloadOverlayImages(
  urls: string[],
  tempDir: string,
): Promise<string[]> {
  if (urls.length === 0) return [];

  logStep("Downloading overlay images", true, { count: urls.length });
  const paths: string[] = [];

  for (const [i, url] of urls.entries()) {
    try {
      const { buffer } = await downloadImageBuffer(url);
      // File names use the input index; the ".png" suffix is applied whatever the content type.
      const overlayPath = join(tempDir, `overlay-${i}.png`);
      await writeFile(overlayPath, buffer);
      paths.push(overlayPath);
    } catch (err) {
      // Deliberately non-fatal: a missing overlay must not abort the render,
      // but the reason is recorded so skipped overlays can be diagnosed.
      logStep("Overlay download failed, skipping", false, {
        url: url.slice(0, 80),
        index: i,
        error: err instanceof Error ? err.message : String(err),
      });
    }
  }

  return paths;
}
/**
 * Prepares caption text for embedding in an ffmpeg drawtext `text='...'` value.
 *
 * Rewrites are applied in this order:
 *  1. carriage returns are removed
 *  2. newlines become single spaces
 *  3. each backslash becomes four backslashes
 *  4. ASCII apostrophe, U+2018 and U+2032 become U+2019
 *  5. each colon is prefixed with two backslashes
 *  6. each percent sign is doubled
 *
 * Backslashes are handled before colons so the backslashes added in step 5
 * are not escaped a second time.
 *
 * NOTE(review): the inline escaper this replaces in renderFinalVideo emitted
 * `%%%%` for a percent sign; this one emits `%%`. Confirm which form the
 * deployed ffmpeg build expects.
 *
 * @param text - Raw caption text
 * @returns The escaped text
 */
export function escapeDrawtext(text: string): string {
  const rules: ReadonlyArray<readonly [RegExp, string]> = [
    [/\r/g, ""],
    [/\n/g, " "],
    [/\\/g, "\\\\\\\\"],
    [/['\u2018\u2032]/g, "\u2019"],
    [/:/g, "\\\\:"],
    [/%/g, "%%"],
  ];

  let escaped = text;
  for (const [pattern, replacement] of rules) {
    escaped = escaped.replace(pattern, replacement);
  }
  return escaped;
}
"./downloadImageBuffer"; /** * Downloads an image from a public URL and uploads it to fal.ai storage. @@ -9,18 +10,13 @@ import { logStep } from "../sandboxes/logStep"; */ export async function fetchImageFromUrl(imageUrl: string): Promise { logStep("Downloading image from URL"); - const response = await fetch(imageUrl); - if (!response.ok) { - throw new Error(`Failed to download image: ${response.status}`); - } - const imageBuffer = Buffer.from(await response.arrayBuffer()); + const { buffer, contentType } = await downloadImageBuffer(imageUrl); logStep("Uploading image to fal.ai storage", true, { - sizeBytes: imageBuffer.byteLength, + sizeBytes: buffer.byteLength, }); - const contentType = response.headers.get("content-type") || "image/png"; const originalName = new URL(imageUrl).pathname.split("/").pop() || "image.png"; - const faceGuideFile = new File([new Uint8Array(imageBuffer)], originalName, { type: contentType }); + const faceGuideFile = new File([new Uint8Array(buffer)], originalName, { type: contentType }); const falUrl = await fal.storage.upload(faceGuideFile); logStep("Image uploaded", false, { falUrl }); diff --git a/src/content/generateContentImage.ts b/src/content/generateContentImage.ts index e0bc3de..aef1689 100644 --- a/src/content/generateContentImage.ts +++ b/src/content/generateContentImage.ts @@ -1,6 +1,5 @@ import fs from "node:fs/promises"; import { fal } from "@fal-ai/client"; -import { logger } from "@trigger.dev/sdk/v3"; import { logStep } from "../sandboxes/logStep"; import { DEFAULT_PIPELINE_CONFIG } from "./defaultPipelineConfig"; import { falSubscribe } from "./falSubscribe"; @@ -23,11 +22,14 @@ export async function generateContentImage({ faceGuideUrl, referenceImagePath, prompt, + additionalImageUrls, }: { /** Guide image URL — omit for templates that don't use an input image. */ faceGuideUrl?: string; referenceImagePath: string | null; prompt: string; + /** Extra image URLs (e.g. album covers, playlist covers) to pass to the model. 
*/ + additionalImageUrls?: string[]; }): Promise { const config = DEFAULT_PIPELINE_CONFIG; @@ -36,7 +38,7 @@ export async function generateContentImage({ if (faceGuideUrl) imageUrls.push(faceGuideUrl); if (referenceImagePath) { - logger.log("Uploading reference image to fal storage", { + logStep("Uploading reference image to fal storage", false, { path: referenceImagePath, }); const refBuffer = await fs.readFile(referenceImagePath); @@ -45,12 +47,23 @@ export async function generateContentImage({ imageUrls.push(refUrl); } + if (additionalImageUrls?.length) { + const unique = [...new Set(additionalImageUrls)]; + const deduped = unique.filter((url) => !imageUrls.includes(url)); + logStep("Adding additional image URLs", false, { + count: deduped.length, + urls: deduped.map((u) => u.slice(0, 80)), + }); + imageUrls.push(...deduped); + } + logStep("Generating image", false, { model: config.imageModel, promptLength: prompt.length, imageCount: imageUrls.length, hasFaceGuide: Boolean(faceGuideUrl), hasReferenceImage: Boolean(referenceImagePath), + hasAdditionalImages: Boolean(additionalImageUrls?.length), }); const result = await falSubscribe(config.imageModel, { @@ -71,7 +84,7 @@ export async function generateContentImage({ ); } - logger.log("Image generated", { imageUrl: imageUrl.slice(0, 80) }); + logStep("Image generated", false, { imageUrl: imageUrl.slice(0, 80) }); return imageUrl; } diff --git a/src/content/loadTemplate.ts b/src/content/loadTemplate.ts index c9bd437..c747d1f 100644 --- a/src/content/loadTemplate.ts +++ b/src/content/loadTemplate.ts @@ -1,9 +1,16 @@ import path from "node:path"; import fs from "node:fs/promises"; +import { z } from "zod"; import { logStep } from "../sandboxes/logStep"; import { resolveTemplatesDir } from "./resolveTemplatesDir"; import { loadJsonFile } from "./loadJsonFile"; +const styleGuideSchema = z.object({ + imagePrompt: z.string().default(""), + usesFaceGuide: z.boolean().default(true), + usesImageOverlay: 
z.boolean().default(false), +}).passthrough(); + /** * Template data loaded from the bundled templates directory. */ @@ -13,6 +20,8 @@ export interface TemplateData { imagePrompt: string; /** Whether this template uses the artist's face-guide for identity. Defaults to true. */ usesFaceGuide: boolean; + /** Whether attached images (playlist covers, logos) should be overlaid on the final video. Defaults to false. */ + usesImageOverlay: boolean; styleGuide: Record | null; captionGuide: Record | null; captionExamples: string[]; @@ -79,10 +88,12 @@ export async function loadTemplate(templateName: string): Promise logStep("loadTemplate: no reference images directory", false, { imagesDir }); } - // Read template-level fields from the style guide - const sg = styleGuide as Record | null; - const imagePrompt = (sg?.imagePrompt as string) ?? ""; - const usesFaceGuide = (sg?.usesFaceGuide as boolean) ?? true; + // Read and validate template-level fields from the style guide + const parsed = styleGuideSchema.safeParse(styleGuide ?? {}); + const sg = parsed.success ? 
parsed.data : { imagePrompt: "", usesFaceGuide: true, usesImageOverlay: false }; + const imagePrompt = sg.imagePrompt; + const usesFaceGuide = sg.usesFaceGuide; + const usesImageOverlay = sg.usesImageOverlay; logStep("loadTemplate: result summary", false, { template: templateName, @@ -94,12 +105,14 @@ export async function loadTemplate(templateName: string): Promise referenceImagesCount: referenceImagePaths.length, imagePromptLength: imagePrompt.length, usesFaceGuide, + usesImageOverlay, }); return { name: templateName, imagePrompt, usesFaceGuide, + usesImageOverlay, styleGuide, captionGuide, captionExamples, diff --git a/src/content/renderFinalVideo.ts b/src/content/renderFinalVideo.ts index c3a43c8..79a7464 100644 --- a/src/content/renderFinalVideo.ts +++ b/src/content/renderFinalVideo.ts @@ -6,140 +6,32 @@ import { join } from "node:path"; import { promisify } from "node:util"; import { logStep } from "../sandboxes/logStep"; import { fal } from "@fal-ai/client"; +import { buildFfmpegArgs } from "./buildFfmpegArgs"; +import { calculateCaptionLayout } from "./calculateCaptionLayout"; +import { stripEmoji } from "./stripEmoji"; +import { downloadOverlayImages } from "./downloadOverlayImages"; const execFileAsync = promisify(execFile); -/** - * Strips emoji and other non-ASCII characters that ffmpeg drawtext can't render. - */ -function stripEmoji(text: string): string { - return text - .replace(/[\u{1F000}-\u{1FFFF}]/gu, "") - .replace(/[\u{2600}-\u{27BF}]/gu, "") - .replace(/[\u{FE00}-\u{FE0F}]/gu, "") - .replace(/\s{2,}/g, " ") - .trim(); -} - -/** - * Wraps text to fit within a max character width per line. - * Breaks on word boundaries to avoid mid-word splits. 
- */ -function wrapText(text: string, maxCharsPerLine: number): string[] { - const words = text.split(" "); - const lines: string[] = []; - let currentLine = ""; - - for (const word of words) { - if (currentLine.length + word.length + 1 > maxCharsPerLine && currentLine.length > 0) { - lines.push(currentLine); - currentLine = word; - } else { - currentLine = currentLine ? `${currentLine} ${word}` : word; - } - } - if (currentLine) lines.push(currentLine); - - return lines; -} - -/** Video frame dimensions */ -const FRAME_WIDTH = 720; -const FRAME_HEIGHT = 1280; -/** Maximum portion of the frame height captions can use (40%) */ -const MAX_CAPTION_HEIGHT_RATIO = 0.4; -/** Minimum font size — below this the text is unreadable */ -const MIN_FONT_SIZE = 20; -/** Maximum font size */ -const MAX_FONT_SIZE = 42; -/** Bottom margin from the frame edge */ -const BOTTOM_MARGIN = 120; - -/** - * Calculates the optimal font size, line layout, and vertical position - * so captions never get cut off regardless of text length. - * - * Vertical positioning: - * - Short (1-3 lines): bottom of frame - * - Medium (4-6 lines): vertically centered - * - Long (7+ lines): starts from top area - * - * Font auto-shrinks if text doesn't fit within the available space. 
- */ -function calculateCaptionLayout(text: string): { - lines: string[]; - fontSize: number; - lineHeight: number; - /** "bottom" | "center" | "top" — determines where captions are placed */ - position: "bottom" | "center" | "top"; -} { - const maxHeight = FRAME_HEIGHT * MAX_CAPTION_HEIGHT_RATIO; - - let chosenLines: string[] = []; - let chosenFontSize = MIN_FONT_SIZE; - let chosenLineHeight = MIN_FONT_SIZE + 10; - - for (let fontSize = MAX_FONT_SIZE; fontSize >= MIN_FONT_SIZE; fontSize -= 2) { - const charsPerLine = Math.floor(FRAME_WIDTH * 0.85 / (fontSize * 0.55)); - const lineHeight = fontSize + 10; - const lines = wrapText(text, charsPerLine); - const totalHeight = lines.length * lineHeight; - - if (totalHeight <= maxHeight) { - chosenLines = lines; - chosenFontSize = fontSize; - chosenLineHeight = lineHeight; - break; - } - chosenLines = lines; - chosenFontSize = fontSize; - chosenLineHeight = lineHeight; - } - - // Determine position based on line count - let position: "bottom" | "center" | "top"; - if (chosenLines.length <= 3) { - position = "bottom"; - } else if (chosenLines.length <= 6) { - position = "center"; - } else { - position = "top"; - } - - return { lines: chosenLines, fontSize: chosenFontSize, lineHeight: chosenLineHeight, position }; -} - export interface RenderFinalVideoInput { - /** URL of the AI-generated video (16:9) */ videoUrl: string; - /** Raw mp3 bytes of the song */ songBuffer: Buffer; - /** Start time in the song to begin the audio clip (seconds) */ audioStartSeconds: number; - /** Duration of the clip (seconds) */ audioDurationSeconds: number; - /** Caption text to overlay on the video */ captionText: string; - /** Whether the video already has audio baked in (lipsync mode) */ hasAudio: boolean; + overlayImageUrls?: string[]; } export interface RenderFinalVideoOutput { - /** Data URL of the final rendered video */ - dataUrl: string; - /** MIME type */ + videoUrl: string; mimeType: string; - /** Size in bytes */ sizeBytes: number; } 
/** * Renders the final social post video using ffmpeg: - * 1. Downloads the AI-generated video - * 2. Crops 16:9 → 9:16 (portrait for TikTok/Reels) - * 3. Overlays audio clip from the song (unless lipsync mode) - * 4. Overlays caption text (white with black stroke, bottom center) - * 5. Returns the final video as a data URL + * crops 16:9 → 9:16, overlays audio + captions + images, uploads to fal.ai storage. */ export async function renderFinalVideo( input: RenderFinalVideoInput, @@ -149,27 +41,23 @@ export async function renderFinalVideo( const videoPath = join(tempDir, "input-video.mp4"); const audioPath = join(tempDir, "song.mp3"); - const captionPath = join(tempDir, "caption.txt"); const outputPath = join(tempDir, "final.mp4"); + let overlayPaths: string[] = []; try { - // Download the AI-generated video logStep("Downloading video for final render"); const videoResponse = await fetch(input.videoUrl); if (!videoResponse.ok) { throw new Error(`Failed to download video: ${videoResponse.status}`); } - const videoBuffer = Buffer.from(await videoResponse.arrayBuffer()); - await writeFile(videoPath, videoBuffer); - - // Write the song mp3 to disk + await writeFile(videoPath, Buffer.from(await videoResponse.arrayBuffer())); await writeFile(audioPath, input.songBuffer); - // Calculate adaptive caption layout (auto-shrinks font for long text) + overlayPaths = await downloadOverlayImages(input.overlayImageUrls ?? 
[], tempDir); + const cleanCaption = stripEmoji(input.captionText); const captionLayout = calculateCaptionLayout(cleanCaption); - // Build ffmpeg command const ffmpegArgs = buildFfmpegArgs({ videoPath, audioPath, @@ -178,150 +66,27 @@ export async function renderFinalVideo( audioStartSeconds: input.audioStartSeconds, audioDurationSeconds: input.audioDurationSeconds, hasAudio: input.hasAudio, + overlayImagePaths: overlayPaths, }); logStep("Running ffmpeg render", true, { - argCount: ffmpegArgs.length, hasAudio: input.hasAudio, - captionLength: input.captionText.length, + overlayCount: overlayPaths.length, }); await execFileAsync("ffmpeg", ffmpegArgs); - // Read the final video and upload to fal.ai storage (avoids base64 OOM) const finalBuffer = await readFile(outputPath); const sizeBytes = finalBuffer.length; - logStep("Final video rendered, uploading to fal.ai storage", true, { sizeBytes }); const videoFile = new File([finalBuffer], "final-video.mp4", { type: "video/mp4" }); const videoUrl = await fal.storage.upload(videoFile); - logStep("Final video uploaded to fal.ai storage", false, { videoUrl, sizeBytes }); - return { - videoUrl, - mimeType: "video/mp4", - sizeBytes, - }; + return { videoUrl, mimeType: "video/mp4", sizeBytes }; } finally { - // Clean up temp files - await Promise.all([ - unlink(videoPath).catch(() => undefined), - unlink(audioPath).catch(() => undefined), - unlink(captionPath).catch(() => undefined), - unlink(outputPath).catch(() => undefined), - ]); + const cleanupPaths = [videoPath, audioPath, outputPath, ...overlayPaths]; + await Promise.all(cleanupPaths.map((p) => unlink(p).catch(() => undefined))); } } - -/** - * Builds the ffmpeg arguments for the final render. - * - * What it does (matching Remotion SocialPost): - * 1. Center-crop 16:9 → 9:16 portrait - * 2. Overlay audio from song clip (skip if lipsync — audio already in video) - * 3. 
Overlay caption text (white, black stroke, bottom center) - */ -function buildFfmpegArgs({ - videoPath, - audioPath, - captionLayout, - outputPath, - audioStartSeconds, - audioDurationSeconds, - hasAudio, -}: { - videoPath: string; - audioPath: string; - captionLayout: { lines: string[]; fontSize: number; lineHeight: number; position: "bottom" | "center" | "top" }; - outputPath: string; - audioStartSeconds: number; - audioDurationSeconds: number; - hasAudio: boolean; -}): string[] { - const { lines, fontSize, lineHeight, position } = captionLayout; - - // Video filter: crop 16:9 → 9:16 (center crop) + scale to 720x1280 + caption lines - // Each line is a separate drawtext filter, centered horizontally, stacked from bottom - const cropFilter = "crop=ih*9/16:ih"; - const scaleFilter = "scale=720:1280"; - - const totalTextHeight = lines.length * lineHeight; - const borderWidth = Math.max(2, Math.round(fontSize / 14)); - - // Calculate the Y start based on position strategy - // "bottom" → text block sits near the bottom - // "center" → text block is vertically centered - // "top" → text block starts near the top - let blockStartY: number; - if (position === "bottom") { - blockStartY = FRAME_HEIGHT - BOTTOM_MARGIN - totalTextHeight; - } else if (position === "center") { - blockStartY = Math.round((FRAME_HEIGHT - totalTextHeight) / 2); - } else { - // "top" — start from upper area with some top margin - blockStartY = 180; - } - - // Build a drawtext filter for each line, centered horizontally - const captionFilters = lines.map((line, i) => { - // Escape ffmpeg drawtext special characters - const escaped = line - .replace(/\\/g, "\\\\\\\\") - .replace(/'/g, "\u2019") // replace apostrophe with curly quote - .replace(/:/g, "\\\\:") - .replace(/%/g, "%%%%") - .replace(/\n/g, " ") - .replace(/\r/g, ""); - - const yPos = blockStartY + (i * lineHeight); - - return [ - `drawtext=text='${escaped}'`, - `fontsize=${fontSize}`, - "fontcolor=white", - `borderw=${borderWidth}`, - 
"bordercolor=black", - "x=(w-tw)/2", - `y=${String(yPos)}`, - ].join(":"); - }); - - const videoFilter = [cropFilter, scaleFilter, ...captionFilters].join(","); - - const args = ["-y"]; - - if (hasAudio) { - // Lipsync mode: video already has audio, just crop + caption - args.push( - "-i", videoPath, - "-vf", videoFilter, - "-c:v", "libx264", - "-c:a", "aac", - "-pix_fmt", "yuv420p", - "-movflags", "+faststart", - "-shortest", - outputPath, - ); - } else { - // Normal mode: crop video + overlay song audio clip - args.push( - "-i", videoPath, - "-ss", String(audioStartSeconds), - "-t", String(audioDurationSeconds), - "-i", audioPath, - "-vf", videoFilter, - "-c:v", "libx264", - "-c:a", "aac", - "-map", "0:v:0", - "-map", "1:a:0", - "-pix_fmt", "yuv420p", - "-movflags", "+faststart", - "-shortest", - outputPath, - ); - } - - return args; -} diff --git a/src/content/resolveFaceGuide.ts b/src/content/resolveFaceGuide.ts index 3d5a0bd..3ab2e52 100644 --- a/src/content/resolveFaceGuide.ts +++ b/src/content/resolveFaceGuide.ts @@ -1,14 +1,20 @@ -import { fal } from "@fal-ai/client"; -import { logStep } from "../sandboxes/logStep"; -import { fetchImageFromUrl } from "./fetchImageFromUrl"; -import { fetchGithubFile } from "./fetchGithubFile"; +import { classifyImages } from "./classifyImages"; +import { fetchGitHubFaceGuide } from "./fetchGitHubFaceGuide"; + +export interface ResolveFaceGuideResult { + faceGuideUrl: string | null; + additionalImageUrls: string[]; +} /** - * Resolves the face guide URL for the content pipeline. - * Uses the first image from the images array if provided, - * otherwise fetches face-guide.png from the artist's GitHub repo. + * Resolves the face guide URL and additional image URLs for the content pipeline. + * + * Analyzes each image in the images array to detect faces: + * - The first face image becomes the face guide + * - Non-face images (album covers, playlist covers, etc.) 
become additional image URLs + * - If no face is found in images and usesFaceGuide is true, fetches face-guide.png from GitHub * - * @returns fal.ai storage URL, or null if template doesn't use face guide + * @returns faceGuideUrl (or null) and additionalImageUrls for the model */ export async function resolveFaceGuide({ usesFaceGuide, @@ -20,29 +26,16 @@ export async function resolveFaceGuide({ images: string[] | undefined; githubRepo: string; artistSlug: string; -}): Promise { - // Always pass attached images through (e.g. album cover art — no face-swap needed). - const imageUrl = images?.[0]; - if (imageUrl) { - return fetchImageFromUrl(imageUrl); - } - - if (!usesFaceGuide) return null; +}): Promise { + const { faceGuideUrl, additionalImageUrls } = images?.length + ? await classifyImages({ images, usesFaceGuide }) + : { faceGuideUrl: null, additionalImageUrls: [] }; - logStep("Fetching face-guide from GitHub"); - const buffer = await fetchGithubFile( - githubRepo, - `artists/${artistSlug}/context/images/face-guide.png`, - ); - if (!buffer) { - throw new Error(`face-guide.png not found for artist ${artistSlug}`); + // Fall back to GitHub face-guide if needed + if (usesFaceGuide && !faceGuideUrl) { + const fallbackUrl = await fetchGitHubFaceGuide(githubRepo, artistSlug); + return { faceGuideUrl: fallbackUrl, additionalImageUrls }; } - logStep("Uploading face-guide to fal.ai storage", true, { - sizeBytes: buffer.byteLength, - }); - const file = new File([buffer], "face-guide.png", { type: "image/png" }); - const falUrl = await fal.storage.upload(file); - logStep("Face-guide uploaded", false, { faceGuideUrl: falUrl }); - return falUrl; + return { faceGuideUrl, additionalImageUrls }; } diff --git a/src/content/stripEmoji.ts b/src/content/stripEmoji.ts new file mode 100644 index 0000000..66138cf --- /dev/null +++ b/src/content/stripEmoji.ts @@ -0,0 +1,11 @@ +/** + * Strips emoji and other non-ASCII characters that ffmpeg drawtext can't render. 
/** Code point ranges treated as emoji material (regex character-class source). */
const EMOJI_CLASS =
  "\\u{1F000}-\\u{1FFFF}\\u{2600}-\\u{27BF}\\u{FE00}-\\u{FE0F}\\u{20E3}";

/**
 * One emoji run: an emoji code point plus any zero-width joiners (U+200D)
 * attached to it, so ZWJ sequences vanish completely instead of leaving
 * invisible joiners behind.
 */
const EMOJI_RUN = new RegExp(
  `\\u{200D}*[${EMOJI_CLASS}][${EMOJI_CLASS}\\u{200D}]*`,
  "gu",
);

/**
 * Removes emoji from caption text before it is handed to ffmpeg drawtext.
 *
 * Stripped code points:
 * - U+1F000-U+1FFFF and U+2600-U+27BF
 * - variation selectors U+FE00-U+FE0F
 * - the combining keycap mark U+20E3
 * - zero-width joiners (U+200D) that are attached to a stripped emoji
 *
 * All other text, including non-ASCII letters and a zero-width joiner that is
 * not next to an emoji, is left untouched. Afterwards, runs of two or more
 * whitespace characters collapse to a single space and the result is trimmed.
 *
 * @param text - Raw caption text
 * @returns The text with emoji removed and whitespace normalised
 */
export function stripEmoji(text: string): string {
  return text
    .replace(EMOJI_RUN, "")
    .replace(/\s{2,}/g, " ")
    .trim();
}
finally.')", + "a thank you that feels real ('yall really streamed this enough for them to notice')", + "a reflection on what the release means ('wrote this song in my car now its on new music friday')", + "a call to the fans without being pushy ('if you played this on repeat this week... i felt it')" + ], + "examples_of_good_length": [ + "editorial. finally.", + "they put me on the playlist and i still cant sleep", + "wrote this in my bedroom now its on new music friday", + "yall really made this happen", + "i keep checking if its real" + ] +} diff --git a/src/content/templates/artist-release-editorial/references/captions/examples.json b/src/content/templates/artist-release-editorial/references/captions/examples.json new file mode 100644 index 0000000..594e2eb --- /dev/null +++ b/src/content/templates/artist-release-editorial/references/captions/examples.json @@ -0,0 +1,8 @@ +[ + "editorial. finally.", + "they put me on the playlist and i still cant sleep", + "wrote this in my bedroom now its on new music friday", + "yall really made this happen", + "i keep checking if its real", + "my song is on a playlist i used to listen to before i even started making music" +] diff --git a/src/content/templates/artist-release-editorial/references/images/ref-01.png b/src/content/templates/artist-release-editorial/references/images/ref-01.png new file mode 100644 index 0000000..87cf7ec Binary files /dev/null and b/src/content/templates/artist-release-editorial/references/images/ref-01.png differ diff --git a/src/content/templates/artist-release-editorial/references/images/ref-02.png b/src/content/templates/artist-release-editorial/references/images/ref-02.png new file mode 100644 index 0000000..d98382e Binary files /dev/null and b/src/content/templates/artist-release-editorial/references/images/ref-02.png differ diff --git a/src/content/templates/artist-release-editorial/references/images/ref-03.png b/src/content/templates/artist-release-editorial/references/images/ref-03.png 
new file mode 100644 index 0000000..f76fc52 Binary files /dev/null and b/src/content/templates/artist-release-editorial/references/images/ref-03.png differ diff --git a/src/content/templates/artist-release-editorial/references/images/ref-04.png b/src/content/templates/artist-release-editorial/references/images/ref-04.png new file mode 100644 index 0000000..f3d9040 Binary files /dev/null and b/src/content/templates/artist-release-editorial/references/images/ref-04.png differ diff --git a/src/content/templates/artist-release-editorial/style-guide.json b/src/content/templates/artist-release-editorial/style-guide.json new file mode 100644 index 0000000..5ef7954 --- /dev/null +++ b/src/content/templates/artist-release-editorial/style-guide.json @@ -0,0 +1,36 @@ +{ + "name": "artist-release-editorial", + "description": "Editorial promo featuring artist press photo with playlist covers and DSP branding — the kind of polished-but-organic visual an artist's team drops alongside a new release", + "usesFaceGuide": true, + "usesImageOverlay": true, + "customInstruction": "Generate a clean editorial-style press photo of the artist. The image should contain ONLY the artist — no text, no overlays, no graphics, no album art, no branding. Just a professional press photo that looks like it was taken for a magazine feature or editorial spread. Polished lighting but still feeling authentic and personal.", + "imagePrompt": "A professional editorial press photo of an artist. Clean, intentional lighting — soft key light with subtle rim light separation. The artist is posed naturally, looking directly at camera or slightly off-axis. The background is a solid or subtly textured surface (concrete wall, draped fabric, muted gradient) that does not distract from the subject. The mood is confident and polished but not sterile. Shot on a DSLR or medium format camera, shallow depth of field, cinematic color grade leaning warm or desaturated depending on the artist's aesthetic. 
The image contains ONLY the artist — no text, no graphics, no overlays.", + + "camera": { + "type": "DSLR or medium format, editorial portrait lens (85mm-135mm equivalent)", + "angle": "eye level or slightly below, straight on or slight three-quarter turn", + "quality": "clean, sharp focus on the subject, natural skin texture visible, cinematic color grade", + "focus": "subject tack sharp, background falls off smoothly with shallow depth of field" + }, + + "environment": { + "feel": "editorial — intentional, curated but not over-produced", + "lighting": "soft key light from the side, subtle fill, rim light for separation. Warm or cool depending on mood. Not flat studio lighting — has dimension and direction.", + "backgrounds": "solid or subtly textured — concrete, fabric drape, muted color wash, out-of-focus urban environment. Clean and uncluttered.", + "avoid": "busy backgrounds, cluttered sets, green screen look, over-saturated colors, heavy HDR, any text or graphics in the image" + }, + + "subject": { + "expression": "confident, intentional — not smiling for the camera but not stone-faced either. The look of someone who knows the music speaks for itself.", + "pose": "natural editorial pose — shoulders slightly turned, hands relaxed, not stiff or over-directed. Can be seated, standing, or leaning.", + "clothing": "stylish but authentic to the artist — not costume-y. Could be anything from a clean hoodie to tailored jacket depending on the artist's brand.", + "framing": "medium shot (waist up) or medium close-up (chest up)" + }, + + "realism": { + "priority": "the image must look like an actual editorial press photo, not AI-generated. 
Think Apple Music editorial playlist cover quality.", + "texture": "natural skin texture, fabric detail, subtle imperfections that make it feel real", + "imperfections": "a stray hair, slight asymmetry in pose, natural skin texture — not airbrushed smooth", + "avoid": "plastic skin, symmetrical face, uncanny valley eyes, overly saturated or HDR look, stock photo posing" + } +} diff --git a/src/content/templates/artist-release-editorial/video-moods.json b/src/content/templates/artist-release-editorial/video-moods.json new file mode 100644 index 0000000..38980c4 --- /dev/null +++ b/src/content/templates/artist-release-editorial/video-moods.json @@ -0,0 +1,10 @@ +[ + "quiet confidence, the calm after you find out your song made the playlist", + "proud but understated, like posting a W without needing to explain it", + "grateful energy, remembering everyone who played it on repeat", + "reflective, thinking about where you started vs where the song ended up", + "celebratory but cool, not popping bottles — just a knowing nod", + "determined, this placement is fuel not a finish line", + "surreal, still processing that it's real", + "warm, the feeling of your team hitting you with the screenshot" +] diff --git a/src/content/templates/artist-release-editorial/video-movements.json b/src/content/templates/artist-release-editorial/video-movements.json new file mode 100644 index 0000000..5a8085a --- /dev/null +++ b/src/content/templates/artist-release-editorial/video-movements.json @@ -0,0 +1,10 @@ +[ + "nearly still, only natural breathing — the artist holds the frame with quiet confidence", + "slow subtle head turn toward camera, locks eyes with the lens", + "the faintest nod, like acknowledging something only they understand", + "a slow exhale, shoulders drop slightly as if releasing tension", + "eyes drift down then slowly back up to camera, contemplative", + "jaw sets slightly, the look of someone who earned this", + "the corner of their mouth lifts barely — not a full 
/**
 * Wraps text into lines of at most `maxCharsPerLine` characters.
 *
 * Words are separated on any run of whitespace (spaces, tabs, newlines), so
 * repeated, leading, or trailing whitespace never produces empty words,
 * doubled spaces, or trailing spaces in the output. Words on the same line are
 * joined with a single space.
 *
 * A word is never split: a single word longer than `maxCharsPerLine` is placed
 * on its own line and will exceed the limit.
 *
 * @param text - The text to wrap.
 * @param maxCharsPerLine - Maximum number of characters allowed per line.
 * @returns The wrapped lines, in order. Empty or whitespace-only input yields
 *   an empty array.
 */
export function wrapText(text: string, maxCharsPerLine: number): string[] {
  // Split on whitespace runs and drop the empty tokens that leading/trailing
  // whitespace would otherwise leave behind.
  const words = text.split(/\s+/).filter((word) => word.length > 0);
  const lines: string[] = [];
  let currentLine = "";

  for (const word of words) {
    if (currentLine.length === 0) {
      // First word of a line is always accepted, even if it is over-long.
      currentLine = word;
    } else if (currentLine.length + 1 + word.length > maxCharsPerLine) {
      // The "+ 1" accounts for the joining space; the word does not fit.
      lines.push(currentLine);
      currentLine = word;
    } else {
      currentLine = `${currentLine} ${word}`;
    }
  }

  if (currentLine.length > 0) {
    lines.push(currentLine);
  }

  return lines;
}
additionalImageUrls : undefined, }); // --- Return result ---