import os
import re # For robust JSON cleaning
# Optimization for low VRAM and Windows
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["HF_HUB_DISABLE_SYMLINKS"] = "1"
import gradio as gr
import torch
import torch.nn as nn
import torch.optim as optim
from diffusers import AutoPipelineForText2Image, StableVideoDiffusionPipeline, UNet2DConditionModel, EulerDiscreteScheduler, AutoencoderKL
from transformers import AutoModelForCausalLM, AutoTokenizer
import requests
import json
import gc
import random
import time
import glob
import numpy as np
import imageio
import cv2
from PIL import Image
from wakepy import keep
from datetime import datetime
import shutil
# --- Constants & Config ---
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Models
# Switched to SDXL-Turbo (Native, faster, fixes config crash)
MODEL_SDXL = "stabilityai/sdxl-turbo"
MODEL_SVD_XT = "stabilityai/stable-video-diffusion-img2vid-xt-1-1"
MODEL_MOONDREAM = "vikhyatk/moondream2"
HF_TOKEN = ""
OLLAMA_URL = "http://localhost:11434/api/generate"
# Local Ollama model used for concept and shot-list prompting
OLLAMA_MODEL = "gemma3:4b"
# Global State
current_model = None
loaded_dream_models = {}
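# loaded_dream_models holds: "vae", "rnn", "guide" (model, tokenizer), "state" (current latent), "hidden" (LSTM state)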
stop_dreaming_flag = False # Flag to interrupt the loop
pending_guide_question = ""
guide_disabled_until = 0
# --- Helpers ---
def clean_json_text(text):
"""Robustly cleans LLM output to extract just the JSON."""
# Remove markdown code blocks
if "```" in text:
# pattern to find content between ```json and ``` or just ``` and ```
pattern = r"```(?:json)?\s*(.*?)\s*```"
match = re.search(pattern, text, re.DOTALL)
if match:
text = match.group(1)
# If no markdown, try to find the first { and last }
start = text.find("{")
end = text.rfind("}")
if start != -1 and end != -1:
text = text[start:end+1]
return text.strip()
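# Illustrative example:
#   clean_json_text('Sure! ```json\n{"hub": "A misty pier."}\n```') -> '{"hub": "A misty pier."}'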
# --- Memory Management ---
def cleanup_memory():
"""Force garbage collection and empty CUDA cache."""
gc.collect()
torch.cuda.empty_cache()
def unload_model(model):
"""Unloads a specific model from GPU memory."""
if model is not None:
del model
cleanup_memory()
def unload_dream_stack_only():
"""Aggressively clears VAE, RNN, and Guide models for expansion."""
global loaded_dream_models
# 1. Unload VAE/RNN
unload_model(loaded_dream_models.pop("vae", None))
unload_model(loaded_dream_models.pop("rnn", None))
# 2. Unload Guide
guide_tuple = loaded_dream_models.pop("guide", None)
if guide_tuple:
unload_model(guide_tuple[0]) # model
        # The tokenizer is lightweight; it is released by the dict clear below
loaded_dream_models.clear()
cleanup_memory()
print("🧹 Successfully cleared VAE/RNN/Guide stack for Builder.")
def unload_heavy_models():
"""Aggressively clears VRAM of the global current_model."""
global current_model
if current_model is not None:
print(f"🧹 Unloading {type(current_model).__name__}...")
unload_model(current_model)
current_model = None
# Also clear Dream models if we are building
if loaded_dream_models:
print("🧹 Clearing Dream Stack...")
loaded_dream_models.clear()
cleanup_memory()
# --- Neural Networks (The Brains) ---
class SimpleVAE(nn.Module):
"""The Eye: Compresses Reality (64x64) into Latents (32 dim)."""
def __init__(self):
super().__init__()
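        # Encoder: three stride-2 convs halve 64x64 -> 32 -> 16 -> 8 spatially,
        # ending at 128 channels, hence the 128*8*8 flattened size below.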
self.encoder = nn.Sequential(
nn.Conv2d(3, 32, 4, stride=2, padding=1), nn.ReLU(),
nn.Conv2d(32, 64, 4, stride=2, padding=1), nn.ReLU(),
nn.Conv2d(64, 128, 4, stride=2, padding=1), nn.ReLU(),
nn.Flatten()
)
self.fc_mu = nn.Linear(128*8*8, 32)
self.fc_logvar = nn.Linear(128*8*8, 32)
self.dec_fc = nn.Linear(32, 128*8*8)
self.decoder = nn.Sequential(
nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1), nn.ReLU(),
nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1), nn.ReLU(),
nn.ConvTranspose2d(32, 3, 4, stride=2, padding=1), nn.Sigmoid()
)
def reparameterize(self, mu, logvar):
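        # Reparameterization trick: z = mu + sigma * eps with eps ~ N(0, I),
        # which keeps sampling differentiable w.r.t. mu and logvar.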
std = torch.exp(0.5 * logvar)
eps = torch.randn_like(std)
return mu + eps * std
def forward(self, x):
h = self.encoder(x)
mu, logvar = self.fc_mu(h), self.fc_logvar(h)
z = self.reparameterize(mu, logvar)
h_dec = self.dec_fc(z).view(-1, 128, 8, 8)
return self.decoder(h_dec), mu, logvar
class SimpleRNN(nn.Module):
"""The Traveler: Predicts the next latent vector."""
def __init__(self):
super().__init__()
# INCREASED HIDDEN SIZE: 128 -> 256 for better motion modeling
self.rnn = nn.LSTM(32 + 3, 256, batch_first=True)
self.fc = nn.Linear(256, 32) # Must match the new hidden size
def forward(self, z, action, hidden=None):
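        # z: (B, 32) latent, action: (B, 3); concatenated to (B, 35) and given a
        # length-1 time dimension so the LSTM can carry hidden state across calls.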
input_combined = torch.cat([z, action], dim=1).unsqueeze(1)
out, hidden = self.rnn(input_combined, hidden)
pred_z = self.fc(out.squeeze(1))
return pred_z, hidden
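# Illustrative shape check (not run at import):
#   pred_z, hidden = SimpleRNN()(torch.zeros(1, 32), torch.zeros(1, 3))  # pred_z: (1, 32)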
# --- Tab 1: Build (The Cartography) ---
# --- History / Diversity Utils ---
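# history.jsonl format: one JSON object per line, e.g. {"timestamp": "<ISO-8601>", "description": "..."}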
def get_recent_history(limit=10):
"""Reads the last N entries from history.jsonl."""
history_file = os.path.join(PROJECT_ROOT, "history.jsonl")
if not os.path.exists(history_file):
return []
lines = []
try:
with open(history_file, "r", encoding="utf-8") as f:
lines = f.readlines()
recent_descs = []
for line in lines[-limit:]:
try:
entry = json.loads(line)
if "description" in entry:
recent_descs.append(entry["description"])
            except json.JSONDecodeError:
                continue
return recent_descs
except Exception as e:
print(f"History Read Error: {e}")
return []
def append_to_history(description):
"""Appends a new generation to history.jsonl."""
history_file = os.path.join(PROJECT_ROOT, "history.jsonl")
entry = {
"timestamp": datetime.now().isoformat(),
"description": description
}
try:
with open(history_file, "a", encoding="utf-8") as f:
f.write(json.dumps(entry) + "\n")
except Exception as e:
print(f"History Write Error: {e}")
def generate_trip_concept(vibe, custom_prompt=None):
"""Generates a rich location description."""
print(f"Trip Planner: Planning a '{vibe}' trip...")
# Logic to handle "Wander Further" context (passed via custom_prompt usually)
prompt_context = ""
if custom_prompt and "CONTEXT:" in custom_prompt:
prompt_context = custom_prompt
# Reset custom prompt so it doesn't get stuck
custom_prompt = None
# 1. --- HIGH PRIORITY: CUSTOM PROMPT ---
if custom_prompt and custom_prompt.strip():
prompt = f"""You are a creative travel agent describing a location to a potential traveler when they said they wanted: {custom_prompt}. Explain the location in one brief sentence (e.g. "It's a vast desert.","It's a metropolis."), then provide a vivid and atmospheric description of this specific place that a traveler would like to explore. Focus on visual details. Keep the description of the location under 50 words and respond ONLY with that description, no other explanation."""
# 2. --- SECOND PRIORITY: AUTONOMOUS/WANDER CONTEXT ---
elif vibe == "Autonomous":
# --- DIVERSITY ENFORCEMENT ---
recent_history = get_recent_history(10)
diversity_instruction = ""
if recent_history:
history_text = "\n".join([f"- {desc[:100]}..." for desc in recent_history])
diversity_instruction = f"""
CRITICAL INSTRUCTION: You must suggest a location for the traveler that is SUBSTANTIVELY DIFFERENT from these recent locations:
{history_text}
Do not repeat themes, biomes, color palettes, or atmospheres found in this list of locations you've suggested before. Be unique.
"""
print(f"Trip Planner: Enforcing diversity against {len(recent_history)} recent items.")
if prompt_context:
# Use context gathered from Wander Further Prep
prompt = f"""You are a creative travel agent describing a new location to a traveler. The traveler wants to leave their current location for a NEW area. Write a vivid and atmospheric description of the NEW area. You MUST begin by explaining the location in one brief sentence (e.g. "It's a vast desert.","It's a metropolis.","It's a beautiful beach.","It's a cruise ship."), then provide a vivid and atmospheric description of this location that a traveler would like to explore. Focus on visual details. Keep the description of the location under 50 words and respond ONLY with that description, no other explanation.
CURRENT LOCATION: {prompt_context}
{diversity_instruction}"""
else:
# Default Autonomous prompt
prompt = f"""You are a creative travel agent describing a location to a potential traveler. Everyone loves to visit a relaxing beach, or perhaps a fantasy forest, or a futuristic city, or even visit onboard a cruise ship. You MUST begin by explaining the location in one brief sentence (e.g. "It's a vast desert.","It's a metropolis.","It's a beautiful beach.","It's a cruise ship."), then provide a vivid and atmospheric description of this location that a traveler would like to explore. Focus on visual details. Keep the description of the location under 50 words and respond ONLY with that description, no other explanation.
{diversity_instruction}"""
# 3. --- LOWEST PRIORITY: VIBE DROPDOWN ---
else:
prompt = f"""You are a creative travel agent describing a location to a potential traveler when they said they wanted this vibe: {vibe}. You MUST begin by explaining the location in one brief sentence (e.g. "It's a vast desert.","It's a metropolis.","It's a beautiful beach.","It's a cruise ship."), then provide a vivid and atmospheric description of this specific vibe that a traveler would like to explore. Focus on visual details. Keep the description of the location under 50 words and respond ONLY with that description, no other explanation."""
try:
response = requests.post(OLLAMA_URL, json={
"model": OLLAMA_MODEL, "prompt": prompt, "stream": False,
"options": {"num_gpu": 0, "temperature": 0.9}
})
response.raise_for_status()
location_desc = response.json()['response'].strip()
print(f"Trip Planner: Location found - {location_desc[:50]}...")
# --- LOGGING ---
if vibe == "Autonomous":
append_to_history(location_desc)
return location_desc
except Exception as e:
print(f"Trip Planner Error: {e}")
return "A mysterious, foggy void with faint neon lights in the distance."
def generate_shot_list(location_desc):
"""Generates a JSON shot list with SPATIAL logic."""
print("Travel Agent: Generating shot list...")
# 1. THE NEW PROMPT (Compass Logic + Context Anchoring)
system_prompt = """You are a technical director for a virtual world.
Your task is to generate a strict JSON shot list based on a location description. You must use the exact keys provided and you must provide a visual shot description for each key.
### CRITICAL RULES for describing every shot:
1. **Spatial Logic:** Imagine standing in the center.
* **hub**: The 360-degree establishing shot.
* **forward**: North. **back**: South. **left**: West. **right**: East.
2. **Lighting Consistency:** If Sun is Forward, Back must be backlit/bright.
3. **Landmark Logic:** If a mountain is Forward, it CANNOT be in the Back shot.
4. **LENGTH CONSTRAINT:** Each of your shot descriptions must be 50 words or less. Since the hub forms the basis of a further description for at least the forward shot, the hub description must be less than 31 words.
5. **CONTEXT RETENTION (MOST IMPORTANT):** * You MUST repeat the location name/theme in EVERY description.
* **IMPORTANT:** The "forward" and "back" shots are EXPLORATION shots. They must describe the visual scene and then "Camera moves forward".
* **Formula for forward:** "forward": "[Copy Hub Description]. Camera moves forward..."
* **Formula for back:** "back": "[Description of a different view facing South]. Camera moves forward..." (Do not indicate turning. We're faced in the opposite direction and are moving forward into the area behind us.)
* **Formula for left/right:** Use specific panning language: "Camera pans from center to left" or "Camera pans from center to right", and describe the visuals that are revealed in the scene.
### OUTPUT FORMAT:
Return ONLY a raw JSON object. Keys: "hub", "forward", "left", "right", "back".
In the below example:
- Note the shorter description for hub, which is then copied to forward, which then adds camera movement.
- Other shots DON'T copy/paste the hub description, but beautifully describe the scene.
- Hub description is under 30 words and all descriptions are under 50 words.
- Left and Right include direction for clear camera motion.
- Forward and Back indicate direction for camera moving forward, but they never mention turning or panning.
{
"hub": "Golden hour in a vast Desert Canyon. Towering red cliffs frame a sandy floor. The low sun casts long, dramatic shadows.",
"forward": "Golden hour in a vast Desert Canyon. Towering red cliffs frame a sandy floor. The low sun casts long, dramatic shadows. Camera moves forward through the Canyon directly toward the blinding sun.",
"back": "Facing south in a vast Desert Canyon, away from the sun. The rock walls are fully illuminated in vibrant orange and red light against a deep blue sky. Camera moves forward exploring the illuminated path.",
"left": "Panning West in a vast Desert Canyon. The cliff wall is side-lit, revealing the deep, rough texture of the red rock. Camera pans from center to left, revealing the cactus shadows.",
"right": "Panning East in a vast Desert Canyon. The canyon widens here. The uneven terrain is highlighted by grazing light. Camera pans from center to right, showing the widening path."
}
"""
full_prompt = f"{system_prompt}\n\nLOCATION: {location_desc}\n\nNow return your visually described shot list in JSON format."
try:
response = requests.post(OLLAMA_URL, json={
"model": OLLAMA_MODEL, "prompt": full_prompt, "stream": False,
"options": {"num_gpu": 0, "temperature": 0.7, "format": "json"}
})
response.raise_for_status()
raw_text = response.json()['response'].strip()
cleaned_text = clean_json_text(raw_text)
return json.loads(cleaned_text)
except Exception as e:
print(f"Travel Agent Error: {e}")
return {
"hub": f"Wide shot of {location_desc}",
"forward": f"Moving forward in {location_desc}",
"left": f"Looking left in {location_desc}",
"right": f"Looking right in {location_desc}",
"back": f"Looking back in {location_desc}"
}
# --- Media Utils ---
def save_video_frames(frames, output_path, fps=24):
try:
np_frames = [np.array(frame) for frame in frames]
imageio.mimwrite(output_path, np_frames, fps=fps, codec="libx264", quality=9)
print(f"Saved video to {output_path}")
except Exception as e:
print(f"ERROR: Failed to save video frames: {e}")
# --- Part 1: Images Only (SDXL) ---
def generate_images(shot_list, world_name="my_world", progress=gr.Progress()):
"""Phase 1: Generates the 5 static images using SDXL."""
global current_model
timestamp = int(time.time())
job_dir = os.path.join(OUTPUT_DIR, world_name, f"room_{timestamp}")
os.makedirs(job_dir, exist_ok=True)
# Empty state for video placeholders
yield None, None, None, None, None, "🎨 Loading Image Generator...", job_dir
# 1. Load SDXL
if current_model is None or not hasattr(current_model, "text_encoder") or "LTX" in str(type(current_model)):
unload_model(current_model)
try:
print("Loading SDXL-Turbo...")
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
pipe = AutoPipelineForText2Image.from_pretrained(MODEL_SDXL, vae=vae, torch_dtype=torch.float16, variant="fp16")
pipe.to("cuda")
current_model = pipe
except Exception as e:
print(f"Error: {e}")
yield None, None, None, None, None, f"❌ Error: {e}", None
return
# 2. Generate Images
images = {}
directions = ["hub", "forward", "left", "right", "back"]
for i, direction in enumerate(directions):
progress((i/5), desc=f"Painting {direction.upper()}...")
prompt = shot_list.get(direction, f"A view looking {direction}")
img = current_model(prompt, num_inference_steps=2, guidance_scale=0.0).images[0]
img_path = os.path.join(job_dir, f"{direction}.png")
img.save(img_path)
images[direction] = img_path
# Yield Hub immediately when done
if direction == "hub":
yield images["hub"], None, None, None, None, f"✅ Saved {direction.upper()}", job_dir
# 3. Clean up SDXL before Videos start
print("🛑 Unloading SDXL to free VRAM for LTX...")
unload_heavy_models() # Clears global
if 'pipe' in locals(): del pipe # Clears local
gc.collect()
torch.cuda.empty_cache()
# Final Yield: Hub is visible, Videos are empty, passing job_dir to next step
yield images.get("hub"), None, None, None, None, "✅ Images Complete. Starting Video Engine...", job_dir
# --- Part 2: Videos Only (LTX) ---
def generate_videos(job_dir, shot_list, progress=gr.Progress()):
"""Phase 2: Generates the videos using LTX 0.9.8. Does NOT output to Hub Image."""
global current_model
if not job_dir or not os.path.exists(job_dir):
yield None, None, None, None, "❌ Error: Job Directory missing."
return
# 1. Load LTX (Turbo 0.9.8)
MODELS_ROOT = r"C:\pinokio\api\worldmAIker" # replace with your project root
LOCAL_LTX_098_FILE = os.path.join(MODELS_ROOT, "models", "ltxv-2b-0.9.8-distilled.safetensors")
LOCAL_T5_PATH = os.path.join(MODELS_ROOT, "models", "t5")
try:
from diffusers import LTXImageToVideoPipeline
from transformers import T5EncoderModel, AutoTokenizer
progress(0, desc="Loading LTX Engine...")
# Load Components (Offline optimized)
try:
tokenizer = AutoTokenizer.from_pretrained("Lightricks/LTX-Video", subfolder="tokenizer", local_files_only=True)
        except Exception:
tokenizer = AutoTokenizer.from_pretrained("Lightricks/LTX-Video", subfolder="tokenizer")
text_encoder = T5EncoderModel.from_pretrained(LOCAL_T5_PATH, torch_dtype=torch.float16, local_files_only=True, low_cpu_mem_usage=True)
# Load Pipeline
pipe = LTXImageToVideoPipeline.from_single_file(
LOCAL_LTX_098_FILE,
text_encoder=text_encoder,
tokenizer=tokenizer,
torch_dtype=torch.float16,
original_config_file=None
)
pipe.enable_sequential_cpu_offload() # 6GB Safety
current_model = pipe
except Exception as e:
yield None, None, None, None, f"❌ Error loading LTX: {e}"
return
# 2. Generate Videos
nav_keys = ["hub", "forward", "left", "right", "back"]
TOTAL_STEPS = 8 # Turbo
video_paths = {}
for direction in nav_keys:
img_path = os.path.join(job_dir, f"{direction}.png")
if not os.path.exists(img_path): continue
# Callback for Progress Bar
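        # diffusers invokes callback_on_step_end after each denoising step and
        # expects the callback_kwargs dict back; we only update the progress bar.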
def progress_callback(pipe, step_index, timestep, callback_kwargs):
p = (step_index + 1) / TOTAL_STEPS
progress(p, desc=f"🎥 {direction.upper()} Step {step_index+1}/{TOTAL_STEPS}")
return callback_kwargs
source_img = Image.open(img_path).resize((768, 512))
prompt = shot_list.get(direction, f"Camera moves {direction}")
# --- Hub Parameter Overrides ---
noise_scale = 0.025 # Default noise for movement
guidance = 3.0 # Default guidance
if direction == "hub":
# Override for static anchor: high guidance, low noise
guidance = 10.0 # Stick closely to the source image
noise_scale = 0.005 # Minimize motion and flicker
frames = pipe(
image=source_img,
prompt=prompt,
height=512,
width=768,
num_frames=73,
num_inference_steps=TOTAL_STEPS,
guidance_scale=guidance,
decode_timestep=0.03,
decode_noise_scale=noise_scale, # Uses the overridden scale
callback_on_step_end=progress_callback
).frames[0]
output_path = os.path.join(job_dir, f"{direction}.mp4")
save_video_frames(frames, output_path, fps=24)
video_paths[direction] = output_path
# Yield ONLY the videos + Status (Hub Image is untouched)
yield (
video_paths.get("forward"), video_paths.get("left"), video_paths.get("right"), video_paths.get("back"),
f"✅ Finished {direction.upper()}"
)
# 3. Cleanup
unload_heavy_models()
if 'pipe' in locals(): del pipe
gc.collect()
torch.cuda.empty_cache()
yield (
video_paths.get("forward"), video_paths.get("left"), video_paths.get("right"), video_paths.get("back"),
"✨ World Render Complete (Turbo 0.9.8)!"
)
def condense_video(input_path, output_dir, max_seconds=10):
"""
Reads a video and saves a condensed version by sampling frames, aiming
for max_seconds length. Returns the path to the new, condensed video.
"""
    # Note: the writer keeps the source FPS; condensing drops frames rather than slowing playback.
# 1. Open the video file
cap = cv2.VideoCapture(input_path)
if not cap.isOpened():
print(f"ERROR: Could not open video {input_path}")
return input_path # Return original path on failure
original_fps = cap.get(cv2.CAP_PROP_FPS)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# Target frame count for max_seconds
target_frames = int(max_seconds * original_fps)
if frame_count <= target_frames:
cap.release()
return input_path # Video is already short enough
# 2. Calculate the sampling interval (e.g., if 2000 frames -> 2000/240 = 8.33, so interval is 8)
interval = max(1, frame_count // target_frames) # Ensure interval is at least 1
# 3. Prepare output video writer
output_filename = os.path.basename(input_path).replace(".mp4", "_CONDENSED.mp4")
output_path = os.path.join(output_dir, output_filename)
# Use the original video properties for fidelity
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
writer = cv2.VideoWriter(output_path, fourcc, original_fps,
(int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))))
# 4. Sample and write frames
for i in range(frame_count):
ret, frame = cap.read()
if not ret: break
if i % interval == 0:
writer.write(frame)
cap.release()
writer.release()
print(f"Condensed video saved: {output_path} (Approx {target_frames} frames)")
return output_path
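# Illustrative usage (paths hypothetical): shorten a long clip to ~10s before further processing.
#   short = condense_video(os.path.join(job_dir, "forward.mp4"), job_dir, max_seconds=10)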
# --- Tab 2: Transport (The trainer) ---
def train_world_model(world_name, progress=gr.Progress()):
"""Trains the VAE and RNN using Velocity (Delta) Learning."""
print(f"Driver: Training World Model for '{world_name}'...")
unload_heavy_models()
world_root = os.path.join(OUTPUT_DIR, world_name)
if not os.path.exists(world_root):
return "World not found!"
# --- 1. DATA GATHERING ---
training_sequences = []
video_files = glob.glob(os.path.join(world_root, "**", "*.mp4"), recursive=True)
print(f"Found {len(video_files)} videos for training.")
for vid_path in video_files:
filename = os.path.basename(vid_path).lower()
        # --- Action Vectors (scaled to magnitude 2: strong enough to learn, smooth enough in motion) ---
if "forward" in filename:
action_vec = [2.0, 0.0, 0.0] # Positive Momentum
elif "left" in filename:
action_vec = [0.0, 2.0, 0.0] # Rotational Change
elif "right" in filename:
action_vec = [0.0, 0.0, 2.0] # Rotational Change
elif "back" in filename:
# Negative Momentum (Reverse)
action_vec = [-2.0, 0.0, 0.0]
elif "hub" in filename:
# Null Action (Idle/Anchor)
action_vec = [0.0, 0.0, 0.0]
else:
# Fallback for unexpected files - treat as idle
action_vec = [0.0, 0.0, 0.0]
frames = []
cap = cv2.VideoCapture(vid_path)
while True:
ret, frame = cap.read()
if not ret: break
frame = cv2.resize(frame, (64, 64))
frames.append(frame)
cap.release()
if len(frames) > 1:
training_sequences.append((frames, action_vec))
if not training_sequences:
return "No training videos found! Generate a world first."
# --- 2. MODEL INITIALIZATION ---
vae_path = os.path.join(world_root, "vae.pth")
rnn_path = os.path.join(world_root, "rnn.pth")
vae = SimpleVAE().to("cuda")
rnn = SimpleRNN().to("cuda")
# Keep VAE if exists, but RETRAIN RNN FROM SCRATCH
if os.path.exists(vae_path):
try:
vae.load_state_dict(torch.load(vae_path))
print("Loaded existing VAE.")
        except Exception:
            print("Could not load existing VAE; starting from scratch.")
optimizer_vae = torch.optim.Adam(vae.parameters(), lr=1e-3)
optimizer_rnn = torch.optim.Adam(rnn.parameters(), lr=1e-3)
# --- 3. TRAINING LOOP ---
epochs = 300
# Flatten all frames for VAE
all_frames_rgb = []
for seq, _ in training_sequences:
all_frames_rgb.extend([cv2.cvtColor(f, cv2.COLOR_BGR2RGB) for f in seq])
vae_data = torch.tensor(np.array(all_frames_rgb)).permute(0, 3, 1, 2).float() / 255.0
vae_data = vae_data.to("cuda")
batch_size = 32
for epoch in progress.tqdm(range(epochs), desc="Transporting the traveler..."):
# A. TRAIN VAE (Standard + Structural Loss)
perm = torch.randperm(vae_data.size(0))
epoch_loss_vae = 0
for i in range(0, vae_data.size(0), batch_size):
batch = vae_data[perm[i:i+batch_size]]
recon, mu, logvar = vae(batch)
# Reconstruction Loss
recon_loss = torch.nn.functional.mse_loss(recon, batch, reduction='sum')
# KL Divergence Loss (Higher weight for stabilization)
kld_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
            # Total loss: reconstruction + KL regularizer (structural loss removed; KL weight 0.005)
loss_vae = recon_loss + (kld_loss * 0.005)
optimizer_vae.zero_grad()
loss_vae.backward()
optimizer_vae.step()
epoch_loss_vae += loss_vae.item()
# B. TRAIN RNN (VELOCITY / DELTA LEARNING)
epoch_loss_rnn = 0
for frames, action_vec in training_sequences:
seq_rgb = [cv2.cvtColor(f, cv2.COLOR_BGR2RGB) for f in frames]
seq_tensor = torch.tensor(np.array(seq_rgb)).permute(0, 3, 1, 2).float() / 255.0
seq_tensor = seq_tensor.to("cuda")
with torch.no_grad():
_, mu, _ = vae(seq_tensor)
if len(mu) > 1:
current_z = mu[:-1] # Input
next_z = mu[1:] # Reality
# TARGET is the DIFFERENCE (Velocity)
target_delta = next_z - current_z
action_tensor = torch.tensor([action_vec] * len(current_z)).float().to("cuda")
# RNN predicts the DELTA
predicted_delta, _ = rnn(current_z, action_tensor)
loss_rnn = torch.nn.functional.mse_loss(predicted_delta, target_delta)
optimizer_rnn.zero_grad()
loss_rnn.backward()
optimizer_rnn.step()
epoch_loss_rnn += loss_rnn.item()
if epoch % 10 == 0:
print(f"Epoch {epoch}: VAE Loss={epoch_loss_vae:.2f} | RNN Loss={epoch_loss_rnn:.4f}")
torch.save(vae.state_dict(), vae_path)
torch.save(rnn.state_dict(), rnn_path)
del vae, rnn, vae_data, all_frames_rgb
cleanup_memory()
return f"Training Complete! Learned VELOCITY on {len(training_sequences)} paths."
# --- Tab 3: Explore (The Dream) ---
def load_dream_stack(world_name):
global loaded_dream_models, current_model
# 1. CHECK: If the brain is already loaded, STOP HERE.
if "vae" in loaded_dream_models and "rnn" in loaded_dream_models:
return True
# 2. If we are here, the brain is missing. Time to load.
    # Clean up the Video Generator (Builder) if it's running
if current_model is not None:
print(f"🧹 Unloading Builder ({type(current_model).__name__}) to make room for Dreamer...")
unload_model(current_model)
current_model = None
gc.collect()
torch.cuda.empty_cache()
world_dir = os.path.join(OUTPUT_DIR, world_name)
vae_path = os.path.join(world_dir, "vae.pth")
# Explicitly check VAE file existence before loading
if not os.path.exists(vae_path):
print(f"ERROR: VAE brain file not found at {vae_path}")
        return False  # Brain file missing; train the world first
print("🧠 Loading The Eye & Traveler...")
vae = SimpleVAE().to("cuda")
# Explicitly map model location to 'cuda' on load
vae.load_state_dict(torch.load(vae_path, map_location='cuda'))
rnn = SimpleRNN().to("cuda")
# Explicitly map model location to 'cuda' on load
rnn.load_state_dict(torch.load(os.path.join(world_dir, "rnn.pth"), map_location='cuda'))
loaded_dream_models["vae"] = vae
loaded_dream_models["rnn"] = rnn
# --- HUB SEEDING (Run only once per load) ---
print(" 📍 Seeding dream with Hub Image...")
hub_files = glob.glob(os.path.join(world_dir, "**", "hub.png"), recursive=True)
if hub_files:
img = cv2.imread(hub_files[0])
img = cv2.resize(img, (64, 64))
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
img_tensor = torch.tensor(img).float().permute(2, 0, 1).unsqueeze(0).to("cuda") / 255.0
with torch.no_grad():
_, mu, _ = vae(img_tensor)
loaded_dream_models["state"] = mu
else:
loaded_dream_models["state"] = torch.randn(1, 32).to("cuda")
loaded_dream_models["hidden"] = None
# Load Guide if needed
if "guide" not in loaded_dream_models:
print("👁️ Loading The Guide (Moondream)...")
try:
model = AutoModelForCausalLM.from_pretrained(
MODEL_MOONDREAM,
trust_remote_code=True,
torch_dtype=torch.float16,
local_files_only=True
).to("cuda")
        except Exception:
print(" 🌐 Downloading Moondream...")
model = AutoModelForCausalLM.from_pretrained(
MODEL_MOONDREAM,
trust_remote_code=True,
torch_dtype=torch.float16
).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(MODEL_MOONDREAM)
loaded_dream_models["guide"] = (model, tokenizer)
    return True  # VAE, RNN, state, and Guide are all ready
def dream_step(world_name, action_name):
"""The Autonomy Loop Step using Velocity predictions, now with Latent Clamping."""
try:
load_dream_stack(world_name)
vae = loaded_dream_models["vae"]
rnn = loaded_dream_models["rnn"]
current_z = loaded_dream_models["state"]
hidden = loaded_dream_models["hidden"]
# Action Map (Scaled Up for Impact)
act_map = {"forward": [2,0,0], "left": [0,2,0], "right": [0,0,2], "back": [-2,0,0]}
# Assign action_vec based on the action_name passed in
action_vec = act_map.get(action_name, [2,0,0])
action = torch.tensor([action_vec]).float().to("cuda")
# --- STABILITY TUNING (Dynamic Momentum) ---
DILATION_STEPS = 75
        TEMPERATURE = 0.25  # Latent noise scale; raise for more movement, lower for stability
LATENT_CLAMP = 3.0
MOMENTUM_FACTOR = 1.05 # Default low momentum for smooth, coherent travel
# Phase 1 ESCAPE Logic: Override momentum for the first 25 steps (managed by autonomous_dream_loop)
if action_name == "ESCAPE":
MOMENTUM_FACTOR = 1.8 # High momentum for quick break/traversal
TEMPERATURE = 0.5 # HIGH noise for max exploration
with torch.no_grad():
for _ in range(DILATION_STEPS):
# 1. RNN predicts VELOCITY (Delta)
delta, hidden = rnn(current_z, action, hidden)
# 2. Apply Momentum
amplified_delta = delta * MOMENTUM_FACTOR
# 3. Apply Amplified Velocity
next_z = current_z + amplified_delta
# 4. Add Noise
noise = torch.randn_like(next_z) * TEMPERATURE
current_z = next_z + noise
# 5. CLAMP LATENTS: Prevents collapse by enforcing bounds
current_z = torch.clamp(current_z, -LATENT_CLAMP, LATENT_CLAMP)
decoded_img = vae.decoder(vae.dec_fc(current_z).view(-1, 128, 8, 8))
loaded_dream_models["state"] = current_z
loaded_dream_models["hidden"] = hidden
img_np = decoded_img.squeeze().permute(1, 2, 0).cpu().numpy()
img_np = np.clip(img_np * 255, 0, 255).astype(np.uint8)
pil_img = Image.fromarray(img_np).resize((512, 512), resample=Image.NEAREST)
return pil_img
except Exception as e:
print(f"Dream Step Error: {e}")
return Image.new("RGB", (512, 512), "black")
def inject_manual_question(question, current_history):
"""Saves the question to a global state for the next autonomous cycle."""
global pending_guide_question
# 1. Save the question
pending_guide_question = question
# 2. Update the chat with a confirmation message.
# We must replace the "Guide is thinking..." placeholder from pre_update_history.
if current_history and current_history[-1][1] == "Guide is thinking...":
current_history[-1][1] = "Guide: **Question received.** I'll comment on this during the next cycle."
    # Return history unchanged plus gr.update() to leave the input box untouched
return current_history, gr.update()
def ask_guide(image_input, question="What do you see?"):
"""Uses Moondream2 to analyze the current dream frame."""
if image_input is None: return "I see nothing."
if "guide" in loaded_dream_models:
model, tokenizer = loaded_dream_models["guide"]
# --- Robustly handle Gradio Input ---
if isinstance(image_input, str):
image = Image.open(image_input).convert("RGB") # Force RGB on file load
elif isinstance(image_input, dict) and 'name' in image_input:
image = Image.open(image_input['name']).convert("RGB") # Force RGB on file load
elif isinstance(image_input, Image.Image):
image = image_input.convert("RGB") # Ensure it's RGB if it's already PIL
# Handle NumPy Input
elif isinstance(image_input, np.ndarray):
# 1. Convert numpy to PIL
image = Image.fromarray(image_input)
# 2. FORCE conversion to RGB for Moondream compatibility
image = image.convert("RGB")
else:
print(f"Guide Input Error: Unrecognized image input type: {type(image_input)}")
return "Guide unavailable: Image format error."
enc_image = model.encode_image(image)
answer = model.answer_question(enc_image, question, tokenizer)
return answer
return "Guide unavailable: Model not loaded."
# --- Logic: Wander Further / Expansion ---
def get_last_stable_frame(world_name):
"""
Retrieves the FINAL frame from a weighted random direction in the most recent room.
This creates branching paths (Shotgun/Tree structure) instead of a linear line.
"""
world_dir = os.path.join(OUTPUT_DIR, world_name)
if not os.path.exists(world_dir): return None
# 1. Find the latest room folder
subdirs = [os.path.join(world_dir, d) for d in os.listdir(world_dir)]
rooms = [d for d in subdirs if os.path.isdir(d) and "room_" in os.path.basename(d)]
if not rooms: return None
rooms.sort(key=lambda x: os.path.basename(x))
latest_room = rooms[-1]
# 2. Weighted Random Direction Selection
# We prefer forward momentum, but allow branching.
options = ["forward", "left", "right", "back"]
    weights = [0.7, 0.1, 0.1, 0.1]  # 70% forward, 10% each for left/right/back
# Check which actually exist (just in case)
available_videos = []
available_weights = []
for opt, w in zip(options, weights):
path = os.path.join(latest_room, f"{opt}.mp4")
if os.path.exists(path):
available_videos.append(path)
available_weights.append(w)
if not available_videos: return None
# Normalize weights if some files are missing
total_w = sum(available_weights)
norm_weights = [w / total_w for w in available_weights]
# Pick the winner
video_path = np.random.choice(available_videos, p=norm_weights)
direction_name = os.path.basename(video_path).replace(".mp4", "")
print(f"🌍 Expansion Direction: {direction_name.upper()} (from {video_path})")
# 3. Extract the VERY LAST frame
try:
cap = cv2.VideoCapture(video_path)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count - 1)
ret, frame = cap.read()
cap.release()
if ret:
# Convert OpenCV (BGR) to PIL (RGB)
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
return Image.fromarray(frame_rgb)
except Exception as e:
print(f"Error extracting video frame: {e}")
return None
def cleanup_dream_stack():
"""Unloads ALL Dream Stack models (VAE/RNN/Guide) aggressively."""
# This is essentially a re-run of unload_dream_stack_only
# but as a public function for explicit chaining.
global loaded_dream_models, stop_dreaming_flag
stop_dreaming_flag = True # Ensure loop is dead
unload_dream_stack_only() # The aggressive cleanup logic
loaded_dream_models.clear()
cleanup_memory() # Force GC and CUDA cache clear
print("✨ Dream Stack confirmed clear. Ready for Builder.")
return "Dream Stack Cleared." # Return status message
def get_room_count(world_name):
"""Counts the number of successfully created room folders."""
world_dir = os.path.join(OUTPUT_DIR, world_name)
if not os.path.exists(world_dir):
return 0
# Count directories starting with "room_"
rooms = [d for d in os.listdir(world_dir) if os.path.isdir(os.path.join(world_dir, d)) and "room_" in d]
# Return a minimum of 1 if the world directory exists (for the initial room)
return max(1, len(rooms))
def wander_further_prep(world_name):
"""
Step 1 of Wander Further:
Stops dream, unloads heavy dream models, looks at LAST REALITY frame, generates new concept.
"""
global stop_dreaming_flag
stop_dreaming_flag = True
time.sleep(1) # Give loop time to die
# Use stable video frame
last_img_pil = get_last_stable_frame(world_name)
if last_img_pil is None:
return "No stable reality found (no videos). Cannot wander further.", None
# 1. Ask Guide what this is (before unloading!)
try:
load_dream_stack(world_name) # Ensure loaded
desc = ask_guide(last_img_pil, "You are a seasoned traveler. Describe the **SINGLE, PRIMARY LOCATION** shown in the image with vivid, sensory details including lighting. Use less than 50 words. Do not list multiple places or objects that contradict each other, unless they clearly appear in the place. Even if the place seems abstract, confidently identify the main terrain and mood.")
# 2. Unload Dream Stack to free VRAM for Builder
unload_dream_stack_only()
# 3. Create context for Trip Planner
context_prompt = f"CONTEXT: We are currently at the edge of this area: {desc}. We want to move beyond this."
return f"Leaving: {desc}", context_prompt
except Exception as e:
print(f"Wander Prep Error: {e}")