#!/usr/bin/env python3
# Copyright (c) 2026 CyberViser. All Rights Reserved.
# Licensed under the CyberViser Proprietary License — see LICENSE for details.
"""
Hancock Fine-Tuning — Modal.com GPU Runner
CyberViser | Free tier: $30/month credits (~32 hours A10G)

Setup (one-time):
    pip install modal
    modal token new                        # creates ~/.modal.toml
    modal secret create cyberviser-secrets \\
        HF_TOKEN=hf_xxx \\
        NVIDIA_API_KEY=nvapi-xxx

Run:
    modal run train_modal.py               # full training run
    modal run train_modal.py --dry-run     # validate setup only
    modal run train_modal.py --push-hub    # train + push to HF Hub

GPU cost estimate (Modal free tier: $30/mo credit):
    A10G (24GB VRAM) — $0.94/hr → ~32 hrs free per month
    A100 (80GB VRAM) — $3.72/hr → ~8 hrs free per month
    T4   (16GB VRAM) — $0.59/hr → ~50 hrs free per month  ← recommended
"""
import modal

# ── Modal app definition ──────────────────────────────────────────────────────
app = modal.App("hancock-finetune")

# Docker image with all ML dependencies
image = (
    modal.Image.debian_slim(python_version="3.11")
    .pip_install([
        "unsloth[colab-new]",
        "trl>=0.8.0", "transformers>=4.40.0", "accelerate",
        "datasets>=2.18.0", "peft", "bitsandbytes",
        "sentencepiece", "requests", "tqdm", "huggingface_hub",
    ])
)

# Mount the local repo into the container
repo_mount = modal.Mount.from_local_dir(
    ".",
    remote_path="/app",
    condition=lambda p: not any(
        p.startswith(x) for x in [".git", "__pycache__", ".venv", "node_modules"]
    ),
)

VOLUME_NAME = "hancock-models"
model_vol = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
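# Note: the volume persists across runs, so checkpoints written under
# /models/checkpoints survive a timeout, and finished artifacts can be pulled
# down afterwards with `modal volume get` (see the hints printed by main() below).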
@app.function(
    image=image,
    gpu="T4",  # Free-tier friendly — swap to "A10G" for speed
    timeout=60 * 90,  # 90 min max
    secrets=[modal.Secret.from_name("cyberviser-secrets")],
    mounts=[repo_mount],
    volumes={"/models": model_vol},
)
def train(dry_run: bool = False, push_hub: bool = False):
    import os, sys, json
    from pathlib import Path

    sys.path.insert(0, "/app")
    os.chdir("/app")

    print("=" * 60)
    print(" Hancock Fine-Tuning — CyberViser")
    print("=" * 60)

    # ── Build dataset ─────────────────────────────────────────────────
    print("\n[1/4] Building training dataset...")
    data_dir = Path("data")
    data_dir.mkdir(exist_ok=True)
    dataset_path = data_dir / "hancock_v3.jsonl"
    dataset_fallback = data_dir / "hancock_v2.jsonl"

    if not dataset_path.exists():
        # Run v3 pipeline; fall back to v2 if it fails
        from hancock_pipeline import run_kev, run_atomic, run_ghsa, run_formatter_v3
        run_kev(data_dir)
        run_atomic(data_dir)
        run_ghsa(data_dir)
        run_formatter_v3()
        if not dataset_path.exists() and not dataset_fallback.exists():
            from hancock_pipeline import run_kb, run_soc_kb, run_mitre, run_nvd, run_formatter
            run_kb(data_dir); run_soc_kb(data_dir); run_mitre(data_dir)
            run_nvd(data_dir); run_formatter(v2=True)
        active = dataset_path if dataset_path.exists() else dataset_fallback
        if not active.exists():
            sys.exit("❌ Dataset missing — run: python hancock_pipeline.py --phase 3")
    else:
        active = dataset_path  # dataset already built on a previous run
        print(f" Using existing dataset: {active}")

    samples = active.read_text().strip().splitlines()
    print(f" ✅ Dataset: {len(samples):,} samples")

    if dry_run:
        print("\n[DRY RUN] Setup OK — skipping training.")
        return {"status": "dry_run_ok", "samples": len(samples)}
    # ── Load model ────────────────────────────────────────────────────
    print("\n[2/4] Loading Mistral-7B with Unsloth...")
    import torch
    from unsloth import FastLanguageModel

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="mistralai/Mistral-7B-Instruct-v0.3",
        max_seq_length=2048,
        dtype=None,          # auto-select (bf16 where supported, else fp16)
        load_in_4bit=True,   # QLoRA-style 4-bit base weights to fit the T4's 16GB VRAM
    )
    model = FastLanguageModel.get_peft_model(
        model,
        r=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=32, lora_dropout=0.05, bias="none",
        use_gradient_checkpointing="unsloth", random_state=42,
    )
    print(" ✅ Model + LoRA adapters loaded")
    # ── Format dataset ────────────────────────────────────────────────
    print("\n[3/4] Formatting dataset...")
    from datasets import Dataset
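    # Each JSONL line is assumed to hold a chat transcript in the OpenAI-style
    # "messages" schema (inferred from the s["messages"] access below), e.g.:
    #   {"messages": [{"role": "user", "content": "..."},
    #                 {"role": "assistant", "content": "..."}]}
    # apply_chat_template() renders each transcript into Mistral's chat format.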
    raw = [json.loads(l) for l in samples]
    texts = [tokenizer.apply_chat_template(s["messages"], tokenize=False,
                                           add_generation_prompt=False) for s in raw]
    ds = Dataset.from_dict({"text": texts}).train_test_split(test_size=0.05, seed=42)
    print(f" Train: {len(ds['train']):,} | Eval: {len(ds['test']):,}")
    # ── Train ─────────────────────────────────────────────────────────
    print("\n[4/4] Training...")
    from trl import SFTTrainer
    from transformers import TrainingArguments

    trainer = SFTTrainer(
        model=model, tokenizer=tokenizer,
        train_dataset=ds["train"], eval_dataset=ds["test"],
        dataset_text_field="text", max_seq_length=2048, packing=True,
        args=TrainingArguments(
            per_device_train_batch_size=2, gradient_accumulation_steps=4,
            warmup_ratio=0.05, num_train_epochs=3, learning_rate=2e-4,
            fp16=not torch.cuda.is_bf16_supported(),
            bf16=torch.cuda.is_bf16_supported(),
            logging_steps=20, evaluation_strategy="steps", eval_steps=100,
            save_strategy="steps", save_steps=200, save_total_limit=2,
            output_dir="/models/checkpoints", report_to="none",
            optim="adamw_8bit", weight_decay=0.01,
            lr_scheduler_type="cosine", seed=42,
        ),
    )
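    # Effective batch size = 2 (per-device) × 4 (grad accumulation) = 8 packed
    # sequences per optimizer step; packing=True concatenates short samples up
    # to max_seq_length, so each sequence is a full 2048-token window.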
    result = trainer.train()
    print(f" ✅ Training complete — final loss: {result.training_loss:.4f}")

    # ── Save ──────────────────────────────────────────────────────────
    model.save_pretrained("/models/hancock_lora")
    tokenizer.save_pretrained("/models/hancock_lora")
    model.save_pretrained_gguf("/models/hancock_gguf", tokenizer, quantization_method="q4_k_m")
    model_vol.commit()
    print(" ✅ Model saved to Modal volume 'hancock-models'")

    # ── Push to HF Hub (optional) ─────────────────────────────────────
    if push_hub:
        hf_token = os.getenv("HF_TOKEN", "")
        if hf_token:
            model.push_to_hub("cyberviser/hancock-mistral-7b-lora", token=hf_token)
            tokenizer.push_to_hub("cyberviser/hancock-mistral-7b-lora", token=hf_token)
            print(" ✅ Pushed to huggingface.co/cyberviser/hancock-mistral-7b-lora")
        else:
            print(" ⚠️ HF_TOKEN not set — skipping Hub push")

    return {
        "status": "success",
        "loss": result.training_loss,
        "samples": len(samples),
        "model_path": "/models/hancock_lora",
    }
@app.local_entrypoint()
def main(dry_run: bool = False, push_hub: bool = False):
    result = train.remote(dry_run=dry_run, push_hub=push_hub)
    print("\n" + "=" * 60)
    print(" TRAINING RESULT")
    print("=" * 60)
    for k, v in result.items():
        print(f" {k}: {v}")
    print("\nTo download the model:")
    print(" modal volume get hancock-models hancock_lora ./hancock_lora")
    print(" modal volume get hancock-models hancock_gguf/hancock_gguf.q4_k_m.gguf .")