VLM-SubtleBench/run.sh at main · krafton-ai/VLM-SubtleBench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/bin/bash

# Evaluation script for the unified VLM-SubtleBench dataset
# Supports filtering by category and domain

# Interpreter guard. bash sets BASH_VERSION; later parts of this script use
# bash-only syntax ([[ ]], arrays, process substitution). The test itself is
# written in plain POSIX form so it still works when started with sh.
[ -n "$BASH_VERSION" ] || {
    echo "This script requires bash. Please run with: bash run.sh"
    exit 1
}

# Built-in defaults (also shown in the usage text).
# NOTE(review): the usage text lists this prompt type as "no_reasoning"
# (underscore) while the default here uses a hyphen - confirm which spelling
# the evaluator expects.
DEFAULT_MODEL="gpt-4o"
DEFAULT_PROMPT_TYPE="no-reasoning"

# Effective settings. MODEL and PROMPT_TYPE start from the defaults; the
# remaining ones start empty. All of them may be overwritten by command-line
# options further down.
MODEL="$DEFAULT_MODEL"
PROMPT_TYPE="$DEFAULT_PROMPT_TYPE"
MAX_QUESTIONS=""
SPLIT=""
CATEGORY=""
DOMAIN=""

# show_usage
#   Prints the help text to stdout: the option list (with the current values
#   of DEFAULT_MODEL and DEFAULT_PROMPT_TYPE filled in), the lists of models,
#   prompt types, categories and domains, and a few example invocations.
#   $0 is expanded so the examples show the name the script was started with.
show_usage() {
    # Unquoted delimiter on purpose: $0 and the DEFAULT_* variables must expand.
    cat <<USAGE
Usage: $0 [OPTIONS]

Options:
  --model MODEL           Set the model name (default: $DEFAULT_MODEL)
  --prompt_type TYPE      Set the prompt type (default: $DEFAULT_PROMPT_TYPE)
  --max_questions N       Limit number of questions (default: all)
  --split SPLIT           Filter by split: test, val, or all (default: test via config)
  --category CATEGORY     Filter by category (default: all)
  --domain DOMAIN         Filter by domain (default: all)
  --help                  Show this help message

Available models: gpt-4o, o3, gpt-5, gemini-2.5-flash, anthropic/claude-sonnet-4, etc.
Available prompt types: standard, no_reasoning, concatenated, grid, overlapped, camera_augmented
Categories: action, attribute, emotion, existence, quality, quantity, spatial, state, temporal, viewpoint
Domains: natural, industrial, medical, aerial, synthetic

Examples:
  $0                                              # Evaluate all items
  $0 --model gpt-4o --category attribute          # Only attribute category
  $0 --model gemini-2.5-flash --domain medical    # Only medical domain
  $0 --max_questions 100 --category state         # 100 state questions
USAGE
}

# Parse arguments
#
# parse_args ARGS...
#   Walks the given command-line words and stores option values in the globals
#   MODEL, PROMPT_TYPE, MAX_QUESTIONS, CATEGORY, DOMAIN and SPLIT.
#   Exits 0 after printing the usage text for --help/-h.
#   Exits 1 (after a message and the usage text) for an unknown option or for
#   a value-taking option that is not followed by a value.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --model|--prompt_type|--max_questions|--category|--domain|--split)
                # Each of these consumes the next word as its value. Without
                # this check, a trailing option such as "run.sh --model" makes
                # "shift 2" fail without shifting anything, so $# never drops
                # and the loop spins forever on the same argument.
                if [[ $# -lt 2 ]]; then
                    echo "Option $1 requires a value"
                    echo ""
                    show_usage
                    exit 1
                fi
                case "$1" in
                    --model)         MODEL="$2" ;;
                    --prompt_type)   PROMPT_TYPE="$2" ;;
                    --max_questions) MAX_QUESTIONS="$2" ;;
                    --category)      CATEGORY="$2" ;;
                    --domain)        DOMAIN="$2" ;;
                    --split)         SPLIT="$2" ;;
                esac
                shift 2
                ;;
            --help|-h)
                show_usage
                exit 0
                ;;
            *)
                echo "Unknown option: $1"
                echo ""
                show_usage
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Required settings must be non-empty. Both start from defaults, but an
# explicit empty value on the command line (e.g. --model "") blanks them out.
# MODEL is checked first, so it is the one reported when both are empty.
[ -n "$MODEL" ] || {
    echo "Model cannot be empty"
    exit 1
}

[ -n "$PROMPT_TYPE" ] || {
    echo "Prompt type cannot be empty"
    exit 1
}

# generate_log_filename
#   Prints the first path of the form
#       eval_logs/<PROMPT_TYPE>-<MODEL>-<N>.log      (N = 1, 2, 3, ...)
#   for which no regular file exists yet. The eval_logs directory is created
#   relative to the current working directory if it is missing.
#   Globals read: PROMPT_TYPE, MODEL
generate_log_filename() {
    mkdir -p eval_logs

    # Model names may contain "/" (e.g. "anthropic/claude-sonnet-4"); turn
    # every slash into a dash so the name stays a single path component.
    local stem="${PROMPT_TYPE}-${MODEL}"
    stem="${stem//\//-}"

    # Count upwards from 1 and stop at the first number that is not taken.
    local n candidate
    for ((n = 1; ; n++)); do
        candidate="eval_logs/${stem}-${n}.log"
        [ -f "$candidate" ] || break
    done

    echo "$candidate"
}

# Choose the log path once, before anything is written.
LOG_FILE="$(generate_log_filename)"

# Run banner. Empty filter settings are shown with their fallback label
# ("all", or "test (default)" for the split).
# NOTE(review): this banner is printed before the redirect below, so it
# reaches the terminal only and is not written to $LOG_FILE - confirm that
# this is intended.
printf '%s\n' \
    "Starting evaluation" \
    "Started at: $(date)" \
    "Model: $MODEL" \
    "Prompt type: $PROMPT_TYPE" \
    "Split: ${SPLIT:-test (default)}" \
    "Category: ${CATEGORY:-all}" \
    "Domain: ${DOMAIN:-all}" \
    "Max questions: ${MAX_QUESTIONS:-all}" \
    "Log file: $LOG_FILE" \
    ""

# From here on, stdout and stderr of this script both go through tee, which
# passes them on to the previous stdout and appends them to the log file.
exec > >(tee -a "$LOG_FILE") 2>&1

# Build the key=value overrides handed to the evaluation script.
# The four model.* entries are always present; a data.* entry is added only
# for a filter that was actually given.
CLI_ARGS=()
CLI_ARGS+=("model.llm_name=$MODEL")
CLI_ARGS+=("model.prompt_type=$PROMPT_TYPE")
CLI_ARGS+=("model.use_multithreading=true")
CLI_ARGS+=("model.max_workers=16")

[[ -n "$MAX_QUESTIONS" ]] && CLI_ARGS+=("data.max_questions=$MAX_QUESTIONS")
[[ -n "$CATEGORY" ]] && CLI_ARGS+=("data.category=$CATEGORY")
[[ -n "$DOMAIN" ]] && CLI_ARGS+=("data.domain=$DOMAIN")

# "all" is sent as data.split=null; presumably the evaluator reads null as
# "no split filter" - confirm against the evaluation script.
case "$SPLIT" in
    "")
        # No --split given: send nothing.
        ;;
    all)
        CLI_ARGS+=("data.split=null")
        ;;
    *)
        CLI_ARGS+=("data.split=$SPLIT")
        ;;
esac

# Launch the evaluator with the collected overrides and remember its exit
# status right away, before any other command can overwrite $?.
python scripts/evaluate_multiple_choice.py "${CLI_ARGS[@]}" 2>&1
exit_code=$?

# Closing summary.
echo ""
printf 'Finished at: %s\n' "$(date)"
printf 'Log saved to: %s\n' "$LOG_FILE"

case "$exit_code" in
    0)
        echo "Evaluation completed successfully!"
        ;;
    *)
        echo "Evaluation failed with exit code $exit_code"
        ;;
esac

# Hand the evaluator's status on to the caller of this script.
exit "$exit_code"