-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathrun_gen_analyses.py
More file actions
205 lines (186 loc) · 6.27 KB
/
run_gen_analyses.py
File metadata and controls
205 lines (186 loc) · 6.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import datetime
import os
import os.path as osp
import time

import click
import yaml

from blade_bench.llms.config_load import get_models, get_providers

# Timestamp taken before the heavy blade_bench imports below.
# NOTE: the original module assigned start_time a second time after the
# baselines imports, silently overwriting this value; the duplicate
# assignment has been removed.
start_time = time.time()

from blade_bench.baselines.config import (
    MultiRunConfig,
)
from blade_bench.baselines.multirun import multirun_llm
from blade_bench.data.datamodel.transforms import (
    TransformDataReturn,
)  # ❗️ this import needs to be kept here
from blade_bench.data.dataset import list_datasets
from blade_bench.logger import logger, formatter
from blade_bench.utils import get_absolute_dir
def run_gen_analysis(
    run_dataset: str,
    num_runs: int,
    use_agent: bool,
    cache_code_results: bool,
    use_data_desc: bool,
    llm_config_path: str,
    llm_eval_config_path: str,
    output_dir: str = None,
    llm_provider: str = None,
    llm_model: str = None,
):
    """Generate analyses for ``run_dataset`` across ``num_runs`` runs.

    Loads the LLM and LLM-eval YAML configs, prepares ``output_dir`` (with a
    fresh ``run.log``), writes a replayable ``command.sh``, then delegates to
    ``multirun_llm`` with the assembled ``MultiRunConfig``.

    Args:
        run_dataset: Name of the dataset to analyze.
        num_runs: Number of independent generation runs.
        use_agent: Use the ReAct agent instead of the base LM.
        cache_code_results: Reuse cached code-execution results (agent only).
        use_data_desc: Include the data description in the LM prompts.
        llm_config_path: Path to the generation-LLM YAML config.
        llm_eval_config_path: Path to the evaluation-LLM YAML config.
        output_dir: Artifact directory; defaults to a path derived from the
            provider, model, and dataset.
        llm_provider: If given, overrides ``provider`` from the config file.
        llm_model: If given, overrides ``model`` from the config file.
    """
    # Use context managers so the config file handles are closed
    # deterministically (the original leaked them via bare open() calls).
    with open(llm_config_path) as f:
        llm_config = yaml.safe_load(f)
    # CLI-supplied provider/model take precedence over the config file.
    if llm_provider:
        llm_config["provider"] = llm_provider
    if llm_model:
        llm_config["model"] = llm_model
    with open(llm_eval_config_path) as f:
        llm_eval_config = yaml.safe_load(f)
    if not output_dir:
        output_dir = f"./outputs/multirun/{llm_config['provider']}-{llm_config['model']}/{run_dataset}"
    os.makedirs(output_dir, exist_ok=True)
    # Start each experiment with a fresh log file.
    log_path = osp.join(output_dir, "run.log")
    if osp.exists(log_path):
        os.remove(log_path)
    logger.add(log_path, format=formatter.format)
    # Reconstruct the equivalent shell command so the exact invocation can be
    # replayed later from command.sh.
    command_string = f"python {__file__} "
    command_string += f"\\\n\t--run_dataset {run_dataset} "
    command_string += f"\\\n\t-n {num_runs} "
    if use_agent:
        command_string += "\\\n\t--use_agent "
    if not cache_code_results:
        # BUG FIX: this flag was misspelled "--no_cache_code_reuslts",
        # producing a command.sh that click would reject on replay.
        command_string += "\\\n\t--no_cache_code_results "
    if not use_data_desc:
        command_string += "\\\n\t--no_use_data_desc "
    command_string += f"\\\n\t--llm_config_path {get_absolute_dir(llm_config_path)} "
    command_string += (
        f"\\\n\t--llm_eval_config_path {get_absolute_dir(llm_eval_config_path)} "
    )
    command_string += f"\\\n\t--output_dir {get_absolute_dir(output_dir)} "
    if llm_provider:
        command_string += f"\\\n\t--llm_provider {llm_provider} "
    if llm_model:
        command_string += f"\\\n\t--llm_model {llm_model} "
    logger.info(f"Running command: \n{command_string}")
    with open(osp.join(output_dir, "command.sh"), "w") as f:
        f.write("#!/bin/bash\n")
        f.write(command_string)
    config = MultiRunConfig(
        llm=llm_config,
        llm_eval=llm_eval_config,
        output_dir=output_dir,
        run_dataset=run_dataset,
        use_agent=use_agent,
        use_data_desc=use_data_desc,
        num_runs=num_runs,
        use_code_cache=cache_code_results,
    )
    logger.info(config.model_dump_json(indent=2))
    multirun_llm(config)
@click.command()
@click.option(
    "--run_dataset",
    type=click.Choice(list_datasets()),
    default="hurricane",
    help="Dataset to run",
    # NOTE(review): required=True has no practical effect here because a
    # default is supplied — click never errors for this option. Confirm
    # whether the default or the required flag is the intended behavior.
    required=True,
)
@click.option(
    "-n",
    "--num_runs",
    type=int,
    default=10,
    help="Number of runs to perform",
    show_default=True,
)
@click.option(
    "--use_agent",
    is_flag=True,
    default=False,
    help="Whether to use agent or just the base LM",
    show_default=True,
)
# Inverted flag: with is_flag=True and default=True, click sets
# flag_value = not default, so passing --no_cache_code_results stores
# False into the cache_code_results parameter.
@click.option(
    "--no_cache_code_results",
    "cache_code_results",
    is_flag=True,
    default=True,
    help="[ONLY used when use_agent=True] Whether to cache code results when running code.",
)
# Same inverted-flag pattern: --no_use_data_desc stores False into
# use_data_desc; omitting the flag leaves it True.
@click.option(
    "--no_use_data_desc",
    "use_data_desc",
    is_flag=True,
    default=True,
    help="Whether to use data description in the prompts for the LM",
    show_default=True,
)
@click.option(
    "--llm_config_path",
    type=click.Path(exists=True, file_okay=True, dir_okay=False),
    default="./conf/llm.yaml",
    help="Path to the LLM config file, used to specify the provider, model, and text generation config such as the temperature.",
    show_default=True,
)
@click.option(
    "--llm_provider",
    type=click.Choice(get_providers()),
    default=None,
    help="Provider for the LLM to override the config file at llm_config_path",
)
@click.option(
    "--llm_model",
    type=str,
    default=None,
    # NOTE: get_models() is evaluated once at import time to build this text.
    help=f"Model for the LLM to override the config file at llm_config_path. Default options are {get_models()}",
)
@click.option(
    "--llm_eval_config_path",
    type=click.Path(exists=True, file_okay=True, dir_okay=False),
    default="./conf/llm_eval.yaml",
    help="Path to the LLM eval config file, used to specify the provider, model, and text generation config such as the temperature.",
    show_default=True,
)
@click.option(
    "--output_dir",
    # exists=False: the directory is created later by run_gen_analysis.
    type=click.Path(exists=False, file_okay=False, dir_okay=True),
    default=None,
    help="output directory to store saved analyses",
)
def run_gen_analysis_click(
    run_dataset: str,
    num_runs: int,
    use_agent: bool,
    cache_code_results: bool,
    use_data_desc: bool,
    llm_config_path: str,
    llm_eval_config_path: str,
    output_dir: str,
    llm_provider: str,
    llm_model: str,
):
    """For a given dataset and research question, generate analyses for the dataset using a language model or a basic ReAct agent that interacts with a notebook environment.
    Running this generates the following files in output_dir:
    \b
    - command.sh: A bash script that contains the command used to run this script
    - config.json: The configuration used to run this experiment
    - run.log: The log file for the multirun experiment
    - llm.log: The log file for LM prompts and responses for the experiment
    - multirun_analyses.json: The analyses generated. **Note**: This file is used in run_get_eval.py to get the evaluation results.
    - llm_analysis_*.py: The code generated for each run (if it was generated properly) for quick reference
    """
    # Thin CLI wrapper: forward every parsed option to the plain-Python
    # entry point unchanged.
    run_gen_analysis(
        run_dataset=run_dataset,
        num_runs=num_runs,
        use_agent=use_agent,
        cache_code_results=cache_code_results,
        use_data_desc=use_data_desc,
        llm_config_path=llm_config_path,
        llm_eval_config_path=llm_eval_config_path,
        output_dir=output_dir,
        llm_provider=llm_provider,
        llm_model=llm_model,
    )
if __name__ == "__main__":
    # Script entry point: click parses sys.argv and dispatches to
    # run_gen_analysis_click (and from there to run_gen_analysis).
    run_gen_analysis_click()