-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: EffiLearner Script
More file actions
71 lines (48 loc) · 1.77 KB
/
Effilearner Script
File metadata and controls
71 lines (48 loc) · 1.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# --- Environment setup (Jupyter/Colab cell: `!` runs a shell command, `%` is a line magic) ---
# Clone the repo (once)
!git clone https://github.com/huangd1999/EffiLearner.git
# Move into the repo
%cd EffiLearner
# Install the pinned OpenAI SDK and Hugging Face tools
# NOTE(review): openai==0.28.0 is the legacy pre-1.0 SDK interface — presumably
# required by the repo's gpt_* scripts; confirm before upgrading.
!pip install --upgrade openai==0.28.0
!pip install --upgrade datasets transformers fsspec
from getpass import getpass
import os

# Prompt for the OpenAI key only when the environment does not already
# carry one; the SDK picks up OPENAI_API_KEY from the environment.
if os.environ.get("OPENAI_API_KEY") is None:
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
from datasets import load_dataset
import json, os

# Sampling parameters — previously hard-coded inline; named here so the
# sample size and shuffle seed can be changed in one place.
N_SAMPLES = 10
SEED = 42

# 1) Load the full BigOBench train split.
bb = load_dataset("facebook/BigOBench", split="train")

# 2) Shuffle deterministically and take N_SAMPLES examples.
sampled = bb.shuffle(seed=SEED).select(range(N_SAMPLES))

# 3) Transform into the simple {prompt, reference} format.
#    `solution_code` may be missing on some rows, so default to "".
effibench_like = [
    {
        "prompt": ex["description"],
        "reference": ex.get("solution_code", ""),
    }
    for ex in sampled
]

# 4) Write into the repository's datasets/ folder.
os.makedirs("datasets", exist_ok=True)
with open("datasets/dataset.json", "w") as f:
    json.dump(effibench_like, f, indent=2)

# 5) Confirm.
print("Wrote", len(effibench_like), "examples to datasets/dataset.json")
import json

path = "datasets/dataset.json"

# Read the sampled dataset back in. BUG FIX: the original used
# `json.load(open(path))`, leaving the file handle unclosed; a context
# manager guarantees it is released.
with open(path) as f:
    data = json.load(f)

# Mirror the prompt into `markdown_description` and `small_test_cases`
# — presumably the field names EffiLearner's scripts read; confirm
# against the code under src/.
for ex in data:
    ex["markdown_description"] = ex["prompt"]
    ex["small_test_cases"] = [{"input": ex["prompt"]}]

with open(path, "w") as f:
    json.dump(data, f, indent=2)

print("Injected markdown_description + small_test_cases into dataset.json")
# Go into the src folder so the relative paths line up
%cd src
# Run the code‐generation step on your 10 examples
!python gpt_generation.py \
--checkpoint gpt-4 \
--dataset EffiBench
python gpt_EffiLearner.py --checkpoint gpt-4 --dataset ../datasets/dataset_gpt-4.json