-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcorrect-dataset.py
More file actions
61 lines (48 loc) · 2.37 KB
/
correct-dataset.py
File metadata and controls
61 lines (48 loc) · 2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import json
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables (python-dotenv) before the API key is read
load_dotenv()
# Module-level OpenAI client, keyed by the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
def _parse_model_reply(reply):
    """Return the JSON value carried by a model reply, or None if it has none.

    A reply may wrap its JSON in a Markdown code fence or spread it over
    several lines; writing such text verbatim would break a JSONL file, so
    the reply is parsed here and re-serialized by the caller.
    """
    if not reply:  # reply may be None or an empty string
        return None
    text = reply.strip()
    if text.startswith("```"):
        # Drop the opening fence line (it may carry a tag such as "json")
        # and everything from the closing fence onwards.
        text = text.partition("\n")[2].rsplit("```", 1)[0]
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        return None


def correct_jsonl_format(input_file: str, output_file: str):
    """Copy a JSONL dataset, repairing records that lack a "messages" key.

    Every non-empty line of ``input_file`` is parsed as JSON. A dict that
    contains a "messages" key is re-serialized to ``output_file`` as is. Any
    other JSON value is sent to the chat completions API (model "gpt-4o")
    with a request to correct its format; the reply is written only if it
    parses to a dict with a "messages" key, and always as a single line.
    Lines that are not valid JSON, and records the model could not repair,
    are reported on stdout and skipped.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to create or overwrite.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        ValueError: If ``input_file`` and ``output_file`` are the same file.
        Exception: Any other error raised while processing is printed and
            re-raised unchanged.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file '{input_file}' does not exist")
    # Opening the output in 'w' mode truncates it; if it is also the input,
    # every record would be lost before the first line is read.
    if os.path.exists(output_file) and os.path.samefile(input_file, output_file):
        raise ValueError(
            f"Input and output refer to the same file: '{input_file}'"
        )

    try:
        with open(input_file, 'r', encoding='utf-8') as infile, \
                open(output_file, 'w', encoding='utf-8') as outfile:
            for line in infile:
                line = line.strip()
                if not line:  # Skip empty lines
                    continue

                try:
                    json_obj = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Invalid JSON: {line[:100]}...")
                    continue

                # Already in the expected chat-training shape: copy through.
                if isinstance(json_obj, dict) and "messages" in json_obj:
                    outfile.write(json.dumps(json_obj) + '\n')
                    continue

                # Otherwise ask the model to rewrite the record.
                response = client.chat.completions.create(
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant. Correct the format of this JSON object for GPT chat model training."},
                        {"role": "user", "content": json.dumps(json_obj)}
                    ],
                    model="gpt-4o"
                )
                corrected = _parse_model_reply(response.choices[0].message.content)
                if isinstance(corrected, dict) and "messages" in corrected:
                    # Re-serialize so the record occupies exactly one line.
                    outfile.write(json.dumps(corrected) + '\n')
                else:
                    print(f"Could not correct record: {line[:100]}...")
    except Exception as e:
        print(f"Critical error during processing: {str(e)}")
        raise
if __name__ == "__main__":
    # Script entry point: convert the merged dataset into the corrected one,
    # using fixed file names relative to the current working directory.
    correct_jsonl_format(
        input_file="merged-final-dataset.jsonl",
        output_file="corrected-dataset.jsonl",
    )
    print("Process completed successfully")