Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

models/
70 changes: 52 additions & 18 deletions caafe/caafe.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import os
import copy
import numpy as np

import openai
from gpt4all import GPT4All
from sklearn.model_selection import RepeatedKFold
from .caafe_evaluate import (
evaluate_dataset,
Expand All @@ -17,23 +19,23 @@ def get_prompt(
if iterative == 1
else "exactly one useful column"
)
return f"""
prompt = f"""
The dataframe `df` is loaded and in memory. Columns are also named attributes.
Description of the dataset in `df` (column dtypes might be inaccurate):
"{data_description_unparsed}"

Columns in `df` (true feature dtypes listed here, categoricals encoded as int):
{samples}

This code was written by an expert datascientist working to improve predictions. It is a snippet of code that adds new columns to the dataset.
This code was written by an expert data scientist working to improve predictions. It is a snippet of code that adds new columns to the dataset.
Number of samples (rows) in training dataset: {int(len(df))}

This code generates additional columns that are useful for a downstream classification algorithm (such as XGBoost) predicting \"{ds[4][-1]}\".
Additional columns add new semantic information, that is they use real world knowledge on the dataset. They can e.g. be feature combinations, transformations, aggregations where the new column is a function of the existing columns.
The scale of columns and offset does not matter. Make sure all used columns exist. Follow the above description of columns closely and consider the datatypes and meanings of classes.
This code also drops columns, if these may be redundant and hurt the predictive performance of the downstream classifier (Feature selection). Dropping columns may help as the chance of overfitting is lower, especially if the dataset is small.
The classifier will be trained on the dataset with the generated columns and evaluated on a holdout set. The evaluation metric is accuracy. The best performing code will be selected.
Added columns can be used in other codeblocks, dropped columns are not available anymore.
The classifier will be trained on the dataset with the generated columns and evaluated on a holdout set. The evaluation metric is f1 score and ROC auc. The best performing code will be selected.
Added columns can be used in other codeblocks, for the new added columns a naming pattern must be enforced as lower case words separated by underscore, dropped columns are not available anymore.

Code formatting for each added column:
```python
Expand All @@ -53,6 +55,8 @@ def get_prompt(
Each codeblock ends with ```end and starts with "```python"
Codeblock:
"""
print(prompt)
return prompt


# Each codeblock either generates {how_many} or drops bad columns (Feature selection).
Expand Down Expand Up @@ -96,7 +100,8 @@ def build_prompt_from_df(ds, df, iterative=1):
def generate_features(
ds,
df,
model="gpt-3.5-turbo",
model,
device="cpu",
just_print_prompt=False,
iterative=1,
metric_used=None,
Expand All @@ -105,6 +110,10 @@ def generate_features(
n_splits=10,
n_repeats=2,
):

if model not in ["gpt-3.5-turbo", "gpt4all"]:
raise SystemExit("\n\nerror** `model` must be `gpt-3.5-turbo` or `gpt4all`\n\n")

def format_for_display(code):
code = code.replace("```python", "").replace("```", "").replace("<end>", "")
return code
Expand All @@ -127,20 +136,45 @@ def format_for_display(code):
code, prompt = None, prompt
return code, prompt, None

def generate_code(messages, model, device):
    """Ask the selected LLM backend to produce a feature-engineering snippet.

    Parameters:
        messages: chat-style message list; messages[0] is the system prompt,
            messages[1] the user prompt (gpt4all path assumes exactly this).
        model: "gpt-3.5-turbo" (OpenAI API), "gpt4all" (local model), or
            "skip" to generate nothing.
        device: "cuda" or "cpu"; only used to place the local gpt4all model.

    Returns:
        The generated Python code with markdown fences stripped, or "" when
        model == "skip".

    Raises:
        SystemExit: if model == "gpt4all" and the GPT4ALL_MODEL_BIN
            environment variable is not set.
    """
    if model == "skip":
        return ""

    if model == "gpt-3.5-turbo":
        completion = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            stop=["```end"],
            temperature=0.5,
            max_tokens=500,
        )
        code = completion["choices"][0]["message"]["content"]
        code = code.replace("```python", "").replace("```", "").replace("<end>", "")
        return code

    if model == "gpt4all":
        # Fixed typo: was `gtp4all_model_bin`.
        gpt4all_model_bin = os.getenv("GPT4ALL_MODEL_BIN")

        if not gpt4all_model_bin:
            msg = "\n\n error** Environment variable `GPT4ALL_MODEL_BIN`"
            msg += " pointing to the model file path is not defined.\n\n"
            raise SystemExit(msg)

        # Cache the loaded model on the function object. The original check
        # (`"model_gpt4all" not in list(locals()) + list(globals())`) was
        # always true inside a function, so the model was reloaded from disk
        # on every call; a function attribute survives across calls.
        if not hasattr(generate_code, "_gpt4all_model"):
            gpt4all_device = "gpu" if device == "cuda" else "cpu"
            generate_code._gpt4all_model = GPT4All(
                gpt4all_model_bin, allow_download=True, device=gpt4all_device
            )
        model_gpt4all = generate_code._gpt4all_model

        system_template = messages[0].get("content")
        prompt_template = "USER: {0}\nASSISTANT: ".format(messages[1].get("content"))
        prompt_message = system_template + prompt_template

        completion = model_gpt4all.generate(prompt_message,
                                            temp=0.5,
                                            max_tokens=500)

        code = completion.split("```python")[-1].split("```end")[0]
        print(f"\n\ngenerated code:\n{code}\n\n")
        return code

def execute_and_evaluate_code_block(full_code, code):
old_accs, old_rocs, accs, rocs = [], [], [], []
Expand Down Expand Up @@ -246,7 +280,7 @@ def execute_and_evaluate_code_block(full_code, code):
i = 0
while i < n_iter:
try:
code = generate_code(messages)
code = generate_code(messages, model, device)
except Exception as e:
display_method("Error in LLM API." + str(e))
continue
Expand Down
7 changes: 6 additions & 1 deletion caafe/sklearn_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import numpy as np
from typing import Optional
import pandas as pd
import torch



Expand All @@ -28,6 +29,7 @@ class CAAFEClassifier(BaseEstimator, ClassifierMixin):
llm_model (str, optional): The LLM model to use for generating features. Defaults to 'gpt-3.5-turbo'.
n_splits (int, optional): The number of cross-validation splits to use during feature generation. Defaults to 10.
n_repeats (int, optional): The number of times to repeat the cross-validation during feature generation. Defaults to 2.
display_method (str, optional): Display method between markdown (Ipython.display) and print. Defaults to markdown.
"""
def __init__(
self,
Expand All @@ -37,6 +39,7 @@ def __init__(
llm_model: str = "gpt-3.5-turbo",
n_splits: int = 10,
n_repeats: int = 2,
display_method="markdown"
) -> None:
self.base_classifier = base_classifier
if self.base_classifier is None:
Expand All @@ -56,6 +59,7 @@ def __init__(
self.optimization_metric = optimization_metric
self.n_splits = n_splits
self.n_repeats = n_repeats
self.display_method = display_method

def fit_pandas(self, df, dataset_description, target_column_name, **kwargs):
"""
Expand Down Expand Up @@ -134,10 +138,11 @@ def fit(
ds,
df_train,
model=self.llm_model,
device="cuda" if torch.cuda.is_available() else "cpu",
iterative=self.iterations,
metric_used=auc_metric,
iterative_method=self.base_classifier,
display_method="markdown",
display_method=self.display_method,
n_splits=self.n_splits,
n_repeats=self.n_repeats,
)
Expand Down
66 changes: 66 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
aiohttp==3.8.5
aiosignal==1.3.1
appnope==0.1.3
asttokens==2.4.0
async-timeout==4.0.3
attrs==23.1.0
backcall==0.2.0
bleach==6.0.0
certifi==2023.7.22
charset-normalizer==3.2.0
decorator==5.1.1
exceptiongroup==1.1.3
executing==1.2.0
filelock==3.12.3
frozenlist==1.4.0
gpt4all==1.0.10
idna==3.4
ipdb==0.13.13
ipython==8.15.0
jedi==0.19.0
Jinja2==3.1.2
joblib==1.3.2
kaggle==1.5.16
liac-arff==2.5.0
MarkupSafe==2.1.3
matplotlib-inline==0.1.6
minio==7.1.16
mpmath==1.3.0
multidict==6.0.4
networkx==3.1
numpy==1.25.2
openai==0.28.0
openml==0.12.0
pandas==2.1.0
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
prompt-toolkit==3.0.39
ptyprocess==0.7.0
pure-eval==0.2.2
pyarrow==13.0.0
Pygments==2.16.1
python-dateutil==2.8.2
python-slugify==8.0.1
pytz==2023.3.post1
PyYAML==6.0.1
requests==2.31.0
scikit-learn==1.3.0
scipy==1.11.2
six==1.16.0
stack-data==0.6.2
sympy==1.12
tabpfn==0.1.9
text-unidecode==1.3
threadpoolctl==3.2.0
tomli==2.0.1
torch==2.0.1
tqdm==4.66.1
traitlets==5.9.0
typing_extensions==4.7.1
tzdata==2023.3
urllib3==2.0.4
wcwidth==0.2.6
webencodings==0.5.1
xmltodict==0.13.0
yarl==1.9.2
Loading