diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..39d2053 --- /dev/null +++ b/.gitignore @@ -0,0 +1,105 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +venv/ + +models/ diff --git a/caafe/caafe.py b/caafe/caafe.py index f781ee8..a94e2b2 100644 --- a/caafe/caafe.py +++ b/caafe/caafe.py @@ -1,7 +1,9 @@ +import os import copy import numpy as np import openai +from gpt4all import GPT4All from sklearn.model_selection import RepeatedKFold from .caafe_evaluate import ( evaluate_dataset, @@ -17,7 +19,7 @@ def get_prompt( if iterative == 1 else "exactly one useful column" ) - return f""" + prompt = f""" The dataframe `df` is loaded and in memory. Columns are also named attributes. 
Description of the dataset in `df` (column dtypes might be inaccurate): "{data_description_unparsed}" @@ -25,15 +27,15 @@ def get_prompt( Columns in `df` (true feature dtypes listed here, categoricals encoded as int): {samples} -This code was written by an expert datascientist working to improve predictions. It is a snippet of code that adds new columns to the dataset. +This code was written by an expert data scientist working to improve predictions. It is a snippet of code that adds new columns to the dataset. Number of samples (rows) in training dataset: {int(len(df))} This code generates additional columns that are useful for a downstream classification algorithm (such as XGBoost) predicting \"{ds[4][-1]}\". Additional columns add new semantic information, that is they use real world knowledge on the dataset. They can e.g. be feature combinations, transformations, aggregations where the new column is a function of the existing columns. The scale of columns and offset does not matter. Make sure all used columns exist. Follow the above description of columns closely and consider the datatypes and meanings of classes. This code also drops columns, if these may be redundant and hurt the predictive performance of the downstream classifier (Feature selection). Dropping columns may help as the chance of overfitting is lower, especially if the dataset is small. -The classifier will be trained on the dataset with the generated columns and evaluated on a holdout set. The evaluation metric is accuracy. The best performing code will be selected. -Added columns can be used in other codeblocks, dropped columns are not available anymore. +The classifier will be trained on the dataset with the generated columns and evaluated on a holdout set. The evaluation metric is f1 score and ROC auc. The best performing code will be selected. 
def generate_code(messages, model, device):
    """Generate a feature-engineering code snippet from an LLM backend.

    Args:
        messages: Chat-style message list; ``messages[0]`` is the system
            prompt and ``messages[1]`` the user prompt (the gpt4all path
            reads exactly these two entries).
        model: One of ``"skip"`` (return empty string), ``"gpt-3.5-turbo"``
            (OpenAI chat API) or ``"gpt4all"`` (local model, file path taken
            from the ``GPT4ALL_MODEL_BIN`` environment variable).
        device: ``"cuda"`` selects the GPU backend for gpt4all; any other
            value means CPU.

    Returns:
        The generated code with markdown fences stripped, ``""`` for
        ``"skip"``, or ``None`` for an unrecognized model name (the
        original fall-through behavior is preserved).

    Raises:
        SystemExit: if ``model == "gpt4all"`` and ``GPT4ALL_MODEL_BIN`` is
            unset. Deliberately not a normal ``Exception``: the retry loop
            in ``generate_features`` catches ``Exception`` and would retry
            forever on a configuration error.
    """
    if model == "skip":
        return ""

    if model == "gpt-3.5-turbo":
        completion = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            stop=["```end"],
            temperature=0.5,
            max_tokens=500,
        )
        code = completion["choices"][0]["message"]["content"]
        # Strip the markdown code fences the model wraps its answer in.
        # (The original also chained a no-op `.replace("", "")`; removed.)
        code = code.replace("```python", "").replace("```", "")
        return code

    if model == "gpt4all":
        # Typo fixed: variable was named `gtp4all_model_bin`.
        gpt4all_model_bin = os.getenv("GPT4ALL_MODEL_BIN")

        if not gpt4all_model_bin:
            msg = "\n\n error** Environment variable `GPT4ALL_MODEL_BIN`"
            msg += " pointing to the model file path is not defined.\n\n"
            raise SystemExit(msg)

        # Bug fix: the original guard
        #     if "model_gpt4all" not in list(locals()) + list(globals()):
        # could never find a previous instance -- `model_gpt4all` was a plain
        # local, recreated (and the multi-GB model reloaded from disk) on
        # every call; had the guard ever been False, the later use would
        # have raised NameError. Cache the loaded model on the function
        # object instead so it survives across calls.
        if getattr(generate_code, "_gpt4all_model", None) is None:
            backend = "gpu" if device == "cuda" else "cpu"
            generate_code._gpt4all_model = GPT4All(
                gpt4all_model_bin, allow_download=True, device=backend
            )

        system_template = messages[0].get("content")
        prompt_template = "USER: {0}\nASSISTANT: ".format(messages[1].get("content"))
        prompt_message = system_template + prompt_template

        completion = generate_code._gpt4all_model.generate(
            prompt_message, temp=0.5, max_tokens=500
        )

        # Keep only the fenced block the prompt instructed the model to emit.
        code = completion.split("```python")[-1].split("```end")[0]
        print(f"\n\ngenerated code:\n{code}\n\n")
        return code
""" def __init__( self, @@ -37,6 +39,7 @@ def __init__( llm_model: str = "gpt-3.5-turbo", n_splits: int = 10, n_repeats: int = 2, + display_method="markdown" ) -> None: self.base_classifier = base_classifier if self.base_classifier is None: @@ -56,6 +59,7 @@ def __init__( self.optimization_metric = optimization_metric self.n_splits = n_splits self.n_repeats = n_repeats + self.display_method = display_method def fit_pandas(self, df, dataset_description, target_column_name, **kwargs): """ @@ -134,10 +138,11 @@ def fit( ds, df_train, model=self.llm_model, + device="cuda" if torch.cuda.is_available() else "cpu", iterative=self.iterations, metric_used=auc_metric, iterative_method=self.base_classifier, - display_method="markdown", + display_method=self.display_method, n_splits=self.n_splits, n_repeats=self.n_repeats, ) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..309284a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,66 @@ +aiohttp==3.8.5 +aiosignal==1.3.1 +appnope==0.1.3 +asttokens==2.4.0 +async-timeout==4.0.3 +attrs==23.1.0 +backcall==0.2.0 +bleach==6.0.0 +certifi==2023.7.22 +charset-normalizer==3.2.0 +decorator==5.1.1 +exceptiongroup==1.1.3 +executing==1.2.0 +filelock==3.12.3 +frozenlist==1.4.0 +gpt4all==1.0.10 +idna==3.4 +ipdb==0.13.13 +ipython==8.15.0 +jedi==0.19.0 +Jinja2==3.1.2 +joblib==1.3.2 +kaggle==1.5.16 +liac-arff==2.5.0 +MarkupSafe==2.1.3 +matplotlib-inline==0.1.6 +minio==7.1.16 +mpmath==1.3.0 +multidict==6.0.4 +networkx==3.1 +numpy==1.25.2 +openai==0.28.0 +openml==0.12.0 +pandas==2.1.0 +parso==0.8.3 +pexpect==4.8.0 +pickleshare==0.7.5 +prompt-toolkit==3.0.39 +ptyprocess==0.7.0 +pure-eval==0.2.2 +pyarrow==13.0.0 +Pygments==2.16.1 +python-dateutil==2.8.2 +python-slugify==8.0.1 +pytz==2023.3.post1 +PyYAML==6.0.1 +requests==2.31.0 +scikit-learn==1.3.0 +scipy==1.11.2 +six==1.16.0 +stack-data==0.6.2 +sympy==1.12 +tabpfn==0.1.9 +text-unidecode==1.3 +threadpoolctl==3.2.0 +tomli==2.0.1 +torch==2.0.1 +tqdm==4.66.1 
+traitlets==5.9.0 +typing_extensions==4.7.1 +tzdata==2023.3 +urllib3==2.0.4 +wcwidth==0.2.6 +webencodings==0.5.1 +xmltodict==0.13.0 +yarl==1.9.2 diff --git a/tests/test_gpt4all.py b/tests/test_gpt4all.py new file mode 100644 index 0000000..d516022 --- /dev/null +++ b/tests/test_gpt4all.py @@ -0,0 +1,67 @@ +from caafe import CAAFEClassifier # Automated Feature Engineering for tabular datasets +from tabpfn import TabPFNClassifier # Fast Automated Machine Learning method for small tabular datasets +from sklearn.ensemble import RandomForestClassifier + +import os +import torch +from caafe import data +from sklearn.metrics import accuracy_score, roc_auc_score +from tabpfn.scripts import tabular_metrics +from functools import partial + +metric_used = tabular_metrics.auc_metric +cc_test_datasets_multiclass = data.load_all_data() + +ds = cc_test_datasets_multiclass[5] +ds, df_train, df_test, _, _ = data.get_data_split(ds, seed=0) +target_column_name = ds[4][-1] +dataset_description = ds[-1] +ds[0] + +from caafe.preprocessing import make_datasets_numeric +df_train, df_test = make_datasets_numeric(df_train, df_test, target_column_name) +train_x, train_y = data.get_X_y(df_train, target_column_name) +test_x, test_y = data.get_X_y(df_test, target_column_name) + +### Setup Base Classifier + +# clf_no_feat_eng = RandomForestClassifier() +clf_no_feat_eng = TabPFNClassifier(device=('cuda' if torch.cuda.is_available() else 'cpu'), N_ensemble_configurations=4) +clf_no_feat_eng.fit = partial(clf_no_feat_eng.fit, overwrite_warning=True) + +clf_no_feat_eng.fit(train_x, train_y) +pred = clf_no_feat_eng.predict(test_x) +acc = accuracy_score(pred, test_y) +print(f'Accuracy BEFORE CAAFE {acc}') +#roc_auc = roc_auc_score(pred, test_y) +#print(f'ROC auc BEFORE CAAFE {roc_auc}') + +### Setup and Run CAAFE - This will be billed to your OpenAI Account (in case you use it with llm_model with any openai model)! 
+ +## OBS: You need to manually download the model files +models_list = ["ggml-model-gpt4all-falcon-q4_0.bin", + "starcoderbase-3b-ggml.bin", + "starcoderbase-7b-ggml.bin", + "llama-2-7b-chat.ggmlv3.q4_0.bin", + "nous-hermes-13b.ggmlv3.q4_0.bin", + "wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin"] + +os.environ["GPT4ALL_MODEL_BIN"] = f"/{models_list[1]}" + +caafe_clf = CAAFEClassifier(base_classifier=clf_no_feat_eng, + iterations=10, + llm_model="gpt4all", + display_method="print") + +caafe_clf.fit_pandas(df_train, + target_column_name=target_column_name, + dataset_description=dataset_description) + +pred = caafe_clf.predict(df_test) +acc = accuracy_score(pred, test_y) +print(f'Accuracy AFTER CAAFE {acc}') +#roc_auc = roc_auc_score(pred, test_y) +#print(f'ROC auc AFTER CAAFE {roc_auc}') + + +print(caafe_clf.code) \ No newline at end of file