Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 105 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

models/
70 changes: 52 additions & 18 deletions caafe/caafe.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import os
import copy
import numpy as np

import openai
from gpt4all import GPT4All
from sklearn.model_selection import RepeatedKFold
from .caafe_evaluate import (
evaluate_dataset,
Expand All @@ -17,23 +19,23 @@ def get_prompt(
if iterative == 1
else "exactly one useful column"
)
return f"""
prompt = f"""
The dataframe `df` is loaded and in memory. Columns are also named attributes.
Description of the dataset in `df` (column dtypes might be inaccurate):
"{data_description_unparsed}"

Columns in `df` (true feature dtypes listed here, categoricals encoded as int):
{samples}

This code was written by an expert datascientist working to improve predictions. It is a snippet of code that adds new columns to the dataset.
This code was written by an expert data scientist working to improve predictions. It is a snippet of code that adds new columns to the dataset.
Number of samples (rows) in training dataset: {int(len(df))}

This code generates additional columns that are useful for a downstream classification algorithm (such as XGBoost) predicting \"{ds[4][-1]}\".
Additional columns add new semantic information, that is they use real world knowledge on the dataset. They can e.g. be feature combinations, transformations, aggregations where the new column is a function of the existing columns.
The scale of columns and offset does not matter. Make sure all used columns exist. Follow the above description of columns closely and consider the datatypes and meanings of classes.
This code also drops columns, if these may be redundant and hurt the predictive performance of the downstream classifier (Feature selection). Dropping columns may help as the chance of overfitting is lower, especially if the dataset is small.
The classifier will be trained on the dataset with the generated columns and evaluated on a holdout set. The evaluation metric is accuracy. The best performing code will be selected.
Added columns can be used in other codeblocks, dropped columns are not available anymore.
The classifier will be trained on the dataset with the generated columns and evaluated on a holdout set. The evaluation metric is f1 score and ROC auc. The best performing code will be selected.
Added columns can be used in other codeblocks, for the new added columns a naming pattern must be enforced as lower case words separated by underscore, dropped columns are not available anymore.

Code formatting for each added column:
```python
Expand All @@ -53,6 +55,8 @@ def get_prompt(
Each codeblock ends with ```end and starts with "```python"
Codeblock:
"""
print(prompt)
return prompt


# Each codeblock either generates {how_many} or drops bad columns (Feature selection).
Expand Down Expand Up @@ -96,7 +100,8 @@ def build_prompt_from_df(ds, df, iterative=1):
def generate_features(
ds,
df,
model="gpt-3.5-turbo",
model,
device="cpu",
just_print_prompt=False,
iterative=1,
metric_used=None,
Expand All @@ -105,6 +110,10 @@ def generate_features(
n_splits=10,
n_repeats=2,
):

if model not in ["gpt-3.5-turbo", "gpt4all"]:
raise SystemExit("\n\nerror** `model` must be `gpt-3.5-turbo` or `gpt4all`\n\n")

def format_for_display(code):
code = code.replace("```python", "").replace("```", "").replace("<end>", "")
return code
Expand All @@ -127,20 +136,45 @@ def format_for_display(code):
code, prompt = None, prompt
return code, prompt, None

def generate_code(messages, model, device):
    """Ask the selected LLM backend to produce a feature-engineering snippet.

    Parameters:
        messages: chat-style message list; messages[0] is the system prompt,
            messages[1] the user prompt (gpt4all path assumes exactly this).
        model: "gpt-3.5-turbo" (OpenAI API), "gpt4all" (local model), or
            "skip" to generate nothing.
        device: "cuda" or "cpu"; only used to place the local gpt4all model.

    Returns:
        The generated Python code with markdown fences stripped, or "" when
        model == "skip".

    Raises:
        SystemExit: if model == "gpt4all" and the GPT4ALL_MODEL_BIN
            environment variable is not set.
    """
    if model == "skip":
        return ""

    if model == "gpt-3.5-turbo":
        completion = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            stop=["```end"],
            temperature=0.5,
            max_tokens=500,
        )
        code = completion["choices"][0]["message"]["content"]
        code = code.replace("```python", "").replace("```", "").replace("<end>", "")
        return code

    if model == "gpt4all":
        # Fixed typo: was `gtp4all_model_bin`.
        gpt4all_model_bin = os.getenv("GPT4ALL_MODEL_BIN")

        if not gpt4all_model_bin:
            msg = "\n\n error** Environment variable `GPT4ALL_MODEL_BIN`"
            msg += " pointing to the model file path is not defined.\n\n"
            raise SystemExit(msg)

        # Cache the loaded model on the function object. The original check
        # (`"model_gpt4all" not in list(locals()) + list(globals())`) was
        # always true inside a function, so the model was reloaded from disk
        # on every call; a function attribute survives across calls.
        if not hasattr(generate_code, "_gpt4all_model"):
            gpt4all_device = "gpu" if device == "cuda" else "cpu"
            generate_code._gpt4all_model = GPT4All(
                gpt4all_model_bin, allow_download=True, device=gpt4all_device
            )
        model_gpt4all = generate_code._gpt4all_model

        system_template = messages[0].get("content")
        prompt_template = "USER: {0}\nASSISTANT: ".format(messages[1].get("content"))
        prompt_message = system_template + prompt_template

        completion = model_gpt4all.generate(prompt_message,
                                            temp=0.5,
                                            max_tokens=500)

        code = completion.split("```python")[-1].split("```end")[0]
        print(f"\n\ngenerated code:\n{code}\n\n")
        return code

def execute_and_evaluate_code_block(full_code, code):
old_accs, old_rocs, accs, rocs = [], [], [], []
Expand Down Expand Up @@ -246,7 +280,7 @@ def execute_and_evaluate_code_block(full_code, code):
i = 0
while i < n_iter:
try:
code = generate_code(messages)
code = generate_code(messages, model, device)
except Exception as e:
display_method("Error in LLM API." + str(e))
continue
Expand Down
7 changes: 6 additions & 1 deletion caafe/sklearn_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import numpy as np
from typing import Optional
import pandas as pd
import torch



Expand All @@ -28,6 +29,7 @@ class CAAFEClassifier(BaseEstimator, ClassifierMixin):
llm_model (str, optional): The LLM model to use for generating features. Defaults to 'gpt-3.5-turbo'.
n_splits (int, optional): The number of cross-validation splits to use during feature generation. Defaults to 10.
n_repeats (int, optional): The number of times to repeat the cross-validation during feature generation. Defaults to 2.
display_method (str, optional): Display method between markdown (Ipython.display) and print. Defaults to markdown.
"""
def __init__(
self,
Expand All @@ -37,6 +39,7 @@ def __init__(
llm_model: str = "gpt-3.5-turbo",
n_splits: int = 10,
n_repeats: int = 2,
display_method="markdown"
) -> None:
self.base_classifier = base_classifier
if self.base_classifier is None:
Expand All @@ -56,6 +59,7 @@ def __init__(
self.optimization_metric = optimization_metric
self.n_splits = n_splits
self.n_repeats = n_repeats
self.display_method = display_method

def fit_pandas(self, df, dataset_description, target_column_name, **kwargs):
"""
Expand Down Expand Up @@ -134,10 +138,11 @@ def fit(
ds,
df_train,
model=self.llm_model,
device="cuda" if torch.cuda.is_available() else "cpu",
iterative=self.iterations,
metric_used=auc_metric,
iterative_method=self.base_classifier,
display_method="markdown",
display_method=self.display_method,
n_splits=self.n_splits,
n_repeats=self.n_repeats,
)
Expand Down
66 changes: 66 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
aiohttp==3.8.5
aiosignal==1.3.1
appnope==0.1.3
asttokens==2.4.0
async-timeout==4.0.3
attrs==23.1.0
backcall==0.2.0
bleach==6.0.0
certifi==2023.7.22
charset-normalizer==3.2.0
decorator==5.1.1
exceptiongroup==1.1.3
executing==1.2.0
filelock==3.12.3
frozenlist==1.4.0
gpt4all==1.0.10
idna==3.4
ipdb==0.13.13
ipython==8.15.0
jedi==0.19.0
Jinja2==3.1.2
joblib==1.3.2
kaggle==1.5.16
liac-arff==2.5.0
MarkupSafe==2.1.3
matplotlib-inline==0.1.6
minio==7.1.16
mpmath==1.3.0
multidict==6.0.4
networkx==3.1
numpy==1.25.2
openai==0.28.0
openml==0.12.0
pandas==2.1.0
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
prompt-toolkit==3.0.39
ptyprocess==0.7.0
pure-eval==0.2.2
pyarrow==13.0.0
Pygments==2.16.1
python-dateutil==2.8.2
python-slugify==8.0.1
pytz==2023.3.post1
PyYAML==6.0.1
requests==2.31.0
scikit-learn==1.3.0
scipy==1.11.2
six==1.16.0
stack-data==0.6.2
sympy==1.12
tabpfn==0.1.9
text-unidecode==1.3
threadpoolctl==3.2.0
tomli==2.0.1
torch==2.0.1
tqdm==4.66.1
traitlets==5.9.0
typing_extensions==4.7.1
tzdata==2023.3
urllib3==2.0.4
wcwidth==0.2.6
webencodings==0.5.1
xmltodict==0.13.0
yarl==1.9.2
Loading