diff --git a/graphdoc/assets/configs/single_prompt_doc_generator_module.yaml b/graphdoc/assets/configs/single_prompt_doc_generator_module.yaml index b26b7ac..cdf1d94 100644 --- a/graphdoc/assets/configs/single_prompt_doc_generator_module.yaml +++ b/graphdoc/assets/configs/single_prompt_doc_generator_module.yaml @@ -4,6 +4,11 @@ graphdoc: mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server +mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # The tracking URI for MLflow + mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + language_model: model: openai/gpt-4o # Must be a valid dspy language model api_key: !env OPENAI_API_KEY # Must be a valid dspy language model API key diff --git a/graphdoc/assets/configs/single_prompt_doc_generator_module_eval.yaml b/graphdoc/assets/configs/single_prompt_doc_generator_module_eval.yaml index ad3da9f..41db88a 100644 --- a/graphdoc/assets/configs/single_prompt_doc_generator_module_eval.yaml +++ b/graphdoc/assets/configs/single_prompt_doc_generator_module_eval.yaml @@ -4,6 +4,11 @@ graphdoc: mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server +mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # The tracking URI for MLflow + mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + language_model: model: openai/gpt-4o # Must be a valid dspy language model api_key: !env OPENAI_API_KEY # Must be a valid dspy language model API 
key diff --git a/graphdoc/assets/configs/single_prompt_doc_generator_trainer.yaml b/graphdoc/assets/configs/single_prompt_doc_generator_trainer.yaml index 724c1da..1693e9a 100644 --- a/graphdoc/assets/configs/single_prompt_doc_generator_trainer.yaml +++ b/graphdoc/assets/configs/single_prompt_doc_generator_trainer.yaml @@ -4,6 +4,11 @@ graphdoc: mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server +mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # The tracking URI for MLflow + mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + language_model: model: openai/gpt-4o # Must be a valid dspy language model api_key: !env OPENAI_API_KEY # Must be a valid dspy language model API key diff --git a/graphdoc/assets/configs/single_prompt_doc_quality_trainer.yaml b/graphdoc/assets/configs/single_prompt_doc_quality_trainer.yaml index f80ce5b..6a0a48c 100644 --- a/graphdoc/assets/configs/single_prompt_doc_quality_trainer.yaml +++ b/graphdoc/assets/configs/single_prompt_doc_quality_trainer.yaml @@ -4,6 +4,11 @@ graphdoc: mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server +mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # The tracking URI for MLflow + mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + language_model: model: openai/gpt-4o # Must be a valid dspy language model api_key: !env OPENAI_API_KEY # Must be a valid dspy language model API key diff --git 
a/graphdoc/graphdoc/__init__.py b/graphdoc/graphdoc/__init__.py index 5b90143..22d85aa 100644 --- a/graphdoc/graphdoc/__init__.py +++ b/graphdoc/graphdoc/__init__.py @@ -1,6 +1,21 @@ # Copyright 2025-, Semiotic AI, Inc. # SPDX-License-Identifier: Apache-2.0 +from graphdoc.config import ( + doc_generator_eval_from_yaml, + doc_generator_module_from_dict, + doc_generator_module_from_yaml, + mlflow_data_helper_from_dict, + mlflow_data_helper_from_yaml, + single_prompt_from_dict, + single_prompt_from_yaml, + single_trainer_from_dict, + single_trainer_from_yaml, + split_trainset, + trainset_and_evalset_from_yaml, + trainset_from_dict, + trainset_from_yaml, +) from graphdoc.data import ( DspyDataHelper, GenerationDataHelper, @@ -79,4 +94,17 @@ "load_yaml_config", "schema_objects_to_dataset", "setup_logging", + "mlflow_data_helper_from_dict", + "mlflow_data_helper_from_yaml", + "trainset_from_dict", + "trainset_from_yaml", + "split_trainset", + "trainset_and_evalset_from_yaml", + "single_prompt_from_dict", + "single_prompt_from_yaml", + "single_trainer_from_dict", + "single_trainer_from_yaml", + "doc_generator_module_from_dict", + "doc_generator_module_from_yaml", + "doc_generator_eval_from_yaml", ] diff --git a/graphdoc/graphdoc/config.py b/graphdoc/graphdoc/config.py new file mode 100644 index 0000000..d12b2de --- /dev/null +++ b/graphdoc/graphdoc/config.py @@ -0,0 +1,564 @@ +# Copyright 2025-, Semiotic AI, Inc. 
+# SPDX-License-Identifier: Apache-2.0 + + +import logging + +# system packages +import random +from pathlib import Path +from typing import List, Optional, Union + +# external packages +import dspy + +# internal packages +from graphdoc.data import ( + DspyDataHelper, + GenerationDataHelper, + LocalDataHelper, + MlflowDataHelper, + QualityDataHelper, + load_yaml_config, +) +from graphdoc.eval import DocGeneratorEvaluator +from graphdoc.modules import DocGeneratorModule +from graphdoc.prompts import DocGeneratorPrompt, PromptFactory, SinglePrompt +from graphdoc.train import SinglePromptTrainer, TrainerFactory + +# logging +log = logging.getLogger(__name__) + +# global variables +random.seed(42) + + +####################### +# Resource Setup # +####################### + + +def mlflow_data_helper_from_dict(mlflow_config: dict) -> MlflowDataHelper: + return MlflowDataHelper( + mlflow_tracking_uri=mlflow_config["mlflow_tracking_uri"], + mlflow_tracking_username=mlflow_config["mlflow_tracking_username"], + mlflow_tracking_password=mlflow_config["mlflow_tracking_password"], + ) + + +def mlflow_data_helper_from_yaml(yaml_path: Union[str, Path]) -> MlflowDataHelper: + config = load_yaml_config(yaml_path) + return mlflow_data_helper_from_dict( + config["mlflow"], + ) + + +####################### +# Data Methods # +####################### +def trainset_from_dict(trainset_dict: dict) -> List[dspy.Example]: + """ + Load a trainset from a dictionary of parameters. 
+ + { + "hf_api_key": !env HF_DATASET_KEY, # Must be a valid Hugging + # Face API key + # (with permission to + # access graphdoc) + # TODO: we may make + # this public in the future + "load_from_hf": false, # Whether to load the dataset + # from Hugging Face + "load_from_local": true, # Whether to load the dataset + # from a local directory + "load_local_specific_category": false, # Whether to load all categories + # or a specific category + "local_specific_category": perfect, # The specific category + # (if load_from_local is true) + "local_parse_objects": true, # Whether to parse the objects + # in the dataset + # (if load_from_local is true) + "split_for_eval": true, # Whether to split the dataset + # into trainset and evalset + "trainset_size": 1000, # The size of the trainset + "evalset_ratio": 0.1, # The proportionate size of evalset + "data_helper_type": "quality" # Type of data helper to use + # (quality, generation) + } + + :param trainset_dict: Dictionary containing trainset parameters. + :type trainset_dict: dict + :return: A trainset. 
+ :rtype: List[dspy.Example] + """ + # TODO: refactor to enable the passing of alternative schema_directory_path, + # and the related enums that must be passed in turn + ldh = LocalDataHelper() + + if trainset_dict["data_helper_type"] == "quality": + dh = QualityDataHelper() + elif trainset_dict["data_helper_type"] == "generation": + dh = GenerationDataHelper() + else: + raise ValueError( + f"Invalid data helper type: {trainset_dict['data_helper_type']}" + ) + + # TODO: refactor to be more ergonomic once we have more data sources implemented + if trainset_dict["load_from_hf"]: + raise NotImplementedError("loading from Hugging Face is not implemented") + if trainset_dict["load_from_local"]: + if trainset_dict["load_local_specific_category"]: + raise NotImplementedError("loading a specific category is not implemented") + dataset = ldh.folder_of_folders_to_dataset( + parse_objects=trainset_dict["local_parse_objects"] + ) + trainset = dh.trainset(dataset) + if trainset_dict["trainset_size"] and isinstance( + trainset_dict["trainset_size"], int + ): + trainset = trainset[: trainset_dict["trainset_size"]] + return trainset + else: + raise ValueError( + "Current implementation only supports loading from local directory" + ) + + +def trainset_from_yaml(yaml_path: Union[str, Path]) -> List[dspy.Example]: + """Load a trainset from a YAML file. 
+ + data: + hf_api_key: !env HF_DATASET_KEY # Must be a valid Hugging Face API key + # (with permission to access graphdoc) + # TODO: we may make this public + load_from_hf: false # Load the dataset from Hugging Face + load_from_local: true # Load the dataset from a local directory + load_local_specific_category: false # Load all categories or a specific category + # (if load_from_local is true) + local_specific_category: perfect, # Which category to load from the dataset + # (if load_from_local is true) + local_parse_objects: true, # Whether to parse the objects + # in the dataset + # (if load_from_local is true) + split_for_eval: true, # Whether to split the dataset + # into trainset and evalset + trainset_size: 1000, # The size of the trainset + evalset_ratio: 0.1, # The proportionate size of evalset + data_helper_type: quality # Type of data helper to use + # (quality, generation) + + :param yaml_path: Path to the YAML file. + :type yaml_path: Union[str, Path] + :return: A trainset. + :rtype: List[dspy.Example] + + """ + config = load_yaml_config(yaml_path) + trainset = trainset_from_dict(config["data"]) + return trainset + + +def split_trainset( + trainset: List[dspy.Example], evalset_ratio: float +) -> tuple[List[dspy.Example], List[dspy.Example]]: + """Split a trainset into a trainset and evalset. + + :param trainset: The trainset to split. :type trainset: List[dspy.Example] + :param evalset_ratio: The proportionate size of the evalset. :type + evalset_ratio: float :return: A tuple of trainset and evalset. :rtype: + tuple[List[dspy.Example], List[dspy.Example]] + + """ + split_idx = int(len(trainset) * (1 - evalset_ratio)) + random.shuffle(trainset) + evalset = trainset[split_idx:] + trainset = trainset[:split_idx] + return trainset, evalset + + +def trainset_and_evalset_from_yaml( + yaml_path: Union[str, Path] +) -> tuple[List[dspy.Example], List[dspy.Example]]: + """Load a trainset and evalset from a YAML file. 
+ + data: + hf_api_key: !env HF_DATASET_KEY # Must be a valid Hugging Face API key + # (with permission to access graphdoc) + # TODO: we may make this public + load_from_hf: false # Load the dataset from Hugging Face + load_from_local: true # Load the dataset from a local directory + load_local_specific_category: false # Load all categories or a specific category + # (if load_from_local is true) + local_specific_category: perfect, # Which category to load from the dataset + # (if load_from_local is true) + local_parse_objects: true, # Whether to parse the objects + # in the dataset + # (if load_from_local is true) + split_for_eval: true, # Whether to split the dataset + # into trainset and evalset + trainset_size: 1000, # The size of the trainset + evalset_ratio: 0.1, # The proportionate size of evalset + data_helper_type: quality # Type of data helper to use + # (quality, generation) + + :param yaml_path: Path to the YAML file. + :type yaml_path: Union[str, Path] + :return: A tuple of trainset and evalset. + :rtype: tuple[List[dspy.Example], List[dspy.Example]] + + """ + config = load_yaml_config(yaml_path) + trainset = trainset_from_dict(config["data"]) + return split_trainset(trainset, config["data"]["evalset_ratio"]) + + +####################### +# Prompt Methods # +####################### +def single_prompt_from_dict( + prompt_dict: dict, + prompt_metric: Union[str, SinglePrompt], + mlflow_dict: Optional[dict] = None, +) -> SinglePrompt: + """ + Load a single prompt from a dictionary of parameters. 
+ + { + "prompt": "doc_quality", # Which prompt signature to use + "class": "SchemaDocQualityPrompt", # Must be a child of SinglePrompt + "type": "predict", # The type of prompt to use + # (predict, chain_of_thought) + "metric": "rating", # The type of metric to use + # (rating, category) + "load_from_mlflow": false, # Whether to load the prompt from an MLFlow URI + "model_uri": null, # The tracking URI for MLflow + "model_name": null, # The name of the model in MLflow + "model_version": null # The version of the model in MLflow + "prompt_metric": False # Whether another prompt is used + # to calculate the metric + # (in which case we must also load that prompt) + } + + :param prompt_dict: Dictionary containing prompt information. + :type prompt_dict: dict + :param prompt_metric: The metric to use to calculate the metric. + Can be another prompt signature or a string. + :type prompt_metric: Union[str, SinglePrompt] + :return: A SinglePrompt object. + :rtype: SinglePrompt + """ + try: + # if we are loading from mlflow, modify the prompt_dict with the loaded model + if prompt_dict["load_from_mlflow"]: + if mlflow_dict: + log.info(f"Loading prompt from MLflow: {prompt_dict}") + mdh = mlflow_data_helper_from_dict(mlflow_dict) + prompt = mdh.model_by_args(prompt_dict) + log.info(f"Prompt loaded from MLflow: {prompt}") + prompt_signature = DspyDataHelper.prompt_signature(prompt) + prompt_dict["prompt"] = prompt_signature + else: + raise ValueError("MLflow tracking dict not provided") + + return PromptFactory.single_prompt( + prompt=prompt_dict["prompt"], + prompt_class=prompt_dict["class"], + prompt_type=prompt_dict["type"], + prompt_metric=prompt_metric, + ) + except Exception as e: + log.error(f"Error creating single prompt: {e}") + raise e + + +def single_prompt_from_yaml(yaml_path: Union[str, Path]) -> SinglePrompt: + """Load a single prompt from a YAML file. 
+ + prompt: + prompt: base_doc_gen # Which prompt signature to use + class: DocGeneratorPrompt # Must be a child of SinglePrompt + # (we will use an enum to map this) + type: chain_of_thought # The type of prompt to use + # (predict, chain_of_thought) + metric: rating # The type of metric to use + # (rating, category) + load_from_mlflow: false # Whether to load the prompt + # from an MLFlow URI + model_uri: null # The tracking URI for MLflow + model_name: null # The name of the model in MLflow + model_version: null # The version of the model in MLflow + prompt_metric: true # Whether another prompt is used + # to calculate the metric + # (in which case we must also load that prompt) + + prompt_metric: + prompt: doc_quality # The prompt to use to calculate + # the metric + class: DocQualityPrompt # The class of the prompt to use + # to calculate the metric + type: predict # The type of prompt to use + # to calculate the metric + metric: rating # The metric to use to calculate + # the metric + load_from_mlflow: false # Whether to load the prompt + # from an MLFlow URI + + :param yaml_path: Path to the YAML file. + :type yaml_path: str + :return: A SinglePrompt object. 
+ :rtype: SinglePrompt + + """ + config = load_yaml_config(yaml_path) + mlflow_config = config.get("mlflow", None) + if config["prompt"]["prompt_metric"]: + prompt_metric_config = config["prompt_metric"] + prompt_metric_metric = prompt_metric_config["metric"] + prompt_metric = single_prompt_from_dict( + prompt_metric_config, prompt_metric_metric, mlflow_config + ) + else: + prompt_metric = config["prompt"]["metric"] + prompt = single_prompt_from_dict(config["prompt"], prompt_metric, mlflow_config) + return prompt + + +####################### +# Trainer Methods # +####################### +def single_trainer_from_dict( + trainer_dict: dict, + prompt: SinglePrompt, + trainset: Optional[List[dspy.Example]] = None, + evalset: Optional[List[dspy.Example]] = None, +) -> SinglePromptTrainer: + """ + Load a single trainer from a dictionary of parameters. + + { + "trainer": { + "class": "DocQualityTrainer", + "mlflow_model_name": "doc_quality_model", + "mlflow_experiment_name": "doc_quality_experiment", + "mlflow_tracking_uri": "http://localhost:5000" + }, + "optimizer": { + "optimizer_type": "miprov2", + "auto": "light", + "max_labeled_demos": 2, + "max_bootstrapped_demos": 4, + "num_trials": 2, + "minibatch": true + }, + } + + :param trainer_dict: Dictionary containing trainer parameters. + :type trainer_dict: dict + :param prompt: The prompt to use for this trainer. + :type prompt: SinglePrompt + :return: A SinglePromptTrainer object. 
+ :rtype: SinglePromptTrainer + """ + if trainset is None: + trainset = [] + if evalset is None: + evalset = [] + try: + return TrainerFactory.single_trainer( + trainer_class=trainer_dict["trainer"]["class"], + prompt=prompt, + optimizer_type=trainer_dict["optimizer"]["optimizer_type"], + optimizer_kwargs=trainer_dict["optimizer"], + mlflow_model_name=trainer_dict["trainer"]["mlflow_model_name"], + mlflow_experiment_name=trainer_dict["trainer"]["mlflow_experiment_name"], + mlflow_tracking_uri=trainer_dict["trainer"]["mlflow_tracking_uri"], + trainset=trainset, + evalset=evalset, + ) + except Exception as e: + log.error(f"Error creating single trainer: {e}") + raise e + + +def single_trainer_from_yaml(yaml_path: Union[str, Path]) -> SinglePromptTrainer: + """Load a single prompt trainer from a YAML file. + + trainer: + hf_api_key: !env HF_DATASET_KEY # Must be a valid Hugging Face API key + # (with permission to access graphdoc) + # TODO: we may make this public + load_from_hf: false # Load the dataset from Hugging Face + load_from_local: true # Load the dataset from a local directory + load_local_specific_category: false # Load all categories or a specific category + # (if load_from_local is true) + local_specific_category: perfect, # Which category to load from the dataset + # (if load_from_local is true) + local_parse_objects: true, # Whether to parse the objects + # in the dataset + # (if load_from_local is true) + split_for_eval: true, # Whether to split the dataset + # into trainset and evalset + trainset_size: 1000, # The size of the trainset + evalset_ratio: 0.1, # The proportionate size of evalset + + prompt: + prompt: base_doc_gen # Which prompt signature to use + class: DocGeneratorPrompt # Must be a child of SinglePrompt + # (we will use an enum to map this) + type: chain_of_thought # The type of prompt to use + # (predict, chain_of_thought) + metric: rating # The type of metric to use + # (rating, category) + load_from_mlflow: false # Load the prompt
from an MLFlow URI + model_uri: null # The tracking URI for MLflow + model_name: null # The name of the model in MLflow + model_version: null # The version of the model in MLflow + prompt_metric: true # Whether another prompt is used + # to calculate the metric + # (in which case we must load prompt) + + prompt_metric: + prompt: doc_quality # The prompt to use to calculate the metric + class: DocQualityPrompt # The class of the prompt to use + # to calculate the metric + type: predict # The type of prompt to use + # to calculate the metric + metric: rating # The metric to use to calculate + # the metric + load_from_mlflow: false # Whether to load the prompt + # from an MLFlow URI + model_uri: null # The tracking URI for MLflow + model_name: null # The name of the model in MLflow + model_version: null # The version of the model in MLflow + + :param yaml_path: Path to the YAML file. + :type yaml_path: Union[str, Path] + :return: A SinglePromptTrainer object. + :rtype: SinglePromptTrainer + + """ + try: + config = load_yaml_config(yaml_path) + prompt = single_prompt_from_yaml(yaml_path) + trainset, evalset = trainset_and_evalset_from_yaml(yaml_path) + return single_trainer_from_dict(config, prompt, trainset, evalset) + except Exception as e: + log.error(f"Error creating trainer from YAML: {e}") + raise e + + +####################### +# Module Methods # +####################### +def doc_generator_module_from_dict( + module_dict: dict, prompt: Union[DocGeneratorPrompt, SinglePrompt] +) -> DocGeneratorModule: + """ + Load a doc generator module from a dictionary of parameters. + + { + "retry": true, + "retry_limit": 1, + "rating_threshold": 3, + "fill_empty_descriptions": true + } + + :param module_dict: Dictionary containing module parameters. + :type module_dict: dict + :param prompt: The prompt to use for this module. + :type prompt: DocGeneratorPrompt + :return: A DocGeneratorModule object. 
+ :rtype: DocGeneratorModule + """ + return DocGeneratorModule( + prompt=prompt, + retry=module_dict["retry"], + retry_limit=module_dict["retry_limit"], + rating_threshold=module_dict["rating_threshold"], + fill_empty_descriptions=module_dict["fill_empty_descriptions"], + ) + + +def doc_generator_module_from_yaml(yaml_path: Union[str, Path]) -> DocGeneratorModule: + """Load a doc generator module from a YAML file. + + prompt: + prompt: base_doc_gen # Which prompt signature to use + class: DocGeneratorPrompt # Must be a child of SinglePrompt + # (we will use an enum to map this) + type: chain_of_thought # The type of prompt to use + # (predict, chain_of_thought) + metric: rating # The type of metric to use + # (rating, category) + load_from_mlflow: false # Whether to load the prompt + # from an MLFlow URI + model_uri: null # The tracking URI for MLflow + model_name: null # The name of the model in MLflow + model_version: null # The version of the model in MLflow + prompt_metric: true # Whether another prompt is used + # to calculate the metric + # (in which case we must load that prompt) + + prompt_metric: + prompt: doc_quality # The prompt to use to calculate the metric + class: DocQualityPrompt # The class of the prompt to use + # to calculate the metric + type: predict # The type of prompt to use + # to calculate the metric + metric: rating # The metric to use to calculate the metric + load_from_mlflow: false # Whether to load the prompt + # from an MLFlow URI + model_uri: null # The tracking URI for MLflow + model_name: null # The name of the model in MLflow + model_version: null # The version of the model in MLflow + + module: + retry: true # Whether to retry the generation + # if the quality check fails + retry_limit: 1 # The maximum number of retries + rating_threshold: 3 # The rating threshold for the quality check + fill_empty_descriptions: true # Whether to fill empty descriptions with + # generated documentation + + """ + config = 
load_yaml_config(yaml_path)["module"] + prompt = single_prompt_from_yaml(yaml_path) + return doc_generator_module_from_dict(config, prompt) + + +####################### +# Eval Methods # +####################### +def doc_generator_eval_from_yaml(yaml_path: Union[str, Path]) -> DocGeneratorEvaluator: + """Load a doc generator evaluator from a YAML file.""" + # load the generator + generator = doc_generator_module_from_yaml(yaml_path) + config = load_yaml_config(yaml_path) + + # load the evaluator + metric_config = config["prompt_metric"] + evaluator = single_prompt_from_dict(metric_config, metric_config["metric"]) + + # load the eval config + mdh = mlflow_data_helper_from_yaml(yaml_path) # noqa: F841 + mlflow_tracking_uri = config["mlflow"]["mlflow_tracking_uri"] + mlflow_experiment_name = config["eval"]["mlflow_experiment_name"] + generator_prediction_field = config["eval"]["generator_prediction_field"] + evaluator_prediction_field = config["eval"]["evaluator_prediction_field"] + readable_value = config["eval"]["readable_value"] + + # load the evalset + evalset = trainset_from_yaml(yaml_path) + + # return the evaluator + return DocGeneratorEvaluator( + generator=generator, + evaluator=evaluator, + evalset=evalset, + mlflow_tracking_uri=mlflow_tracking_uri, + mlflow_experiment_name=mlflow_experiment_name, + generator_prediction_field=generator_prediction_field, + evaluator_prediction_field=evaluator_prediction_field, + readable_value=readable_value, + ) diff --git a/graphdoc/graphdoc/main.py b/graphdoc/graphdoc/main.py index a4b534e..588f7f9 100644 --- a/graphdoc/graphdoc/main.py +++ b/graphdoc/graphdoc/main.py @@ -7,26 +7,15 @@ # system packages import sys -from pathlib import Path -from typing import List, Literal, Optional, Union - -# external packages -import dspy # internal packages -from graphdoc.data import ( - DspyDataHelper, - GenerationDataHelper, - LocalDataHelper, - MlflowDataHelper, - QualityDataHelper, - load_yaml_config, - setup_logging, +from 
graphdoc.config import ( + doc_generator_eval_from_yaml, + doc_generator_module_from_yaml, + single_trainer_from_yaml, ) -from graphdoc.eval import DocGeneratorEvaluator -from graphdoc.modules import DocGeneratorModule -from graphdoc.prompts import DocGeneratorPrompt, PromptFactory, SinglePrompt -from graphdoc.train import SinglePromptTrainer, TrainerFactory + +# external packages # logging log = logging.getLogger(__name__) @@ -34,610 +23,6 @@ # global variables random.seed(42) - -class GraphDoc: - def __init__( - self, - model_args: dict, - mlflow_tracking_uri: Optional[Union[str, Path]] = None, - mlflow_tracking_username: Optional[str] = None, - mlflow_tracking_password: Optional[str] = None, - log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO", - ): - """Main entry point for the GraphDoc class. Refer to DSPy for a complete list of - model arguments. - - :param model_args: Dictionary containing model arguments. - :type model_args: dict - :param mlflow_tracking_uri: MLflow tracking URI. - :type mlflow_tracking_uri: Optional[str] - :param log_level: Logging level. - - """ - setup_logging(log_level) - log.info(f"GraphDoc initialized with model_args: {model_args}") - - try: - self.lm = dspy.LM(**model_args) - dspy.configure(lm=self.lm) - except Exception as e: - log.error(f"Error initializing LM: {e}") - raise e - - if mlflow_tracking_uri: - self.mdh = MlflowDataHelper( - mlflow_tracking_uri, - mlflow_tracking_username, - mlflow_tracking_password, - ) - else: - self.mdh = None - self.mlflow_tracking_uri = mlflow_tracking_uri - - ####################### - # Class Methods # - ####################### - @classmethod - def from_dict(cls, config_dict: dict) -> "GraphDoc": - """ - Create a GraphDoc object from a dictionary of parameters. 
- - { - "graphdoc": { - "log_level": "INFO", - "mlflow_tracking_uri": "http://localhost:5001", - "mlflow_tracking_username": "admin", - "mlflow_tracking_password": "password" - }, - "language_model": { - "model": "openai/gpt-4o", - "api_key": "!env OPENAI_API_KEY", - } - } - """ - return GraphDoc( - model_args=config_dict["language_model"], - mlflow_tracking_uri=config_dict["graphdoc"].get( - "mlflow_tracking_uri", None - ), - mlflow_tracking_username=config_dict["graphdoc"].get( - "mlflow_tracking_username", None - ), - mlflow_tracking_password=config_dict["graphdoc"].get( - "mlflow_tracking_password", None - ), - log_level=config_dict["graphdoc"].get("log_level", "INFO"), - ) - - @classmethod - def from_yaml(cls, yaml_path: Union[str, Path]) -> "GraphDoc": - """Create a GraphDoc object from a YAML file. - - graphdoc: - log_level: INFO # The log level to use - - language_model: - model: openai/gpt-4o # Must be a valid dspy - # language model - api_key: !env OPENAI_API_KEY # Must be a valid dspy - # language model API key - cache: true # Whether to cache the calls - # to the language model - - """ - config = load_yaml_config(yaml_path) - return GraphDoc.from_dict(config) - - ####################### - # Data Methods # - ####################### - def trainset_from_dict(self, trainset_dict: dict) -> List[dspy.Example]: - """ - Load a trainset from a dictionary of parameters. 
- - { - "hf_api_key": !env HF_DATASET_KEY, # Must be a valid Hugging - # Face API key - # (with permission to - # access graphdoc) - # TODO: we may make - # this public in the future - "load_from_hf": false, # Whether to load the dataset - # from Hugging Face - "load_from_local": true, # Whether to load the dataset - # from a local directory - "load_local_specific_category": false, # Whether to load all categories - # or a specific category - "local_specific_category": perfect, # The specific category - # (if load_from_local is true) - "local_parse_objects": true, # Whether to parse the objects - # in the dataset - # (if load_from_local is true) - "split_for_eval": true, # Whether to split the dataset - # into trainset and evalset - "trainset_size": 1000, # The size of the trainset - "evalset_ratio": 0.1, # The proportionate size of evalset - "data_helper_type": "quality" # Type of data helper to use - # (quality, generation) - } - - :param trainset_dict: Dictionary containing trainset parameters. - :type trainset_dict: dict - :return: A trainset. 
- :rtype: List[dspy.Example] - """ - # TODO: refactor to enable the passing of alternative schema_directory_path, - # and the related enums that must be passed in turn - ldh = LocalDataHelper() - - if trainset_dict["data_helper_type"] == "quality": - dh = QualityDataHelper() - elif trainset_dict["data_helper_type"] == "generation": - dh = GenerationDataHelper() - else: - raise ValueError( - f"Invalid data helper type: {trainset_dict['data_helper_type']}" - ) - - # TODO: refactor to be more ergonomic once we have more data sources implemented - if trainset_dict["load_from_hf"]: - raise NotImplementedError("loading from Hugging Face is not implemented") - if trainset_dict["load_from_local"]: - if trainset_dict["load_local_specific_category"]: - raise NotImplementedError( - "loading a specific category is not implemented" - ) - dataset = ldh.folder_of_folders_to_dataset( - parse_objects=trainset_dict["local_parse_objects"] - ) - trainset = dh.trainset(dataset) - if trainset_dict["trainset_size"] and isinstance( - trainset_dict["trainset_size"], int - ): - trainset = trainset[: trainset_dict["trainset_size"]] - return trainset - else: - raise ValueError( - "Current implementation only supports loading from local directory" - ) - - def trainset_from_yaml(self, yaml_path: Union[str, Path]) -> List[dspy.Example]: - """Load a trainset from a YAML file. 
- - data: - hf_api_key: !env HF_DATASET_KEY # Must be a valid Hugging Face API key - # (with permission to access graphdoc) - # TODO: we may make this public - load_from_hf: false # Load the dataset from Hugging Face - load_from_local: true # Load the dataset from a local directory - load_local_specific_category: false # Load all categories or a specific category - # (if load_from_local is true) - local_specific_category: perfect, # Which category to load from the dataset - # (if load_from_local is true) - local_parse_objects: true, # Whether to parse the objects - # in the dataset - # (if load_from_local is true) - split_for_eval: true, # Whether to split the dataset - # into trainset and evalset - trainset_size: 1000, # The size of the trainset - evalset_ratio: 0.1, # The proportionate size of evalset - data_helper_type: quality # Type of data helper to use - # (quality, generation) - - :param yaml_path: Path to the YAML file. - :type yaml_path: Union[str, Path] - :return: A trainset. - :rtype: List[dspy.Example] - - """ - config = load_yaml_config(yaml_path) - trainset = self.trainset_from_dict(config["data"]) - return trainset - - def split_trainset( - self, trainset: List[dspy.Example], evalset_ratio: float - ) -> tuple[List[dspy.Example], List[dspy.Example]]: - """Split a trainset into a trainset and evalset. - - :param trainset: The trainset to split. :type trainset: List[dspy.Example] - :param evalset_ratio: The proportionate size of the evalset. :type - evalset_ratio: float :return: A tuple of trainset and evalset. :rtype: - tuple[List[dspy.Example], List[dspy.Example]] - - """ - split_idx = int(len(trainset) * (1 - evalset_ratio)) - random.shuffle(trainset) - evalset = trainset[split_idx:] - trainset = trainset[:split_idx] - return trainset, evalset - - def trainset_and_evalset_from_yaml( - self, yaml_path: Union[str, Path] - ) -> tuple[List[dspy.Example], List[dspy.Example]]: - """Load a trainset and evalset from a YAML file. 
- - data: - hf_api_key: !env HF_DATASET_KEY # Must be a valid Hugging Face API key - # (with permission to access graphdoc) - # TODO: we may make this public - load_from_hf: false # Load the dataset from Hugging Face - load_from_local: true # Load the dataset from a local directory - load_local_specific_category: false # Load all categories or a specific category - # (if load_from_local is true) - local_specific_category: perfect, # Which category to load from the dataset - # (if load_from_local is true) - local_parse_objects: true, # Whether to parse the objects - # in the dataset - # (if load_from_local is true) - split_for_eval: true, # Whether to split the dataset - # into trainset and evalset - trainset_size: 1000, # The size of the trainset - evalset_ratio: 0.1, # The proportionate size of evalset - data_helper_type: quality # Type of data helper to use - # (quality, generation) - - :param yaml_path: Path to the YAML file. - :type yaml_path: Union[str, Path] - :return: A tuple of trainset and evalset. - :rtype: tuple[List[dspy.Example], List[dspy.Example]] - - """ - config = load_yaml_config(yaml_path) - trainset = self.trainset_from_dict(config["data"]) - return self.split_trainset(trainset, config["data"]["evalset_ratio"]) - - ####################### - # Prompt Methods # - ####################### - def single_prompt_from_dict( - self, prompt_dict: dict, prompt_metric: Union[str, SinglePrompt] - ) -> SinglePrompt: - """ - Load a single prompt from a dictionary of parameters. 
- - { - "prompt": "doc_quality", # Which prompt signature to use - "class": "SchemaDocQualityPrompt", # Must be a child of SinglePrompt - "type": "predict", # The type of prompt to use - # (predict, chain_of_thought) - "metric": "rating", # The type of metric to use - # (rating, category) - "load_from_mlflow": false, # Whether to load the prompt from an MLFlow URI - "model_uri": null, # The tracking URI for MLflow - "model_name": null, # The name of the model in MLflow - "model_version": null # The version of the model in MLflow - "prompt_metric": False # Whether another prompt is used - # to calculate the metric - # (in which case we must also load that prompt) - } - - :param prompt_dict: Dictionary containing prompt information. - :type prompt_dict: dict - :param prompt_metric: The metric to use to calculate the metric. - Can be another prompt signature or a string. - :type prompt_metric: Union[str, SinglePrompt] - :return: A SinglePrompt object. - :rtype: SinglePrompt - """ - try: - # if we are loading from mlflow, modify the prompt_dict with the loaded model - if prompt_dict["load_from_mlflow"]: - if self.mdh: - log.info(f"Loading prompt from MLflow: {prompt_dict}") - prompt = self.mdh.model_by_args(prompt_dict) - log.info(f"Prompt loaded from MLflow: {prompt}") - prompt_signature = DspyDataHelper.prompt_signature(prompt) - prompt_dict["prompt"] = prompt_signature - else: - raise ValueError("MLflow tracking URI not provided") - - return PromptFactory.single_prompt( - prompt=prompt_dict["prompt"], - prompt_class=prompt_dict["class"], - prompt_type=prompt_dict["type"], - prompt_metric=prompt_metric, - ) - except Exception as e: - log.error(f"Error creating single prompt: {e}") - raise e - - def single_prompt_from_yaml(self, yaml_path: Union[str, Path]) -> SinglePrompt: - """Load a single prompt from a YAML file. 
- - prompt: - prompt: base_doc_gen # Which prompt signature to use - class: DocGeneratorPrompt # Must be a child of SinglePrompt - # (we will use an enum to map this) - type: chain_of_thought # The type of prompt to use - # (predict, chain_of_thought) - metric: rating # The type of metric to use - # (rating, category) - load_from_mlflow: false # Whether to load the prompt - # from an MLFlow URI - model_uri: null # The tracking URI for MLflow - model_name: null # The name of the model in MLflow - model_version: null # The version of the model in MLflow - prompt_metric: true # Whether another prompt is used - # to calculate the metric - # (in which case we must also load that prompt) - - prompt_metric: - prompt: doc_quality # The prompt to use to calculate - # the metric - class: DocQualityPrompt # The class of the prompt to use - # to calculate the metric - type: predict # The type of prompt to use - # to calculate the metric - metric: rating # The metric to use to calculate - # the metric - load_from_mlflow: false # Whether to load the prompt - # from an MLFlow URI - - :param yaml_path: Path to the YAML file. - :type yaml_path: str - :return: A SinglePrompt object. - :rtype: SinglePrompt - - """ - config = load_yaml_config(yaml_path) - if config["prompt"]["prompt_metric"]: - prompt_metric_config = config["prompt_metric"] - prompt_metric_metric = prompt_metric_config["metric"] - prompt_metric = self.single_prompt_from_dict( - prompt_metric_config, prompt_metric_metric - ) - else: - prompt_metric = config["prompt"]["metric"] - prompt = self.single_prompt_from_dict(config["prompt"], prompt_metric) - return prompt - - ####################### - # Trainer Methods # - ####################### - def single_trainer_from_dict( - self, - trainer_dict: dict, - prompt: SinglePrompt, - trainset: Optional[List[dspy.Example]] = None, - evalset: Optional[List[dspy.Example]] = None, - ) -> SinglePromptTrainer: - """ - Load a single trainer from a dictionary of parameters. 
- - { - "trainer": { - "class": "DocQualityTrainer", - "mlflow_model_name": "doc_quality_model", - "mlflow_experiment_name": "doc_quality_experiment", - "mlflow_tracking_uri": "http://localhost:5000" - }, - "optimizer": { - "optimizer_type": "miprov2", - "auto": "light", - "max_labeled_demos": 2, - "max_bootstrapped_demos": 4, - "num_trials": 2, - "minibatch": true - }, - } - - :param trainer_dict: Dictionary containing trainer parameters. - :type trainer_dict: dict - :param prompt: The prompt to use for this trainer. - :type prompt: SinglePrompt - :return: A SinglePromptTrainer object. - :rtype: SinglePromptTrainer - """ - if trainset is None: - trainset = [] - if evalset is None: - evalset = [] - try: - return TrainerFactory.single_trainer( - trainer_class=trainer_dict["trainer"]["class"], - prompt=prompt, - optimizer_type=trainer_dict["optimizer"]["optimizer_type"], - optimizer_kwargs=trainer_dict["optimizer"], - mlflow_model_name=trainer_dict["trainer"]["mlflow_model_name"], - mlflow_experiment_name=trainer_dict["trainer"][ - "mlflow_experiment_name" - ], - mlflow_tracking_uri=trainer_dict["trainer"]["mlflow_tracking_uri"], - trainset=trainset, - evalset=evalset, - ) - except Exception as e: - log.error(f"Error creating single trainer: {e}") - raise e - - def single_trainer_from_yaml( - self, yaml_path: Union[str, Path] - ) -> SinglePromptTrainer: - """Load a single prompt trainer from a YAML file. 
- - trainer: - hf_api_key: !env HF_DATASET_KEY # Must be a valid Hugging Face API key - # (with permission to access graphdoc) - # TODO: we may make this public - load_from_hf: false # Load the dataset from Hugging Face - load_from_local: true # Load the dataset from a local directory - load_local_specific_category: false # Load all categories or a specific category - # (if load_from_local is true) - local_specific_category: perfect, # Which category to load from the dataset - # (if load_from_local is true) - local_parse_objects: true, # Whether to parse the objects - # in the dataset - # (if load_from_local is true) - split_for_eval: true, # Whether to split the dataset - # into trainset and evalset - trainset_size: 1000, # The size of the trainset - evalset_ratio: 0.1, # The proportionate size of evalset - - prompt: - prompt: base_doc_gen # Which prompt signature to use - class: DocGeneratorPrompt # Must be a child of SinglePrompt - # (we will use an enum to map this) - type: chain_of_thought # The type of prompt to use - # (predict, chain_of_thought) - metric: rating # The type of metric to use - # (rating, category) - load_from_mlflow: false # L oad the prompt from an MLFlow URI - model_uri: null # The tracking URI for MLflow - model_name: null # The name of the model in MLflow - model_version: null # The version of the model in MLflow - prompt_metric: true # Whether another prompt is used - # to calculate the metric - # (in which case we must load prompt) - - prompt_metric: - prompt: doc_quality # The prompt to use to calculate the metric - class: DocQualityPrompt # The class of the prompt to use - # to calculate the metric - type: predict # The type of prompt to use - # to calculate the metric - metric: rating # The metric to use to calculate - # the metric - load_from_mlflow: false # Whether to load the prompt - # from an MLFlow URI - model_uri: null # The tracking URI for MLflow - model_name: null # The name of the model in MLflow - model_version: null # 
The version of the model in MLflow - - :param yaml_path: Path to the YAML file. - :type yaml_path: Union[str, Path] - :return: A SinglePromptTrainer object. - :rtype: SinglePromptTrainer - - """ - try: - config = load_yaml_config(yaml_path) - prompt = self.single_prompt_from_yaml(yaml_path) - trainset, evalset = self.trainset_and_evalset_from_yaml(yaml_path) - return self.single_trainer_from_dict(config, prompt, trainset, evalset) - except Exception as e: - log.error(f"Error creating trainer from YAML: {e}") - raise e - - ####################### - # Module Methods # - ####################### - def doc_generator_module_from_dict( - self, module_dict: dict, prompt: Union[DocGeneratorPrompt, SinglePrompt] - ) -> DocGeneratorModule: - """ - Load a doc generator module from a dictionary of parameters. - - { - "retry": true, - "retry_limit": 1, - "rating_threshold": 3, - "fill_empty_descriptions": true - } - - :param module_dict: Dictionary containing module parameters. - :type module_dict: dict - :param prompt: The prompt to use for this module. - :type prompt: DocGeneratorPrompt - :return: A DocGeneratorModule object. - :rtype: DocGeneratorModule - """ - return DocGeneratorModule( - prompt=prompt, - retry=module_dict["retry"], - retry_limit=module_dict["retry_limit"], - rating_threshold=module_dict["rating_threshold"], - fill_empty_descriptions=module_dict["fill_empty_descriptions"], - ) - - def doc_generator_module_from_yaml( - self, yaml_path: Union[str, Path] - ) -> DocGeneratorModule: - """Load a doc generator module from a YAML file. 
- - prompt: - prompt: base_doc_gen # Which prompt signature to use - class: DocGeneratorPrompt # Must be a child of SinglePrompt - # (we will use an enum to map this) - type: chain_of_thought # The type of prompt to use - # (predict, chain_of_thought) - metric: rating # The type of metric to use - # (rating, category) - load_from_mlflow: false # Whether to load the prompt - # from an MLFlow URI - model_uri: null # The tracking URI for MLflow - model_name: null # The name of the model in MLflow - model_version: null # The version of the model in MLflow - prompt_metric: true # Whether another prompt is used - # to calculate the metric - # (in which case we must load that prompt) - - prompt_metric: - prompt: doc_quality # The prompt to use to calculate the metric - class: DocQualityPrompt # The class of the prompt to use - # to calculate the metric - type: predict # The type of prompt to use - # to calculate the metric - metric: rating # The metric to use to calculate the metric - load_from_mlflow: false # Whether to load the prompt - # from an MLFlow URI - model_uri: null # The tracking URI for MLflow - model_name: null # The name of the model in MLflow - model_version: null # The version of the model in MLflow - - module: - retry: true # Whether to retry the generation - # if the quality check fails - retry_limit: 1 # The maximum number of retries - rating_threshold: 3 # The rating threshold for the quality check - fill_empty_descriptions: true # Whether to fill empty descriptions with - # generated documentation - - """ - config = load_yaml_config(yaml_path)["module"] - prompt = self.single_prompt_from_yaml(yaml_path) - return self.doc_generator_module_from_dict(config, prompt) - - ####################### - # Eval Methods # - ####################### - def doc_generator_eval_from_yaml( - self, yaml_path: Union[str, Path] - ) -> DocGeneratorEvaluator: - """Load a doc generator evaluator from a YAML file.""" - # load the generator - generator = 
self.doc_generator_module_from_yaml(yaml_path) - config = load_yaml_config(yaml_path) - - # load the evaluator - metric_config = config["prompt_metric"] - evaluator = self.single_prompt_from_dict(metric_config, metric_config["metric"]) - - # load the eval config - if self.mdh is not None: - mlflow_tracking_uri = self.mdh.mlflow_tracking_uri - else: - mlflow_tracking_uri = config["eval"]["mlflow_tracking_uri"] - mlflow_experiment_name = config["eval"]["mlflow_experiment_name"] - generator_prediction_field = config["eval"]["generator_prediction_field"] - evaluator_prediction_field = config["eval"]["evaluator_prediction_field"] - readable_value = config["eval"]["readable_value"] - - # load the evalset - evalset = self.trainset_from_yaml(yaml_path) - - # return the evaluator - return DocGeneratorEvaluator( - generator=generator, - evaluator=evaluator, - evalset=evalset, - mlflow_tracking_uri=mlflow_tracking_uri, - mlflow_experiment_name=mlflow_experiment_name, - generator_prediction_field=generator_prediction_field, - evaluator_prediction_field=evaluator_prediction_field, - readable_value=readable_value, - ) - - ####################### # Main Entry Point # ####################### @@ -750,17 +135,17 @@ def doc_generator_eval_from_yaml( parser.print_help() sys.exit(1) - graphdoc = GraphDoc.from_yaml(args.config) + # graphdoc = GraphDoc.from_yaml(args.config) if args.command == "train": - trainer = graphdoc.single_trainer_from_yaml(args.trainer_config) + trainer = single_trainer_from_yaml(args.trainer_config) trained_prompt = trainer.train() print( f"Training complete. Saved to MLflow with name: {trainer.mlflow_model_name}" ) elif args.command == "generate": - module = graphdoc.doc_generator_module_from_yaml(args.module_config) + module = doc_generator_module_from_yaml(args.module_config) with open(args.input, "r") as f: schema = f.read() @@ -772,7 +157,7 @@ def doc_generator_eval_from_yaml( print(f"Generation complete. 
Documentation saved to {args.output}") elif args.command == "evaluate": - evaluator = graphdoc.doc_generator_eval_from_yaml(args.eval_config) + evaluator = doc_generator_eval_from_yaml(args.eval_config) results = evaluator.evaluate() print( "Evaluation complete. Results saved to MLflow experiment: " diff --git a/graphdoc/runners/eval/eval_doc_generator_module.py b/graphdoc/runners/eval/eval_doc_generator_module.py index 08e760e..0ec6993 100644 --- a/graphdoc/runners/eval/eval_doc_generator_module.py +++ b/graphdoc/runners/eval/eval_doc_generator_module.py @@ -11,7 +11,7 @@ from dotenv import load_dotenv # internal packages -from graphdoc.main import GraphDoc +from graphdoc.config import doc_generator_eval_from_yaml # logging log = logging.getLogger(__name__) @@ -62,11 +62,10 @@ def main(): # load config log.info(f"Loading config from {args.config_path}") - gd = GraphDoc.from_yaml(args.config_path) # load the doc generator module log.info(f"Loading doc generator module from {args.config_path}") - module_evaluator = gd.doc_generator_eval_from_yaml(args.config_path) + module_evaluator = doc_generator_eval_from_yaml(args.config_path) # run the evaluation and log the results log.info("Running evaluation and logging results") diff --git a/graphdoc/runners/train/single_prompt_trainer.py b/graphdoc/runners/train/single_prompt_trainer.py index 6c4fe64..7383d5b 100644 --- a/graphdoc/runners/train/single_prompt_trainer.py +++ b/graphdoc/runners/train/single_prompt_trainer.py @@ -3,18 +3,19 @@ import argparse import copy - -# internal packages import logging -# system packages +# internal packages import os # external packages import mlflow from dotenv import load_dotenv -from graphdoc.main import GraphDoc, load_yaml_config +from graphdoc.config import single_trainer_from_yaml + +# system packages +from graphdoc.data import load_yaml_config # logging log = logging.getLogger(__name__) @@ -35,11 +36,8 @@ def main(): ) args = parser.parse_args() - # load config - gd = 
GraphDoc.from_yaml(args.config_path) - # load the trainer object (including trainset and evalset) - trainer = gd.single_trainer_from_yaml(args.config_path) + trainer = single_trainer_from_yaml(args.config_path) # trainer trainset and evalset can be modified here if needed # trainer.trainset = ... diff --git a/graphdoc/tests/assets/configs/single_prompt_doc_generator_module.yaml b/graphdoc/tests/assets/configs/single_prompt_doc_generator_module.yaml index b26b7ac..cdf1d94 100644 --- a/graphdoc/tests/assets/configs/single_prompt_doc_generator_module.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_doc_generator_module.yaml @@ -4,6 +4,11 @@ graphdoc: mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server +mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # The tracking URI for MLflow + mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + language_model: model: openai/gpt-4o # Must be a valid dspy language model api_key: !env OPENAI_API_KEY # Must be a valid dspy language model API key diff --git a/graphdoc/tests/assets/configs/single_prompt_doc_generator_module_eval.yaml b/graphdoc/tests/assets/configs/single_prompt_doc_generator_module_eval.yaml index 42589cd..07edea4 100644 --- a/graphdoc/tests/assets/configs/single_prompt_doc_generator_module_eval.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_doc_generator_module_eval.yaml @@ -4,6 +4,11 @@ graphdoc: mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server +mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # The tracking URI for MLflow + 
mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + language_model: model: openai/gpt-4o # Must be a valid dspy language model api_key: !env OPENAI_API_KEY # Must be a valid dspy language model API key diff --git a/graphdoc/tests/assets/configs/single_prompt_doc_generator_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_doc_generator_trainer.yaml index adbd75e..91fd89b 100644 --- a/graphdoc/tests/assets/configs/single_prompt_doc_generator_trainer.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_doc_generator_trainer.yaml @@ -4,6 +4,11 @@ graphdoc: mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server +mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # The tracking URI for MLflow + mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + language_model: model: openai/gpt-4o # Must be a valid dspy language model api_key: !env OPENAI_API_KEY # Must be a valid dspy language model API key diff --git a/graphdoc/tests/assets/configs/single_prompt_doc_quality_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_doc_quality_trainer.yaml index a7ed5a1..1cc63c6 100644 --- a/graphdoc/tests/assets/configs/single_prompt_doc_quality_trainer.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_doc_quality_trainer.yaml @@ -4,6 +4,11 @@ graphdoc: mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server +mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # 
The tracking URI for MLflow + mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + language_model: model: openai/gpt-4o # Must be a valid dspy language model api_key: !env OPENAI_API_KEY # Must be a valid dspy language model API key diff --git a/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_trainer.yaml index 3e7ba12..841ebd7 100644 --- a/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_trainer.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_trainer.yaml @@ -3,6 +3,11 @@ language_model: lm_api_key: !env OPENAI_API_KEY # Must be a valid dspy language model API key cache: true # Whether to cache the calls to the language model +mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # The tracking URI for MLflow + mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + data: hf_api_key: !env HF_DATASET_KEY # Must be a valid Hugging Face API key (with permission to access graphdoc) # TODO: we may make this public in the future load_from_hf: true # Whether to load the dataset from Hugging Face diff --git a/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_version.yaml b/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_version.yaml index 93a395f..c0e264b 100644 --- a/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_version.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_version.yaml @@ -3,6 +3,11 @@ language_model: lm_api_key: !env OPENAI_API_KEY # Must be a valid dspy language model API key cache: true # Whether to cache the calls to the language model 
+mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # The tracking URI for MLflow + mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + data: hf_api_key: !env HF_DATASET_KEY # Must be a valid Hugging Face API key (with permission to access graphdoc) # TODO: we may make this public in the future load_from_hf: true # Whether to load the dataset from Hugging Face diff --git a/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_trainer.yaml index 089b33e..0c728d0 100644 --- a/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_trainer.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_trainer.yaml @@ -3,6 +3,11 @@ language_model: lm_api_key: !env OPENAI_API_KEY # Must be a valid dspy language model API key cache: true # Whether to cache the calls to the language model +mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # The tracking URI for MLflow + mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + data: hf_api_key: !env HF_DATASET_KEY # Must be a valid Hugging Face API key (with permission to access graphdoc) # TODO: we may make this public in the future load_from_hf: true # Whether to load the dataset from Hugging Face diff --git a/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_version.yaml b/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_version.yaml index da23806..b239055 100644 --- a/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_version.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_version.yaml @@ -3,6 +3,11 @@ language_model: lm_api_key: !env 
OPENAI_API_KEY # Must be a valid dspy language model API key cache: true # Whether to cache the calls to the language model +mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # The tracking URI for MLflow + mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + data: hf_api_key: !env HF_DATASET_KEY # Must be a valid Hugging Face API key (with permission to access graphdoc) # TODO: we may make this public in the future load_from_hf: true # Whether to load the dataset from Hugging Face diff --git a/graphdoc/tests/assets/configs/single_prompt_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_trainer.yaml index 089b33e..0c728d0 100644 --- a/graphdoc/tests/assets/configs/single_prompt_trainer.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_trainer.yaml @@ -3,6 +3,11 @@ language_model: lm_api_key: !env OPENAI_API_KEY # Must be a valid dspy language model API key cache: true # Whether to cache the calls to the language model +mlflow: + mlflow_tracking_uri: !env MLFLOW_TRACKING_URI # The tracking URI for MLflow + mlflow_tracking_username: !env MLFLOW_TRACKING_USERNAME # The username for the mlflow tracking server + mlflow_tracking_password: !env MLFLOW_TRACKING_PASSWORD # The password for the mlflow tracking server + data: hf_api_key: !env HF_DATASET_KEY # Must be a valid Hugging Face API key (with permission to access graphdoc) # TODO: we may make this public in the future load_from_hf: true # Whether to load the dataset from Hugging Face diff --git a/graphdoc/tests/conftest.py b/graphdoc/tests/conftest.py index bea0ad1..2409082 100644 --- a/graphdoc/tests/conftest.py +++ b/graphdoc/tests/conftest.py @@ -19,10 +19,11 @@ DocQualityPrompt, LocalDataHelper, Parser, + setup_logging, ) -from graphdoc.main import GraphDoc # logging +setup_logging("INFO") log = logging.getLogger(__name__) # define test 
asset paths @@ -122,31 +123,31 @@ def overwrite_ldh() -> LocalDataHelper: ) -@fixture -def gd() -> GraphDoc: - """Fixture for GraphDoc with proper environment setup.""" - # Ensure environment is set up correctly - if ENV_PATH.exists(): - load_dotenv(dotenv_path=ENV_PATH, override=True) - ensure_env_vars() - - api_key = os.environ.get("OPENAI_API_KEY") - mlflow_tracking_username = os.environ.get("MLFLOW_TRACKING_USERNAME") - mlflow_tracking_password = os.environ.get("MLFLOW_TRACKING_PASSWORD") - if not api_key: - log.error("OPENAI_API_KEY still not available after loading .env file") - - return GraphDoc( - model_args={ - "model": "gpt-4o-mini", - "api_key": api_key, - "cache": True, - }, - mlflow_tracking_uri=MLRUNS_DIR, - mlflow_tracking_username=mlflow_tracking_username, - mlflow_tracking_password=mlflow_tracking_password, - log_level="INFO", - ) +# @fixture +# def gd() -> GraphDoc: +# """Fixture for GraphDoc with proper environment setup.""" +# # Ensure environment is set up correctly +# if ENV_PATH.exists(): +# load_dotenv(dotenv_path=ENV_PATH, override=True) +# ensure_env_vars() + +# api_key = os.environ.get("OPENAI_API_KEY") +# mlflow_tracking_username = os.environ.get("MLFLOW_TRACKING_USERNAME") +# mlflow_tracking_password = os.environ.get("MLFLOW_TRACKING_PASSWORD") +# if not api_key: +# log.error("OPENAI_API_KEY still not available after loading .env file") + +# return GraphDoc( +# model_args={ +# "model": "gpt-4o-mini", +# "api_key": api_key, +# "cache": True, +# }, +# mlflow_tracking_uri=MLRUNS_DIR, +# mlflow_tracking_username=mlflow_tracking_username, +# mlflow_tracking_password=mlflow_tracking_password, +# log_level="INFO", +# ) @fixture diff --git a/graphdoc/tests/test_confest.py b/graphdoc/tests/test_confest.py index 9225adf..6c6148a 100644 --- a/graphdoc/tests/test_confest.py +++ b/graphdoc/tests/test_confest.py @@ -10,7 +10,6 @@ LocalDataHelper, Parser, ) -from graphdoc.main import GraphDoc from .conftest import ( OverwriteSchemaCategory, @@ -45,9 
+44,9 @@ def test_overwrite_ldh(self, overwrite_ldh: LocalDataHelper): == OverwriteSchemaCategoryRatingMapping.get_rating ) - def test_gd(self, gd: GraphDoc): - assert gd is not None - assert isinstance(gd, GraphDoc) + # def test_gd(self, gd: GraphDoc): + # assert gd is not None + # assert isinstance(gd, GraphDoc) def test_dqp(self, dqp): assert isinstance(dqp, DocQualityPrompt) diff --git a/graphdoc/tests/test_graphdoc.py b/graphdoc/tests/test_config.py similarity index 68% rename from graphdoc/tests/test_graphdoc.py rename to graphdoc/tests/test_config.py index 3484761..ac33333 100644 --- a/graphdoc/tests/test_graphdoc.py +++ b/graphdoc/tests/test_config.py @@ -19,7 +19,18 @@ SinglePromptTrainer, load_yaml_config, ) -from graphdoc.main import GraphDoc +from graphdoc.config import ( + doc_generator_eval_from_yaml, + doc_generator_module_from_dict, + doc_generator_module_from_yaml, + single_prompt_from_dict, + single_prompt_from_yaml, + single_trainer_from_yaml, + split_trainset, + trainset_and_evalset_from_yaml, + trainset_from_dict, + trainset_from_yaml, +) # logging log = logging.getLogger(__name__) @@ -30,51 +41,36 @@ CONFIG_DIR = BASE_DIR / "tests" / "assets" / "configs" -class TestGraphDoc: - - ############################################################ - # class methods # - ############################################################ - - def test_from_dict(self, gd: GraphDoc): - config_path = CONFIG_DIR / "single_prompt_doc_quality_trainer.yaml" - config_dict = load_yaml_config(config_path) - gd = GraphDoc.from_dict(config_dict) - assert isinstance(gd, GraphDoc) - - def test_from_yaml(self, gd: GraphDoc): - config_path = CONFIG_DIR / "single_prompt_doc_quality_trainer.yaml" - gd = GraphDoc.from_yaml(config_path) - assert isinstance(gd, GraphDoc) +class TestConfig: ############################################################ # data tests # ############################################################ - def test_trainset_from_dict(self, gd: GraphDoc): + def 
test_trainset_from_dict(self): config_path = CONFIG_DIR / "single_prompt_doc_quality_trainer.yaml" config_dict = load_yaml_config(config_path) data_dict = config_dict["data"] - trainset = gd.trainset_from_dict(data_dict) + trainset = trainset_from_dict(data_dict) assert isinstance(trainset, list) assert len(trainset) > 0 assert isinstance(trainset[0], dspy.Example) - def test_trainset_from_yaml(self, gd: GraphDoc): + def test_trainset_from_yaml(self): config_path = CONFIG_DIR / "single_prompt_doc_quality_trainer.yaml" - trainset = gd.trainset_from_yaml(config_path) + trainset = trainset_from_yaml(config_path) assert isinstance(trainset, list) assert len(trainset) > 0 assert isinstance(trainset[0], dspy.Example) - def test_split_trainset(self, gd: GraphDoc): + def test_split_trainset(self): config_path = CONFIG_DIR / "single_prompt_doc_quality_trainer.yaml" config_dict = load_yaml_config(config_path) config_dict["data"]["trainset_size"] = 10 config_dict["data"]["evalset_ratio"] = 0.2 - trainset = gd.trainset_from_dict(config_dict["data"]) - trainset, evalset = gd.split_trainset( + trainset = trainset_from_dict(config_dict["data"]) + trainset, evalset = split_trainset( trainset, config_dict["data"]["evalset_ratio"] ) assert isinstance(trainset, list) @@ -83,9 +79,9 @@ def test_split_trainset(self, gd: GraphDoc): assert isinstance(evalset, list) assert len(evalset) == 2 - def test_trainset_and_evalset_from_yaml(self, gd: GraphDoc): + def test_trainset_and_evalset_from_yaml(self): config_path = CONFIG_DIR / "single_prompt_doc_quality_trainer.yaml" - trainset, evalset = gd.trainset_and_evalset_from_yaml(config_path) + trainset, evalset = trainset_and_evalset_from_yaml(config_path) assert isinstance(trainset, list) assert len(trainset) == 900 assert isinstance(trainset[0], dspy.Example) @@ -96,53 +92,55 @@ def test_trainset_and_evalset_from_yaml(self, gd: GraphDoc): # prompt tests # ############################################################ - def 
test_single_prompt_from_dict(self, gd: GraphDoc): + def test_single_prompt_from_dict(self): config_path = CONFIG_DIR / "single_prompt_doc_quality_trainer.yaml" prompt_dict = load_yaml_config(config_path)["prompt"] prompt_metric = prompt_dict["metric"] - prompt = gd.single_prompt_from_dict(prompt_dict, prompt_metric) + prompt = single_prompt_from_dict(prompt_dict, prompt_metric) assert isinstance(prompt, DocQualityPrompt) config_path = CONFIG_DIR / "single_prompt_doc_generator_trainer.yaml" prompt_dict = load_yaml_config(config_path)["prompt"] prompt_metric = prompt - generator_prompt = gd.single_prompt_from_dict(prompt_dict, prompt_metric) + generator_prompt = single_prompt_from_dict(prompt_dict, prompt_metric) assert isinstance(generator_prompt, DocGeneratorPrompt) assert isinstance(generator_prompt.prompt_metric, DocQualityPrompt) - def test_single_prompt_by_version_from_dict(self, gd: GraphDoc): + def test_single_prompt_by_version_from_dict(self): config_path = CONFIG_DIR / "single_prompt_doc_quality_trainer.yaml" - prompt_dict = load_yaml_config(config_path)["prompt"] + config_dict = load_yaml_config(config_path) + prompt_dict = config_dict["prompt"] prompt_dict["load_from_mlflow"] = True prompt_dict["model_name"] = "doc_quality_model" prompt_dict["model_version"] = "1" prompt_dict["type"] = "predict" prompt_metric = prompt_dict["metric"] - prompt = gd.single_prompt_from_dict(prompt_dict, prompt_metric) + mlflow_dict = config_dict["mlflow"] + prompt = single_prompt_from_dict(prompt_dict, prompt_metric, mlflow_dict) assert isinstance(prompt, DocQualityPrompt) - def test_single_prompt_from_yaml(self, gd: GraphDoc): + def test_single_prompt_from_yaml(self): config_path = CONFIG_DIR / "single_prompt_doc_quality_trainer.yaml" - prompt = gd.single_prompt_from_yaml(config_path) + prompt = single_prompt_from_yaml(config_path) assert isinstance(prompt, DocQualityPrompt) config_path = CONFIG_DIR / "single_prompt_doc_generator_trainer.yaml" - prompt = 
gd.single_prompt_from_yaml(config_path) + prompt = single_prompt_from_yaml(config_path) assert isinstance(prompt, DocGeneratorPrompt) ############################################################ # trainer tests # ############################################################ - def test_single_trainer_from_yaml(self, gd: GraphDoc): + def test_single_trainer_from_yaml(self): config_path = CONFIG_DIR / "single_prompt_doc_quality_trainer.yaml" - trainer = gd.single_trainer_from_yaml(config_path) + trainer = single_trainer_from_yaml(config_path) assert isinstance(trainer, SinglePromptTrainer) assert isinstance(trainer, DocQualityTrainer) assert isinstance(trainer.prompt, DocQualityPrompt) config_path = CONFIG_DIR / "single_prompt_doc_generator_trainer.yaml" - trainer = gd.single_trainer_from_yaml(config_path) + trainer = single_trainer_from_yaml(config_path) assert isinstance(trainer, SinglePromptTrainer) assert isinstance(trainer, DocGeneratorTrainer) assert isinstance(trainer.prompt, DocGeneratorPrompt) @@ -151,24 +149,24 @@ def test_single_trainer_from_yaml(self, gd: GraphDoc): # module tests # ############################################################ - def test_doc_generator_module_from_dict(self, gd: GraphDoc): + def test_doc_generator_module_from_dict(self): config_path = CONFIG_DIR / "single_prompt_doc_generator_module.yaml" - prompt = gd.single_prompt_from_yaml(config_path) + prompt = single_prompt_from_yaml(config_path) config_dict = load_yaml_config(config_path) module_dict = config_dict["module"] - module = gd.doc_generator_module_from_dict(module_dict, prompt) + module = doc_generator_module_from_dict(module_dict, prompt) assert isinstance(module, DocGeneratorModule) - def test_doc_generator_module_from_yaml(self, gd: GraphDoc): + def test_doc_generator_module_from_yaml(self): config_path = CONFIG_DIR / "single_prompt_doc_generator_module.yaml" - module = gd.doc_generator_module_from_yaml(config_path) + module = doc_generator_module_from_yaml(config_path) 
assert isinstance(module, DocGeneratorModule) ############################################################ # eval tests # ############################################################ - def test_doc_generator_eval_from_yaml(self, gd: GraphDoc): + def test_doc_generator_eval_from_yaml(self): config_path = CONFIG_DIR / "single_prompt_doc_generator_module_eval.yaml" - evaluator = gd.doc_generator_eval_from_yaml(config_path) + evaluator = doc_generator_eval_from_yaml(config_path) assert isinstance(evaluator, DocGeneratorEvaluator)