From b0ca604b41e23e8887ff6bb2c0a6087749f6e5e9 Mon Sep 17 00:00:00 2001 From: Rahul Tripathi Date: Sun, 28 Apr 2024 01:13:33 +0530 Subject: [PATCH 1/2] add classifier_url argument in PebbloSafeLoader. Signed-off-by: Rahul Tripathi --- .../langchain_community/document_loaders/pebblo.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py index a0cc2eb4115f2..26f1979e61a02 100644 --- a/libs/community/langchain_community/document_loaders/pebblo.py +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -45,6 +45,7 @@ def __init__( description: str = "", api_key: Optional[str] = None, load_semantic: bool = False, + classifier_url: Optional[str] = None, ): if not name or not isinstance(name, str): raise NameError("Must specify a valid name.") @@ -63,6 +64,7 @@ def __init__( self.source_type = get_loader_type(loader_name) self.source_path_size = self.get_source_size(self.source_path) self.source_aggregate_size = 0 + self.classifier_url = classifier_url or CLASSIFIER_URL self.loader_details = { "loader": loader_name, "source_path": self.source_path, @@ -210,7 +212,7 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list: self.source_aggregate_size ) payload = Doc(**payload).dict(exclude_unset=True) - load_doc_url = f"{CLASSIFIER_URL}{LOADER_DOC_URL}" + load_doc_url = f"{self.classifier_url}{LOADER_DOC_URL}" classified_docs = [] try: pebblo_resp = requests.post( @@ -296,7 +298,7 @@ def _send_discover(self) -> None: "Content-Type": "application/json", } payload = self.app.dict(exclude_unset=True) - app_discover_url = f"{CLASSIFIER_URL}{APP_DISCOVER_URL}" + app_discover_url = f"{self.classifier_url}{APP_DISCOVER_URL}" try: pebblo_resp = requests.post( app_discover_url, headers=headers, json=payload, timeout=20 From e1f81847348f62398d07633f357a2f1134572c63 Mon Sep 17 00:00:00 2001 From: Rahul 
Tripathi Date: Mon, 29 Apr 2024 16:11:58 +0530 Subject: [PATCH 2/2] update documentation with latest links and classifier_url argument. Signed-off-by: Rahul Tripathi --- docs/docs/integrations/document_loaders/pebblo.ipynb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/docs/integrations/document_loaders/pebblo.ipynb b/docs/docs/integrations/document_loaders/pebblo.ipynb index 177a11fbab9f2..ef73792ae0d15 100644 --- a/docs/docs/integrations/document_loaders/pebblo.ipynb +++ b/docs/docs/integrations/document_loaders/pebblo.ipynb @@ -6,17 +6,19 @@ "source": [ "# Pebblo Safe DocumentLoader\n", "\n", - "> [Pebblo](https://github.com/daxa-ai/pebblo) enables developers to safely load data and promote their Gen AI app to deployment without worrying about the organization’s compliance and security requirements. The project identifies semantic topics and entities found in the loaded data and summarizes them on the UI or a PDF report.\n", + "> [Pebblo](https://daxa-ai.github.io/pebblo/) enables developers to safely load data and promote their Gen AI app to deployment without worrying about the organization’s compliance and security requirements. The project identifies semantic topics and entities found in the loaded data and summarizes them on the UI or a PDF report.\n", "\n", "Pebblo has two components.\n", "\n", "1. Pebblo Safe DocumentLoader for Langchain\n", - "1. Pebblo Daemon\n", + "1. Pebblo Server\n", "\n", - "This document describes how to augment your existing Langchain DocumentLoader with Pebblo Safe DocumentLoader to get deep data visibility on the types of Topics and Entities ingested into the Gen-AI Langchain application. 
For details on `Pebblo Daemon` see this [pebblo daemon](https://daxa-ai.github.io/pebblo-docs/daemon.html) document.\n", + "This document describes how to augment your existing Langchain DocumentLoader with Pebblo Safe DocumentLoader to get deep data visibility on the types of Topics and Entities ingested into the Gen-AI Langchain application. For details on `Pebblo Server` see this [pebblo server](https://daxa-ai.github.io/pebblo/daemon) document.\n", "\n", "Pebblo Safeloader enables safe data ingestion for Langchain `DocumentLoader`. This is done by wrapping the document loader call with `Pebblo Safe DocumentLoader`.\n", "\n", + "Note: To use a Pebblo server running at a URL other than Pebblo's default (localhost:8000), set the `PEBBLO_CLASSIFIER_URL` env variable to the correct URL. This can also be configured using the `classifier_url` keyword argument. Ref: [server-configurations](https://daxa-ai.github.io/pebblo/config)\n", + "\n", "#### How to Pebblo enable Document Loading?\n", "\n", "Assume a Langchain RAG application snippet using `CSVLoader` to read a CSV document for inference.\n",