diff --git a/.env.example b/.env.example index b97e01f18..a18d9b934 100644 --- a/.env.example +++ b/.env.example @@ -3,6 +3,10 @@ # If unset or false, Langflow pipeline will be used (default: upload -> ingest -> delete) DISABLE_INGEST_WITH_LANGFLOW=false +# Set to false to skip ingesting sample data (pdfs) on startup +# Default: true +INGEST_SAMPLE_DATA=true + # Langflow HTTP timeout configuration (in seconds) # For large documents (300+ pages), ingestion can take 30+ minutes # Increase these values if you experience timeouts with very large PDFs @@ -10,6 +14,15 @@ DISABLE_INGEST_WITH_LANGFLOW=false # LANGFLOW_TIMEOUT=2400 # LANGFLOW_CONNECT_TIMEOUT=30 +# Per-file processing timeout for document ingestion tasks (in seconds) +# Should be >= LANGFLOW_TIMEOUT to allow long-running ingestion to complete +# Default: 3600 seconds (60 minutes) total timeout +# INGESTION_TIMEOUT=3600 + +# OPTIONAL: Maximum number of files to upload / ingest (in batch) per task when adding knowledge via folder +# Default: 25 +# UPLOAD_BATCH_SIZE=25 + # make one like so https://docs.langflow.org/api-keys-and-authentication#langflow-secret-key LANGFLOW_SECRET_KEY= @@ -103,3 +116,14 @@ LANGFUSE_HOST= # - A list of comma-separate hostnames may be specified # - e.g. 9.46.110.49,10.21.103.227,openrag.example.com NEXT_ALLOWED_DEV_ORIGINS= + +# OPTIONAL: Number of worker processes for concurrent request handling +# - Increase for higher throughput; but, factor in CPU / memory constraints +# MAX_WORKERS=4 # Backend ingestion workers (default: min(4, CPU_COUNT // 2)) +# LANGFLOW_WORKERS=2 # Langflow workers (default: 1) +# DOCLING_WORKERS=2 # Docling workers (default: 1) + +# OPTIONAL: Enable or disable HTTP access logging events +# - e.g. INFO: 127.0.0.1:45132 - "GET /tasks HTTP/1.1" 200 OK +# - Default: true +# ACCESS_LOG=false diff --git a/.github/workflows/publish-mcp.yml b/.github/workflows/publish-mcp.yml new file mode 100644 index 000000000..ce0ff48df --- /dev/null +++ b/.github/workflows/publish-mcp.yml @@ -0,0 +1,59 @@ +name: Publish MCP (openrag-mcp) + +on: + push: + branches: + - main + paths: + - 'sdks/mcp/pyproject.toml' + workflow_dispatch: + +jobs: + publish: + name: Publish to PyPI + runs-on: ubuntu-latest + defaults: + run: + working-directory: sdks/mcp + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.12' + + - name: Install uv + uses: astral-sh/setup-uv@v4 + + - name: Extract version from pyproject.toml + id: version + run: | + VERSION=$(grep -Po '(?<=^version = ")[^"]*' pyproject.toml) + echo "version=$VERSION" >> $GITHUB_OUTPUT + + - name: Check if version already published + id: check + run: | + HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" https://pypi.org/pypi/openrag-mcp/${{ steps.version.outputs.version }}/json) + if [ "$HTTP_STATUS" = "200" ]; then + echo "exists=true" >> $GITHUB_OUTPUT + else + echo "exists=false" >> $GITHUB_OUTPUT + fi + + - name: Build package + if: steps.check.outputs.exists == 'false' + run: uv build + + - name: Publish to PyPI + if: steps.check.outputs.exists == 'false' + run: uv publish + env: + UV_PUBLISH_TOKEN: ${{ secrets.UV_PUBLISH_TOKEN }} + + - name: Skip publish (version exists) + if: steps.check.outputs.exists == 'true' + run: echo "Version ${{ steps.version.outputs.version }} already exists on PyPI, skipping publish" diff --git a/.gitignore b/.gitignore index 2143d13a6..2c09fc1f9 100644 --- a/.gitignore +++ b/.gitignore @@ -26,7 +26,7 @@ wheels/ !frontend/*.json .DS_Store -config/ +/config/ .docling.pid diff --git a/Makefile b/Makefile index 21b2d67e8..19f151f32 100644 --- a/Makefile +++ b/Makefile @@ -71,7 +71,7 @@ endef dev dev-cpu dev-local dev-local-cpu stop clean build logs \ shell-backend shell-frontend install \ test test-integration test-ci test-ci-local test-sdk test-os-jwt lint \ - backend frontend docling docling-stop install-be install-fe build-be build-fe logs-be logs-fe logs-lf logs-os \ + backend frontend docling docling-stop install-be install-fe build-be build-fe build-os build-lf logs-be logs-fe logs-lf logs-os \ shell-be shell-lf shell-os restart status health db-reset clear-os-data flow-upload setup factory-reset \ dev-branch build-langflow-dev stop-dev clean-dev logs-dev logs-lf-dev shell-lf-dev restart-dev status-dev @@ -197,8 +197,10 @@ help_docker: ## Show Docker and container commands @echo '' @echo "$(PURPLE)Build Images:$(NC)" @echo " $(PURPLE)make build$(NC) - Build all Docker images locally" + @echo " $(PURPLE)make build-os$(NC) - Build OpenSearch Docker image only" @echo " $(PURPLE)make build-be$(NC) - Build backend Docker image only" @echo " $(PURPLE)make build-fe$(NC) - Build frontend Docker image only" + @echo " $(PURPLE)make build-lf$(NC) - Build Langflow Docker image only" @echo '' @echo "$(PURPLE)Container Management:$(NC)" @echo " $(PURPLE)make stop$(NC) - Stop and remove all OpenRAG containers" @@ -534,18 +536,14 @@ install-fe: ## Install frontend dependencies # DOCKER BUILD ###################### -build: ## Build all Docker images locally - @echo "$(YELLOW)Building all Docker images locally...$(NC)" - @echo "$(CYAN)Building OpenSearch image...$(NC)" - $(CONTAINER_RUNTIME) build -t langflowai/openrag-opensearch:latest -f Dockerfile . - @echo "$(CYAN)Building Backend image...$(NC)" - $(CONTAINER_RUNTIME) build -t langflowai/openrag-backend:latest -f Dockerfile.backend . - @echo "$(CYAN)Building Frontend image...$(NC)" - $(CONTAINER_RUNTIME) build -t langflowai/openrag-frontend:latest -f Dockerfile.frontend . - @echo "$(CYAN)Building Langflow image...$(NC)" - $(CONTAINER_RUNTIME) build -t langflowai/openrag-langflow:latest -f Dockerfile.langflow . +build: build-os build-be build-fe build-lf ## Build all Docker images locally @echo "$(PURPLE)All images built successfully!$(NC)" +build-os: ## Build OpenSearch Docker image + @echo "$(YELLOW)Building OpenSearch image...$(NC)" + $(CONTAINER_RUNTIME) build -t langflowai/openrag-opensearch:latest -f Dockerfile . + @echo "$(PURPLE)OpenSearch image built.$(NC)" + build-be: ## Build backend Docker image @echo "$(YELLOW)Building backend image...$(NC)" $(CONTAINER_RUNTIME) build -t langflowai/openrag-backend:latest -f Dockerfile.backend . @@ -556,6 +554,11 @@ build-fe: ## Build frontend Docker image $(CONTAINER_RUNTIME) build -t langflowai/openrag-frontend:latest -f Dockerfile.frontend . @echo "$(PURPLE)Frontend image built.$(NC)" +build-lf: ## Build Langflow Docker image + @echo "$(YELLOW)Building Langflow image...$(NC)" + $(CONTAINER_RUNTIME) build -t langflowai/openrag-langflow:latest -f Dockerfile.langflow . + @echo "$(PURPLE)Langflow image built.$(NC)" + ###################### # LOGGING ###################### diff --git a/docker-compose.yml b/docker-compose.yml index 113688edb..4c56b62d4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -15,7 +15,7 @@ services: bash -c " # Ensure data directory has correct permissions sudo chown -R opensearch:opensearch /usr/share/opensearch/data || true - + # Start OpenSearch in background /usr/share/opensearch/opensearch-docker-entrypoint.sh opensearch & @@ -29,7 +29,8 @@ services: - "9200:9200" - "9600:9600" volumes: - - ${OPENSEARCH_DATA_PATH:-./opensearch-data}:/usr/share/opensearch/data:U,z + # If OPENSEARCH_DATA_PATH is set, use host path; otherwise use named volume + - ${OPENSEARCH_DATA_PATH:-opensearch-data}:/usr/share/opensearch/data dashboards: image: opensearchproject/opensearch-dashboards:3.0.0 @@ -62,11 +63,13 @@ services: - LANGFLOW_INGEST_FLOW_ID=${LANGFLOW_INGEST_FLOW_ID} - LANGFLOW_URL_INGEST_FLOW_ID=${LANGFLOW_URL_INGEST_FLOW_ID} - DISABLE_INGEST_WITH_LANGFLOW=${DISABLE_INGEST_WITH_LANGFLOW:-false} + - INGEST_SAMPLE_DATA=${INGEST_SAMPLE_DATA:-true} - NUDGES_FLOW_ID=${NUDGES_FLOW_ID} - OPENSEARCH_PORT=9200 - OPENSEARCH_USERNAME=admin - OPENSEARCH_PASSWORD=${OPENSEARCH_PASSWORD} - OPENAI_API_KEY=${OPENAI_API_KEY} + - OPENAI_API_BASE=${OPENAI_API_BASE:-None} - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} - WATSONX_API_KEY=${WATSONX_API_KEY} - WATSONX_ENDPOINT=${WATSONX_ENDPOINT} @@ -80,12 +83,15 @@ services: - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} - OPENSEARCH_INDEX_NAME=${OPENSEARCH_INDEX_NAME:-documents} + - LOG_LEVEL=${LOG_LEVEL} volumes: - ${OPENRAG_DOCUMENTS_PATH:-./openrag-documents}:/app/openrag-documents:Z - ${OPENRAG_KEYS_PATH:-./keys}:/app/keys:U,z - ${OPENRAG_FLOWS_PATH:-./flows}:/app/flows:U,z - ${OPENRAG_CONFIG_PATH:-./config}:/app/config:Z - ${OPENRAG_DATA_PATH:-./data}:/app/data:Z + ports: + - "8000:8000" openrag-frontend: image: langflowai/openrag-frontend:${OPENRAG_VERSION:-latest} @@ -116,6 +122,7 @@ services: - LANGFUSE_HOST=${LANGFUSE_HOST:-} - LANGFLOW_DEACTIVATE_TRACING - OPENAI_API_KEY=${OPENAI_API_KEY:-None} + - OPENAI_API_BASE=${OPENAI_API_BASE:-None} - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-None} - WATSONX_API_KEY=${WATSONX_API_KEY:-None} - WATSONX_ENDPOINT=${WATSONX_ENDPOINT:-None} @@ -123,7 +130,7 @@ services: - OLLAMA_BASE_URL=${OLLAMA_ENDPOINT:-None} - LANGFLOW_LOAD_FLOWS_PATH=/app/flows - LANGFLOW_SECRET_KEY=${LANGFLOW_SECRET_KEY} - - JWT=None + - JWT=None - OWNER=None - OWNER_NAME=None - OWNER_EMAIL=None @@ -144,8 +151,9 @@ services: - MIMETYPE=None - FILESIZE=0 - SELECTED_EMBEDDING_MODEL=${SELECTED_EMBEDDING_MODEL:-} - - LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT=JWT,OPENRAG-QUERY-FILTER,OPENSEARCH_PASSWORD,OPENSEARCH_URL,DOCLING_SERVE_URL,OWNER,OWNER_NAME,OWNER_EMAIL,CONNECTOR_TYPE,DOCUMENT_ID,SOURCE_URL,ALLOWED_USERS,ALLOWED_GROUPS,FILENAME,MIMETYPE,FILESIZE,SELECTED_EMBEDDING_MODEL,OPENAI_API_KEY,ANTHROPIC_API_KEY,WATSONX_API_KEY,WATSONX_ENDPOINT,WATSONX_PROJECT_ID,OLLAMA_BASE_URL,OPENSEARCH_INDEX_NAME - - LANGFLOW_LOG_LEVEL=DEBUG + - LANGFLOW_VARIABLES_TO_GET_FROM_ENVIRONMENT=JWT,OPENRAG-QUERY-FILTER,OPENSEARCH_PASSWORD,OPENSEARCH_URL,DOCLING_SERVE_URL,OWNER,OWNER_NAME,OWNER_EMAIL,CONNECTOR_TYPE,DOCUMENT_ID,SOURCE_URL,ALLOWED_USERS,ALLOWED_GROUPS,FILENAME,MIMETYPE,FILESIZE,SELECTED_EMBEDDING_MODEL,OPENAI_API_KEY,OPENAI_API_BASE,ANTHROPIC_API_KEY,WATSONX_API_KEY,WATSONX_ENDPOINT,WATSONX_PROJECT_ID,OLLAMA_BASE_URL,OPENSEARCH_INDEX_NAME + - LANGFLOW_LOG_LEVEL=${LOG_LEVEL} + - LANGFLOW_WORKERS=${LANGFLOW_WORKERS:-1} - LANGFLOW_AUTO_LOGIN=${LANGFLOW_AUTO_LOGIN} - LANGFLOW_SUPERUSER=${LANGFLOW_SUPERUSER} - LANGFLOW_SUPERUSER_PASSWORD=${LANGFLOW_SUPERUSER_PASSWORD} @@ -153,3 +161,6 @@ services: - LANGFLOW_ENABLE_SUPERUSER_CLI=${LANGFLOW_ENABLE_SUPERUSER_CLI} # - DEFAULT_FOLDER_NAME=OpenRAG - HIDE_GETTING_STARTED_PROGRESS=true + +volumes: + opensearch-data: \ No newline at end of file diff --git a/docs/docs/reference/configuration.mdx b/docs/docs/reference/configuration.mdx index dd6fc4925..a71af6616 100644 --- a/docs/docs/reference/configuration.mdx +++ b/docs/docs/reference/configuration.mdx @@ -154,10 +154,12 @@ Configure general system components, session management, and logging. | `FRONTEND_PORT` | `3000` | Host port for the OpenRAG frontend web interface. Change this if port 3000 is already in use on your system. | | `OPENRAG_VERSION` | `latest` | The version of the OpenRAG Docker images to run. For more information, see [Upgrade OpenRAG](/upgrade) | | `NEXT_ALLOWED_DEV_ORIGINS` | `http://localhost:3000` | Only used when [running OpenRAG in development mode](/support/contribute). Accepts a comma-separated list of hostnames to allow additional origins to make requests to the Next.js development server. | +| `MAX_WORKERS` | `min(4, CPU_COUNT // 2)` | Number of Backend worker processes for concurrent request handling. | +| `LANGFLOW_WORKERS` | `1` | Number of Langflow worker processes for concurrent request handling. | +| `DOCLING_WORKERS` | `1` | Number of Docling worker processes for concurrent request handling. | - diff --git a/flows/components/split_text.py b/flows/components/split_text.py new file mode 100644 index 000000000..ef1100885 --- /dev/null +++ b/flows/components/split_text.py @@ -0,0 +1,515 @@ +import copy +import re +from typing import Iterable + +from langchain_text_splitters import CharacterTextSplitter + +from lfx.custom.custom_component.component import Component +from lfx.io import DropdownInput, HandleInput, IntInput, MessageTextInput, Output +from lfx.schema.data import Data +from lfx.schema.dataframe import DataFrame +from lfx.schema.message import Message +from lfx.utils.util import unescape_string +from lfx.log import logger + +from langchain_core.documents import Document + + +class SplitTextComponent(Component): + display_name: str = "Split Text" + description: str = "Split text into chunks based on specified criteria." + documentation: str = "https://docs.langflow.org/components-processing#split-text" + icon = "scissors-line-dashed" + name = "SplitText" + + inputs = [ + HandleInput( + name="data_inputs", + display_name="Input", + info="The data with texts to split in chunks.", + input_types=["Data", "DataFrame", "Message"], + required=True, + ), + IntInput( + name="chunk_overlap", + display_name="Chunk Overlap", + info="Number of characters to overlap between chunks.", + value=200, + ), + IntInput( + name="chunk_size", + display_name="Chunk Size", + info=( + "The maximum length of each chunk. Text is first split by separator, " + "then chunks are merged up to this size. " + "Individual splits larger than this won't be further divided." + ), + value=1000, + ), + MessageTextInput( + name="separator", + display_name="Separator", + info=( + "The character to split on. Use \\n for newline. " + "Examples: \\n\\n for paragraphs, \\n for lines, . for sentences" + ), + value="\n", + ), + MessageTextInput( + name="text_key", + display_name="Text Key", + info="The key to use for the text column.", + value="text", + advanced=True, + ), + DropdownInput( + name="keep_separator", + display_name="Keep Separator", + info="Whether to keep the separator in the output chunks and where to place it.", + options=["False", "True", "Start", "End"], + value="False", + advanced=True, + ), + DropdownInput( + name="splitter_type", + display_name="Splitter Type", + info="Which text splitter to use to chunk the documents.", + options=["CharacterTextSplitter", "TableAwareTextSplitter", "LineBasedTextSplitter"], + value="CharacterTextSplitter", + advanced=True, + ), + MessageTextInput( + name="model_id", + display_name="Model ID", + info="The name of the model that will be used for computing embeddings.", + value="ibm-granite/granite-embedding-30m-english", + advanced=True, + ), + DropdownInput( + name="use_document_title", + display_name="Use Document Title", + info="Whether to use the document title as a prefix in each chunk.", + options=["False", "True"], + value="False", + advanced=True, + ), + ] + + outputs = [ + Output(display_name="Chunks", name="dataframe", method="split_text"), + ] + + def _docs_to_data(self, docs) -> list[Data]: + return [Data(text=doc.page_content, data=doc.metadata) for doc in docs] + + def _fix_separator(self, separator: str) -> str: + """Fix common separator issues and convert to proper format.""" + if separator == "/n": + return "\n" + if separator == "/t": + return "\t" + return separator + + @staticmethod + def to_bool(val): + if isinstance(val, str): + if val.lower() == "false": + return False + elif val.lower() == "true": + return True + elif isinstance(val, bool): + return val + raise RuntimeError(f"Cannot convert value {val} to a boolean value. Expected 'True' or 'False'.") + + def split_text_base(self): + separator = self._fix_separator(self.separator) + separator = unescape_string(separator) + + if isinstance(self.data_inputs, DataFrame): + if not len(self.data_inputs): + msg = "DataFrame is empty" + raise TypeError(msg) + + self.data_inputs.text_key = self.text_key + try: + documents = self.data_inputs.to_lc_documents() + except Exception as e: + msg = f"Error converting DataFrame to documents: {e}" + raise TypeError(msg) from e + elif isinstance(self.data_inputs, Message): + self.data_inputs = [self.data_inputs.to_data()] + return self.split_text_base() + else: + if not self.data_inputs: + msg = "No data inputs provided" + raise TypeError(msg) + + documents = [] + if isinstance(self.data_inputs, Data): + self.data_inputs.text_key = self.text_key + documents = [self.data_inputs.to_lc_document()] + else: + try: + documents = [input_.to_lc_document() for input_ in self.data_inputs if isinstance(input_, Data)] + if not documents: + msg = f"No valid Data inputs found in {type(self.data_inputs)}" + raise TypeError(msg) + except AttributeError as e: + msg = f"Invalid input type in collection: {e}" + raise TypeError(msg) from e + try: + if self.splitter_type == "CharacterTextSplitter": + # Convert string 'False'/'True' to boolean + keep_sep = self.to_bool(self.keep_separator) + logger.debug("SPLIT: Creating a CharacterTextSplitter..") + splitter = CharacterTextSplitter( + chunk_overlap=self.chunk_overlap, + chunk_size=self.chunk_size, + separator=separator, + keep_separator=keep_sep, + ) + elif self.splitter_type == "LineBasedTextSplitter": + use_document_title = self.to_bool(self.use_document_title) + splitter = LineBasedTextSplitter( + chunk_size=self.chunk_size, + model_id=self.model_id, + use_document_title=use_document_title, + ) + elif self.splitter_type == "TableAwareTextSplitter": + logger.debug(f"SPLIT: Creating a TableAwareTextSplitter with chunk_size={self.chunk_size} and model_id '{self.model_id}'.") + splitter = TableAwareTextSplitter( + chunk_size=self.chunk_size, + model_id=self.model_id + ) + else: + raise RuntimeError(f"Unknown splitter type value '{self.splitter_type}'.") + return splitter.split_documents(documents) + except Exception as e: + msg = f"Error splitting text: {e}" + raise TypeError(msg) from e + + def split_text(self) -> DataFrame: + return DataFrame(self._docs_to_data(self.split_text_base())) + +class LineBasedTextSplitter: + def __init__( + self, + chunk_size: int, + model_id: str, + prefix: str = "", + use_document_title: bool = False, + ): + self._chunk_size = chunk_size + self.use_tiktoken = False + if model_id in ["text-embedding-3-small", "text-embedding-3-large"]: + self.use_tiktoken = True + logger.debug(f"SPLIT: Initializing LineBasedTextSplitter, chunk_size = {chunk_size}, model_id = '{model_id}', use_tiktoken = {self.use_tiktoken}, use_document_title={use_document_title}.") + if self.use_tiktoken: + import tiktoken + # The tokenizer for text-embedding-3-small, text-embedding-3-large + self._tokenizer = tiktoken.get_encoding("cl100k_base") + else: + from transformers import AutoTokenizer + self._tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=model_id, + ) + self._prefix = "" + self._prefix_len = 0 + self.set_prefix(prefix) + self.use_document_title = use_document_title + + def set_prefix(self, prefix): + logger.debug(f"SPLIT: setting prefix to '{prefix}'..") + prefix_len = len(self.tokenize(prefix)) + if prefix_len >= self._chunk_size: + raise RuntimeError( + f"Chunks prefix: {prefix} is too long for chunk size {self._chunk_size}" + ) + else: + self._prefix = prefix + self._prefix_len = prefix_len + + def tokenize(self, text: str) -> list[int]: + if self.use_tiktoken: + return self._tokenizer.encode(text) + else: + return self._tokenizer.encode(text, add_special_tokens=False) + + def decode_tokens(self, tokens: list[int]): + return self._tokenizer.decode(tokens) + + def split_documents(self, documents: Iterable[Document]) -> list[Document]: + """Given Documents, chunk the text to smaller pieces and return them as list of Documents""" + + chunks = [] + for document in documents: + chunks.extend(self._chunk_document(document)) + return chunks + + def _chunk_document(self, document: Document): + document_text = document.page_content + document_metadata = document.metadata + chunks = [] + chunk_seq_num = 0 + + first_character_index = document_metadata.get("start_index", 0) + if self.use_document_title: + file_name = document_metadata.get("filename", "unknown-file-name") + logger.debug(f"SPLIT: Chunking document with file name '{file_name}'..") + document_title = get_title(file_name) + logger.debug(f"SPLIT: Found title '{document_title}'..") + self.set_prefix(document_title) + + current = self._prefix + current_len = self._prefix_len + + new_line_token_count = len(self.tokenize("\n")) + lines = document_text.split("\n") + for line in lines: + line_tokens = self.tokenize(line) + + while ( + len(line_tokens) > self._chunk_size - current_len + ): # line cannot fit into current + num_available_tokens_in_chunk = ( + self._chunk_size - current_len + if len(line_tokens) + self._prefix_len > self._chunk_size + else 0 + ) # if whole line can fit into a new chunk, do not add anything to current chunk, + # otherwise, split the line between current and next chunks. + + if num_available_tokens_in_chunk > 0: + # split line + if current: + current += "\n" + current_len += new_line_token_count + current += self.decode_tokens( + line_tokens[:num_available_tokens_in_chunk] + ) + current_len += num_available_tokens_in_chunk + + # add current chunk + chunks.append( + self._new_chunk( + current, chunk_seq_num, first_character_index, document_metadata + ) + ) + + # new current chunk + first_character_index += len(current) + chunk_seq_num += 1 + current = self._prefix + current_len = self._prefix_len + line_tokens = line_tokens[num_available_tokens_in_chunk:] + + # rest of line fits into current + if len(line_tokens) > 0: + if current: + current += "\n" + current_len += new_line_token_count + current += self.decode_tokens(line_tokens) + current_len += len(line_tokens) + + # final chunk + chunks.append( + self._new_chunk( + current, chunk_seq_num, first_character_index, document_metadata + ) + ) + + return chunks + + @staticmethod + def _new_chunk( + text: str, seq_no: int, start_index: int, doc_metadata: dict + ) -> Document: + chunk_metadata = copy.deepcopy(doc_metadata) + chunk_metadata["sequence_number"] = seq_no + chunk_metadata["start_index"] = start_index + return Document(page_content=text, metadata=chunk_metadata) + + +class TableAwareTextSplitter: + + def __init__(self, chunk_size: int, model_id: str): + self.chunk_size = chunk_size + self.model_id = model_id + + def split_documents(self, documents: Iterable[Document]) -> list[Document]: + """Given Documents, chunk the text to smaller pieces and return them as list of Documents""" + + chunks = [] + for document in documents: + chunks.extend(self._chunk_document(document)) + return chunks + + def _chunk_document(self, document: Document) -> list[Document]: + segments = self._get_segments(document) + + chunks = [] + for segment in segments: + prefix = self.get_prefix(segment) + line_splitter = LineBasedTextSplitter( + chunk_size=self.chunk_size, + model_id=self.model_id, + prefix=prefix + ) + + chunks.extend(line_splitter.split_documents([segment])) + + return chunks + + # fix me: does not indicate sub headers + def _get_segments(self, doc): + segments = [] + doc_metadata = doc.metadata + segments_count = 0 + start_index = doc.metadata.get("start_index", 0) + current_segment = Document( + page_content="", + metadata={"type": "text", "seq_no": segments_count, "start_index": start_index} + | doc_metadata, + ) + separator_found = False + lines = doc.page_content.split("\n") + for line in lines: + + if self._is_table_line(line): + if current_segment.metadata["type"] != "table": # first table line + segments.append(current_segment) + segments_count += 1 + start_index += len(current_segment.page_content) + current_segment = Document( + page_content="", + metadata={ + "type": "table", + "caption": self.get_caption(current_segment), + "header": self.condense_table_row(line), + "seq_no": segments_count, + "start_index": start_index, + } + | doc_metadata, + ) + separator_found = False + elif self._is_table_seperator(line): + + separator_found = True + current_segment.metadata[ + "header" + ] += "\n" + self.condense_separator(line) + elif not separator_found: + + current_segment.metadata[ + "header" + ] += "\n" + self.condense_table_row(line) + else: + current_segment.page_content += "\n" + line + + else: # text line + if current_segment.metadata["type"] == "table": + segments.append(current_segment) + segments_count += 1 + start_index += len(current_segment.page_content) + current_segment = Document( + page_content="", + metadata={ + "type": "text", + "seq_no": segments_count, + "start_index": start_index, + } + | doc_metadata, + ) + current_segment.page_content += "\n" + line + + # last segment + segments.append(current_segment) + return [c for c in segments if len(c.page_content.strip()) > 0] + + @staticmethod + def get_prefix(segment: Document) -> str: + if segment.metadata["type"] == "text": + return "" + elif segment.metadata["type"] == "table": + result = segment.metadata["caption"] + if result: + result += "\n" + result += segment.metadata["header"] + return result + else: + raise RuntimeError(f"Internal error: unknown segment type '{segment['type']}' for segment {segment}.") + + # returns last sentence before table + @staticmethod + def get_caption(prev_segment) -> str: + last_sentence = prev_segment.page_content.strip().split("\n")[-1].split(".")[-1] + return last_sentence + + # each line starting with | is included in table + @staticmethod + def _is_table_line(line: str): + return line.startswith("|") + + @staticmethod + def _is_table_seperator(line: str): + cells = [c.strip() for c in line.strip().split("|")] + return all( + re.match(r"[-]+", cell.strip()) for cell in cells if len(cell.strip()) > 0 + ) + + @staticmethod + def condense_separator(line: str): + numCells = len(line.strip().split("|")) - 2 + return "| --- " * numCells + "|" + + @staticmethod + def condense_table_row(line: str) -> str: + if sum([t.isalnum() for t in line]) == 0: + return "" + cells = [c.strip() for c in line.strip().split("|")] + + return " | ".join(cells).strip() + +def get_title(file_name: str) -> str: + file_name_to_title = { + "docling.pdf": "Docling Technical Report" + } + file_name_to_title.update(filename_to_output) + return file_name_to_title.get(file_name, "") + + +filename_to_output = { + "Alaska-2017.pdf": """This document is the 2017 annual report (Form 10-K) of Alaska Air Group, Inc., filed with the United States Securities and Exchange Commission (SEC). The report covers the fiscal year ended December 31, 2017. Important entities mentioned include: + +* Alaska Air Group, Inc. (the company) +* United States Securities and Exchange Commission (SEC) +* New York Stock Exchange (where the company's common stock is registered) + +Important dates mentioned include: + +* December 31, 2017 (end of the fiscal year) +* January 31, 2018 (date of share outstanding total) +* June 30, 2017 (date used to calculate aggregate market value of shares held by nonaffiliates)""", + "Alaska-2018.pdf": """This document is the 2018 annual report (Form 10-K) of Alaska Air Group, Inc., filed with the United States Securities and Exchange Commission (SEC). The report covers the fiscal year ended December 31, 2018. Important entities mentioned include: + +* Alaska Air Group, Inc. (the company) +* United States Securities and Exchange Commission (SEC) +* New York Stock Exchange (where the company's common stock is listed) + +Important dates mentioned include: + +* December 31, 2018 (end of the fiscal year) +* January 31, 2019 (date of share outstanding total) +* June 30, 2018 (date used to calculate aggregate market value of shares held by nonaffiliates)""", + "AmericanAirlines-2017.pdf": "This document is the 2017 annual report (Form 10-K) of American Airlines Group Inc., filed with the United States Securities and Exchange Commission (SEC).", + "AmericanAirlines-2018.pdf": "The document \"AmericanAirlines-2018.pdf\" is the 2018 Annual Report on Form 10-K for American Airlines Group Inc.", + "AmericanAirlines-2019.pdf": "This document is the 2020 Annual Report on Form 10-K for American Airlines Group Inc., filed for the year ending 2019.", + "Delta-2017.pdf": "This document is the 2017 annual report (Form 10-K) of Delta Air Lines, Inc. for the fiscal year ended December 31, 2017.", + "Delta-2018.pdf": "This document is the 2018 annual report (Form 10-K) of Delta Air Lines, Inc. for the fiscal year ended December 31, 2018.", + "Delta-2019.pdf": "This document is the 2019 annual report (Form 10-K) of Delta Air Lines, Inc. for the fiscal year ended December 31, 2019.", + "Southwest-2017.pdf": "This document is the 2017 Annual Report to Shareholders of Southwest Airlines Co.", + "Southwest-2018.pdf": "This document is the 2018 Annual Report to Shareholders of Southwest Airlines Co.", + "Southwest-2019.pdf": "This document is the 2019 Annual Report to Shareholders of Southwest Airlines Co.", + "United-2017.pdf": "This document is the 2017 annual report (Form 10-K) of United Continental Holdings, Inc. and United Airlines, Inc.", + "United-2018.pdf": "This document is the 2018 annual report (Form 10-K) of United Continental Holdings, Inc. and United Airlines, Inc.", + "United-2019.pdf": "This document is the 2019 annual report (Form 10-K) of United Airlines Holdings, Inc. and United Airlines, Inc." +} diff --git a/flows/ingestion_flow.json b/flows/ingestion_flow.json index c5c2fc375..4555f40a3 100644 --- a/flows/ingestion_flow.json +++ b/flows/ingestion_flow.json @@ -5564,7 +5564,7 @@ "_type": "Component", "api_base": { "_input_type": "MessageTextInput", - "advanced": true, + "advanced": false, "display_name": "OpenAI API Base URL", "dynamic": false, "info": "Base URL for the API. Leave empty for default.", @@ -5573,7 +5573,7 @@ ], "list": false, "list_add_label": "Add More", - "load_from_db": false, + "load_from_db": true, "name": "api_base", "override_skip": false, "placeholder": "", @@ -5585,7 +5585,7 @@ "trace_as_metadata": true, "track_in_telemetry": false, "type": "str", - "value": "" + "value": "OPENAI_API_BASE" }, "api_key": { "_input_type": "SecretStrInput", diff --git a/flows/openrag_agent.json b/flows/openrag_agent.json index b2f41b1b6..1e6309032 100644 --- a/flows/openrag_agent.json +++ b/flows/openrag_agent.json @@ -1808,14 +1808,14 @@ }, "openai_api_base": { "_input_type": "StrInput", - "advanced": true, + "advanced": false, "display_name": "OpenAI API Base", "dynamic": false, "info": "The base URL of the OpenAI API. Defaults to https://api.openai.com/v1. You can change this to use other APIs like JinaChat, LocalAI and Prem.", "input_types": [], "list": false, "list_add_label": "Add More", - "load_from_db": false, + "load_from_db": true, "name": "openai_api_base", "override_skip": false, "placeholder": "", @@ -1826,7 +1826,7 @@ "trace_as_metadata": true, "track_in_telemetry": false, "type": "str", - "value": "" + "value": "OPENAI_API_BASE" }, "output_schema": { "_input_type": "TableInput", @@ -2344,7 +2344,7 @@ ], "list": false, "list_add_label": "Add More", - "load_from_db": false, + "load_from_db": true, "name": "api_base", "override_skip": false, "placeholder": "", @@ -2356,7 +2356,7 @@ "trace_as_metadata": true, "track_in_telemetry": false, "type": "str", - "value": "" + "value": "OPENAI_API_BASE" }, "api_key": { "_input_type": "SecretStrInput", diff --git a/frontend/app/api/mutations/useConnectConnectorMutation.ts b/frontend/app/api/mutations/useConnectConnectorMutation.ts new file mode 100644 index 000000000..9054976c1 --- /dev/null +++ b/frontend/app/api/mutations/useConnectConnectorMutation.ts @@ -0,0 +1,77 @@ +import { useMutation, useQueryClient } from "@tanstack/react-query"; +import { toast } from "sonner"; +import type { Connector } from "../queries/useGetConnectorsQuery"; + +interface ConnectResponse { + connection_id: string; + oauth_config?: { + authorization_endpoint: string; + client_id: string; + scopes: string[]; + redirect_uri: string; + }; +} + +export const useConnectConnectorMutation = () => { + const queryClient = useQueryClient(); + + return useMutation({ + mutationFn: async ({ connector, redirectUri }: { connector: Connector; redirectUri: string }): Promise => { + const response = await fetch("/api/auth/init", { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + connector_type: connector.type, + purpose: "data_source", + name: `${connector.name} Connection`, + redirect_uri: redirectUri, + }), + }); + + if (!response.ok) { + const result = await response.json(); + throw new Error(result.error || `Failed to initiate connection for ${connector.name}`); + } + return response.json(); + }, + onMutate: async ({ connector }) => { + // Cancel any outgoing refetches + await queryClient.cancelQueries({ queryKey: ["connectors"] }); + + // Snapshot the previous value + const previousConnectors = queryClient.getQueryData(["connectors"]); + + return { previousConnectors }; + }, + onError: (err, { connector }, context) => { + // Roll back if mutation fails + if (context?.previousConnectors) { + queryClient.setQueryData(["connectors"], context.previousConnectors); + } + toast.error(err.message); + }, + onSuccess: (result, { connector }) => { + if (result.oauth_config) { + localStorage.setItem("connecting_connector_id", result.connection_id); + localStorage.setItem("connecting_connector_type", connector.type); + localStorage.setItem("auth_purpose", "data_source"); + + const authUrl = + `${result.oauth_config.authorization_endpoint}?` + + `client_id=${result.oauth_config.client_id}&` + + `response_type=code&` + + `scope=${result.oauth_config.scopes.join(" ")}&` + + `redirect_uri=${encodeURIComponent( + result.oauth_config.redirect_uri, + )}&` + + `access_type=offline&` + + `prompt=select_account&` + + `state=${result.connection_id}`; + + window.location.href = authUrl; + } + }, + }); +}; diff --git a/frontend/app/api/mutations/useDisconnectConnectorMutation.ts b/frontend/app/api/mutations/useDisconnectConnectorMutation.ts new file mode 100644 index 000000000..c3baf0d0d --- /dev/null +++ b/frontend/app/api/mutations/useDisconnectConnectorMutation.ts @@ -0,0 +1,55 @@ +import { useMutation, useQueryClient } from "@tanstack/react-query"; +import { toast } from "sonner"; +import type { Connector } from "../queries/useGetConnectorsQuery"; + +export const useDisconnectConnectorMutation = () => { + const queryClient = useQueryClient(); + + return useMutation({ + mutationFn: async (connector: Connector) => { + const response = await fetch(`/api/connectors/${connector.type}/disconnect`, { + method: "DELETE", + }); + + if (!response.ok) { + const result = await response.json(); + throw new Error(result.error || `Failed to disconnect ${connector.name}`); + } + return response.json(); + }, + onMutate: async (connector) => { + // Cancel any outgoing refetches (so they don't overwrite our optimistic update) + await queryClient.cancelQueries({ queryKey: ["connectors"] }); + + // Snapshot the previous value + const previousConnectors = queryClient.getQueryData(["connectors"]); + + // Optimistically update to the new value + if (previousConnectors) { + queryClient.setQueryData(["connectors"], + previousConnectors.map((c) => + c.type === connector.type + ? { ...c, status: "not_connected", connectionId: undefined } + : c + ) + ); + } + + return { previousConnectors }; + }, + onError: (err, connector, context) => { + // If the mutation fails, use the context returned from onMutate to roll back + if (context?.previousConnectors) { + queryClient.setQueryData(["connectors"], context.previousConnectors); + } + toast.error(`Failed to disconnect ${connector.name}: ${err.message}`); + }, + onSuccess: (_, connector) => { + toast.success(`${connector.name} disconnected`); + }, + onSettled: () => { + // Always refetch after error or success to ensure we have the correct data + queryClient.invalidateQueries({ queryKey: ["connectors"] }); + }, + }); +}; diff --git a/frontend/app/api/mutations/useOnboardingMutation.ts b/frontend/app/api/mutations/useOnboardingMutation.ts index 911e196ca..4672f4b48 100644 --- a/frontend/app/api/mutations/useOnboardingMutation.ts +++ b/frontend/app/api/mutations/useOnboardingMutation.ts @@ -21,9 +21,6 @@ export interface OnboardingVariables { watsonx_endpoint?: string; watsonx_project_id?: string; ollama_endpoint?: string; - - // Sample data - sample_data?: boolean; } interface OnboardingResponse { diff --git a/frontend/app/api/mutations/useSyncConnector.ts b/frontend/app/api/mutations/useSyncConnector.ts index dc3d48b67..9ff22e47e 100644 --- a/frontend/app/api/mutations/useSyncConnector.ts +++ b/frontend/app/api/mutations/useSyncConnector.ts @@ -31,13 +31,30 @@ const syncAllConnectors = async (): Promise => { }; // Sync a specific connector type -const syncConnector = async (connectorType: string): Promise => { +const syncConnector = async ({ + connectorType, + body, +}: { + connectorType: string; + body?: { + connection_id?: string; + max_files?: number; + selected_files?: Array<{ + id: string; + name: string; + mimeType: string; + downloadUrl?: string; + size?: number; + }>; + settings?: any; + }; +}): Promise => { const response = await fetch(`/api/connectors/${connectorType}/sync`, { method: "POST", headers: { "Content-Type": "application/json", }, - body: JSON.stringify({}), + body: JSON.stringify(body || {}), }); if (!response.ok) { diff --git a/frontend/app/api/queries/useGetConnectorTokenQuery.ts b/frontend/app/api/queries/useGetConnectorTokenQuery.ts new file mode 100644 index 000000000..effd76e87 --- /dev/null +++ b/frontend/app/api/queries/useGetConnectorTokenQuery.ts @@ -0,0 +1,46 @@ +import { useQuery, type UseQueryOptions } from "@tanstack/react-query"; + +interface TokenResponse { + access_token: string; + expires_in?: number; + token_type?: string; + error?: string; +} + +export const useGetConnectorTokenQuery = ( + { + connectorType, + connectionId, + resource, + }: { + connectorType: string; + connectionId: string | undefined; + resource?: string; + }, + options?: Omit, "queryKey" | "queryFn">, +) => { + return useQuery({ + queryKey: ["connector-token", connectorType, connectionId, resource], + queryFn: async (): Promise => { + if (!connectionId) { + throw new Error("Connection ID is required for fetching token"); + } + + let url = `/api/connectors/${connectorType}/token?connection_id=${connectionId}`; + if (resource) { + url += `&resource=${encodeURIComponent(resource)}`; + } + + const response = await fetch(url); + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + throw new Error(errorData.error || "Failed to fetch access token"); + } + + return response.json(); + }, + enabled: !!connectorType && !!connectionId && (options?.enabled ?? true), + staleTime: 1000 * 60 * 5, // 5 minutes + ...options, + }); +}; diff --git a/frontend/app/api/queries/useGetConnectorsQuery.ts b/frontend/app/api/queries/useGetConnectorsQuery.ts new file mode 100644 index 000000000..8638fa448 --- /dev/null +++ b/frontend/app/api/queries/useGetConnectorsQuery.ts @@ -0,0 +1,118 @@ +import { type UseQueryOptions, useQuery } from "@tanstack/react-query"; + +interface GoogleDriveFile { + id: string; + name: string; + mimeType: string; + webViewLink?: string; + iconLink?: string; +} + +interface OneDriveFile { + id: string; + name: string; + mimeType?: string; + webUrl?: string; + driveItem?: { + file?: { mimeType: string }; + folder?: unknown; + }; +} + +export interface Connector { + id: string; + name: string; + description: string; + icon: string; // The icon name from the API + status: "not_connected" | "connected" | "error"; + type: string; + connectionId?: string; + clientId?: string; + baseUrl?: string; + access_token?: string; + selectedFiles?: GoogleDriveFile[] | OneDriveFile[]; + available?: boolean; +} + +interface Connection { + connection_id: string; + is_active: boolean; + is_authenticated?: boolean; + created_at: string; + last_sync?: string; + client_id?: string; + base_url?: string; +} + +export interface GetConnectorsResponse { + connectors: Connector[]; +} + +export const useGetConnectorsQuery = ( + options?: Omit, "queryKey" | "queryFn">, +) => { + async function getConnectors(): Promise { + const connectorsResponse = await fetch("/api/connectors"); + if (!connectorsResponse.ok) { + throw new Error("Failed to fetch available connectors"); + } + + const { connectors: connectorsMap } = await connectorsResponse.json(); + const connectorTypes = Object.keys(connectorsMap); + + const connectorsWithStatus = await Promise.all( + connectorTypes.map(async (type) => { + const connectorData = connectorsMap[type]; + const statusResponse = await fetch(`/api/connectors/${type}/status`); + + let status: Connector["status"] = "not_connected"; + let connectionId: string | undefined; + + if (statusResponse.ok) { + const statusData = await statusResponse.json(); + const connections = statusData.connections || []; + const activeConnection = connections.find( + (conn: Connection) => conn.is_active && conn.is_authenticated, + ); + + if (activeConnection) { + status = "connected"; + connectionId = activeConnection.connection_id; + return { + id: type, + name: connectorData.name, + description: connectorData.description, + icon: connectorData.icon, + status, + type, + connectionId, + clientId: activeConnection.client_id, + baseUrl: activeConnection.base_url, + available: connectorData.available, + } as Connector; + } + } + + return { + id: type, + name: connectorData.name, + description: connectorData.description, + icon: connectorData.icon, + status, + type, + connectionId, + available: connectorData.available, + } as Connector; + }), + ); + + return connectorsWithStatus; + } + + return useQuery({ + queryKey: ["connectors"], + queryFn: getConnectors, + refetchOnMount: "always", + ...options, + }); +}; diff --git a/frontend/app/auth/callback/page.tsx b/frontend/app/auth/callback/page.tsx index 193873c50..55cd3a4d3 100644 --- a/frontend/app/auth/callback/page.tsx +++ b/frontend/app/auth/callback/page.tsx @@ -50,7 +50,9 @@ function AuthCallbackContent() { // Determine purpose - default to app_auth for login, data_source for connectors const detectedPurpose = authPurpose || - (storedConnectorType?.includes("drive") ? "data_source" : "app_auth"); + (storedConnectorType && storedConnectorType !== "app_auth" + ? "data_source" + : "app_auth"); setPurpose(detectedPurpose); // Debug logging diff --git a/frontend/app/knowledge/chunks/page.tsx b/frontend/app/knowledge/chunks/page.tsx index 9b3122bab..a29472434 100644 --- a/frontend/app/knowledge/chunks/page.tsx +++ b/frontend/app/knowledge/chunks/page.tsx @@ -16,6 +16,7 @@ import { // import { Label } from "@/components/ui/label"; // import { Checkbox } from "@/components/ui/checkbox"; import { KnowledgeSearchInput } from "@/components/knowledge-search-input"; +import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; const getFileTypeLabel = (mimetype: string) => { if (mimetype === "application/pdf") return "PDF"; @@ -42,7 +43,7 @@ function ChunksPageContent() { const averageChunkLength = useMemo( () => chunks.reduce((acc, chunk) => acc + chunk.text.length, 0) / - chunks.length || 0, + chunks.length || 0, [chunks], ); @@ -355,18 +356,25 @@ function ChunksPageContent() { Allowed users
-
+
{fileData.allowed_users.map((user, idx) => (
- + {user?.charAt(0).toUpperCase()} - {user} + + + {user} + + + {user} + +
))}
diff --git a/frontend/app/onboarding/_components/advanced.tsx b/frontend/app/onboarding/_components/advanced.tsx index b73993b79..39bab81e5 100644 --- a/frontend/app/onboarding/_components/advanced.tsx +++ b/frontend/app/onboarding/_components/advanced.tsx @@ -15,8 +15,6 @@ export function AdvancedOnboarding({ embeddingModel, setLanguageModel, setEmbeddingModel, - sampleDataset, - setSampleDataset, }: { icon?: React.ReactNode; languageModels?: { value: string; label: string }[]; @@ -25,8 +23,6 @@ export function AdvancedOnboarding({ embeddingModel?: string; setLanguageModel?: (model: string) => void; setEmbeddingModel?: (model: string) => void; - sampleDataset: boolean; - setSampleDataset: (dataset: boolean) => void; }) { const hasEmbeddingModels = embeddingModels !== undefined && diff --git a/frontend/app/onboarding/_components/anthropic-onboarding.tsx b/frontend/app/onboarding/_components/anthropic-onboarding.tsx index d58fc1ba1..6d917c134 100644 --- a/frontend/app/onboarding/_components/anthropic-onboarding.tsx +++ b/frontend/app/onboarding/_components/anthropic-onboarding.tsx @@ -18,15 +18,11 @@ import { AdvancedOnboarding } from "./advanced"; export function AnthropicOnboarding({ setSettings, - sampleDataset, - setSampleDataset, setIsLoadingModels, isEmbedding = false, hasEnvApiKey = false, }: { setSettings: Dispatch>; - sampleDataset: boolean; - setSampleDataset: (dataset: boolean) => void; setIsLoadingModels?: (isLoading: boolean) => void; isEmbedding?: boolean; hasEnvApiKey?: boolean; @@ -57,9 +53,6 @@ export function AnthropicOnboarding({ languageModels, embeddingModels, } = useModelSelection(modelsData, isEmbedding); - const handleSampleDatasetChange = (dataset: boolean) => { - setSampleDataset(dataset); - }; const handleGetFromEnvChange = (fromEnv: boolean) => { setGetFromEnv(fromEnv); @@ -144,9 +137,7 @@ export function AnthropicOnboarding({ embeddingModels={embeddingModels} languageModel={languageModel} embeddingModel={embeddingModel} - sampleDataset={sampleDataset} setLanguageModel={setLanguageModel} - setSampleDataset={handleSampleDatasetChange} setEmbeddingModel={setEmbeddingModel} /> diff --git a/frontend/app/onboarding/_components/ibm-onboarding.tsx b/frontend/app/onboarding/_components/ibm-onboarding.tsx index 023b3467b..ebf455b83 100644 --- a/frontend/app/onboarding/_components/ibm-onboarding.tsx +++ b/frontend/app/onboarding/_components/ibm-onboarding.tsx @@ -20,8 +20,6 @@ import { ModelSelector } from "./model-selector"; export function IBMOnboarding({ isEmbedding = false, setSettings, - sampleDataset, - setSampleDataset, setIsLoadingModels, alreadyConfigured = false, existingEndpoint, @@ -30,8 +28,6 @@ export function IBMOnboarding({ }: { isEmbedding?: boolean; setSettings: Dispatch>; - sampleDataset: boolean; - setSampleDataset: (dataset: boolean) => void; setIsLoadingModels?: (isLoading: boolean) => void; alreadyConfigured?: boolean; existingEndpoint?: string; @@ -125,10 +121,6 @@ export function IBMOnboarding({ setLanguageModel?.(""); }; - const handleSampleDatasetChange = (dataset: boolean) => { - setSampleDataset(dataset); - }; - useEffect(() => { setIsLoadingModels?.(isLoadingModels); }, [isLoadingModels, setIsLoadingModels]); @@ -161,7 +153,7 @@ export function IBMOnboarding({ options={alreadyConfigured ? [] : options} value={endpoint} custom - onValueChange={alreadyConfigured ? () => {} : setEndpoint} + onValueChange={alreadyConfigured ? () => { } : setEndpoint} searchPlaceholder="Search endpoint..." noOptionsPlaceholder={ alreadyConfigured @@ -280,10 +272,8 @@ export function IBMOnboarding({ embeddingModels={embeddingModels} languageModel={languageModel} embeddingModel={embeddingModel} - sampleDataset={sampleDataset} setLanguageModel={setLanguageModel} setEmbeddingModel={setEmbeddingModel} - setSampleDataset={handleSampleDatasetChange} /> ); diff --git a/frontend/app/onboarding/_components/ollama-onboarding.tsx b/frontend/app/onboarding/_components/ollama-onboarding.tsx index d5c7428c0..bcaff27ac 100644 --- a/frontend/app/onboarding/_components/ollama-onboarding.tsx +++ b/frontend/app/onboarding/_components/ollama-onboarding.tsx @@ -12,16 +12,12 @@ import { ModelSelector } from "./model-selector"; export function OllamaOnboarding({ setSettings, - sampleDataset, - setSampleDataset, setIsLoadingModels, isEmbedding = false, alreadyConfigured = false, existingEndpoint, }: { setSettings: Dispatch>; - sampleDataset: boolean; - setSampleDataset: (dataset: boolean) => void; setIsLoadingModels?: (isLoading: boolean) => void; isEmbedding?: boolean; alreadyConfigured?: boolean; diff --git a/frontend/app/onboarding/_components/onboarding-card.tsx b/frontend/app/onboarding/_components/onboarding-card.tsx index 5324e0504..0371a6f6d 100644 --- a/frontend/app/onboarding/_components/onboarding-card.tsx +++ b/frontend/app/onboarding/_components/onboarding-card.tsx @@ -10,7 +10,6 @@ import { useOnboardingMutation, } from "@/app/api/mutations/useOnboardingMutation"; import { useOnboardingRollbackMutation } from "@/app/api/mutations/useOnboardingRollbackMutation"; -import { useUpdateOnboardingStateMutation } from "@/app/api/mutations/useUpdateOnboardingStateMutation"; import { useGetSettingsQuery } from "@/app/api/queries/useGetSettingsQuery"; import { useGetTasksQuery } from "@/app/api/queries/useGetTasksQuery"; import type { ProviderHealthResponse } from "@/app/api/queries/useProviderHealthQuery"; @@ -66,8 +65,6 @@ const OnboardingCard = ({ isEmbedding ? "openai" : "anthropic", ); - const [sampleDataset, setSampleDataset] = useState(true); - const [isLoadingModels, setIsLoadingModels] = useState(false); const queryClient = useQueryClient(); @@ -211,20 +208,20 @@ const OnboardingCard = ({ task.status === "processing", ); - // Check if any file failed in completed tasks - const completedTasks = tasks.filter( - (task) => task.status === "completed" + // Check if any task failed at the top level + const failedTask = tasks.find( + (task) => task.status === "failed" || task.status === "error", ); // Check if any completed task has at least one failed file - const taskWithFailedFile = completedTasks.find((task) => { + const completedTaskWithFailedFile = tasks.find((task) => { // Must have files object if (!task.files || typeof task.files !== "object") { return false; } const fileEntries = Object.values(task.files); - + // Must have at least one file if (fileEntries.length === 0) { return false; @@ -232,65 +229,69 @@ const OnboardingCard = ({ // Check if any file has failed status const hasFailedFile = fileEntries.some( - (file) => file.status === "failed" || file.status === "error" + (file) => file.status === "failed" || file.status === "error", ); return hasFailedFile; }); + const taskWithFailure = failedTask || completedTaskWithFailedFile; + // If any file failed, show error and jump back one step (like onboardingMutation.onError) // Only handle if we haven't already handled this task if ( - taskWithFailedFile && - !rollbackMutation.isPending && + taskWithFailure && + !rollbackMutation.isPending && !isCompleted && - !handledFailedTasksRef.current.has(taskWithFailedFile.task_id) + !handledFailedTasksRef.current.has(taskWithFailure.task_id) ) { - console.error("File failed in task, jumping back one step", taskWithFailedFile); - + console.error( + "Task failed, jumping back one step", + taskWithFailure, + ); + // Mark this task as handled to prevent infinite loops - handledFailedTasksRef.current.add(taskWithFailedFile.task_id); - + handledFailedTasksRef.current.add(taskWithFailure.task_id); + // Extract error messages from failed files const errorMessages: string[] = []; - if (taskWithFailedFile.files) { - Object.values(taskWithFailedFile.files).forEach((file) => { - if ((file.status === "failed" || file.status === "error") && file.error) { + if (taskWithFailure.files) { + Object.values(taskWithFailure.files).forEach((file) => { + if ( + (file.status === "failed" || file.status === "error") && + file.error + ) { errorMessages.push(file.error); } }); } - + // Also check task-level error - if (taskWithFailedFile.error) { - errorMessages.push(taskWithFailedFile.error); + if (taskWithFailure.error) { + errorMessages.push(taskWithFailure.error); } - + // Use the first error message, or a generic message if no errors found - const errorMessage = errorMessages.length > 0 - ? errorMessages[0] - : "Sample data file failed to ingest. Please try again with a different configuration."; - + const errorMessage = + errorMessages.length > 0 + ? errorMessages[0] + : "Sample data ingestion failed. Please try again."; + // Set error message and jump back one step (exactly like onboardingMutation.onError) setError(errorMessage); setCurrentStep(totalSteps); - // Jump back one step after 1 second (go back to the step before ingestion) - // For embedding: totalSteps is 4, ingestion is step 3, so go back to step 2 - // For LLM: totalSteps is 3, ingestion is step 2, so go back to step 1 - setTimeout(() => { - // Go back to the step before the last step (which is ingestion) - const previousStep = totalSteps > 1 ? totalSteps - 2 : 0; - setCurrentStep(previousStep); - }, 1000); + rollbackMutation.mutate(); return; } - // If no active tasks and we've started onboarding, complete it + // If at least one processed file, no failures, and we've started onboarding, complete it if ( - (!activeTasks || (activeTasks.processed_files ?? 0) > 0) && - tasks.length > 0 && + (((!activeTasks || (activeTasks.successful_files ?? 0) > 0) && + tasks.length > 0) || + (tasks.length === 0 && currentStep === totalSteps - 1)) && // Complete because no files were ingested !isCompleted && - !taskWithFailedFile + !taskWithFailure + ) { // Set to final step to show "Done" setCurrentStep(totalSteps); @@ -329,10 +330,7 @@ const OnboardingCard = ({ onError: (error) => { setError(error.message); setCurrentStep(totalSteps); - // Reset to provider selection after 1 second - setTimeout(() => { - setCurrentStep(null); - }, 1000); + rollbackMutation.mutate(); }, }); @@ -357,7 +355,6 @@ const OnboardingCard = ({ // Prepare onboarding data with provider-specific fields const onboardingData: OnboardingVariables = { - sample_data: sampleDataset, }; // Set the provider field @@ -447,8 +444,8 @@ const OnboardingCard = ({ value="anthropic" className={cn( error && - modelProvider === "anthropic" && - "data-[state=active]:border-destructive", + modelProvider === "anthropic" && + "data-[state=active]:border-destructive", )} > diff --git a/frontend/app/onboarding/_components/openai-onboarding.tsx b/frontend/app/onboarding/_components/openai-onboarding.tsx index d7272cde5..be3996901 100644 --- a/frontend/app/onboarding/_components/openai-onboarding.tsx +++ b/frontend/app/onboarding/_components/openai-onboarding.tsx @@ -18,16 +18,12 @@ import { AdvancedOnboarding } from "./advanced"; export function OpenAIOnboarding({ setSettings, - sampleDataset, - setSampleDataset, setIsLoadingModels, isEmbedding = false, hasEnvApiKey = false, alreadyConfigured = false, }: { setSettings: Dispatch>; - sampleDataset: boolean; - setSampleDataset: (dataset: boolean) => void; setIsLoadingModels?: (isLoading: boolean) => void; isEmbedding?: boolean; hasEnvApiKey?: boolean; @@ -48,8 +44,8 @@ export function OpenAIOnboarding({ getFromEnv ? { apiKey: "" } : debouncedApiKey - ? { apiKey: debouncedApiKey } - : undefined, + ? { apiKey: debouncedApiKey } + : undefined, { // Only validate when the user opts in (env) or provides a key. // If a key was previously configured, let the user decide to reuse or replace it @@ -66,9 +62,6 @@ export function OpenAIOnboarding({ languageModels, embeddingModels, } = useModelSelection(modelsData, isEmbedding); - const handleSampleDatasetChange = (dataset: boolean) => { - setSampleDataset(dataset); - }; const handleGetFromEnvChange = (fromEnv: boolean) => { setGetFromEnv(fromEnv); @@ -167,9 +160,7 @@ export function OpenAIOnboarding({ embeddingModels={embeddingModels} languageModel={languageModel} embeddingModel={embeddingModel} - sampleDataset={sampleDataset} setLanguageModel={setLanguageModel} - setSampleDataset={handleSampleDatasetChange} setEmbeddingModel={setEmbeddingModel} /> diff --git a/frontend/app/settings/_components/connector-cards.tsx b/frontend/app/settings/_components/connector-cards.tsx new file mode 100644 index 000000000..9539b42ac --- /dev/null +++ b/frontend/app/settings/_components/connector-cards.tsx @@ -0,0 +1,245 @@ +"use client"; + +import { useConnectConnectorMutation } from "@/app/api/mutations/useConnectConnectorMutation"; +import { useDisconnectConnectorMutation } from "@/app/api/mutations/useDisconnectConnectorMutation"; +import { useGetConnectorsQuery, type Connector as QueryConnector } from "@/app/api/queries/useGetConnectorsQuery"; +import GoogleDriveIcon from "@/components/icons/google-drive-logo"; +import OneDriveIcon from "@/components/icons/one-drive-logo"; +import SharePointIcon from "@/components/icons/share-point-logo"; +import { Button } from "@/components/ui/button"; +import { + Card, + CardContent, + CardDescription, + CardHeader, + CardTitle, +} from "@/components/ui/card"; +import { useAuth } from "@/contexts/auth-context"; +import { cn } from "@/lib/utils"; +import { Loader2, PlugZap, Plus, RefreshCcw, Trash2 } from "lucide-react"; +import Link from "next/link"; +import { useRouter } from "next/navigation"; +import { useCallback, useState } from "react"; +import ConnectorsSkeleton from "./connectors-skeleton"; + +interface SyncResult { + processed?: number; + added?: number; + errors?: number; + skipped?: number; + total?: number; +} + +interface Connector extends Omit { + icon: React.ReactNode; +} + +export default function ConnectorCards() { + const { isAuthenticated, isNoAuthMode } = useAuth(); + const router = useRouter(); + + const { data: queryConnectors = [], isLoading: connectorsLoading } = useGetConnectorsQuery({ + enabled: isAuthenticated || isNoAuthMode, + }); + + const connectMutation = useConnectConnectorMutation(); + const disconnectMutation = useDisconnectConnectorMutation(); + + const getConnectorIcon = useCallback((iconName: string) => { + const iconMap: { [key: string]: React.ReactElement } = { + "google-drive": , + sharepoint: , + onedrive: , + }; + return ( + iconMap[iconName] || ( +
+ ? +
+ ) + ); + }, []); + + const connectors = queryConnectors.map((c) => ({ + ...c, + icon: getConnectorIcon(c.icon), + })) as Connector[]; + + const handleConnect = async (connector: Connector) => { + connectMutation.mutate({ + connector: connector as unknown as QueryConnector, + redirectUri: `${window.location.origin}/auth/callback`, + }); + }; + + const handleDisconnect = async (connector: Connector) => { + disconnectMutation.mutate(connector as unknown as QueryConnector); + }; + + const navigateToKnowledgePage = (connector: Connector) => { + const provider = connector.type.replace(/-/g, "_"); + router.push(`/upload/${provider}`); + }; + + if (!connectorsLoading && connectors.length === 0) { + return null; + } + + return ( +
+ {connectorsLoading ? ( + <> + + + + + ) : ( + connectors.map((connector) => ( + + +
+
+
+
+ {connector.icon} +
+
+ + {connector.name} + + + {connector?.available + ? `${connector.name} is configured.` + : "Not configured."} + +
+
+
+ + {connector?.available ? ( +
+ {connector?.status === "connected" && connector?.connectionId ? ( + <> +
+ + + +
+ + ) : ( + + )} +
+ ) : ( +
+

+ See our{" "} + + Cloud Connectors installation guide + {" "} + for more detail. +

+
+ )} +
+
+ )) + )} +
+ ); +} diff --git a/frontend/app/settings/_components/connectors-skeleton.tsx b/frontend/app/settings/_components/connectors-skeleton.tsx new file mode 100644 index 000000000..838906135 --- /dev/null +++ b/frontend/app/settings/_components/connectors-skeleton.tsx @@ -0,0 +1,27 @@ +import { Card, CardContent, CardHeader } from "@/components/ui/card"; +import { Skeleton } from "@/components/ui/skeleton"; + +export default function ConnectorsSkeleton() { + return ( + + +
+
+
+ +
+ + +
+
+
+ +
+ + + +
+
+
+ ); +} diff --git a/frontend/app/settings/page.tsx b/frontend/app/settings/page.tsx index 01bc2e6cf..a6e9d5296 100644 --- a/frontend/app/settings/page.tsx +++ b/frontend/app/settings/page.tsx @@ -1,9 +1,9 @@ "use client"; -import { ArrowUpRight, Copy, Key, Loader2, Minus, PlugZap, Plus, RefreshCcw, Trash2 } from "lucide-react"; +import { ArrowUpRight, Copy, Key, Loader2, Minus, Plus, Trash2 } from "lucide-react"; import Link from "next/link"; import { useRouter, useSearchParams } from "next/navigation"; -import { Suspense, useCallback, useEffect, useState } from "react"; +import { Suspense, useEffect, useState } from "react"; import { toast } from "sonner"; import { useGetAnthropicModelsQuery, @@ -47,62 +47,15 @@ import { } from "@/lib/constants"; import { useDebounce } from "@/lib/debounce"; import { cn } from "@/lib/utils"; -import GoogleDriveIcon from "../../components/icons/google-drive-logo"; -import OneDriveIcon from "../../components/icons/one-drive-logo"; -import SharePointIcon from "../../components/icons/share-point-logo"; import { useUpdateSettingsMutation } from "../api/mutations/useUpdateSettingsMutation"; import { ModelSelector } from "../onboarding/_components/model-selector"; import ModelProviders from "./_components/model-providers"; +import ConnectorCards from "./_components/connector-cards"; import { getModelLogo, type ModelProvider } from "./_helpers/model-helpers"; const { MAX_SYSTEM_PROMPT_CHARS } = UI_CONSTANTS; -interface GoogleDriveFile { - id: string; - name: string; - mimeType: string; - webViewLink?: string; - iconLink?: string; -} - -interface OneDriveFile { - id: string; - name: string; - mimeType?: string; - webUrl?: string; - driveItem?: { - file?: { mimeType: string }; - folder?: unknown; - }; -} - -interface Connector { - id: string; - name: string; - description: string; - icon: React.ReactNode; - status: "not_connected" | "connecting" | "connected" | "error"; - type: string; - connectionId?: string; - access_token?: string; - selectedFiles?: GoogleDriveFile[] | OneDriveFile[]; - available?: boolean; -} - -interface SyncResult { - processed?: number; - added?: number; - errors?: number; - skipped?: number; - total?: number; -} -interface Connection { - connection_id: string; - is_active: boolean; - created_at: string; - last_sync?: string; -} function KnowledgeSourcesPage() { const { isAuthenticated, isNoAuthMode } = useAuth(); const { addTask, tasks } = useTask(); @@ -114,16 +67,12 @@ function KnowledgeSourcesPage() { // Use a trigger state that changes each time we detect the query param const [openLlmSelector, setOpenLlmSelector] = useState(false); - // Connectors state - const [connectors, setConnectors] = useState([]); - const [isConnecting, setIsConnecting] = useState(null); - const [isDisconnecting, setIsDisconnecting] = useState(null); - const [isSyncing, setIsSyncing] = useState(null); - const [syncResults, setSyncResults] = useState<{ - [key: string]: SyncResult | null; - }>({}); - const [maxFiles, setMaxFiles] = useState(10); - const [syncAllFiles, setSyncAllFiles] = useState(false); + // API Keys state + const [createKeyDialogOpen, setCreateKeyDialogOpen] = useState(false); + const [newKeyName, setNewKeyName] = useState(""); + const [newlyCreatedKey, setNewlyCreatedKey] = useState(null); + const [showKeyDialogOpen, setShowKeyDialogOpen] = useState(false); + // Only keep systemPrompt state since it needs manual save button const [systemPrompt, setSystemPrompt] = useState(""); @@ -134,12 +83,6 @@ function KnowledgeSourcesPage() { const [pictureDescriptions, setPictureDescriptions] = useState(false); - // API Keys state - const [createKeyDialogOpen, setCreateKeyDialogOpen] = useState(false); - const [newKeyName, setNewKeyName] = useState(""); - const [newlyCreatedKey, setNewlyCreatedKey] = useState(null); - const [showKeyDialogOpen, setShowKeyDialogOpen] = useState(false); - // Fetch settings using React Query const { data: settings = {} } = useGetSettingsQuery({ enabled: isAuthenticated || isNoAuthMode, @@ -463,239 +406,19 @@ function KnowledgeSourcesPage() { }); }; - // Helper function to get connector icon - const getConnectorIcon = useCallback((iconName: string) => { - const iconMap: { [key: string]: React.ReactElement } = { - "google-drive": , - sharepoint: , - onedrive: , - }; - return ( - iconMap[iconName] || ( -
- ? -
- ) - ); - }, []); - - // Connector functions - const checkConnectorStatuses = useCallback(async () => { - try { - // Fetch available connectors from backend - const connectorsResponse = await fetch("/api/connectors"); - if (!connectorsResponse.ok) { - throw new Error("Failed to load connectors"); - } - const connectorsResult = await connectorsResponse.json(); - const connectorTypes = Object.keys(connectorsResult.connectors); - // Initialize connectors list with metadata from backend - const initialConnectors = connectorTypes - // .filter((type) => connectorsResult.connectors[type].available) // Only show available connectors - .map((type) => ({ - id: type, - name: connectorsResult.connectors[type].name, - description: connectorsResult.connectors[type].description, - icon: getConnectorIcon(connectorsResult.connectors[type].icon), - status: "not_connected" as const, - type: type, - available: connectorsResult.connectors[type].available, - })); - setConnectors(initialConnectors); - // Check status for each connector type - - for (const connectorType of connectorTypes) { - const response = await fetch(`/api/connectors/${connectorType}/status`); - if (response.ok) { - const data = await response.json(); - const connections = data.connections || []; - // Find a connection that is both active AND authenticated - const activeConnection = connections.find( - (conn: Connection & { is_authenticated?: boolean }) => - conn.is_active && conn.is_authenticated, - ); - const isConnected = activeConnection !== undefined; - - setConnectors((prev) => - prev.map((c) => - c.type === connectorType - ? { - ...c, - status: isConnected ? "connected" : "not_connected", - connectionId: activeConnection?.connection_id, - } - : c, - ), - ); - } - } - } catch (error) { - console.error("Failed to check connector statuses:", error); - } - }, [getConnectorIcon]); - - const handleConnect = async (connector: Connector) => { - setIsConnecting(connector.id); - setSyncResults((prev) => ({ ...prev, [connector.id]: null })); - - try { - // Use the shared auth callback URL, same as connectors page - const redirectUri = `${window.location.origin}/auth/callback`; - - const response = await fetch("/api/auth/init", { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ - connector_type: connector.type, - purpose: "data_source", - name: `${connector.name} Connection`, - redirect_uri: redirectUri, - }), - }); - - if (response.ok) { - const result = await response.json(); - - if (result.oauth_config) { - localStorage.setItem("connecting_connector_id", result.connection_id); - localStorage.setItem("connecting_connector_type", connector.type); - - const authUrl = - `${result.oauth_config.authorization_endpoint}?` + - `client_id=${result.oauth_config.client_id}&` + - `response_type=code&` + - `scope=${result.oauth_config.scopes.join(" ")}&` + - `redirect_uri=${encodeURIComponent( - result.oauth_config.redirect_uri, - )}&` + - `access_type=offline&` + - `prompt=select_account&` + - `state=${result.connection_id}`; - - window.location.href = authUrl; - } - } else { - console.error("Failed to initiate connection"); - setIsConnecting(null); - } - } catch (error) { - console.error("Connection error:", error); - setIsConnecting(null); - } - }; - - const handleDisconnect = async (connector: Connector) => { - setIsDisconnecting(connector.id); - - try { - const response = await fetch(`/api/connectors/${connector.type}/disconnect`, { - method: "DELETE", - }); - - if (response.ok) { - // Update the connector status locally - setConnectors((prev) => - prev.map((c) => - c.type === connector.type - ? { - ...c, - status: "not_connected", - connectionId: undefined, - } - : c, - ), - ); - setSyncResults((prev) => ({ ...prev, [connector.id]: null })); - toast.success(`${connector.name} disconnected`); - } else { - const result = await response.json(); - console.error("Failed to disconnect:", result.error); - toast.error(`Failed to disconnect ${connector.name}`); - } - } catch (error) { - console.error("Disconnect error:", error); - toast.error(`Failed to disconnect ${connector.name}`); - } finally { - setIsDisconnecting(null); - } - }; - - // const handleSync = async (connector: Connector) => { - // if (!connector.connectionId) return; - - // setIsSyncing(connector.id); - // setSyncResults(prev => ({ ...prev, [connector.id]: null })); - - // try { - // const syncBody: { - // connection_id: string; - // max_files?: number; - // selected_files?: string[]; - // } = { - // connection_id: connector.connectionId, - // max_files: syncAllFiles ? 0 : maxFiles || undefined, - // }; - - // // Note: File selection is now handled via the cloud connectors dialog - - // const response = await fetch(`/api/connectors/${connector.type}/sync`, { - // method: "POST", - // headers: { - // "Content-Type": "application/json", - // }, - // body: JSON.stringify(syncBody), - // }); - - // const result = await response.json(); - - // if (response.status === 201) { - // const taskId = result.task_id; - // if (taskId) { - // addTask(taskId); - // setSyncResults(prev => ({ - // ...prev, - // [connector.id]: { - // processed: 0, - // total: result.total_files || 0, - // }, - // })); - // } - // } else if (response.ok) { - // setSyncResults(prev => ({ ...prev, [connector.id]: result })); - // // Note: Stats will auto-refresh via task completion watcher for async syncs - // } else { - // console.error("Sync failed:", result.error); - // } - // } catch (error) { - // console.error("Sync error:", error); - // } finally { - // setIsSyncing(null); - // } - // }; - - const navigateToKnowledgePage = (connector: Connector) => { - const provider = connector.type.replace(/-/g, "_"); - router.push(`/upload/${provider}`); - }; // Check connector status on mount and when returning from OAuth useEffect(() => { - if (isAuthenticated) { - checkConnectorStatuses(); - } - if (searchParams.get("oauth_success") === "true") { const url = new URL(window.location.href); url.searchParams.delete("oauth_success"); window.history.replaceState({}, "", url.toString()); } - }, [searchParams, isAuthenticated, checkConnectorStatuses]); + }, [searchParams]); // Track previous tasks to detect new completions const [prevTasks, setPrevTasks] = useState([]); @@ -916,138 +639,7 @@ function KnowledgeSourcesPage() { //
} {/* Connectors Grid */} -
- {connectors.map((connector) => { - return ( - - -
-
-
-
- {connector.icon} -
-
- - {connector.name} - - - {connector?.available - ? `${connector.name} is configured.` - : "Not configured."} - -
-
-
- - {connector?.available ? ( -
- {connector?.status === "connected" && connector?.connectionId ? ( - <> -
- - - -
- {syncResults[connector.id] && ( -
-
- Processed:{" "} - {syncResults[connector.id]?.processed || 0} -
-
- Added: {syncResults[connector.id]?.added || 0} -
- {syncResults[connector.id]?.errors && ( -
- Errors: {syncResults[connector.id]?.errors} -
- )} -
- )} - - ) : ( - - )} -
- ) : ( -
-

- See our{" "} - - Cloud Connectors installation guide - {" "} - for more detail. -

-
- )} -
-
- ); - })} -
+ {/* Model Providers Section */} diff --git a/frontend/app/upload/[provider]/page.tsx b/frontend/app/upload/[provider]/page.tsx index 56f881f32..111d88ac6 100644 --- a/frontend/app/upload/[provider]/page.tsx +++ b/frontend/app/upload/[provider]/page.tsx @@ -13,20 +13,11 @@ import { TooltipTrigger, } from "@/components/ui/tooltip"; -// CloudFile interface is now imported from the unified cloud picker +import { useSyncConnector } from "@/app/api/mutations/useSyncConnector"; +import { useGetConnectorsQuery } from "@/app/api/queries/useGetConnectorsQuery"; +import { useGetConnectorTokenQuery } from "@/app/api/queries/useGetConnectorTokenQuery"; -interface CloudConnector { - id: string; - name: string; - description: string; - status: "not_connected" | "connecting" | "connected" | "error"; - type: string; - connectionId?: string; - clientId: string; - hasAccessToken: boolean; - accessTokenError?: string; - baseUrl?: string; -} +// CloudFile interface is now imported from the unified cloud picker export default function UploadProviderPage() { const params = useParams(); @@ -34,12 +25,24 @@ export default function UploadProviderPage() { const provider = params.provider as string; const { addTask, tasks } = useTask(); - const [connector, setConnector] = useState(null); - const [isLoading, setIsLoading] = useState(true); - const [error, setError] = useState(null); - const [accessToken, setAccessToken] = useState(null); + const { data: connectors = [], isLoading: connectorsLoading, error: connectorsError } = useGetConnectorsQuery(); + const connector = connectors.find((c) => c.type === provider); + + const { data: tokenData, isLoading: tokenLoading } = useGetConnectorTokenQuery( + { + connectorType: provider, + connectionId: connector?.connectionId, + resource: + provider === "sharepoint" ? (connector?.baseUrl as string) : undefined, + }, + { + enabled: !!connector && connector.status === "connected", + }, + ); + + const syncMutation = useSyncConnector(); + const [selectedFiles, setSelectedFiles] = useState([]); - const [isIngesting, setIsIngesting] = useState(false); const [currentSyncTaskId, setCurrentSyncTaskId] = useState( null, ); @@ -51,124 +54,17 @@ export default function UploadProviderPage() { embeddingModel: "text-embedding-3-small", }); - useEffect(() => { - const fetchConnectorInfo = async () => { - setIsLoading(true); - setError(null); - - try { - // Fetch available connectors to validate the provider - const connectorsResponse = await fetch("/api/connectors"); - if (!connectorsResponse.ok) { - throw new Error("Failed to load connectors"); - } - - const connectorsResult = await connectorsResponse.json(); - const providerInfo = connectorsResult.connectors[provider]; - - if (!providerInfo || !providerInfo.available) { - setError( - `Cloud provider "${provider}" is not available or configured.`, - ); - return; - } - - // Check connector status - const statusResponse = await fetch( - `/api/connectors/${provider}/status`, - ); - if (!statusResponse.ok) { - throw new Error(`Failed to check ${provider} status`); - } - - const statusData = await statusResponse.json(); - const connections = statusData.connections || []; - const activeConnection = connections.find( - (conn: { is_active: boolean; is_authenticated?: boolean; connection_id: string }) => - conn.is_active && conn.is_authenticated, - ); - const isConnected = activeConnection !== undefined; - - let hasAccessToken = false; - let accessTokenError: string | undefined; - - // Try to get access token for connected connectors - if (isConnected && activeConnection) { - try { - // For SharePoint File Picker v8, we need a token with SharePoint as the audience - // Pass the base_url as the resource parameter to get the correct token - let tokenUrl = `/api/connectors/${provider}/token?connection_id=${activeConnection.connection_id}`; - if (provider === "sharepoint" && activeConnection.base_url) { - tokenUrl += `&resource=${encodeURIComponent(activeConnection.base_url)}`; - } - - const tokenResponse = await fetch(tokenUrl); - if (tokenResponse.ok) { - const tokenData = await tokenResponse.json(); - if (tokenData.access_token) { - hasAccessToken = true; - setAccessToken(tokenData.access_token); - } - } else { - const errorData = await tokenResponse - .json() - .catch(() => ({ error: "Token unavailable" })); - accessTokenError = errorData.error || "Access token unavailable"; - } - } catch { - accessTokenError = "Failed to fetch access token"; - } - } - - setConnector({ - id: provider, - name: providerInfo.name, - description: providerInfo.description, - status: isConnected ? "connected" : "not_connected", - type: provider, - connectionId: activeConnection?.connection_id, - clientId: activeConnection?.client_id, - hasAccessToken, - accessTokenError, - baseUrl: activeConnection?.base_url, - }); - } catch (error) { - console.error("Failed to load connector info:", error); - setError( - error instanceof Error - ? error.message - : "Failed to load connector information", - ); - } finally { - setIsLoading(false); - } - }; + const accessToken = tokenData?.access_token || null; + const isLoading = connectorsLoading || tokenLoading; + const isIngesting = syncMutation.isPending; - if (provider) { - fetchConnectorInfo(); - } - }, [provider]); + // Error handling + const error = connectorsError + ? (connectorsError as Error).message + : !connector && !connectorsLoading + ? `Cloud provider "${provider}" is not available or configured.` + : null; - // Watch for sync task completion and redirect - useEffect(() => { - if (!currentSyncTaskId) return; - - const currentTask = tasks.find( - (task) => task.task_id === currentSyncTaskId, - ); - - if (currentTask && currentTask.status === "completed") { - // Task completed successfully, show toast and redirect - setIsIngesting(false); - setTimeout(() => { - router.push("/knowledge"); - }, 2000); // 2 second delay to let user see toast - } else if (currentTask && currentTask.status === "failed") { - // Task failed, clear the tracking but don't redirect - setIsIngesting(false); - setCurrentSyncTaskId(null); - } - }, [tasks, currentSyncTaskId, router]); const handleFileSelected = (files: CloudFile[]) => { setSelectedFiles(files); @@ -176,61 +72,37 @@ export default function UploadProviderPage() { // You can add additional handling here like triggering sync, etc. }; - const handleSync = async (connector: CloudConnector) => { + const handleSync = async (connector: any) => { if (!connector.connectionId || selectedFiles.length === 0) return; - setIsIngesting(true); - - try { - // Pass full file objects including download URLs for OneDrive/SharePoint - // This allows the backend to use direct download URLs instead of Graph API - const syncBody: { - connection_id: string; - max_files?: number; - selected_files?: Array<{ - id: string; - name: string; - mimeType: string; - downloadUrl?: string; - size?: number; - }>; - settings?: IngestSettings; - } = { - connection_id: connector.connectionId, - selected_files: selectedFiles.map((file) => ({ - id: file.id, - name: file.name, - mimeType: file.mimeType, - downloadUrl: file.downloadUrl, - size: file.size, - })), - settings: ingestSettings, - }; - - const response = await fetch(`/api/connectors/${connector.type}/sync`, { - method: "POST", - headers: { - "Content-Type": "application/json", + syncMutation.mutate( + { + connectorType: connector.type, + body: { + connection_id: connector.connectionId, + selected_files: selectedFiles.map((file) => ({ + id: file.id, + name: file.name, + mimeType: file.mimeType, + downloadUrl: file.downloadUrl, + size: file.size, + })), + settings: ingestSettings, }, - body: JSON.stringify(syncBody), - }); - - const result = await response.json(); - - if (response.status === 201) { - const taskIds = result.task_ids; - if (taskIds && taskIds.length > 0) { - const taskId = taskIds[0]; // Use the first task ID - addTask(taskId); - setCurrentSyncTaskId(taskId); - } - } else { - console.error("Sync failed:", result.error); - } - } catch (error) { - console.error("Sync error:", error); - setIsIngesting(false); - } + }, + { + onSuccess: (result) => { + const taskIds = result.task_ids; + if (taskIds && taskIds.length > 0) { + const taskId = taskIds[0]; // Use the first task ID + addTask(taskId); + setCurrentSyncTaskId(taskId); + // Redirect to knowledge page already to show the syncing document + router.push("/knowledge"); + } + }, + }, + ); }; const getProviderDisplayName = () => { @@ -316,7 +188,7 @@ export default function UploadProviderPage() { ); } - if (!connector.hasAccessToken) { + if (!accessToken) { return ( <>
@@ -337,8 +209,7 @@ export default function UploadProviderPage() { Access Token Required

- {connector.accessTokenError || - `Unable to get access token for ${connector.name}. Try reconnecting your account.`} + Unable to get access token for {connector.name}. Try reconnecting your account.