From 9cf2b90618344711772d26fe4cf63ffe972fc021 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 12 Nov 2025 20:21:29 +0000 Subject: [PATCH 1/8] feat(registry): Add KeyError validation for externalDocumentationUrls - Add validation in ApiDocsUrl.from_manifest_dict to raise PyAirbyteInputError when 'title' or 'url' fields are missing - Add validation in _extract_docs_from_registry for registry entries - Add unit tests for missing field scenarios - Error messages clearly indicate which required field is missing Co-Authored-By: AJ Steers --- airbyte/registry.py | 56 ++++++++++++------- .../unit_tests/test_mcp_connector_registry.py | 40 +++++++++++++ 2 files changed, 75 insertions(+), 21 deletions(-) diff --git a/airbyte/registry.py b/airbyte/registry.py index adff22755..b1162ebc0 100644 --- a/airbyte/registry.py +++ b/airbyte/registry.py @@ -320,6 +320,9 @@ def from_manifest_dict(cls, manifest_data: dict[str, Any]) -> list[Self]: Returns: List of ApiDocsUrl objects extracted from the manifest + + Raises: + PyAirbyteInputError: If a documentation entry is missing required 'title' or 'url' field """ results: list[Self] = [] @@ -327,16 +330,21 @@ def from_manifest_dict(cls, manifest_data: dict[str, Any]) -> list[Self]: if isinstance(data_section, dict): external_docs = data_section.get("externalDocumentationUrls") if isinstance(external_docs, list): - results = [ - cls( - title=doc["title"], - url=doc["url"], - source="data_external_docs", - doc_type=doc.get("type", "other"), - requires_login=doc.get("requiresLogin", False), - ) - for doc in external_docs - ] + for doc in external_docs: + try: + results.append( + cls( + title=doc["title"], + url=doc["url"], + source="data_external_docs", + doc_type=doc.get("type", "other"), + requires_login=doc.get("requiresLogin", False), + ) + ) + except KeyError as e: + raise exc.PyAirbyteInputError( + message=f"Manifest parsing error: missing required field {e}" + ) from e return results @@ -386,6 +394,9 @@ def _extract_docs_from_registry(connector_name: str) -> list[ApiDocsUrl]: Returns: List of ApiDocsUrl objects extracted from the registry + + Raises: + PyAirbyteInputError: If a documentation entry is missing required 'title' or 'url' field """ registry_url = _get_registry_url() response = requests.get(registry_url, timeout=10) @@ -413,18 +424,21 @@ def _extract_docs_from_registry(connector_name: str) -> list[ApiDocsUrl]: if connector_entry and "externalDocumentationUrls" in connector_entry: external_docs = connector_entry["externalDocumentationUrls"] if isinstance(external_docs, list): - docs_urls.extend( - [ - ApiDocsUrl( - title=doc["title"], - url=doc["url"], - source="registry_external_docs", - doc_type=doc.get("type", "other"), - requires_login=doc.get("requiresLogin", False), + for doc in external_docs: + try: + docs_urls.append( + ApiDocsUrl( + title=doc["title"], + url=doc["url"], + source="registry_external_docs", + doc_type=doc.get("type", "other"), + requires_login=doc.get("requiresLogin", False), + ) ) - for doc in external_docs - ] - ) + except KeyError as e: + raise exc.PyAirbyteInputError( + message=f"Registry parsing error: missing required field {e}" + ) from e return docs_urls diff --git a/tests/unit_tests/test_mcp_connector_registry.py b/tests/unit_tests/test_mcp_connector_registry.py index 5f70ea167..25dbaec2e 100644 --- a/tests/unit_tests/test_mcp_connector_registry.py +++ b/tests/unit_tests/test_mcp_connector_registry.py @@ -5,6 +5,8 @@ from unittest.mock import MagicMock, patch +import pytest + from airbyte import exceptions as exc from airbyte.mcp.connector_registry import get_api_docs_urls from airbyte.registry import ( @@ -126,6 +128,44 @@ def test_empty_manifest(self) -> None: urls = ApiDocsUrl.from_manifest_dict({}) assert len(urls) == 0 + def test_manifest_missing_title_raises_error(self) -> None: + """Test that missing 'title' field raises PyAirbyteInputError.""" + manifest_dict = { + "version": "1.0.0", + "type": "DeclarativeSource", + "data": { + "externalDocumentationUrls": [ + { + "url": "https://api.example.com/docs", + } + ] + }, + } + + with pytest.raises( + exc.PyAirbyteInputError, match="Manifest parsing error.*'title'" + ): + ApiDocsUrl.from_manifest_dict(manifest_dict) + + def test_manifest_missing_url_raises_error(self) -> None: + """Test that missing 'url' field raises PyAirbyteInputError.""" + manifest_dict = { + "version": "1.0.0", + "type": "DeclarativeSource", + "data": { + "externalDocumentationUrls": [ + { + "title": "API Documentation", + } + ] + }, + } + + with pytest.raises( + exc.PyAirbyteInputError, match="Manifest parsing error.*'url'" + ): + ApiDocsUrl.from_manifest_dict(manifest_dict) + class TestGetApiDocsUrls: """Tests for get_api_docs_urls function.""" From 12de9c8674b3b23cdb138ebbce43b464a87ed495 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 12 Nov 2025 20:29:13 +0000 Subject: [PATCH 2/8] refactor(registry): Rename get_api_docs_urls to get_connector_docs_urls - Rename get_connector_api_docs_urls to get_connector_docs_urls in registry.py - Rename MCP tool from get_api_docs_urls to get_connector_docs_urls - Update internal Airbyte docs to use doc_type='internal' instead of default 'other' - Update all tests to use new function names - This is a breaking change but acceptable since the feature was just released today Co-Authored-By: AJ Steers --- airbyte/mcp/connector_registry.py | 10 +++++----- airbyte/registry.py | 3 ++- tests/unit_tests/test_mcp_connector_registry.py | 14 +++++++------- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/airbyte/mcp/connector_registry.py b/airbyte/mcp/connector_registry.py index 7dbedb341..57d3e76b9 100644 --- a/airbyte/mcp/connector_registry.py +++ b/airbyte/mcp/connector_registry.py @@ -22,9 +22,9 @@ ConnectorVersionInfo, InstallType, get_available_connectors, - get_connector_api_docs_urls, get_connector_metadata, ) +from airbyte.registry import get_connector_docs_urls as _get_connector_docs_urls from airbyte.registry import get_connector_version_history as _get_connector_version_history from airbyte.sources.util import get_source @@ -180,7 +180,7 @@ def get_connector_info( read_only=True, idempotent=True, ) -def get_api_docs_urls( +def get_connector_docs_urls( connector_name: Annotated[ str, Field( @@ -191,14 +191,14 @@ def get_api_docs_urls( ), ], ) -> list[ApiDocsUrl] | Literal["Connector not found."]: - """Get API documentation URLs for a connector. + """Get documentation URLs for a connector. - This tool retrieves documentation URLs for a connector's upstream API from multiple sources: + This tool retrieves documentation URLs for a connector from multiple sources: - Registry metadata (documentationUrl, externalDocumentationUrls) - Connector manifest.yaml file (data.externalDocumentationUrls) """ try: - return get_connector_api_docs_urls(connector_name) + return _get_connector_docs_urls(connector_name) except exc.AirbyteConnectorNotRegisteredError: return "Connector not found." diff --git a/airbyte/registry.py b/airbyte/registry.py index b1162ebc0..d9fc3f7f2 100644 --- a/airbyte/registry.py +++ b/airbyte/registry.py @@ -418,6 +418,7 @@ def _extract_docs_from_registry(connector_name: str) -> list[ApiDocsUrl]: title="Airbyte Documentation", url=connector_entry["documentationUrl"], source="registry", + doc_type="internal", ) ) @@ -443,7 +444,7 @@ def _extract_docs_from_registry(connector_name: str) -> list[ApiDocsUrl]: return docs_urls -def get_connector_api_docs_urls(connector_name: str) -> list[ApiDocsUrl]: +def get_connector_docs_urls(connector_name: str) -> list[ApiDocsUrl]: """Get API documentation URLs for a connector. This function retrieves documentation URLs for a connector's upstream API from multiple sources: diff --git a/tests/unit_tests/test_mcp_connector_registry.py b/tests/unit_tests/test_mcp_connector_registry.py index 25dbaec2e..4b0059ac1 100644 --- a/tests/unit_tests/test_mcp_connector_registry.py +++ b/tests/unit_tests/test_mcp_connector_registry.py @@ -8,7 +8,7 @@ import pytest from airbyte import exceptions as exc -from airbyte.mcp.connector_registry import get_api_docs_urls +from airbyte.mcp.connector_registry import get_connector_docs_urls from airbyte.registry import ( ApiDocsUrl, _fetch_manifest_dict, @@ -167,26 +167,26 @@ def test_manifest_missing_url_raises_error(self) -> None: ApiDocsUrl.from_manifest_dict(manifest_dict) -class TestGetApiDocsUrls: - """Tests for get_api_docs_urls function.""" +class TestGetConnectorDocsUrls: + """Tests for get_connector_docs_urls function.""" def test_connector_not_found(self) -> None: """Test handling when connector is not found.""" with patch( - "airbyte.mcp.connector_registry.get_connector_api_docs_urls" + "airbyte.mcp.connector_registry.get_connector_docs_urls" ) as mock_get_docs: mock_get_docs.side_effect = exc.AirbyteConnectorNotRegisteredError( connector_name="nonexistent-connector", context={}, ) - result = get_api_docs_urls("nonexistent-connector") + result = get_connector_docs_urls("nonexistent-connector") assert result == "Connector not found." def test_deduplication_of_urls(self) -> None: """Test that duplicate URLs are deduplicated.""" with patch( - "airbyte.mcp.connector_registry.get_connector_api_docs_urls" + "airbyte.mcp.connector_registry.get_connector_docs_urls" ) as mock_get_docs: mock_get_docs.return_value = [ ApiDocsUrl( @@ -196,7 +196,7 @@ def test_deduplication_of_urls(self) -> None: ) ] - result = get_api_docs_urls("source-example") + result = get_connector_docs_urls("source-example") assert isinstance(result, list) assert len(result) == 1 From 306557f9be16e887438a1b8124301c63e6eda973 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 12 Nov 2025 20:35:33 +0000 Subject: [PATCH 3/8] fix: Include malformed doc dict in error messages and fix test_deduplication_of_urls - Updated error messages to include the malformed doc dict for better debugging - Fixed test_deduplication_of_urls to properly mock get_available_connectors - Test now correctly verifies URL deduplication functionality Co-Authored-By: AJ Steers --- airbyte/registry.py | 4 ++-- .../unit_tests/test_mcp_connector_registry.py | 21 ++++++++++++++++--- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/airbyte/registry.py b/airbyte/registry.py index d9fc3f7f2..e011cbef0 100644 --- a/airbyte/registry.py +++ b/airbyte/registry.py @@ -343,7 +343,7 @@ def from_manifest_dict(cls, manifest_data: dict[str, Any]) -> list[Self]: ) except KeyError as e: raise exc.PyAirbyteInputError( - message=f"Manifest parsing error: missing required field {e}" + message=f"Manifest parsing error: missing required field in {doc}: {e}" ) from e return results @@ -438,7 +438,7 @@ def _extract_docs_from_registry(connector_name: str) -> list[ApiDocsUrl]: ) except KeyError as e: raise exc.PyAirbyteInputError( - message=f"Registry parsing error: missing required field {e}" + message=f"Registry parsing error: missing required field in {doc}: {e}" ) from e return docs_urls diff --git a/tests/unit_tests/test_mcp_connector_registry.py b/tests/unit_tests/test_mcp_connector_registry.py index 4b0059ac1..ea5a1c94e 100644 --- a/tests/unit_tests/test_mcp_connector_registry.py +++ b/tests/unit_tests/test_mcp_connector_registry.py @@ -186,15 +186,30 @@ def test_connector_not_found(self) -> None: def test_deduplication_of_urls(self) -> None: """Test that duplicate URLs are deduplicated.""" with patch( - "airbyte.mcp.connector_registry.get_connector_docs_urls" - ) as mock_get_docs: - mock_get_docs.return_value = [ + "airbyte.registry.get_available_connectors" + ) as mock_available, patch( + "airbyte.registry._extract_docs_from_registry" + ) as mock_extract_registry, patch( + "airbyte.registry._fetch_manifest_dict" + ) as mock_fetch_manifest: + mock_available.return_value = ["source-example"] + mock_extract_registry.return_value = [ ApiDocsUrl( title="Airbyte Documentation", url="https://docs.airbyte.com/integrations/sources/example", source="registry", ) ] + mock_fetch_manifest.return_value = { + "data": { + "externalDocumentationUrls": [ + { + "title": "Duplicate Documentation", + "url": "https://docs.airbyte.com/integrations/sources/example", + } + ] + } + } result = get_connector_docs_urls("source-example") From 117f8ffe67c8411d3785336f99434d5e40fcda73 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 12 Nov 2025 20:38:35 +0000 Subject: [PATCH 4/8] style: Apply ruff formatting to test file Co-Authored-By: AJ Steers --- tests/unit_tests/test_mcp_connector_registry.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/unit_tests/test_mcp_connector_registry.py b/tests/unit_tests/test_mcp_connector_registry.py index ea5a1c94e..23357bdab 100644 --- a/tests/unit_tests/test_mcp_connector_registry.py +++ b/tests/unit_tests/test_mcp_connector_registry.py @@ -185,13 +185,13 @@ def test_connector_not_found(self) -> None: def test_deduplication_of_urls(self) -> None: """Test that duplicate URLs are deduplicated.""" - with patch( - "airbyte.registry.get_available_connectors" - ) as mock_available, patch( - "airbyte.registry._extract_docs_from_registry" - ) as mock_extract_registry, patch( - "airbyte.registry._fetch_manifest_dict" - ) as mock_fetch_manifest: + with ( + patch("airbyte.registry.get_available_connectors") as mock_available, + patch( + "airbyte.registry._extract_docs_from_registry" + ) as mock_extract_registry, + patch("airbyte.registry._fetch_manifest_dict") as mock_fetch_manifest, + ): mock_available.return_value = ["source-example"] mock_extract_registry.return_value = [ ApiDocsUrl( From 60e43b4228490ccee64c13cf8a80058a0aed93d8 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 12 Nov 2025 21:05:17 +0000 Subject: [PATCH 5/8] refactor: Remove URL deduplication logic per user request - Removed deduplication logic from get_connector_docs_urls function - Removed test_deduplication_of_urls test - Updated docstring to remove mention of deduplication The deduplication feature was previously introduced in PR #858 but is being removed per product direction. Duplicate URLs from registry and manifest sources will now be preserved in the returned list. Co-Authored-By: AJ Steers --- airbyte/registry.py | 11 ++---- .../unit_tests/test_mcp_connector_registry.py | 34 ------------------- 2 files changed, 2 insertions(+), 43 deletions(-) diff --git a/airbyte/registry.py b/airbyte/registry.py index e011cbef0..e72a2ba12 100644 --- a/airbyte/registry.py +++ b/airbyte/registry.py @@ -455,7 +455,7 @@ def get_connector_docs_urls(connector_name: str) -> list[ApiDocsUrl]: connector_name: The canonical connector name (e.g., "source-facebook-marketing") Returns: - List of ApiDocsUrl objects with documentation URLs, deduplicated by URL. + List of ApiDocsUrl objects with documentation URLs. Raises: AirbyteConnectorNotRegisteredError: If the connector is not found in the registry. @@ -479,14 +479,7 @@ def get_connector_docs_urls(connector_name: str) -> list[ApiDocsUrl]: manifest_urls = ApiDocsUrl.from_manifest_dict(manifest_data) docs_urls.extend(manifest_urls) - seen_urls = set() - unique_docs_urls = [] - for doc_url in docs_urls: - if doc_url.url not in seen_urls: - seen_urls.add(doc_url.url) - unique_docs_urls.append(doc_url) - - return unique_docs_urls + return docs_urls def get_connector_version_history( diff --git a/tests/unit_tests/test_mcp_connector_registry.py b/tests/unit_tests/test_mcp_connector_registry.py index 23357bdab..a718d21df 100644 --- a/tests/unit_tests/test_mcp_connector_registry.py +++ b/tests/unit_tests/test_mcp_connector_registry.py @@ -182,37 +182,3 @@ def test_connector_not_found(self) -> None: result = get_connector_docs_urls("nonexistent-connector") assert result == "Connector not found." - - def test_deduplication_of_urls(self) -> None: - """Test that duplicate URLs are deduplicated.""" - with ( - patch("airbyte.registry.get_available_connectors") as mock_available, - patch( - "airbyte.registry._extract_docs_from_registry" - ) as mock_extract_registry, - patch("airbyte.registry._fetch_manifest_dict") as mock_fetch_manifest, - ): - mock_available.return_value = ["source-example"] - mock_extract_registry.return_value = [ - ApiDocsUrl( - title="Airbyte Documentation", - url="https://docs.airbyte.com/integrations/sources/example", - source="registry", - ) - ] - mock_fetch_manifest.return_value = { - "data": { - "externalDocumentationUrls": [ - { - "title": "Duplicate Documentation", - "url": "https://docs.airbyte.com/integrations/sources/example", - } - ] - } - } - - result = get_connector_docs_urls("source-example") - - assert isinstance(result, list) - assert len(result) == 1 - assert result[0].title == "Airbyte Documentation" From d2c245a6f25fcfd137d7aaf84adcd807e2d86416 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 12 Nov 2025 21:14:59 +0000 Subject: [PATCH 6/8] fix: Change exception type from PyAirbyteInputError to PyAirbyteInternalError Address PR feedback from r2519730767: Registry/manifest parsing errors should raise PyAirbyteInternalError instead of PyAirbyteInputError because malformed registry/manifest data is not a user input issue - it's an internal data issue that users cannot resolve themselves. Changes: - Updated ApiDocsUrl.from_manifest_dict() to raise PyAirbyteInternalError - Updated _extract_docs_from_registry() to raise PyAirbyteInternalError - Updated docstrings to reflect the new exception type - Updated unit tests to expect PyAirbyteInternalError Ref: https://github.com/airbytehq/PyAirbyte/pull/863#discussion_r2519730767 Co-Authored-By: AJ Steers --- airbyte/registry.py | 8 ++++---- tests/unit_tests/test_mcp_connector_registry.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/airbyte/registry.py b/airbyte/registry.py index e72a2ba12..d840c9b63 100644 --- a/airbyte/registry.py +++ b/airbyte/registry.py @@ -322,7 +322,7 @@ def from_manifest_dict(cls, manifest_data: dict[str, Any]) -> list[Self]: List of ApiDocsUrl objects extracted from the manifest Raises: - PyAirbyteInputError: If a documentation entry is missing required 'title' or 'url' field + PyAirbyteInternalError: If a documentation entry is missing required 'title' or 'url' field """ results: list[Self] = [] @@ -342,7 +342,7 @@ def from_manifest_dict(cls, manifest_data: dict[str, Any]) -> list[Self]: ) ) except KeyError as e: - raise exc.PyAirbyteInputError( + raise exc.PyAirbyteInternalError( message=f"Manifest parsing error: missing required field in {doc}: {e}" ) from e @@ -396,7 +396,7 @@ def _extract_docs_from_registry(connector_name: str) -> list[ApiDocsUrl]: List of ApiDocsUrl objects extracted from the registry Raises: - PyAirbyteInputError: If a documentation entry is missing required 'title' or 'url' field + PyAirbyteInternalError: If a documentation entry is missing required 'title' or 'url' field """ registry_url = _get_registry_url() response = requests.get(registry_url, timeout=10) @@ -437,7 +437,7 @@ def _extract_docs_from_registry(connector_name: str) -> list[ApiDocsUrl]: ) ) except KeyError as e: - raise exc.PyAirbyteInputError( + raise exc.PyAirbyteInternalError( message=f"Registry parsing error: missing required field in {doc}: {e}" ) from e diff --git a/tests/unit_tests/test_mcp_connector_registry.py b/tests/unit_tests/test_mcp_connector_registry.py index a718d21df..fbd9fe7fd 100644 --- a/tests/unit_tests/test_mcp_connector_registry.py +++ b/tests/unit_tests/test_mcp_connector_registry.py @@ -129,7 +129,7 @@ def test_empty_manifest(self) -> None: assert len(urls) == 0 def test_manifest_missing_title_raises_error(self) -> None: - """Test that missing 'title' field raises PyAirbyteInputError.""" + """Test that missing 'title' field raises PyAirbyteInternalError.""" manifest_dict = { "version": "1.0.0", "type": "DeclarativeSource", @@ -143,12 +143,12 @@ def test_manifest_missing_title_raises_error(self) -> None: } with pytest.raises( - exc.PyAirbyteInputError, match="Manifest parsing error.*'title'" + exc.PyAirbyteInternalError, match="Manifest parsing error.*'title'" ): ApiDocsUrl.from_manifest_dict(manifest_dict) def test_manifest_missing_url_raises_error(self) -> None: - """Test that missing 'url' field raises PyAirbyteInputError.""" + """Test that missing 'url' field raises PyAirbyteInternalError.""" manifest_dict = { "version": "1.0.0", "type": "DeclarativeSource", @@ -162,7 +162,7 @@ def test_manifest_missing_url_raises_error(self) -> None: } with pytest.raises( - exc.PyAirbyteInputError, match="Manifest parsing error.*'url'" + exc.PyAirbyteInternalError, match="Manifest parsing error.*'url'" ): ApiDocsUrl.from_manifest_dict(manifest_dict) From 96b6c6d6e095d4eda4ee43c0cbe77924e2ab1349 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 12 Nov 2025 21:19:03 +0000 Subject: [PATCH 7/8] refactor: Add from_docs_list classmethod to eliminate code duplication Address PR feedback from 2519832304: The code in _extract_docs_from_registry() was duplicating the logic from from_manifest_dict(). Refactored to use a shared from_docs_list() classmethod that both functions can call. Changes: - Added ApiDocsUrl.from_docs_list() classmethod with source and context parameters - Refactored from_manifest_dict() to delegate to from_docs_list() - Refactored _extract_docs_from_registry() to use from_docs_list() - Updated docstring to remove 'API' from 'Get API documentation URLs' per CodeRabbit suggestion This makes the code more DRY and easier to maintain. Ref: https://github.com/airbytehq/PyAirbyte/pull/863#discussion_r2519832304 Co-Authored-By: AJ Steers --- airbyte/registry.py | 84 +++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 37 deletions(-) diff --git a/airbyte/registry.py b/airbyte/registry.py index d840c9b63..9e3d66bec 100644 --- a/airbyte/registry.py +++ b/airbyte/registry.py @@ -311,6 +311,40 @@ class ApiDocsUrl(BaseModel): model_config = {"populate_by_name": True} + @classmethod + def from_docs_list(cls, docs: list[dict[str, Any]], *, source: str, context: str) -> list[Self]: + """Extract documentation URLs from a list of documentation dictionaries. + + Args: + docs: List of documentation dictionaries with 'title' and 'url' fields + source: The source identifier for these documentation URLs + context: Context string for error messages (e.g., "Manifest", "Registry") + + Returns: + List of ApiDocsUrl objects extracted from the docs list + + Raises: + PyAirbyteInternalError: If a documentation entry is missing required 'title' or + 'url' field + """ + results: list[Self] = [] + for doc in docs: + try: + results.append( + cls( + title=doc["title"], + url=doc["url"], + source=source, + doc_type=doc.get("type", "other"), + requires_login=doc.get("requiresLogin", False), + ) + ) + except KeyError as e: + raise exc.PyAirbyteInternalError( + message=f"{context} parsing error: missing required field in {doc}: {e}" + ) from e + return results + @classmethod def from_manifest_dict(cls, manifest_data: dict[str, Any]) -> list[Self]: """Extract documentation URLs from parsed manifest data. @@ -322,31 +356,17 @@ def from_manifest_dict(cls, manifest_data: dict[str, Any]) -> list[Self]: List of ApiDocsUrl objects extracted from the manifest Raises: - PyAirbyteInternalError: If a documentation entry is missing required 'title' or 'url' field + PyAirbyteInternalError: If a documentation entry is missing required 'title' or + 'url' field """ - results: list[Self] = [] - data_section = manifest_data.get("data") if isinstance(data_section, dict): external_docs = data_section.get("externalDocumentationUrls") if isinstance(external_docs, list): - for doc in external_docs: - try: - results.append( - cls( - title=doc["title"], - url=doc["url"], - source="data_external_docs", - doc_type=doc.get("type", "other"), - requires_login=doc.get("requiresLogin", False), - ) - ) - except KeyError as e: - raise exc.PyAirbyteInternalError( - message=f"Manifest parsing error: missing required field in {doc}: {e}" - ) from e - - return results + return cls.from_docs_list( + external_docs, source="data_external_docs", context="Manifest" + ) + return [] def _manifest_url_for(connector_name: str) -> str: @@ -425,29 +445,19 @@ def _extract_docs_from_registry(connector_name: str) -> list[ApiDocsUrl]: if connector_entry and "externalDocumentationUrls" in connector_entry: external_docs = connector_entry["externalDocumentationUrls"] if isinstance(external_docs, list): - for doc in external_docs: - try: - docs_urls.append( - ApiDocsUrl( - title=doc["title"], - url=doc["url"], - source="registry_external_docs", - doc_type=doc.get("type", "other"), - requires_login=doc.get("requiresLogin", False), - ) - ) - except KeyError as e: - raise exc.PyAirbyteInternalError( - message=f"Registry parsing error: missing required field in {doc}: {e}" - ) from e + docs_urls.extend( + ApiDocsUrl.from_docs_list( + external_docs, source="registry_external_docs", context="Registry" + ) + ) return docs_urls def get_connector_docs_urls(connector_name: str) -> list[ApiDocsUrl]: - """Get API documentation URLs for a connector. + """Get documentation URLs for a connector. - This function retrieves documentation URLs for a connector's upstream API from multiple sources: + This function retrieves documentation URLs for a connector from multiple sources: - Registry metadata (documentationUrl, externalDocumentationUrls) - Connector manifest.yaml file (data.externalDocumentationUrls) From 8af80858c689a4279cbd151d2ed8047a358f8a79 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Thu, 13 Nov 2025 02:24:47 +0000 Subject: [PATCH 8/8] fix: Rename manifest references to metadata to fix conflation This commit fixes a fundamental conflation between manifest.yaml and metadata.yaml files throughout the codebase: - manifest.yaml: Declarative connector manifest (behavior/config) - metadata.yaml: Connector metadata that the registry caches/exposes Changes made: - Renamed _DEFAULT_MANIFEST_URL to _DEFAULT_METADATA_URL - Updated URL pattern from manifest.yaml to metadata.yaml - Renamed _manifest_url_for() to _metadata_url_for() - Renamed _fetch_manifest_dict() to _fetch_metadata_dict() - Renamed ApiDocsUrl.from_manifest_dict() to from_metadata_dict() - Renamed from_docs_list() to from_metadata_docs_list() (made public) - Updated context parameter from 'Manifest' to 'Metadata' - Updated source parameter from 'data_external_docs' to 'metadata_external_docs' - Updated all test classes, methods, and error message expectations - Updated MCP connector_registry.py imports and docstrings The code now correctly parses metadata.yaml files (and registry caches of those metadata.yaml files), not manifest.yaml files. Co-Authored-By: AJ Steers --- airbyte/mcp/connector_registry.py | 6 +- airbyte/registry.py | 54 ++++++------ .../unit_tests/test_mcp_connector_registry.py | 82 +++++++++---------- 3 files changed, 72 insertions(+), 70 deletions(-) diff --git a/airbyte/mcp/connector_registry.py b/airbyte/mcp/connector_registry.py index 57d3e76b9..d641fcf45 100644 --- a/airbyte/mcp/connector_registry.py +++ b/airbyte/mcp/connector_registry.py @@ -16,7 +16,7 @@ from airbyte.mcp._tool_utils import mcp_tool, register_tools from airbyte.mcp._util import resolve_list_of_strings from airbyte.registry import ( - _DEFAULT_MANIFEST_URL, + _DEFAULT_METADATA_URL, ApiDocsUrl, ConnectorMetadata, ConnectorVersionInfo, @@ -161,7 +161,7 @@ def get_connector_info( connector.install() config_spec_jsonschema = connector.config_spec - manifest_url = _DEFAULT_MANIFEST_URL.format( + manifest_url = _DEFAULT_METADATA_URL.format( source_name=connector_name, version="latest", ) @@ -195,7 +195,7 @@ def get_connector_docs_urls( This tool retrieves documentation URLs for a connector from multiple sources: - Registry metadata (documentationUrl, externalDocumentationUrls) - - Connector manifest.yaml file (data.externalDocumentationUrls) + - Connector metadata.yaml file (data.externalDocumentationUrls) """ try: return _get_connector_docs_urls(connector_name) diff --git a/airbyte/registry.py b/airbyte/registry.py index 9e3d66bec..8fee2c91d 100644 --- a/airbyte/registry.py +++ b/airbyte/registry.py @@ -40,8 +40,8 @@ _PYTHON_LANGUAGE_TAG = f"language:{_PYTHON_LANGUAGE}" _MANIFEST_ONLY_TAG = f"language:{_MANIFEST_ONLY_LANGUAGE}" -_DEFAULT_MANIFEST_URL = ( - "https://connectors.airbyte.com/files/metadata/airbyte/{source_name}/{version}/manifest.yaml" +_DEFAULT_METADATA_URL = ( + "https://connectors.airbyte.com/files/metadata/airbyte/{source_name}/{version}/metadata.yaml" ) @@ -312,13 +312,15 @@ class ApiDocsUrl(BaseModel): model_config = {"populate_by_name": True} @classmethod - def from_docs_list(cls, docs: list[dict[str, Any]], *, source: str, context: str) -> list[Self]: - """Extract documentation URLs from a list of documentation dictionaries. + def from_metadata_docs_list( + cls, docs: list[dict[str, Any]], *, source: str, context: str + ) -> list[Self]: + """Extract documentation URLs from a list of metadata documentation dictionaries. Args: docs: List of documentation dictionaries with 'title' and 'url' fields source: The source identifier for these documentation URLs - context: Context string for error messages (e.g., "Manifest", "Registry") + context: Context string for error messages (e.g., "Metadata", "Registry") Returns: List of ApiDocsUrl objects extracted from the docs list @@ -346,52 +348,52 @@ def from_docs_list(cls, docs: list[dict[str, Any]], *, source: str, context: str return results @classmethod - def from_manifest_dict(cls, manifest_data: dict[str, Any]) -> list[Self]: - """Extract documentation URLs from parsed manifest data. + def from_metadata_dict(cls, metadata_data: dict[str, Any]) -> list[Self]: + """Extract documentation URLs from parsed metadata. Args: - manifest_data: The parsed manifest.yaml data as a dictionary + metadata_data: The parsed metadata.yaml data as a dictionary Returns: - List of ApiDocsUrl objects extracted from the manifest + List of ApiDocsUrl objects extracted from the metadata Raises: PyAirbyteInternalError: If a documentation entry is missing required 'title' or 'url' field """ - data_section = manifest_data.get("data") + data_section = metadata_data.get("data") if isinstance(data_section, dict): external_docs = data_section.get("externalDocumentationUrls") if isinstance(external_docs, list): - return cls.from_docs_list( - external_docs, source="data_external_docs", context="Manifest" + return cls.from_metadata_docs_list( + external_docs, source="metadata_external_docs", context="Metadata" ) return [] -def _manifest_url_for(connector_name: str) -> str: - """Get the expected URL of the manifest.yaml file for a connector. +def _metadata_url_for(connector_name: str) -> str: + """Get the expected URL of the metadata.yaml file for a connector. Args: connector_name: The canonical connector name (e.g., "source-facebook-marketing") Returns: - The URL to the connector's manifest.yaml file + The URL to the connector's metadata.yaml file """ - return _DEFAULT_MANIFEST_URL.format( + return _DEFAULT_METADATA_URL.format( source_name=connector_name, version="latest", ) -def _fetch_manifest_dict(url: str) -> dict[str, Any]: - """Fetch and parse a manifest.yaml file from a URL. +def _fetch_metadata_dict(url: str) -> dict[str, Any]: + """Fetch and parse a metadata.yaml file from a URL. Args: - url: The URL to fetch the manifest from + url: The URL to fetch the metadata from Returns: - The parsed manifest data as a dictionary, or empty dict if manifest not found (404) + The parsed metadata as a dictionary, or empty dict if metadata not found (404) Raises: HTTPError: If the request fails with a non-404 status code @@ -446,7 +448,7 @@ def _extract_docs_from_registry(connector_name: str) -> list[ApiDocsUrl]: external_docs = connector_entry["externalDocumentationUrls"] if isinstance(external_docs, list): docs_urls.extend( - ApiDocsUrl.from_docs_list( + ApiDocsUrl.from_metadata_docs_list( external_docs, source="registry_external_docs", context="Registry" ) ) @@ -459,7 +461,7 @@ def get_connector_docs_urls(connector_name: str) -> list[ApiDocsUrl]: This function retrieves documentation URLs for a connector from multiple sources: - Registry metadata (documentationUrl, externalDocumentationUrls) - - Connector manifest.yaml file (data.externalDocumentationUrls) + - Connector metadata.yaml file (data.externalDocumentationUrls) Args: connector_name: The canonical connector name (e.g., "source-facebook-marketing") @@ -484,10 +486,10 @@ def get_connector_docs_urls(connector_name: str) -> list[ApiDocsUrl]: registry_urls = _extract_docs_from_registry(connector_name) docs_urls.extend(registry_urls) - manifest_url = _manifest_url_for(connector_name) - manifest_data = _fetch_manifest_dict(manifest_url) - manifest_urls = ApiDocsUrl.from_manifest_dict(manifest_data) - docs_urls.extend(manifest_urls) + metadata_url = _metadata_url_for(connector_name) + metadata_data = _fetch_metadata_dict(metadata_url) + metadata_urls = ApiDocsUrl.from_metadata_dict(metadata_data) + docs_urls.extend(metadata_urls) return docs_urls diff --git a/tests/unit_tests/test_mcp_connector_registry.py b/tests/unit_tests/test_mcp_connector_registry.py index fbd9fe7fd..3b19fd646 100644 --- a/tests/unit_tests/test_mcp_connector_registry.py +++ b/tests/unit_tests/test_mcp_connector_registry.py @@ -11,38 +11,38 @@ from airbyte.mcp.connector_registry import get_connector_docs_urls from airbyte.registry import ( ApiDocsUrl, - _fetch_manifest_dict, - _manifest_url_for, + _fetch_metadata_dict, + _metadata_url_for, ) -class TestManifestUrlFor: - """Tests for _manifest_url_for function.""" +class TestMetadataUrlFor: + """Tests for _metadata_url_for function.""" - def test_manifest_url_for(self) -> None: - """Test generating manifest URL for a connector.""" - url = _manifest_url_for("source-example") + def test_metadata_url_for(self) -> None: + """Test generating metadata URL for a connector.""" + url = _metadata_url_for("source-example") assert "source-example" in url - assert "manifest.yaml" in url + assert "metadata.yaml" in url assert "latest" in url -class TestFetchManifestDict: - """Tests for _fetch_manifest_dict function.""" +class TestFetchMetadataDict: + """Tests for _fetch_metadata_dict function.""" - def test_manifest_not_found(self) -> None: - """Test handling when manifest.yaml doesn't exist (404).""" + def test_metadata_not_found(self) -> None: + """Test handling when metadata.yaml doesn't exist (404).""" with patch("airbyte.registry.requests.get") as mock_get: mock_response = MagicMock() mock_response.status_code = 404 mock_get.return_value = mock_response - manifest_dict = _fetch_manifest_dict("https://example.com/manifest.yaml") - assert manifest_dict == {} + metadata_dict = _fetch_metadata_dict("https://example.com/metadata.yaml") + assert metadata_dict == {} - def test_fetch_manifest_dict(self) -> None: - """Test fetching and parsing manifest.yaml.""" - manifest_yaml = """ + def test_fetch_metadata_dict(self) -> None: + """Test fetching and parsing metadata.yaml.""" + metadata_yaml = """ version: 1.0.0 type: DeclarativeSource data: @@ -51,21 +51,21 @@ def test_fetch_manifest_dict(self) -> None: with patch("airbyte.registry.requests.get") as mock_get: mock_response = MagicMock() mock_response.status_code = 200 - mock_response.text = manifest_yaml + mock_response.text = metadata_yaml mock_get.return_value = mock_response - manifest_dict = _fetch_manifest_dict("https://example.com/manifest.yaml") - assert manifest_dict["version"] == "1.0.0" - assert manifest_dict["type"] == "DeclarativeSource" - assert manifest_dict["data"]["name"] == "Example" + metadata_dict = _fetch_metadata_dict("https://example.com/metadata.yaml") + assert metadata_dict["version"] == "1.0.0" + assert metadata_dict["type"] == "DeclarativeSource" + assert metadata_dict["data"]["name"] == "Example" -class TestApiDocsUrlFromManifestDict: - """Tests for ApiDocsUrl.from_manifest_dict classmethod.""" +class TestApiDocsUrlFromMetadataDict: + """Tests for ApiDocsUrl.from_metadata_dict classmethod.""" - def test_manifest_with_external_docs_urls(self) -> None: + def test_metadata_with_external_docs_urls(self) -> None: """Test extracting URLs from data.externalDocumentationUrls field.""" - manifest_dict = { + metadata_dict = { "version": "1.0.0", "type": "DeclarativeSource", "data": { @@ -90,7 +90,7 @@ def test_manifest_with_external_docs_urls(self) -> None: }, } - urls = ApiDocsUrl.from_manifest_dict(manifest_dict) + urls = ApiDocsUrl.from_metadata_dict(metadata_dict) assert len(urls) == 3 assert urls[0].title == "Versioning docs" assert urls[0].url == "https://api.example.com/versioning" @@ -102,9 +102,9 @@ def test_manifest_with_external_docs_urls(self) -> None: assert urls[2].doc_type == "api_deprecations" assert urls[2].requires_login is True - def test_manifest_with_external_docs_no_type(self) -> None: + def test_metadata_with_external_docs_no_type(self) -> None: """Test extracting URLs from data.externalDocumentationUrls without type field.""" - manifest_dict = { + metadata_dict = { "version": "1.0.0", "type": "DeclarativeSource", "data": { @@ -117,20 +117,20 @@ def test_manifest_with_external_docs_no_type(self) -> None: }, } - urls = ApiDocsUrl.from_manifest_dict(manifest_dict) + urls = ApiDocsUrl.from_metadata_dict(metadata_dict) assert len(urls) == 1 assert urls[0].title == "General docs" assert urls[0].doc_type == "other" assert urls[0].requires_login is False - def test_empty_manifest(self) -> None: - """Test handling empty manifest dict.""" - urls = ApiDocsUrl.from_manifest_dict({}) + def test_empty_metadata(self) -> None: + """Test handling empty metadata dict.""" + urls = ApiDocsUrl.from_metadata_dict({}) assert len(urls) == 0 - def test_manifest_missing_title_raises_error(self) -> None: + def test_metadata_missing_title_raises_error(self) -> None: """Test that missing 'title' field raises PyAirbyteInternalError.""" - manifest_dict = { + metadata_dict = { "version": "1.0.0", "type": "DeclarativeSource", "data": { @@ -143,13 +143,13 @@ def test_manifest_missing_title_raises_error(self) -> None: } with pytest.raises( - exc.PyAirbyteInternalError, match="Manifest parsing error.*'title'" + exc.PyAirbyteInternalError, match="Metadata parsing error.*'title'" ): - ApiDocsUrl.from_manifest_dict(manifest_dict) + ApiDocsUrl.from_metadata_dict(metadata_dict) - def test_manifest_missing_url_raises_error(self) -> None: + def test_metadata_missing_url_raises_error(self) -> None: """Test that missing 'url' field raises PyAirbyteInternalError.""" - manifest_dict = { + metadata_dict = { "version": "1.0.0", "type": "DeclarativeSource", "data": { @@ -162,9 +162,9 @@ def test_manifest_missing_url_raises_error(self) -> None: } with pytest.raises( - exc.PyAirbyteInternalError, match="Manifest parsing error.*'url'" + exc.PyAirbyteInternalError, match="Metadata parsing error.*'url'" ): - ApiDocsUrl.from_manifest_dict(manifest_dict) + ApiDocsUrl.from_metadata_dict(metadata_dict) class TestGetConnectorDocsUrls: