From 8ee15efd5e564f9289b46efd53f9076fb36e9ff4 Mon Sep 17 00:00:00 2001
From: nochore
Date: Sun, 13 Jul 2025 16:08:57 +0400
Subject: [PATCH] Confluence: add filtering for file extension and name
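
The new parameters filter attachments by title before any content is fetched
or analyzed: the file extension (the part after the last dot, lowercased and
without the dot) must be listed in allowed_extensions, and the title must
match name_pattern via re.match, i.e. the pattern is anchored at the start of
the name. A minimal usage sketch follows; the wrapper instance and page ID
are illustrative and not part of this change:

    # Hypothetical wrapper instance; values are examples only.
    # Returns only PDF attachments whose names start with "report_".
    attachments = wrapper.get_page_attachments(
        page_id="123456",
        allowed_extensions=["pdf"],
        name_pattern=r"^report_.*\.pdf$",
    )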
---
 src/alita_tools/confluence/api_wrapper.py | 25 ++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/alita_tools/confluence/api_wrapper.py b/src/alita_tools/confluence/api_wrapper.py
index 08d5f7db..03cf0e17 100644
--- a/src/alita_tools/confluence/api_wrapper.py
+++ b/src/alita_tools/confluence/api_wrapper.py
@@ -267,6 +267,8 @@
         page_id=(str, Field(description="Confluence page ID from which attachments will be retrieved")),
         max_content_length=(int, Field(default=10000, description="Maximum number of characters to return for attachment content. Content will be truncated if longer. Default is 10000.")),
         custom_prompt=(Optional[str], Field(default=None, description="Custom prompt to use for LLM-based analysis of attachments (images, pdfs, etc). If not provided, a default prompt will be used.")),
+        allowed_extensions=(Optional[List[str]], Field(default=None, description="List of file extensions to include (e.g. ['pdf', 'docx']). If None, all extensions are included.", examples=[["pdf", "docx"]])),
+        name_pattern=(Optional[str], Field(default=None, description="Regex pattern to filter attachment names (e.g. '^report_.*\\.pdf$'). If None, all names are included.", examples=["^report_.*\\.pdf$"])),
     )
@@ -1494,7 +1496,7 @@ def process_base64_image(match):
             logger.error(f"Error processing page with images: {stacktrace}")
             return f"Error processing page with images: {str(e)}"
 
-    def get_page_attachments(self, page_id: str, max_content_length: int = 10000, custom_prompt: str = None):
+    def get_page_attachments(self, page_id: str, max_content_length: int = 10000, custom_prompt: str = None, allowed_extensions: Optional[List[str]] = None, name_pattern: Optional[str] = None):
         """
         Retrieve all attachments for a Confluence page, including core metadata (with creator, created, updated),
         comments, file content, and LLM-based analysis for supported types.
@@ -1515,8 +1517,19 @@ def get_page_attachments(self, page_id: str, max_content_length: int = 10000, cu
                 logger.warning(f"Failed to fetch history for attachment {attachment.get('title', '')}: {str(e)}")
                 history_map[attachment['id']] = None
 
+        import re
         results = []
         for attachment in attachments['results']:
+            title = attachment.get('title', '')
+            file_ext = title.lower().split('.')[-1] if '.' in title else ''
+
+            # Filter by allowed_extensions
+            if allowed_extensions and file_ext not in allowed_extensions:
+                continue
+            # Filter by name_pattern
+            if name_pattern and not re.match(name_pattern, title):
+                continue
+
             media_type = attachment.get('metadata', {}).get('mediaType', '')
             # Core metadata extraction with history
             hist = history_map.get(attachment['id']) or {}
@@ -1524,7 +1537,7 @@ def get_page_attachments(self, page_id: str, max_content_length: int = 10000, cu
             created_date = hist.get('createdDate', '') if hist else attachment.get('created', '')
             last_updated = hist.get('lastUpdated', {}).get('when', '') if hist else ''
             metadata = {
-                'name': attachment.get('title', ''),
+                'name': title,
                 'size': attachment.get('extensions', {}).get('fileSize', None),
                 'creator': created_by,
                 'created': created_date,
@@ -1533,6 +1546,7 @@ def get_page_attachments(self, page_id: str, max_content_length: int = 10000, cu
                 'labels': [label['name'] for label in attachment.get('metadata', {}).get('labels', {}).get('results', [])],
                 'download_url': self.base_url.rstrip('/') + attachment['_links']['download'] if attachment.get('_links', {}).get('download') else None
             }
+
             # Fetch comments for the attachment
             comments = []
             try:
@@ -1546,16 +1560,13 @@ def get_page_attachments(self, page_id: str, max_content_length: int = 10000, cu
                         'body': comment.get('body', {}).get('storage', {}).get('value', '')
                     })
             except Exception as e:
-                logger.warning(f"Failed to fetch comments for attachment {attachment.get('title', '')}: {str(e)}")
+                logger.warning(f"Failed to fetch comments for attachment {title}: {str(e)}")
 
             content = None
             llm_analysis = None
-            title = attachment.get('title', '')
             download_url = self.base_url.rstrip('/') + attachment['_links']['download']
 
             # --- Begin: Raw content for xml, json, markdown, txt ---
-            # Check by media type or file extension
-            file_ext = title.lower().split('.')[-1] if '.' in title else ''
             is_text_type = (
                 media_type in [
                     'application/xml', 'text/xml',
@@ -1727,7 +1738,7 @@ def get_page_attachments(self, page_id: str, max_content_length: int = 10000, cu
                     llm_analysis = self._process_image_with_llm(image_data, title, context_text, custom_prompt)
 
                 if llm_analysis and isinstance(llm_analysis, str) and len(llm_analysis) > max_content_length:
-                    llm_analysis = llm_analysis[:max_content_length] + f"\n...[truncated, showing first {max_content_length} characters]"
+                    llm_analysis = llm_analysis[:max_content_length] + f"\n...[truncated, showing first {max_content_length} characters]"
 
             results.append({
                 'metadata': metadata,