From 4e716848601f4fc04d878676c72cc351e88d8981 Mon Sep 17 00:00:00 2001 From: ivanmilevtues Date: Wed, 25 Jun 2025 20:08:27 +0200 Subject: [PATCH 1/6] Codeboarding files are now added to the endresult --- .codeboarding/on_boarding.html | 101 +++++++++++++++++++++++++++++++++ pdoc/templates/html.mako | 17 ++++++ 2 files changed, 118 insertions(+) create mode 100644 .codeboarding/on_boarding.html diff --git a/.codeboarding/on_boarding.html b/.codeboarding/on_boarding.html new file mode 100644 index 00000000..a5e3a0d4 --- /dev/null +++ b/.codeboarding/on_boarding.html @@ -0,0 +1,101 @@ + + + + + + ProteinFlow Overview + + + + + + + +
+ + Generated by CodeBoarding + + + Try Demo + + + Contact us + +
+ +

Details

+ +

The ProteinFlow project is structured around a streamlined pipeline for acquiring, processing, organizing, and preparing protein data for machine learning tasks, complemented by analysis and visualization capabilities. The architecture is designed to facilitate efficient handling of large biological datasets.

+ + +
+graph LR + User_Interface_CLI_["User Interface (CLI)"] + Core_Data_Management["Core Data Management"] + Data_Preparation_for_ML["Data Preparation for ML"] + Analysis_Visualization["Analysis & Visualization"] + User_Interface_CLI_ -- "Initiates operations in" --> Core_Data_Management + User_Interface_CLI_ -- "Initiates operations in" --> Data_Preparation_for_ML + Core_Data_Management -- "Provides processed data to" --> Data_Preparation_for_ML + Core_Data_Management -- "Provides data to" --> Analysis_Visualization + click User_Interface_CLI_ href "https://github.com/CodeBoarding/GeneratedOnBoardings/blob/main/ProteinFlow/User_Interface_CLI_.md" "Details" + click Core_Data_Management href "https://github.com/CodeBoarding/GeneratedOnBoardings/blob/main/ProteinFlow/Core_Data_Management.md" "Details" + click Data_Preparation_for_ML href "https://github.com/CodeBoarding/GeneratedOnBoardings/blob/main/ProteinFlow/Data_Preparation_for_ML.md" "Details" + click Analysis_Visualization href "https://github.com/CodeBoarding/GeneratedOnBoardings/blob/main/ProteinFlow/Analysis_Visualization.md" "Details" +
+ +

User Interface (CLI) [Expand]

+

The primary command-line interface that serves as the entry point for users to initiate and control the entire data pipeline. It orchestrates the execution of data acquisition, processing, and organization workflows.

+ + +

Core Data Management [Expand]

+

This foundational component is responsible for the acquisition of raw protein data (PDB and SAbDab files), defining the core data structures for representing proteins and associated ligands, and performing the initial processing steps. This includes filtering, cleaning, and converting raw data into standardized ProteinEntry objects, handling quality checks, and managing ligand-specific details.

+ + +

Data Preparation for ML [Expand]

+

Focuses on organizing and partitioning the processed protein data into distinct train, validation, and test sets, often employing clustering techniques to ensure diverse and representative splits. It also provides PyTorch-compatible Dataset and DataLoader classes for efficient batching and preparation of data, making it ready for machine learning model training and evaluation.

+ + +

Analysis & Visualization [Expand]

+

Offers a comprehensive suite of tools for calculating various protein-related metrics (e.g., sequence similarity, language model perplexity) and for visualizing protein structures and animations. This component also manages optional external dependencies required for its advanced functionalities.

+ + +

FAQ

+ + + diff --git a/pdoc/templates/html.mako b/pdoc/templates/html.mako index f0517880..606a6579 100644 --- a/pdoc/templates/html.mako +++ b/pdoc/templates/html.mako @@ -1,5 +1,6 @@ <% import os + import glob import pdoc from pdoc.html_helpers import extract_toc, glimpse, to_html as _to_html, format_git_link @@ -23,6 +24,12 @@ if annot: annot = ' ' + sep + '\N{NBSP}' + annot return annot + + # Get codeboarding files if they exist + codeboarding_files = [] + codeboarding_dir = os.path.join(os.getcwd(), '.codeboarding') + if os.path.exists(codeboarding_dir): + codeboarding_files = [os.path.basename(f) for f in glob.glob(os.path.join(codeboarding_dir, '*.html'))] %> <%def name="ident(name)">${name} @@ -356,6 +363,16 @@ % endif + + % if codeboarding_files: +
  • Codeboardings

    + +
  • + % endif From b162a5400a09646da48a6a8e715b9c0b9930b9d4 Mon Sep 17 00:00:00 2001 From: ivanmilevtues Date: Thu, 26 Jun 2025 04:10:52 +0200 Subject: [PATCH 2/6] Added initial changes for codeboarding integration for pdoc --- .codeboarding/Analysis_Visualization.html | 423 +++++++++++++++++ .codeboarding/Core_Data_Management.html | 481 ++++++++++++++++++++ .codeboarding/Data_Preparation_for_ML.html | 443 ++++++++++++++++++ .codeboarding/User_Interface_CLI_.html | 427 +++++++++++++++++ .codeboarding/on_boarding.html | 506 +++++++++++++++++---- dev/integrate_html_files.py | 247 ++++++++++ dev/make_docs.sh | 9 +- pdoc/templates/html.mako | 27 +- 8 files changed, 2463 insertions(+), 100 deletions(-) create mode 100644 .codeboarding/Analysis_Visualization.html create mode 100644 .codeboarding/Core_Data_Management.html create mode 100644 .codeboarding/Data_Preparation_for_ML.html create mode 100644 .codeboarding/User_Interface_CLI_.html create mode 100644 dev/integrate_html_files.py diff --git a/.codeboarding/Analysis_Visualization.html b/.codeboarding/Analysis_Visualization.html new file mode 100644 index 00000000..37b63068 --- /dev/null +++ b/.codeboarding/Analysis_Visualization.html @@ -0,0 +1,423 @@ + + + + + + CodeBoarding Analysis - django + + + + + + + +

    CodeBoarding Analysis - ProteinFlow

    + + + +
    + + + +
    + +
    + +

    Details

    +

    The `Analysis & Visualization` component, as described, serves as an umbrella for functionalities related to protein metrics, visualization, and the management of their underlying external dependencies. Based on the provided `Analysis summary` and the related classes/methods, this component can be broken down into four fundamental sub-components, each representing a distinct and crucial aspect of the `proteinflow` library. These components are: `Data Management`, `Visualization`, `Metrics and Analysis`, and `External Dependencies and Utilities`.

    + + +
    +

    Data Management [Expand]

    +

    This component is responsible for defining and managing the core data structures that represent protein information. It handles the loading of protein entries from various formats, such as PDB files and serialized pickle files, and provides foundational data objects for the entire system. The class hierarchy shows `SAbDabEntry` inheriting from `PDBEntry`, indicating a structured approach to handling different protein data types.

    +

    Related Classes/Methods:

    • `SAbDabEntry` (1:1)
    • `PDBEntry` (1:1)
    +
    + +
    +

    Visualization [Expand]

    +

    This component focuses on the graphical representation and animation of protein structures. It takes processed protein data and renders it for user viewing, offering functionalities like showing animations from PDB or pickle files, and merging multiple protein structures for combined display.

    +

    Related Classes/Methods:

    • `visualize` (1:1)
    +
    + +
    +

    Metrics and Analysis

    +

    This component offers a comprehensive suite of computational tools for analyzing protein sequences and structures. It includes functions for calculating various biological and structural metrics (e.g., BLOSUM62 score, TM-score, language model perplexity) and integrating with external models for structure generation (e.g., ESMFold, IgFold).

    +

    Related Classes/Methods:

    • `metrics` (1:1)
    +
    + +
    +

    External Dependencies and Utilities

    +

    This component manages optional external dependencies and provides general utility functions. Its primary roles include checking for the availability of required external packages (`requires_extra`) and facilitating the acquisition of visualization views (`_get_view`). It acts as an abstraction layer, ensuring that core functionalities can gracefully handle optional integrations.

    +

    Related Classes/Methods:

    • `requires_extra` (1:1)
    • `_get_view` (1:1)
    +
    + + +

    FAQ

    + + + + \ No newline at end of file diff --git a/.codeboarding/Core_Data_Management.html b/.codeboarding/Core_Data_Management.html new file mode 100644 index 00000000..15a62cca --- /dev/null +++ b/.codeboarding/Core_Data_Management.html @@ -0,0 +1,481 @@ + + + + + + CodeBoarding Analysis - django + + + + + + + +

    CodeBoarding Analysis - ProteinFlow

    + + + +
    + + + +
    + +
    + +

    Details

    +

    The `Core Data Management` component is fundamental to `proteinflow` as it establishes the initial pipeline for acquiring, structuring, and preparing raw protein data. It ensures that all subsequent operations, such as feature extraction and model training, have access to high-quality, standardized input. Without these foundational steps, the project would lack the necessary data integrity and accessibility to function effectively.

    + + +
    +

    proteinflow.download

    +

    This component is responsible for fetching raw protein data (PDB and SAbDab files) from external databases and managing their local storage. It acts as the primary entry point for data acquisition.

    +

    Related Classes/Methods:

    • `proteinflow.download` (1:1)
    +
    + +
    +

    proteinflow.data.PDBEntry

    +

    This class serves as the foundational data structure for parsing and representing information from standard PDB or mmCIF files. It extracts atomic coordinates, sequences, and basic structural properties, including initial ligand information.

    +

    Related Classes/Methods:

    • `proteinflow.data.PDBEntry` (1:1)
    +
    + +
    +

    proteinflow.data.SAbDabEntry

    +

    Extending `PDBEntry`, this specialized class handles antibody structures from the SAbDab database. It incorporates specific logic for identifying Complementarity Determining Regions (CDRs) and managing antibody chain types, building upon the base PDB structure.

    +

    Related Classes/Methods:

    • `proteinflow.data.SAbDabEntry` (1:1)
    +
    + +
    +

    proteinflow.data.ProteinEntry

    +

    This is the central, standardized data model that aggregates and processes information from `PDBEntry` and `SAbDabEntry`. It represents the cleaned, filtered, and unified protein data, ready for feature extraction and downstream analysis.

    +

    Related Classes/Methods:

    • `proteinflow.data.ProteinEntry` (1:1)
    +
    + +
    +

    proteinflow.ligand

    +

    This module is dedicated to the identification, parsing, and detailed processing of ligand molecules associated with protein structures. It handles tasks such as extracting ligand data from PDB files and managing their chemical properties.

    +

    Related Classes/Methods:

    • `proteinflow.ligand` (1:1)
    +
    + +
    +

    proteinflow.processing

    +

    This component orchestrates the overall data processing pipeline. It manages the filtering, cleaning, and conversion of raw protein entries into standardized `ProteinEntry` objects, ensuring data quality and preparing it for further use.

    +

    Related Classes/Methods:

    • `proteinflow.processing` (1:1)
    +
    + + +

    FAQ

    + + + + \ No newline at end of file diff --git a/.codeboarding/Data_Preparation_for_ML.html b/.codeboarding/Data_Preparation_for_ML.html new file mode 100644 index 00000000..058b6fba --- /dev/null +++ b/.codeboarding/Data_Preparation_for_ML.html @@ -0,0 +1,443 @@ + + + + + + CodeBoarding Analysis - django + + + + + + + +

    CodeBoarding Analysis - ProteinFlow

    + + + +
    + + + +
    + +
    + +

    Details

    +

    This subsystem focuses on organizing and partitioning processed protein data into distinct train, validation, and test sets, often employing clustering techniques to ensure diverse and representative splits. It also provides PyTorch-compatible `Dataset` and `DataLoader` classes for efficient batching and preparation of data, making it ready for machine learning model training and evaluation.

    + + +
    +

    Data Splitting Module

    +

    This module orchestrates the division of the protein dataset into training, validation, and test sets. It employs advanced strategies, including sequence and structural similarity-based clustering (e.g., using MMseqs2 and Foldseek), to ensure robust data separation and prevent data leakage, crucial for unbiased model evaluation.

    +

    Related Classes/Methods:

    • `proteinflow.split` (0:0)
    • `proteinflow.split.utils` (0:0)
    • `proteinflow.split.split_data` (0:0)
    • `proteinflow.split._build_dataset_partition` (0:0)
    • `proteinflow.split._split_dataset_with_graphs` (0:0)
    • `proteinflow.split._get_split_dictionaries` (0:0)
    +
    + +
    +

    PyTorch Data Module

    +

    This module provides the necessary PyTorch-compatible `Dataset` and `DataLoader` classes, facilitating the seamless integration of processed protein data with deep learning models. It handles efficient data loading, batching, and preparation for training and evaluation.

    +

    Related Classes/Methods:

    • `proteinflow.data.torch` (0:0)
    • `proteinflow.data.torch.ProteinDataset` (242:1131)
    • `proteinflow.data.torch.ProteinLoader` (67:239)
    +
    + +
    +

    Protein Data Structure

    +

    This fundamental component defines the structure for encapsulating all relevant information for a single protein entry, including sequence, coordinates, chain IDs, and associated ligand data. It provides methods for parsing, validating, and extracting specific features, serving as the core data representation throughout the data preparation pipeline. `proteinflow.data.SAbDabEntry` inherits from `proteinflow.data.PDBEntry`, extending the base protein data structure for antibody-specific entries.

    +

    Related Classes/Methods:

    • `proteinflow.data` (0:0)
    • `proteinflow.data.utils` (0:0)
    • `proteinflow.data.PDBEntry` (0:0)
    • `proteinflow.data.SAbDabEntry` (0:0)
    • `proteinflow.data.utils.from_pickle` (0:0)
    • `proteinflow.data.utils.to_pdb` (0:0)
    • `proteinflow.data.utils.get_chains` (0:0)
    • `proteinflow.data.utils.get_sequence` (0:0)
    • `proteinflow.data.utils.get_coordinates` (0:0)
    • `proteinflow.data.utils.retrieve_ligands_from_pickle` (0:0)
    +
    + +
    +

    Ligand Processing Module

    +

    This module specializes in handling ligand-related data within protein entries. It includes functionalities for loading ligand information (e.g., SMILES strings) and performing chemical similarity-based clustering, which can be integrated into data splitting strategies.

    +

    Related Classes/Methods:

    • `proteinflow.ligand` (0:0)
    • `proteinflow.ligand._load_smiles` (653:678)
    • `proteinflow.ligand._merge_chains_ligands` (694:737)
    • `proteinflow.ligand._run_tanimoto_clustering` (983:1001)
    +
    + +
    +

    Splitting Utilities

    +

    This module provides a suite of helper functions that support the intricate logic within the `Data Splitting Module`. These utilities are essential for tasks such as finding correspondences between protein chains, loading PDB files, merging chains, and managing biounit information during the data splitting process.

    +

    Related Classes/Methods:

    • `proteinflow.split.utils` (0:0)
    • `proteinflow.split.utils._find_correspondences` (139:149)
    • `proteinflow.split.utils._load_pdbs` (72:99)
    • `proteinflow.split.utils._merge_chains` (25:69)
    • `proteinflow.split.utils._biounits_in_clusters_dict` (152:164)
    +
    + + +

    FAQ

    + + + + \ No newline at end of file diff --git a/.codeboarding/User_Interface_CLI_.html b/.codeboarding/User_Interface_CLI_.html new file mode 100644 index 00000000..5c5f09aa --- /dev/null +++ b/.codeboarding/User_Interface_CLI_.html @@ -0,0 +1,427 @@ + + + + + + CodeBoarding Analysis - django + + + + + + + +

    CodeBoarding Analysis - ProteinFlow

    + + + +
    + + + +
    + +
    + +

    Details

    +

    The CLI Interface is fundamental because it is the user's gateway to the entire ProteinFlow system. Without it, users would not be able to initiate or control any of the data pipeline operations. It abstracts away the underlying complexity of the data processing components, providing a simplified and unified command-line experience. Its role as an orchestrator and dispatcher is critical for coordinating the execution of various data-related tasks (downloading, generating, splitting) in a structured manner. The integration with Logging and Reporting is also vital, as it provides the necessary feedback loop for users to understand the status and outcomes of their initiated processes, making the system robust and user-friendly.

    + + +
    +

    CLI Interface

    +

    The CLI Interface serves as the primary command-line entry point for users to interact with the ProteinFlow data pipeline. Its fundamental role is to orchestrate the entire data processing workflow by translating user commands into specific actions. It acts as a dispatcher, invoking the appropriate backend functions from other core components such as the Data Downloader, Data Generator, and Data Splitter. Furthermore, it integrates with the Logging and Reporting component to provide operational feedback, status updates, and error summaries to the user, ensuring transparency and aiding in debugging. This component is crucial because it provides the user-facing control mechanism, making the complex data pipeline accessible and manageable.

    +

    Related Classes/Methods:

    • `proteinflow.cli` (18:20)
    +
    + +
    +

    Data Downloader

    +

    Handles the acquisition of data.

    +

    Related Classes/Methods:

    None

    +
    + +
    +

    Data Generator

    +

    Manages the synthesis or transformation of data.

    +

    Related Classes/Methods:

    None

    +
    + +
    +

    Data Splitter

    +

    Manages dataset partitioning and re-consolidation.

    +

    Related Classes/Methods:

    None

    +
    + +
    +

    Logging and Reporting

    +

    Provides operational feedback, status updates, and error summaries.

    +

    Related Classes/Methods:

    None

    +
    + + +

    FAQ

    + + + + \ No newline at end of file diff --git a/.codeboarding/on_boarding.html b/.codeboarding/on_boarding.html index a5e3a0d4..22d32050 100644 --- a/.codeboarding/on_boarding.html +++ b/.codeboarding/on_boarding.html @@ -1,101 +1,417 @@ - - - ProteinFlow Overview - - - + + + CodeBoarding Analysis - django + + + + + - - - - -

    Details

    - -

    The ProteinFlow project is structured around a streamlined pipeline for acquiring, processing, organizing, and preparing protein data for machine learning tasks, complemented by analysis and visualization capabilities. The architecture is designed to facilitate efficient handling of large biological datasets.

    - - -
    -graph LR - User_Interface_CLI_["User Interface (CLI)"] - Core_Data_Management["Core Data Management"] - Data_Preparation_for_ML["Data Preparation for ML"] - Analysis_Visualization["Analysis & Visualization"] - User_Interface_CLI_ -- "Initiates operations in" --> Core_Data_Management - User_Interface_CLI_ -- "Initiates operations in" --> Data_Preparation_for_ML - Core_Data_Management -- "Provides processed data to" --> Data_Preparation_for_ML - Core_Data_Management -- "Provides data to" --> Analysis_Visualization - click User_Interface_CLI_ href "https://github.com/CodeBoarding/GeneratedOnBoardings/blob/main/ProteinFlow/User_Interface_CLI_.md" "Details" - click Core_Data_Management href "https://github.com/CodeBoarding/GeneratedOnBoardings/blob/main/ProteinFlow/Core_Data_Management.md" "Details" - click Data_Preparation_for_ML href "https://github.com/CodeBoarding/GeneratedOnBoardings/blob/main/ProteinFlow/Data_Preparation_for_ML.md" "Details" - click Analysis_Visualization href "https://github.com/CodeBoarding/GeneratedOnBoardings/blob/main/ProteinFlow/Analysis_Visualization.md" "Details" -
    - -

    User Interface (CLI) [Expand]

    -

    The primary command-line interface that serves as the entry point for users to initiate and control the entire data pipeline. It orchestrates the execution of data acquisition, processing, and organization workflows.

    - - -

    Core Data Management [Expand]

    -

    This foundational component is responsible for the acquisition of raw protein data (PDB and SAbDab files), defining the core data structures for representing proteins and associated ligands, and performing the initial processing steps. This includes filtering, cleaning, and converting raw data into standardized ProteinEntry objects, handling quality checks, and managing ligand-specific details.

    - - -

    Data Preparation for ML [Expand]

    -

    Focuses on organizing and partitioning the processed protein data into distinct train, validation, and test sets, often employing clustering techniques to ensure diverse and representative splits. It also provides PyTorch-compatible Dataset and DataLoader classes for efficient batching and preparation of data, making it ready for machine learning model training and evaluation.

    - - -

    Analysis & Visualization [Expand]

    -

    Offers a comprehensive suite of tools for calculating various protein-related metrics (e.g., sequence similarity, language model perplexity) and for visualizing protein structures and animations. This component also manages optional external dependencies required for its advanced functionalities.

    - - -

    FAQ

    - +

    CodeBoarding Analysis - ProteinFlow

    + + + +
    + + + +
    + +
    + +

    Details

    +

    The `ProteinFlow` project is structured around a streamlined pipeline for acquiring, processing, organizing, and preparing protein data for machine learning tasks, complemented by analysis and visualization capabilities. The architecture is designed to facilitate efficient handling of large biological datasets.

    + + +
    +

    User Interface (CLI) [Expand]

    +

    The primary command-line interface that serves as the entry point for users to initiate and control the entire data pipeline. It orchestrates the execution of data acquisition, processing, and organization workflows.

    +

    Related Classes/Methods:

    • `proteinflow.cli` (18:20)
    +
    + +
    +

    Core Data Management [Expand]

    +

    This foundational component is responsible for the acquisition of raw protein data (PDB and SAbDab files), defining the core data structures for representing proteins and associated ligands, and performing the initial processing steps. This includes filtering, cleaning, and converting raw data into standardized `ProteinEntry` objects, handling quality checks, and managing ligand-specific details.

    +

    Related Classes/Methods:

    • `proteinflow.data` (1:1)
    • `proteinflow.data.PDBEntry` (1:1)
    • `proteinflow.data.SAbDabEntry` (1:1)
    • `proteinflow.download` (1:1)
    • `proteinflow.processing` (1:1)
    • `proteinflow.ligand` (1:1)
    +
    + +
    +

    Data Preparation for ML [Expand]

    +

    Focuses on organizing and partitioning the processed protein data into distinct train, validation, and test sets, often employing clustering techniques to ensure diverse and representative splits. It also provides PyTorch-compatible `Dataset` and `DataLoader` classes for efficient batching and preparation of data, making it ready for machine learning model training and evaluation.

    +

    Related Classes/Methods:

    • `proteinflow.split` (1:1)
    • `proteinflow.data.torch` (1:1)
    +
    + +
    +

    Analysis & Visualization [Expand]

    +

    Offers a comprehensive suite of tools for calculating various protein-related metrics (e.g., sequence similarity, language model perplexity) and for visualizing protein structures and animations. This component also manages optional external dependencies required for its advanced functionalities.

    +

    Related Classes/Methods:

    • `proteinflow.metrics` (1:1)
    • `proteinflow.visualize` (1:1)
    • `proteinflow.extra` (1:1)
    +
    + + +

    FAQ

    + + - + \ No newline at end of file diff --git a/dev/integrate_html_files.py b/dev/integrate_html_files.py new file mode 100644 index 00000000..00cf170f --- /dev/null +++ b/dev/integrate_html_files.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +""" +Script to integrate standalone HTML files into the pdoc template structure. +This script generates wrapper pages that main # Remove any existing "Additional Resources" sections to avoid duplication + # Keep the Codeboardings section as it's managed by the html.mako template + sidebar = re.sub(r'
  • Additional Resources

    .*?
  • ', '', sidebar, flags=re.DOTALL)n the sidebar navigation. +""" + +import os +import re +import glob +from pathlib import Path + + +def extract_html_content(filepath): + """Extract content and metadata from HTML file.""" + try: + with open(filepath, 'r', encoding='utf-8') as f: + content = f.read() + + # Extract title + title_match = re.search(r']*>(.*?)', content, re.IGNORECASE | re.DOTALL) + title = title_match.group(1).strip() if title_match else "Documentation" + + # Extract body content + body_match = re.search(r']*>(.*?)', content, re.DOTALL | re.IGNORECASE) + if body_match: + body_content = body_match.group(1).strip() + else: + # If no body tags, use everything after head + head_end = content.find('') + if head_end != -1: + body_content = content[head_end + 7:].strip() + # Remove html and body tags if present + body_content = re.sub(r']*>', '', body_content, flags=re.IGNORECASE) + body_content = re.sub(r']*>', '', body_content, flags=re.IGNORECASE) + else: + body_content = content + + # Extract custom styles from the head section + style_pattern = r']*>(.*?)' + style_matches = re.findall(style_pattern, content, re.DOTALL | re.IGNORECASE) + + # Filter out potentially conflicting styles but keep useful ones + useful_styles = [] + for style in style_matches: + # Keep styles that seem specific to content rather than layout + if any(keyword in style.lower() for keyword in ['badge', 'mermaid', 'highlight', 'code', 'pre', 'table', 'img']): + useful_styles.append(style.strip()) + + styles = '\n'.join(useful_styles) if useful_styles else '' + + # Extract any scripts that might be needed (like mermaid) + script_pattern = r']*(?:src=["\'][^"\']*["\']|type=["\'][^"\']*["\'])*[^>]*>.*?' 
+ script_matches = re.findall(script_pattern, content, re.DOTALL | re.IGNORECASE) + scripts = '\n'.join(script_matches) if script_matches else '' + + # Also extract script tags without closing tags (like imports) + import_script_matches = re.findall(r']*type=["\']module["\'][^>]*>.*?', content, re.DOTALL | re.IGNORECASE) + if import_script_matches: + scripts = '\n'.join(import_script_matches) + '\n' + scripts + + return { + 'title': title, + 'content': body_content, + 'styles': styles, + 'scripts': scripts + } + except Exception as e: + print(f"Error reading {filepath}: {e}") + return None + + +def generate_pdoc_template(): + """Get the base pdoc template structure from existing docs.""" + index_path = "docs/index.html" + if not os.path.exists(index_path): + return None + + with open(index_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Extract the base template structure + # We'll use everything up to the main content section + content_start = content.find('
    ') + if content_start == -1: + return None + + before_content = content[:content_start] + + # Find the sidebar + sidebar_start = content.find('', sidebar_start) + 6 + if sidebar_start == -1 or sidebar_end == -1: + return None + + sidebar = content[sidebar_start:sidebar_end] + + # Find the footer + footer_start = content.find('