diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..309cd39 Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index f7eaee2..ae2c3dc 100644 --- a/.gitignore +++ b/.gitignore @@ -189,3 +189,8 @@ examples/new_test_examples/* # Ignore new tools examples examples/new_tools/ + + +#ignore test output data +output* +*output \ No newline at end of file diff --git a/ProtPeptigram/DataProcessor.py b/ProtPeptigram/DataProcessor.py index 486b45f..bb17fa8 100644 --- a/ProtPeptigram/DataProcessor.py +++ b/ProtPeptigram/DataProcessor.py @@ -75,14 +75,15 @@ def load_peaks_data(self, file_path: str) -> pd.DataFrame: try: self.peaks_data = pd.read_csv(file_path) console.log(f"Loaded {len(self.peaks_data)} peptide entries from {file_path}", style="bold green") - + self.peaks_data = self.peaks_data.rename(columns=lambda x: str(x).capitalize()) # Check required columns required_cols = ['Peptide', 'Accession'] missing_cols = [col for col in required_cols if col not in self.peaks_data.columns] if missing_cols: raise ValueError(f"Missing required columns: {', '.join(missing_cols)}") - + + # Find intensity columns self.intensity_cols = [col for col in self.peaks_data.columns if col.startswith(self.sample_prefix)] @@ -281,7 +282,7 @@ def filter_and_format_data(self, formatted_rows = [] # Extract sample names from intensity columns - sample_names = [col.replace(self.sample_prefix, '') for col in self.intensity_cols] + sample_names = [re.sub(r'^[_\-\*]+', '', col.replace(self.sample_prefix, '')) for col in self.intensity_cols] # Process each peptide for _, row in data.iterrows(): diff --git a/ProtPeptigram/__init__.py b/ProtPeptigram/__init__.py index 00ab764..8a9dd20 100644 --- a/ProtPeptigram/__init__.py +++ b/ProtPeptigram/__init__.py @@ -23,6 +23,6 @@ #controll for __all__ to limit what is imported when using 'from module import *' # __all__ = ['PeptideDataProcessor', 'ImmunoViz'] -__version__ = "1.0.1-dev" +__version__ = "1.1.0-dev" __author__ = "Sanjay Krishna,Prithvi Munday,Chen Li" __email__ = "sanjay.sondekoppagopalakrishna@monash.edu" \ No newline at end of file diff --git a/ProtPeptigram/runner.py b/ProtPeptigram/runner.py index d0a0431..89a3270 100644 --- a/ProtPeptigram/runner.py +++ b/ProtPeptigram/runner.py @@ -106,7 +106,6 @@ def run_pipeline( Intensity threshold for filtering peptides (default: 0.0) min_samples : int, optional Minimum number of samples a peptide must be present in (default: 1) - Returns: -------- tuple: (PeptideDataProcessor, ImmunoViz) - The processor and visualization objects @@ -187,11 +186,13 @@ def run_pipeline( title=f"Peptide-Protein alignment visualisation - {prot}", color_by_protein_and_intensity=False, intensity_cmaps=["Blues", "Reds", "Greens", "Purples"], - protein_cmap="Set1", + protein_cmap="Set1", external_legend=True, highlight=True, auto_highlight=True, highlight_alpha=0.3, + use_sample_color_bars=True, + sample_bar_width=6, dpi=120 ) diff --git a/ProtPeptigram/viz.py b/ProtPeptigram/viz.py index 058b11d..06a12e2 100644 --- a/ProtPeptigram/viz.py +++ b/ProtPeptigram/viz.py @@ -48,12 +48,23 @@ def plot_peptigram( color_by: str = None, figsize: Tuple[int, int] = (12, 10), title: Optional[str] = None, + y_desnity_forntsize: int = 8, + y_desnity_forntcolour: str = "#333333", + y_lab_ticksize: int = 8, + y_sample_fontsize: int = 8, + y_sample_color: Optional[Union[list, str]] = "#333333", + max_sample_name_length: int = 15, + sample_name_wrap: bool = True, + use_sample_color_bars: bool = False, + sample_colors: Optional[List[str]] = None, + sample_bar_width: float = 0.5, x_lab_forntsize: int = 12, - y_lab_forntsize: int = 12, xticks_font: int = 12, xticks_color: str = "#333333", xticks_rotation: int = 0, annotate: bool = True, + legend_titleFontsize: int = 10, + legend_fontsize: int = 8, min_intensity: Optional[float] = None, highlight_regions: Optional[List[Tuple[int, int]]] = None, auto_highlight: bool = True, @@ -61,6 +72,7 @@ def plot_peptigram( auto_highlight_threshold: float = 0.8, highlight: bool = True, color_by_protein_and_intensity: bool = False, + colour_by_text: bool = False, intensity_color_scale: float = 0.7, intensity_cmaps: Union[str, List[str]] = "viridis", protein_cmap: str = "tab10", @@ -86,10 +98,30 @@ def plot_peptigram( Figure size (width, height) in inches (default: (12, 10)) title : str, optional Title for the plot (default: "Protein Peptide Coverage") - x_lab_forntsize : int, optional + x_lab_fontsize : int, optional Font size for x-axis labels Amino acid positions(default: 12) - y_lab_forntsize : int, optional - Font size for y-axis labels Sample names and Density plot(default: 12) + y_desnity_forntsize : int, optional + Font size for y-axis labels Density plot(default: 8) + y_desnity_forntcolour : str, optional + Color for y-axis labels Density plot (default: "#333333") + y_sample_color : str + Color for y-axis sample names (default: "#333333") + y_lab_ticksize : int, optional + Font size for y-axis tick labels (default: 8) + y_sample_fontsize : int, optional + Font size for y-axis sample names (default: 8) + max_sample_name_length : int, optional + Maximum length for sample names on the y-axis (default: 8) + sample_name_wrap : bool, optional + Whether to wrap long sample names (default: True) + use_sample_color_bars : bool, optional + Whether to use colored bars for sample names (default: False) + sample_colors : List[str], optional + List of colors for sample bars (if None, default colors are used) + sample_bar_width : float, optional + Width of sample color bars (default: 0.02) + x_lab_forntsize : int, optional + Font size for x-axis labels (default: 12) xticks_font : int, optional Font size for x-axis tick labels (default: 12) xticks_color : str, optional @@ -112,6 +144,8 @@ def plot_peptigram( Whether to apply highlighting at all (default: True) color_by_protein_and_intensity : bool, optional Whether to color peptides by both protein and intensity (default: False) + colour_by_text : bool, optional + Whether to add text indicating the coloring method (default: False) intensity_color_scale : float, optional How much the intensity should influence the color (0.0-1.0) (default: 0.7) intensity_cmaps : str or List[str], optional @@ -176,7 +210,6 @@ def plot_peptigram( plt.rcParams['ytick.direction'] = 'out' # Set colors and style elements - density_color = '#555555' # Darker gray for density plots grid_color = '#e5e5e5' # Very light gray grid separator_color = '#cccccc' # Light gray separator background_color = '#ffffff' # White background @@ -357,8 +390,8 @@ def plot_peptigram( # Styling for the top panel axs[0].set_xlim(xlim) - axs[0].set_ylabel('Density', color=text_color, - fontweight='normal', fontsize=y_lab_forntsize) + axs[0].set_ylabel('Density', color=y_desnity_forntcolour, + fontweight='normal', fontsize=y_desnity_forntsize) axs[0].spines['top'].set_visible(False) axs[0].spines['right'].set_visible(False) # Remove all ticks @@ -368,7 +401,7 @@ def plot_peptigram( max_density = max(all_proteins_density) axs[0].set_yticks([0, max_density/2, max_density]) axs[0].set_yticklabels( - [0, f"{max_density/2:.0f}", f"{max_density:.0f}"], fontsize=8, color="lightgray") + [0, f"{max_density/2:.0f}", f"{max_density:.0f}"], fontsize=y_lab_ticksize, color="lightgray") # Create legend in the designated area if external if external_legend: @@ -378,32 +411,29 @@ def plot_peptigram( patch = plt.Line2D( [0], [0], color=protein_to_color[protein_id], lw=4, label=protein_id) legend_handles.append(patch) - - # Add legend to the separate legend axis - protein_legend = legend_ax.legend( - handles=legend_handles, - loc='upper left', - fontsize=9, - frameon=True, - framealpha=0.7, - facecolor=background_color, - edgecolor=grid_color, - title='Proteins', - title_fontsize=10 - ) - - # Make sure legend title is properly formatted - protein_legend.get_title().set_fontweight('bold') + # Add legend to the separate legend axis + protein_legend = legend_ax.legend( + handles=legend_handles, + loc='upper left', + fontsize=legend_fontsize, + frameon=True, + framealpha=0.7, + facecolor=background_color, + edgecolor=grid_color, + title='Protein', + title_fontsize=legend_titleFontsize + ) + protein_legend.get_title().set_fontweight('bold') else: # Create protein legend in the main plot protein_legend = axs[0].legend( loc='upper right', - fontsize=8, + fontsize=legend_fontsize, framealpha=0.7, facecolor=background_color, edgecolor='none', title='Proteins', - title_fontsize=9 + title_fontsize=legend_titleFontsize, ) # Set the font weight of the legend title to normal for a consistent appearance protein_legend.get_title().set_fontweight('normal') @@ -567,10 +597,140 @@ def plot_peptigram( # Set plot limits and labels ax.set_ylim(-max_height, 0) ax.set_xlim(xlim) + + # Add colored vertical bar for each sample if requested + if use_sample_color_bars: + if sample_colors is None: + # Generate colors automatically + sample_cmap = plt.cm.get_cmap('tab10') + sample_color = sample_cmap(i % sample_cmap.N) + else: + sample_color = sample_colors[i % len(sample_colors)] + + # Add vertical colored bar on the left + bar_x = xlim[0] #-5 # Position slightly left of the plot + ax.axvline(bar_x, ymin=0, ymax=1, color=sample_color, + linewidth=sample_bar_width, alpha=0.8, solid_capstyle='butt') + + # Remove y-axis label if using color bars + ax.set_ylabel('') + + # Store sample info for legend + if i == 0: # Initialize on first iteration + sample_legend_handles = [] + + # Create legend handle + sample_legend_handles.append( + plt.Line2D([0], [0], color=sample_color, lw=4, label=group) + ) + # Add styled group label - ensure it's visible and consistent + elif sample_name_wrap and len(group) > max_sample_name_length: + # Calculate wrap width based on max_height + # More height = more space = wider wrap width + import textwrap + + # Base wrap width, adjusted by plot height + # Higher max_height allows longer lines + dynamic_wrap_width = max(8, min(max_sample_name_length, max_height // 2)) + + wrapped_group = '\n'.join(textwrap.wrap(group, width=dynamic_wrap_width)) + ax.set_ylabel(wrapped_group, fontweight='normal', + color=y_sample_color, fontsize=y_sample_fontsize) + + else: + ax.set_ylabel(group, fontweight='normal', + color=y_sample_color, fontsize=y_sample_fontsize) + # Add subtitle for coloring method + coloring_method = "" + if color_by_protein_and_intensity: + coloring_method = "Colored by protein and intensity" + elif color_by is None: + coloring_method = None + elif color_by == 'intensity': + coloring_method = "Colored by intensity" + elif color_by == 'protein': + coloring_method = "Colored by protein" + elif color_by == 'count': + coloring_method = "Colored by detection count" + elif color_by == 'length': + coloring_method = "Colored by peptide length" + - # Add styled group label - ensure it's visible and consistent - ax.set_ylabel(group, fontweight='normal', - color=text_color, fontsize=y_lab_forntsize) + + # Set Sample color bar legend if using sample color bars + if use_sample_color_bars and external_legend: + # Calculate scaling factor based on plot height + height_scale_factor = len(groups) / 6.0 # Normalize to 6 groups as baseline + height_scale_factor = max(0.5, min(height_scale_factor, 2.0)) # Clamp between 0.5 and 2.0 + + # Calculate space needed for protein legend, scaled by plot height + base_item_height = 1 * height_scale_factor # Scale item height + base_title_padding = 0.06 * height_scale_factor # Scale title/padding + + protein_legend_height = len(protein_ids) * base_item_height + base_title_padding + + # Position sample legend below protein legend with scaled padding + padding = 0.3 * height_scale_factor + sample_legend_y = 1.0 - protein_legend_height - padding + + # Ensure sample legend doesn't go below available space + # sample_legend_y = max(sample_legend_y, 0.1) + + # Create sample legend with calculated position + sample_legend = legend_ax.legend( + handles=sample_legend_handles, + bbox_to_anchor=(0, sample_legend_y), + loc='upper left', + fontsize=legend_fontsize, + frameon=True, + framealpha=0.7, + facecolor=background_color, + edgecolor=grid_color, + title='Samples', + title_fontsize=legend_titleFontsize + ) + sample_legend.get_title().set_fontweight('bold') + + # Add the protein legend back (since matplotlib replaces it) + legend_ax.add_artist(protein_legend) + + # Add notes below sample legend + sample_items = len(groups) + sample_legend_height = sample_items * base_item_height + base_title_padding + current_note_y = sample_legend_y - sample_legend_height - 0.03 # Start position for notes + + # Prepare notes list + notes = [] + + # Add coloring method note if enabled + if coloring_method and colour_by_text: + notes.append(f"Coloring: {coloring_method}") + + # Add auto-detected regions note if applicable + if auto_regions and len(auto_regions) > 0: + regions_str = ", ".join([f"{start}-{end}" for start, end in auto_regions]) + notes.append(f"High density regions: {regions_str}") + + # Add all notes + for i, note in enumerate(notes): + note_y_pos = current_note_y - (i * 0.04) # Space between notes + + # Ensure note doesn't go below available space + # note_y_pos = max(note_y_pos, 0.02) + + # Determine color for the note + note_color = highlight_color if "High density regions" in note else text_color + + # Add the note text + legend_ax.text(0.0, note_y_pos, f"Note: {note}", + transform=legend_ax.transAxes, + fontsize=legend_fontsize-1, + fontstyle='italic', + color=note_color, + ha='left', va='top', + wrap=True) + + # Set y-ticks and labels ax.set_yticks([]) ax.spines['top'].set_visible(False) ax.spines['right'].set_visible(False) @@ -613,35 +773,20 @@ def plot_peptigram( plt.figtext(0.5, 0.94, protein_str, ha='center', color=text_color, fontsize=9, fontstyle='italic') - # Add subtitle for coloring method - coloring_method = "" - if color_by_protein_and_intensity: - coloring_method = "Colored by protein and intensity" - elif color_by is None: - coloring_method = None - elif color_by == 'intensity': - coloring_method = "Colored by intensity" - elif color_by == 'protein': - coloring_method = "Colored by protein" - elif color_by == 'count': - coloring_method = "Colored by detection count" - elif color_by == 'length': - coloring_method = "Colored by peptide length" - - - if coloring_method: - y_pos = 0.92 if len(protein_ids) <= 1 else 0.90 - plt.figtext(0.5, y_pos, coloring_method, ha='center', color=text_color, - fontsize=9, fontstyle='italic') - - # Add subtitle for auto-detected regions if applicable - if auto_regions and len(auto_regions) > 0: - regions_str = ", ".join( - [f"{start}-{end}" for start, end in auto_regions]) - y_pos = 0.90 if len( - protein_ids) <= 1 and not coloring_method else 0.88 - plt.figtext(0.5, y_pos, f"Auto-highlighted regions: {regions_str}", - ha='center', fontsize=9, fontstyle='italic', color=highlight_color) + if use_sample_color_bars == False: + if coloring_method and colour_by_text: + y_pos = 0.92 if len(protein_ids) <= 1 else 0.90 + plt.figtext(0.5, y_pos, coloring_method, ha='center', color=text_color, + fontsize=9, fontstyle='italic') + + # Add subtitle for auto-detected regions if applicable + if auto_regions and len(auto_regions) > 0: + regions_str = ", ".join( + [f"{start}-{end}" for start, end in auto_regions]) + y_pos = 0.90 if len( + protein_ids) <= 1 and not coloring_method else 0.88 + plt.figtext(0.5, y_pos, f"High density regions: {regions_str}", + ha='center', fontsize=9, fontstyle='italic', color=highlight_color) # Handle x-tick labels for all axes except the last one for i in range(len(axs)): diff --git a/__init__.py b/__init__.py index 4ba8dca..6dc9cc6 100644 --- a/__init__.py +++ b/__init__.py @@ -1 +1,17 @@ -# import peptigram \ No newline at end of file +# import peptigram +# ProtPeptigram/__init__.py +# import sys +# import os + +# # Add the parent directory to the path so we can import main.py from root +# sys.path.insert(0, os.path.dirname(os.path.dirname(__file__))) + +# try: +# from main import main +# except ImportError: +# # If main.py is not in root, define a placeholder +# def main(): +# print("Main function not found") +# return 1 + +# __all__ = ['main'] \ No newline at end of file diff --git a/data/JCI146771_Mouse_peptides_peaks_online.csv b/data/JCI146771_Mouse_peptides_peaks_online.csv index 01d5940..a497b68 100644 --- a/data/JCI146771_Mouse_peptides_peaks_online.csv +++ b/data/JCI146771_Mouse_peptides_peaks_online.csv @@ -1,4 +1,4 @@ -Peptide,-10lgP,Mass,Length,ppm,m/z,RT,Accession,PTM,AScore,Area Sample 16,Intensity_Hep_B10BR_28-14-8s,Intensity_Hep_B10BR_K9-178,Intensity_178-3 skin graft_28-14-8s,Intensity_178-3 skin graft_K9-178,Intensity_178-3 spleen_K9-178,Intensity_178-3 spleen_Y-3 +Peptide,-10lgP,Mass,Length,ppm,m/z,RT,Accession,PTM,AScore,Area Sample 16,Intensity_Hep_B10BR_28-14-8s,intensity_Hep_B10BR_K9-178,Intensity_178-3 skin graft_28-14-8s,Intensity_178-3 skin graft_K9-178,Intensity_178-3 spleen_K9-178,Intensity_178-3 spleen_Y-3 TPVITGAPYEYR,55.75,1365.6929,12,-0.6,683.8533,41.7,i#CONTAM#RT-pepF,,,9.94E+08,6.10E+08,6.70E+08,7.41E+08,7.53E+08,1.26E+09,7.62E+08 YILAGVENSK,49.2,1092.5815,10,0.4,1093.5892,35.74,i#CONTAM#RT-pepD,,,5.77E+08,5.92E+08,5.10E+08,4.62E+08,5.66E+08,9.02E+08,7.54E+08 GTFIIDPGGVIR,50.91,1243.6924,12,0.6,415.5717,57.95,i#CONTAM#RT-pepI,,,4.77E+08,3.35E+08,3.56E+08,4.68E+08,5.75E+08,8.92E+08,3.69E+08 diff --git a/main.py b/main.py deleted file mode 100644 index 369353f..0000000 --- a/main.py +++ /dev/null @@ -1,178 +0,0 @@ -""" Peptigram: mapping peptide to source protein""" - -import argparse -import sys -from multiprocessing import Pool -from rich.text import Text -from rich_argparse import RichHelpFormatter -from ProtPeptigram.logger import CONSOLE -from ProtPeptigram import __author__, __version__ -from ProtPeptigram.runner import run_pipeline - - -def _welcome(): - """Display application banner.""" - - tool_icon = r""" - ____ __ ____ __ _ - / __ \_________ / /_ / __ \___ ____ / /_(_)___ __________ _____ ___ - / /_/ / ___/ __ \/ __/_____/ /_/ / _ \/ __ \/ __/ / __ `/ ___/ __ `/ __ `__ \ - / ____/ / / /_/ / /_/_____/ ____/ __/ /_/ / /_/ / /_/ / / / /_/ / / / / / / -/_/ /_/ \____/\__/ /_/ \___/ .___/\__/_/\__, /_/ \__,_/_/ /_/ /_/ - /_/ /____/ - """ - - CONSOLE.print(tool_icon, style="blue") - - - -def _print_credits(credits=False): - """Print software credits to terminal.""" - text = Text() - text.append("\n") - if credits: - text.append("Please cite: \n", style="bold") - text.append( - "GibbsCluster - 2.0 (Simultaneous alignment and clustering of peptide data)\n", - style="bold link https://services.healthtech.dtu.dk/services/GibbsCluster-2.0/", - ) - text.append( - "Seq2Logo - 2.0 (Visualization of amino acid binding motifs)\n", - style="bold link https://services.healthtech.dtu.dk/services/Seq2Logo-2.0/", - ) - text.append( - "MHC Motif Atlas (MHC motif PSM matrics are genrated using mhcmotifatlas please cite [link])\n\n", - style="bold link http://mhcmotifatlas.org/home", - ) - else: - text.append( - "Prot-Petigram", style="bold link https://www.monash.edu/research/compomics/" - ) - text.append(f" (v{__version__})\n", style="bold") - if credits: - text.append( - "HLA motif finder pipeline for identifying peptide motif immunopeptididomic data.\n", - style="italic", - ) - text.append("Developed at Li Lab / Purcell Lab, Monash University, Australia.\n") - text.append("Please cite: ") - if credits: - text.append( - "Sanjay Krishna, Nathon Craft & Chen Li et al. bioRxiv (2024)", - style="link https://www.monash.edu/research/compomics/", - ) - else: - text.append( - "Sanjay Krishna & Chen Li et al. bioRxiv (2024)", - style="ttps://www.monash.edu/research/compomics/", - ) - text.append("\n") - if credits: - text.stylize("#006cb5") - CONSOLE.print(text) - - -def main(): - """Main function to parse CLI arguments and execute the pipeline.""" - _welcome() - _print_credits() - parser = argparse.ArgumentParser( - description="Prot-Petigram: Mapping peptides to source protein 🧬🧬🧬.", - formatter_class=lambda prog: RichHelpFormatter(prog, max_help_position=42), - ) - - parser.add_argument( - "-i", - "--input", - type=str, - help="input path csv file from peaks output" - ) - - parser.add_argument( - "-o", - "--output", - type=str, - help="output dir path to save the processed data" - ) - parser.add_argument( - "-f", - "--fasta", - type=str, - help="fasta file containing protein sequences" - ) - parser.add_argument( - "-r", - "--regex", - type=str, - help=r"regex pattern to filter protein names. e.g., '(\W+\d+)\|' to extract protein names from Uniprot IDs" - ) - - parser.add_argument( - "-th", - "--threshold", - type=float, - default=0.0, - help="intensity threshold for filtering peptides" - ) - - parser.add_argument( - "-ms", - "--min-samples", - type=int, - default=1, - help="minimum number of samples a peptide must be present in" - ) - - parser.add_argument( - "-pl", - "--protein_list", - type=str, - help="path to a text file containing a list of protein IDs to filter" - ) - - parser.add_argument( - "-tp", - "--top", - type=str, - default=5, - help="fasta file containing protein sequences" - ) - - parser.add_argument( - "-v", - "--version", - action="version", - version=f"Prot-Petigram v{__version__}" - ) - - parser.add_argument( - "-c", - "--credits", - action="store_true", - help="Print software credits" - ) - - args = parser.parse_args() - if args.credits: - _print_credits(credits=True) - sys.exit(0) - - if args.input and args.fasta: - run_pipeline( - csv_path=args.input, - fasta_path=args.fasta, - output_dir=args.output, - top=args.top, - protein_list=args.protein_list, - regex_pattern=args.regex, - intensity_threshold=args.threshold, - min_samples=args.min_samples - ) - else: - parser.error("Both input CSV file (-i/--input) and FASTA file (-f/--fasta) are required.") - - - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/test_output.zip b/test_output.zip new file mode 100644 index 0000000..fd0350c Binary files /dev/null and b/test_output.zip differ