update bases2fastq MultiQC module

Mo Samman · Mo Samman · commit f4c7a0d464f0 · 2025-08-15T17:17:28.000-04:00
- Sequencing Run QC Metric Table
	- Display "# Polonies" as full number
- Sequencing Run Yield
	- Change x-axis label
- Run Base Quality Histogram
	- Add x-axis label
- Quality Metrics By Cycle
	- Update description
- (Project) Sequencing QC metrics table
	- Display "# Polonies" as full number
- Sample QC Metrics Table
	- Display "# Polonies" as full number
	- Add "Percent Mismatch"
- Per Cycle Base Content
	- Add per sample buttons
	- Update description
	- Update x-axis and y-axis labels
- Per Cycle Adapter Content
	- Update description
- Per Sample GC Histogram
	- Update x-axis label
	- Update y-axis label
	- Add per sample buttons
diff --git a/multiqc/modules/bases2fastq/plot_project_runs.py b/multiqc/modules/bases2fastq/plot_project_runs.py
@@ -22,11 +22,10 @@ def tabulate_project_run_stats(run_data, color_dict):
 
     headers = {}
     headers["num_polonies_run"] = {
-        "title": f"# Polonies ({config.base_count_prefix})",
-        "description": f"The total number of polonies that are calculated for the run ({config.base_count_desc})",
+        "title": "# Polonies",
+        "description": "The total number of polonies that are calculated for the run",
         "min": 0,
         "scale": "RdYlGn",
-        "shared_key": "base_count",
     }
     headers["percent_assigned_run"] = {
         "title": "% Assigned Reads",
diff --git a/multiqc/modules/bases2fastq/plot_runs.py b/multiqc/modules/bases2fastq/plot_runs.py
@@ -54,7 +54,7 @@ def plot_run_stats(run_data, color_dict):
     pconfig = {
         "data_labels": [
             {"name": "Number of Polonies", "ylab": "Number of Polonies", "format": "{d}"},
-            {"name": "Yield (Gb)", "ylab": "Gb"},
+            {"name": "Yield (Gb)", "ylab": "Yield"},
         ],
         "cpswitch": True,
         "stacking": "normal",
@@ -99,11 +99,10 @@ def tabulate_run_stats(run_data, color_dict):
 
     headers = {}
     headers["num_polonies_run"] = {
-        "title": f"# Polonies ({config.base_count_prefix})",
-        "description": f"The total number of polonies that are calculated for the run. ({config.base_count_desc})",
+        "title": "# Polonies",
+        "description": "The total number of polonies that are calculated for the run.",
         "min": 0,
         "scale": "RdYlGn",
-        "shared_key": "base_count",
     }
     headers["percent_assigned_run"] = {
         "title": "% Assigned Reads",
@@ -221,6 +220,7 @@ def plot_base_quality_hist(run_data, color_dict):
         "id": "per_run_bq_hist",
         "title": "bases2fastq: Quality Histograms",
         "ylab": "Percentage",
+        "xlab": "Q score",
     }
     plot_html = linegraph.plot(plot_content, pconfig=pconfig)
     plot_name = "Run Base Quality Histogram"
@@ -347,10 +347,9 @@ def plot_base_quality_by_cycle(run_data, color_dict):
     plot_html = linegraph.plot(plot_content, pconfig=pconfig)
     plot_name = "Quality Metrics By Cycle"
     anchor = "per_cycle_quality"
-    description = "Per run base qualities by cycle"
+    description = "Per run base qualities by cycle. Read 1 and Read 2 are separated by a red dashed line."
     helptext = """
     This section plots the base qualities by each instrument cycle.\n
-    Choose between Median Quality, Mean Quality, Percent Q30 or Percentage Q40 per cycle.\n
-    Read 1 and Read 2 are separated by a red dashed line.
+    Choose between Median Quality, Mean Quality, Percent Q30 or Percentage Q40 per cycle.
     """
     return plot_html, plot_name, anchor, description, helptext, plot_content
diff --git a/multiqc/modules/bases2fastq/plot_samples.py b/multiqc/modules/bases2fastq/plot_samples.py
@@ -20,6 +20,7 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s
         general_stats.update({"mean_base_quality_sample": sample_data[s_name]["QualityScoreMean"]})
         general_stats.update({"percent_q30_sample": sample_data[s_name]["PercentQ30"]})
         general_stats.update({"percent_q40_sample": sample_data[s_name]["PercentQ40"]})
+        general_stats.update({"percent_mismatch": sample_data[s_name]["PercentMismatch"]})
         plot_content.update({s_name: general_stats})
 
     headers = {}
@@ -37,11 +38,10 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s
         "scale": False,
     }
     headers["num_polonies_sample"] = {
-        "title": f"# Polonies ({config.base_count_prefix})",
-        "description": f"The total number of polonies that are calculated for the run. ({config.base_count_desc})",
+        "title": "# Polonies",
+        "description": "The total number of polonies that are calculated for the run.",
         "min": 0,
         "scale": "Blues",
-        "shared_key": "base_count",
     }
     headers["yield_sample"] = {
         "title": "Yield (Gb)",
@@ -70,6 +70,14 @@ def tabulate_sample_stats(sample_data, group_lookup_dict, project_lookup_dict, s
         "scale": "RdYlGn",
         "suffix": "%",
     }
+    headers["percent_mismatch"] = {
+        "title": "Percent Mismatch",
+        "description": "The percentage of mismatching reads for the sample.",
+        "max": 100,
+        "min": 0,
+        "scale": "RdYlGn",
+        "suffix": "%",
+    }
 
     pconfig = {"id": "sample_qc_metric_table", "title": "Sample QC Metrics Table", "no_violin": True}
 
@@ -96,18 +104,20 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c
     """Create the epic HTML for the FastQC sequence content heatmap"""
 
     # Prep the data
-    data = dict()
+    all_data = dict()
+    plot_content = [all_data]
 
     r1r2_split = 0
     for s_name in sorted(sample_data.keys()):
-        paired_end = True if len(sample_data[s_name]["Reads"]) > 1 else False
         for base in "ACTG":
             base_s_name = "__".join([s_name, base])
-            data[base_s_name] = {}
+            all_data[base_s_name] = {}
             R1 = sample_data[s_name]["Reads"][0]["Cycles"]
             r1r2_split = max(r1r2_split, len(R1))
 
     for s_name in sorted(sample_data.keys()):
+        paired_end = True if len(sample_data[s_name]["Reads"]) > 1 else False
+
         R1 = sample_data[s_name]["Reads"][0]["Cycles"]
         for cycle in range(len(R1)):
             base_no = cycle + 1
@@ -116,7 +126,7 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c
 
             for base in "ACTG":
                 base_s_name = "__".join([s_name, base])
-                data[base_s_name].update(
+                all_data[base_s_name].update(
                     {base_no: float(R1[cycle]["BaseComposition"][base] / float(tot)) * 100.0 if tot > 0 else None}
                 )
 
@@ -128,15 +138,32 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c
 
                 for base in "ACTG":
                     base_s_name = "__".join([s_name, base])
-                    data[base_s_name].update(
+                    all_data[base_s_name].update(
                         {base_no: float(R2[cycle]["BaseComposition"][base] / float(tot)) * 100.0 if tot > 0 else None}
                     )
 
-    plot_content = data
+    default_label = {
+        "name": "All",
+        "xlab": "Cycle",
+        "ylab": "Percentage of total reads",
+    }
+    data_labels = [
+        default_label,
+    ]
+    for s_name in sorted(sample_data.keys()):
+        sample_plot_data = dict()
+        for base in "ACTG":
+            base_s_name = "__".join([s_name, base])
+            sample_plot_data[base_s_name] = all_data[base_s_name]
+        plot_content.append(sample_plot_data)
+        data_labels.append({
+            "name": s_name,
+            "xlab": default_label["xlab"],
+            "ylab": default_label["ylab"],
+        })
 
     pconfig = {
-        "xlab": "cycle",
-        "ylab": "Percentage",
+        "data_labels": data_labels,
         "x_lines": [{"color": "#FF0000", "width": 2, "value": r1r2_split, "dashStyle": "dash"}],
         "colors": color_dict,
         "ymin": 0,
@@ -147,8 +174,8 @@ def sequence_content_plot(sample_data, group_lookup_dict, project_lookup_dict, c
     plot_name = "Per Cycle Base Content"
     anchor = "base_content"
     description = """
-    Percentage of unidentified bases ("N" bases) by each sequencing cycle.
-    Read 1 and Read 2 are separated by a red dashed line
+    Base composition per sample per cycle.
+    Read 1 and Read 2 are separated by a red dashed line.
     """
     helptext = """
     If a sequencer is unable to make a base call with sufficient confidence then it will
@@ -236,6 +263,17 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s
     Plot GC Histogram per Sample
     """
     gc_hist_dict = dict()
+    plot_content = [
+        gc_hist_dict,
+    ]
+    default_label = {
+        "name": "All",
+        "xlab": "Percentage of total reads",
+        "ylab": "Percentage of reads that are GC",
+    }
+    data_labels = [
+        default_label,
+    ]
     for s_name in sample_data.keys():
         R1_gc_counts = sample_data[s_name]["Reads"][0]["PerReadGCCountHistogram"]
         R2_gc_counts = [0] * len(R1_gc_counts)
@@ -249,11 +287,18 @@ def plot_per_read_gc_hist(sample_data, group_lookup_dict, project_lookup_dict, s
             gc_hist_dict[s_name].update({gc / RLen * 100: R1R2_gc_counts[gc] / totalReads * 100})
 
     # perReadQualityHistogram
-    plot_content = gc_hist_dict
+
+    for s_name in gc_hist_dict.keys():
+        plot_content.append({s_name: gc_hist_dict[s_name]})
+        data_labels.append({
+            "name": s_name,
+            "xlab": default_label["xlab"],
+            "ylab": default_label["ylab"],
+        })
+
 
     pconfig = {
-        "xlab": "% GC",
-        "ylab": "Percentage",
+        "data_labels": data_labels,
         "colors": sample_color,
         "id": "gc_hist",
         "title": "bases2fastq: Per Sample GC Content Histogram",
@@ -323,7 +368,10 @@ def plot_adapter_content(sample_data, group_lookup_dict, project_lookup_dict, sa
     pconfig.update({"colors": sample_color})
     plot_html = linegraph.plot(plot_content, pconfig=pconfig)
     anchor = "adapter_content"
-    description = "Adapter content per cycle"
+    description = """
+    Adapter content per cycle.
+    Read 1 and Read 2 are separated by a red dashed line.
+    """
     helptext = """
     The plot shows a cumulative percentage count of the proportion
     of your library which has seen each of the adapter sequences at each cycle.