galaxyproject
diff --git a/catalog/build/py/build_files_from_ncbi.py b/catalog/build/py/build_files_from_ncbi.py
Lines changed: 1 addition & 1 deletion
diff --git a/catalog/output/qc-report.md b/catalog/output/qc-report.data.md
rename from catalog/output/qc-report.md
rename to catalog/output/qc-report.data.md
diff --git a/catalog/output/qc-report.workflows.md b/catalog/output/qc-report.workflows.md
Lines changed: 21 additions & 0 deletions
diff --git a/catalog/output/workflows.json b/catalog/output/workflows.json
Lines changed: 6 additions & 6 deletions
diff --git a/catalog/py_package/catalog_build/build.py b/catalog/py_package/catalog_build/build.py
Lines changed: 78 additions & 71 deletions
@@ -11,7 +11,7 @@
     "catalog/build/intermediate/outbreak-taxonomy-mapping.tsv"
 )
 
-QC_REPORT_PATH = "catalog/output/qc-report.md"
+QC_REPORT_PATH = "catalog/output/qc-report.data.md"
 TREE_OUTPUT_PATH = "catalog/output/ncbi-taxa-tree.json"
 
 TAXONOMIC_GROUPS_BY_TAXONOMY_ID = {
 
@@ -0,0 +1,21 @@
+# Catalog Workflows QC report
+
+## Workflows not using newest IWC version
+
+None
+
+## Workflows with unknown category and one valid category (kept)
+
+- #workflow/github.com/iwc-workflows/fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-cellplex (categories: OTHER, TRANSCRIPTOMICS)
+- #workflow/github.com/iwc-workflows/fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-v3 (categories: OTHER, TRANSCRIPTOMICS)
+- #workflow/github.com/iwc-workflows/sars-cov-2-pe-illumina-artic-variant-calling/COVID-19-PE-ARTIC-ILLUMINA (categories: OTHER, VARIANT_CALLING)
+- #workflow/github.com/iwc-workflows/sars-cov-2-pe-illumina-wgs-variant-calling/COVID-19-PE-WGS-ILLUMINA (categories: OTHER, VARIANT_CALLING)
+- #workflow/github.com/iwc-workflows/sars-cov-2-se-illumina-wgs-variant-calling/COVID-19-SE-WGS-ILLUMINA (categories: OTHER, VARIANT_CALLING)
+
+## Workflows with only unknown category (excluded)
+
+None
+
+## Workflows with multiple valid categories
+
+- #workflow/github.com/iwc-workflows/generic-non-segmented-viral-variant-calling/main (categories: CONSENSUS_SEQUENCES, VARIANT_CALLING)
@@ -24,7 +24,7 @@
         "ploidy": "ANY",
         "taxonomyId": "11158",
         "trsId": "#workflow/github.com/iwc-workflows/generic-non-segmented-viral-variant-calling/main/versions/v0.1",
-        "workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses).",
+        "workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses). It can handle both ampliconic and non-ampliconic data.",
         "workflowName": "Variant calling and consensus construction from paired end short read data of non-segmented viral genomes"
       },
       {
@@ -137,7 +137,7 @@
         ],
         "ploidy": "ANY",
         "taxonomyId": null,
-        "trsId": "#workflow/github.com/iwc-workflows/fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-cellplex/versions/v0.6.2",
+        "trsId": "#workflow/github.com/iwc-workflows/fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-cellplex/versions/v0.6.3",
         "workflowDescription": "Comprehensive preprocessing for 10X Genomics CellPlex multiplexed single-cell RNA-seq data. Processes Cell Multiplexing Oligo (CMO) FASTQ files with CITE-seq-Count including required CellPlex-specific translation steps. Simultaneously processes gene expression FASTQ files with STARsolo and DropletUtils for alignment and cell filtering, and formats outputs for seamless import into Seurat/Scanpy (Read10X function).",
         "workflowName": "Single-Cell RNA-seq Preprocessing: 10X Genomics CellPlex Multiplexed Samples"
       },
@@ -159,7 +159,7 @@
         ],
         "ploidy": "ANY",
         "taxonomyId": null,
-        "trsId": "#workflow/github.com/iwc-workflows/fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-v3/versions/v0.6.2",
+        "trsId": "#workflow/github.com/iwc-workflows/fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-v3/versions/v0.6.3",
         "workflowDescription": "Complete preprocessing pipeline for 10X Genomics v3 single-cell RNA-seq data. Aligns raw FASTQ files using STARsolo, performs cell calling and quality filtering with DropletUtils, and formats outputs for seamless import into Seurat/Scanpy (Read10X function).",
         "workflowName": "Single-Cell RNA-seq Preprocessing: 10X Genomics v3 to Seurat and Scanpy Compatible Format"
       },
@@ -323,7 +323,7 @@
         "ploidy": "ANY",
         "taxonomyId": "11158",
         "trsId": "#workflow/github.com/iwc-workflows/generic-non-segmented-viral-variant-calling/main/versions/v0.1",
-        "workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses).",
+        "workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses). It can handle both ampliconic and non-ampliconic data.",
         "workflowName": "Variant calling and consensus construction from paired end short read data of non-segmented viral genomes"
       },
       {
@@ -358,9 +358,9 @@
         ],
         "ploidy": "HAPLOID",
         "taxonomyId": "2",
-        "trsId": "#workflow/github.com/iwc-workflows/amr_gene_detection/main/versions/v1.1.5",
+        "trsId": "#workflow/github.com/iwc-workflows/amr_gene_detection/main/versions/v1.1.6",
         "workflowDescription": "Antimicrobial resistance gene detection from assembled bacterial genomes",
-        "workflowName": "amr_gene_detection"
+        "workflowName": "AMR gene detection"
       },
       {
         "iwcId": "lncrnas-annotation-main",
 
@@ -10,6 +10,8 @@
 import yaml
 from bs4 import BeautifulSoup
 
+from .qc_utils import format_list_section, format_raw_section, join_report
+
 MAX_NCBI_URL_LENGTH = 2000  # The actual limit seems to be a bit over 4000
 
 log = logging.getLogger(__name__)
@@ -1141,52 +1143,34 @@ def make_qc_report(
     suppressed_genomes=None,
     paired_accessions=None,
 ):
-    ncbi_assemblies_text = (
-        "None"
-        if len(missing_ncbi_assemblies) == 0
-        else "\n".join([f"- {accession}" for accession in missing_ncbi_assemblies])
+    # Convert simple lists to items for format_list_section
+    ncbi_assemblies_items = (
+        list(missing_ncbi_assemblies) if missing_ncbi_assemblies else []
     )
-    ucsc_assemblies_text = (
-        "None"
-        if len(missing_ucsc_assemblies) == 0
-        else "\n".join([f"- {accession}" for accession in missing_ucsc_assemblies])
+    ucsc_assemblies_items = (
+        list(missing_ucsc_assemblies) if missing_ucsc_assemblies else []
     )
-    gene_model_urls_text = (
-        "N/A"
-        if missing_gene_model_urls is None
-        else (
-            "None"
-            if len(missing_gene_model_urls) == 0
-            else "\n".join([f"- {accession}" for accession in missing_gene_model_urls])
-        )
+    gene_model_urls_items = (
+        list(missing_gene_model_urls) if missing_gene_model_urls else None
     )
-    taxonomy_ids_text = (
-        "None"
-        if len(inconsistent_taxonomy_ids) == 0
-        else "\n".join(
-            [f"- {taxon}: {ids}" for taxon, ids in inconsistent_taxonomy_ids]
-        )
+    taxonomy_ids_items = (
+        [f"{taxon}: {ids}" for taxon, ids in inconsistent_taxonomy_ids]
+        if inconsistent_taxonomy_ids
+        else []
     )
-    ploidy_assemblies_text = (
-        "None"
-        if missing_ploidy_assemblies is None
-        else (
-            "None"
-            if len(missing_ploidy_assemblies) == 0
-            else "\n".join(
-                [
-                    f"- {accession} (speciesTaxonomyId: {tax_id})"
-                    for accession, tax_id in missing_ploidy_assemblies
-                ]
-            )
-        )
+    ploidy_assemblies_items = (
+        [
+            f"{accession} (speciesTaxonomyId: {tax_id})"
+            for accession, tax_id in missing_ploidy_assemblies
+        ]
+        if missing_ploidy_assemblies
+        else None
+    )
+    outbreak_descendants_items = (
+        [str(tax_id) for tax_id in missing_outbreak_descendants]
+        if missing_outbreak_descendants
+        else []
     )
-    if missing_outbreak_descendants is None or len(missing_outbreak_descendants) == 0:
-        outbreak_descendants_text = "None"
-    else:
-        outbreak_descendants_text = "\n".join(
-            [f"- {tax_id}" for tax_id in missing_outbreak_descendants]
-        )
     if tree_checks is None:
         tree_checks_text = "No checks done"
     else:
@@ -1216,42 +1200,65 @@ def make_qc_report(
             + ("None" if not tree_missing_ranks else ", ".join(tree_missing_ranks))
         )
 
-    outdated_accessions_text = (
-        "None"
-        if outdated_accessions is None or len(outdated_accessions) == 0
-        else "\n".join(
-            [f"- {acc} (current: {curr_acc})" for acc, curr_acc in outdated_accessions]
-        )
+    outdated_accessions_items = (
+        [f"{acc} (current: {curr_acc})" for acc, curr_acc in outdated_accessions]
+        if outdated_accessions
+        else []
     )
-
-    suppressed_genomes_text = (
-        "None"
-        if suppressed_genomes is None or len(suppressed_genomes) == 0
-        else "\n".join(
-            [f"- {acc} (status: {status})" for acc, status in suppressed_genomes]
+    suppressed_genomes_items = (
+        [f"{acc} (status: {status})" for acc, status in suppressed_genomes]
+        if suppressed_genomes
+        else []
+    )
+    paired_accessions_items = (
+        [f"{gca} (paired RefSeq: {gcf})" for gca, gcf in paired_accessions]
+        if paired_accessions
+        else []
+    )
+    # Compose report modularly using shared QC utils
+    lines = ["# Catalog Data QC report", ""]
+    lines += format_list_section(
+        "## Assemblies not found on NCBI", ncbi_assemblies_items
+    )
+    lines += format_list_section(
+        "## Assemblies not found in UCSC list", ucsc_assemblies_items
+    )
+    # Gene model URLs can be None (N/A case)
+    if gene_model_urls_items is None:
+        lines += format_raw_section(
+            "## Assemblies with gene model URLs not found", "N/A"
         )
+    else:
+        lines += format_list_section(
+            "## Assemblies with gene model URLs not found", gene_model_urls_items
+        )
+    lines += format_list_section(
+        "## Species and strain combinations with multiple taxonomy IDs",
+        taxonomy_ids_items,
     )
-
-    paired_accessions_text = (
-        "None"
-        if paired_accessions is None or len(paired_accessions) == 0
-        else "\n".join(
-            [f"- {gca} (paired RefSeq: {gcf})" for gca, gcf in paired_accessions]
+    # Ploidy assemblies can be None (N/A case)
+    if ploidy_assemblies_items is None:
+        lines += format_raw_section("## Assemblies without ploidy information", "N/A")
+    else:
+        lines += format_list_section(
+            "## Assemblies without ploidy information", ploidy_assemblies_items
         )
+    lines += format_list_section(
+        "## Outbreak descendant taxonomy IDs not found in genomes data",
+        outbreak_descendants_items,
     )
-    return (
-        f"# Catalog QC report\n\n"
-        f"## Assemblies not found on NCBI\n\n{ncbi_assemblies_text}\n\n"
-        f"## Assemblies not found in UCSC list\n\n{ucsc_assemblies_text}\n\n"
-        f"## Assemblies with gene model URLs not found\n\n{gene_model_urls_text}\n\n"
-        f"## Species and strain combinations with multiple taxonomy IDs\n\n{taxonomy_ids_text}\n\n"
-        f"## Assemblies without ploidy information\n\n{ploidy_assemblies_text}\n\n"
-        f"## Outbreak descendant taxonomy IDs not found in genomes data\n\n{outbreak_descendants_text}\n\n"
-        f"## Outdated assembly accessions\n\n{outdated_accessions_text}\n\n"
-        f"## Suppressed or retired genomes\n\n{suppressed_genomes_text}\n\n"
-        f"## GenBank assemblies with paired RefSeq accessions\n\n{paired_accessions_text}\n\n"
-        f"## Taxonomy tree\n\n{tree_checks_text}\n"
+    lines += format_list_section(
+        "## Outdated assembly accessions", outdated_accessions_items
+    )
+    lines += format_list_section(
+        "## Suppressed or retired genomes", suppressed_genomes_items
+    )
+    lines += format_list_section(
+        "## GenBank assemblies with paired RefSeq accessions",
+        paired_accessions_items,
     )
+    lines += format_raw_section("## Taxonomy tree", tree_checks_text)
+    return join_report(lines)
 
 
 def get_outbreak_taxonomy_ids(outbreaks_path, get_primary=True, get_descendants=False):
Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@`
`11`	`11`	`"catalog/build/intermediate/outbreak-taxonomy-mapping.tsv"`
`12`	`12`	`)`
`13`	`13`
`14`		`-QC_REPORT_PATH = "catalog/output/qc-report.md"`
	`14`	`+QC_REPORT_PATH = "catalog/output/qc-report.data.md"`
`15`	`15`	`TREE_OUTPUT_PATH = "catalog/output/ncbi-taxa-tree.json"`
`16`	`16`
`17`	`17`	`TAXONOMIC_GROUPS_BY_TAXONOMY_ID = {`