Skip to content

Commit 3be155c

Browse files
authored
feat: add workflows qc report (#888)
## Description

Basic framework for generating QC reports for the workflow config. Right now the report includes notes about things like whether we had to use a different version than the one IWC claims is the latest because the latest wasn't found in Dockstore, or whether a workflow we tried to mark active doesn't have a category. In the future it can include notes about data requirements too, which I think will be more important. I'm happy to have the framework in place, though, and it did help identify a case with two valid categories. Maybe that's something we should talk about at some point: are we OK with a workflow showing twice on the same page under two different categories?

## Related Issue

Closes #400
2 parents 88452db + 415fa61 commit 3be155c

File tree

8 files changed

+344
-100
lines changed

8 files changed

+344
-100
lines changed

catalog/build/py/build_files_from_ncbi.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
"catalog/build/intermediate/outbreak-taxonomy-mapping.tsv"
1212
)
1313

14-
QC_REPORT_PATH = "catalog/output/qc-report.md"
14+
QC_REPORT_PATH = "catalog/output/qc-report.data.md"
1515
TREE_OUTPUT_PATH = "catalog/output/ncbi-taxa-tree.json"
1616

1717
TAXONOMIC_GROUPS_BY_TAXONOMY_ID = {
File renamed without changes.
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Catalog Workflows QC report
2+
3+
## Workflows not using newest IWC version
4+
5+
None
6+
7+
## Workflows with unknown category and one valid category (kept)
8+
9+
- #workflow/github.com/iwc-workflows/fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-cellplex (categories: OTHER, TRANSCRIPTOMICS)
10+
- #workflow/github.com/iwc-workflows/fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-v3 (categories: OTHER, TRANSCRIPTOMICS)
11+
- #workflow/github.com/iwc-workflows/sars-cov-2-pe-illumina-artic-variant-calling/COVID-19-PE-ARTIC-ILLUMINA (categories: OTHER, VARIANT_CALLING)
12+
- #workflow/github.com/iwc-workflows/sars-cov-2-pe-illumina-wgs-variant-calling/COVID-19-PE-WGS-ILLUMINA (categories: OTHER, VARIANT_CALLING)
13+
- #workflow/github.com/iwc-workflows/sars-cov-2-se-illumina-wgs-variant-calling/COVID-19-SE-WGS-ILLUMINA (categories: OTHER, VARIANT_CALLING)
14+
15+
## Workflows with only unknown category (excluded)
16+
17+
None
18+
19+
## Workflows with multiple valid categories
20+
21+
- #workflow/github.com/iwc-workflows/generic-non-segmented-viral-variant-calling/main (categories: CONSENSUS_SEQUENCES, VARIANT_CALLING)

catalog/output/workflows.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
"ploidy": "ANY",
2525
"taxonomyId": "11158",
2626
"trsId": "#workflow/github.com/iwc-workflows/generic-non-segmented-viral-variant-calling/main/versions/v0.1",
27-
"workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses).",
27+
"workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses). It can handle both ampliconic and non-ampliconic data.",
2828
"workflowName": "Variant calling and consensus construction from paired end short read data of non-segmented viral genomes"
2929
},
3030
{
@@ -137,7 +137,7 @@
137137
],
138138
"ploidy": "ANY",
139139
"taxonomyId": null,
140-
"trsId": "#workflow/github.com/iwc-workflows/fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-cellplex/versions/v0.6.2",
140+
"trsId": "#workflow/github.com/iwc-workflows/fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-cellplex/versions/v0.6.3",
141141
"workflowDescription": "Comprehensive preprocessing for 10X Genomics CellPlex multiplexed single-cell RNA-seq data. Processes Cell Multiplexing Oligo (CMO) FASTQ files with CITE-seq-Count including required CellPlex-specific translation steps. Simultaneously processes gene expression FASTQ files with STARsolo and DropletUtils for alignment and cell filtering, and formats outputs for seamless import into Seurat/Scanpy (Read10X function).",
142142
"workflowName": "Single-Cell RNA-seq Preprocessing: 10X Genomics CellPlex Multiplexed Samples"
143143
},
@@ -159,7 +159,7 @@
159159
],
160160
"ploidy": "ANY",
161161
"taxonomyId": null,
162-
"trsId": "#workflow/github.com/iwc-workflows/fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-v3/versions/v0.6.2",
162+
"trsId": "#workflow/github.com/iwc-workflows/fastq-to-matrix-10x/scrna-seq-fastq-to-matrix-10x-v3/versions/v0.6.3",
163163
"workflowDescription": "Complete preprocessing pipeline for 10X Genomics v3 single-cell RNA-seq data. Aligns raw FASTQ files using STARsolo, performs cell calling and quality filtering with DropletUtils, and formats outputs for seamless import into Seurat/Scanpy (Read10X function).",
164164
"workflowName": "Single-Cell RNA-seq Preprocessing: 10X Genomics v3 to Seurat and Scanpy Compatible Format"
165165
},
@@ -323,7 +323,7 @@
323323
"ploidy": "ANY",
324324
"taxonomyId": "11158",
325325
"trsId": "#workflow/github.com/iwc-workflows/generic-non-segmented-viral-variant-calling/main/versions/v0.1",
326-
"workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses).",
326+
"workflowDescription": "Variant calling and consensus sequence generation for batches of Illumina PE sequenced viruses with uncomplicated and stable genome structure (like e.g. Morbilliviruses). It can handle both ampliconic and non-ampliconic data.",
327327
"workflowName": "Variant calling and consensus construction from paired end short read data of non-segmented viral genomes"
328328
},
329329
{
@@ -358,9 +358,9 @@
358358
],
359359
"ploidy": "HAPLOID",
360360
"taxonomyId": "2",
361-
"trsId": "#workflow/github.com/iwc-workflows/amr_gene_detection/main/versions/v1.1.5",
361+
"trsId": "#workflow/github.com/iwc-workflows/amr_gene_detection/main/versions/v1.1.6",
362362
"workflowDescription": "Antimicrobial resistance gene detection from assembled bacterial genomes",
363-
"workflowName": "amr_gene_detection"
363+
"workflowName": "AMR gene detection"
364364
},
365365
{
366366
"iwcId": "lncrnas-annotation-main",

catalog/py_package/catalog_build/build.py

Lines changed: 78 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import yaml
1111
from bs4 import BeautifulSoup
1212

13+
from .qc_utils import format_list_section, format_raw_section, join_report
14+
1315
MAX_NCBI_URL_LENGTH = 2000 # The actual limit seems to be a bit over 4000
1416

1517
log = logging.getLogger(__name__)
@@ -1141,52 +1143,34 @@ def make_qc_report(
11411143
suppressed_genomes=None,
11421144
paired_accessions=None,
11431145
):
1144-
ncbi_assemblies_text = (
1145-
"None"
1146-
if len(missing_ncbi_assemblies) == 0
1147-
else "\n".join([f"- {accession}" for accession in missing_ncbi_assemblies])
1146+
# Convert simple lists to items for format_list_section
1147+
ncbi_assemblies_items = (
1148+
list(missing_ncbi_assemblies) if missing_ncbi_assemblies else []
11481149
)
1149-
ucsc_assemblies_text = (
1150-
"None"
1151-
if len(missing_ucsc_assemblies) == 0
1152-
else "\n".join([f"- {accession}" for accession in missing_ucsc_assemblies])
1150+
ucsc_assemblies_items = (
1151+
list(missing_ucsc_assemblies) if missing_ucsc_assemblies else []
11531152
)
1154-
gene_model_urls_text = (
1155-
"N/A"
1156-
if missing_gene_model_urls is None
1157-
else (
1158-
"None"
1159-
if len(missing_gene_model_urls) == 0
1160-
else "\n".join([f"- {accession}" for accession in missing_gene_model_urls])
1161-
)
1153+
gene_model_urls_items = (
1154+
list(missing_gene_model_urls) if missing_gene_model_urls else None
11621155
)
1163-
taxonomy_ids_text = (
1164-
"None"
1165-
if len(inconsistent_taxonomy_ids) == 0
1166-
else "\n".join(
1167-
[f"- {taxon}: {ids}" for taxon, ids in inconsistent_taxonomy_ids]
1168-
)
1156+
taxonomy_ids_items = (
1157+
[f"{taxon}: {ids}" for taxon, ids in inconsistent_taxonomy_ids]
1158+
if inconsistent_taxonomy_ids
1159+
else []
11691160
)
1170-
ploidy_assemblies_text = (
1171-
"None"
1172-
if missing_ploidy_assemblies is None
1173-
else (
1174-
"None"
1175-
if len(missing_ploidy_assemblies) == 0
1176-
else "\n".join(
1177-
[
1178-
f"- {accession} (speciesTaxonomyId: {tax_id})"
1179-
for accession, tax_id in missing_ploidy_assemblies
1180-
]
1181-
)
1182-
)
1161+
ploidy_assemblies_items = (
1162+
[
1163+
f"{accession} (speciesTaxonomyId: {tax_id})"
1164+
for accession, tax_id in missing_ploidy_assemblies
1165+
]
1166+
if missing_ploidy_assemblies
1167+
else None
1168+
)
1169+
outbreak_descendants_items = (
1170+
[str(tax_id) for tax_id in missing_outbreak_descendants]
1171+
if missing_outbreak_descendants
1172+
else []
11831173
)
1184-
if missing_outbreak_descendants is None or len(missing_outbreak_descendants) == 0:
1185-
outbreak_descendants_text = "None"
1186-
else:
1187-
outbreak_descendants_text = "\n".join(
1188-
[f"- {tax_id}" for tax_id in missing_outbreak_descendants]
1189-
)
11901174
if tree_checks is None:
11911175
tree_checks_text = "No checks done"
11921176
else:
@@ -1216,42 +1200,65 @@ def make_qc_report(
12161200
+ ("None" if not tree_missing_ranks else ", ".join(tree_missing_ranks))
12171201
)
12181202

1219-
outdated_accessions_text = (
1220-
"None"
1221-
if outdated_accessions is None or len(outdated_accessions) == 0
1222-
else "\n".join(
1223-
[f"- {acc} (current: {curr_acc})" for acc, curr_acc in outdated_accessions]
1224-
)
1203+
outdated_accessions_items = (
1204+
[f"{acc} (current: {curr_acc})" for acc, curr_acc in outdated_accessions]
1205+
if outdated_accessions
1206+
else []
12251207
)
1226-
1227-
suppressed_genomes_text = (
1228-
"None"
1229-
if suppressed_genomes is None or len(suppressed_genomes) == 0
1230-
else "\n".join(
1231-
[f"- {acc} (status: {status})" for acc, status in suppressed_genomes]
1208+
suppressed_genomes_items = (
1209+
[f"{acc} (status: {status})" for acc, status in suppressed_genomes]
1210+
if suppressed_genomes
1211+
else []
1212+
)
1213+
paired_accessions_items = (
1214+
[f"{gca} (paired RefSeq: {gcf})" for gca, gcf in paired_accessions]
1215+
if paired_accessions
1216+
else []
1217+
)
1218+
# Compose report modularly using shared QC utils
1219+
lines = ["# Catalog Data QC report", ""]
1220+
lines += format_list_section(
1221+
"## Assemblies not found on NCBI", ncbi_assemblies_items
1222+
)
1223+
lines += format_list_section(
1224+
"## Assemblies not found in UCSC list", ucsc_assemblies_items
1225+
)
1226+
# Gene model URLs can be None (N/A case)
1227+
if gene_model_urls_items is None:
1228+
lines += format_raw_section(
1229+
"## Assemblies with gene model URLs not found", "N/A"
12321230
)
1231+
else:
1232+
lines += format_list_section(
1233+
"## Assemblies with gene model URLs not found", gene_model_urls_items
1234+
)
1235+
lines += format_list_section(
1236+
"## Species and strain combinations with multiple taxonomy IDs",
1237+
taxonomy_ids_items,
12331238
)
1234-
1235-
paired_accessions_text = (
1236-
"None"
1237-
if paired_accessions is None or len(paired_accessions) == 0
1238-
else "\n".join(
1239-
[f"- {gca} (paired RefSeq: {gcf})" for gca, gcf in paired_accessions]
1239+
# Ploidy assemblies can be None (N/A case)
1240+
if ploidy_assemblies_items is None:
1241+
lines += format_raw_section("## Assemblies without ploidy information", "N/A")
1242+
else:
1243+
lines += format_list_section(
1244+
"## Assemblies without ploidy information", ploidy_assemblies_items
12401245
)
1246+
lines += format_list_section(
1247+
"## Outbreak descendant taxonomy IDs not found in genomes data",
1248+
outbreak_descendants_items,
12411249
)
1242-
return (
1243-
f"# Catalog QC report\n\n"
1244-
f"## Assemblies not found on NCBI\n\n{ncbi_assemblies_text}\n\n"
1245-
f"## Assemblies not found in UCSC list\n\n{ucsc_assemblies_text}\n\n"
1246-
f"## Assemblies with gene model URLs not found\n\n{gene_model_urls_text}\n\n"
1247-
f"## Species and strain combinations with multiple taxonomy IDs\n\n{taxonomy_ids_text}\n\n"
1248-
f"## Assemblies without ploidy information\n\n{ploidy_assemblies_text}\n\n"
1249-
f"## Outbreak descendant taxonomy IDs not found in genomes data\n\n{outbreak_descendants_text}\n\n"
1250-
f"## Outdated assembly accessions\n\n{outdated_accessions_text}\n\n"
1251-
f"## Suppressed or retired genomes\n\n{suppressed_genomes_text}\n\n"
1252-
f"## GenBank assemblies with paired RefSeq accessions\n\n{paired_accessions_text}\n\n"
1253-
f"## Taxonomy tree\n\n{tree_checks_text}\n"
1250+
lines += format_list_section(
1251+
"## Outdated assembly accessions", outdated_accessions_items
1252+
)
1253+
lines += format_list_section(
1254+
"## Suppressed or retired genomes", suppressed_genomes_items
1255+
)
1256+
lines += format_list_section(
1257+
"## GenBank assemblies with paired RefSeq accessions",
1258+
paired_accessions_items,
12541259
)
1260+
lines += format_raw_section("## Taxonomy tree", tree_checks_text)
1261+
return join_report(lines)
12551262

12561263

12571264
def get_outbreak_taxonomy_ids(outbreaks_path, get_primary=True, get_descendants=False):

0 commit comments

Comments
 (0)