|
10 | 10 | import yaml |
11 | 11 | from bs4 import BeautifulSoup |
12 | 12 |
|
| 13 | +from .qc_utils import format_list_section, format_raw_section, join_report |
| 14 | + |
13 | 15 | MAX_NCBI_URL_LENGTH = 2000 # The actual limit seems to be a bit over 4000 |
14 | 16 |
|
15 | 17 | log = logging.getLogger(__name__) |
@@ -1141,52 +1143,34 @@ def make_qc_report( |
1141 | 1143 | suppressed_genomes=None, |
1142 | 1144 | paired_accessions=None, |
1143 | 1145 | ): |
1144 | | - ncbi_assemblies_text = ( |
1145 | | - "None" |
1146 | | - if len(missing_ncbi_assemblies) == 0 |
1147 | | - else "\n".join([f"- {accession}" for accession in missing_ncbi_assemblies]) |
| 1146 | + # Convert simple lists to items for format_list_section |
| 1147 | + ncbi_assemblies_items = ( |
| 1148 | + list(missing_ncbi_assemblies) if missing_ncbi_assemblies else [] |
1148 | 1149 | ) |
1149 | | - ucsc_assemblies_text = ( |
1150 | | - "None" |
1151 | | - if len(missing_ucsc_assemblies) == 0 |
1152 | | - else "\n".join([f"- {accession}" for accession in missing_ucsc_assemblies]) |
| 1150 | + ucsc_assemblies_items = ( |
| 1151 | + list(missing_ucsc_assemblies) if missing_ucsc_assemblies else [] |
1153 | 1152 | ) |
1154 | | - gene_model_urls_text = ( |
1155 | | - "N/A" |
1156 | | - if missing_gene_model_urls is None |
1157 | | - else ( |
1158 | | - "None" |
1159 | | - if len(missing_gene_model_urls) == 0 |
1160 | | - else "\n".join([f"- {accession}" for accession in missing_gene_model_urls]) |
1161 | | - ) |
| 1153 | + gene_model_urls_items = ( |
| 1154 | + list(missing_gene_model_urls) if missing_gene_model_urls else None |
1162 | 1155 | ) |
1163 | | - taxonomy_ids_text = ( |
1164 | | - "None" |
1165 | | - if len(inconsistent_taxonomy_ids) == 0 |
1166 | | - else "\n".join( |
1167 | | - [f"- {taxon}: {ids}" for taxon, ids in inconsistent_taxonomy_ids] |
1168 | | - ) |
| 1156 | + taxonomy_ids_items = ( |
| 1157 | + [f"{taxon}: {ids}" for taxon, ids in inconsistent_taxonomy_ids] |
| 1158 | + if inconsistent_taxonomy_ids |
| 1159 | + else [] |
1169 | 1160 | ) |
1170 | | - ploidy_assemblies_text = ( |
1171 | | - "None" |
1172 | | - if missing_ploidy_assemblies is None |
1173 | | - else ( |
1174 | | - "None" |
1175 | | - if len(missing_ploidy_assemblies) == 0 |
1176 | | - else "\n".join( |
1177 | | - [ |
1178 | | - f"- {accession} (speciesTaxonomyId: {tax_id})" |
1179 | | - for accession, tax_id in missing_ploidy_assemblies |
1180 | | - ] |
1181 | | - ) |
1182 | | - ) |
| 1161 | + ploidy_assemblies_items = ( |
| 1162 | + [ |
| 1163 | + f"{accession} (speciesTaxonomyId: {tax_id})" |
| 1164 | + for accession, tax_id in missing_ploidy_assemblies |
| 1165 | + ] |
| 1166 | + if missing_ploidy_assemblies |
| 1167 | + else None |
| 1168 | + ) |
| 1169 | + outbreak_descendants_items = ( |
| 1170 | + [str(tax_id) for tax_id in missing_outbreak_descendants] |
| 1171 | + if missing_outbreak_descendants |
| 1172 | + else [] |
1183 | 1173 | ) |
1184 | | - if missing_outbreak_descendants is None or len(missing_outbreak_descendants) == 0: |
1185 | | - outbreak_descendants_text = "None" |
1186 | | - else: |
1187 | | - outbreak_descendants_text = "\n".join( |
1188 | | - [f"- {tax_id}" for tax_id in missing_outbreak_descendants] |
1189 | | - ) |
1190 | 1174 | if tree_checks is None: |
1191 | 1175 | tree_checks_text = "No checks done" |
1192 | 1176 | else: |
@@ -1216,42 +1200,65 @@ def make_qc_report( |
1216 | 1200 | + ("None" if not tree_missing_ranks else ", ".join(tree_missing_ranks)) |
1217 | 1201 | ) |
1218 | 1202 |
|
1219 | | - outdated_accessions_text = ( |
1220 | | - "None" |
1221 | | - if outdated_accessions is None or len(outdated_accessions) == 0 |
1222 | | - else "\n".join( |
1223 | | - [f"- {acc} (current: {curr_acc})" for acc, curr_acc in outdated_accessions] |
1224 | | - ) |
| 1203 | + outdated_accessions_items = ( |
| 1204 | + [f"{acc} (current: {curr_acc})" for acc, curr_acc in outdated_accessions] |
| 1205 | + if outdated_accessions |
| 1206 | + else [] |
1225 | 1207 | ) |
1226 | | - |
1227 | | - suppressed_genomes_text = ( |
1228 | | - "None" |
1229 | | - if suppressed_genomes is None or len(suppressed_genomes) == 0 |
1230 | | - else "\n".join( |
1231 | | - [f"- {acc} (status: {status})" for acc, status in suppressed_genomes] |
| 1208 | + suppressed_genomes_items = ( |
| 1209 | + [f"{acc} (status: {status})" for acc, status in suppressed_genomes] |
| 1210 | + if suppressed_genomes |
| 1211 | + else [] |
| 1212 | + ) |
| 1213 | + paired_accessions_items = ( |
| 1214 | + [f"{gca} (paired RefSeq: {gcf})" for gca, gcf in paired_accessions] |
| 1215 | + if paired_accessions |
| 1216 | + else [] |
| 1217 | + ) |
| 1218 | + # Compose report modularly using shared QC utils |
| 1219 | + lines = ["# Catalog Data QC report", ""] |
| 1220 | + lines += format_list_section( |
| 1221 | + "## Assemblies not found on NCBI", ncbi_assemblies_items |
| 1222 | + ) |
| 1223 | + lines += format_list_section( |
| 1224 | + "## Assemblies not found in UCSC list", ucsc_assemblies_items |
| 1225 | + ) |
| 1226 | + # gene_model_urls_items is None when missing_gene_model_urls is None or empty; report N/A |
| 1227 | + if gene_model_urls_items is None: |
| 1228 | + lines += format_raw_section( |
| 1229 | + "## Assemblies with gene model URLs not found", "N/A" |
1232 | 1230 | ) |
| 1231 | + else: |
| 1232 | + lines += format_list_section( |
| 1233 | + "## Assemblies with gene model URLs not found", gene_model_urls_items |
| 1234 | + ) |
| 1235 | + lines += format_list_section( |
| 1236 | + "## Species and strain combinations with multiple taxonomy IDs", |
| 1237 | + taxonomy_ids_items, |
1233 | 1238 | ) |
1234 | | - |
1235 | | - paired_accessions_text = ( |
1236 | | - "None" |
1237 | | - if paired_accessions is None or len(paired_accessions) == 0 |
1238 | | - else "\n".join( |
1239 | | - [f"- {gca} (paired RefSeq: {gcf})" for gca, gcf in paired_accessions] |
| 1239 | + # ploidy_assemblies_items is None when missing_ploidy_assemblies is None or empty; report N/A |
| 1240 | + if ploidy_assemblies_items is None: |
| 1241 | + lines += format_raw_section("## Assemblies without ploidy information", "N/A") |
| 1242 | + else: |
| 1243 | + lines += format_list_section( |
| 1244 | + "## Assemblies without ploidy information", ploidy_assemblies_items |
1240 | 1245 | ) |
| 1246 | + lines += format_list_section( |
| 1247 | + "## Outbreak descendant taxonomy IDs not found in genomes data", |
| 1248 | + outbreak_descendants_items, |
1241 | 1249 | ) |
1242 | | - return ( |
1243 | | - f"# Catalog QC report\n\n" |
1244 | | - f"## Assemblies not found on NCBI\n\n{ncbi_assemblies_text}\n\n" |
1245 | | - f"## Assemblies not found in UCSC list\n\n{ucsc_assemblies_text}\n\n" |
1246 | | - f"## Assemblies with gene model URLs not found\n\n{gene_model_urls_text}\n\n" |
1247 | | - f"## Species and strain combinations with multiple taxonomy IDs\n\n{taxonomy_ids_text}\n\n" |
1248 | | - f"## Assemblies without ploidy information\n\n{ploidy_assemblies_text}\n\n" |
1249 | | - f"## Outbreak descendant taxonomy IDs not found in genomes data\n\n{outbreak_descendants_text}\n\n" |
1250 | | - f"## Outdated assembly accessions\n\n{outdated_accessions_text}\n\n" |
1251 | | - f"## Suppressed or retired genomes\n\n{suppressed_genomes_text}\n\n" |
1252 | | - f"## GenBank assemblies with paired RefSeq accessions\n\n{paired_accessions_text}\n\n" |
1253 | | - f"## Taxonomy tree\n\n{tree_checks_text}\n" |
| 1250 | + lines += format_list_section( |
| 1251 | + "## Outdated assembly accessions", outdated_accessions_items |
| 1252 | + ) |
| 1253 | + lines += format_list_section( |
| 1254 | + "## Suppressed or retired genomes", suppressed_genomes_items |
| 1255 | + ) |
| 1256 | + lines += format_list_section( |
| 1257 | + "## GenBank assemblies with paired RefSeq accessions", |
| 1258 | + paired_accessions_items, |
1254 | 1259 | ) |
| 1260 | + lines += format_raw_section("## Taxonomy tree", tree_checks_text) |
| 1261 | + return join_report(lines) |
1255 | 1262 |
|
1256 | 1263 |
|
1257 | 1264 | def get_outbreak_taxonomy_ids(outbreaks_path, get_primary=True, get_descendants=False): |
|
0 commit comments