Skip to content

Commit 08e76ef

Browse files
authored
feat!: update gks-common / vrs models (#453)
close #452
1 parent 213b3aa commit 08e76ef

File tree

6 files changed

+479
-156
lines changed

6 files changed

+479
-156
lines changed

src/ga4gh/core/domain_models.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,4 @@
1-
"""GKS Common Library Domain Entity models
2-
3-
**This module should not be imported directly.**
4-
5-
Instead, users should use one of the following:
6-
7-
* `from ga4gh.core import domain_models`, and refer to models with the
8-
abbreviated name, e.g., `domain_models.Gene` (recommended)
9-
10-
* `import ga4gh.core`, and refer to models using the fully-qualified
11-
module name, e.g., `ga4gh.core.domain_models.Gene`
12-
"""
1+
"""GKS Common Library Domain Entity models"""
132
from enum import Enum
143
from typing import Literal, Union, List
154

@@ -138,5 +127,5 @@ class Gene(DomainEntity):
138127

139128
type: Literal["Gene"] = Field(
140129
CommonDomainType.GENE.value,
141-
description=f'MUST be "{CommonDomainType.GENE.value}".'
130+
description=f'MUST be "{CommonDomainType.GENE.value}"'
142131
)

src/ga4gh/core/entity_models.py

Lines changed: 265 additions & 51 deletions
Large diffs are not rendered by default.

src/ga4gh/vrs/models.py

Lines changed: 63 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
from ga4gh.core.pydantic import (
3333
getattr_in
3434
)
35-
from ga4gh.core.entity_models import IRI, Expression, Entity
35+
from ga4gh.core.entity_models import IRI, Entity
3636

3737

3838
def flatten(vals):
@@ -151,6 +151,13 @@ class VrsType(str, Enum):
151151
CN_CHANGE = "CopyNumberChange"
152152

153153

154+
class Orientation(str, Enum):
    """The orientation of the molecular variation component."""

    # Members are plain strings as well as enum members (str mixin), so they
    # compare equal to their raw values.
    FORWARD = "forward"
    REVERSE_COMPLEMENT = "reverse_complement"
159+
160+
154161
class ResidueAlphabet(str, Enum):
155162
"""The interpretation of the character codes referred to by the refget accession,
156163
where "aa" specifies an amino acid character set, and "na" specifies a nucleic acid
@@ -174,6 +181,22 @@ class CopyChange(str, Enum):
174181
EFO_0030072 = 'EFO:0030072'
175182

176183

184+
class Syntax(str, Enum):
    """The syntax used to describe the variation. The value should be one of the
    supported syntaxes.
    """

    # HGVS expression types, keyed by their sequence-type prefix.
    HGVS_C = "hgvs.c"
    HGVS_P = "hgvs.p"
    HGVS_G = "hgvs.g"
    HGVS_M = "hgvs.m"
    HGVS_N = "hgvs.n"
    HGVS_R = "hgvs.r"

    # NOTE(review): the value is plain "iscn" while the member name carries an
    # HGVS_ prefix; this looks like a naming slip, but the name is kept because
    # callers may already reference it -- confirm before renaming.
    HGVS_ISCN = "iscn"

    # Non-HGVS syntaxes.
    GNOMAD = "gnomad"
    SPDI = "spdi"
198+
199+
177200
def _recurse_ga4gh_serialize(obj):
178201
if isinstance(obj, _Ga4ghIdentifiableObject):
179202
return obj.get_or_create_digest()
@@ -314,6 +337,18 @@ def get_or_create_digest(self, recompute=False) -> str:
314337
class ga4gh(_ValueObject.ga4gh):
315338
prefix: str
316339

340+
class Expression(BaseModel):
    """Representation of a variation by a specified nomenclature or syntax for a
    Variation object. Common examples of expressions for the description of molecular
    variation include the HGVS and ISCN nomenclatures.
    """

    # Store/serialize enum fields (here: `syntax`) as their raw string values.
    model_config = ConfigDict(use_enum_values=True)

    # Required: which nomenclature `value` is written in.
    syntax: Syntax = Field(
        ...,
        description="The syntax used to describe the variation. The value should be one of the supported syntaxes.",
    )
    # Required: the expression text itself.
    value: str = Field(
        ...,
        description="The expression of the variation in the specified syntax. The value should be a valid expression in the specified syntax.",
    )
    # Optional: version of the nomenclature used for `value`.
    syntax_version: Optional[str] = Field(
        None,
        description="The version of the syntax used to describe the variation. This is particularly important for HGVS expressions, as the syntax has evolved over time.",
    )
351+
317352

318353
#########################################
319354
# vrs numerics, comparators, and ranges
@@ -392,7 +427,7 @@ class LengthExpression(_ValueObject):
392427
type: Literal["LengthExpression"] = Field(
393428
VrsType.LEN_EXPR.value, description=f'MUST be "{VrsType.LEN_EXPR.value}"'
394429
)
395-
length: Optional[Union[Range, int]] = None
430+
length: Optional[Union[Range, int]] = Field(None, description="The length of the sequence.")
396431

397432
class ga4gh(_ValueObject.ga4gh):
398433
keys = [
@@ -408,13 +443,13 @@ class ReferenceLengthExpression(_ValueObject):
408443
VrsType.REF_LEN_EXPR.value, description=f'MUST be "{VrsType.REF_LEN_EXPR.value}"'
409444
)
410445
length: Union[Range, int] = Field(
411-
..., description='The number of residues of the expressed sequence.'
446+
..., description='The number of residues in the expressed sequence.'
412447
)
413448
sequence: Optional[SequenceString] = Field(
414-
None, description='the `Sequence` encoded by the Reference Length Expression.'
449+
None, description='the literal Sequence encoded by the Reference Length Expression.'
415450
)
416451
repeatSubunitLength: int = Field(
417-
..., description='The number of residues of the repeat subunit.'
452+
..., description='The number of residues in the repeat subunit.'
418453
)
419454

420455
class ga4gh(_ValueObject.ga4gh):
@@ -452,9 +487,9 @@ class SequenceReference(_ValueObject):
452487
type: Literal["SequenceReference"] = Field(VrsType.SEQ_REF.value, description=f'MUST be "{VrsType.SEQ_REF.value}"')
453488
refgetAccession: Annotated[str, StringConstraints(pattern=r'^SQ.[0-9A-Za-z_\-]{32}$')] = Field(
454489
...,
455-
description='A `GA4GH RefGet <http://samtools.github.io/hts-specs/refget.html>` identifier for the referenced sequence, using the sha512t24u digest.',
490+
description='A [GA4GH RefGet](http://samtools.github.io/hts-specs/refget.html) identifier for the referenced sequence, using the sha512t24u digest.',
456491
)
457-
residueAlphabet: Optional[ResidueAlphabet] = Field(None, description="The interpretation of the character codes referred to by the refget accession, where 'aa' specifies an amino acid character set, and 'na' specifies a nucleic acid character set.")
492+
residueAlphabet: Optional[ResidueAlphabet] = Field(None, description='The interpretation of the character codes referred to by the refget accession, where "aa" specifies an amino acid character set, and "na" specifies a nucleic acid character set.')
458493
circular: Optional[bool] = Field(None, description="A boolean indicating whether the molecule represented by the sequence is circular (true) or linear (false).")
459494

460495
class ga4gh(_ValueObject.ga4gh):
@@ -469,15 +504,15 @@ class SequenceLocation(_Ga4ghIdentifiableObject):
469504

470505
type: Literal["SequenceLocation"] = Field(VrsType.SEQ_LOC.value, description=f'MUST be "{VrsType.SEQ_LOC.value}"')
471506
sequenceReference: Optional[Union[IRI, SequenceReference]] = Field(
472-
None, description='A reference to a `Sequence` on which the location is defined.'
507+
None, description='A reference to a Sequence on which the location is defined.'
473508
)
474509
start: Optional[Union[Range, int]] = Field(
475510
None,
476-
description='The start coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0.',
511+
description='The start coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range less than or equal to the value of `end`.',
477512
)
478513
end: Optional[Union[Range, int]] = Field(
479514
None,
480-
description='The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0.',
515+
description='The end coordinate or range of the SequenceLocation. The minimum value of this coordinate or range is 0. MUST represent a coordinate or range greater than or equal to the value of `start`.',
481516

482517
)
483518
sequence: Optional[SequenceString] = Field(None, description="The literal sequence encoded by the `sequenceReference` at these coordinates.")
@@ -614,7 +649,7 @@ class CisPhasedBlock(_VariationBase):
614649
type: Literal["CisPhasedBlock"] = Field(VrsType.CIS_PHASED_BLOCK.value, description=f'MUST be "{VrsType.CIS_PHASED_BLOCK.value}"')
615650
members: List[Union[Allele, IRI]] = Field(
616651
...,
617-
description='A list of `Alleles` that are found in-cis on a shared molecule.',
652+
description='A list of Alleles that are found in-cis on a shared molecule.',
618653
min_length=2,
619654
)
620655
sequenceReference: Optional[SequenceReference] = Field(None, description="An optional Sequence Reference on which all of the in-cis Alleles are found. When defined, this may be used to implicitly define the `sequenceReference` attribute for each of the CisPhasedBlock member Alleles.")
@@ -643,7 +678,7 @@ class Adjacency(_VariationBase):
643678
potentially with an intervening linker sequence.
644679
"""
645680

646-
type: Literal["Adjacency"] = Field(VrsType.ADJACENCY.value, description=f'MUST be "{VrsType.ADJACENCY.value}"')
681+
type: Literal["Adjacency"] = Field(VrsType.ADJACENCY.value, description=f'MUST be "{VrsType.ADJACENCY.value}".')
647682
adjoinedSequences: List[Union[IRI, SequenceLocation]] = Field(
648683
...,
649684
description="The terminal sequence or pair of adjoined sequences that defines in the adjacency.",
@@ -654,7 +689,7 @@ class Adjacency(_VariationBase):
654689
None,
655690
description="The sequence found between adjoined sequences."
656691
)
657-
homology: Optional[bool] = Field(None, description="A flag indicating if coordinate ambiguity in the adjoined sequences is from sequence homology (true) or other uncertainty (false).")
692+
homology: Optional[bool] = Field(None, description="A flag indicating if coordinate ambiguity in the adjoined sequences is from sequence homology (true) or other uncertainty, such as instrument ambiguity (false).")
658693

659694
class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
660695
prefix = 'AJ'
@@ -671,7 +706,7 @@ class Terminus(_VariationBase):
671706
is not allowed and it removes the unnecessary array structure.
672707
"""
673708

674-
type: Literal["Terminus"] = Field(VrsType.TERMINUS.value, description=f'MUST be "{VrsType.TERMINUS.value}"')
709+
type: Literal["Terminus"] = Field(VrsType.TERMINUS.value, description=f'MUST be "{VrsType.TERMINUS.value}".')
675710
location: Union[IRI, SequenceLocation] = Field(..., description="The location of the terminus.")
676711

677712
class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
@@ -685,17 +720,19 @@ class TraversalBlock(_ValueObject):
685720
"""A component used to describe the orientation of a molecular variation within
686721
a DerivativeMolecule."""
687722

723+
model_config = ConfigDict(use_enum_values=True)
724+
688725
type: Literal["TraversalBlock"] = Field(
689-
VrsType.TRAVERSAL_BLOCK.value, description=f'MUST be "{VrsType.TRAVERSAL_BLOCK.value}"'
726+
VrsType.TRAVERSAL_BLOCK.value, description=f'MUST be "{VrsType.TRAVERSAL_BLOCK.value}".'
690727
)
691-
orientation: Literal["forward", "reverse_complement"] = Field(
692-
...,
693-
description='The orientation of the traversal block, either forward or reverse_complement.'
728+
orientation: Optional[Orientation] = Field(
729+
None,
730+
description='The orientation of the molecular variation component.'
694731
)
695732

696-
component: Union[IRI, Adjacency, Allele, Terminus, CisPhasedBlock] = Field(
697-
...,
698-
description="The component that make up the derivative molecule."
733+
component: Optional[Union[Allele, CisPhasedBlock, Adjacency, Terminus]] = Field(
734+
None,
735+
description="The unoriented molecular variation component."
699736
)
700737

701738
class ga4gh(_ValueObject.ga4gh):
@@ -710,13 +747,13 @@ class DerivativeMolecule(_VariationBase):
710747
molecule composed from multiple sequence components.
711748
"""
712749

713-
type: Literal["DerivativeMolecule"] = Field(VrsType.DERIVATIVE_MOL.value, description=f'MUST be "{VrsType.DERIVATIVE_MOL.value}"')
714-
components: List[TraversalBlock] = Field(
750+
type: Literal["DerivativeMolecule"] = Field(VrsType.DERIVATIVE_MOL.value, description=f'MUST be "{VrsType.DERIVATIVE_MOL.value}".')
751+
components: List[Union[IRI, TraversalBlock]] = Field(
715752
...,
716-
description="The traversal block components that make up the derivative molecule.",
753+
description="The molecular components that constitute the derivative molecule.",
717754
min_length=2
718755
)
719-
circular: Optional[bool] = Field(None, description="A flag indicating if the derivative molecule is circular (true) or linear (false).")
756+
circular: Optional[bool] = Field(None, description="A boolean indicating whether the molecule represented by the sequence is circular (true) or linear (false).")
720757

721758
class ga4gh(_Ga4ghIdentifiableObject.ga4gh):
722759
prefix = "DM"
@@ -769,7 +806,7 @@ class CopyNumberChange(_CopyNumber):
769806
type: Literal["CopyNumberChange"] = Field(VrsType.CN_CHANGE.value, description=f'MUST be "{VrsType.CN_CHANGE.value}"')
770807
copyChange: CopyChange = Field(
771808
...,
772-
description='MUST be one of "EFO:0030069" (complete genomic loss), "EFO:0020073" (high-level loss), "EFO:0030068" (low-level loss), "EFO:0030067" (loss), "EFO:0030064" (regional base ploidy), "EFO:0030070" (gain), "EFO:0030071" (low-level gain), "EFO:0030072" (high-level gain).',
809+
description='MUST be a Coding representing one of "EFO:0030069" (complete genomic loss), "EFO:0020073" (high-level loss), "EFO:0030068" (low-level loss), "EFO:0030067" (loss), "EFO:0030064" (regional base ploidy), "EFO:0030070" (gain), "EFO:0030071" (low-level gain), "EFO:0030072" (high-level gain).',
773810
)
774811

775812
class ga4gh(_Ga4ghIdentifiableObject.ga4gh):

submodules/vrs

Submodule vrs updated 96 files

tests/validation/test_schemas.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
"""Test that VRS-Python Pydantic models match VRS and GKS-Common schemas"""
2+
3+
from enum import Enum
4+
import json
5+
from pathlib import Path
6+
7+
import pytest
8+
from pydantic import BaseModel
9+
10+
from ga4gh.core import entity_models, domain_models
11+
from ga4gh.vrs import models as vrs_models
12+
13+
14+
class GKSSchema(str, Enum):
    """The GKS schema families whose JSON Schema files are checked by this module."""

    # The str mixin lets members be used directly where a plain string is
    # expected (e.g. as the argument to ``str.endswith``).
    VRS = "vrs"
    CORE_IM = "core-im"
    DOMAIN = "domain-entities"
20+
21+
22+
class GKSSchemaMapping(BaseModel):
    """Bucketed view of one GKS schema family.

    Holds the class titles found in the family's JSON Schema files, sorted into
    base classes, concrete classes and primitives, plus the parsed schema
    definitions keyed by class title.
    """

    # Titles of definitions that are neither concrete classes nor primitives.
    base_classes: set = set()

    # Titles of definitions that declare "properties".
    concrete_classes: set = set()

    # Titles of definitions whose "type" is array, integer or string.
    primitives: set = set()

    # Class title -> parsed JSON Schema definition.
    # NOTE(review): `schema` is also an attribute name on pydantic's BaseModel;
    # pydantic may warn about the shadowing -- confirm, and consider renaming
    # together with every user of this field.
    schema: dict = {}
29+
30+
31+
def _update_gks_schema_mapping(
    f_path: Path, gks_schema_mapping: GKSSchemaMapping
) -> None:
    """Load one JSON Schema file and register its class in ``gks_schema_mapping``

    The parsed definition is stored under its ``title`` in
    ``gks_schema_mapping.schema`` and the title is added to exactly one bucket:

    * ``concrete_classes`` when the definition declares ``properties``
    * ``primitives`` when its ``type`` is ``array``, ``integer`` or ``string``
    * ``base_classes`` otherwise

    :param f_path: Path to JSON Schema file
    :param gks_schema_mapping: GKS schema mapping to update (mutated in place)
    :raises KeyError: If the schema definition has no ``title``
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # locale encoding.
    with f_path.open(encoding="utf-8") as rf:
        cls_def = json.load(rf)

    spec_class = cls_def["title"]
    gks_schema_mapping.schema[spec_class] = cls_def

    if "properties" in cls_def:
        bucket = gks_schema_mapping.concrete_classes
    # Tuple membership (not a set) on purpose: JSON Schema allows "type" to be
    # a list, which is unhashable and would raise TypeError against a set.
    elif cls_def.get("type") in ("array", "integer", "string"):
        bucket = gks_schema_mapping.primitives
    else:
        bucket = gks_schema_mapping.base_classes

    bucket.add(spec_class)
52+
53+
# One (initially empty) mapping per schema family; populated at import time by
# the loops below so the parametrized tests can read from it.
GKS_SCHEMA_MAPPING = {gks: GKSSchemaMapping() for gks in GKSSchema}

# ``parents[2]`` is two directories above this file's own directory.
SUBMODULES_DIR = Path(__file__).parents[2] / "submodules" / "vrs"


# Get vrs classes
vrs_mapping = GKS_SCHEMA_MAPPING[GKSSchema.VRS]
for f in (SUBMODULES_DIR / "schema" / "vrs" / "json").glob("*"):
    _update_gks_schema_mapping(f, vrs_mapping)


# Get core-im + domain classes
for child in (SUBMODULES_DIR / "submodules" / "gks-common" / "schema").iterdir():
    # Anything that is not the domain-entities directory is treated as core-im.
    if str(child).endswith(GKSSchema.DOMAIN):
        mapping_key = GKSSchema.DOMAIN
    else:
        mapping_key = GKSSchema.CORE_IM

    mapping = GKS_SCHEMA_MAPPING[mapping_key]
    for f in (child / "json").glob("*"):
        _update_gks_schema_mapping(f, mapping)
71+
72+
73+
@pytest.mark.parametrize(
    ("gks_schema", "pydantic_models"),
    [
        (GKSSchema.VRS, vrs_models),
        (GKSSchema.CORE_IM, entity_models),
        (GKSSchema.DOMAIN, domain_models),
    ],
)
def test_schema_models_in_pydantic(gks_schema, pydantic_models):
    """Every class title found in the schema must exist in the Pydantic module.

    Base classes, concrete classes and primitives are all checked; the failing
    class title is used as the assertion message.
    """
    family = GKS_SCHEMA_MAPPING[gks_schema]
    every_schema_model = set().union(
        family.base_classes, family.concrete_classes, family.primitives
    )
    for model_name in every_schema_model:
        # A missing (or falsy) attribute fails the check.
        found = getattr(pydantic_models, model_name, False)
        assert found, model_name
88+
89+
90+
@pytest.mark.parametrize(
    ("gks_schema", "pydantic_models"),
    [
        (GKSSchema.VRS, vrs_models),
        (GKSSchema.CORE_IM, entity_models),
        (GKSSchema.DOMAIN, domain_models),
    ],
)
def test_schema_class_fields(gks_schema, pydantic_models):
    """Check each concrete schema class against its Pydantic model.

    For every concrete class this verifies that:

    * the Pydantic model declares exactly the properties the schema declares
    * a property is required in the Pydantic model if and only if the schema
      lists it as required (``type`` is exempt on the required side)
    * property descriptions match, treating single and double quotes as equal
    """
    mapping = GKS_SCHEMA_MAPPING[gks_schema]
    for schema_model in mapping.concrete_classes:
        schema_def = mapping.schema[schema_model]
        schema_properties = schema_def["properties"]
        pydantic_model = getattr(pydantic_models, schema_model)
        assert set(pydantic_model.model_fields) == set(schema_properties), schema_model

        # "required" is an optional JSON Schema keyword; a class that omits it
        # simply has no required properties.
        required_schema_fields = set(schema_def.get("required", []))

        for prop, property_def in schema_properties.items():
            field_info = pydantic_model.model_fields[prop]
            label = f"{pydantic_model}.{prop}"
            pydantic_field_required = field_info.is_required()

            if prop in required_schema_fields:
                # `type` is skipped: a field declared with a default value is
                # not reported as required by Pydantic even when the schema
                # requires it.
                if prop != "type":
                    assert pydantic_field_required, label
            else:
                assert not pydantic_field_required, label

            if "description" in property_def:
                # Fail with the field label rather than an AttributeError when
                # the model field has no description at all.
                assert field_info.description is not None, label
                schema_description = property_def["description"].replace("'", "\"")
                model_description = field_info.description.replace("'", "\"")
                assert schema_description == model_description, label
            else:
                assert field_info.description is None, label
124+
125+
126+
def test_ga4gh_keys():
    """Compare ``ga4ghDigest`` keys in the VRS schema with ``ga4gh.keys`` on the models.

    Classes whose schema defines no digest keys are skipped. For the rest, the
    Pydantic model's ``ga4gh.keys`` must contain the same keys as the schema
    and must be a sorted list.
    """
    vrs_mapping = GKS_SCHEMA_MAPPING[GKSSchema.VRS]
    for vrs_class in vrs_mapping.concrete_classes:
        digest_spec = vrs_mapping.schema[vrs_class].get("ga4ghDigest", {})
        schema_digest_keys = digest_spec.get("keys", None)
        if schema_digest_keys is None:
            continue

        pydantic_model = getattr(vrs_models, vrs_class)

        try:
            model_digest_keys = pydantic_model.ga4gh.keys
        except AttributeError as e:
            # Re-raise with the class name so the offending model is obvious.
            raise AttributeError(vrs_class) from e

        assert set(model_digest_keys) == set(schema_digest_keys), vrs_class
        assert model_digest_keys == sorted(model_digest_keys), vrs_class

0 commit comments

Comments
 (0)