From 91b3d6e7b8453ce51def696d97861346d799119b Mon Sep 17 00:00:00 2001 From: Laurent Date: Tue, 2 Dec 2025 14:23:54 +0100 Subject: [PATCH 01/25] first attempt --- rocrate/model/__init__.py | 2 ++ rocrate/model/subcrate.py | 45 +++++++++++++++++++++++++++++++++++++++ rocrate/rocrate.py | 27 ++++++++++++++++++++--- 3 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 rocrate/model/subcrate.py diff --git a/rocrate/model/__init__.py b/rocrate/model/__init__.py index c2ed0e7..452ef8d 100644 --- a/rocrate/model/__init__.py +++ b/rocrate/model/__init__.py @@ -42,6 +42,7 @@ from .person import Person from .root_dataset import RootDataset from .softwareapplication import SoftwareApplication +from .subcrate import Subcrate from .testdefinition import TestDefinition from .testinstance import TestInstance from .preview import Preview @@ -63,6 +64,7 @@ "Preview", "RootDataset", "SoftwareApplication", + "Subcrate", "TestDefinition", "TestInstance", "TestService", diff --git a/rocrate/model/subcrate.py b/rocrate/model/subcrate.py new file mode 100644 index 0000000..7c1fdae --- /dev/null +++ b/rocrate/model/subcrate.py @@ -0,0 +1,45 @@ + +from ..model.dataset import Dataset +from ..rocrate import ROCrate + + +class Subcrate(Dataset): + + def __init__(self, crate, source=None, dest_path=None, fetch_remote=False, + validate_url=False, record_size=False): + """ + This is a data-entity to represent a nested RO-Crate inside another RO-Crate. + + :param crate: The parent crate + :param source: The relative path to the subcrate, or its URL + :param dest_path: Description + :param fetch_remote: Description + :param validate_url: Description + :param properties: Description + :param record_size: Description + """ + super().__init__(crate, source, dest_path, fetch_remote, + validate_url, properties=None, record_size=record_size) + + self.subcrate = None + + def load_subcrate(self): + """ + Load the nested RO-Crate from the source path or URL. + + Populates the `subcrate` attribute with the loaded ROCrate instance, + and updates the JSON-LD representation accordingly. + """ + if self.subcrate is not None: + return self.subcrate + + self.subcrate = ROCrate(self.source) + + self._jsonld = self.subcrate.get("hasParts", default={}) + + def __getitem__(self, key): + + if self.subcrate is None: + self.load_subcrate() + + return super().__getitem__(key) \ No newline at end of file diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index d36750e..c0d95a2 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -49,6 +49,7 @@ Preview, RootDataset, SoftwareApplication, + Subcrate, TestDefinition, TestInstance, TestService, @@ -80,10 +81,24 @@ def pick_type(json_entity, type_map, fallback=None): except KeyError: raise ValueError(f'entity {json_entity["@id"]!r} has no @type') types = {_.strip() for _ in set(t if isinstance(t, list) else [t])} + + entity_class = None for name, c in type_map.items(): if name in types: - return c - return fallback + entity_class = c + break + + if not entity_class: + return fallback + + if entity_class is Dataset: + + # TODO find a better way to check the profile + if json_entity.get("conformsTo", "").startswith("https://w3id.org"): + # Subcrate are a specific case of dataset + return Subcrate + + return Dataset def get_version(metadata_properties): @@ -193,9 +208,15 @@ def __add_parts(self, parts, entities, source): entity = entities.pop(id_) assert id_ == entity.pop('@id') cls = pick_type(entity, type_map, fallback=DataEntity) - if cls is DataEntity: + + if cls is Subcrate: + instance = Subcrate(self, source / unquote(id_)) + + elif cls is DataEntity: instance = DataEntity(self, identifier=id_, properties=entity) + else: + # cls is either a File or a Dataset (Directory) if is_url(id_): instance = cls(self, id_, properties=entity) else: From 8203025ba4630a0c55a23c0caedc4f978bee3307 Mon Sep 17 00:00:00 2001 From: Laurent Date: Tue, 2 Dec 2025 15:35:41 +0100 Subject: [PATCH 02/25] move the Subcrate to rocrate main class --- rocrate/model/__init__.py | 2 -- rocrate/model/subcrate.py | 45 ------------------------------ rocrate/rocrate.py | 58 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 54 insertions(+), 51 deletions(-) delete mode 100644 rocrate/model/subcrate.py diff --git a/rocrate/model/__init__.py b/rocrate/model/__init__.py index 452ef8d..c2ed0e7 100644 --- a/rocrate/model/__init__.py +++ b/rocrate/model/__init__.py @@ -42,7 +42,6 @@ from .person import Person from .root_dataset import RootDataset from .softwareapplication import SoftwareApplication -from .subcrate import Subcrate from .testdefinition import TestDefinition from .testinstance import TestInstance from .preview import Preview @@ -64,7 +63,6 @@ "Preview", "RootDataset", "SoftwareApplication", - "Subcrate", "TestDefinition", "TestInstance", "TestService", diff --git a/rocrate/model/subcrate.py b/rocrate/model/subcrate.py deleted file mode 100644 index 7c1fdae..0000000 --- a/rocrate/model/subcrate.py +++ /dev/null @@ -1,45 +0,0 @@ - -from ..model.dataset import Dataset -from ..rocrate import ROCrate - - -class Subcrate(Dataset): - - def __init__(self, crate, source=None, dest_path=None, fetch_remote=False, - validate_url=False, record_size=False): - """ - This is a data-entity to represent a nested RO-Crate inside another RO-Crate. - - :param crate: The parent crate - :param source: The relative path to the subcrate, or its URL - :param dest_path: Description - :param fetch_remote: Description - :param validate_url: Description - :param properties: Description - :param record_size: Description - """ - super().__init__(crate, source, dest_path, fetch_remote, - validate_url, properties=None, record_size=record_size) - - self.subcrate = None - - def load_subcrate(self): - """ - Load the nested RO-Crate from the source path or URL. - - Populates the `subcrate` attribute with the loaded ROCrate instance, - and updates the JSON-LD representation accordingly. - """ - if self.subcrate is not None: - return self.subcrate - - self.subcrate = ROCrate(self.source) - - self._jsonld = self.subcrate.get("hasParts", default={}) - - def __getitem__(self, key): - - if self.subcrate is None: - self.load_subcrate() - - return super().__getitem__(key) \ No newline at end of file diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index c0d95a2..aaada38 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -49,7 +49,6 @@ Preview, RootDataset, SoftwareApplication, - Subcrate, TestDefinition, TestInstance, TestService, @@ -93,12 +92,19 @@ def pick_type(json_entity, type_map, fallback=None): if entity_class is Dataset: + # Check if the dataset is a Subcrate + # i.e it has a conformsTo entry matching a RO-Crate profile # TODO find a better way to check the profile - if json_entity.get("conformsTo", "").startswith("https://w3id.org"): - # Subcrate are a specific case of dataset - return Subcrate + if list_profiles := json_entity.get("conformsTo", []): + + for profile_ref in as_list(list_profiles): + if profile_ref.get("@id", "").startswith("https://w3id.org/ro/crate/"): + return Subcrate return Dataset + + else: + return entity_class def get_version(metadata_properties): @@ -803,6 +809,50 @@ def __validate_suite(self, suite): return suite +class Subcrate(Dataset): + + def __init__(self, crate, source=None, dest_path=None, fetch_remote=False, + validate_url=False, record_size=False): + """ + This is a data-entity to represent a nested RO-Crate inside another RO-Crate. + + :param crate: The parent crate + :param source: The relative path to the subcrate, or its URL + :param dest_path: Description + :param fetch_remote: Description + :param validate_url: Description + :param properties: Description + :param record_size: Description + """ + super().__init__(crate, source, dest_path, fetch_remote, + validate_url, properties=None, record_size=record_size) + + self.subcrate = None + + def load_subcrate(self): + """ + Load the nested RO-Crate from the source path or URL. + + Populates the `subcrate` attribute with the loaded ROCrate instance, + and updates the JSON-LD representation accordingly. + """ + if self.subcrate is None: + self.subcrate = ROCrate(self.source) + if list_parts := self.subcrate.root_dataset.get("hasPart"): + self._jsonld["hasPart"] = list_parts + + def __getitem__(self, key): + + if self.subcrate is None: + self.load_subcrate() + + return super().__getitem__(key) + + def as_jsonld(self): + if self.subcrate is None: + self.load_subcrate() + return super().as_jsonld() + def make_workflow_rocrate(workflow_path, wf_type, include_files=[], fetch_remote=False, cwl=None, diagram=None): wf_crate = ROCrate() From 25b56d3511aecd5b0b37a3331043ede3f0942e04 Mon Sep 17 00:00:00 2001 From: Laurent Date: Tue, 2 Dec 2025 15:48:19 +0100 Subject: [PATCH 03/25] use get_norm_value instead of get + as_list --- rocrate/rocrate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index aaada38..f907796 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -95,10 +95,10 @@ def pick_type(json_entity, type_map, fallback=None): # Check if the dataset is a Subcrate # i.e it has a conformsTo entry matching a RO-Crate profile # TODO find a better way to check the profile - if list_profiles := json_entity.get("conformsTo", []): + if list_profiles := get_norm_value(json_entity, "conformsTo"): - for profile_ref in as_list(list_profiles): - if profile_ref.get("@id", "").startswith("https://w3id.org/ro/crate/"): + for profile_ref in list_profiles: + if profile_ref.startswith("https://w3id.org/ro/crate/"): return Subcrate return Dataset From c21e0e0bda088585254149efd51bf751ec06206a Mon Sep 17 00:00:00 2001 From: Laurent Date: Tue, 2 Dec 2025 16:00:22 +0100 Subject: [PATCH 04/25] add support for get with rocrate --- rocrate/rocrate.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index f907796..7ba4017 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -833,8 +833,8 @@ def load_subcrate(self): """ Load the nested RO-Crate from the source path or URL. - Populates the `subcrate` attribute with the loaded ROCrate instance, - and updates the JSON-LD representation accordingly. + This adds an attribute "hasPart" to the `subcrate` with the entities from the nested RO-Crate, + updating the JSON-LD representation accordingly. """ if self.subcrate is None: self.subcrate = ROCrate(self.source) @@ -842,17 +842,23 @@ def load_subcrate(self): self._jsonld["hasPart"] = list_parts def __getitem__(self, key): - if self.subcrate is None: self.load_subcrate() + + if key in self._jsonld: + # e.g the "original" entity keys such as id or type + return super().__getitem__(key) + + # look into the subcrate entities + return self.subcrate.get(key) - return super().__getitem__(key) - def as_jsonld(self): if self.subcrate is None: self.load_subcrate() return super().as_jsonld() + + def make_workflow_rocrate(workflow_path, wf_type, include_files=[], fetch_remote=False, cwl=None, diagram=None): wf_crate = ROCrate() From 0dbdb152eff326df4597483ce18a8d64fc54860f Mon Sep 17 00:00:00 2001 From: Laurent Date: Tue, 2 Dec 2025 16:21:37 +0100 Subject: [PATCH 05/25] add simple tests --- test/test-data/crate_with_subcrate/file.txt | 1 + .../ro-crate-metadata.json | 38 +++++++++++++++++++ .../subcrate/ro-crate-metadata.json | 29 ++++++++++++++ .../crate_with_subcrate/subcrate/subfile.txt | 1 + test/test_read.py | 29 +++++++++++++- 5 files changed, 97 insertions(+), 1 deletion(-) create mode 100644 test/test-data/crate_with_subcrate/file.txt create mode 100644 test/test-data/crate_with_subcrate/ro-crate-metadata.json create mode 100644 test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json create mode 100644 test/test-data/crate_with_subcrate/subcrate/subfile.txt diff --git a/test/test-data/crate_with_subcrate/file.txt b/test/test-data/crate_with_subcrate/file.txt new file mode 100644 index 0000000..7b4d68d --- /dev/null +++ b/test/test-data/crate_with_subcrate/file.txt @@ -0,0 +1 @@ +empty \ No newline at end of file diff --git a/test/test-data/crate_with_subcrate/ro-crate-metadata.json b/test/test-data/crate_with_subcrate/ro-crate-metadata.json new file mode 100644 index 0000000..46aff37 --- /dev/null +++ b/test/test-data/crate_with_subcrate/ro-crate-metadata.json @@ -0,0 +1,38 @@ +{ + "@context": "https://w3id.org/ro/crate/1.1/context", + "@graph": [ + { + "@id": "./", + "@type": "Dataset", + "name": "Top-level crate with subcrate", + "description": "A RO-Crate containing a subcrate", + "license": "https://spdx.org/licenses/MIT.html", + "datePublished": "2025-12-02T08:39:54+00:00", + "hasPart": [ + {"@id": "file.txt"}, + {"@id":"subcrate/"} + ] + }, + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "about": { + "@id": "./" + }, + "conformsTo": { + "@id": "https://w3id.org/ro/crate/1.1" + } + }, + { + "@id": "file.txt", + "@type": "File" + }, + { + "@id": "subcrate/", + "@type": "Dataset", + "conformsTo": { + "@id": "https://w3id.org/ro/crate/" + } + } + ] +} \ No newline at end of file diff --git a/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json b/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json new file mode 100644 index 0000000..b56eba0 --- /dev/null +++ b/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json @@ -0,0 +1,29 @@ +{ + "@context": "https://w3id.org/ro/crate/1.1/context", + "@graph": [ + { + "@id": "./", + "@type": "Dataset", + "datePublished": "2025-12-02T08:39:54+00:00", + "hasPart": [ + + {"@id": "subfile.txt"} + + ] + }, + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "about": { + "@id": "./" + }, + "conformsTo": { + "@id": "https://w3id.org/ro/crate/1.1" + } + }, + { + "@id": "subfile.txt", + "@type": "File" + } + ] +} \ No newline at end of file diff --git a/test/test-data/crate_with_subcrate/subcrate/subfile.txt b/test/test-data/crate_with_subcrate/subcrate/subfile.txt new file mode 100644 index 0000000..7b4d68d --- /dev/null +++ b/test/test-data/crate_with_subcrate/subcrate/subfile.txt @@ -0,0 +1 @@ +empty \ No newline at end of file diff --git a/test/test_read.py b/test/test_read.py index 359da13..0563ce8 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -27,7 +27,8 @@ import zipfile from pathlib import Path -from rocrate.rocrate import ROCrate +from rocrate import model +from rocrate.rocrate import ROCrate, Subcrate from rocrate.model import DataEntity, ContextEntity, File, Dataset _URL = ('https://raw.githubusercontent.com/ResearchObject/ro-crate-py/master/' @@ -191,6 +192,32 @@ def test_bad_crate(test_data_dir, tmpdir): with pytest.raises(ValueError): ROCrate(crate_dir) +def load_crate_with_subcrate(test_data_dir): + return ROCrate(test_data_dir / "crate_with_subcrate") + +def test_crate_with_subcrate(test_data_dir): + + main_crate = load_crate_with_subcrate(test_data_dir) + + subcrate = main_crate.get("subcrate") + assert isinstance(subcrate, Subcrate) + + # check that at this point, we have not yet loaded the subcrate + assert subcrate._jsonld == subcrate._empty() + assert "hasPart" not in subcrate + + # check lazy loading by accessing an entity from the subcrate + assert isinstance(subcrate.get("subfile.txt"), model.file.File) + + # reload the crate to "reset" the state to unloaded + main_crate = load_crate_with_subcrate(test_data_dir) + subcrate = main_crate.get("subcrate") + + # as_jsonld should trigger loading of the subcrate + assert subcrate.as_jsonld() != subcrate._empty() + assert "hasPart" in subcrate + assert len(subcrate["hasPart"]) == 1 + @pytest.mark.parametrize("override", [False, True]) def test_init(test_data_dir, tmpdir, helpers, override): From ce2a0247977f7334e7e539238a30e845ed9d5911 Mon Sep 17 00:00:00 2001 From: Laurent Date: Tue, 2 Dec 2025 17:01:12 +0100 Subject: [PATCH 06/25] handle url for subcrates --- rocrate/rocrate.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 7ba4017..5184ebf 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -216,8 +216,11 @@ def __add_parts(self, parts, entities, source): cls = pick_type(entity, type_map, fallback=DataEntity) if cls is Subcrate: - instance = Subcrate(self, source / unquote(id_)) - + if is_url(id_): + instance = Subcrate(self, id_) + else: + instance = Subcrate(self, source / unquote(id_)) + elif cls is DataEntity: instance = DataEntity(self, identifier=id_, properties=entity) From fbe0eeeee3c7e9dbd677254e09b7729f1105d2e8 Mon Sep 17 00:00:00 2001 From: Laurent Date: Wed, 3 Dec 2025 09:44:31 +0100 Subject: [PATCH 07/25] add flag parse_subcrate --- rocrate/rocrate.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 5184ebf..85399f7 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -74,7 +74,7 @@ def is_data_entity(entity): return DATA_ENTITY_TYPES.intersection(as_list(entity.get("@type", []))) -def pick_type(json_entity, type_map, fallback=None): +def pick_type(json_entity, type_map, fallback=None, parse_subcrate=False): try: t = json_entity["@type"] except KeyError: @@ -95,7 +95,7 @@ def pick_type(json_entity, type_map, fallback=None): # Check if the dataset is a Subcrate # i.e it has a conformsTo entry matching a RO-Crate profile # TODO find a better way to check the profile - if list_profiles := get_norm_value(json_entity, "conformsTo"): + if parse_subcrate and (list_profiles := get_norm_value(json_entity, "conformsTo")): for profile_ref in list_profiles: if profile_ref.startswith("https://w3id.org/ro/crate/"): @@ -117,7 +117,7 @@ def get_version(metadata_properties): class ROCrate(): - def __init__(self, source=None, gen_preview=False, init=False, exclude=None, version=DEFAULT_VERSION): + def __init__(self, source=None, gen_preview=False, init=False, exclude=None, version=DEFAULT_VERSION, parse_subcrate=False): self.mode = None self.source = source self.exclude = exclude @@ -142,6 +142,7 @@ def __init__(self, source=None, gen_preview=False, init=False, exclude=None, ver source = self.__read(source, gen_preview=gen_preview) # in the zip case, self.source is the extracted dir self.source = source + self.parse_subcrate = parse_subcrate def __init_from_tree(self, top_dir, gen_preview=False, version=DEFAULT_VERSION): top_dir = Path(top_dir) @@ -213,7 +214,7 @@ def __add_parts(self, parts, entities, source): continue entity = entities.pop(id_) assert id_ == entity.pop('@id') - cls = pick_type(entity, type_map, fallback=DataEntity) + cls = pick_type(entity, type_map, fallback=DataEntity, parse_subcrate=self.parse_subcrate) if cls is Subcrate: if is_url(id_): From bd02ee2c0b349d20c55d5d351be96431b87cfd85 Mon Sep 17 00:00:00 2001 From: Laurent Date: Wed, 3 Dec 2025 11:58:25 +0100 Subject: [PATCH 08/25] fix missing flag parse_rocrate --- rocrate/rocrate.py | 19 +++++++++++-------- test/test_read.py | 2 +- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 85399f7..0cbcdea 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -117,10 +117,16 @@ def get_version(metadata_properties): class ROCrate(): - def __init__(self, source=None, gen_preview=False, init=False, exclude=None, version=DEFAULT_VERSION, parse_subcrate=False): + def __init__(self, + source=None, + gen_preview=False, + init=False, exclude=None, + version=DEFAULT_VERSION, + parse_subcrate=False): self.mode = None self.source = source self.exclude = exclude + self.parse_subcrate = parse_subcrate self.__entity_map = {} # TODO: add this as @base in the context? At least when loading # from zip @@ -142,7 +148,6 @@ def __init__(self, source=None, gen_preview=False, init=False, exclude=None, ver source = self.__read(source, gen_preview=gen_preview) # in the zip case, self.source is the extracted dir self.source = source - self.parse_subcrate = parse_subcrate def __init_from_tree(self, top_dir, gen_preview=False, version=DEFAULT_VERSION): top_dir = Path(top_dir) @@ -818,20 +823,18 @@ class Subcrate(Dataset): def __init__(self, crate, source=None, dest_path=None, fetch_remote=False, validate_url=False, record_size=False): """ - This is a data-entity to represent a nested RO-Crate inside another RO-Crate. + Data-entity representing a subcrate inside another RO-Crate. :param crate: The parent crate :param source: The relative path to the subcrate, or its URL - :param dest_path: Description - :param fetch_remote: Description - :param validate_url: Description - :param properties: Description - :param record_size: Description """ super().__init__(crate, source, dest_path, fetch_remote, validate_url, properties=None, record_size=record_size) self.subcrate = None + """ + A ROCrate instance allowing access to the nested RO-Crate. + """ def load_subcrate(self): """ diff --git a/test/test_read.py b/test/test_read.py index 0563ce8..f7dcf41 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -193,7 +193,7 @@ def test_bad_crate(test_data_dir, tmpdir): ROCrate(crate_dir) def load_crate_with_subcrate(test_data_dir): - return ROCrate(test_data_dir / "crate_with_subcrate") + return ROCrate(test_data_dir / "crate_with_subcrate", parse_subcrate=True) def test_crate_with_subcrate(test_data_dir): From bf6f276c7377f81de38243b8902ee3de6ff5c5c1 Mon Sep 17 00:00:00 2001 From: Laurent Date: Wed, 3 Dec 2025 11:59:43 +0100 Subject: [PATCH 09/25] add subcrate_entities property --- rocrate/rocrate.py | 5 +++++ test/test_read.py | 1 + 2 files changed, 6 insertions(+) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 0cbcdea..7efef1a 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -270,6 +270,11 @@ def contextual_entities(self): if not isinstance(e, (RootDataset, Metadata, Preview)) and not hasattr(e, "write")] + @property + def subcrate_entities(self): + return [e for e in self.__entity_map.values() + if isinstance(e, Subcrate)] + @property def name(self): return self.root_dataset.get('name') diff --git a/test/test_read.py b/test/test_read.py index f7dcf41..e6db9ef 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -201,6 +201,7 @@ def test_crate_with_subcrate(test_data_dir): subcrate = main_crate.get("subcrate") assert isinstance(subcrate, Subcrate) + assert main_crate.subcrate_entities == [subcrate] # check that at this point, we have not yet loaded the subcrate assert subcrate._jsonld == subcrate._empty() From fd1fd53c4883dd09a6d8806e18a16aa6a9a6d9bc Mon Sep 17 00:00:00 2001 From: Laurent Date: Wed, 3 Dec 2025 14:09:52 +0100 Subject: [PATCH 10/25] support get of subcrate entity from top crate also fix flake8 with precommit --- rocrate/rocrate.py | 56 ++++++++++++++++++++++++++++------------------ test/test_read.py | 22 ++++++++++++------ 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 7efef1a..88a1cc6 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -80,29 +80,29 @@ def pick_type(json_entity, type_map, fallback=None, parse_subcrate=False): except KeyError: raise ValueError(f'entity {json_entity["@id"]!r} has no @type') types = {_.strip() for _ in set(t if isinstance(t, list) else [t])} - + entity_class = None for name, c in type_map.items(): if name in types: entity_class = c break - + if not entity_class: return fallback - + if entity_class is Dataset: - - # Check if the dataset is a Subcrate + + # Check if the dataset is a Subcrate # i.e it has a conformsTo entry matching a RO-Crate profile - # TODO find a better way to check the profile + # TODO find a better way to check the profiles? if parse_subcrate and (list_profiles := get_norm_value(json_entity, "conformsTo")): - + for profile_ref in list_profiles: if profile_ref.startswith("https://w3id.org/ro/crate/"): return Subcrate - + return Dataset - + else: return entity_class @@ -220,16 +220,16 @@ def __add_parts(self, parts, entities, source): entity = entities.pop(id_) assert id_ == entity.pop('@id') cls = pick_type(entity, type_map, fallback=DataEntity, parse_subcrate=self.parse_subcrate) - + if cls is Subcrate: if is_url(id_): instance = Subcrate(self, id_) else: instance = Subcrate(self, source / unquote(id_)) - + elif cls is DataEntity: instance = DataEntity(self, identifier=id_, properties=entity) - + else: # cls is either a File or a Dataset (Directory) if is_url(id_): @@ -238,6 +238,8 @@ def __add_parts(self, parts, entities, source): instance = cls(self, source / unquote(id_), id_, properties=entity) self.add(instance) if instance.type == "Dataset": + # for Subcrate, type is currently Dataset too, + # but the hasPart is not populated yet only once accssing a subcrate element (lazy loading) self.__add_parts(as_list(entity.get("hasPart", [])), entities, source) def __read_contextual_entities(self, entities): @@ -407,7 +409,18 @@ def _get_root_jsonld(self): def dereference(self, entity_id, default=None): canonical_id = self.resolve_id(entity_id) - return self.__entity_map.get(canonical_id, default) + + if canonical_id in self.__entity_map: + return self.__entity_map[canonical_id] + + for subcrate_entity in self.subcrate_entities: + # check if the entity_id might be within a subcrate + if entity_id.startswith(subcrate_entity.id): + entity_id_in_subcrate = entity_id.replace(subcrate_entity.id, "") + return subcrate_entity.get(entity_id_in_subcrate, default=default) + + # fallback + return default get = dereference @@ -828,24 +841,24 @@ class Subcrate(Dataset): def __init__(self, crate, source=None, dest_path=None, fetch_remote=False, validate_url=False, record_size=False): """ - Data-entity representing a subcrate inside another RO-Crate. - + Data-entity representing a subcrate inside another RO-Crate. + :param crate: The parent crate :param source: The relative path to the subcrate, or its URL """ super().__init__(crate, source, dest_path, fetch_remote, validate_url, properties=None, record_size=record_size) - + self.subcrate = None """ A ROCrate instance allowing access to the nested RO-Crate. """ - + def load_subcrate(self): """ Load the nested RO-Crate from the source path or URL. - - This adds an attribute "hasPart" to the `subcrate` with the entities from the nested RO-Crate, + + This adds an attribute "hasPart" to the `subcrate` with the entities from the nested RO-Crate, updating the JSON-LD representation accordingly. """ if self.subcrate is None: @@ -856,7 +869,7 @@ def load_subcrate(self): def __getitem__(self, key): if self.subcrate is None: self.load_subcrate() - + if key in self._jsonld: # e.g the "original" entity keys such as id or type return super().__getitem__(key) @@ -868,9 +881,8 @@ def as_jsonld(self): if self.subcrate is None: self.load_subcrate() return super().as_jsonld() - - + def make_workflow_rocrate(workflow_path, wf_type, include_files=[], fetch_remote=False, cwl=None, diagram=None): wf_crate = ROCrate() diff --git a/test/test_read.py b/test/test_read.py index e6db9ef..b7f3d3d 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -192,13 +192,15 @@ def test_bad_crate(test_data_dir, tmpdir): with pytest.raises(ValueError): ROCrate(crate_dir) + def load_crate_with_subcrate(test_data_dir): return ROCrate(test_data_dir / "crate_with_subcrate", parse_subcrate=True) + def test_crate_with_subcrate(test_data_dir): - + main_crate = load_crate_with_subcrate(test_data_dir) - + subcrate = main_crate.get("subcrate") assert isinstance(subcrate, Subcrate) assert main_crate.subcrate_entities == [subcrate] @@ -206,18 +208,24 @@ def test_crate_with_subcrate(test_data_dir): # check that at this point, we have not yet loaded the subcrate assert subcrate._jsonld == subcrate._empty() assert "hasPart" not in subcrate - + # check lazy loading by accessing an entity from the subcrate - assert isinstance(subcrate.get("subfile.txt"), model.file.File) + subfile_entity = subcrate.get("subfile.txt") + assert isinstance(subfile_entity, model.file.File) + + # check access from the top-level crate works too + assert main_crate.get("subcrate/subfile.txt") is subfile_entity + + # Check the subfile entity is listed under hasPart of the subcrate + assert "hasPart" in subcrate + assert subcrate["hasPart"] == [subfile_entity] # reload the crate to "reset" the state to unloaded main_crate = load_crate_with_subcrate(test_data_dir) subcrate = main_crate.get("subcrate") - + # as_jsonld should trigger loading of the subcrate assert subcrate.as_jsonld() != subcrate._empty() - assert "hasPart" in subcrate - assert len(subcrate["hasPart"]) == 1 @pytest.mark.parametrize("override", [False, True]) From 991664101deb86da24e1efffacd152f5ab9996c5 Mon Sep 17 00:00:00 2001 From: Laurent Date: Wed, 3 Dec 2025 14:26:17 +0100 Subject: [PATCH 11/25] load_subcrate as hidden function --- rocrate/rocrate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 88a1cc6..33aec3c 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -854,7 +854,7 @@ def __init__(self, crate, source=None, dest_path=None, fetch_remote=False, A ROCrate instance allowing access to the nested RO-Crate. """ - def load_subcrate(self): + def _load_subcrate(self): """ Load the nested RO-Crate from the source path or URL. @@ -868,7 +868,7 @@ def load_subcrate(self): def __getitem__(self, key): if self.subcrate is None: - self.load_subcrate() + self._load_subcrate() if key in self._jsonld: # e.g the "original" entity keys such as id or type @@ -879,7 +879,7 @@ def __getitem__(self, key): def as_jsonld(self): if self.subcrate is None: - self.load_subcrate() + self._load_subcrate() return super().as_jsonld() From 22b34f277e14dddeb897ac5560fa9422dc8a8cfd Mon Sep 17 00:00:00 2001 From: Laurent Date: Wed, 3 Dec 2025 14:26:28 +0100 Subject: [PATCH 12/25] add get_entities to subcrate --- rocrate/rocrate.py | 5 +++++ test/test_read.py | 2 ++ 2 files changed, 7 insertions(+) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 33aec3c..422c163 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -882,6 +882,11 @@ def as_jsonld(self): self._load_subcrate() return super().as_jsonld() + def get_entities(self): + if self.subcrate is None: + self._load_subcrate() + return self.subcrate.get_entities() + def make_workflow_rocrate(workflow_path, wf_type, include_files=[], fetch_remote=False, cwl=None, diagram=None): diff --git a/test/test_read.py b/test/test_read.py index b7f3d3d..5eb26a8 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -220,6 +220,8 @@ def test_crate_with_subcrate(test_data_dir): assert "hasPart" in subcrate assert subcrate["hasPart"] == [subfile_entity] + assert len(subcrate.get_entities()) == 3 # root dataset, metadata.json, subfile + # reload the crate to "reset" the state to unloaded main_crate = load_crate_with_subcrate(test_data_dir) subcrate = main_crate.get("subcrate") From bf7e0eaa908c88ec5b96c5c28282feb9c0679b1d Mon Sep 17 00:00:00 2001 From: Laurent Date: Wed, 3 Dec 2025 14:30:04 +0100 Subject: [PATCH 13/25] add Subcrate to test_data_entities --- test/test_model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_model.py b/test/test_model.py index a86c25f..26b13d2 100644 --- a/test/test_model.py +++ b/test/test_model.py @@ -29,7 +29,7 @@ from pathlib import Path import pytest -from rocrate.rocrate import ROCrate +from rocrate.rocrate import ROCrate, Subcrate from rocrate.model import ( DataEntity, File, @@ -103,10 +103,11 @@ def test_data_entities(test_data_dir): crate = ROCrate() file_ = crate.add(File(crate, test_data_dir / 'sample_file.txt')) dataset = crate.add(Dataset(crate, test_data_dir / 'test_add_dir')) + subcrate = crate.add(Subcrate(crate, test_data_dir / 'crate-1.0')) data_entity = crate.add(DataEntity(crate, '#mysterious')) - assert set(crate.data_entities) == {file_, dataset, data_entity} + assert set(crate.data_entities) == {file_, dataset, subcrate, data_entity} part_ids = set(_["@id"] for _ in crate.root_dataset._jsonld["hasPart"]) - assert set(_.id for _ in (file_, dataset, data_entity)) <= part_ids + assert set(_.id for _ in (file_, dataset, subcrate, data_entity)) <= part_ids @pytest.mark.skipif(sys.platform == "darwin", reason="CI sometimes fails on macOS") From 9e9a4cf56007b24d101ab70ef897ed96ddda3032 Mon Sep 17 00:00:00 2001 From: Laurent Date: Wed, 3 Dec 2025 15:11:38 +0100 Subject: [PATCH 14/25] fix issue with nested crates --- rocrate/rocrate.py | 11 ++++++-- .../subcrate/ro-crate-metadata.json | 14 +++++++--- .../subcrate/subsubcrate/deepfile.txt | 1 + .../subsubcrate/ro-crate-metadata.json | 27 +++++++++++++++++++ test/test_read.py | 9 ++++--- 5 files changed, 53 insertions(+), 9 deletions(-) create mode 100644 test/test-data/crate_with_subcrate/subcrate/subsubcrate/deepfile.txt create mode 100644 test/test-data/crate_with_subcrate/subcrate/subsubcrate/ro-crate-metadata.json diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 422c163..3dcba09 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -414,9 +414,16 @@ def dereference(self, entity_id, default=None): return self.__entity_map[canonical_id] for subcrate_entity in self.subcrate_entities: + # check if the entity_id might be within a subcrate + # i.e entity_id would start with a subcrate id e.g subcrate/subfile.txt if entity_id.startswith(subcrate_entity.id): - entity_id_in_subcrate = entity_id.replace(subcrate_entity.id, "") + + # replace id of subcrate to use get in the subcrate + # subcrate/subfile.txt --> subfile.txt + # dont use replace, as it could replace in the middle of the id + entity_id_in_subcrate = entity_id[len(subcrate_entity.id):] + return subcrate_entity.get(entity_id_in_subcrate, default=default) # fallback @@ -862,7 +869,7 @@ def _load_subcrate(self): updating the JSON-LD representation accordingly. """ if self.subcrate is None: - self.subcrate = ROCrate(self.source) + self.subcrate = ROCrate(self.source, parse_subcrate=True) # would load further nested RO-Crate if list_parts := self.subcrate.root_dataset.get("hasPart"): self._jsonld["hasPart"] = list_parts diff --git a/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json b/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json index b56eba0..2bd736f 100644 --- a/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json +++ b/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json @@ -6,9 +6,8 @@ "@type": "Dataset", "datePublished": "2025-12-02T08:39:54+00:00", "hasPart": [ - - {"@id": "subfile.txt"} - + {"@id": "subfile.txt"}, + {"@id": "subsubcrate/"} ] }, { @@ -24,6 +23,13 @@ { "@id": "subfile.txt", "@type": "File" + }, + { + "@id": "subsubcrate/", + "@type": "Dataset", + "conformsTo": { + "@id": "https://w3id.org/ro/crate/" + } } ] -} \ No newline at end of file +} diff --git a/test/test-data/crate_with_subcrate/subcrate/subsubcrate/deepfile.txt b/test/test-data/crate_with_subcrate/subcrate/subsubcrate/deepfile.txt new file mode 100644 index 0000000..c6cac69 --- /dev/null +++ b/test/test-data/crate_with_subcrate/subcrate/subsubcrate/deepfile.txt @@ -0,0 +1 @@ +empty diff --git a/test/test-data/crate_with_subcrate/subcrate/subsubcrate/ro-crate-metadata.json b/test/test-data/crate_with_subcrate/subcrate/subsubcrate/ro-crate-metadata.json new file mode 100644 index 0000000..1b5390f --- /dev/null +++ b/test/test-data/crate_with_subcrate/subcrate/subsubcrate/ro-crate-metadata.json @@ -0,0 +1,27 @@ +{ + "@context": "https://w3id.org/ro/crate/1.1/context", + "@graph": [ + { + "@id": "./", + "@type": "Dataset", + "datePublished": "2025-12-02T08:39:54+00:00", + "hasPart": [ + {"@id": "deepfile.txt"} + ] + }, + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "about": { + "@id": "./" + }, + "conformsTo": { + "@id": "https://w3id.org/ro/crate/1.1" + } + }, + { + "@id": "deepfile.txt", + "@type": "File" + } + ] +} diff --git a/test/test_read.py b/test/test_read.py index 5eb26a8..07f3214 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -216,11 +216,14 @@ def test_crate_with_subcrate(test_data_dir): # check access from the top-level crate works too assert main_crate.get("subcrate/subfile.txt") is subfile_entity - # Check the subfile entity is listed under hasPart of the subcrate + # check with another nested rocrate + assert isinstance(main_crate.get("subcrate/subsubcrate/deepfile.txt"), model.file.File) + + # Check the hasPart of the subcrate lists the file and the subsubcrate assert "hasPart" in subcrate - assert subcrate["hasPart"] == [subfile_entity] + assert len(subcrate["hasPart"]) == 2 - assert len(subcrate.get_entities()) == 3 # root dataset, metadata.json, subfile + assert len(subcrate.get_entities()) == 4 # root dataset, metadata.json, subfile, subsubcrate # reload the crate to "reset" the state to unloaded main_crate = load_crate_with_subcrate(test_data_dir) From 21556c8a57d08a274af574a8d7f5f49ea791c6c1 Mon Sep 17 00:00:00 2001 From: Laurent Date: Thu, 4 Dec 2025 14:30:53 +0100 Subject: [PATCH 15/25] keep conformsTo in Subcrate --- rocrate/rocrate.py | 38 ++++++++++++++++++++++++-------------- test/test_read.py | 9 ++++++++- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 3dcba09..d2e5d02 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -209,6 +209,14 @@ def __read_data_entities(self, entities, source, gen_preview): self.__add_parts(parts, entities, source) def __add_parts(self, parts, entities, source): + """ + Add entities to the crate from a list of entities id and Entity object. + + :param self: Description + :param parts: a list of dicts (one dict per entity) in the form {@id : "entity_id"} + :param entities: a dict with the full list of entities information as in the hasPart of the root dataset of the crate. + :param source: Description + """ type_map = OrderedDict((_.__name__, _) for _ in subclasses(FileOrDir)) for ref in parts: id_ = ref['@id'] @@ -222,10 +230,11 @@ def __add_parts(self, parts, entities, source): cls = pick_type(entity, type_map, fallback=DataEntity, parse_subcrate=self.parse_subcrate) if cls is Subcrate: + if is_url(id_): - instance = Subcrate(self, id_) + instance = Subcrate(self, source=id_, properties=entity) else: - instance = Subcrate(self, source / unquote(id_)) + instance = Subcrate(self, source=source / unquote(id_), properties=entity) elif cls is DataEntity: instance = DataEntity(self, identifier=id_, properties=entity) @@ -239,7 +248,7 @@ def __add_parts(self, parts, entities, source): self.add(instance) if instance.type == "Dataset": # for Subcrate, type is currently Dataset too, - # but the hasPart is not populated yet only once accssing a subcrate element (lazy loading) + # but the hasPart is not populated yet only once accessing a subcrate element (lazy loading) self.__add_parts(as_list(entity.get("hasPart", [])), entities, source) def __read_contextual_entities(self, entities): @@ -846,7 +855,7 @@ def __validate_suite(self, suite): class Subcrate(Dataset): def __init__(self, crate, source=None, dest_path=None, fetch_remote=False, - validate_url=False, record_size=False): + validate_url=False, properties=None, record_size=False): """ Data-entity representing a subcrate inside another RO-Crate. @@ -854,11 +863,12 @@ def __init__(self, crate, source=None, dest_path=None, fetch_remote=False, :param source: The relative path to the subcrate, or its URL """ super().__init__(crate, source, dest_path, fetch_remote, - validate_url, properties=None, record_size=record_size) + validate_url, properties=properties, record_size=record_size) - self.subcrate = None + self._subcrate = None """ A ROCrate instance allowing access to the nested RO-Crate. + The nested RO-Crate is loaded on first access to any of its attribute. """ def _load_subcrate(self): @@ -868,13 +878,13 @@ def _load_subcrate(self): This adds an attribute "hasPart" to the `subcrate` with the entities from the nested RO-Crate, updating the JSON-LD representation accordingly. """ - if self.subcrate is None: - self.subcrate = ROCrate(self.source, parse_subcrate=True) # would load further nested RO-Crate - if list_parts := self.subcrate.root_dataset.get("hasPart"): + if self._subcrate is None: + self._subcrate = ROCrate(self.source, parse_subcrate=True) # would load further nested RO-Crate + if list_parts := self._subcrate.root_dataset.get("hasPart"): self._jsonld["hasPart"] = list_parts def __getitem__(self, key): - if self.subcrate is None: + if self._subcrate is None: self._load_subcrate() if key in self._jsonld: @@ -882,17 +892,17 @@ def __getitem__(self, key): return super().__getitem__(key) # look into the subcrate entities - return self.subcrate.get(key) + return self._subcrate.get(key) def as_jsonld(self): - if self.subcrate is None: + if self._subcrate is None: self._load_subcrate() return super().as_jsonld() def get_entities(self): - if self.subcrate is None: + if self._subcrate is None: self._load_subcrate() - return self.subcrate.get_entities() + return self._subcrate.get_entities() def make_workflow_rocrate(workflow_path, wf_type, include_files=[], diff --git a/test/test_read.py b/test/test_read.py index 07f3214..bed09a6 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -205,8 +205,15 @@ def test_crate_with_subcrate(test_data_dir): assert isinstance(subcrate, Subcrate) assert main_crate.subcrate_entities == [subcrate] + # Check the subcrate kept the conformsTo attribute from the original Dataset entity + assert "conformsTo" in subcrate + # check that at this point, we have not yet loaded the subcrate - assert subcrate._jsonld == subcrate._empty() + # e.g the json ld should just have id, type and conformsTo + jsonld = subcrate._jsonld + jsonld.pop("conformsTo") + + assert jsonld == subcrate._empty() assert "hasPart" not in subcrate # check lazy loading by accessing an entity from the subcrate From d98b22f798d8b7e831cb8c014a36d72b5c524174 Mon Sep 17 00:00:00 2001 From: Laurent Date: Thu, 4 Dec 2025 16:47:44 +0100 Subject: [PATCH 16/25] use getter for inner crate also prevents directly accessing items listed in subcrate under hasPart e.g subcrate.get("subfile.txt") --- rocrate/model/entity.py | 1 + rocrate/rocrate.py | 61 +++++++++++++++++++++++++++++------------ test/test_read.py | 8 ++++-- 3 files changed, 49 insertions(+), 21 deletions(-) diff --git a/rocrate/model/entity.py b/rocrate/model/entity.py index 473a7eb..b9acc16 100644 --- a/rocrate/model/entity.py +++ b/rocrate/model/entity.py @@ -43,6 +43,7 @@ def __init__(self, crate, identifier=None, properties=None): if name.startswith("@"): self._jsonld[name] = value else: + # this will call the __setitem__ method defined below self[name] = value @property diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index d2e5d02..d9224ed 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -433,7 +433,7 @@ def dereference(self, entity_id, default=None): # dont use replace, as it could replace in the middle of the id entity_id_in_subcrate = entity_id[len(subcrate_entity.id):] - return subcrate_entity.get(entity_id_in_subcrate, default=default) + return subcrate_entity.get_crate().get(entity_id_in_subcrate, default=default) # fallback return default @@ -865,44 +865,69 @@ def __init__(self, crate, source=None, dest_path=None, fetch_remote=False, super().__init__(crate, source, dest_path, fetch_remote, validate_url, properties=properties, record_size=record_size) - self._subcrate = None + self._crate = None """ A ROCrate instance allowing access to the nested RO-Crate. The nested RO-Crate is loaded on first access to any of its attribute. + This attribute should not be confused with the crate attribute, which is a reference to the parent crate. + Caller should rather use the get_crate() method to access the nested RO-Crate. """ + def get_crate(self): + """ + Return the RO-Crate object referenced by this subcrate. + """ + if self._crate is None: + self._load_subcrate() + + return self._crate + def _load_subcrate(self): """ Load the nested RO-Crate from the source path or URL. - This adds an attribute "hasPart" to the `subcrate` with the entities from the nested RO-Crate, - updating the JSON-LD representation accordingly. + This populates the attribute `hasPart` of the `Subcrate` entity, + with the data entities listed under the `root_dataset["hasPart"]` of the nested RO-Crate. + If the nested RO-crate does not list any part und `hasPart`, + then the `hasPart` attribute of the `Subcrate` entity will be an empty list. + """ + if self._crate is None: + # parse_subcrate=True to load further nested RO-Crate (on-demand / lazily too) + self._crate = ROCrate(self.source, parse_subcrate=True) + + # Note : assigning to hasPart keeps only the dict with id:entity not the actual entities + # such that when retrieving something from hasPart one was getting a string not an entity + self["hasPart"] = self._crate.root_dataset.get("hasPart", []) + + def _get_parts_subcrate_root(self): """ - if self._subcrate is None: - self._subcrate = ROCrate(self.source, parse_subcrate=True) # would load further nested RO-Crate - if list_parts := self._subcrate.root_dataset.get("hasPart"): - self._jsonld["hasPart"] = list_parts + Get the list of data entities listed under the `root_dataset["hasPart"]` of the nested RO-Crate. + + This will load the nested RO-Crate if not already loaded. + + :return: A list of data entities of the nested RO-Crate, + or an empty list if the nested RO-Crate does not list any part. + """ + + return self.get_crate().root_dataset.get("hasPart", []) def __getitem__(self, key): - if self._subcrate is None: - self._load_subcrate() - if key in self._jsonld: - # e.g the "original" entity keys such as id or type - return super().__getitem__(key) + if key == "hasPart": + return self._get_parts_subcrate_root() - # look into the subcrate entities - return self._subcrate.get(key) + else: + return super().__getitem__(key) def as_jsonld(self): - if self._subcrate is None: + if self._crate is None: self._load_subcrate() return super().as_jsonld() def get_entities(self): - if self._subcrate is None: + if self._crate is None: self._load_subcrate() - return self._subcrate.get_entities() + return self._crate.get_entities() def make_workflow_rocrate(workflow_path, wf_type, include_files=[], diff --git a/test/test_read.py b/test/test_read.py index bed09a6..709669c 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -217,11 +217,13 @@ def test_crate_with_subcrate(test_data_dir): assert "hasPart" not in subcrate # check lazy loading by accessing an entity from the subcrate - subfile_entity = subcrate.get("subfile.txt") - assert isinstance(subfile_entity, model.file.File) + list_subcrate_parts = subcrate.get("hasPart", []) + assert len(list_subcrate_parts) == 2 # subfile.txt and subsubcrate/ + assert isinstance(list_subcrate_parts[0], DataEntity) + assert "subfile.txt" in [e.id for e in list_subcrate_parts] # check access from the top-level crate works too - assert main_crate.get("subcrate/subfile.txt") is subfile_entity + assert main_crate.get("subcrate/subfile.txt") in list_subcrate_parts # check with another nested rocrate assert isinstance(main_crate.get("subcrate/subsubcrate/deepfile.txt"), model.file.File) From 629bcfe5eeb2ce28684b6598103dd24b58dcb1ee Mon Sep 17 00:00:00 2001 From: Laurent Date: Thu, 4 Dec 2025 17:41:09 +0100 Subject: [PATCH 17/25] remove get_entities from Subcrate --- rocrate/rocrate.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index d9224ed..27b743c 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -924,11 +924,6 @@ def as_jsonld(self): self._load_subcrate() return super().as_jsonld() - def get_entities(self): - if self._crate is None: - self._load_subcrate() - return self._crate.get_entities() - def make_workflow_rocrate(workflow_path, wf_type, include_files=[], fetch_remote=False, cwl=None, diagram=None): From 771247db0508781254cb2aab1306b96a04846d51 Mon Sep 17 00:00:00 2001 From: Laurent Date: Thu, 4 Dec 2025 17:43:30 +0100 Subject: [PATCH 18/25] remove subcrate.get_entities from tests --- test/test_read.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test_read.py b/test/test_read.py index 709669c..315be15 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -232,8 +232,6 @@ def test_crate_with_subcrate(test_data_dir): assert "hasPart" in subcrate assert len(subcrate["hasPart"]) == 2 - assert len(subcrate.get_entities()) == 4 # root dataset, metadata.json, subfile, subsubcrate - # reload the crate to "reset" the state to unloaded main_crate = load_crate_with_subcrate(test_data_dir) subcrate = main_crate.get("subcrate") From 13682081e0ef82456c31db5fd892aa79add26363 Mon Sep 17 00:00:00 2001 From: Laurent Date: Fri, 5 Dec 2025 11:21:29 +0100 Subject: [PATCH 19/25] implement crate writing --- rocrate/rocrate.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 27b743c..a77e270 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -21,6 +21,7 @@ # limitations under the License. import errno +from typing import cast import uuid import zipfile import atexit @@ -873,14 +874,14 @@ def __init__(self, crate, source=None, dest_path=None, fetch_remote=False, Caller should rather use the get_crate() method to access the nested RO-Crate. """ - def get_crate(self): + def get_crate(self) -> ROCrate: """ Return the RO-Crate object referenced by this subcrate. """ if self._crate is None: self._load_subcrate() - return self._crate + return cast(ROCrate, self._crate) def _load_subcrate(self): """ @@ -924,6 +925,10 @@ def as_jsonld(self): self._load_subcrate() return super().as_jsonld() + def write(self, base_path): + self.get_crate().write(base_path / self.id) + # TODO check with URL + def make_workflow_rocrate(workflow_path, wf_type, include_files=[], fetch_remote=False, cwl=None, diagram=None): From 3c77ad255401fbeaade0bf2b815f2830611f52be Mon Sep 17 00:00:00 2001 From: Laurent Date: Tue, 9 Dec 2025 15:42:22 +0100 Subject: [PATCH 20/25] add test writing the subcrate --- test/test_write.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/test_write.py b/test/test_write.py index 3cf0422..9b3cd4f 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -627,6 +627,18 @@ def test_write_zip_nested_dest(tmpdir, helpers): assert (unpack_path / "subdir" / "a b" / "j k" / "l m.txt").is_file() +def test_write_subcrate(test_data_dir, tmpdir): + """Read the test crate with subcrate and write it to a new location. + Check that the subcrate contents are correctly written.""" + crate = ROCrate(test_data_dir / "crate_with_subcrate") + crate.write(tmpdir / "ro_crate_out") + + assert (tmpdir / "ro_crate_out" / "subcrate" / "ro-crate-metadata.json").is_file() + assert (tmpdir / "ro_crate_out" / "subcrate" / "subfile.txt").is_file() + + assert (tmpdir / "ro_crate_out" / "subcrate" / "subsubcrate" / "deepfile.txt").is_file() + + @pytest.mark.parametrize("version", ["1.0", "1.1", "1.2"]) def test_write_version(tmpdir, helpers, version): basename = helpers.LEGACY_METADATA_FILE_NAME if version == "1.0" else helpers.METADATA_FILE_NAME From dfe45eb8fd493c5ecb146a80c6b30793138fac9b Mon Sep 17 00:00:00 2001 From: simleo Date: Thu, 11 Dec 2025 12:27:12 +0100 Subject: [PATCH 21/25] test_write_subcrate: activate parse_subcrate --- test/test_write.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_write.py b/test/test_write.py index 9b3cd4f..d82b2c2 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -630,13 +630,17 @@ def test_write_zip_nested_dest(tmpdir, helpers): def test_write_subcrate(test_data_dir, tmpdir): """Read the test crate with subcrate and write it to a new location. Check that the subcrate contents are correctly written.""" - crate = ROCrate(test_data_dir / "crate_with_subcrate") + crate = ROCrate(test_data_dir / "crate_with_subcrate", parse_subcrate=True) crate.write(tmpdir / "ro_crate_out") + assert (tmpdir / "ro_crate_out" / "file.txt").is_file() + assert (tmpdir / "ro_crate_out" / "ro-crate-metadata.json").is_file() + assert (tmpdir / "ro_crate_out" / "subcrate" / "ro-crate-metadata.json").is_file() assert (tmpdir / "ro_crate_out" / "subcrate" / "subfile.txt").is_file() assert (tmpdir / "ro_crate_out" / "subcrate" / "subsubcrate" / "deepfile.txt").is_file() + assert (tmpdir / "ro_crate_out" / "subcrate" / "subsubcrate" / "ro-crate-metadata.json").is_file() @pytest.mark.parametrize("version", ["1.0", "1.1", "1.2"]) From 6ea62fe7d592043fa209cf7aca6e7e851583ab8c Mon Sep 17 00:00:00 2001 From: simleo Date: Thu, 11 Dec 2025 16:32:51 +0100 Subject: [PATCH 22/25] don't modify the main crate's jsonld when loading a subcrate --- rocrate/rocrate.py | 34 ---------------------------------- test/test_read.py | 38 ++++++++++++++++---------------------- 2 files changed, 16 insertions(+), 56 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index a77e270..9e0e5dc 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -886,45 +886,11 @@ def get_crate(self) -> ROCrate: def _load_subcrate(self): """ Load the nested RO-Crate from the source path or URL. - - This populates the attribute `hasPart` of the `Subcrate` entity, - with the data entities listed under the `root_dataset["hasPart"]` of the nested RO-Crate. - If the nested RO-crate does not list any part und `hasPart`, - then the `hasPart` attribute of the `Subcrate` entity will be an empty list. """ if self._crate is None: # parse_subcrate=True to load further nested RO-Crate (on-demand / lazily too) self._crate = ROCrate(self.source, parse_subcrate=True) - # Note : assigning to hasPart keeps only the dict with id:entity not the actual entities - # such that when retrieving something from hasPart one was getting a string not an entity - self["hasPart"] = self._crate.root_dataset.get("hasPart", []) - - def _get_parts_subcrate_root(self): - """ - Get the list of data entities listed under the `root_dataset["hasPart"]` of the nested RO-Crate. - - This will load the nested RO-Crate if not already loaded. - - :return: A list of data entities of the nested RO-Crate, - or an empty list if the nested RO-Crate does not list any part. - """ - - return self.get_crate().root_dataset.get("hasPart", []) - - def __getitem__(self, key): - - if key == "hasPart": - return self._get_parts_subcrate_root() - - else: - return super().__getitem__(key) - - def as_jsonld(self): - if self._crate is None: - self._load_subcrate() - return super().as_jsonld() - def write(self, base_path): self.get_crate().write(base_path / self.id) # TODO check with URL diff --git a/test/test_read.py b/test/test_read.py index 315be15..0d694ae 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -27,7 +27,6 @@ import zipfile from pathlib import Path -from rocrate import model from rocrate.rocrate import ROCrate, Subcrate from rocrate.model import DataEntity, ContextEntity, File, Dataset @@ -206,38 +205,33 @@ def test_crate_with_subcrate(test_data_dir): assert main_crate.subcrate_entities == [subcrate] # Check the subcrate kept the conformsTo attribute from the original Dataset entity - assert "conformsTo" in subcrate + assert subcrate.get("conformsTo") == "https://w3id.org/ro/crate/" # check that at this point, we have not yet loaded the subcrate - # e.g the json ld should just have id, type and conformsTo - jsonld = subcrate._jsonld - jsonld.pop("conformsTo") + assert subcrate._crate is None - assert jsonld == subcrate._empty() - assert "hasPart" not in subcrate + # check access from the top-level crate + subfile = main_crate.get("subcrate/subfile.txt") + assert isinstance(subfile, File) - # check lazy loading by accessing an entity from the subcrate - list_subcrate_parts = subcrate.get("hasPart", []) - assert len(list_subcrate_parts) == 2 # subfile.txt and subsubcrate/ - assert isinstance(list_subcrate_parts[0], DataEntity) - assert "subfile.txt" in [e.id for e in list_subcrate_parts] - - # check access from the top-level crate works too - assert main_crate.get("subcrate/subfile.txt") in list_subcrate_parts + # check that the above dereferencing triggered lazy loading + assert isinstance(subcrate._crate, ROCrate) + assert subfile.id == "subfile.txt" + assert subfile.crate is not main_crate + assert subfile.crate is subcrate._crate # check with another nested rocrate - assert isinstance(main_crate.get("subcrate/subsubcrate/deepfile.txt"), model.file.File) - - # Check the hasPart of the subcrate lists the file and the subsubcrate - assert "hasPart" in subcrate - assert len(subcrate["hasPart"]) == 2 + assert isinstance(main_crate.get("subcrate/subsubcrate/deepfile.txt"), File) # reload the crate to "reset" the state to unloaded main_crate = load_crate_with_subcrate(test_data_dir) subcrate = main_crate.get("subcrate") + assert subcrate._crate is None - # as_jsonld should trigger loading of the subcrate - assert subcrate.as_jsonld() != subcrate._empty() + # get_crate() should trigger loading of the subcrate + nested_crate = subcrate.get_crate() + assert isinstance(nested_crate, ROCrate) + assert subcrate._crate is nested_crate @pytest.mark.parametrize("override", [False, True]) From 284c9edd4c4a34595e4b8907d4e9f89d33f7aaf0 Mon Sep 17 00:00:00 2001 From: simleo Date: Fri, 12 Dec 2025 08:35:56 +0100 Subject: [PATCH 23/25] no trailing slash in generic ro-crate profile, as per the spec --- rocrate/rocrate.py | 2 +- test/test-data/crate_with_subcrate/ro-crate-metadata.json | 4 ++-- .../crate_with_subcrate/subcrate/ro-crate-metadata.json | 2 +- test/test_read.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 9e0e5dc..97ce542 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -99,7 +99,7 @@ def pick_type(json_entity, type_map, fallback=None, parse_subcrate=False): if parse_subcrate and (list_profiles := get_norm_value(json_entity, "conformsTo")): for profile_ref in list_profiles: - if profile_ref.startswith("https://w3id.org/ro/crate/"): + if profile_ref.startswith("https://w3id.org/ro/crate"): return Subcrate return Dataset diff --git a/test/test-data/crate_with_subcrate/ro-crate-metadata.json b/test/test-data/crate_with_subcrate/ro-crate-metadata.json index 46aff37..ae5a7eb 100644 --- a/test/test-data/crate_with_subcrate/ro-crate-metadata.json +++ b/test/test-data/crate_with_subcrate/ro-crate-metadata.json @@ -31,8 +31,8 @@ "@id": "subcrate/", "@type": "Dataset", "conformsTo": { - "@id": "https://w3id.org/ro/crate/" + "@id": "https://w3id.org/ro/crate" } } ] -} \ No newline at end of file +} diff --git a/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json b/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json index 2bd736f..86273d1 100644 --- a/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json +++ b/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json @@ -28,7 +28,7 @@ "@id": "subsubcrate/", "@type": "Dataset", "conformsTo": { - "@id": "https://w3id.org/ro/crate/" + "@id": "https://w3id.org/ro/crate" } } ] diff --git a/test/test_read.py b/test/test_read.py index 0d694ae..abe0c62 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -205,7 +205,7 @@ def test_crate_with_subcrate(test_data_dir): assert main_crate.subcrate_entities == [subcrate] # Check the subcrate kept the conformsTo attribute from the original Dataset entity - assert subcrate.get("conformsTo") == "https://w3id.org/ro/crate/" + assert subcrate.get("conformsTo") == "https://w3id.org/ro/crate" # check that at this point, we have not yet loaded the subcrate assert subcrate._crate is None From ad51c860c37134f1cadf312ddc15e04f2180df36 Mon Sep 17 00:00:00 2001 From: simleo Date: Fri, 12 Dec 2025 09:52:59 +0100 Subject: [PATCH 24/25] reindent metadata files to reduce diffs --- test/test-data/crate_with_subcrate/ro-crate-metadata.json | 8 ++++++-- .../crate_with_subcrate/subcrate/ro-crate-metadata.json | 8 ++++++-- .../subcrate/subsubcrate/ro-crate-metadata.json | 4 +++- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/test/test-data/crate_with_subcrate/ro-crate-metadata.json b/test/test-data/crate_with_subcrate/ro-crate-metadata.json index ae5a7eb..89e7413 100644 --- a/test/test-data/crate_with_subcrate/ro-crate-metadata.json +++ b/test/test-data/crate_with_subcrate/ro-crate-metadata.json @@ -9,8 +9,12 @@ "license": "https://spdx.org/licenses/MIT.html", "datePublished": "2025-12-02T08:39:54+00:00", "hasPart": [ - {"@id": "file.txt"}, - {"@id":"subcrate/"} + { + "@id": "file.txt" + }, + { + "@id": "subcrate/" + } ] }, { diff --git a/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json b/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json index 86273d1..eca9026 100644 --- a/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json +++ b/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json @@ -6,8 +6,12 @@ "@type": "Dataset", "datePublished": "2025-12-02T08:39:54+00:00", "hasPart": [ - {"@id": "subfile.txt"}, - {"@id": "subsubcrate/"} + { + "@id": "subfile.txt" + }, + { + "@id": "subsubcrate/" + } ] }, { diff --git a/test/test-data/crate_with_subcrate/subcrate/subsubcrate/ro-crate-metadata.json b/test/test-data/crate_with_subcrate/subcrate/subsubcrate/ro-crate-metadata.json index 1b5390f..b552c79 100644 --- a/test/test-data/crate_with_subcrate/subcrate/subsubcrate/ro-crate-metadata.json +++ b/test/test-data/crate_with_subcrate/subcrate/subsubcrate/ro-crate-metadata.json @@ -6,7 +6,9 @@ "@type": "Dataset", "datePublished": "2025-12-02T08:39:54+00:00", "hasPart": [ - {"@id": "deepfile.txt"} + { + "@id": "deepfile.txt" + } ] }, { From d2649a218960a87ecb3cb37fe385dfef98380b78 Mon Sep 17 00:00:00 2001 From: simleo Date: Fri, 12 Dec 2025 12:27:28 +0100 Subject: [PATCH 25/25] don't use dereference to check for unlisted files --- rocrate/rocrate.py | 12 ++++++------ test/test_write.py | 24 ++++++++++++++++-------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/rocrate/rocrate.py b/rocrate/rocrate.py index 97ce542..c51710f 100644 --- a/rocrate/rocrate.py +++ b/rocrate/rocrate.py @@ -417,6 +417,10 @@ def get_entities(self): def _get_root_jsonld(self): self.root_dataset.properties() + def __contains__(self, entity_id): + canonical_id = self.resolve_id(entity_id) + return canonical_id in self.__entity_map + def dereference(self, entity_id, default=None): canonical_id = self.resolve_id(entity_id) @@ -563,7 +567,7 @@ def _copy_unlisted(self, top, base_path): for name in files: source = root / name rel = source.relative_to(top) - if not self.dereference(str(rel)): + if str(rel) not in self: dest = base_path / rel if not dest.exists() or not dest.samefile(source): shutil.copyfile(source, dest) @@ -621,7 +625,7 @@ def _stream_zip(self, chunk_size=8192, out_path=None): continue rel = source.relative_to(self.source) - if not self.dereference(str(rel)) and not str(rel) in listed_files: + if str(rel) not in self and not str(rel) in listed_files: with archive.open(str(rel), mode='w') as out_file, open(source, 'rb') as in_file: while chunk := in_file.read(chunk_size): out_file.write(chunk) @@ -891,10 +895,6 @@ def _load_subcrate(self): # parse_subcrate=True to load further nested RO-Crate (on-demand / lazily too) self._crate = ROCrate(self.source, parse_subcrate=True) - def write(self, base_path): - self.get_crate().write(base_path / self.id) - # TODO check with URL - def make_workflow_rocrate(workflow_path, wf_type, include_files=[], fetch_remote=False, cwl=None, diagram=None): diff --git a/test/test_write.py b/test/test_write.py index d82b2c2..3b55897 100644 --- a/test/test_write.py +++ b/test/test_write.py @@ -627,20 +627,28 @@ def test_write_zip_nested_dest(tmpdir, helpers): assert (unpack_path / "subdir" / "a b" / "j k" / "l m.txt").is_file() -def test_write_subcrate(test_data_dir, tmpdir): +@pytest.mark.parametrize("to_zip", [False, True]) +def test_write_subcrate(test_data_dir, tmpdir, to_zip): """Read the test crate with subcrate and write it to a new location. Check that the subcrate contents are correctly written.""" crate = ROCrate(test_data_dir / "crate_with_subcrate", parse_subcrate=True) - crate.write(tmpdir / "ro_crate_out") + out_path = tmpdir / "ro_crate_out" + if to_zip: + zip_path = tmpdir / 'ro_crate_out.crate.zip' + crate.write_zip(zip_path) + with zipfile.ZipFile(zip_path, "r") as zf: + zf.extractall(out_path) + else: + crate.write(out_path) - assert (tmpdir / "ro_crate_out" / "file.txt").is_file() - assert (tmpdir / "ro_crate_out" / "ro-crate-metadata.json").is_file() + assert (out_path / "file.txt").is_file() + assert (out_path / "ro-crate-metadata.json").is_file() - assert (tmpdir / "ro_crate_out" / "subcrate" / "ro-crate-metadata.json").is_file() - assert (tmpdir / "ro_crate_out" / "subcrate" / "subfile.txt").is_file() + assert (out_path / "subcrate" / "ro-crate-metadata.json").is_file() + assert (out_path / "subcrate" / "subfile.txt").is_file() - assert (tmpdir / "ro_crate_out" / "subcrate" / "subsubcrate" / "deepfile.txt").is_file() - assert (tmpdir / "ro_crate_out" / "subcrate" / "subsubcrate" / "ro-crate-metadata.json").is_file() + assert (out_path / "subcrate" / "subsubcrate" / "deepfile.txt").is_file() + assert (out_path / "subcrate" / "subsubcrate" / "ro-crate-metadata.json").is_file() @pytest.mark.parametrize("version", ["1.0", "1.1", "1.2"])