def pick_type(json_entity, type_map, fallback=None, parse_subcrate=False):
    """
    Select the class used to instantiate a JSON-LD entity from its ``@type``.

    :param json_entity: dict-like JSON-LD description of the entity; its
        ``@type`` may be a single string or a list of strings
    :param type_map: mapping from type name to class, scanned in iteration
        order; the first name found among the entity's types wins
    :param fallback: value returned when no name of ``type_map`` matches
    :param parse_subcrate: when true, a ``Dataset`` whose ``conformsTo``
        contains a reference starting with ``https://w3id.org/ro/crate``
        is mapped to ``Subcrate`` instead of ``Dataset``
    :return: the selected class, or ``fallback``
    :raises ValueError: if the entity has no ``@type`` key
    """
    try:
        declared = json_entity["@type"]
    except KeyError:
        raise ValueError(f'entity {json_entity["@id"]!r} has no @type')
    if not isinstance(declared, list):
        declared = [declared]
    # type names are compared after stripping surrounding whitespace
    type_names = {type_name.strip() for type_name in set(declared)}

    selected = next(
        (candidate for key, candidate in type_map.items() if key in type_names),
        None,
    )
    if not selected:
        return fallback
    if selected is not Dataset:
        return selected
    if not parse_subcrate:
        return Dataset

    # A Dataset is treated as a subcrate when one of its conformsTo
    # references points to an RO-Crate profile.
    # TODO find a better way to check the profiles?
    profile_refs = get_norm_value(json_entity, "conformsTo")
    if profile_refs and any(
        ref.startswith("https://w3id.org/ro/crate") for ref in profile_refs
    ):
        return Subcrate
    return Dataset
def __contains__(self, entity_id):
    """
    Support ``entity_id in crate``.

    The identifier is normalized with ``resolve_id`` and looked up in this
    crate's own entity map only; nested crates are not consulted here.

    :param entity_id: identifier of the entity to look for
    :return: True if an entity with that identifier is registered in this crate
    """
    return self.resolve_id(entity_id) in self.__entity_map
class Subcrate(Dataset):
    """
    Data entity representing an RO-Crate nested inside another RO-Crate.

    The nested crate is not read when the entity is created: it is loaded
    by the first call to ``get_crate()`` and reused afterwards.
    """

    def __init__(self, crate, source=None, dest_path=None, fetch_remote=False,
                 validate_url=False, properties=None, record_size=False):
        """
        Create the subcrate entity without loading the nested crate.

        :param crate: the parent crate
        :param source: the relative path to the subcrate, or its URL

        The remaining arguments are forwarded unchanged to the ``Dataset``
        initializer.
        """
        super().__init__(
            crate, source, dest_path, fetch_remote, validate_url,
            properties=properties, record_size=record_size,
        )
        # ROCrate instance giving access to the nested RO-Crate, or None
        # while it has not been loaded yet. Not to be confused with the
        # ``crate`` attribute, which refers to the parent crate. Callers
        # should go through get_crate() rather than read this directly.
        self._crate = None

    def get_crate(self) -> ROCrate:
        """
        Return the RO-Crate object referenced by this subcrate.

        The nested crate is loaded on the first call.
        """
        nested = self._crate
        if nested is None:
            self._load_subcrate()
            nested = self._crate
        # cast() only informs type checkers that the value is no longer None
        return cast(ROCrate, nested)

    def _load_subcrate(self):
        """
        Load the nested RO-Crate from the source path or URL.

        Does nothing when the nested crate is already loaded.
        """
        if self._crate is not None:
            return
        # parse_subcrate=True so that crates nested further down are
        # recognized as well (and, in turn, loaded on demand)
        self._crate = ROCrate(self.source, parse_subcrate=True)
{ + "@id": "subcrate/", + "@type": "Dataset", + "conformsTo": { + "@id": "https://w3id.org/ro/crate" + } + } + ] +} diff --git a/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json b/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json new file mode 100644 index 0000000..eca9026 --- /dev/null +++ b/test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json @@ -0,0 +1,39 @@ +{ + "@context": "https://w3id.org/ro/crate/1.1/context", + "@graph": [ + { + "@id": "./", + "@type": "Dataset", + "datePublished": "2025-12-02T08:39:54+00:00", + "hasPart": [ + { + "@id": "subfile.txt" + }, + { + "@id": "subsubcrate/" + } + ] + }, + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "about": { + "@id": "./" + }, + "conformsTo": { + "@id": "https://w3id.org/ro/crate/1.1" + } + }, + { + "@id": "subfile.txt", + "@type": "File" + }, + { + "@id": "subsubcrate/", + "@type": "Dataset", + "conformsTo": { + "@id": "https://w3id.org/ro/crate" + } + } + ] +} diff --git a/test/test-data/crate_with_subcrate/subcrate/subfile.txt b/test/test-data/crate_with_subcrate/subcrate/subfile.txt new file mode 100644 index 0000000..7b4d68d --- /dev/null +++ b/test/test-data/crate_with_subcrate/subcrate/subfile.txt @@ -0,0 +1 @@ +empty \ No newline at end of file diff --git a/test/test-data/crate_with_subcrate/subcrate/subsubcrate/deepfile.txt b/test/test-data/crate_with_subcrate/subcrate/subsubcrate/deepfile.txt new file mode 100644 index 0000000..c6cac69 --- /dev/null +++ b/test/test-data/crate_with_subcrate/subcrate/subsubcrate/deepfile.txt @@ -0,0 +1 @@ +empty diff --git a/test/test-data/crate_with_subcrate/subcrate/subsubcrate/ro-crate-metadata.json b/test/test-data/crate_with_subcrate/subcrate/subsubcrate/ro-crate-metadata.json new file mode 100644 index 0000000..b552c79 --- /dev/null +++ b/test/test-data/crate_with_subcrate/subcrate/subsubcrate/ro-crate-metadata.json @@ -0,0 +1,29 @@ +{ + "@context": "https://w3id.org/ro/crate/1.1/context", + 
"@graph": [ + { + "@id": "./", + "@type": "Dataset", + "datePublished": "2025-12-02T08:39:54+00:00", + "hasPart": [ + { + "@id": "deepfile.txt" + } + ] + }, + { + "@id": "ro-crate-metadata.json", + "@type": "CreativeWork", + "about": { + "@id": "./" + }, + "conformsTo": { + "@id": "https://w3id.org/ro/crate/1.1" + } + }, + { + "@id": "deepfile.txt", + "@type": "File" + } + ] +} diff --git a/test/test_model.py b/test/test_model.py index a86c25f..26b13d2 100644 --- a/test/test_model.py +++ b/test/test_model.py @@ -29,7 +29,7 @@ from pathlib import Path import pytest -from rocrate.rocrate import ROCrate +from rocrate.rocrate import ROCrate, Subcrate from rocrate.model import ( DataEntity, File, @@ -103,10 +103,11 @@ def test_data_entities(test_data_dir): crate = ROCrate() file_ = crate.add(File(crate, test_data_dir / 'sample_file.txt')) dataset = crate.add(Dataset(crate, test_data_dir / 'test_add_dir')) + subcrate = crate.add(Subcrate(crate, test_data_dir / 'crate-1.0')) data_entity = crate.add(DataEntity(crate, '#mysterious')) - assert set(crate.data_entities) == {file_, dataset, data_entity} + assert set(crate.data_entities) == {file_, dataset, subcrate, data_entity} part_ids = set(_["@id"] for _ in crate.root_dataset._jsonld["hasPart"]) - assert set(_.id for _ in (file_, dataset, data_entity)) <= part_ids + assert set(_.id for _ in (file_, dataset, subcrate, data_entity)) <= part_ids @pytest.mark.skipif(sys.platform == "darwin", reason="CI sometimes fails on macOS") diff --git a/test/test_read.py b/test/test_read.py index 359da13..abe0c62 100644 --- a/test/test_read.py +++ b/test/test_read.py @@ -27,7 +27,7 @@ import zipfile from pathlib import Path -from rocrate.rocrate import ROCrate +from rocrate.rocrate import ROCrate, Subcrate from rocrate.model import DataEntity, ContextEntity, File, Dataset _URL = ('https://raw.githubusercontent.com/ResearchObject/ro-crate-py/master/' @@ -192,6 +192,48 @@ def test_bad_crate(test_data_dir, tmpdir): ROCrate(crate_dir) +def 
def test_crate_with_subcrate(test_data_dir):
    """
    Check subcrate detection, lazy loading and lookups through the parent.

    The order of the assertions matters: the nested crate must still be
    unloaded before the first lookup that goes through it.
    """
    parent = load_crate_with_subcrate(test_data_dir)

    nested_entity = parent.get("subcrate")
    assert isinstance(nested_entity, Subcrate)
    assert parent.subcrate_entities == [nested_entity]

    # the entity keeps the conformsTo value of the original Dataset entity
    assert nested_entity.get("conformsTo") == "https://w3id.org/ro/crate"

    # nothing has asked for the nested crate yet, so it is still unloaded
    assert nested_entity._crate is None

    # resolving an id below the subcrate from the top-level crate ...
    inner_file = parent.get("subcrate/subfile.txt")
    assert isinstance(inner_file, File)

    # ... triggers loading of the nested crate, which owns the entity
    assert isinstance(nested_entity._crate, ROCrate)
    assert inner_file.id == "subfile.txt"
    assert inner_file.crate is not parent
    assert inner_file.crate is nested_entity._crate

    # a lookup that crosses two levels of nesting
    deep_file = parent.get("subcrate/subsubcrate/deepfile.txt")
    assert isinstance(deep_file, File)

    # read the crate again to get back to the unloaded state
    parent = load_crate_with_subcrate(test_data_dir)
    nested_entity = parent.get("subcrate")
    assert nested_entity._crate is None

    # get_crate() loads the nested crate and caches it on the entity
    loaded = nested_entity.get_crate()
    assert isinstance(loaded, ROCrate)
    assert nested_entity._crate is loaded
tmpdir, to_zip): + """Read the test crate with subcrate and write it to a new location. + Check that the subcrate contents are correctly written.""" + crate = ROCrate(test_data_dir / "crate_with_subcrate", parse_subcrate=True) + out_path = tmpdir / "ro_crate_out" + if to_zip: + zip_path = tmpdir / 'ro_crate_out.crate.zip' + crate.write_zip(zip_path) + with zipfile.ZipFile(zip_path, "r") as zf: + zf.extractall(out_path) + else: + crate.write(out_path) + + assert (out_path / "file.txt").is_file() + assert (out_path / "ro-crate-metadata.json").is_file() + + assert (out_path / "subcrate" / "ro-crate-metadata.json").is_file() + assert (out_path / "subcrate" / "subfile.txt").is_file() + + assert (out_path / "subcrate" / "subsubcrate" / "deepfile.txt").is_file() + assert (out_path / "subcrate" / "subsubcrate" / "ro-crate-metadata.json").is_file() + + @pytest.mark.parametrize("version", ["1.0", "1.1", "1.2"]) def test_write_version(tmpdir, helpers, version): basename = helpers.LEGACY_METADATA_FILE_NAME if version == "1.0" else helpers.METADATA_FILE_NAME