Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
91b3d6e
first attempt
LauLauThom Dec 2, 2025
8203025
move the Subcrate to rocrate main class
LauLauThom Dec 2, 2025
25b56d3
use get_norm_value instead of get + as_list
LauLauThom Dec 2, 2025
c21e0e0
add support for get with rocrate
LauLauThom Dec 2, 2025
0dbdb15
add simple tests
LauLauThom Dec 2, 2025
ce2a024
handle url for subcrates
LauLauThom Dec 2, 2025
fbe0eee
add flag parse_subcrate
LauLauThom Dec 3, 2025
bd02ee2
fix missing flag parse_rocrate
LauLauThom Dec 3, 2025
bf6f276
add subcrate_entities property
LauLauThom Dec 3, 2025
fd1fd53
support get of subcrate entity from top crate
LauLauThom Dec 3, 2025
9916641
load_subcrate as hidden function
LauLauThom Dec 3, 2025
22b34f2
add get_entities to subcrate
LauLauThom Dec 3, 2025
bf7e0ea
add Subcrate to test_data_entities
LauLauThom Dec 3, 2025
9e9a4cf
fix issue with nested crates
LauLauThom Dec 3, 2025
21556c8
keep conformsTo in Subcrate
LauLauThom Dec 4, 2025
d98b22f
use getter for inner crate
LauLauThom Dec 4, 2025
629bcfe
remove get_entities from Subcrate
LauLauThom Dec 4, 2025
771247d
remove subcrate.get_entities from tests
LauLauThom Dec 4, 2025
1368208
implement crate writing
LauLauThom Dec 5, 2025
3c77ad2
add test writing the subcrate
LauLauThom Dec 9, 2025
dfe45eb
test_write_subcrate: activate parse_subcrate
simleo Dec 11, 2025
6ea62fe
don't modify the main crate's jsonld when loading a subcrate
simleo Dec 11, 2025
284c9ed
no trailing slash in generic ro-crate profile, as per the spec
simleo Dec 12, 2025
ad51c86
reindent metadata files to reduce diffs
simleo Dec 12, 2025
d2649a2
don't use dereference to check for unlisted files
simleo Dec 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions rocrate/model/entity.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def __init__(self, crate, identifier=None, properties=None):
if name.startswith("@"):
self._jsonld[name] = value
else:
# this will call the __setitem__ method defined below
self[name] = value

@property
Expand Down
132 changes: 123 additions & 9 deletions rocrate/rocrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# limitations under the License.

import errno
from typing import cast
import uuid
import zipfile
import atexit
Expand Down Expand Up @@ -74,16 +75,37 @@ def is_data_entity(entity):
return DATA_ENTITY_TYPES.intersection(as_list(entity.get("@type", [])))


def pick_type(json_entity, type_map, fallback=None, parse_subcrate=False):
    """
    Choose the class used to instantiate a JSON-LD entity.

    :param json_entity: dict describing the entity; it must have an
        ``@type`` key (a string or a list of strings) and an ``@id`` key,
        the latter being used in the error message.
    :param type_map: mapping from type name to class; the first entry (in
        iteration order) whose name is among the entity types wins.
    :param fallback: value returned when no name in ``type_map`` matches.
    :param parse_subcrate: when true, an entity resolved to ``Dataset``
        whose ``conformsTo`` references an RO-Crate profile is reported as
        ``Subcrate`` instead.
    :return: the selected class, or ``fallback``.
    :raises ValueError: if the entity has no ``@type``.
    """
    try:
        t = json_entity["@type"]
    except KeyError:
        raise ValueError(f'entity {json_entity["@id"]!r} has no @type')
    # type names are compared after stripping surrounding whitespace
    types = {_.strip() for _ in (t if isinstance(t, list) else [t])}

    entity_class = next(
        (c for name, c in type_map.items() if name in types), None
    )
    if entity_class is None:
        return fallback

    # Only datasets can be subcrates, and only when the caller asked for
    # subcrate detection.
    if not parse_subcrate or entity_class is not Dataset:
        return entity_class

    # A dataset is a subcrate if one of its conformsTo entries matches an
    # RO-Crate profile.
    # presumably get_norm_value returns a list of identifier strings -- confirm
    # TODO find a better way to check the profiles?
    list_profiles = get_norm_value(json_entity, "conformsTo")
    if list_profiles and any(
        profile_ref.startswith("https://w3id.org/ro/crate")
        for profile_ref in list_profiles
    ):
        return Subcrate
    return Dataset


def get_version(metadata_properties):
Expand All @@ -96,10 +118,16 @@ def get_version(metadata_properties):

class ROCrate():

def __init__(self, source=None, gen_preview=False, init=False, exclude=None, version=DEFAULT_VERSION):
def __init__(self,
source=None,
gen_preview=False,
init=False, exclude=None,
version=DEFAULT_VERSION,
parse_subcrate=False):
self.mode = None
self.source = source
self.exclude = exclude
self.parse_subcrate = parse_subcrate
self.__entity_map = {}
# TODO: add this as @base in the context? At least when loading
# from zip
Expand Down Expand Up @@ -182,6 +210,14 @@ def __read_data_entities(self, entities, source, gen_preview):
self.__add_parts(parts, entities, source)

def __add_parts(self, parts, entities, source):
"""
Add entities to the crate from a list of entities id and Entity object.

:param self: Description
:param parts: a list of dicts (one dict per entity) in the form {@id : "entity_id"}
:param entities: a dict with the full list of entities information as in the hasPart of the root dataset of the crate.
:param source: Description
"""
type_map = OrderedDict((_.__name__, _) for _ in subclasses(FileOrDir))
for ref in parts:
id_ = ref['@id']
Expand All @@ -192,16 +228,28 @@ def __add_parts(self, parts, entities, source):
continue
entity = entities.pop(id_)
assert id_ == entity.pop('@id')
cls = pick_type(entity, type_map, fallback=DataEntity)
if cls is DataEntity:
cls = pick_type(entity, type_map, fallback=DataEntity, parse_subcrate=self.parse_subcrate)

if cls is Subcrate:

if is_url(id_):
instance = Subcrate(self, source=id_, properties=entity)
else:
instance = Subcrate(self, source=source / unquote(id_), properties=entity)

elif cls is DataEntity:
instance = DataEntity(self, identifier=id_, properties=entity)

else:
# cls is either a File or a Dataset (Directory)
if is_url(id_):
instance = cls(self, id_, properties=entity)
else:
instance = cls(self, source / unquote(id_), id_, properties=entity)
self.add(instance)
if instance.type == "Dataset":
# for Subcrate, type is currently Dataset too,
# but the hasPart is not populated yet only once accessing a subcrate element (lazy loading)
self.__add_parts(as_list(entity.get("hasPart", [])), entities, source)

def __read_contextual_entities(self, entities):
Expand Down Expand Up @@ -234,6 +282,11 @@ def contextual_entities(self):
if not isinstance(e, (RootDataset, Metadata, Preview))
and not hasattr(e, "write")]

@property
def subcrate_entities(self):
    """Return the entities registered in this crate that are ``Subcrate`` instances."""
    registered = self.__entity_map.values()
    return [entity for entity in registered if isinstance(entity, Subcrate)]

@property
def name(self):
    """Return the ``name`` property of the root dataset."""
    root = self.root_dataset
    return root.get('name')
Expand Down Expand Up @@ -364,9 +417,31 @@ def get_entities(self):
def _get_root_jsonld(self):
    """Call ``properties()`` on the root dataset."""
    # NOTE(review): the value produced by properties() is discarded, so this
    # method always returns None -- confirm whether a ``return`` is missing.
    self.root_dataset.properties()

def __contains__(self, entity_id):
    """
    Tell whether ``entity_id`` identifies an entity registered in this crate.

    The id is resolved with ``resolve_id`` and then checked against this
    crate's own entity map only.
    """
    return self.resolve_id(entity_id) in self.__entity_map

def dereference(self, entity_id, default=None):
    """
    Return the entity identified by ``entity_id``.

    The id is first resolved with ``resolve_id`` and looked up among the
    entities of this crate. If it is not found there, every subcrate whose
    id is a prefix of ``entity_id`` is asked for the remainder of the id
    (e.g. ``subcrate/subfile.txt`` is looked up as ``subfile.txt`` in the
    crate returned by the subcrate's ``get_crate()``).

    :param entity_id: id of the entity to look up.
    :param default: value returned when no entity is found.
    :return: the matching entity, or ``default``.
    """
    canonical_id = self.resolve_id(entity_id)
    if canonical_id in self.__entity_map:
        return self.__entity_map[canonical_id]

    for subcrate_entity in self.subcrate_entities:
        prefix = subcrate_entity.id
        # the entity can only live in this subcrate if its id starts with
        # the subcrate id, e.g. subcrate/subfile.txt
        if not entity_id.startswith(prefix):
            continue
        # strip the leading subcrate id by slicing: str.replace could also
        # alter a later occurrence of the prefix inside the id
        entity_id_in_subcrate = entity_id[len(prefix):]
        found = subcrate_entity.get_crate().get(entity_id_in_subcrate)
        # a miss in one subcrate must not end the search: another subcrate
        # whose id is also a prefix may hold the entity
        if found is not None:
            return found

    # fallback
    return default

get = dereference

Expand Down Expand Up @@ -492,7 +567,7 @@ def _copy_unlisted(self, top, base_path):
for name in files:
source = root / name
rel = source.relative_to(top)
if not self.dereference(str(rel)):
if str(rel) not in self:
dest = base_path / rel
if not dest.exists() or not dest.samefile(source):
shutil.copyfile(source, dest)
Expand Down Expand Up @@ -550,7 +625,7 @@ def _stream_zip(self, chunk_size=8192, out_path=None):
continue

rel = source.relative_to(self.source)
if not self.dereference(str(rel)) and not str(rel) in listed_files:
if str(rel) not in self and not str(rel) in listed_files:
with archive.open(str(rel), mode='w') as out_file, open(source, 'rb') as in_file:
while chunk := in_file.read(chunk_size):
out_file.write(chunk)
Expand Down Expand Up @@ -782,6 +857,45 @@ def __validate_suite(self, suite):
return suite


class Subcrate(Dataset):
    """
    Data entity representing an RO-Crate nested inside another RO-Crate.

    The nested crate is not read when the entity is created: it is loaded
    the first time ``get_crate()`` is called.
    """

    def __init__(self, crate, source=None, dest_path=None, fetch_remote=False,
                 validate_url=False, properties=None, record_size=False):
        """
        Data-entity representing a subcrate inside another RO-Crate.

        :param crate: The parent crate
        :param source: The relative path to the subcrate, or its URL

        The remaining arguments are passed unchanged to the ``Dataset``
        constructor.
        """
        super().__init__(crate, source, dest_path, fetch_remote,
                         validate_url, properties=properties, record_size=record_size)

        # ROCrate instance giving access to the nested RO-Crate; it stays
        # None until the nested crate is loaded on first access.
        # Not to be confused with the ``crate`` attribute, which refers to
        # the parent crate. Callers should go through get_crate() rather
        # than read this attribute directly.
        self._crate = None

    def get_crate(self) -> ROCrate:
        """
        Return the RO-Crate object referenced by this subcrate, loading it
        on first use.
        """
        if self._crate is None:
            self._load_subcrate()
        nested = cast(ROCrate, self._crate)
        return nested

    def _load_subcrate(self):
        """
        Load the nested RO-Crate from the source path or URL, unless it has
        already been loaded.
        """
        if self._crate is not None:
            return
        # parse_subcrate=True so that crates nested further down are
        # detected too (and, in turn, loaded on demand)
        self._crate = ROCrate(self.source, parse_subcrate=True)


def make_workflow_rocrate(workflow_path, wf_type, include_files=[],
fetch_remote=False, cwl=None, diagram=None):
wf_crate = ROCrate()
Expand Down
1 change: 1 addition & 0 deletions test/test-data/crate_with_subcrate/file.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
empty
42 changes: 42 additions & 0 deletions test/test-data/crate_with_subcrate/ro-crate-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"@context": "https://w3id.org/ro/crate/1.1/context",
"@graph": [
{
"@id": "./",
"@type": "Dataset",
"name": "Top-level crate with subcrate",
"description": "A RO-Crate containing a subcrate",
"license": "https://spdx.org/licenses/MIT.html",
"datePublished": "2025-12-02T08:39:54+00:00",
"hasPart": [
{
"@id": "file.txt"
},
{
"@id": "subcrate/"
}
]
},
{
"@id": "ro-crate-metadata.json",
"@type": "CreativeWork",
"about": {
"@id": "./"
},
"conformsTo": {
"@id": "https://w3id.org/ro/crate/1.1"
}
},
{
"@id": "file.txt",
"@type": "File"
},
{
"@id": "subcrate/",
"@type": "Dataset",
"conformsTo": {
"@id": "https://w3id.org/ro/crate"
}
}
]
}
39 changes: 39 additions & 0 deletions test/test-data/crate_with_subcrate/subcrate/ro-crate-metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"@context": "https://w3id.org/ro/crate/1.1/context",
"@graph": [
{
"@id": "./",
"@type": "Dataset",
"datePublished": "2025-12-02T08:39:54+00:00",
"hasPart": [
{
"@id": "subfile.txt"
},
{
"@id": "subsubcrate/"
}
]
},
{
"@id": "ro-crate-metadata.json",
"@type": "CreativeWork",
"about": {
"@id": "./"
},
"conformsTo": {
"@id": "https://w3id.org/ro/crate/1.1"
}
},
{
"@id": "subfile.txt",
"@type": "File"
},
{
"@id": "subsubcrate/",
"@type": "Dataset",
"conformsTo": {
"@id": "https://w3id.org/ro/crate"
}
}
]
}
1 change: 1 addition & 0 deletions test/test-data/crate_with_subcrate/subcrate/subfile.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
empty
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
empty
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"@context": "https://w3id.org/ro/crate/1.1/context",
"@graph": [
{
"@id": "./",
"@type": "Dataset",
"datePublished": "2025-12-02T08:39:54+00:00",
"hasPart": [
{
"@id": "deepfile.txt"
}
]
},
{
"@id": "ro-crate-metadata.json",
"@type": "CreativeWork",
"about": {
"@id": "./"
},
"conformsTo": {
"@id": "https://w3id.org/ro/crate/1.1"
}
},
{
"@id": "deepfile.txt",
"@type": "File"
}
]
}
7 changes: 4 additions & 3 deletions test/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from pathlib import Path

import pytest
from rocrate.rocrate import ROCrate
from rocrate.rocrate import ROCrate, Subcrate
from rocrate.model import (
DataEntity,
File,
Expand Down Expand Up @@ -103,10 +103,11 @@ def test_data_entities(test_data_dir):
crate = ROCrate()
file_ = crate.add(File(crate, test_data_dir / 'sample_file.txt'))
dataset = crate.add(Dataset(crate, test_data_dir / 'test_add_dir'))
subcrate = crate.add(Subcrate(crate, test_data_dir / 'crate-1.0'))
data_entity = crate.add(DataEntity(crate, '#mysterious'))
assert set(crate.data_entities) == {file_, dataset, data_entity}
assert set(crate.data_entities) == {file_, dataset, subcrate, data_entity}
part_ids = set(_["@id"] for _ in crate.root_dataset._jsonld["hasPart"])
assert set(_.id for _ in (file_, dataset, data_entity)) <= part_ids
assert set(_.id for _ in (file_, dataset, subcrate, data_entity)) <= part_ids


@pytest.mark.skipif(sys.platform == "darwin", reason="CI sometimes fails on macOS")
Expand Down
Loading