1- import copy
2- import pdb
31import datetime
4- import logging
52import urllib
63import uuid
74import json
8- from io import BytesIO
95from pathlib import PurePath , PurePosixPath
10- from socket import getfqdn
116from typing import (
127 Any ,
138 Dict ,
14- Iterable ,
159 List ,
16- MutableMapping ,
1710 MutableSequence ,
1811 Optional ,
1912 Tuple ,
2316
2417from prov .identifier import Identifier
2518from prov .model import PROV , PROV_LABEL , PROV_TYPE , PROV_VALUE , ProvDocument , ProvEntity
26- from schema_salad .sourceline import SourceLine
27- from typing_extensions import TYPE_CHECKING
28- from tools .load_ga_export import load_ga_history_export , GalaxyJob , GalaxyDataset
19+ from tools .load_ga_export import load_ga_history_export , GalaxyJob
2920from ast import literal_eval
3021import os
3122
3627from rocrate .provenance_constants import (
3728 ACCOUNT_UUID ,
3829 CWLPROV ,
39- ENCODING ,
40- FOAF ,
4130 METADATA ,
4231 ORE ,
4332 PROVENANCE ,
4433 RO ,
4534 SCHEMA ,
4635 SHA1 ,
47- SHA256 ,
48- TEXT_PLAIN ,
4936 UUID ,
5037 WF4EVER ,
5138 WFDESC ,
5946# from rocrate.provenance import ResearchObject
6047
6148from pathlib import Path
62- import rocrate . rocrate as roc
49+
6350
def posix_path(local_path: str) -> str:
    """Return *local_path* rendered as a POSIX-style path string."""
    native = Path(local_path)
    return str(PurePosixPath(native))
6653
54+
def remove_escapes(s):
    """Strip ASCII control characters (0x01-0x1F) from *s* and return the result.

    Bug fix: the previous version discarded the value of ``s.translate(...)``
    (strings are immutable, so translate returns a new string), making the
    function a no-op that returned ``None``. The cleaned string is now
    returned; callers that ignored the old ``None`` return are unaffected.
    """
    # Control characters 0x01-0x1F (NUL 0x00 is deliberately excluded,
    # matching the original range(1, 32)).
    escapes = ''.join(chr(code) for code in range(1, 32))
    translator = str.maketrans('', '', escapes)
    return s.translate(translator)
59+
7160
7261def reassign (d ):
7362 for k , v in d .items ():
@@ -78,16 +67,17 @@ def reassign(d):
7867 except ValueError :
7968 pass
8069
70+
8171class ProvenanceProfile :
82- """
72+ """\
8373 Provenance profile.
8474
8575 Populated from a galaxy workflow export.
8676 """
8777
8878 def __init__ (
8979 self ,
90- ga_export : Dict ,
80+ ga_export : Dict ,
9181 full_name : str = None ,
9282 orcid : str = None ,
9383 # prov_name: str = None,
@@ -112,12 +102,11 @@ def __init__(
112102 self .base_uri = "arcp://uuid,%s/" % self .ro_uuid
113103 self .document = ProvDocument ()
114104 # TODO extract engine_uuid from galaxy, type: str
115- self .engine_uuid = "urn:uuid:%s" % uuid .uuid4 () #type: str
105+ self .engine_uuid = "urn:uuid:%s" % uuid .uuid4 () # type: str
116106 self .full_name = full_name
117107 self .workflow_run_uuid = run_uuid or uuid .uuid4 ()
118108 self .workflow_run_uri = self .workflow_run_uuid .urn # type: str
119-
120- # move to separate function
109+ # move to separate function
121110 metadata_export = load_ga_history_export (ga_export )
122111 self .generate_prov_doc ()
123112 self .jobs = []
@@ -143,7 +132,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
143132 # PROV_TYPE: FOAF["OnlineAccount"],
144133 # TODO: change how we register galaxy version, probably a declare_version func
145134 # self.galaxy_version = self.ga_export["jobs_attrs"][0]["galaxy_version"]
146- # TODO: change notation to already imported namespaces?
135+ # TODO: change notation to already imported namespaces?
147136 self .document .add_namespace ("wfprov" , "http://purl.org/wf4ever/wfprov#" )
148137 # document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
149138 self .document .add_namespace ("wfdesc" , "http://purl.org/wf4ever/wfdesc#" )
@@ -166,7 +155,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
166155 "provenance" , self .base_uri + posix_path (PROVENANCE ) + "/"
167156 )
168157 # TODO: use appropriate refs for ga_export and related inputs
169- ro_identifier_workflow = self .base_uri + "ga_export" + "/"
158+ ro_identifier_workflow = self .base_uri + "ga_export" + "/"
170159 self .wf_ns = self .document .add_namespace ("wf" , ro_identifier_workflow )
171160 ro_identifier_input = (
172161 self .base_uri + "ga_export/datasets#"
@@ -230,15 +219,15 @@ def declare_process(
230219 """Record the start of each Process."""
231220 if process_run_id is None :
232221 process_run_id = uuid .uuid4 ().urn
233-
234- cmd = ga_export_jobs_attrs ["command_line" ]
222+
223+ # cmd = ga_export_jobs_attrs["command_line"]
235224 process_name = ga_export_jobs_attrs ["tool_id" ]
236- tool_version = ga_export_jobs_attrs ["tool_version" ]
225+ # tool_version = ga_export_jobs_attrs["tool_version"]
237226 prov_label = "Run of ga_export/jobs_attrs.txt#" + process_name
238227 start_time = ga_export_jobs_attrs ["create_time" ]
239228 end_time = ga_export_jobs_attrs ["update_time" ]
240229
241- #TODO: Find out how to include commandline as a string
230+ # TODO: Find out how to include commandline as a string
242231 # cmd = self.document.entity(
243232 # uuid.uuid4().urn,
244233 # {PROV_TYPE: WFPROV["Artifact"], PROV_LABEL: ga_export_jobs_attrs["command_line"]}
@@ -249,9 +238,9 @@ def declare_process(
249238 start_time ,
250239 end_time ,
251240 {
252- PROV_TYPE : WFPROV ["ProcessRun" ],
253- PROV_LABEL : prov_label ,
254- #TODO: Find out how to include commandline as a string
241+ PROV_TYPE : WFPROV ["ProcessRun" ],
242+ PROV_LABEL : prov_label ,
243+ # TODO: Find out how to include commandline as a string
255244 # PROV_LABEL: cmd
256245 },
257246 )
@@ -279,7 +268,7 @@ def used_artefacts(
279268 base += "/" + process_name
280269 tool_id = process_metadata ["tool_id" ]
281270 base += "/" + tool_id
282- items = ["inputs" ,"outputs" ,"parameters" ]
271+ items = ["inputs" , "outputs" , "parameters" ]
283272 # print(process_metadata["params"])
284273 for item in items :
285274 # print(item)
@@ -293,8 +282,8 @@ def used_artefacts(
293282 value = json .loads (value )
294283 if isinstance (key , str ):
295284 key = key .replace ("|" , "_" )
296- if isinstance (value , str ):
297- val = value .replace ("|" , "_" )
285+ if isinstance (value , str ):
286+ value = value .replace ("|" , "_" )
298287
299288 prov_role = self .wf_ns [f"{ base } /{ key } " ]
300289
@@ -307,7 +296,6 @@ def used_artefacts(
307296
308297 # for artefact in value:
309298 try :
310- # pdb.set_trace()
311299 entity = self .declare_artefact (value )
312300 self .document .used (
313301 process_run_id ,
@@ -346,7 +334,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
346334 # byte_s = BytesIO(value)
347335 # data_file = self.research_object.add_data_file(byte_s)
348336 # FIXME: Don't naively assume add_data_file uses hash in filename!
349- data_id = "data:%s" % str (value ) # PurePosixPath(data_file).stem
337+ data_id = "data:%s" % str (value ) # PurePosixPath(data_file).stem
350338 return self .document .entity (
351339 data_id ,
352340 {PROV_TYPE : WFPROV ["Artifact" ], PROV_VALUE : str (value )},
@@ -383,7 +371,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
383371 )
384372
385373 if value .get ("class" ):
386- #_logger.warning("Unknown data class %s.", value["class"])
374+ # _logger.warning("Unknown data class %s.", value["class"])
387375 # FIXME: The class might be "http://example.com/somethingelse"
388376 coll .add_asserted_type (CWLPROV [value ["class" ]])
389377
@@ -393,7 +381,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
393381 # clean up unwanted characters
394382 if isinstance (key , str ):
395383 key = key .replace ("|" , "_" )
396- if isinstance (val , str ):
384+ if isinstance (val , str ):
397385 val = val .replace ("|" , "_" )
398386
399387 v_ent = self .declare_artefact (val )
@@ -440,7 +428,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
440428 # FIXME: list value does not support adding "@id"
441429 return coll
442430 except TypeError :
443- #_logger.warning("Unrecognized type %s of %r", type(value), value)
431+ # _logger.warning("Unrecognized type %s of %r", type(value), value)
444432 # Let's just fall back to Python repr()
445433 entity = self .document .entity (uuid .uuid4 ().urn , {PROV_LABEL : repr (value )})
446434 # self.research_object.add_uri(entity.identifier.uri)
@@ -455,7 +443,7 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
455443 if "checksum" in value :
456444 csum = cast (str , value ["checksum" ])
457445 (method , checksum ) = csum .split ("$" , 1 )
458- if method == SHA1 : # and self.research_object.has_data_file(checksum):
446+ if method == SHA1 : # and self.research_object.has_data_file(checksum):
459447 entity = self .document .entity ("data:" + checksum )
460448
461449 if not entity and "location" in value :
@@ -502,8 +490,8 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
502490
503491 # Check for secondaries
504492 for sec in cast (
505- # MutableSequence[CWLObjectType],
506- value .get ("secondaryFiles" , [])
493+ # MutableSequence[CWLObjectType],
494+ value .get ("secondaryFiles" , []) # noqa
507495 ):
508496 # TODO: Record these in a specializationOf entity with UUID?
509497 if sec ["class" ] == "File" :
@@ -524,8 +512,10 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
524512
525513 return file_entity , entity , checksum
526514
527- def declare_directory (self
528- # , value: CWLObjectType
515+ def declare_directory (
516+ self ,
517+ # value: CWLObjectType
518+ value
529519 ) -> ProvEntity :
530520 """Register any nested files/directories."""
531521 # FIXME: Calculate a hash-like identifier for directory
@@ -636,12 +626,11 @@ def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
636626 # checksum = PurePosixPath(data_file).name
637627 # FIXME: Don't naively assume add_data_file uses hash in filename!
638628 value = str (value ).replace ("|" , "_" )
639- data_id = "data:%s" % str (value ) # PurePosixPath(data_file).stem
629+ data_id = "data:%s" % str (value ) # PurePosixPath(data_file).stem
640630 entity = self .document .entity (
641631 data_id , {PROV_TYPE : WFPROV ["Artifact" ], PROV_VALUE : str (value )}
642632 ) # type: ProvEntity
643- return entity #, checksum
644-
633+ return entity # , checksum
645634
646635 def generate_output_prov (
647636 self ,
@@ -724,7 +713,7 @@ def activity_has_provenance(self, activity, prov_ids):
724713 self .document .activity (activity , other_attributes = attribs )
725714 # Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention
726715 # as prov:mentionOf() is only for entities, not activities
727- uris = [i .uri for i in prov_ids ]
716+ # uris = [i.uri for i in prov_ids]
728717 # self.research_object.add_annotation(activity, uris, PROV["has_provenance"].uri)
729718
730719 def finalize_prov_profile (self , name = None , out_path = None ):
@@ -759,7 +748,7 @@ def finalize_prov_profile(self, name=None, out_path=None):
759748
760749 # https://www.w3.org/TR/prov-xml/
761750 # serialized_prov_docs["xml"] = self.document.serialize(format="xml", indent=4)
762- prov_ids .append (self .provenance_ns [filename + ".xml" ])
751+ prov_ids .append (self .provenance_ns [filename + ".xml" ])
763752 with open (basename + ".xml" , "w" ) as provenance_file :
764753 self .document .serialize (provenance_file , format = "xml" , indent = 4 )
765754
@@ -768,7 +757,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
768757 prov_ids .append (self .provenance_ns [filename + ".provn" ])
769758 with open (basename + ".provn" , "w" ) as provenance_file :
770759 self .document .serialize (provenance_file , format = "provn" , indent = 2 )
771-
772760
773761 # https://www.w3.org/Submission/prov-json/
774762 # serialized_prov_docs["json"] = self.document.serialize(format="json", indent=2)
@@ -799,7 +787,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
799787 prov_ids .append (self .provenance_ns [filename + ".jsonld" ])
800788 with open (basename + ".jsonld" , "w" ) as provenance_file :
801789 self .document .serialize (provenance_file , format = "rdf" , rdf_format = "json-ld" )
802-
803790
804- #_logger.debug("[provenance] added provenance: %s", prov_ids)
791+ # _logger.debug("[provenance] added provenance: %s", prov_ids)
805792 return (serialized_prov_docs , prov_ids )
0 commit comments