Skip to content

Commit 28af8a5

Browse files
committed
adjust PROV for potential metadata updates - works for workflow, but not single CommandLineTool (relates to common-workflow-language/cwltool#2082)
1 parent 2a5a3ee commit 28af8a5

File tree

3 files changed

+169
-94
lines changed

3 files changed

+169
-94
lines changed

config/weaver.ini.example

+1-1
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ weaver.wps_metadata_identification_keywords=Weaver,WPS,OGC
133133
# access constraints can be comma-separated
134134
weaver.wps_metadata_identification_accessconstraints=NONE
135135
weaver.wps_metadata_identification_fees=NONE
136-
weaver.wps_metadata_provider_name=CRIM
136+
weaver.wps_metadata_provider_name=Computer Research Institute of Montréal (CRIM)
137137
weaver.wps_metadata_provider_url=http://pavics-weaver.readthedocs.org/en/latest/
138138
weaver.wps_metadata_contact_name=Francis Charette-Migneault
139139
weaver.wps_metadata_contact_position=Research Software Developer

weaver/datatype.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -1554,6 +1554,8 @@ def links(self, container=None, self_link=None):
15541554
"title": "Job statistics collected following process execution."},
15551555
{"href": f"{job_url}/prov", "rel": "provenance", # unofficial
15561556
"title": "Job provenance collected following process execution."},
1557+
{"href": f"{job_url}/prov", "rel": "https://www.w3.org/ns/prov", # unofficial
1558+
"title": "Job provenance collected following process execution."},
15571559
])
15581560
else:
15591561
job_links.append({
@@ -1577,8 +1579,8 @@ def links(self, container=None, self_link=None):
15771579
job_links.extend([self_link_body, self_link_up])
15781580
link_meta = {"type": ContentType.APP_JSON, "hreflang": AcceptLanguage.EN_CA}
15791581
for link in job_links:
1580-
for meta, parma in link_meta.items():
1581-
link.setdefault(meta, parma)
1582+
for meta, param in link_meta.items():
1583+
link.setdefault(meta, param)
15821584
return job_links
15831585

15841586
def json(self, container=None): # pylint: disable=W0221,arguments-differ

weaver/processes/wps_package.py

+164-91
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
from cwltool.context import LoadingContext, RuntimeContext
3535
from cwltool.cwlprov import provenance_constants as cwl_prov_const
3636
from cwltool.cwlprov.ro import ResearchObject
37-
from cwltool.cwlprov.writablebagfile import close_ro
37+
from cwltool.cwlprov.writablebagfile import close_ro, packed_workflow
3838
from cwltool.factory import Factory as CWLFactory, WorkflowStatus as CWLException
3939
from cwltool.process import shortname, use_custom_schema
4040
from cwltool.secrets import SecretStore
@@ -183,6 +183,7 @@
183183
from cwltool.factory import Callable as CWLFactoryCallable
184184
from cwltool.process import Process as ProcessCWL
185185
from owslib.wps import WPSExecution
186+
from prov.model import ProvDocument
186187
from pywps.response.execute import ExecuteResponse
187188

188189
from weaver.datatype import Authentication, Job
@@ -1491,6 +1492,160 @@ def location(self, destination):
14911492
return self.storage.location(destination)
14921493

14931494

1495+
class WeaverResearchObject(ResearchObject):
    """
    Defines extended provenance details related to `Weaver` operations and referencing the active server instance.
    """

    def __init__(
        self,
        fs_access,
        temp_prefix_ro="tmp",
        orcid="",
        full_name="",
        ro_uuid=None,
        settings=None,
        job=None,
    ):
        # type: (StdFsAccess, str, str, str, Optional[uuid.UUID], Optional[AnySettingsContainer], Optional[Job]) -> None
        """
        Initialize the research object and override its identifiers with `Weaver`-specific references.

        :param fs_access: File system accessor forwarded to the parent :class:`ResearchObject`.
        :param temp_prefix_ro: Temporary directory prefix forwarded to the parent :class:`ResearchObject`.
        :param orcid: ORCID of the user forwarded to the parent :class:`ResearchObject`.
        :param full_name: Full name of the user forwarded to the parent :class:`ResearchObject`.
        :param ro_uuid: Identifier to employ for the research object. A random UUID is generated if omitted.
        :param settings: Application settings used to resolve the server URL and metadata.
        :param job: Job for which provenance is collected. When omitted, job-specific records are not generated.
        """
        super(WeaverResearchObject, self).__init__(fs_access, temp_prefix_ro, orcid, full_name)

        # rewrite auto-initialized random UUIDs with Weaver-specific references
        self.ro_uuid = ro_uuid or uuid.uuid4()
        self.base_uri = f"arcp://uuid,{self.ro_uuid}/"
        self.settings = settings
        # 'user_provenance' reads the job to describe the run; it must be available on the instance
        self.job = job

    # FIXME: improve "hook" strategy with CWLProv
    #   All the following is not called when the runner resolves to a 'cwltool.executors.SingleJobExecutor'
    #   (i.e.: when the CWL is a CommandLineTool). This is because its code sets the 'ProvenanceProfile' option
    #   with 'user_provenance=False' explicitly. Sadly, this is our only available "hook" mechanism for the time being.
    #   However, it works for a CWL Workflow since that variant passes down
    #   the 'user_provenance' option that we set (i.e.: 'ProvenanceProfile' created in 'cwltool.workflow.Workflow').
    #   see https://github.com/common-workflow-language/cwltool/pull/2082 for alternative to resolve "properly"
    def user_provenance(self, document):
        # type: (ProvDocument) -> None
        """
        Hook `Weaver` updates onto user provenance step.

        Because of how the :class:`ResearchObject` and :class:`cwltool.cwlprov.provenance_profile.ProvenanceProfile`
        definitions are passed around, invoked, and finalized in the :mod:`cwltool` operations, attempting to update
        the ``PROV`` definitions after execution of the tool would be too late, since all manifest and provenance files
        would already have been written to disk. Instead, hook ourselves to this function that is invoked before the
        procedure is completed to adjust or apply additional metadata.

        :param document: Provenance document to update in-place with the `Weaver` agents, entities and relationships.
        """
        self.self_check()

        # NOTE:
        #   the original operation does the following,
        #   but using the machine user means nothing in a docker running on a server
        #   (username, fullname) = _whoami()

        weaver_url = get_weaver_url(self.settings)
        weaver_desc = self.settings.get(
            "weaver.wps_metadata_identification_abstract",
            "Weaver OGC API Processes Server"
        )
        weaver_full_name = f"crim-ca/weaver:{weaver_version}"

        # for whatever reason, this is done by a local 'host_provenance' function
        # within the 'ProvenanceProfile.generate_prov_doc' method, and it applies
        # by default the machine host FQDN, which is irrelevant inside a docker container
        # instead, use actual information from the weaver server to populate the field
        # note: because it is done here, option 'host_provenance' MUST be disabled to avoid duplicates
        document.add_namespace(cwl_prov_const.UUID)
        document.add_namespace(cwl_prov_const.ORCID)
        document.add_namespace(cwl_prov_const.FOAF)
        cwltool_agent = document.agent(
            cwl_prov_const.ACCOUNT_UUID,
            {
                prov.constants.PROV_TYPE: cwl_prov_const.FOAF["OnlineAccount"],
                prov.constants.PROV_LABEL: weaver_desc,
                prov.constants.PROV_LOCATION: weaver_url,
                cwl_prov_const.FOAF["accountName"]: weaver_full_name,
            },
        )

        full_name = self.full_name or "undefined"
        user_agent = document.agent(
            self.orcid or cwl_prov_const.USER_UUID,  # actual user if provided or alias for machine
            {
                prov.constants.PROV_TYPE: prov.constants.PROV["Person"],
                prov.constants.PROV_LABEL: "User running the workflow job.",
                cwl_prov_const.FOAF["name"]: full_name,
                cwl_prov_const.FOAF["account"]: cwltool_agent,
            },
        )
        # cwltool may be started on the shell (directly by user),
        # by shell script (indirectly by user)
        # or from a different program
        # (which again is launched by any of the above)
        #
        # We can't tell in which way, but ultimately we're still
        # acting on behalf of that user (even if we might
        # get their name wrong!)
        document.actedOnBehalfOf(cwltool_agent, user_agent)

        document.add_namespace("doi", "https://doi.org/")
        # NOTE(review): 'get_namespace' could return 'None' if no matching namespace
        #   is registered in the document at this point - confirm against the 'prov' package.
        sha1_ns = document._namespaces.get_namespace("sha1")

        crim_name = "Computer Research Institute of Montréal"
        crim_entity = document.entity(
            "_:crim",
            {
                prov.constants.PROV_TYPE: prov.constants.PROV["Organization"],
                "foaf:name": crim_name,
                "schema:name": crim_name,
            }
        )

        server_provider_name = self.settings.get("weaver.wps_metadata_provider_name", weaver_url)
        server_provider_entity = document.entity(
            "_:server",
            {
                prov.constants.PROV_TYPE: prov.constants.PROV["Organization"],
                "foaf:name": server_provider_name,
                "schema:name": server_provider_name,
            }
        )

        # hashing requires bytes (a 'str' raises 'TypeError'), and the qualified name needs
        # the textual digest rather than the hash object itself
        # the digest is only employed as a stable identifier of the server, not for security purposes
        weaver_sha1 = hashlib.sha1(weaver_url.encode("utf-8")).hexdigest()  # nosec
        weaver_agent = document.agent(
            sha1_ns.qname(weaver_sha1),
            {
                prov.constants.PROV_TYPE: prov.constants.PROV["SoftwareAgent"],
                prov.constants.PROV_LOCATION: weaver_url,
                prov.constants.PROV_LABEL: weaver_full_name,
                # "prov:qualifiedPrimarySource":
                # "prov:Organization": "Computer Research Institute of Montréal (CRIM).",
                # "foaf:Project": "https://github.com/crim-ca/weaver",
                # "doi": "10.5281/zenodo.14210717"  # see CITATION.cff
            }
        )

        job = self.job
        job_entity = proc_entity = wf_agent = None
        if job is not None:
            # cross-ref: https://wf4ever.github.io/ro/wfprov.owl
            job_entity = document.entity(
                job.uuid,
                {
                    prov.constants.PROV_TYPE: cwl_prov_const.WFDESC["ProcessRun"],
                    prov.constants.PROV_LOCATION: job.job_url(self.settings),
                    prov.constants.PROV_LABEL: "Job Information",
                }
            )
            # NOTE(review): the process entity reuses the job UUID as identifier,
            #   which makes both entities share the same identifier - confirm whether this is intended.
            proc_entity = document.entity(
                job.uuid,
                {
                    prov.constants.PROV_TYPE: cwl_prov_const.WFDESC["Process"],
                    prov.constants.PROV_LOCATION: job.process_url(self.settings),
                    prov.constants.PROV_LABEL: "Process Description",
                }
            )

            wf_agent = document.get_record(self.engine_uuid)  # current job run aligned with cwl workflow

        # define relationships
        document.actedOnBehalfOf(weaver_agent, user_agent)
        document.specializationOf(weaver_agent, cwltool_agent)
        document.attribution(crim_entity, weaver_agent)
        document.wasDerivedFrom(cwltool_agent, weaver_agent)
        document.derivation(server_provider_entity, weaver_agent)

        if job is None:
            # without a job reference, only the server-level provenance can be described
            return

        document.wasStartedBy(job_entity, weaver_agent)
        document.wasStartedBy(wf_agent, job_entity, time=job.created)
        document.specializationOf(wf_agent, job_entity)
        document.alternateOf(wf_agent, job_entity)
        document.wasGeneratedBy(job_entity, proc_entity)
1647+
1648+
14941649
class WpsPackage(Process):
14951650
def __init__(
14961651
self,
@@ -1836,114 +1991,32 @@ def setup_provenance(self, loading_context, runtime_context):
18361991
return
18371992

18381993
loading_context.user_provenance = True
1839-
loading_context.host_provenance = True
1994+
loading_context.host_provenance = False # see 'WeaverResearchObject.user_provenance'
18401995

18411996
fs = runtime_context.make_fs_access or StdFsAccess
18421997
if not runtime_context.research_obj:
1843-
ro = ResearchObject(
1998+
ro = WeaverResearchObject(
18441999
fs(""),
18452000
temp_prefix_ro=runtime_context.tmpdir_prefix,
18462001
orcid=runtime_context.orcid,
18472002
full_name=runtime_context.cwl_full_name,
2003+
ro_uuid=self.job.uuid, # align the RO definition with the job (make the UUIDs logical)
2004+
settings=self.settings,
18482005
)
18492006

1850-
# rewrite auto-initialized random UUIDs with Weaver-specific references
1851-
ro.ro_uuid = self.job.uuid
1852-
ro.base_uri = f"arcp://uuid,{ro.ro_uuid}/"
1853-
18542007
loading_context.research_obj = ro
18552008
runtime_context.research_obj = ro
18562009

18572010
def finalize_provenance(self, runtime_context):
18582011
# type: (RuntimeContext) -> None
18592012
if runtime_context.research_obj:
1860-
ro = runtime_context.research_obj
1861-
prov_obj = runtime_context.prov_obj
1862-
1863-
# FIXME: all in try/except fails because 'prov_obj' is unset
1864-
# (operation already performed before we reach here! - find a way to hook ourselves during the operation)
1865-
# the actual creation of 'cwltool.cwlprov.provenance_profile.ProvenanceProfile'
1866-
# happens within one of the 'cwltool.executors.JobExecutor', which ends up
1867-
# calling 'process.parent_wf.finalize_prov_profile' directly before the end
1868-
# of 'cwltool.executors.JobExecutor.execute', which in turns generates all the PROV files
1869-
try:
1870-
prov_obj.document.add_namespace("doi", "https://doi.org/")
1871-
sha1_ns = prov_obj.document._namespaces.get_namespace("sha1")
1872-
1873-
crim_name = "Computer Research Institute of Montréal"
1874-
crim_entity = prov_obj.document.entity(
1875-
"_:crim",
1876-
{
1877-
prov.constants.PROV_TYPE: prov.constants.PROV["Organization"],
1878-
"foaf:name": crim_name,
1879-
"schema:name": crim_name,
1880-
}
1881-
)
1882-
1883-
weaver_url = get_weaver_url(self.settings)
1884-
weaver_sha1 = hashlib.sha1(weaver_url)
1885-
weaver_agent = prov_obj.document.agent(
1886-
sha1_ns.qname(weaver_sha1),
1887-
{
1888-
prov.constants.PROV_TYPE: prov.constants.PROV["SoftwareAgent"],
1889-
prov.constants.PROV_LOCATION: weaver_url,
1890-
prov.constants.PROV_LABEL: f"crim-ca/weaver {weaver_version}",
1891-
# "prov:qualifiedPrimarySource":
1892-
# "prov:Organization": "Computer Research Institute of Montréal (CRIM).",
1893-
# "foaf:Project": "https://github.com/crim-ca/weaver",
1894-
# "doi": "10.5281/zenodo.14210717" # see CITATION.cff
1895-
}
1896-
)
1897-
1898-
# cross-ref: https://wf4ever.github.io/ro/wfprov.owl
1899-
job_entity = prov_obj.document.entity(
1900-
self.job.uuid,
1901-
{
1902-
prov.constants.PROV_TYPE: cwl_prov_const.WFDESC["ProcessRun"],
1903-
prov.constants.PROV_LOCATION: self.job.job_url(self.settings),
1904-
prov.constants.PROV_LABEL: "Job Information",
1905-
}
1906-
)
1907-
proc_entity = prov_obj.document.entity(
1908-
self.job.uuid,
1909-
{
1910-
prov.constants.PROV_TYPE: cwl_prov_const.WFDESC["Process"],
1911-
prov.constants.PROV_LOCATION: self.job.process_url(self.settings),
1912-
prov.constants.PROV_LABEL: "Process Description",
1913-
}
1914-
)
1915-
1916-
cwl_agent = prov_obj.document.get_record(cwl_prov_const.ACCOUNT_UUID) # cwltool
1917-
usr_agent = prov_obj.document.get_record(cwl_prov_const.USER_UUID) # pseudo-user (machine user)
1918-
wf_agent = prov_obj.document.get_record(ro.engine_uuid) # current job run aligned with cwl workflow
1919-
1920-
# FIXME: patch override of 'host_provenance' since access through RO it is not possible
1921-
# (private function in cwltool.cwlprov.provenance_profile.ProvenanceProfile.generate_prov_doc
1922-
# cwl_agent.extend()
1923-
# document.agent(
1924-
# ACCOUNT_UUID,
1925-
# {
1926-
# PROV_TYPE: FOAF["OnlineAccount"],
1927-
# "prov:location": hostname,
1928-
# CWLPROV["hostname"]: hostname,
1929-
# },
1930-
# )
1931-
1932-
# define relationships
1933-
prov_obj.document.actedOnBehalfOf(weaver_agent, usr_agent)
1934-
prov_obj.document.specializationOf(weaver_agent, cwl_agent)
1935-
prov_obj.document.attribution(crim_entity, weaver_agent)
1936-
prov_obj.document.wasDerivedFrom(cwl_agent, weaver_agent)
1937-
# prov_obj.document.wasStartedBy(job_agent, weaver_agent)
1938-
prov_obj.document.wasStartedBy(wf_agent, job_entity, time=self.job.created)
1939-
# prov_obj.document.specializationOf(wf_agent, job_entity)
1940-
# prov_obj.document.alternateOf(wf_agent, job_entity)
1941-
except:
1942-
pass
2013+
# perform packaging of the workflow
2014+
packed_wf_str = repr_json(self.package, force_string=True, indent=2)
2015+
packed_workflow(runtime_context.research_obj, packed_wf_str)
19432016

19442017
# sign-off and persist completed PROV
19452018
prov_dir = self.job.prov_path(self.settings)
1946-
close_ro(ro, prov_dir)
2019+
close_ro(runtime_context.research_obj, prov_dir)
19472020

19482021
def update_requirements(self):
19492022
# type: () -> None

0 commit comments

Comments
 (0)