|
34 | 34 | from cwltool.context import LoadingContext, RuntimeContext
|
35 | 35 | from cwltool.cwlprov import provenance_constants as cwl_prov_const
|
36 | 36 | from cwltool.cwlprov.ro import ResearchObject
|
37 |
| -from cwltool.cwlprov.writablebagfile import close_ro |
| 37 | +from cwltool.cwlprov.writablebagfile import close_ro, packed_workflow |
38 | 38 | from cwltool.factory import Factory as CWLFactory, WorkflowStatus as CWLException
|
39 | 39 | from cwltool.process import shortname, use_custom_schema
|
40 | 40 | from cwltool.secrets import SecretStore
|
|
183 | 183 | from cwltool.factory import Callable as CWLFactoryCallable
|
184 | 184 | from cwltool.process import Process as ProcessCWL
|
185 | 185 | from owslib.wps import WPSExecution
|
| 186 | + from prov.model import ProvDocument |
186 | 187 | from pywps.response.execute import ExecuteResponse
|
187 | 188 |
|
188 | 189 | from weaver.datatype import Authentication, Job
|
@@ -1491,6 +1492,160 @@ def location(self, destination):
|
1491 | 1492 | return self.storage.location(destination)
|
1492 | 1493 |
|
1493 | 1494 |
|
class WeaverResearchObject(ResearchObject):
    """
    Defines extended provenance details related to `Weaver` operations and referencing the active server instance.
    """

    def __init__(self, fs_access, temp_prefix_ro="tmp", orcid="", full_name="", ro_uuid=None, settings=None):
        # type: (StdFsAccess, str, str, str, uuid.UUID, AnySettingsContainer) -> None
        """
        Initialize the research object and override its identifiers with `Weaver`-specific references.

        :param fs_access: File system accessor forwarded to the parent research object.
        :param temp_prefix_ro: Temporary directory prefix forwarded to the parent research object.
        :param orcid: User ORCID forwarded to the parent research object.
        :param full_name: User full name forwarded to the parent research object.
        :param ro_uuid: Identifier to apply to this research object. A random UUID is generated if omitted.
        :param settings: Application settings used to resolve the server URL and metadata.
        """
        super(WeaverResearchObject, self).__init__(fs_access, temp_prefix_ro, orcid, full_name)

        # rewrite auto-initialized random UUIDs with Weaver-specific references
        self.ro_uuid = ro_uuid or uuid.uuid4()
        self.base_uri = f"arcp://uuid,{self.ro_uuid}/"
        self.settings = settings

    # FIXME: improve "hook" strategy with CWLProv
    #   all the following is not called when the runner resolves to a 'cwltool.executors.SingleJobExecutor'
    #   (ie: when the CWL is a CommandLineTool). This is because its code sets the 'ProvenanceProfile' option
    #   with 'user_provenance=False' explicitly. Sadly, this is our only available "hook" mechanism for the time being.
    #   However, it works for a CWL Workflow since that variant passes down
    #   the 'user_provenance' option that we set (ie: 'ProvenanceProfile' created in 'cwltool.workflow.Workflow').
    #   see https://github.com/common-workflow-language/cwltool/pull/2082 for alternative to resolve "properly"
    def user_provenance(self, document):
        # type: (ProvDocument) -> None
        """
        Hook `Weaver` updates onto user provenance step.

        Because of how the :class:`ResearchObject` and :class:`cwltool.cwlprov.provenance_profile.ProvenanceProfile`
        definitions are passed around, invoked, and finalized in the :mod:`cwltool` operations, attempting to update
        the ``PROV`` definitions after execution of the tool would be too late, since all manifest and provenance files
        would already have been written to disk. Instead, hook ourselves to this function that is invoked before the
        procedure is completed to adjust or apply additional metadata.

        :param document: Provenance document to extend with the server, user, job and process records.
        """
        self.self_check()

        # NOTE:
        #   the original operation does the following,
        #   but using the machine user means nothing in a docker running on a server
        #   (username, fullname) = _whoami()

        weaver_url = get_weaver_url(self.settings)
        weaver_desc = self.settings.get(
            "weaver.wps_metadata_identification_abstract",
            "Weaver OGC API Processes Server"
        )
        weaver_full_name = f"crim-ca/weaver:{weaver_version}"

        # for whatever reason, this is done by a local 'host_provenance' function
        # within the 'ProvenanceProfile.generate_prov_doc' method, and it applies
        # by default the machine host FQDN, which is irrelevant inside a docker container
        # instead, use actual information from the weaver server to populate the field
        # note: because it is done here, option 'host_provenance' MUST be disabled to avoid duplicates
        document.add_namespace(cwl_prov_const.UUID)
        document.add_namespace(cwl_prov_const.ORCID)
        document.add_namespace(cwl_prov_const.FOAF)
        cwltool_agent = document.agent(
            cwl_prov_const.ACCOUNT_UUID,
            {
                prov.constants.PROV_TYPE: cwl_prov_const.FOAF["OnlineAccount"],
                prov.constants.PROV_LABEL: weaver_desc,
                prov.constants.PROV_LOCATION: weaver_url,
                cwl_prov_const.FOAF["accountName"]: weaver_full_name,
            },
        )

        full_name = self.full_name or "undefined"
        user_agent = document.agent(
            self.orcid or cwl_prov_const.USER_UUID,  # actual user if provided or alias for machine
            {
                prov.constants.PROV_TYPE: prov.constants.PROV["Person"],
                prov.constants.PROV_LABEL: "User running the workflow job.",
                cwl_prov_const.FOAF["name"]: full_name,
                cwl_prov_const.FOAF["account"]: cwltool_agent,
            },
        )
        # cwltool may be started on the shell (directly by user),
        # by shell script (indirectly by user)
        # or from a different program
        # (which again is launched by any of the above)
        #
        # We can't tell in which way, but ultimately we're still
        # acting on behalf of that user (even if we might
        # get their name wrong!)
        document.actedOnBehalfOf(cwltool_agent, user_agent)

        document.add_namespace("doi", "https://doi.org/")
        # NOTE(review): relies on the private '_namespaces' member and on a 'sha1' namespace
        #   being resolvable through 'get_namespace' with this key - confirm against the 'prov' API
        sha1_ns = document._namespaces.get_namespace("sha1")

        crim_name = "Computer Research Institute of Montréal"
        crim_entity = document.entity(
            "_:crim",
            {
                prov.constants.PROV_TYPE: prov.constants.PROV["Organization"],
                "foaf:name": crim_name,
                "schema:name": crim_name,
            }
        )

        server_provider_name = self.settings.get("weaver.wps_metadata_provider_name", weaver_url)
        server_provider_entity = document.entity(
            "_:server",
            {
                prov.constants.PROV_TYPE: prov.constants.PROV["Organization"],
                "foaf:name": server_provider_name,
                "schema:name": server_provider_name,
            }
        )

        # 'hashlib' only accepts bytes, and the identifier must be built from
        # the hexadecimal digest string rather than from the hash object itself
        weaver_sha1 = hashlib.sha1(weaver_url.encode("utf-8")).hexdigest()
        weaver_agent = document.agent(
            sha1_ns[weaver_sha1],  # qualified name '<sha1-namespace>:<digest>'
            {
                prov.constants.PROV_TYPE: prov.constants.PROV["SoftwareAgent"],
                prov.constants.PROV_LOCATION: weaver_url,
                prov.constants.PROV_LABEL: weaver_full_name,
                # "prov:qualifiedPrimarySource":
                # "prov:Organization": "Computer Research Institute of Montréal (CRIM).",
                # "foaf:Project": "https://github.com/crim-ca/weaver",
                # "doi": "10.5281/zenodo.14210717"  # see CITATION.cff
            }
        )

        # NOTE(review): 'self.job' is not assigned by this class's '__init__';
        #   confirm that it is provided by the parent class or by the caller before this hook runs
        # NOTE(review): the job and process entities below share the same identifier ('self.job.uuid');
        #   confirm whether the process entity should use a distinct identifier
        # cross-ref: https://wf4ever.github.io/ro/wfprov.owl
        job_entity = document.entity(
            self.job.uuid,
            {
                prov.constants.PROV_TYPE: cwl_prov_const.WFDESC["ProcessRun"],
                prov.constants.PROV_LOCATION: self.job.job_url(self.settings),
                prov.constants.PROV_LABEL: "Job Information",
            }
        )
        proc_entity = document.entity(
            self.job.uuid,
            {
                prov.constants.PROV_TYPE: cwl_prov_const.WFDESC["Process"],
                prov.constants.PROV_LOCATION: self.job.process_url(self.settings),
                prov.constants.PROV_LABEL: "Process Description",
            }
        )

        wf_agent = document.get_record(self.engine_uuid)  # current job run aligned with cwl workflow

        # define relationships
        document.actedOnBehalfOf(weaver_agent, user_agent)
        document.specializationOf(weaver_agent, cwltool_agent)
        document.attribution(crim_entity, weaver_agent)
        document.wasDerivedFrom(cwltool_agent, weaver_agent)
        document.derivation(server_provider_entity, weaver_agent)
        document.wasStartedBy(job_entity, weaver_agent)
        document.wasStartedBy(wf_agent, job_entity, time=self.job.created)
        document.specializationOf(wf_agent, job_entity)
        document.alternateOf(wf_agent, job_entity)
        document.wasGeneratedBy(job_entity, proc_entity)
| 1648 | + |
1494 | 1649 | class WpsPackage(Process):
|
1495 | 1650 | def __init__(
|
1496 | 1651 | self,
|
@@ -1836,114 +1991,32 @@ def setup_provenance(self, loading_context, runtime_context):
|
1836 | 1991 | return
|
1837 | 1992 |
|
1838 | 1993 | loading_context.user_provenance = True
|
1839 |
| - loading_context.host_provenance = True |
| 1994 | + loading_context.host_provenance = False # see 'WeaverResearchObject.user_provenance' |
1840 | 1995 |
|
1841 | 1996 | fs = runtime_context.make_fs_access or StdFsAccess
|
1842 | 1997 | if not runtime_context.research_obj:
|
1843 |
| - ro = ResearchObject( |
| 1998 | + ro = WeaverResearchObject( |
1844 | 1999 | fs(""),
|
1845 | 2000 | temp_prefix_ro=runtime_context.tmpdir_prefix,
|
1846 | 2001 | orcid=runtime_context.orcid,
|
1847 | 2002 | full_name=runtime_context.cwl_full_name,
|
| 2003 | + ro_uuid=self.job.uuid, # align the RO definition with the job (make the UUIDs logical) |
| 2004 | + settings=self.settings, |
1848 | 2005 | )
|
1849 | 2006 |
|
1850 |
| - # rewrite auto-initialized random UUIDs with Weaver-specific references |
1851 |
| - ro.ro_uuid = self.job.uuid |
1852 |
| - ro.base_uri = f"arcp://uuid,{ro.ro_uuid}/" |
1853 |
| - |
1854 | 2007 | loading_context.research_obj = ro
|
1855 | 2008 | runtime_context.research_obj = ro
|
1856 | 2009 |
|
def finalize_provenance(self, runtime_context):
    # type: (RuntimeContext) -> None
    """
    Pack the workflow into the active research object and persist the completed provenance.

    Nothing is done when the runtime context holds no research object.
    """
    research_obj = runtime_context.research_obj
    if not research_obj:
        return

    # perform packaging of the workflow
    packed_workflow(research_obj, repr_json(self.package, force_string=True, indent=2))

    # sign-off and persist completed PROV
    close_ro(research_obj, self.job.prov_path(self.settings))
1947 | 2020 |
|
1948 | 2021 | def update_requirements(self):
|
1949 | 2022 | # type: () -> None
|
|
0 commit comments