Skip to content

Commit

Permalink
- Resolve #29: pre-prime cache of lidvids → xml files rather than att…
Browse files Browse the repository at this point in the history
…empting to catalog them while also finding primaries (#38)

-   Also: make the pairing of lidvid → xml file unique to avoid redundant cache entries
        -   Use `insert or ignore`
    -   Add a functional test that compares generated SIP with "valid" SIP
        -   And add framework for unit+functional+integration tests
-   Test data accessible via setuptools `pkg_resources`
-   Resolve #27: An *attempt* to tag test builds for test PyPI with both a date and timestamp; this requires dropping `--tag-date` and coming up with our own date+time format—and hoping that PyPI doesn't enforce its more capricious requirements on release naming
-   Add running tests in the CI/CD GitHub Action
-   Add ability to run tests with `setup test` as well as buildout's `bin/test`
-   Fix extra newlines in log messages
-   Remove need of extra `.strip()` calls by using a smarter regexp
-   PEP-8 cleanups: remove needless `+` for string concatenation
-   Arg list documentation in function calls
-   Remove exec bit from test data files
  • Loading branch information
nutjob4life authored Apr 20, 2020
1 parent 7b81946 commit e28d3bf
Show file tree
Hide file tree
Showing 17 changed files with 255 additions and 39 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/integration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -106,14 +106,14 @@ jobs:
run: |
sudo apt-get install -y libxml2-dev libxslt1-dev
-
name: 💽 Building distribution
name: 💽 Building & testing distribution
run: |
rm -rf dist proddist testdist
python3 bootstrap.py
bin/buildout
bin/buildout setup . egg_info --tag-build .dev --tag-date sdist --dist-dir testdist
bin/test
bin/buildout setup . egg_info --tag-build .$(date --utc '+%Y%m%d%H%M%S') sdist --dist-dir testdist
bin/buildout setup . sdist --dist-dir proddist
# TODO: Put in unit+functional+integration testing here
-
name: 📇 Publishing to Test PyPI
uses: pypa/gh-action-pypi-publish@master
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
.*.swp
.DS_Store
__pycache__
.eggs
typescript
/*.tab
/*.xml
Expand Down
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@

graft src/pds
graft docs
include *.rst
include *.rst *.tab *.TAB *.xml *.xsd *.pdf
global-exclude *.pyc *.pyo
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
'pds-deep-archive=pds.aipgen.main:main'
]
},
test_suite='pds.aipgen.tests.test_suite',
namespace_packages=['pds'],
packages=find_packages('src', exclude=['docs', 'tests', 'bootstrap', 'ez_setup']),
package_dir={'': 'src'},
Expand Down
8 changes: 4 additions & 4 deletions src/pds/aipgen/aip.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def _writeLabel(
• ``xferNum`` — count of records in the transfer manifest file
'''

_logger.debug('🏷 Writing AIP label to %s\n', labelOutputFile)
_logger.debug('🏷 Writing AIP label to %s', labelOutputFile)
ts = datetime.utcnow()
ts = datetime(ts.year, ts.month, ts.day, ts.hour, ts.minute, ts.second, microsecond=0, tzinfo=None)

Expand Down Expand Up @@ -295,7 +295,7 @@ def process(bundle):
``bundle``, which is an open file stream (with a ``name`` attribute) on the local
filesystem. Return the name of the generated checksum manifest file.
'''
_logger.info('🏃‍♀️ Starting AIP generation for %s\n', bundle.name)
_logger.info('🏃‍♀️ Starting AIP generation for %s', bundle.name)
d = os.path.dirname(os.path.abspath(bundle.name))

# Get the bundle's primary collections and other useful info
Expand Down Expand Up @@ -330,7 +330,7 @@ def process(bundle):
_logger.info('🎉 Success! AIP done, files generated:')
_logger.info('• Checksum manifest: %s', chksumFN)
_logger.info('• Transfer manifest: %s', xferFN)
_logger.info('• XML label for them both: %s\n', labelFN)
_logger.info('• XML label for them both: %s', labelFN)
return chksumFN


Expand All @@ -347,7 +347,7 @@ def main():
logging.basicConfig(level=args.loglevel, format='%(levelname)s %(message)s')
_logger.debug('⚙️ command line args = %r', args)
process(args.bundle)
_logger.info('👋 Thanks for using this program! Bye!\n\n')
_logger.info('👋 Thanks for using this program! Bye!')
sys.exit(0)


Expand Down
2 changes: 1 addition & 1 deletion src/pds/aipgen/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def main():
args.bundle_base_url,
chksumStream
)
_logger.info("👋 That's it! Thanks for making an AIP and SIP with us today. Bye!\n\n")
_logger.info("👋 That's it! Thanks for making an AIP and SIP with us today. Bye!")
sys.exit(0)


Expand Down
74 changes: 46 additions & 28 deletions src/pds/aipgen/sip.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@

# Other constants and defaults:
_registryServiceURL = 'https://pds.nasa.gov/services/registry/pds' # Default registry service
_bufsiz = 512 # Buffer size for reading from URL con
_pLineMatcher = re.compile(r'^P,\s*(.+)') # Match P-lines in a tab file
_bufsiz = 512 # Buffer size for reading from URL con
_pLineMatcher = re.compile(r'^P,\s*([^\s]+)') # Match P-lines in a tab file

# TODO: Auto-generate from PDS4 IM
_providerSiteIDs = ['PDS_' + i for i in ('ATM', 'ENG', 'GEO', 'IMG', 'JPL', 'NAI', 'PPI', 'PSI', 'RNG', 'SBN')]
Expand All @@ -89,6 +89,7 @@

# Logging
# -------

_logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -165,10 +166,27 @@ def _getAssociatedProducts(root, filepath):
if not matches: return products
for m in matches:
products.add('file:' + os.path.join(filepath, m.text))

return products


def _createLidVidtoXMLFileTable(xmlFiles, con):
    '''Fill out the ``lidvids`` table so lidvids can later be looked up rapidly (the intent
    being future multiprocessing-enabled use) without re-running XPath over every XML file.

    • ``xmlFiles`` — iterable of paths of XML files to parse
    • ``con`` — database connection whose ``execute`` method is used for the inserts; the
      ``lidvids`` table must already exist, and committing is left to the caller

    For each file, the ``Identification_Area/logical_identifier`` and
    ``Identification_Area/version_id`` directly under the root element are joined as
    ``lid::vid`` and recorded against the file's path. Files lacking either element, or in
    which either element is empty or only whitespace, are skipped. ``INSERT OR IGNORE`` means
    an already-recorded (lidvid, xmlFile) pairing is silently left alone.
    '''
    # These paths don't vary per file, so build them once outside the loop
    lidPath = f'./{{{PDS_NS_URI}}}Identification_Area/{{{PDS_NS_URI}}}logical_identifier'
    vidPath = f'./{{{PDS_NS_URI}}}Identification_Area/{{{PDS_NS_URI}}}version_id'

    for xmlFile in xmlFiles:
        root = etree.parse(xmlFile).getroot()

        # ``findtext`` gives None when the element is missing and '' when it is present but
        # empty, so ``or ''`` folds both into a value that's safe to strip—an empty element
        # would otherwise blow up with an AttributeError on ``None.strip()``.
        lid = (root.findtext(lidPath) or '').strip()
        vid = (root.findtext(vidPath) or '').strip()
        if not lid or not vid:
            continue

        lidvid = lid + '::' + vid
        con.execute('''INSERT OR IGNORE INTO lidvids (lidvid, xmlFile) VALUES (?,?)''', (lidvid, xmlFile))


def _getLocalFileInfo(bundle, primaries, bundleLidvid, con):
'''Search all XML files (except for the ``bundle`` file) in the same directory as ``bundle``
and look for all XPath ``Product_Collection/Identification_Area/logical_identifier`` values
Expand All @@ -182,8 +200,6 @@ def _getLocalFileInfo(bundle, primaries, bundleLidvid, con):
have that "lidvid" and return then a mapping of lidvids to set of matching files, as ``file:``
URLs.
'''
# First get a set of all XML files under the same directory as ``bundle``

# I'll take a six-pack of tabs
lidvids = set()

Expand All @@ -198,6 +214,7 @@ def _getLocalFileInfo(bundle, primaries, bundleLidvid, con):
xmlFile text NOT NULL
)''')
cursor.execute('''CREATE INDEX IF NOT EXISTS lidvidIndex ON lidvids (lidvid)''')
cursor.execute('''CREATE UNIQUE INDEX lidvidPairing ON lidvids (lidvid, xmlFile)''')

# Add bundle to manifest
lidvidsToFiles[bundleLidvid] = {'file:' + bundle}
Expand All @@ -209,6 +226,8 @@ def _getLocalFileInfo(bundle, primaries, bundleLidvid, con):
# Locate all the XML files
for dirpath, dirnames, filenames in os.walk(root):
xmlFiles |= set([os.path.join(dirpath, i) for i in filenames if i.lower().endswith(PDS_LABEL_FILENAME_EXTENSION.lower())])
with con:
_createLidVidtoXMLFileTable(xmlFiles, con)

# Get the lidvids and inventory of files mentioned in each xml file
with con:
Expand All @@ -222,8 +241,6 @@ def _getLocalFileInfo(bundle, primaries, bundleLidvid, con):
for tab in tabs:
lidvids |= _getPLines(tab)
lidvidsToFiles[lidvid].add('file:' + tab)
for lidvid in lidvids:
con.execute('INSERT INTO lidvids (lidvid, xmlFile) VALUES (?,?)', (lidvid, xmlFile))

# Now go through each lidvid mentioned by the PLines in each inventory tab and find their xml files
for lidvid in lidvids:
Expand Down Expand Up @@ -265,7 +282,7 @@ def _writeTable(hashedFiles, hashName, manifest, offline, baseURL, basePathToRep
If ``offline`` mode, we transform all URLs written to the table by stripping off
everything except the last component (the file) and prepending the given ``baseURL``.
'''
hashish, size = hashlib.new('md5'), 0
hashish, size, hashName = hashlib.new('md5'), 0, hashName.upper()
for url, digest, lidvid in sorted(hashedFiles):
if offline:
if baseURL.endswith('/'):
Expand Down Expand Up @@ -397,9 +414,8 @@ def produce(bundle, hashName, registryServiceURL, insecureConnectionFlag, site,
# the future for sharing this DB amongst many processes for some fancy multiprocessing
with tempfile.NamedTemporaryFile() as dbfile:
con = sqlite3.connect(dbfile.name)
_logger.debug('→ Database file (deleted) is %sf', dbfile.name)

_logger.info('🏃‍♀️ Starting SIP generation for %s\n', bundle.name)
_logger.info('🏃‍♀️ Starting SIP generation for %s', bundle.name)

# Get the bundle path
bundle = os.path.abspath(bundle.name)
Expand All @@ -423,7 +439,7 @@ def produce(bundle, hashName, registryServiceURL, insecureConnectionFlag, site,
_writeLabel(bundleLID, bundleVID, title, md5, size, len(hashedFiles), hashName, manifestFileName, site, label, aipFile)
_logger.info('🎉 Success! From %s, generated these output files:', bundle)
_logger.info('• SIP Manifest: %s', manifestFileName)
_logger.info('• XML label for the SIP: %s\n', labelFileName)
_logger.info('• XML label for the SIP: %s', labelFileName)
return manifestFileName, labelFileName


Expand All @@ -448,7 +464,7 @@ def addSIParguments(parser):
# TODO: Temporarily setting offline to True by default until online mode is available
group.add_argument(
'-n', '--offline', default=True, action='store_true',
help='Run offline, scanning bundle directory for matching files instead of querying registry service.'+
help='Run offline, scanning bundle directory for matching files instead of querying registry service.'
' NOTE: By default, set to True until online mode is available.'
)

Expand All @@ -461,7 +477,7 @@ def addSIParguments(parser):
# TODO: Temporarily setting to be required by default until online mode is available
parser.add_argument(
'-b', '--bundle-base-url', required=True,
help='Base URL for Node data archive. This URL will be prepended to' +
help='Base URL for Node data archive. This URL will be prepended to'
' the bundle directory to form URLs to the products. For example,'
' if we are generating a SIP for mission_bundle/LADEE_Bundle_1101.xml,'
' and bundle-base-url is https://atmos.nmsu.edu/PDS/data/PDS4/LADEE/,'
Expand All @@ -471,8 +487,10 @@ def addSIParguments(parser):

def main():
'''Check the command-line for options and create a SIP from the given bundle XML'''
parser = argparse.ArgumentParser(description=_description,
formatter_class=argparse.RawDescriptionHelpFormatter)
parser = argparse.ArgumentParser(
description=_description,
formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument('--version', action='version', version=f'%(prog)s {_version}')
addSIParguments(parser)
addLoggingArguments(parser)
Expand All @@ -488,21 +506,21 @@ def main():
_logger.debug('⚙️ command line args = %r', args)
if args.offline and not args.bundle_base_url:
parser.error('--bundle-base-url is required when in offline mode (--offline).')
manifest, label = _produce(
args.bundle,
manifest, label = produce(
bundle=args.bundle,
# TODO: Temporarily hardcoding these values until other modes are available
# HASH_ALGORITHMS[args.algorithm],
# args.url,
# args.insecure,
HASH_ALGORITHMS['MD5'],
'',
'',
args.site,
args.offline,
args.bundle_base_url,
args.aip
# hashName=HASH_ALGORITHMS[args.algorithm],
# registryServiceURL=args.url,
# insecureConnectionFlag=args.insecure,
hashName=HASH_ALGORITHMS['MD5'],
registryServiceURL=None,
insecureConnectionFlag=False,
site=args.site,
offline=args.offline,
baseURL=args.bundle_base_url,
aipFile=args.aip
)
_logger.info('INFO 👋 All done. Thanks for making a SIP. Bye!\n\n')
_logger.info('👋 All done. Thanks for making a SIP. Bye!')
sys.exit(0)


Expand Down
12 changes: 12 additions & 0 deletions src/pds/aipgen/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,15 @@


'''PDS AIP-GEN Tests'''


import unittest
import pds.aipgen.tests.test_utils
import pds.aipgen.tests.test_functional


def test_suite():
    '''Return a single test suite that aggregates the suites of the ``test_utils`` and
    ``test_functional`` modules, in that order.
    '''
    combined = unittest.TestSuite()
    combined.addTest(pds.aipgen.tests.test_utils.test_suite())
    combined.addTest(pds.aipgen.tests.test_functional.test_suite())
    return combined
1 change: 1 addition & 0 deletions src/pds/aipgen/tests/data
71 changes: 71 additions & 0 deletions src/pds/aipgen/tests/test_functional.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# encoding: utf-8
#
# Copyright © 2020 California Institute of Technology ("Caltech").
# ALL RIGHTS RESERVED. U.S. Government sponsorship acknowledged.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# • Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# • Redistributions must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or other
# materials provided with the distribution.
# • Neither the name of Caltech nor its operating division, the Jet Propulsion
# Laboratory, nor the names of its contributors may be used to endorse or
# promote products derived from this software without specific prior written
# permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.


'''PDS AIP-GEN functional tests'''


import unittest, tempfile, shutil, os, pkg_resources, filecmp
from pds.aipgen.sip import produce


class SIPFunctionalTestCase(unittest.TestCase):
    '''Functional test case for SIP generation.

    Generates a SIP from the LADEE test bundle packaged alongside this module and compares the
    resulting manifest with a known-valid copy.

    TODO: factor this out so we can generically do AIP and other file-based functional tests too.
    '''
    def setUp(self):
        '''Open the test bundle, locate the valid manifest, and move into a scratch directory.

        Every resource is released through ``addCleanup`` instead of ``tearDown``: cleanups
        still run when a later line of ``setUp`` raises, whereas ``tearDown`` would be skipped
        and leave the stream open, the temporary directory behind, and the working directory
        changed for subsequent tests.
        '''
        super().setUp()
        self.input = pkg_resources.resource_stream(__name__, 'data/ladee_test/mission_bundle/LADEE_Bundle_1101.xml')
        self.addCleanup(self.input.close)
        self.valid = pkg_resources.resource_filename(__name__, 'data/ladee_test/valid/ladee_mission_bundle_sip_v1.0.tab')
        self.cwd, self.testdir = os.getcwd(), tempfile.mkdtemp()
        # Cleanups run last-in-first-out, so the removal is registered before the ``chdir``
        # back: at cleanup time we leave the scratch directory first, then delete it.
        self.addCleanup(shutil.rmtree, self.testdir, ignore_errors=True)
        # NOTE(review): presumably ``produce`` writes its output files relative to the current
        # directory, hence running the test from the scratch directory — confirm
        os.chdir(self.testdir)
        self.addCleanup(os.chdir, self.cwd)

    def test_sip_of_a_ladee(self):
        '''Test if the SIP manifest of LADEE bundle works as expected'''
        manifest, label = produce(
            bundle=self.input,
            hashName='md5',
            registryServiceURL=None,
            insecureConnectionFlag=True,
            site='PDS_ATM',
            offline=True,
            baseURL='https://atmos.nmsu.edu/PDS/data/PDS4/LADEE/',
            aipFile=None
        )
        # ``shallow=False`` forces a byte-for-byte comparison; the default may declare two
        # files equal purely because their ``os.stat`` signatures happen to match.
        self.assertTrue(
            filecmp.cmp(manifest, self.valid, shallow=False),
            "SIP manifest doesn't match the valid version"
        )


def test_suite():
    '''Return a suite of every test the default loader discovers in this module.'''
    loader = unittest.defaultTestLoader
    return loader.loadTestsFromName(__name__)
Loading

0 comments on commit e28d3bf

Please sign in to comment.