Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce local disk cache for references #1523

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,24 @@ task testWithDefaultReference(type: Test) {
}
}

// Runs only the tests tagged "refCache", with the reference cache pointed at the
// checked-in test resources using a flat (one-level) layout: the whole MD5 is the file name.
task testReferenceCache(type: Test) {
    description = 'Run tests which require one-level reference cache'
    jvmArgs('-Dsamjdk.ref_cache=src/test/resources/htsjdk/samtools/ref_cache/%s')

    tags {
        include('refCache')
    }
}

// Runs only the tests tagged "refCacheMultilevel", with the reference cache rooted in this
// task's temporary directory using a two-level layout: the first 2 MD5 characters name a
// sub-directory and the remaining characters name the file.
task testReferenceCacheMultilevel(type: Test) {
    description = 'Run tests which require multilevel reference cache'
    jvmArgs("-Dsamjdk.ref_cache=${temporaryDir}/%2s/%s")

    tags {
        include('refCacheMultilevel')
    }
}

test {
description = "Runs the unit tests other than the SRA tests"

Expand All @@ -122,10 +140,12 @@ test {
exclude "http"
exclude "sra"
exclude "ena"
exclude "refCache"
exclude "refCacheMultilevel"

if (!OperatingSystem.current().isUnix()) exclude "unix"
}
} dependsOn findScalaAndJavaTypes, testWithDefaultReference
} dependsOn findScalaAndJavaTypes, testWithDefaultReference, testReferenceCache, testReferenceCacheMultilevel


task testFTP(type: Test) {
Expand Down
17 changes: 17 additions & 0 deletions src/main/java/htsjdk/samtools/Defaults.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,21 @@ public class Defaults {
*/
public static final String CUSTOM_READER_FACTORY;

/**
* The name (without the "samjdk." prefix) of the system property which contains the location of a reference cache.
*/
public static final String REF_CACHE_PROPERTY_NAME = "ref_cache";

/**
* Pathname to a local disk directory housing a cache of reference files.
* The pathname can be constructed using %Ns and %s notation, where %Ns consumes the next N characters of the MD5sum and %s consumes all remaining characters.
* For example /local/ref_cache/%2s/%2s/%s will create 2 nested subdirectories with the filenames in
* the deepest directory being the last 28 characters of the md5sum.
* The system property corresponds to the REF_CACHE environment variable implemented by samtools
* (see http://www.htslib.org/doc/samtools.html#ENVIRONMENT_VARIABLES).
*/
public static final String REF_CACHE;

/**
* Boolean describing whether downloading a reference file is allowed (for CRAM files),
* in case the reference file is not specified by the user
Expand Down Expand Up @@ -128,6 +143,7 @@ public class Defaults {
NON_ZERO_BUFFER_SIZE = BUFFER_SIZE;
}
REFERENCE_FASTA = getFileProperty("reference_fasta", null);
REF_CACHE = getStringProperty(REF_CACHE_PROPERTY_NAME, "");
USE_CRAM_REF_DOWNLOAD = getBooleanProperty("use_cram_ref_download", false);
EBI_REFERENCE_SERVICE_URL_MASK = "https://www.ebi.ac.uk/ena/cram/md5/%s";
CUSTOM_READER_FACTORY = getStringProperty("custom_reader", "");
Expand All @@ -152,6 +168,7 @@ public static SortedMap<String, Object> allDefaults(){
result.put("BUFFER_SIZE", BUFFER_SIZE);
result.put("NON_ZERO_BUFFER_SIZE", NON_ZERO_BUFFER_SIZE);
result.put("REFERENCE_FASTA", REFERENCE_FASTA);
result.put("REF_CACHE", REF_CACHE);
result.put("USE_CRAM_REF_DOWNLOAD", USE_CRAM_REF_DOWNLOAD);
result.put("EBI_REFERENCE_SERVICE_URL_MASK", EBI_REFERENCE_SERVICE_URL_MASK);
result.put("CUSTOM_READER_FACTORY", CUSTOM_READER_FACTORY);
Expand Down
73 changes: 73 additions & 0 deletions src/main/java/htsjdk/samtools/cram/ref/ReferenceSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,15 @@
import java.io.InputStream;
import java.lang.ref.WeakReference;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
Expand Down Expand Up @@ -84,6 +87,8 @@ public ReferenceSource(final ReferenceSequenceFile rsFile) {
*<p><ul>
* <li>Defaults.REFERENCE_FASTA - the value of the system property "reference_fasta". If set,
* must refer to a valid reference file.</li>
* <li>Defaults.REF_CACHE - the value of the system property "ref_cache". Local disk directory
* where the reference files are cached.</li>
* <li>ENA Reference Service if it is enabled</li>
* </ul>
*/
Expand All @@ -98,6 +103,10 @@ public static CRAMReferenceSource getDefaultCRAMReferenceSource() {
"The file specified by the reference_fasta property does not exist: " + Defaults.REFERENCE_FASTA.getName());
}
}
else if (!Defaults.REF_CACHE.isEmpty()) {
log.info("REF_CACHE is set, so attempting to find reference file in disk cache.");
return new ReferenceSource((ReferenceSequenceFile)null);
}
else if (Defaults.USE_CRAM_REF_DOWNLOAD) {
log.info("USE_CRAM_REF_DOWNLOAD=true, so attempting to download reference file as needed.");
return new ReferenceSource((ReferenceSequenceFile)null);
Expand Down Expand Up @@ -165,6 +174,14 @@ public synchronized byte[] getReferenceBases(final SAMSequenceRecord record,
}

{
if (!Defaults.REF_CACHE.isEmpty()) {
// try to fetch from local disk cache
bases = findBasesByMD5inLocalDiskCache(md5);
if (bases != null) {
return addToCache(md5, bases);
}
}

if (Defaults.USE_CRAM_REF_DOWNLOAD) { // try to fetch sequence by md5:
if (md5 != null) {
bases = findBasesByMD5(md5.toLowerCase());
Expand Down Expand Up @@ -220,6 +237,13 @@ private byte[] findBasesByMD5(final String md5) {

final String downloadedMD5 = SequenceUtil.calculateMD5String(data);
if (md5.equals(downloadedMD5)) {
if(!Defaults.REF_CACHE.isEmpty()) {
// save to local disk cache unless it is there already
Path refFile = pathToLocalDiskCache(md5);
if (!Files.exists(refFile)) {
Files.write(refFile, data);
}
}
return data;
} else {
final String message = String
Expand All @@ -236,6 +260,55 @@ private byte[] findBasesByMD5(final String md5) {
+ md5);
}

/**
 * Looks up reference bases in the local disk cache configured through {@code Defaults.REF_CACHE}.
 * <p>
 * The MD5 is lower-cased before the cache path is built so that the lookup uses the same key
 * as the download path, which stores freshly downloaded sequences under the lower-case MD5.
 *
 * @param md5 MD5 checksum of the wanted sequence; may be null
 * @return the content of the cached file, or null if the MD5 is null, the cache path cannot be
 *         built, no regular file exists at that path, or the file cannot be read
 */
private byte[] findBasesByMD5inLocalDiskCache(final String md5) {
    if (md5 == null) {
        // nothing to look up; callers treat null as a cache miss
        return null;
    }

    final Path refFile = pathToLocalDiskCache(md5.toLowerCase(Locale.ROOT));
    // pathToLocalDiskCache returns null for a malformed REF_CACHE; a directory is not a cache hit either
    if (refFile == null || !Files.isRegularFile(refFile)) {
        return null;
    }

    try {
        final byte[] bases = Files.readAllBytes(refFile);
        // only report a hit once the file has actually been read
        log.info("Found reference in local disk cache: " + refFile.toString());
        return bases;
    } catch (IOException e) {
        // the file exists but could not be read; report the real cause and fall back to a miss
        final String message = String
                .format("Failed to read reference from local disk cache: %s (%s)", refFile.toString(), e);
        log.error(message);
        return null;
    }
}

/** Directive that must directly follow a '%' in REF_CACHE: an optional decimal length, then 's'. */
private static final Pattern REF_CACHE_DIRECTIVE = Pattern.compile("(\\d*)s");

/**
 * Expands the {@code Defaults.REF_CACHE} template into the cache path for the given MD5.
 * <p>
 * The template is split on '%'. The text before the first '%' is copied verbatim. Every
 * following segment must start with a directive: {@code Ns} consumes the next N characters of
 * the MD5 (fewer if the MD5 is shorter), and {@code s} consumes all remaining characters. Any
 * literal text after the directive (for example a path separator) is copied verbatim.
 * For example {@code /cache/%2s/%s} maps {@code abcdef} to {@code /cache/ab/cdef}.
 *
 * @param md5 MD5 checksum used to fill in the template; may be null
 * @return the expanded path, or null if the MD5 is null or the template contains a '%' that is
 *         not followed by a valid directive
 */
private Path pathToLocalDiskCache(final String md5) {
    if (md5 == null) {
        return null;
    }

    final String[] segments = Defaults.REF_CACHE.split("%");
    // split() yields an empty array when the template consists solely of '%' characters
    final StringBuilder pathName = new StringBuilder(segments.length == 0 ? "" : segments[0]);

    String rem = md5;
    for (int i = 1; i < segments.length; i++) {
        final String segment = segments[i];
        final Matcher matcher = REF_CACHE_DIRECTIVE.matcher(segment);
        // the directive has to sit right after the '%', hence lookingAt() rather than find()
        if (!matcher.lookingAt()) {
            final String message = String
                    .format("Invalid REF_CACHE syntax: %s", Defaults.REF_CACHE);
            log.error(message);
            return null;
        }

        // a bare %s takes everything that is left of the MD5
        int take = rem.length();
        final String width = matcher.group(1);
        if (!width.isEmpty()) {
            try {
                // never ask for more characters than the MD5 still has
                take = Math.min(Integer.parseInt(width), rem.length());
            } catch (NumberFormatException e) {
                // the width does not fit in an int, so it is certainly longer than the MD5
                take = rem.length();
            }
        }

        pathName.append(rem, 0, take);
        // keep all literal text following the directive, not just its first character
        pathName.append(segment, matcher.end(), segment.length());
        rem = rem.substring(take);
    }
    return Paths.get(pathName.toString());
}

private static final Pattern chrPattern = Pattern.compile("chr.*",
Pattern.CASE_INSENSITIVE);

Expand Down
66 changes: 66 additions & 0 deletions src/test/java/htsjdk/samtools/cram/ref/ReferenceSourceTest.java
Original file line number Diff line number Diff line change
@@ -1,19 +1,33 @@
package htsjdk.samtools.cram.ref;

import htsjdk.HtsjdkTest;
import htsjdk.samtools.CRAMFileReader;
import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.InMemoryReferenceSequenceFile;
import htsjdk.samtools.util.SequenceUtil;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;

import static htsjdk.utils.SamtoolsTestUtils.getSamtoolsBin;
import static htsjdk.utils.SamtoolsTestUtils.isSamtoolsAvailable;

/**
* Created by vadim on 29/06/2017.
*/
public class ReferenceSourceTest extends HtsjdkTest{

// Directory holding the CRAM file read by the tests in this class.
private final String TEST_DIR = "src/test/resources/htsjdk/samtools/cram/";
// Directory holding the cached reference, stored in a file named after its MD5.
private final String REF_CACHE_DIR = "src/test/resources/htsjdk/samtools/ref_cache/";
// MD5 used both as the cached reference's file name and to derive the multilevel cache layout.
private final String MD5_REFERENCE = "7ddd8a4b4f2c1dec43476a738b1a9b72";
// Environment handed to the samtools process so that it resolves references from REF_CACHE_DIR.
private final String[] SAMTOOLS_ENVP = {"REF_CACHE=" + REF_CACHE_DIR};

@Test
public void testReferenceSourceUpperCasesBases() {
final String sequenceName = "1";
Expand All @@ -31,4 +45,56 @@ public void testReferenceSourceUpperCasesBases() {
Assert.assertNotEquals(refBasesFromSource, originalRefBases);
Assert.assertEquals(refBasesFromSource, SequenceUtil.upperCase(originalRefBases));
}

/**
 * Reads the first record of a CRAM file using the default reference source, which is expected
 * to resolve the reference from a one-level local disk cache.
 * Requires -Dsamjdk.ref_cache=src/test/resources/htsjdk/samtools/ref_cache/%s
 */
@Test(groups = {"refCache"})
public void testReferenceLocalDiskCache() {
    final CRAMReferenceSource referenceSource = ReferenceSource.getDefaultCRAMReferenceSource();
    final CRAMFileReader reader =
            new CRAMFileReader(new File(TEST_DIR + "auxf#values.3.0.cram"), referenceSource);

    // fetching the first record makes the reader find the reference by MD5
    reader.getIterator().next();
}

/**
 * Reads the first record of a CRAM file while the reference lives in a two-level local disk
 * cache: a sub-directory named after the first two MD5 characters containing a file named
 * after the remaining characters.
 * The cache root is taken from Defaults.REF_CACHE with the trailing template removed.
 *
 * @throws IOException if the temporary cache layout cannot be created or removed
 */
@Test(groups = {"refCacheMultilevel"})
public void testReferenceLocalDiskCacheWithSubdirectory() throws IOException {
    final String dirName = MD5_REFERENCE.substring(0, 2);
    final String refFileName = MD5_REFERENCE.substring(2);
    final Path tmpDir = Paths.get(Defaults.REF_CACHE.replaceFirst("%[0-9s/%]*", ""));
    Path subDir = null;
    try {
        // set up two-level cache in a temporary directory; createDirectories tolerates a
        // missing parent as well as a directory left behind by an aborted earlier run
        subDir = Files.createDirectories(tmpDir.resolve(dirName));
        final File cramFile = new File(TEST_DIR + "auxf#values.3.0.cram");
        final CRAMReferenceSource refSource = ReferenceSource.getDefaultCRAMReferenceSource();
        final CRAMFileReader cramFileReader = new CRAMFileReader(cramFile, refSource);

        // copy reference file to subDir removing the first two characters of the file name;
        // drop any stale copy first because Files.copy refuses to overwrite
        final Path sourceRef = Paths.get(REF_CACHE_DIR + MD5_REFERENCE);
        final Path cachedRef = subDir.resolve(refFileName);
        Files.deleteIfExists(cachedRef);
        Files.copy(sourceRef, cachedRef);

        // find reference by MD5
        cramFileReader.getIterator().next();
    } finally {
        // remove temporary resources
        if (subDir != null) {
            Files.deleteIfExists(subDir.resolve(refFileName));
            Files.deleteIfExists(subDir);
        }
    }
}

/**
 * Checks that samtools can decode the test CRAM file when pointed at the same reference cache
 * directory through its REF_CACHE environment variable.
 * Does nothing when samtools is not available.
 *
 * @throws IOException if the samtools process cannot be started or its output cannot be read
 * @throws InterruptedException if interrupted while waiting for samtools to exit
 */
@Test
public void testInteroperabilityWithSamtools() throws IOException, InterruptedException {
    if (isSamtoolsAvailable()) {
        final String commandString = getSamtoolsBin() + " view " + TEST_DIR + "auxf#values.3.0.cram";
        // provide path to reference in the REF_CACHE environment variable
        final Process process = Runtime.getRuntime().exec(commandString, SAMTOOLS_ENVP);

        // Read stdout before waiting: a child that fills the pipe buffer blocks until it is
        // drained, so calling waitFor() first could hang forever.
        final String firstLine;
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
            firstLine = reader.readLine();
            while (reader.readLine() != null) {
                // discard the rest so the process can run to completion
            }
        }

        // TestNG argument order is (actual, expected)
        Assert.assertEquals(process.waitFor(), 0);
        Assert.assertNotNull(firstLine, "samtools produced no output");
        Assert.assertTrue(firstLine.startsWith("Fred"));
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
GCTAGCTCAGAAAAAAAAAA