Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce local disk cache for references #1523

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/main/java/htsjdk/samtools/Defaults.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,14 @@ public class Defaults {
*/
public static final String CUSTOM_READER_FACTORY;

/**
* Pathname to a local disk directory housing a cache of reference files.
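* The pathname may contain %&lt;num&gt;s and %s placeholders: %&lt;num&gt;s is replaced by the next num characters
* of the MD5sum, and %s is replaced by all of the characters that have not been consumed yet.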
* For example /local/ref_cache/%2s/%2s/%s will create 2 nested subdirectories with the filenames in
* the deepest directory being the last 28 characters of the md5sum.
*/
public static final String REF_CACHE;

/**
* Boolean describing whether downloading a reference file is allowed (for CRAM files),
* in case the reference file is not specified by the user
Expand Down Expand Up @@ -128,6 +136,7 @@ public class Defaults {
NON_ZERO_BUFFER_SIZE = BUFFER_SIZE;
}
REFERENCE_FASTA = getFileProperty("reference_fasta", null);
REF_CACHE = getStringProperty("ref_cache", "");
USE_CRAM_REF_DOWNLOAD = getBooleanProperty("use_cram_ref_download", false);
EBI_REFERENCE_SERVICE_URL_MASK = "https://www.ebi.ac.uk/ena/cram/md5/%s";
CUSTOM_READER_FACTORY = getStringProperty("custom_reader", "");
Expand All @@ -152,6 +161,7 @@ public static SortedMap<String, Object> allDefaults(){
result.put("BUFFER_SIZE", BUFFER_SIZE);
result.put("NON_ZERO_BUFFER_SIZE", NON_ZERO_BUFFER_SIZE);
result.put("REFERENCE_FASTA", REFERENCE_FASTA);
result.put("REF_CACHE", REF_CACHE);
result.put("USE_CRAM_REF_DOWNLOAD", USE_CRAM_REF_DOWNLOAD);
result.put("EBI_REFERENCE_SERVICE_URL_MASK", EBI_REFERENCE_SERVICE_URL_MASK);
result.put("CUSTOM_READER_FACTORY", CUSTOM_READER_FACTORY);
Expand Down
69 changes: 69 additions & 0 deletions src/main/java/htsjdk/samtools/cram/ref/ReferenceSource.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,15 @@
import java.io.InputStream;
import java.lang.ref.WeakReference;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
Expand Down Expand Up @@ -84,6 +87,8 @@ public ReferenceSource(final ReferenceSequenceFile rsFile) {
*<p><ul>
* <li>Defaults.REFERENCE_FASTA - the value of the system property "reference_fasta". If set,
* must refer to a valid reference file.</li>
* <li>Defaults.REF_CACHE - the value of the system property "ref_cache". Local disk directory
* where the reference files are cached.</li>
* <li>ENA Reference Service if it is enabled</li>
* </ul>
*/
Expand All @@ -98,6 +103,10 @@ public static CRAMReferenceSource getDefaultCRAMReferenceSource() {
"The file specified by the reference_fasta property does not exist: " + Defaults.REFERENCE_FASTA.getName());
}
}
else if (!Defaults.REF_CACHE.isEmpty()) {
log.info("REF_CACHE is set, so attempting to find reference file in disk cache.");
return new ReferenceSource((ReferenceSequenceFile)null);
}
else if (Defaults.USE_CRAM_REF_DOWNLOAD) {
log.info("USE_CRAM_REF_DOWNLOAD=true, so attempting to download reference file as needed.");
return new ReferenceSource((ReferenceSequenceFile)null);
Expand Down Expand Up @@ -165,6 +174,14 @@ public synchronized byte[] getReferenceBases(final SAMSequenceRecord record,
}

{
if (!Defaults.REF_CACHE.isEmpty()) {
// try to fetch from local disk cache
bases = findBasesByMD5inLocalDiskCache(md5);
if (bases != null) {
return addToCache(md5, bases);
}
}

if (Defaults.USE_CRAM_REF_DOWNLOAD) { // try to fetch sequence by md5:
if (md5 != null) {
bases = findBasesByMD5(md5.toLowerCase());
Expand Down Expand Up @@ -220,6 +237,13 @@ private byte[] findBasesByMD5(final String md5) {

final String downloadedMD5 = SequenceUtil.calculateMD5String(data);
if (md5.equals(downloadedMD5)) {
if(!Defaults.REF_CACHE.isEmpty()) {
// save to local disk cache unless it is there already
Path refFile = pathToLocalDiskCache(md5);
if (!Files.exists(refFile)) {
Files.write(refFile, data);
}
}
return data;
} else {
final String message = String
Expand All @@ -236,6 +260,51 @@ private byte[] findBasesByMD5(final String md5) {
+ md5);
}

/**
 * Looks up a reference sequence in the local disk cache configured through {@code Defaults.REF_CACHE}.
 *
 * @param md5 MD5 checksum identifying the reference; may be null, in which case nothing is looked up
 * @return the bytes read from the cached file, or null when the checksum is null, the cache path
 *         cannot be built, the file does not exist, or the file cannot be read
 */
private byte[] findBasesByMD5inLocalDiskCache(final String md5) {
    if (md5 == null) {
        // getReferenceBases consults the disk cache before it checks the checksum for null
        return null;
    }

    // Downloaded references are saved under the lower-cased checksum, so look them up the same way.
    final Path refFile = pathToLocalDiskCache(md5.toLowerCase(Locale.ROOT));
    if (refFile == null || !Files.exists(refFile)) {
        // a null path means the REF_CACHE template was rejected; that has already been logged
        return null;
    }

    try {
        log.info("Found reference in local disk cache: " + refFile.toString());
        return Files.readAllBytes(refFile);
    } catch (IOException e) {
        // The file exists at this point, so this is a read failure rather than a missing file.
        final String message = String
                .format("Failed to read reference from local disk cache: %s (%s)", refFile.toString(), e);
        log.error(message);
        return null;
    }
}

/**
 * Matches the text that follows one '%' in the REF_CACHE template: an optional decimal width,
 * the letter 's', and then any literal text up to the next '%'.
 */
private static final Pattern REF_CACHE_PLACEHOLDER = Pattern.compile("(\\d*)s(.*)", Pattern.DOTALL);

/**
 * Expands the {@code Defaults.REF_CACHE} template into the cache file path for a checksum.
 * <p>
 * Text before the first '%' is copied verbatim. Each {@code %<num>s} is replaced by the next
 * {@code num} characters of the checksum (fewer if the checksum runs out), and each {@code %s}
 * is replaced by the characters not consumed so far. Literal text following a placeholder is
 * copied in full.
 *
 * @param md5 MD5 checksum of the reference; NOTE(review): expected to be non-null - confirm callers
 * @return the expanded path, or null (after logging an error) when a '%' is not followed by a
 *         valid placeholder
 */
private Path pathToLocalDiskCache(final String md5) {
    final String[] segments = Defaults.REF_CACHE.split("%");
    // initial segment of the path name up to the first "%"
    final StringBuilder pathName = new StringBuilder(segments[0]);
    String remaining = md5;

    for (int i = 1; i < segments.length; i++) {
        final Matcher matcher = REF_CACHE_PLACEHOLDER.matcher(segments[i]);
        // matches() anchors the placeholder at the start of the segment, so "%x2s" is rejected
        if (!matcher.matches()) {
            final String message = String
                    .format("Invalid REF_CACHE syntax: %s", Defaults.REF_CACHE);
            log.error(message);
            return null;
        }

        final String width = matcher.group(1);
        if (width.isEmpty()) {
            // %s: whatever is left of the checksum
            pathName.append(remaining);
        } else {
            // %<num>s: clamp to what is left so an over-long width cannot index past the end
            final int take = Math.min(Integer.parseInt(width), remaining.length());
            pathName.append(remaining, 0, take);
            remaining = remaining.substring(take);
        }
        // literal text (typically a path separator) between this placeholder and the next '%'
        pathName.append(matcher.group(2));
    }
    return Paths.get(pathName.toString());
}

private static final Pattern chrPattern = Pattern.compile("chr.*",
Pattern.CASE_INSENSITIVE);

Expand Down
44 changes: 44 additions & 0 deletions src/test/java/htsjdk/samtools/cram/ref/ReferenceSourceTest.java
Original file line number Diff line number Diff line change
@@ -1,19 +1,29 @@
package htsjdk.samtools.cram.ref;

import htsjdk.HtsjdkTest;
import htsjdk.samtools.CRAMFileReader;
import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.InMemoryReferenceSequenceFile;
import htsjdk.samtools.util.SequenceUtil;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;

/**
* Created by vadim on 29/06/2017.
*/
public class ReferenceSourceTest extends HtsjdkTest{

private final String TEST_DIR = "src/test/resources/htsjdk/samtools/cram/";
private final String MD5_REFERENCE = "7ddd8a4b4f2c1dec43476a738b1a9b72";

@Test
public void testReferenceSourceUpperCasesBases() {
final String sequenceName = "1";
Expand All @@ -31,4 +41,38 @@ public void testReferenceSourceUpperCasesBases() {
Assert.assertNotEquals(refBasesFromSource, originalRefBases);
Assert.assertEquals(refBasesFromSource, SequenceUtil.upperCase(originalRefBases));
}

/**
 * Reads the first record of a CRAM file whose reference is expected to be resolved by MD5
 * from a flat local disk cache (template {@code TEST_DIR + "%s"}).
 */
@Test
public void testReferenceLocalDiskCache() {
    // NOTE(review): Defaults.REF_CACHE is a static final; presumably it is read from the system
    // property only once, so this setting may have no effect if Defaults is already initialized - confirm.
    final String previousRefCache = System.setProperty("samjdk.ref_cache", TEST_DIR + "%s");
    try {
        final File cramFile = new File(TEST_DIR + "auxf#values.3.0.cram");
        final CRAMReferenceSource refSource = ReferenceSource.getDefaultCRAMReferenceSource();
        final CRAMFileReader cramFileReader = new CRAMFileReader(cramFile, refSource);

        // find reference by MD5, maps to "src/test/resources/htsjdk/samtools/cram/7ddd8a4b4f2c1dec43476a738b1a9b72"
        Assert.assertNotNull(cramFileReader.getIterator().next());
    } finally {
        // do not leak the property into tests that run afterwards
        if (previousRefCache == null) {
            System.clearProperty("samjdk.ref_cache");
        } else {
            System.setProperty("samjdk.ref_cache", previousRefCache);
        }
    }
}

/**
 * Reads the first record of a CRAM file whose reference is expected to be resolved by MD5
 * from a local disk cache with one level of subdirectories (template {@code TEST_DIR + "%2s/%s"}).
 * The cached reference is copied into place for the duration of the test and removed afterwards.
 *
 * @throws IOException if the temporary cache entry cannot be created or removed
 */
@Test
public void testReferenceLocalDiskCacheWithSubdirectory() throws IOException {
    // NOTE(review): Defaults.REF_CACHE is a static final; presumably it is read from the system
    // property only once, so this setting may have no effect if Defaults is already initialized - confirm.
    final String previousRefCache = System.setProperty("samjdk.ref_cache", TEST_DIR + "%2s/%s");

    // target is "src/test/resources/htsjdk/samtools/cram/7d/dd8a4b4f2c1dec43476a738b1a9b72"
    final Path subDir = Paths.get(TEST_DIR + MD5_REFERENCE.substring(0, 2));
    final Path cachedRef = subDir.resolve(MD5_REFERENCE.substring(2));
    try {
        final File cramFile = new File(TEST_DIR + "auxf#values.3.0.cram");
        final CRAMReferenceSource refSource = ReferenceSource.getDefaultCRAMReferenceSource();
        final CRAMFileReader cramFileReader = new CRAMFileReader(cramFile, refSource);

        // copy reference file into the subdirectory layout
        if (!Files.exists(subDir)) {
            Files.createDirectory(subDir);
        }
        // a copy left behind by an aborted run would otherwise make Files.copy fail
        Files.deleteIfExists(cachedRef);
        Files.copy(Paths.get(TEST_DIR + MD5_REFERENCE), cachedRef);

        // find reference by MD5
        Assert.assertNotNull(cramFileReader.getIterator().next());
    } finally {
        // remove temporary resources even when the test body fails
        try {
            Files.deleteIfExists(cachedRef);
            Files.deleteIfExists(subDir);
        } finally {
            if (previousRefCache == null) {
                System.clearProperty("samjdk.ref_cache");
            } else {
                System.setProperty("samjdk.ref_cache", previousRefCache);
            }
        }
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
GCTAGCTCAGAAAAAAAAAA