From 272143f09b6d980b7953d3528e3dc88d5efda19d Mon Sep 17 00:00:00 2001 From: Valentin Ruano Rubio Date: Thu, 3 May 2018 15:47:17 -0400 Subject: [PATCH] =?UTF-8?q?Now=20one=20can=20specify=20the=20expected=20in?= =?UTF-8?q?sert=20size=20distribution=20as=20supposed=E2=80=A6=20(#19)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Now one can specify the expected insert size distribution as supposed to allow BWA to infer it on the fly. This might have some value when re-aligning a small number of reads (inference may fail) or when one knows that there is a heterozygous variant and we actually now the expected insert size distribution; the input data would have a mixture of gaussians rather than the expected single gaussian. --- src/main/c/Makefile | 6 +- src/main/c/init.c | 30 +++ src/main/c/init.h | 15 ++ src/main/c/jnibwa.c | 4 +- src/main/c/jnibwa.h | 2 +- ...stitute_hellbender_utils_bwa_BwaMemIndex.c | 28 ++- .../hellbender/utils/bwa/BwaMemAligner.java | 30 ++- .../hellbender/utils/bwa/BwaMemIndex.java | 6 +- .../utils/bwa/BwaMemPairEndStats.java | 205 ++++++++++++++++++ .../hellbender/utils/bwa/BwaMemIndexTest.java | 27 ++- 10 files changed, 338 insertions(+), 15 deletions(-) create mode 100644 src/main/c/init.c create mode 100644 src/main/c/init.h create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/bwa/BwaMemPairEndStats.java diff --git a/src/main/c/Makefile b/src/main/c/Makefile index aa4b08c..7401366 100644 --- a/src/main/c/Makefile +++ b/src/main/c/Makefile @@ -19,7 +19,7 @@ JNI_BASE_NAME=org_broadinstitute_hellbender_utils_bwa_BwaMemIndex all: libbwa.$(LIB_EXT) -libbwa.$(LIB_EXT): $(JNI_BASE_NAME).o jnibwa.o bwa/libbwa.a +libbwa.$(LIB_EXT): $(JNI_BASE_NAME).o init.o jnibwa.o bwa/libbwa.a $(CC) -ggdb -dynamiclib -shared -o $@ $^ -lm -lz -lpthread bwa: @@ -31,7 +31,9 @@ bwa/libbwa.a: bwa $(JNI_BASE_NAME).o: $(JNI_BASE_NAME).c jnibwa.h bwa -jnibwa.o: jnibwa.c jnibwa.h bwa +jnibwa.o: jnibwa.c jnibwa.h init.h bwa + +init.o: init.c init.h clean: rm -rf bwa *.o *.$(LIB_EXT) diff --git a/src/main/c/init.c b/src/main/c/init.c new file mode 100644 index 0000000..e758d76 --- /dev/null +++ b/src/main/c/init.c @@ -0,0 +1,30 @@ +#include "init.h" + +jfieldID peStatClass_failedID; +jfieldID peStatClass_lowID; +jfieldID peStatClass_highID; +jfieldID peStatClass_averageID; +jfieldID peStatClass_stdID; + + +#define FAIL_IF_NULL(x) if (!(x)) return JNI_ERR; + +jint JNI_OnLoad(JavaVM* vm, void* reserved) { + + JNIEnv* env; + jclass peStatClass; + if ((*vm)->GetEnv(vm, &env, JNI_VERSION_1_8) != JNI_OK) { + return JNI_ERR; + } + + FAIL_IF_NULL(peStatClass = (*env)->FindClass(env, "org/broadinstitute/hellbender/utils/bwa/BwaMemPairEndStats")); + FAIL_IF_NULL(peStatClass_failedID = (*env)->GetFieldID(env, peStatClass, "failed", "Z")); + FAIL_IF_NULL(peStatClass_lowID = (*env)->GetFieldID(env, peStatClass, "low", "I")); + FAIL_IF_NULL(peStatClass_highID = (*env)->GetFieldID(env, peStatClass, "high", "I")); + FAIL_IF_NULL(peStatClass_averageID = (*env)->GetFieldID(env, peStatClass, "average", "D")); + FAIL_IF_NULL(peStatClass_stdID = (*env)->GetFieldID(env, peStatClass, "std", "D")); + (*env)->DeleteLocalRef(env, peStatClass); + + return JNI_VERSION_1_8; +} + diff --git a/src/main/c/init.h b/src/main/c/init.h new file mode 100644 index 0000000..1268ec3 --- /dev/null +++ b/src/main/c/init.h @@ -0,0 +1,15 @@ +#include + +#ifndef INIT_H +#define INIT_H + +extern jfieldID peStatClass_failedID; +extern jfieldID peStatClass_lowID; +extern jfieldID peStatClass_highID; +extern jfieldID peStatClass_averageID; +extern jfieldID peStatClass_stdID; + + +jint JNI_OnLoad(JavaVM* vm, void* reserved); + +#endif \ No newline at end of file diff --git a/src/main/c/jnibwa.c b/src/main/c/jnibwa.c index ae57867..e7005e2 100644 --- a/src/main/c/jnibwa.c +++ b/src/main/c/jnibwa.c @@ -194,7 +194,7 @@ void* jnibwa_getRefContigNames( bwaidx_t* pIdx, size_t* pBufSize ) { return bufMem; } -void* jnibwa_createAlignments( bwaidx_t* pIdx, mem_opt_t* pOpts, char* pSeq, size_t* pBufSize) { +void* jnibwa_createAlignments( bwaidx_t* pIdx, mem_opt_t* pOpts, mem_pestat_t* pPestat, char* pSeq, size_t* pBufSize) { char c = 0; char* emptyString = &c; uint32_t nSeqs = *(uint32_t*)pSeq; @@ -211,7 +211,7 @@ void* jnibwa_createAlignments( bwaidx_t* pIdx, mem_opt_t* pOpts, char* pSeq, siz pSeq += seqLen + 1; } - mem_process_seqs(pOpts, pIdx->bwt, pIdx->bns, pIdx->pac, 0, nSeqs, pSeq1Beg, 0); + mem_process_seqs(pOpts, pIdx->bwt, pIdx->bns, pIdx->pac, 0, nSeqs, pSeq1Beg, pPestat); size_t nInts = 0; for ( pSeq1 = pSeq1Beg; pSeq1 != pSeq1End; ++pSeq1 ) { diff --git a/src/main/c/jnibwa.h b/src/main/c/jnibwa.h index 736ffb5..cc98824 100644 --- a/src/main/c/jnibwa.h +++ b/src/main/c/jnibwa.h @@ -13,6 +13,6 @@ int jnibwa_createIndexFile( char const* refName, char const* imgSuffix ); bwaidx_t* jnibwa_openIndex( int fd ); int jnibwa_destroyIndex( bwaidx_t* pIdx ); void* jnibwa_getRefContigNames( bwaidx_t* pIdx, size_t* pBufSize ); -void* jnibwa_createAlignments( bwaidx_t* pIdx, mem_opt_t* pOpts, char* pSeq, size_t* pBufSize); +void* jnibwa_createAlignments( bwaidx_t* pIdx, mem_opt_t* pOpts, mem_pestat_t* peStats, char* pSeq, size_t* pBufSize); #endif /* JNIBWA_H_ */ diff --git a/src/main/c/org_broadinstitute_hellbender_utils_bwa_BwaMemIndex.c b/src/main/c/org_broadinstitute_hellbender_utils_bwa_BwaMemIndex.c index fa076d8..583a229 100644 --- a/src/main/c/org_broadinstitute_hellbender_utils_bwa_BwaMemIndex.c +++ b/src/main/c/org_broadinstitute_hellbender_utils_bwa_BwaMemIndex.c @@ -2,6 +2,7 @@ #include #include #include "jnibwa.h" +#include "init.h" #include "bwa/bwa_commit.h" @@ -17,6 +18,27 @@ jint throwIllegalArgumentException(JNIEnv* env, char* message) { return (*env)->ThrowNew(env, iaeClass, message); } +int jobject_to_mem_pestat_t(JNIEnv* env, jobject in, mem_pestat_t *out) { + if (in == NULL) { + return 0; + } + memset(out, 0, sizeof(mem_pestat_t) * 4); + for (int i = 0; i < 4; i++, out++) { + if (i == 1) { + out->failed = (int) (*env)->GetBooleanField(env, in, peStatClass_failedID); + if (!out->failed) { + out->low = (int) (*env)->GetIntField(env, in, peStatClass_lowID); + out->high = (int) (*env)->GetIntField(env, in, peStatClass_highID); + out->avg = (double) (*env)->GetDoubleField(env, in, peStatClass_averageID); + out->std = (double) (*env)->GetDoubleField(env, in, peStatClass_stdID); + } + } else { + out->failed = 1; + } + } + return 1; +} + JNIEXPORT jboolean JNICALL Java_org_broadinstitute_hellbender_utils_bwa_BwaMemIndex_createReferenceIndex( JNIEnv* env, jclass cls, jstring jReferenceFileName, jstring jIndexPrefix, jstring jAlgoName) { @@ -119,12 +141,14 @@ typedef struct { */ JNIEXPORT jobject JNICALL Java_org_broadinstitute_hellbender_utils_bwa_BwaMemIndex_createAlignments( - JNIEnv* env, jclass cls, jobject seqsBuf, jlong idxAddr, jobject optsBuf ) { + JNIEnv* env, jclass cls, jobject seqsBuf, jlong idxAddr, jobject optsBuf, jobject frPEStats ) { bwaidx_t* pIdx = (bwaidx_t*)idxAddr; mem_opt_t* pOpts = (*env)->GetDirectBufferAddress(env, optsBuf); + mem_pestat_t peStats[4]; + int pestatProvided = jobject_to_mem_pestat_t(env, frPEStats, peStats); char* pSeq = (*env)->GetDirectBufferAddress(env, seqsBuf); size_t bufSize = 0; - void* bufMem = jnibwa_createAlignments(pIdx, pOpts, pSeq, &bufSize); + void* bufMem = jnibwa_createAlignments(pIdx, pOpts, pestatProvided ? peStats : 0, pSeq, &bufSize); jobject alnBuf = (*env)->NewDirectByteBuffer(env, bufMem, bufSize); if ( !alnBuf ) free(bufMem); return alnBuf; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/bwa/BwaMemAligner.java b/src/main/java/org/broadinstitute/hellbender/utils/bwa/BwaMemAligner.java index 60af11c..382c52f 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/bwa/BwaMemAligner.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/bwa/BwaMemAligner.java @@ -3,6 +3,7 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.function.Function; @@ -20,6 +21,8 @@ public final class BwaMemAligner implements AutoCloseable { private final BwaMemIndex index; private ByteBuffer opts; + private BwaMemPairEndStats pairEndStats; + public BwaMemAligner( final BwaMemIndex index ) { this.index = index; if ( !index.isOpen() ) { @@ -27,6 +30,7 @@ public BwaMemAligner( final BwaMemIndex index ) { } opts = BwaMemIndex.createDefaultOptions(); opts.order(ByteOrder.nativeOrder()).position(0).limit(opts.capacity()); + pairEndStats = null; } public boolean isOpen() { return opts != null; } @@ -141,6 +145,30 @@ public void setIntraCtgOptions() { setClip3PenaltyOption(5); } + /** + * Indicate that we want bwa-mem to infer pair-end inter-size stat from the sequences given. + * Only has effect in paired alignment. + */ + public void inferPairEndStats() { + pairEndStats = null; + } + + /** + * Tells the aligner to avoid trying to infer inter-size stats and just go with bwamem defaults when + * that information is not-available. + */ + public void dontInferPairEndStats() { + pairEndStats = BwaMemPairEndStats.DO_NOT_INFER; + } + + /** + * Indicate the pair-end inter size stats for "properly" oriented read-pairs. + * @param stats + */ + public void setProperPairEndStats(final BwaMemPairEndStats stats) { + pairEndStats = stats; + } + public BwaMemIndex getIndex() { return index; } @@ -179,7 +207,7 @@ public List> alignSeqs( final Iterable iterable, fi contigBuf.put(func.apply(ele)).put((byte) 0); } contigBuf.flip(); - alignsBuf = index.doAlignment(contigBuf, tmpOpts); + alignsBuf = index.doAlignment(contigBuf, tmpOpts, pairEndStats); } finally { index.deRefIndex(); diff --git a/src/main/java/org/broadinstitute/hellbender/utils/bwa/BwaMemIndex.java b/src/main/java/org/broadinstitute/hellbender/utils/bwa/BwaMemIndex.java index 4675d84..4103f84 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/bwa/BwaMemIndex.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/bwa/BwaMemIndex.java @@ -407,8 +407,8 @@ public static String getBWAVersion() { return getVersion(); } - ByteBuffer doAlignment( final ByteBuffer seqs, final ByteBuffer opts ) { - final ByteBuffer alignments = createAlignments(seqs, indexAddress, opts); + ByteBuffer doAlignment( final ByteBuffer seqs, final ByteBuffer opts, final BwaMemPairEndStats peStats) { + final ByteBuffer alignments = createAlignments(seqs, indexAddress, opts, peStats); if ( alignments == null ) { throw new IllegalStateException("Unable to get alignments from bwa-mem index "+indexImageFile+": We don't know why."); } @@ -482,7 +482,7 @@ private static void loadNativeLibrary() { private static native int destroyIndex( long indexAddress ); static native ByteBuffer createDefaultOptions(); private static native ByteBuffer getRefContigNames( long indexAddress ); - private static native ByteBuffer createAlignments( ByteBuffer seqs, long indexAddress, ByteBuffer opts ); + private static native ByteBuffer createAlignments( ByteBuffer seqs, long indexAddress, ByteBuffer opts, BwaMemPairEndStats peStats); static native void destroyByteBuffer( ByteBuffer alignments ); private static native String getVersion(); } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/bwa/BwaMemPairEndStats.java b/src/main/java/org/broadinstitute/hellbender/utils/bwa/BwaMemPairEndStats.java new file mode 100644 index 0000000..34e0664 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/bwa/BwaMemPairEndStats.java @@ -0,0 +1,205 @@ +package org.broadinstitute.hellbender.utils.bwa; + +/** + * Equivalent to Bwa's mem_pestat_t struct. + * At the time this was written such a type declaration looked like this: + *
+ *    // bwamem.h
+ *    typedef struct {
+            int low, high;   // lower and upper bounds within which a read pair is considered to be properly paired
+            int failed;      // non-zero if the orientation is not supported by sufficient data
+            double avg, std; // mean and stddev of the insert size distribution
+     } mem_pestat_t;
+ * 
+ */ +public final class BwaMemPairEndStats { + + /** + * Default number of std. deviation from the mean for the lowest and highest insert size that is considered normal. + *

+ * Its value, {@value #DEFAULT_LOW_AND_HIGH_SIGMA}, corresponds to the one use in bwa-mem source code at the + * time this code was written. + *

+ */ + public static final int DEFAULT_LOW_AND_HIGH_SIGMA = 4; + + /** + * Default std. deviation to average ratio to use in case the deviation is not provided. + *

+ * Its value, {@value #DEFAULT_STD_TO_AVERAGE_RATIO}, corresponds to the one use in bwa-mem source code at the + * time this code was written. + *

+ */ + public static final double DEFAULT_STD_TO_AVERAGE_RATIO = .1; + + /** + * Constant to indicate that pair-end insert size inference failed or that it should not be inferred depending on + * the context. + */ + public static final BwaMemPairEndStats FAILED = new BwaMemPairEndStats(); + + /** + * Constant to indicate that pair-end insert size inference failed or hat is should not be inferred depending on + * the context. + */ + public static final BwaMemPairEndStats DO_NOT_INFER = FAILED; + + /** + * The shortest insert size that is considered normal. + *
+ * Only defined if {@link #failed} is {@code false}, and if so is always in the [1..{@link #average}] range. + */ + public final int low; + + /** + * The longest insert size that is considered normal. + *
+ * Only defined if {@link #failed} is {@code false}, and if so is always in the [#average .. +Inf) range. + */ + public final int high; + + /** + * Depending on the context, indicates iff the insert-size inference failed or + * if we are requesting not to perform such a inference. + *
+ * When this field is {@code true}, nothing can be expected on what values other fields take. + */ + public final boolean failed; + + /** + * The average insert size. + *
+ * Only defined if {@link #failed} is {@code false}, and if so is always in the range [1 .. +Inf). + */ + public final double average; + + /** + * The insert size std. deviation. + *
+ * Only defined if {@link #failed} is {@code false}, and if so is finite and equal or greater than 0. + */ + public final double std; + + + /** + * Composes a new instance given the estimated average and std. deviation + *

+ * low and high limits are calculated using the recipe use in bwa mem's code-base + * where the 4th sigma is used as the upper limit rounding it to the closest integer. + *

+ *

+ * The resulting stats object's {@link #failed} field will be set to {@code false}. + *

+ * @param average the insert size average estimate. + * @param std the insert size standard deviation estimate. + * @throws IllegalArgumentException if either input is {@link Double#NaN}, infinite. Also if the average is + * less than 1.0 and if the std. deviation is strictly negative. + */ + public BwaMemPairEndStats(final double average, final double std) { + this(average, std, + /* low = */ Math.max(1, (int) Math.round(average - DEFAULT_LOW_AND_HIGH_SIGMA * std)), + /* high = */ Math.max(1, (int) Math.round(average + DEFAULT_LOW_AND_HIGH_SIGMA * std))); + } + + /** + * Composes a new instance given the estimated average. + *

+ * The std. deviation would be calculated out of this mean using the default deviation + * to average ratio {@link #DEFAULT_STD_TO_AVERAGE_RATIO}. + *

+ *

+ * The resulting stats object's {@link #failed} field will be set to {@code false}. + *

+ *

+ * Low and high insert-sizes will be estimated using the approach described in {@link #BwaMemPairEndStats(double, double)}. + *

+ *

+ * The resulting stats object's {@link #failed} field will be set to {@code false}. + *

+ * @param average the insert size average estimate. + * @throws IllegalArgumentException if the input average is {@link Double#NaN}, infinite or less than 1. + */ + public BwaMemPairEndStats(final double average) { + this(average, average * DEFAULT_STD_TO_AVERAGE_RATIO); + } + + /** + * Composes a new instance giving arbitrary (albeit valid and consistent) values to all its numerical + * fields: {@link #average}, {@link #std}, {@link #low} and {@link #high}. + *

+ * The resulting stats object's {@link #failed} field will be set to {@code false}. + *

+ * @param average the insert-size average in [1 .. +Inf) range. + * @param std the insert-size std. dev in [0 .. +Inf) range. + * @param low the shortest insert-size to be considered "normal" [1 .. {@code average}] + * @param high the longest insert-size to be considered "normal" [{@code average} .. +Inf). + * + * @throws IllegalArgumentException if the values provided are invalid or inconsistent based on the constraints + * listed above. + */ + public BwaMemPairEndStats(final double average, final double std, final int low, final int high) { + if (Double.isNaN(average) || !Double.isFinite(average) || average < 1) { + throw new IllegalArgumentException("invalid input average: " + average); + } else if (Double.isNaN(std) || !Double.isFinite(std) || std < 0) { + throw new IllegalArgumentException("invalid std. err: " + std); + } else if (low > average) { + throw new IllegalArgumentException("the low limit cannot be larger than the average"); + } else if (high < average) { + throw new IllegalArgumentException("the high limit cannot be larger than the average"); + } else { + this.failed = false; + this.average = average; + this.std = std; + this.high = high; + this.low = low; + } + } + + /** + * Constructor used to create the singleton {@link #FAILED} instance. + */ + private BwaMemPairEndStats() { + this.failed = true; + this.average = Double.NaN; + this.std = Double.NaN; + this.low = Integer.MAX_VALUE; + this.high = Integer.MIN_VALUE; + } + + @Override + public String toString() { + if (failed) { + return "InsertSize ~ FAILED/DO_NOT_INFER"; + } + return String.format("InsertSize ~ N(%.2f, %.2f) in [%d, %d]", average, std, low, high); + } + + @Override + public boolean equals(final Object other) { + return other instanceof BwaMemPairEndStats && equals((BwaMemPairEndStats) other); + } + + @Override + public int hashCode() { + if (this.failed) { + return 0; + } else { + return ((((Double.hashCode(average) * 31) + + Double.hashCode(std) * 31) + + Integer.hashCode(low) * 31) + + Integer.hashCode(high) * 31); + } + } + + public boolean equals(final BwaMemPairEndStats other) { + if (other == null || this.failed != other.failed) { + return false; + } else { + return this.failed || (this.average == other.average && + this.std == other.std && + this.low == other.low && this.high == other.high); + } + } + + +} diff --git a/src/test/java/org/broadinstitute/hellbender/utils/bwa/BwaMemIndexTest.java b/src/test/java/org/broadinstitute/hellbender/utils/bwa/BwaMemIndexTest.java index dace57c..97ef55a 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/bwa/BwaMemIndexTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/bwa/BwaMemIndexTest.java @@ -81,13 +81,23 @@ void testMulti() { testAlignment(alignmentList.get(0), 70, 140, 0, 68, "32M2D36M", 2, 0); // 2-base deletion } - @Test - void testPair() { + @Test(dataProvider = "testPairData") + void testPair(final int defaultSetOrClearPEStats) { final List seqs = new ArrayList<>(); seqs.add("GGCTTTTAATGCTTTTCAGTGGTTGCTGCTCAAGATGGAGTCTACTCAGCAGATGGTAAGCTCTATTATT"); // ref.fa line 1 seqs.add("TTGTTTTTAACACCAGAGTCATCCATCACATAATCAAATTTACTTTTAACTCTGGTAAATACTTCATTGT"); // rc ref.fa line 3 final BwaMemAligner aligner = new BwaMemAligner(index); aligner.alignPairs(); + switch (defaultSetOrClearPEStats) { + case 1: + aligner.setProperPairEndStats(new BwaMemPairEndStats(200, 10, 1, 600)); + break; + case 2: + aligner.dontInferPairEndStats(); + break; + default: + aligner.inferPairEndStats(); + } final List> alignments = aligner.alignSeqs(seqs,String::getBytes); Assert.assertNotNull(alignments); Assert.assertEquals(alignments.size(), 2); @@ -95,18 +105,27 @@ void testPair() { Assert.assertNotNull(alignmentList); Assert.assertEquals(alignmentList.size(), 1); BwaMemAlignment alignment = alignmentList.get(0); - testAlignment(alignment, 0, 70, 0, 70, "70M", 0, 0x61); + testAlignment(alignment, 0, 70, 0, 70, "70M", 0, defaultSetOrClearPEStats == 1 ? 0x63 : 0x61); Assert.assertEquals(alignment.getMateRefStart(), 140); Assert.assertEquals(alignment.getTemplateLen(), 210); alignmentList = alignments.get(1); Assert.assertNotNull(alignmentList); Assert.assertEquals(alignmentList.size(), 1); alignment = alignmentList.get(0); - testAlignment(alignment, 140, 210, 0, 70, "70M", 0, 0x91); + testAlignment(alignment, 140, 210, 0, 70, "70M", 0, defaultSetOrClearPEStats == 1 ? 0x93 : 0x91); Assert.assertEquals(alignment.getMateRefStart(), 0); Assert.assertEquals(alignment.getTemplateLen(), -210); } + @DataProvider(name = "testPairData") + public Object[][] testPairData() { + final List result = new ArrayList<>(3); + result.add(new Object[] { 0 }); + result.add(new Object[] { 1 }); + result.add(new Object[] { 2 }); + return result.toArray(new Object[result.size()][]); + } + void testAlignment( final BwaMemAlignment alignment, final int refStart, final int refEnd, final int seqStart, final int seqEnd, final String cigar, final int nMismatches, final int samFlag ) {