Skip to content

Commit e5ba94c

Browse files
committed
wip on #1614
This is more complicated than originally described because the file encoding isn't really described and java bytes include negative values. EOF is triggered correctly, but it's also triggered on various potential characters which become negative values when they are truncated to byte. It's unclear if this is meant to only read ASCII, ISO 8859-1, or UTF-8 but nothing outside of the ascii space works correctly.
1 parent f684576 commit e5ba94c

File tree

5 files changed

+124
-16
lines changed

5 files changed

+124
-16
lines changed

src/main/java/htsjdk/tribble/index/AbstractIndex.java

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import java.io.BufferedOutputStream;
3030
import java.io.File;
3131
import java.io.IOException;
32+
import java.nio.charset.StandardCharsets;
3233
import java.nio.file.Files;
3334
import java.nio.file.Path;
3435
import java.util.ArrayList;
@@ -302,10 +303,10 @@ private void writeHeader(final LittleEndianOutputStream dos) throws IOException
302303
private void readHeader(final LittleEndianInputStream dis) throws IOException {
303304

304305
version = dis.readInt();
305-
indexedPath = IOUtil.getPath(dis.readString());
306+
indexedPath = IOUtil.getPath(dis.readString(StandardCharsets.US_ASCII));
306307
indexedFileSize = dis.readLong();
307308
indexedFileTS = dis.readLong();
308-
indexedFileMD5 = dis.readString();
309+
indexedFileMD5 = dis.readString(StandardCharsets.US_ASCII);
309310
flags = dis.readInt();
310311
if (version < 3 && (flags & SEQUENCE_DICTIONARY_FLAG) == SEQUENCE_DICTIONARY_FLAG) {
311312
readSequenceDictionary(dis);
@@ -314,8 +315,8 @@ private void readHeader(final LittleEndianInputStream dis) throws IOException {
314315
if (version >= 3) {
315316
int nProperties = dis.readInt();
316317
while (nProperties-- > 0) {
317-
final String key = dis.readString();
318-
final String value = dis.readString();
318+
final String key = dis.readString(StandardCharsets.US_ASCII);
319+
final String value = dis.readString(StandardCharsets.US_ASCII);
319320
properties.put(key, value);
320321
}
321322
}
@@ -332,7 +333,7 @@ private void readSequenceDictionary(final LittleEndianInputStream dis) throws IO
332333
final int size = dis.readInt();
333334
if (size < 0) throw new IllegalStateException("Size of the sequence dictionary entries is negative");
334335
for (int x = 0; x < size; x++) {
335-
dis.readString();
336+
dis.readString(StandardCharsets.US_ASCII);
336337
dis.readInt();
337338
}
338339
}

src/main/java/htsjdk/tribble/index/interval/IntervalTreeIndex.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
import java.io.IOException;
2727
import java.io.InputStream;
28+
import java.nio.charset.StandardCharsets;
2829
import java.nio.file.Path;
2930
import java.util.ArrayList;
3031
import java.util.Arrays;
@@ -210,7 +211,7 @@ public void read(final LittleEndianInputStream dis) throws IOException {
210211

211212
tree = new IntervalTree();
212213

213-
name = dis.readString();
214+
name = dis.readString(StandardCharsets.US_ASCII);
214215
int nIntervals = dis.readInt();
215216
while (nIntervals-- > 0) {
216217

src/main/java/htsjdk/tribble/util/LittleEndianInputStream.java

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
import java.io.FilterInputStream;
1717
import java.io.IOException;
1818
import java.io.InputStream;
19+
import java.nio.charset.Charset;
20+
import java.nio.charset.StandardCharsets;
1921

2022

2123
/**
@@ -105,22 +107,33 @@ public final float readFloat() throws IOException {
105107
}
106108

107109
/**
108-
* Read a null terminated byte array and return result as a string
109-
*
110-
* @return
111-
* @throws IOException
110+
* Read a null terminated byte array and return result as a String
111+
* This method decodes theh bytes as UTF-8 string
112+
* @throws IOException if reading from the stream fails for some reason
113+
* @throws EOFException if the stream ends without encountering a null terminator.
114+
* @deprecated Prefer the {@link #readString(Charset)} which allows specifying a charset explicitly
112115
*/
113-
116+
@Deprecated
114117
public String readString() throws IOException {
115-
ByteArrayOutputStream bis = new ByteArrayOutputStream(100);
116-
byte b;
117-
while ((b = (byte) in.read()) != 0) {
118+
return readString(StandardCharsets.UTF_8);
119+
}
120+
121+
/**
122+
* Read a null terminated byte array and return result as a String
123+
* @param charset the Charset to use when decoding the bytes to a String
124+
* @throws IOException if reading from the stream fails for some reason
125+
* @throws EOFException if the stream ends without encountering a null terminator.
126+
*/
127+
public String readString(final Charset charset) throws IOException {
128+
final ByteArrayOutputStream bis = new ByteArrayOutputStream(100);
129+
int b;
130+
while ((b = in.read()) != 0) {
118131
if(b < 0) {
119132
throw new EOFException();
120133
}
121-
bis.write(b);
134+
bis.write((byte)b);
122135
}
123-
return new String(bis.toByteArray());
136+
return bis.toString(charset.name());
124137
}
125138

126139

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
package htsjdk.testutil;
2+
3+
import org.testng.Assert;
4+
5+
public interface Expected<T> {
6+
void test(ThrowingSupplier<T> functionToTest);
7+
8+
9+
interface ThrowingConsumer<T> {
10+
void test(T a) throws Exception;
11+
}
12+
13+
static <T> Expected<T> match(final T expected) {
14+
return new ComparisonExpected<>((T actual) -> Assert.assertEquals(actual, expected));
15+
}
16+
17+
static <T> Expected<T> mismatch(final T expected) {
18+
return new ComparisonExpected<>((T actual) -> Assert.assertNotEquals(actual, expected));
19+
}
20+
21+
static <T> Expected<T> exception(final Class<? extends Exception> exceptionClass) {
22+
return functionToTest -> Assert.assertThrows(exceptionClass, functionToTest::produce);
23+
}
24+
25+
interface ThrowingSupplier<T> {
26+
T produce() throws Exception;
27+
}
28+
}
29+
30+
final class ComparisonExpected<T> implements Expected<T> {
31+
private final ThrowingConsumer<T> test;
32+
33+
@Override
34+
public void test(ThrowingSupplier<T> supplier) {
35+
try {
36+
test.test(supplier.produce());
37+
} catch (AssertionError e) {
38+
throw e;
39+
} catch (Exception e) {
40+
throw new AssertionError(e);
41+
}
42+
}
43+
44+
ComparisonExpected(ThrowingConsumer<T> test) {
45+
this.test = test;
46+
}
47+
48+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package htsjdk.tribble.util;
2+
3+
import htsjdk.HtsjdkTest;
4+
import htsjdk.testutil.Expected;
5+
import org.testng.annotations.DataProvider;
6+
import org.testng.annotations.Test;
7+
8+
import java.io.BufferedInputStream;
9+
import java.io.EOFException;
10+
import java.io.FileInputStream;
11+
import java.nio.charset.Charset;
12+
import java.nio.charset.StandardCharsets;
13+
import java.nio.file.Files;
14+
import java.nio.file.Paths;
15+
16+
public class LittleEndianInputStreamTest extends HtsjdkTest {
17+
18+
19+
@DataProvider
20+
public Object[][] testCases() {
21+
final String missingTerminator = "src/test/resources/htsjdk/tribble/util/string_with_extended_ascii_no_terminator.bin";
22+
final String extendedAsciiFile = "src/test/resources/htsjdk/tribble/util/string_with_extended_ascii_and_null_terminator.bin";
23+
final Object utf8File = "src/test/resources/htsjdk/tribble/util/string_with_utf8_emoji_and_null_terminator.txt";
24+
return new Object[][]{
25+
{missingTerminator, StandardCharsets.ISO_8859_1, Expected.exception(EOFException.class)},
26+
{missingTerminator, StandardCharsets.US_ASCII, Expected.exception(EOFException.class)},
27+
{missingTerminator, StandardCharsets.UTF_8, Expected.exception(EOFException.class)},
28+
{extendedAsciiFile, StandardCharsets.ISO_8859_1, Expected.match("very dràààààmatic and null terminated")},
29+
{extendedAsciiFile, StandardCharsets.US_ASCII, Expected.mismatch("very dràààààmatic and null terminated")},
30+
{extendedAsciiFile, StandardCharsets.UTF_8, Expected.mismatch("very dràààààmatic and null terminated")},
31+
{utf8File, StandardCharsets.UTF_8, Expected.match("🐋 UTF8 is Great 🐋")},
32+
{utf8File, StandardCharsets.ISO_8859_1, Expected.mismatch("🐋 UTF8 is Great 🐋")}
33+
};
34+
}
35+
36+
@Test(dataProvider = "testCases")
37+
public void testAllCases(String filename, Charset charset, Expected<String> expected) {
38+
expected.test(() -> {
39+
try(final LittleEndianInputStream in = new LittleEndianInputStream(new BufferedInputStream(Files.newInputStream(Paths.get(filename))))){
40+
return in.readString(charset);
41+
}
42+
});
43+
}
44+
45+
}

0 commit comments

Comments
 (0)