Skip to content

Commit a64ce41

Browse files
committed
Merge remote-tracking branch 'origin/candidate-8.12.x'
Signed-off-by: Gavin Halliday <[email protected]>
2 parents 057e391 + 0f124d0 commit a64ce41

File tree

3 files changed

+211
-21
lines changed

3 files changed

+211
-21
lines changed

dfsclient/src/main/java/org/hpccsystems/dfs/client/BinaryRecordReader.java

+128-14
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ public long skip(long n) throws IOException
119119

120120
/**
121121
* Deserializes data from the provided InputStream and constructs records via the provided IRecordBuilder.
122-
*
122+
*
123123
* Data in the InputStream is expected to be in the HPCC Systems binary record format.
124124
*/
125125
public class BinaryRecordReader implements IRecordReader
@@ -132,6 +132,15 @@ public class BinaryRecordReader implements IRecordReader
132132
private boolean isIndex = false;
133133
private boolean useDecimalForUnsigned8 = false;
134134

135+
/** Flag value for setStringProcessingFlags: no bits set, so no trimming and no empty-to-null conversion is enabled. */
public static final int NO_STRING_PROCESSING = 0;
136+
/** Flag bit for setStringProcessingFlags: enables whitespace trimming of string values (fixed and variable length). */
public static final int TRIM_STRINGS = 1;
137+
/** Flag bit for setStringProcessingFlags: enables trimming for fixed length string values only. */
public static final int TRIM_FIXED_LEN_STRINGS = 2;
138+
/** Flag bit for setStringProcessingFlags: string values that are empty (after any trimming) are returned as null. */
public static final int CONVERT_EMPTY_STRINGS_TO_NULL = 4;
139+
140+
private boolean shouldTrimFixedLenStrings = false;
141+
private boolean shouldTrimStrings = false;
142+
private boolean convertEmptyStringsToNull = false;
143+
135144
private byte[] scratchBuffer = new byte[BUFFER_GROW_SIZE];
136145

137146
private static final Charset sbcSet = Charset.forName("ISO-8859-1");
@@ -223,7 +232,7 @@ public void initialize(IRecordBuilder rb) throws Exception
223232

224233
/**
225234
* Determines if unsigned 8 values should be parsed into BigDecimals to avoid long overflow.
226-
*
235+
*
227236
* @param useDecimal use decimal
228237
*/
229238
public void setUseDecimalForUnsigned8(boolean useDecimal)
@@ -233,14 +242,26 @@ public void setUseDecimalForUnsigned8(boolean useDecimal)
233242

234243
/**
235244
* Should be set if this record reader is reading an index file.
236-
*
245+
*
237246
* @param isIdx Is this an index file?
238247
*/
239248
public void setIsIndex(boolean isIdx)
240249
{
241250
this.isIndex = isIdx;
242251
}
243252

253+
/**
254+
* Set string processing flags.
255+
*
256+
* @param flags string processing flags
257+
*/
258+
public void setStringProcessingFlags(int flags)
259+
{
260+
shouldTrimStrings = (flags & TRIM_STRINGS) != 0;
261+
shouldTrimFixedLenStrings = (flags & TRIM_FIXED_LEN_STRINGS) != 0;
262+
convertEmptyStringsToNull = (flags & CONVERT_EMPTY_STRINGS_TO_NULL) != 0;
263+
}
264+
244265
/*
245266
* (non-Javadoc)
246267
*
@@ -367,12 +388,12 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
367388
BigInteger bi = Utils.extractUnsigned8Val(intValue);
368389
fieldValue = new BigDecimal(bi);
369390
}
370-
else
391+
else
371392
{
372393
fieldValue = Long.valueOf(intValue);
373394
if (intValue < 0)
374395
{
375-
messages.addMessage("Warning: Possible unsigned overflow in column: '" + fd.getFieldName()
396+
messages.addMessage("Warning: Possible unsigned overflow in column: '" + fd.getFieldName()
376397
+ "'. Convert values to BigInteger via org.hpccsystems.commons.utils.extractUnsigned8 if necessary, "
377398
+ " or call BinaryRecordReader.setUseDecimalForUnsigned8() before reading to convert unsigned8 values to BigDecimal values.");
378399
}
@@ -439,10 +460,11 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
439460
fieldValue = Boolean.valueOf(value != 0);
440461
break;
441462
case CHAR:
442-
fieldValue = getString(fd.getSourceType(), 1);
463+
fieldValue = getString(fd.getSourceType(), 1, false);
443464
break;
444465
case STRING:
445466
{
467+
boolean shouldTrim = shouldTrimStrings;
446468
int codePoints = 0;
447469
if (fd.isFixed())
448470
{
@@ -453,13 +475,15 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
453475
}
454476

455477
codePoints = (int) fd.getDataLen();
478+
479+
shouldTrim = shouldTrim || shouldTrimFixedLenStrings;
456480
}
457481
else
458482
{
459483
codePoints = ((int) getInt(4, isLittleEndian, false));
460484
}
461485

462-
fieldValue = getString(fd.getSourceType(), codePoints);
486+
fieldValue = getString(fd.getSourceType(), codePoints, shouldTrim);
463487
break;
464488
}
465489
case VAR_STRING:
@@ -474,8 +498,10 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
474498
"Data length: " + fd.getDataLen() + " exceeds max supported length: " + Integer.MAX_VALUE);
475499
}
476500

501+
boolean shouldTrim = shouldTrimStrings || shouldTrimFixedLenStrings;
502+
477503
int codePoints = (int) fd.getDataLen();
478-
String strValue = getString(fd.getSourceType(), codePoints);
504+
String strValue = getString(fd.getSourceType(), codePoints, shouldTrim);
479505

480506
// Unicode uses two byte nulls
481507
if (fd.getSourceType().isUTF16())
@@ -490,7 +516,8 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
490516
}
491517
else
492518
{
493-
fieldValue = getNullTerminatedString(fd.getSourceType());
519+
boolean shouldTrim = shouldTrimStrings;
520+
fieldValue = getNullTerminatedString(fd.getSourceType(), shouldTrim);
494521
}
495522
break;
496523
}
@@ -692,7 +719,7 @@ private void readIntoScratchBuffer(int offset, int dataLen) throws IOException
692719
* the length, 1 to 8 bytes
693720
* @param little_endian
694721
* true if the value is little endian
695-
* @param shouldCorrectBias
722+
* @param shouldCorrectBias
696723
* true if the value should be corrected for index bias
697724
* @return the integer extracted as a long
698725
* @throws IOException
@@ -914,6 +941,66 @@ private BigDecimal getSignedDecimal(int numDigits, int precision, int dataLen) t
914941
return ret;
915942
}
916943

944+
/**
945+
* Trims the string within the scratch buffer by modifying the provided string range
946+
*
947+
* @param isUnicode is the string unicode
948+
* @param range array with starting and ending byte of string within the scratch buffer
949+
*/
950+
private void trimStringInScratchBuffer(boolean isUnicode, int[] range)
951+
{
952+
if (isUnicode)
953+
{
954+
while (range[0] < range[1] - 1)
955+
{
956+
int codePoint = this.scratchBuffer[range[0]] << 8 | this.scratchBuffer[range[0]+1];
957+
if (!Character.isWhitespace(codePoint))
958+
{
959+
break;
960+
}
961+
962+
range[0] += 2;
963+
}
964+
965+
while (range[1] > range[0])
966+
{
967+
// Need to check against EOS (0x0) in trim fixed len strings correctly
968+
int codePoint = this.scratchBuffer[range[1]-2] << 8 | this.scratchBuffer[range[1]-1];
969+
if (!Character.isWhitespace(codePoint) && codePoint != 0x0)
970+
{
971+
break;
972+
}
973+
974+
range[1] -= 2;
975+
}
976+
}
977+
else
978+
{
979+
while (range[0] < range[1])
980+
{
981+
int codePoint = this.scratchBuffer[range[0]];
982+
if (!Character.isWhitespace(codePoint))
983+
{
984+
break;
985+
}
986+
987+
range[0]++;
988+
}
989+
990+
while (range[1] > range[0])
991+
{
992+
// Need to check against EOS (0x0) in trim fixed len strings correctly
993+
int codePoint = this.scratchBuffer[range[1]-1];
994+
if (!Character.isWhitespace(codePoint) && codePoint != 0x0)
995+
{
996+
break;
997+
}
998+
999+
range[1]--;
1000+
}
1001+
}
1002+
}
1003+
9171004
/**
9181005
* Gets the null terminated string.
9191006
*
@@ -923,7 +1010,7 @@ private BigDecimal getSignedDecimal(int numDigits, int precision, int dataLen) t
9231010
* @throws IOException
9241011
* Signals that an I/O exception has occurred.
9251012
*/
926-
private String getNullTerminatedString(HpccSrcType stype) throws IOException
1013+
private String getNullTerminatedString(HpccSrcType stype, boolean shouldTrim) throws IOException
9271014
{
9281015
Charset charset = sbcSet;
9291016
switch (stype)
@@ -1043,7 +1130,20 @@ private String getNullTerminatedString(HpccSrcType stype) throws IOException
10431130
}
10441131
}
10451132

1046-
return new String(scratchBuffer,0,strByteLen,charset);
1133+
int[] strRange = {0, strByteLen};
1134+
if (shouldTrim)
1135+
{
1136+
boolean isUnicode = (stype == HpccSrcType.UTF16BE || stype == HpccSrcType.UTF16LE);
1137+
trimStringInScratchBuffer(isUnicode, strRange);
1138+
}
1139+
1140+
strByteLen = strRange[1] - strRange[0];
1141+
if (strByteLen == 0 && convertEmptyStringsToNull)
1142+
{
1143+
return null;
1144+
}
1145+
1146+
return new String(scratchBuffer,strRange[0],strByteLen,charset);
10471147
}
10481148

10491149
/**
@@ -1057,7 +1157,7 @@ private String getNullTerminatedString(HpccSrcType stype) throws IOException
10571157
* @throws IOException
10581158
* Signals that an I/O exception has occurred.
10591159
*/
1060-
private String getString(HpccSrcType styp, int codePoints) throws IOException
1160+
private String getString(HpccSrcType styp, int codePoints, boolean shouldTrim) throws IOException
10611161
{
10621162
Charset charset = utf8Set;
10631163
switch (styp)
@@ -1129,6 +1229,7 @@ else if ((this.scratchBuffer[strByteLen + bytesScanned] & 0xF8) == 0xF0)
11291229
strByteLen += misalignedBytes;
11301230
}
11311231
}
1232+
11321233
break;
11331234
}
11341235
case SINGLE_BYTE_CHAR:
@@ -1240,7 +1341,20 @@ else if ((this.scratchBuffer[strByteLen + bytesScanned] & 0xF8) == 0xF0)
12401341
throw new IOException("Unknown source type");
12411342
}
12421343

1243-
return new String(this.scratchBuffer, 0, strByteLen, charset);
1344+
int[] strRange = {0, strByteLen};
1345+
if (shouldTrim)
1346+
{
1347+
boolean isUnicode = (styp == HpccSrcType.UTF16BE || styp == HpccSrcType.UTF16LE);
1348+
trimStringInScratchBuffer(isUnicode, strRange);
1349+
}
1350+
1351+
strByteLen = strRange[1] - strRange[0];
1352+
if (strByteLen == 0 && convertEmptyStringsToNull)
1353+
{
1354+
return null;
1355+
}
1356+
1357+
return new String(scratchBuffer,strRange[0],strByteLen,charset);
12441358
}
12451359

12461360
/**

dfsclient/src/main/java/org/hpccsystems/dfs/client/BinaryRecordWriter.java

+1-1
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,7 @@ else if (fd.getDataLen() == 8)
448448
case VAR_STRING:
449449
case STRING:
450450
{
451-
String value = fieldValue != null?(String) fieldValue:"";
451+
String value = fieldValue != null ? (String) fieldValue : "";
452452
byte[] data = new byte[0];
453453
if (fd.getSourceType() == HpccSrcType.UTF16LE)
454454
{

0 commit comments

Comments
 (0)