@@ -119,7 +119,7 @@ public long skip(long n) throws IOException
119
119
120
120
/**
121
121
* Deserializes data from the provided InputStream and constructs records via the provided IRecordBuilder.
122
- *
122
+ *
123
123
* Data in the InputStream is expected to be in the HPCC Systems binary record format.
124
124
*/
125
125
public class BinaryRecordReader implements IRecordReader
@@ -132,6 +132,15 @@ public class BinaryRecordReader implements IRecordReader
132
132
private boolean isIndex = false ;
133
133
private boolean useDecimalForUnsigned8 = false ;
134
134
135
+ public static final int NO_STRING_PROCESSING = 0 ;
136
+ public static final int TRIM_STRINGS = 1 ;
137
+ public static final int TRIM_FIXED_LEN_STRINGS = 2 ;
138
+ public static final int CONVERT_EMPTY_STRINGS_TO_NULL = 4 ;
139
+
140
+ private boolean shouldTrimFixedLenStrings = false ;
141
+ private boolean shouldTrimStrings = false ;
142
+ private boolean convertEmptyStringsToNull = false ;
143
+
135
144
private byte [] scratchBuffer = new byte [BUFFER_GROW_SIZE ];
136
145
137
146
private static final Charset sbcSet = Charset .forName ("ISO-8859-1" );
@@ -223,7 +232,7 @@ public void initialize(IRecordBuilder rb) throws Exception
223
232
224
233
/**
225
234
* Determines if unsigned 8 values should be parsed into BigDecimals to avoid long overflow.
226
- *
235
+ *
227
236
* @param useDecimal use decimal
228
237
*/
229
238
public void setUseDecimalForUnsigned8 (boolean useDecimal )
@@ -233,14 +242,26 @@ public void setUseDecimalForUnsigned8(boolean useDecimal)
233
242
234
243
/**
235
244
* Should be set if this record reader is reading an index file.
236
- *
245
+ *
237
246
* @param isIdx Is this an index file?
238
247
*/
239
248
public void setIsIndex (boolean isIdx )
240
249
{
241
250
this .isIndex = isIdx ;
242
251
}
243
252
253
+ /**
254
+ * Set string processing flags.
255
+ *
256
+ * @param flags string processing flags
257
+ */
258
+ public void setStringProcessingFlags (int flags )
259
+ {
260
+ shouldTrimStrings = (flags & TRIM_STRINGS ) != 0 ;
261
+ shouldTrimFixedLenStrings = (flags & TRIM_FIXED_LEN_STRINGS ) != 0 ;
262
+ convertEmptyStringsToNull = (flags & CONVERT_EMPTY_STRINGS_TO_NULL ) != 0 ;
263
+ }
264
+
244
265
/*
245
266
* (non-Javadoc)
246
267
*
@@ -367,12 +388,12 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
367
388
BigInteger bi = Utils .extractUnsigned8Val (intValue );
368
389
fieldValue = new BigDecimal (bi );
369
390
}
370
- else
391
+ else
371
392
{
372
393
fieldValue = Long .valueOf (intValue );
373
394
if (intValue < 0 )
374
395
{
375
- messages .addMessage ("Warning: Possible unsigned overflow in column: '" + fd .getFieldName ()
396
+ messages .addMessage ("Warning: Possible unsigned overflow in column: '" + fd .getFieldName ()
376
397
+ "'. Convert values to BigInteger via org.hpccsystems.commons.utils.extractUnsigned8 if necessary, "
377
398
+ " or call BinaryRecordReader.setUseDecimalForUnsigned8() before reading to convert unsigned8 values to BigDecimal values." );
378
399
}
@@ -439,10 +460,11 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
439
460
fieldValue = Boolean .valueOf (value != 0 );
440
461
break ;
441
462
case CHAR :
442
- fieldValue = getString (fd .getSourceType (), 1 );
463
+ fieldValue = getString (fd .getSourceType (), 1 , false );
443
464
break ;
444
465
case STRING :
445
466
{
467
+ boolean shouldTrim = shouldTrimStrings ;
446
468
int codePoints = 0 ;
447
469
if (fd .isFixed ())
448
470
{
@@ -453,13 +475,15 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
453
475
}
454
476
455
477
codePoints = (int ) fd .getDataLen ();
478
+
479
+ shouldTrim = shouldTrim || shouldTrimFixedLenStrings ;
456
480
}
457
481
else
458
482
{
459
483
codePoints = ((int ) getInt (4 , isLittleEndian , false ));
460
484
}
461
485
462
- fieldValue = getString (fd .getSourceType (), codePoints );
486
+ fieldValue = getString (fd .getSourceType (), codePoints , shouldTrim );
463
487
break ;
464
488
}
465
489
case VAR_STRING :
@@ -474,8 +498,10 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
474
498
"Data length: " + fd .getDataLen () + " exceeds max supported length: " + Integer .MAX_VALUE );
475
499
}
476
500
501
+ boolean shouldTrim = shouldTrimStrings || shouldTrimFixedLenStrings ;
502
+
477
503
int codePoints = (int ) fd .getDataLen ();
478
- String strValue = getString (fd .getSourceType (), codePoints );
504
+ String strValue = getString (fd .getSourceType (), codePoints , shouldTrim );
479
505
480
506
// Unicode uses two byte nulls
481
507
if (fd .getSourceType ().isUTF16 ())
@@ -490,7 +516,8 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
490
516
}
491
517
else
492
518
{
493
- fieldValue = getNullTerminatedString (fd .getSourceType ());
519
+ boolean shouldTrim = shouldTrimStrings ;
520
+ fieldValue = getNullTerminatedString (fd .getSourceType (), shouldTrim );
494
521
}
495
522
break ;
496
523
}
@@ -692,7 +719,7 @@ private void readIntoScratchBuffer(int offset, int dataLen) throws IOException
692
719
* the length, 1 to 8 bytes
693
720
* @param little_endian
694
721
* true if the value is little endian
695
- * @param shouldCorrectBias
722
+ * @param shouldCorrectBias
696
723
* true if the value should be corrected for index bias
697
724
* @return the integer extracted as a long
698
725
* @throws IOException
@@ -914,6 +941,66 @@ private BigDecimal getSignedDecimal(int numDigits, int precision, int dataLen) t
914
941
return ret ;
915
942
}
916
943
944
+ /**
945
+ * Trims the string within the scratch buffer by modifying the provided string range
946
+ *
947
+ * @param isUnicode is the string unicode
948
+ * @param range array with starting and ending byte of string within the scratch buffer
949
+ */
950
+ private void trimStringInScratchBuffer (boolean isUnicode , int [] range )
951
+ {
952
+ if (isUnicode )
953
+ {
954
+ while (range [0 ] < range [1 ] - 1 )
955
+ {
956
+ int codePoint = this .scratchBuffer [range [0 ]] << 8 | this .scratchBuffer [range [0 ]+1 ];
957
+ if (!Character .isWhitespace (codePoint ))
958
+ {
959
+ break ;
960
+ }
961
+
962
+ range [0 ] += 2 ;
963
+ }
964
+
965
+ while (range [1 ] > range [0 ])
966
+ {
967
+ // Need to check against EOS (0x0) in trim fixed len strings correctly
968
+ int codePoint = this .scratchBuffer [range [1 ]-2 ] << 8 | this .scratchBuffer [range [1 ]-1 ];
969
+ if (!Character .isWhitespace (codePoint ) && codePoint != 0x0 )
970
+ {
971
+ break ;
972
+ }
973
+
974
+ range [1 ] -= 2 ;
975
+ }
976
+ }
977
+ else
978
+ {
979
+ while (range [0 ] < range [1 ])
980
+ {
981
+ int codePoint = this .scratchBuffer [range [0 ]];
982
+ if (!Character .isWhitespace (codePoint ))
983
+ {
984
+ break ;
985
+ }
986
+
987
+ range [0 ]++;
988
+ }
989
+
990
+ while (range [1 ] > range [0 ])
991
+ {
992
+ // Need to check against EOS (0x0) in trim fixed len strings correctly
993
+ int codePoint = this .scratchBuffer [range [1 ]-1 ];
994
+ if (!Character .isWhitespace (codePoint ) && codePoint != 0x0 )
995
+ {
996
+ break ;
997
+ }
998
+
999
+ range [1 ]--;
1000
+ }
1001
+ }
1002
+ }
1003
+
917
1004
/**
918
1005
* Gets the null terminated string.
919
1006
*
@@ -923,7 +1010,7 @@ private BigDecimal getSignedDecimal(int numDigits, int precision, int dataLen) t
923
1010
* @throws IOException
924
1011
* Signals that an I/O exception has occurred.
925
1012
*/
926
- private String getNullTerminatedString (HpccSrcType stype ) throws IOException
1013
+ private String getNullTerminatedString (HpccSrcType stype , boolean shouldTrim ) throws IOException
927
1014
{
928
1015
Charset charset = sbcSet ;
929
1016
switch (stype )
@@ -1043,7 +1130,20 @@ private String getNullTerminatedString(HpccSrcType stype) throws IOException
1043
1130
}
1044
1131
}
1045
1132
1046
- return new String (scratchBuffer ,0 ,strByteLen ,charset );
1133
+ int [] strRange = {0 , strByteLen };
1134
+ if (shouldTrim )
1135
+ {
1136
+ boolean isUnicode = (stype == HpccSrcType .UTF16BE || stype == HpccSrcType .UTF16LE );
1137
+ trimStringInScratchBuffer (isUnicode , strRange );
1138
+ }
1139
+
1140
+ strByteLen = strRange [1 ] - strRange [0 ];
1141
+ if (strByteLen == 0 && convertEmptyStringsToNull )
1142
+ {
1143
+ return null ;
1144
+ }
1145
+
1146
+ return new String (scratchBuffer ,strRange [0 ],strByteLen ,charset );
1047
1147
}
1048
1148
1049
1149
/**
@@ -1057,7 +1157,7 @@ private String getNullTerminatedString(HpccSrcType stype) throws IOException
1057
1157
* @throws IOException
1058
1158
* Signals that an I/O exception has occurred.
1059
1159
*/
1060
- private String getString (HpccSrcType styp , int codePoints ) throws IOException
1160
+ private String getString (HpccSrcType styp , int codePoints , boolean shouldTrim ) throws IOException
1061
1161
{
1062
1162
Charset charset = utf8Set ;
1063
1163
switch (styp )
@@ -1129,6 +1229,7 @@ else if ((this.scratchBuffer[strByteLen + bytesScanned] & 0xF8) == 0xF0)
1129
1229
strByteLen += misalignedBytes ;
1130
1230
}
1131
1231
}
1232
+
1132
1233
break ;
1133
1234
}
1134
1235
case SINGLE_BYTE_CHAR :
@@ -1240,7 +1341,20 @@ else if ((this.scratchBuffer[strByteLen + bytesScanned] & 0xF8) == 0xF0)
1240
1341
throw new IOException ("Unknown source type" );
1241
1342
}
1242
1343
1243
- return new String (this .scratchBuffer , 0 , strByteLen , charset );
1344
+ int [] strRange = {0 , strByteLen };
1345
+ if (shouldTrim )
1346
+ {
1347
+ boolean isUnicode = (styp == HpccSrcType .UTF16BE || styp == HpccSrcType .UTF16LE );
1348
+ trimStringInScratchBuffer (isUnicode , strRange );
1349
+ }
1350
+
1351
+ strByteLen = strRange [1 ] - strRange [0 ];
1352
+ if (strByteLen == 0 && convertEmptyStringsToNull )
1353
+ {
1354
+ return null ;
1355
+ }
1356
+
1357
+ return new String (scratchBuffer ,strRange [0 ],strByteLen ,charset );
1244
1358
}
1245
1359
1246
1360
/**
0 commit comments