Merge remote-tracking branch 'origin/candidate-8.12.x'

ghalliday · ghalliday · commit a64ce41fccca · 2023-02-10T17:48:30.000Z
Signed-off-by: Gavin Halliday <gavin.halliday@lexisnexis.com>
diff --git a/dfsclient/src/main/java/org/hpccsystems/dfs/client/BinaryRecordReader.java b/dfsclient/src/main/java/org/hpccsystems/dfs/client/BinaryRecordReader.java
@@ -119,7 +119,7 @@ public long skip(long n) throws IOException
 
 /**
  * Deserializes data from the provided InputStream and constructs records via the provided IRecordBuilder.
- * 
+ *
  * Data in the InputStream is expected to be in the HPCC Systems binary record format.
  */
 public class BinaryRecordReader implements IRecordReader
@@ -132,6 +132,15 @@ public class BinaryRecordReader implements IRecordReader
     private boolean              isIndex = false;
     private boolean              useDecimalForUnsigned8 = false;
 
+    public static final int      NO_STRING_PROCESSING = 0;
+    public static final int      TRIM_STRINGS = 1;
+    public static final int      TRIM_FIXED_LEN_STRINGS = 2;
+    public static final int      CONVERT_EMPTY_STRINGS_TO_NULL = 4;
+
+    private boolean              shouldTrimFixedLenStrings = false;
+    private boolean              shouldTrimStrings = false;
+    private boolean              convertEmptyStringsToNull = false;
+
     private byte[]               scratchBuffer = new byte[BUFFER_GROW_SIZE];
 
     private static final Charset sbcSet              = Charset.forName("ISO-8859-1");
@@ -223,7 +232,7 @@ public void initialize(IRecordBuilder rb) throws Exception
 
     /**
      * Determines if unsigned 8 values should be parsed into BigDecimals to avoid long overflow.
-     * 
+     *
      * @param useDecimal use decimal
      */
     public void setUseDecimalForUnsigned8(boolean useDecimal)
@@ -233,14 +242,26 @@ public void setUseDecimalForUnsigned8(boolean useDecimal)
 
     /**
      * Should be set if this record reader is reading an index file.
-     * 
+     *
      * @param isIdx Is this an index file?
      */
     public void setIsIndex(boolean isIdx)
     {
         this.isIndex = isIdx;
     }
 
+    /**
+     * Configures the optional string post-processing applied while reading string fields.
+     *
+     * Each option is switched on when its bit is present in <code>flags</code> and switched off otherwise,
+     * so every call replaces the previous configuration in full.
+     *
+     * @param flags bitwise OR of {@link #TRIM_STRINGS}, {@link #TRIM_FIXED_LEN_STRINGS} and
+     *              {@link #CONVERT_EMPTY_STRINGS_TO_NULL}; pass {@link #NO_STRING_PROCESSING} to disable all
+     */
+    public void setStringProcessingFlags(int flags)
+    {
+        // Every mask is a single bit, so comparing against the mask is the same as testing for non-zero
+        final boolean trimAll = (flags & TRIM_STRINGS) == TRIM_STRINGS;
+        final boolean trimFixedLen = (flags & TRIM_FIXED_LEN_STRINGS) == TRIM_FIXED_LEN_STRINGS;
+        final boolean emptyToNull = (flags & CONVERT_EMPTY_STRINGS_TO_NULL) == CONVERT_EMPTY_STRINGS_TO_NULL;
+
+        this.shouldTrimStrings = trimAll;
+        this.shouldTrimFixedLenStrings = trimFixedLen;
+        this.convertEmptyStringsToNull = emptyToNull;
+    }
+
     /*
      * (non-Javadoc)
      *
@@ -367,12 +388,12 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
                         BigInteger bi = Utils.extractUnsigned8Val(intValue);
                         fieldValue = new BigDecimal(bi);
                     }
-                    else 
+                    else
                     {
                         fieldValue = Long.valueOf(intValue);
                         if (intValue < 0)
                         {
-                            messages.addMessage("Warning: Possible unsigned overflow in column: '" + fd.getFieldName() 
+                            messages.addMessage("Warning: Possible unsigned overflow in column: '" + fd.getFieldName()
                                             + "'. Convert values to BigInteger via org.hpccsystems.commons.utils.extractUnsigned8 if necessary, "
                                             + " or call BinaryRecordReader.setUseDecimalForUnsigned8() before reading to convert unsigned8 values to BigDecimal values.");
                         }
@@ -439,10 +460,11 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
                 fieldValue = Boolean.valueOf(value != 0);
                 break;
             case CHAR:
-                fieldValue = getString(fd.getSourceType(), 1);
+                fieldValue = getString(fd.getSourceType(), 1, false);
                 break;
             case STRING:
             {
+                boolean shouldTrim = shouldTrimStrings;
                 int codePoints = 0;
                 if (fd.isFixed())
                 {
@@ -453,13 +475,15 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
                     }
 
                     codePoints = (int) fd.getDataLen();
+
+                    shouldTrim = shouldTrim || shouldTrimFixedLenStrings;
                 }
                 else
                 {
                     codePoints = ((int) getInt(4, isLittleEndian, false));
                 }
 
-                fieldValue = getString(fd.getSourceType(), codePoints);
+                fieldValue = getString(fd.getSourceType(), codePoints, shouldTrim);
                 break;
             }
             case VAR_STRING:
@@ -474,8 +498,10 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
                                 "Data length: " + fd.getDataLen() + " exceeds max supported length: " + Integer.MAX_VALUE);
                     }
 
+                    boolean shouldTrim = shouldTrimStrings || shouldTrimFixedLenStrings;
+
                     int codePoints = (int) fd.getDataLen();
-                    String strValue = getString(fd.getSourceType(), codePoints);
+                    String strValue = getString(fd.getSourceType(), codePoints, shouldTrim);
 
                     // Unicode uses two byte nulls
                     if (fd.getSourceType().isUTF16())
@@ -490,7 +516,8 @@ private Object parseFlatField(FieldDef fd, boolean isLittleEndian) throws Unpars
                 }
                 else
                 {
-                    fieldValue = getNullTerminatedString(fd.getSourceType());
+                    boolean shouldTrim = shouldTrimStrings;
+                    fieldValue = getNullTerminatedString(fd.getSourceType(), shouldTrim);
                 }
                 break;
             }
@@ -692,7 +719,7 @@ private void readIntoScratchBuffer(int offset, int dataLen) throws IOException
      *            the length, 1 to 8 bytes
      * @param little_endian
      *            true if the value is little endian
-     * @param shouldCorrectBias 
+     * @param shouldCorrectBias
      *            true if the value should be corrected for index bias
      * @return the integer extracted as a long
      * @throws IOException
@@ -914,6 +941,66 @@ private BigDecimal getSignedDecimal(int numDigits, int precision, int dataLen) t
         return ret;
     }
 
+    /**
+     * Trims leading whitespace and trailing whitespace / EOS (0x0) padding from a string held in the
+     * scratch buffer. The buffer contents are only read; trimming is done by narrowing the provided range.
+     *
+     * @param isUnicode true if the string should be examined as two-byte code units, false if it should be
+     *                  examined one byte at a time
+     * @param range two element array holding the starting (inclusive) and ending (exclusive) byte offsets of
+     *              the string within the scratch buffer; updated in place to the trimmed range
+     */
+    private void trimStringInScratchBuffer(boolean isUnicode, int[] range)
+    {
+        if (isUnicode)
+        {
+            // NOTE(review): code units are assembled high byte first. Confirm that little endian data
+            // has this byte order in the scratch buffer by the time it reaches this method.
+            while (range[0] < range[1] - 1)
+            {
+                // Bytes are signed in Java; mask them so a low byte >= 0x80 is not sign extended over the high byte
+                int codePoint = ((this.scratchBuffer[range[0]] & 0xFF) << 8)
+                              | (this.scratchBuffer[range[0] + 1] & 0xFF);
+                if (!Character.isWhitespace(codePoint))
+                {
+                    break;
+                }
+
+                range[0] += 2;
+            }
+
+            // Require a complete two-byte unit so that an odd number of remaining bytes never causes a read
+            // before the start of the range
+            while (range[1] - range[0] >= 2)
+            {
+                // Need to check against EOS (0x0) to trim fixed len strings correctly
+                int codePoint = ((this.scratchBuffer[range[1] - 2] & 0xFF) << 8)
+                              | (this.scratchBuffer[range[1] - 1] & 0xFF);
+                if (!Character.isWhitespace(codePoint) && codePoint != 0x0)
+                {
+                    break;
+                }
+
+                range[1] -= 2;
+            }
+        }
+        else
+        {
+            while (range[0] < range[1])
+            {
+                int codePoint = this.scratchBuffer[range[0]];
+                if (!Character.isWhitespace(codePoint))
+                {
+                    break;
+                }
+
+                range[0]++;
+            }
+
+            while (range[1] > range[0])
+            {
+                // Need to check against EOS (0x0) to trim fixed len strings correctly
+                int codePoint = this.scratchBuffer[range[1] - 1];
+                if (!Character.isWhitespace(codePoint) && codePoint != 0x0)
+                {
+                    break;
+                }
+
+                range[1]--;
+            }
+        }
+    }
+
     /**
      * Gets the null terminated string.
      *
@@ -923,7 +1010,7 @@ private BigDecimal getSignedDecimal(int numDigits, int precision, int dataLen) t
      * @throws IOException
      *             Signals that an I/O exception has occurred.
      */
-    private String getNullTerminatedString(HpccSrcType stype) throws IOException
+    private String getNullTerminatedString(HpccSrcType stype, boolean shouldTrim) throws IOException
     {
         Charset charset = sbcSet;
         switch (stype)
@@ -1043,7 +1130,20 @@ private String getNullTerminatedString(HpccSrcType stype) throws IOException
             }
         }
 
-        return new String(scratchBuffer,0,strByteLen,charset);
+        int[] strRange = {0, strByteLen};
+        if (shouldTrim)
+        {
+            boolean isUnicode = (stype == HpccSrcType.UTF16BE || stype == HpccSrcType.UTF16LE);
+            trimStringInScratchBuffer(isUnicode, strRange);
+        }
+
+        strByteLen = strRange[1] - strRange[0];
+        if (strByteLen == 0 && convertEmptyStringsToNull)
+        {
+            return null;
+        }
+
+        return new String(scratchBuffer,strRange[0],strByteLen,charset);
     }
 
     /**
@@ -1057,7 +1157,7 @@ private String getNullTerminatedString(HpccSrcType stype) throws IOException
      * @throws IOException
      *             Signals that an I/O exception has occurred.
      */
-    private String getString(HpccSrcType styp, int codePoints) throws IOException
+    private String getString(HpccSrcType styp, int codePoints, boolean shouldTrim) throws IOException
     {
         Charset charset = utf8Set;
         switch (styp)
@@ -1129,6 +1229,7 @@ else if ((this.scratchBuffer[strByteLen + bytesScanned] & 0xF8) == 0xF0)
                         strByteLen += misalignedBytes;
                     }
                 }
+
                 break;
             }
             case SINGLE_BYTE_CHAR:
@@ -1240,7 +1341,20 @@ else if ((this.scratchBuffer[strByteLen + bytesScanned] & 0xF8) == 0xF0)
                 throw new IOException("Unknown source type");
         }
 
-        return new String(this.scratchBuffer, 0, strByteLen, charset);
+        int[] strRange = {0, strByteLen};
+        if (shouldTrim)
+        {
+            boolean isUnicode = (styp == HpccSrcType.UTF16BE || styp == HpccSrcType.UTF16LE);
+            trimStringInScratchBuffer(isUnicode, strRange);
+        }
+
+        strByteLen = strRange[1] - strRange[0];
+        if (strByteLen == 0 && convertEmptyStringsToNull)
+        {
+            return null;
+        }
+
+        return new String(scratchBuffer,strRange[0],strByteLen,charset);
     }
 
     /**
diff --git a/dfsclient/src/main/java/org/hpccsystems/dfs/client/BinaryRecordWriter.java b/dfsclient/src/main/java/org/hpccsystems/dfs/client/BinaryRecordWriter.java
@@ -448,7 +448,7 @@ else if (fd.getDataLen() == 8)
             case VAR_STRING:
             case STRING:
             {
-                String value = fieldValue != null?(String) fieldValue:"";
+                String value = fieldValue != null ? (String) fieldValue : "";
                 byte[] data = new byte[0];
                 if (fd.getSourceType() == HpccSrcType.UTF16LE)
                 {
diff --git a/dfsclient/src/test/java/org/hpccsystems/dfs/client/DFSReadWriteTest.java b/dfsclient/src/test/java/org/hpccsystems/dfs/client/DFSReadWriteTest.java

Original file line number	Diff line number	Diff line change
`@@ -448,7 +448,7 @@ else if (fd.getDataLen() == 8)`
`448`	`448`	`case VAR_STRING:`
`449`	`449`	`case STRING:`
`450`	`450`	`{`
`451`		`- String value = fieldValue != null?(String) fieldValue:"";`
	`451`	`+ String value = fieldValue != null ? (String) fieldValue : "";`
`452`	`452`	`byte[] data = new byte[0];`
`453`	`453`	`if (fd.getSourceType() == HpccSrcType.UTF16LE)`
`454`	`454`	`{`