apache · daniel-adam-tfs · Jan 29, 2026 · Feb 2, 2026 · Feb 4, 2026 · Feb 9, 2026
@@ -30,7 +30,7 @@ import (
 // encodeByteStreamSplit encodes the raw bytes provided by 'in' into the output buffer 'data' using BYTE_STREAM_SPLIT encoding.
 // 'data' must have space for at least len(in) bytes.
 func encodeByteStreamSplit(data []byte, in []byte, width int) {
-	debug.Assert(len(data) >= len(in), fmt.Sprintf("not enough space in destination buffer for encoding, dest: %d bytes, src: %d bytes", len(data), len(in)))
+	debug.Assert(len(data) >= len(in), "not enough space in destination buffer for encoding")
 	numElements := len(in) / width
 	for stream := 0; stream < width; stream++ {
 		for element := 0; element < numElements; element++ {
@@ -44,7 +44,7 @@ func encodeByteStreamSplit(data []byte, in []byte, width int) {
 // encodeByteStreamSplitWidth2 implements encodeByteStreamSplit optimized for types stored using 2 bytes.
 // 'data' must have space for at least len(in) bytes.
 func encodeByteStreamSplitWidth2(data []byte, in []byte) {
-	debug.Assert(len(data) >= len(in), fmt.Sprintf("not enough space in destination buffer for encoding, dest: %d bytes, src: %d bytes", len(data), len(in)))
+	debug.Assert(len(data) >= len(in), "not enough space in destination buffer for encoding")
 	const width = 2
 	numElements := len(in) / width
 	for element := 0; element < numElements; element++ {
@@ -57,7 +57,7 @@ func encodeByteStreamSplitWidth2(data []byte, in []byte) {
 // encodeByteStreamSplitWidth4 implements encodeByteStreamSplit optimized for types stored using 4 bytes.
 // 'data' must have space for at least len(in) bytes.
 func encodeByteStreamSplitWidth4(data []byte, in []byte) {
-	debug.Assert(len(data) >= len(in), fmt.Sprintf("not enough space in destination buffer for encoding, dest: %d bytes, src: %d bytes", len(data), len(in)))
+	debug.Assert(len(data) >= len(in), "not enough space in destination buffer for encoding")
 	const width = 4
 	numElements := len(in) / width
 	for element := 0; element < numElements; element++ {
@@ -72,7 +72,7 @@ func encodeByteStreamSplitWidth4(data []byte, in []byte) {
 // encodeByteStreamSplitWidth8 implements encodeByteStreamSplit optimized for types stored using 8 bytes.
 // 'data' must have space for at least len(in) bytes.
 func encodeByteStreamSplitWidth8(data []byte, in []byte) {
-	debug.Assert(len(data) >= len(in), fmt.Sprintf("not enough space in destination buffer for encoding, dest: %d bytes, src: %d bytes", len(data), len(in)))
+	debug.Assert(len(data) >= len(in), "not enough space in destination buffer for encoding")
 	const width = 8
 	numElements := len(in) / width
 	for element := 0; element < numElements; element++ {
@@ -88,60 +88,6 @@ func encodeByteStreamSplitWidth8(data []byte, in []byte) {
 	}
 }
 
-// decodeByteStreamSplitBatchFLBA decodes the batch of nValues FixedLenByteArrays provided by 'data',
-// into the output slice 'out' using BYTE_STREAM_SPLIT encoding.
-// 'out' must have space for at least nValues slices.
-func decodeByteStreamSplitBatchFLBA(data []byte, nValues, stride, width int, out []parquet.FixedLenByteArray) {
-	debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues))
-	for stream := 0; stream < width; stream++ {
-		for element := 0; element < nValues; element++ {
-			encLoc := stride*stream + element
-			out[element][stream] = data[encLoc]
-		}
-	}
-}
-
-// decodeByteStreamSplitBatchFLBAWidth2 decodes the batch of nValues FixedLenByteArrays of length 2 provided by 'data',
-// into the output slice 'out' using BYTE_STREAM_SPLIT encoding.
-// 'out' must have space for at least nValues slices.
-func decodeByteStreamSplitBatchFLBAWidth2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) {
-	debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues))
-	for element := 0; element < nValues; element++ {
-		out[element][0] = data[element]
-		out[element][1] = data[stride+element]
-	}
-}
-
-// decodeByteStreamSplitBatchFLBAWidth4 decodes the batch of nValues FixedLenByteArrays of length 4 provided by 'data',
-// into the output slice 'out' using BYTE_STREAM_SPLIT encoding.
-// 'out' must have space for at least nValues slices.
-func decodeByteStreamSplitBatchFLBAWidth4(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) {
-	debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues))
-	for element := 0; element < nValues; element++ {
-		out[element][0] = data[element]
-		out[element][1] = data[stride+element]
-		out[element][2] = data[stride*2+element]
-		out[element][3] = data[stride*3+element]
-	}
-}
-
-// decodeByteStreamSplitBatchFLBAWidth8 decodes the batch of nValues FixedLenByteArrays of length 8 provided by 'data',
-// into the output slice 'out' using BYTE_STREAM_SPLIT encoding.
-// 'out' must have space for at least nValues slices.
-func decodeByteStreamSplitBatchFLBAWidth8(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) {
-	debug.Assert(len(out) >= nValues, fmt.Sprintf("not enough space in output slice for decoding, out: %d values, data: %d values", len(out), nValues))
-	for element := 0; element < nValues; element++ {
-		out[element][0] = data[element]
-		out[element][1] = data[stride+element]
-		out[element][2] = data[stride*2+element]
-		out[element][3] = data[stride*3+element]
-		out[element][4] = data[stride*4+element]
-		out[element][5] = data[stride*5+element]
-		out[element][6] = data[stride*6+element]
-		out[element][7] = data[stride*7+element]
-	}
-}
-
 func releaseBufferToPool(pooled *PooledBufferWriter) {
 	buf := pooled.buf
 	memory.Set(buf.Buf(), 0)

@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package encoding
+
+import (
+	"unsafe"
+
+	"github.com/apache/arrow-go/v18/parquet/internal/debug"
+	"golang.org/x/sys/cpu"
+)
+
+func init() {
+	if cpu.X86.HasAVX2 {
+		decodeByteStreamSplitBatchWidth4InByteOrder = decodeByteStreamSplitBatchWidth4AVX2
+		decodeByteStreamSplitBatchWidth8InByteOrder = decodeByteStreamSplitBatchWidth8AVX2
+	}
+}
+
+//go:noescape
+func _decodeByteStreamSplitWidth4AVX2(data, out unsafe.Pointer, nValues, stride int)
+
+//go:noescape
+func _decodeByteStreamSplitWidth8AVX2(data, out unsafe.Pointer, nValues, stride int)
+
+func decodeByteStreamSplitBatchWidth4AVX2(data []byte, nValues, stride int, out []byte) {
+	if nValues == 0 {
+		return
+	}
+	const width = 4
+	debug.Assert(len(out) >= nValues*width, "not enough space in output buffer for decoding")
+	debug.Assert(len(data) >= 3*stride+nValues, "not enough data for decoding")
+	_decodeByteStreamSplitWidth4AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride)
+}
+
+func decodeByteStreamSplitBatchWidth8AVX2(data []byte, nValues, stride int, out []byte) {
+	if nValues == 0 {
+		return
+	}
+	const width = 8
+	debug.Assert(len(out) >= nValues*width, "not enough space in output buffer for decoding")
+	debug.Assert(len(data) >= 7*stride+nValues, "not enough data for decoding")
+	_decodeByteStreamSplitWidth8AVX2(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride)
+}
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//go:build !noasm
+
+package encoding
+
+import (
+	"unsafe"
+
+	"github.com/apache/arrow-go/v18/parquet/internal/debug"
+	"golang.org/x/sys/cpu"
+)
+
+func init() {
+	if cpu.ARM64.HasASIMD {
+		decodeByteStreamSplitBatchWidth4InByteOrder = decodeByteStreamSplitBatchWidth4NEON
+		decodeByteStreamSplitBatchWidth8InByteOrder = decodeByteStreamSplitBatchWidth8NEON
+	}
+}
+
+//go:noescape
+func _decodeByteStreamSplitWidth4NEON(data, out unsafe.Pointer, nValues, stride int)
+
+//go:noescape
+func _decodeByteStreamSplitWidth8NEON(data, out unsafe.Pointer, nValues, stride int)
+
+func decodeByteStreamSplitBatchWidth4NEON(data []byte, nValues, stride int, out []byte) {
+	if nValues == 0 {
+		return
+	}
+	const width = 4
+	debug.Assert(len(out) >= nValues*width, "not enough space in output buffer for decoding")
+	debug.Assert(len(data) >= 3*stride+nValues, "not enough data for decoding")
+	_decodeByteStreamSplitWidth4NEON(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride)
+}
+
+func decodeByteStreamSplitBatchWidth8NEON(data []byte, nValues, stride int, out []byte) {
+	if nValues == 0 {
+		return
+	}
+	const width = 8
+	debug.Assert(len(out) >= nValues*width, "not enough space in output buffer for decoding")
+	debug.Assert(len(data) >= 7*stride+nValues, "not enough data for decoding")
+	_decodeByteStreamSplitWidth8NEON(unsafe.Pointer(&data[0]), unsafe.Pointer(&out[0]), nValues, stride)
+}
@@ -19,43 +19,98 @@
 package encoding
 
 import (
-	"fmt"
+	"unsafe"
 
+	"github.com/apache/arrow-go/v18/parquet"
 	"github.com/apache/arrow-go/v18/parquet/internal/debug"
 )
 
 // decodeByteStreamSplitBatchWidth4InByteOrder decodes the batch of nValues raw bytes representing a 4-byte datatype provided
-// by 'data', into the output buffer 'out' using BYTE_STREAM_SPLIT encoding. The values are expected to be in little-endian
+// by 'data', into the output buffer 'out' using BYTE_STREAM_SPLIT encoding. The values are expected to be in big-endian
 // byte order and are decoded into the 'out' array in the machine's native endianness.
 // 'out' must have space for at least len(data) bytes.
-func decodeByteStreamSplitBatchWidth4InByteOrder(data []byte, nValues, stride int, out []byte) {
+func decodeByteStreamSplitBatchWidth4InByteOrderDefault(data []byte, nValues, stride int, out []byte) {
 	const width = 4
-	debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data)))
-	for element := 0; element < nValues; element++ {
-		// Big Endian: most significant byte first
-		out[width*element+0] = data[3*stride+element]
-		out[width*element+1] = data[2*stride+element]
-		out[width*element+2] = data[stride+element]
-		out[width*element+3] = data[element]
+	debug.Assert(len(out) >= nValues*width, "not enough space in output buffer for decoding")
+	// the beginning of the data slice can be truncated, but for valid encoding we need at least (width-1)*stride+nValues bytes
+	debug.Assert(len(data) >= 3*stride+nValues, "not enough data for decoding")
+	s0 := data[:nValues]
+	s1 := data[stride : stride+nValues]
+	s2 := data[2*stride : 2*stride+nValues]
+	s3 := data[3*stride : 3*stride+nValues]
+	out = out[:width*nValues]
+	out32 := unsafe.Slice((*uint32)(unsafe.Pointer(&out[0])), nValues)
+	for i := range nValues {
+		// s3 supplies the most significant byte and s0 the least significant; the uint32 store lays them out in native byte order
+		out32[i] = uint32(s3[i])<<24 | uint32(s2[i])<<16 | uint32(s1[i])<<8 | uint32(s0[i])
 	}
 }
 
 // decodeByteStreamSplitBatchWidth8InByteOrder decodes the batch of nValues raw bytes representing a 8-byte datatype provided
-// by 'data', into the output buffer 'out' using BYTE_STREAM_SPLIT encoding. The values are expected to be in little-endian
+// by 'data', into the output buffer 'out' using BYTE_STREAM_SPLIT encoding. The values are expected to be in big-endian
 // byte order and are decoded into the 'out' array in the machine's native endianness.
 // 'out' must have space for at least len(data) bytes.
-func decodeByteStreamSplitBatchWidth8InByteOrder(data []byte, nValues, stride int, out []byte) {
+func decodeByteStreamSplitBatchWidth8InByteOrderDefault(data []byte, nValues, stride int, out []byte) {
 	const width = 8
-	debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space in output buffer for decoding, out: %d bytes, data: %d bytes", len(out), len(data)))
+	debug.Assert(len(out) >= nValues*width, "not enough space in output buffer for decoding")
+	debug.Assert(len(data) >= 7*stride+nValues, "not enough data for decoding")
+	s0 := data[:nValues]
+	s1 := data[stride : stride+nValues]
+	s2 := data[2*stride : 2*stride+nValues]
+	s3 := data[3*stride : 3*stride+nValues]
+	s4 := data[4*stride : 4*stride+nValues]
+	s5 := data[5*stride : 5*stride+nValues]
+	s6 := data[6*stride : 6*stride+nValues]
+	s7 := data[7*stride : 7*stride+nValues]
+	out = out[:width*nValues]
+	out64 := unsafe.Slice((*uint64)(unsafe.Pointer(&out[0])), nValues)
+	for i := range nValues {
+		// s7 supplies the most significant byte and s0 the least significant; the uint64 store lays them out in native byte order
+		out64[i] = uint64(s7[i])<<56 | uint64(s6[i])<<48 | uint64(s5[i])<<40 | uint64(s4[i])<<32 |
+			uint64(s3[i])<<24 | uint64(s2[i])<<16 | uint64(s1[i])<<8 | uint64(s0[i])
+	}
+}
+
+// decodeByteStreamSplitBatchFLBAWidth2 decodes the batch of nValues FixedLenByteArrays of length 2 provided by 'data',
+// into the output slice 'out' using BYTE_STREAM_SPLIT encoding.
+// 'out' must have space for at least nValues slices.
+func decodeByteStreamSplitBatchFLBAWidth2(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) {
+	debug.Assert(len(out) >= nValues, "not enough space in output slice for decoding")
+	debug.Assert(len(data) >= stride+nValues, "not enough data for decoding")
+	for element := 0; element < nValues; element++ {
+		out[element][0] = data[element]
+		out[element][1] = data[stride+element]
+	}
+}
+
+// decodeByteStreamSplitBatchFLBAWidth4 decodes the batch of nValues FixedLenByteArrays of length 4 provided by 'data',
+// into the output slice 'out' using BYTE_STREAM_SPLIT encoding.
+// 'out' must have space for at least nValues slices.
+func decodeByteStreamSplitBatchFLBAWidth4(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) {
+	debug.Assert(len(out) >= nValues, "not enough space in output slice for decoding")
+	debug.Assert(len(data) >= 3*stride+nValues, "not enough data for decoding")
+	for element := 0; element < nValues; element++ {
+		out[element][0] = data[element]
+		out[element][1] = data[stride+element]
+		out[element][2] = data[stride*2+element]
+		out[element][3] = data[stride*3+element]
+	}
+}
+
+// decodeByteStreamSplitBatchFLBAWidth8 decodes the batch of nValues FixedLenByteArrays of length 8 provided by 'data',
+// into the output slice 'out' using BYTE_STREAM_SPLIT encoding.
+// 'out' must have space for at least nValues slices.
+func decodeByteStreamSplitBatchFLBAWidth8(data []byte, nValues, stride int, out []parquet.FixedLenByteArray) {
+	debug.Assert(len(out) >= nValues, "not enough space in output slice for decoding")
+	debug.Assert(len(data) >= 7*stride+nValues, "not enough data for decoding")
 	for element := 0; element < nValues; element++ {
-		// Big Endian: most significant byte first
-		out[width*element+0] = data[7*stride+element]
-		out[width*element+1] = data[6*stride+element]
-		out[width*element+2] = data[5*stride+element]
-		out[width*element+3] = data[4*stride+element]
-		out[width*element+4] = data[3*stride+element]
-		out[width*element+5] = data[2*stride+element]
-		out[width*element+6] = data[stride+element]
-		out[width*element+7] = data[element]
+		out[element][0] = data[element]
+		out[element][1] = data[stride+element]
+		out[element][2] = data[stride*2+element]
+		out[element][3] = data[stride*3+element]
+		out[element][4] = data[stride*4+element]
+		out[element][5] = data[stride*5+element]
+		out[element][6] = data[stride*6+element]
+		out[element][7] = data[stride*7+element]
 	}
 }
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package encoding
+
+import (
+	"github.com/apache/arrow-go/v18/parquet"
+	"github.com/apache/arrow-go/v18/parquet/internal/debug"
+)
+
+var (
+	decodeByteStreamSplitBatchWidth4InByteOrder func(data []byte, nValues, stride int, out []byte) = decodeByteStreamSplitBatchWidth4InByteOrderDefault
+	decodeByteStreamSplitBatchWidth8InByteOrder func(data []byte, nValues, stride int, out []byte) = decodeByteStreamSplitBatchWidth8InByteOrderDefault
+)
+
+// decodeByteStreamSplitBatchFLBA decodes the batch of nValues FixedLenByteArrays provided by 'data',
+// into the output slice 'out' using BYTE_STREAM_SPLIT encoding.
+// 'out' must have space for at least nValues slices.
+func decodeByteStreamSplitBatchFLBA(data []byte, nValues, stride, width int, out []parquet.FixedLenByteArray) {
+	debug.Assert(len(out) >= nValues, "not enough space in output slice for decoding")
+	debug.Assert(len(data) >= (width-1)*stride+nValues, "not enough data for decoding")
+	for stream := 0; stream < width; stream++ {
+		for element := 0; element < nValues; element++ {
+			encLoc := stride*stream + element
+			out[element][stream] = data[encLoc]
+		}
+	}
+}