Skip to content

Commit 4aa4e38

Browse files
Vectorize Softmax Phoenix and Strix (Xilinx#2139)
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent e6ffc46 commit 4aa4e38

File tree

6 files changed

+176
-87
lines changed

6 files changed

+176
-87
lines changed

aie_kernels/aie2/softmax.cc

+75
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
//===- softmax.cc --------------------------------------------*- C++
2+
//-*-===//
3+
//
4+
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
//
8+
// Copyright (C) 2025, Advanced Micro Devices, Inc.
9+
//
10+
//===-------------------------------------------------- --------===//
11+
12+
#include <aie_api/aie.hpp>
13+
#include <lut_based_ops.h>
14+
#include <stdint.h>
15+
16+
using namespace aie;
17+
18+
/// Two-pass softmax over a bfloat16 buffer:
///   output[i] = exp(input[i]) / sum_j exp(input[j])
///
/// Pass 1 reads the input one 16-lane vector at a time, exponentiates it,
/// stores the un-normalised exponentials into `output_vector`, and keeps a
/// per-lane running sum in a float accumulator.
/// Pass 2 re-reads `output_vector` and multiplies every vector by the
/// reciprocal of the total sum.
///
/// @param input_vector   Source values.
/// @param output_vector  Destination buffer; it also holds the intermediate
///                       exponentials between the two passes.
/// @param vector_size    Number of elements. Only `vector_size / 16` whole
///                       vectors are processed; any remaining tail elements
///                       are neither read nor written.
///
/// The inputs are exponentiated as-is: no maximum is subtracted first.
void softmax_simple_bf16(bfloat16 *restrict input_vector,
                         bfloat16 *restrict output_vector,
                         const int32_t vector_size) {
  // NOTE(review): event0()/event1() presumably bracket the kernel for
  // tracing/profiling -- confirm against the AIE intrinsics documentation.
  event0();

  // Lane width of every vector in this kernel. It is tied to the 16-lane
  // helper to_v16bfloat16() used below, so it is not freely tunable here.
  constexpr int kLanes = 16;
  const int num_vectors = vector_size / kLanes;

  auto it_exp_in = aie::cbegin_vector<kLanes>((bfloat16 *)input_vector);
  auto it_exp_out = aie::begin_vector<kLanes>((bfloat16 *)output_vector);
  // NOTE(review): the next two iterators are restrict-qualified views of the
  // same buffer that it_exp_out writes to -- confirm this aliasing is
  // acceptable for the target toolchain.
  auto it_scale = aie::cbegin_restrict_vector<kLanes>((bfloat16 *)output_vector);
  auto it_soft_out =
      aie::begin_restrict_vector<kLanes>((bfloat16 *)output_vector);

  /////////////////////
  //// Compute exp ////
  /////////////////////
  aie::accum<accfloat, kLanes> exp_val_accum;
  exp_val_accum = aie::zeros<accfloat, kLanes>();
  for (int i = 0; i < num_vectors; i++) {
    aie::vector<bfloat16, kLanes> input_bf16 = *it_exp_in++;
    // getExpBf16 is presumably the exp routine from lut_based_ops.h; its
    // result is converted back to a 16-lane bfloat16 vector.
    aie::vector<bfloat16, kLanes> exp_val =
        to_v16bfloat16(getExpBf16(input_bf16));
    exp_val_accum = add(exp_val_accum, exp_val);
    *it_exp_out++ = exp_val;
  }

  // Collapse the per-lane partial sums into the scalar denominator.
  aie::vector<float, kLanes> lane_sums = exp_val_accum.to_vector<float>();
  float accum_exp_val = aie::reduce_add(lane_sums);

  ///////////////////////////
  //// Normalise by 1/sum ////
  ///////////////////////////
  // The reciprocal is computed once and narrowed to bfloat16 so the second
  // pass is a plain vector-by-scalar multiply.
  bfloat16 col_sum_inv = (bfloat16)aie::inv(accum_exp_val);
  for (int c = 0; c < num_vectors; c++) {
    aie::vector<bfloat16, kLanes> in_elems = *it_scale++;
    aie::accum<accfloat, kLanes> out_vals = aie::mul(in_elems, col_sum_inv);
    *it_soft_out++ = out_vals.to_vector<bfloat16>();
  }

  event1();
}
67+
68+
extern "C" {
69+
70+
void softmax_bf16(bfloat16 *restrict input, bfloat16 *restrict output,
71+
const int32_t input_size) {
72+
softmax_simple_bf16(input, output, input_size);
73+
}
74+
75+
} // extern "C"

aie_kernels/aie2p/softmax.cc

+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
//===- softmax.cc --------------------------------------------*- C++
2+
//-*-===//
3+
//
4+
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
//
8+
// Copyright (C) 2025, Advanced Micro Devices, Inc.
9+
//
10+
//===-------------------------------------------------- --------===//
11+
12+
#include <aie_api/aie.hpp>
13+
#include <stdint.h>
14+
15+
// Lane count of the vectors and accumulators used in the exp pass of
// softmax_simple_bf16 below.
// NOTE(review): the trailing "32" looks like an alternative width that was
// considered -- confirm before changing, since parts of the kernel still
// hard-code 16.
#define SM_VEC_LEN 16 // 32
16+
// Scale factor applied to the input before aie::exp2, so that
// 2^(x * log2e) approximates e^x.
// 1.4453125 (= 185/128) stands in for the more precise 1.44269504089 noted
// alongside; presumably chosen because it is exactly representable in
// bfloat16 -- confirm.
#define log2e 1.4453125 // 1.44269504089
17+
18+
using namespace aie;
19+
20+
/// Two-pass softmax over a bfloat16 buffer:
///   output[i] = exp(input[i]) / sum_j exp(input[j])
///
/// Pass 1 reads the input one SM_VEC_LEN-lane vector at a time, computes
/// e^x as 2^(x * log2e) with aie::exp2, stores the un-normalised
/// exponentials into `output_vector`, and keeps a per-lane running sum in a
/// float accumulator.
/// Pass 2 re-reads `output_vector` and multiplies every vector by the
/// reciprocal of the total sum.
///
/// @param input_vector   Source values.
/// @param output_vector  Destination buffer; it also holds the intermediate
///                       exponentials between the two passes.
/// @param vector_size    Number of elements. Only `vector_size / SM_VEC_LEN`
///                       whole vectors are processed; any remaining tail
///                       elements are neither read nor written.
///
/// The inputs are exponentiated as-is: no maximum is subtracted first.
void softmax_simple_bf16(bfloat16 *restrict input_vector,
                         bfloat16 *restrict output_vector,
                         const int32_t vector_size) {
  // NOTE(review): event0()/event1() presumably bracket the kernel for
  // tracing/profiling -- confirm against the AIE intrinsics documentation.
  event0();

  // Every vector, accumulator and iterator below uses SM_VEC_LEN lanes so
  // that the width can be changed in one place.
  const int num_vectors = vector_size / SM_VEC_LEN;

  auto it_exp_in = aie::cbegin_vector<SM_VEC_LEN>((bfloat16 *)input_vector);
  auto it_exp_out = aie::begin_vector<SM_VEC_LEN>((bfloat16 *)output_vector);
  // NOTE(review): the next two iterators are restrict-qualified views of the
  // same buffer that it_exp_out writes to -- confirm this aliasing is
  // acceptable for the target toolchain.
  auto it_scale =
      aie::cbegin_restrict_vector<SM_VEC_LEN>((bfloat16 *)output_vector);
  auto it_soft_out =
      aie::begin_restrict_vector<SM_VEC_LEN>((bfloat16 *)output_vector);

  /////////////////////
  //// Compute exp ////
  /////////////////////
  aie::vector<bfloat16, SM_VEC_LEN> log2e_vec =
      aie::broadcast<bfloat16, SM_VEC_LEN>(log2e);

  aie::accum<accfloat, SM_VEC_LEN> exp_val_accum;
  exp_val_accum = aie::zeros<accfloat, SM_VEC_LEN>();
  for (int i = 0; i < num_vectors; i++) {
    aie::vector<bfloat16, SM_VEC_LEN> input_bf16 = *it_exp_in++;
    // Change of base: e^x == 2^(x * log2(e)).
    aie::accum<accfloat, SM_VEC_LEN> exp_in;
    exp_in = aie::mul(input_bf16, log2e_vec);
    aie::vector<bfloat16, SM_VEC_LEN> exp_val =
        aie::exp2<bfloat16>(exp_in.to_vector<float>());
    exp_val_accum = add(exp_val_accum, exp_val);
    *it_exp_out++ = exp_val;
  }

  // Collapse the per-lane partial sums into the scalar denominator.
  aie::vector<float, SM_VEC_LEN> lane_sums = exp_val_accum.to_vector<float>();
  float accum_exp_val = aie::reduce_add(lane_sums);

  ///////////////////////////
  //// Normalise by 1/sum ////
  ///////////////////////////
  // The reciprocal is computed once and narrowed to bfloat16 so the second
  // pass is a plain vector-by-scalar multiply.
  bfloat16 col_sum_inv = (bfloat16)aie::inv(accum_exp_val);
  for (int c = 0; c < num_vectors; c++) {
    aie::vector<bfloat16, SM_VEC_LEN> in_elems = *it_scale++;
    aie::accum<accfloat, SM_VEC_LEN> out_vals =
        aie::mul(in_elems, col_sum_inv);
    *it_soft_out++ = out_vals.to_vector<bfloat16>();
  }

  event1();
}
73+
74+
extern "C" {
75+
76+
void softmax_bf16(bfloat16 *restrict input, bfloat16 *restrict output,
77+
const int32_t input_size) {
78+
softmax_simple_bf16(input, output, input_size);
79+
}
80+
81+
} // extern "C"

programming_examples/ml/softmax/Makefile

+19-21
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,16 @@ srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
1212

1313
include ${srcdir}/../../makefile-common
1414

15-
VPATH := ${srcdir}/../../../aie_kernels/aie2
15+
aie2_runtime_dir = ${AIEOPT_DIR}/aie_runtime_lib/AIE2
1616

1717
device ?= $(if $(filter 1,$(NPU2)),npu2,npu)
18+
19+
ifeq ($(device),npu2)
20+
VPATH :=${srcdir}/../../../aie_kernels/aie2p
21+
else
22+
VPATH :=${srcdir}/../../../aie_kernels/aie2
23+
endif
24+
1825
targetname = softmax
1926
trace_size = 8192
2027

@@ -27,42 +34,33 @@ endif
2734

2835
all: build/final.xclbin build/insts.txt
2936

30-
build/dut.cc: ${srcdir}/bf16_softmax.mlir
31-
mkdir -p ${@D}
32-
cd ${@D} && aie-opt $< -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aie2" -lower-affine | aie-translate -aie2=true --aievec-to-cpp -o ${@F}
33-
34-
build/dut.o: build/dut.cc
3537
ifeq ($(device),npu)
36-
cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -I../../../../aie_runtime_lib/AIE2 -c ${<F} -o ${@F}
37-
else ifeq ($(device),npu2)
38-
cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2P_FLAGS} -I../../../../aie_runtime_lib/AIE2P -c ${<F} -o ${@F}
39-
else
40-
echo "Device type not supported"
38+
build/lut_based_ops.o: ${aie2_runtime_dir}/lut_based_ops.cpp
39+
mkdir -p ${@D}
40+
cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -I. -c $< -o ${@F}
4141
endif
4242

43-
build/lut_based_ops.o: ../../../aie_runtime_lib/AIE2/lut_based_ops.cpp
43+
44+
build/softmax.o: ${VPATH}/softmax.cc
4445
mkdir -p ${@D}
4546
ifeq ($(device),npu)
46-
cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -I. -c $(<:%=../%) -o ${@F}
47+
cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -I. -I${aie2_runtime_dir} -c $< -o ${@F}
4748
else ifeq ($(device),npu2)
48-
cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2P_FLAGS} -I. -c $(<:%=../%) -o ${@F}
49+
cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2P_FLAGS} -c $< -o ${@F}
4950
else
5051
echo "Device type not supported"
5152
endif
5253

53-
build/softmax.o: bf16_softmax.cc
54-
mkdir -p ${@D}
5554
ifeq ($(device),npu)
56-
cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2 -c $< -o ${@F}
55+
build/kernels.a: build/softmax.o build/lut_based_ops.o
56+
ar rvs $@ $+
5757
else ifeq ($(device),npu2)
58-
cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2P_FLAGS} -I. -I../../../../aie_runtime_lib/AIE2P -c $< -o ${@F}
58+
build/kernels.a: build/softmax.o
59+
ar rvs $@ $+
5960
else
6061
echo "Device type not supported"
6162
endif
6263

63-
build/kernels.a: build/softmax.o
64-
ar rvs $@ $+
65-
6664
build/aie.mlir: ${srcdir}/${aie_py_src}
6765
mkdir -p ${@D}
6866
python3 $< ${device} > $@

programming_examples/ml/softmax/bf16_softmax.mlir

-34
This file was deleted.

programming_examples/ml/softmax/softmax.cc

-31
This file was deleted.

programming_examples/ml/softmax/test.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ int main(int argc, const char *argv[]) {
9393

9494
size_t OUT_SIZE = INOUT1_SIZE + trace_size;
9595

96-
srand(time(NULL));
96+
srand(42);
9797

9898
// Load instruction sequence
9999
std::vector<uint32_t> instr_v =

0 commit comments

Comments
 (0)