Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 78 additions & 16 deletions src/CodeGen_ARM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "IROperator.h"
#include "IRPrinter.h"
#include "LLVM_Headers.h"
#include "OptimizeShuffles.h"
#include "Simplify.h"
#include "Substitute.h"
#include "Util.h"
Expand Down Expand Up @@ -227,6 +228,7 @@ class CodeGen_ARM : public CodeGen_Posix {
Value *interleave_vectors(const std::vector<Value *> &) override;
Value *shuffle_vectors(Value *a, Value *b, const std::vector<int> &indices) override;
Value *shuffle_scalable_vectors_general(Value *a, Value *b, const std::vector<int> &indices);
Value *shuffle_scalable_vectors_general_llvm(Value *a, Value *b, Value *indices, int min_index, int max_index);
Value *codegen_shuffle_indices(int bits, const std::vector<int> &indices);
Value *codegen_whilelt(int total_lanes, int start, int end);
void codegen_vector_reduce(const VectorReduce *, const Expr &) override;
Expand Down Expand Up @@ -1223,6 +1225,22 @@ void CodeGen_ARM::compile_func(const LoweredFunc &f,
// and a - (b << c) into umlsl/smlsl.
func.body = distribute_shifts(func.body, /* multiply_adds */ true);

if (target_vscale() > 0) {
debug(1) << "ARM: Optimizing shuffles...\n";
const int lut_alignment = 16;

// Candidate LUT spans, in lanes, for a given LUT element type, listed in
// order of preference.
auto max_span_query = [&](const Type &lut_type) -> std::vector<int> {
    // SVE2 exposes two table lookups as LLVM intrinsics: TBL (one source
    // vector) and TBL2 (two source vectors). The single-vector span comes
    // first because TBL is preferred for performance.
    const int lanes_per_vector = natural_vector_size(lut_type);
    const int single_vector_span = lanes_per_vector;
    const int double_vector_span = lanes_per_vector * 2;
    return {single_vector_span, double_vector_span};
};

func.body = optimize_shuffles(func.body, lut_alignment, native_vector_bits(), max_span_query, true);
debug(2) << "ARM: Lowering after optimizing shuffles:\n"
<< func.body << "\n\n";
}

CodeGen_Posix::compile_func(func, simple_name, extern_name);
}

Expand Down Expand Up @@ -2250,7 +2268,7 @@ Value *CodeGen_ARM::shuffle_vectors(Value *a, Value *b, const std::vector<int> &
}

// Perform vector shuffle by decomposing the operation to multiple native shuffle steps
// which calls shuffle_scalable_vectors_general() which emits TBL/TBL2 instruction
// which calls shuffle_scalable_vectors_general() which emits TBL/TBL2 LLVM intrinsic.
DecomposeVectorShuffle shuffler(*this, a, b, get_vector_num_elements(a->getType()), natural_lanes);
return shuffler.run(indices);
}
Expand All @@ -2259,41 +2277,50 @@ Value *CodeGen_ARM::shuffle_scalable_vectors_general(Value *a, Value *b, const s
internal_assert(a) << "Must provide a valid vector operand";
internal_assert(!indices.empty()) << "Cannot shuffle with empty indices";

llvm::Type *elt = get_vector_element_type(a->getType());
Value *val_indices = codegen_shuffle_indices(elt->getScalarSizeInBits(), indices);
auto [min_itr, max_itr] = std::minmax_element(indices.begin(), indices.end());
int highest_lane = *max_itr;
internal_assert(highest_lane >= 0)
<< "highest_lane was "
<< (highest_lane == SliceIndexNone ? "SliceIndexNone" :
highest_lane == SliceIndexCarryPrevResult ? "SliceIndexCarryPrevResult" :
"")
<< " (" << highest_lane << ")";

return shuffle_scalable_vectors_general_llvm(a, b, val_indices, *min_itr, *max_itr);
}

Value *CodeGen_ARM::shuffle_scalable_vectors_general_llvm(Value *a, Value *b, Value *indices, int min_index, int max_index) {
internal_assert(a) << "Must provide a valid vector operand";
internal_assert(indices) << "Must provide a valid indices";

llvm::Type *elt = get_vector_element_type(a->getType());
const int bits = elt->getScalarSizeInBits();
const int natural_lanes = natural_vector_size(Int(bits));
const int src_lanes = get_vector_num_elements(a->getType());
const int dst_lanes = indices.size();
const int dst_lanes = get_vector_num_elements(indices->getType());
llvm::Type *dst_type = get_vector_type(elt, dst_lanes);

internal_assert(target_vscale() > 0 && is_scalable_vector(a)) << "Only deal with scalable vectors\n";
internal_assert(src_lanes == natural_lanes && dst_lanes == natural_lanes)
<< "Only deal with vector with natural_lanes\n";

// We select TBL or TBL2 intrinsic depending on indices range
int highest_lane = *std::max_element(indices.begin(), indices.end());
internal_assert(highest_lane >= 0)
<< "highest_lane was "
<< (highest_lane == SliceIndexNone ? "SliceIndexNone" :
highest_lane == SliceIndexCarryPrevResult ? "SliceIndexCarryPrevResult" :
"")
<< " (" << highest_lane << ")";

bool use_tbl = highest_lane < src_lanes;
const bool use_tbl = max_index < src_lanes;
internal_assert(use_tbl || b) << "'b' must be valid in case of tbl2\n";

auto instr = concat_strings("llvm.aarch64.sve.", use_tbl ? "tbl" : "tbl2", mangle_llvm_type(dst_type));

Value *val_indices = codegen_shuffle_indices(bits, indices);
llvm::Type *vt_natural = get_vector_type(elt, natural_lanes);
std::vector<llvm::Type *> llvm_arg_types;
std::vector<llvm::Value *> llvm_arg_vals;
if (use_tbl) {
llvm_arg_types = {vt_natural, val_indices->getType()};
llvm_arg_vals = {a, val_indices};
llvm_arg_types = {vt_natural, indices->getType()};
llvm_arg_vals = {a, indices};
} else {
llvm_arg_types = {vt_natural, vt_natural, val_indices->getType()};
llvm_arg_vals = {a, b, val_indices};
llvm_arg_types = {vt_natural, vt_natural, indices->getType()};
llvm_arg_vals = {a, b, indices};
}
llvm::FunctionType *fn_type = FunctionType::get(vt_natural, llvm_arg_types, false);
FunctionCallee fn = module->getOrInsertFunction(instr, fn_type);
Expand Down Expand Up @@ -2383,6 +2410,41 @@ void CodeGen_ARM::visit(const Call *op) {
value = codegen(lower_round_to_nearest_ties_to_even(op->args[0]));
return;
}
} else if (op->is_intrinsic(Call::dynamic_shuffle)) {
internal_assert(target_vscale() > 0);
internal_assert(op->args.size() == 4);
const auto min_index = as_const_int(op->args[2]);
const auto max_index = as_const_int(op->args[3]);
internal_assert(min_index.has_value() && max_index.has_value());

Type lut_type = op->args[0].type();
const int src_lanes = lut_type.lanes();
const int dst_lanes = op->args[1].type().lanes();
const int natural_lanes = natural_vector_size(lut_type);

debug(3) << "dynamic_shuffle: [" << *min_index << ", " << *max_index << "]"
<< ", natural_lanes:" << natural_lanes << ", src_lanes:" << src_lanes << "\n";

Value *src = codegen(op->args[0]);
internal_assert(src_lanes <= natural_lanes * 2) << "src is too long to dynamic_shuffle\n";
Value *src_a = slice_vector(src, 0, natural_lanes);
Value *src_b = (src_lanes > natural_lanes) ? slice_vector(src, natural_lanes, natural_lanes) : nullptr;

// Cast index to integer with the same bits as LUT data
Type index_type = UInt(lut_type.bits()).with_lanes(dst_lanes);
Expr indices = cast(index_type, op->args[1]);
Value *val_indices = codegen(indices);

std::vector<Value *> slices;
const int num_slices = align_up(dst_lanes, natural_lanes) / natural_lanes;
slices.reserve(num_slices);
for (int i = 0; i < num_slices; i++) {
Value *indices_slice = slice_vector(val_indices, i * natural_lanes, natural_lanes);
Value *dst_slice = shuffle_scalable_vectors_general_llvm(src_a, src_b, indices_slice, *min_index, *max_index);
slices.push_back(dst_slice);
}
value = slice_vector(concat_vectors(slices), 0, dst_lanes);
return;
}

if (op->type.is_vector()) {
Expand Down
4 changes: 3 additions & 1 deletion src/CodeGen_Hexagon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1935,7 +1935,9 @@ void CodeGen_Hexagon::visit(const Call *op) {
auto max_index = as_const_int(op->args[3]);
internal_assert(min_index && max_index);
Value *lut = codegen(op->args[0]);
Value *idx = codegen(op->args[1]);
// Cast the index to 8 bit
Expr index = cast(UInt(8).with_lanes(op->type.lanes()), op->args[1]);
Value *idx = codegen(index);
value = vlut(lut, idx, *min_index, *max_index);
return;
} else if (op->is_intrinsic(Call::abs)) {
Expand Down
3 changes: 2 additions & 1 deletion src/HexagonOptimize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2285,7 +2285,8 @@ class SyncronizationBarriers : public IRMutator {
// Hexagon entry point for the generic optimize_shuffles pass.
Stmt optimize_hexagon_shuffles(const Stmt &s, int lut_alignment) {
// Replace indirect and other complicated loads with
// dynamic_shuffle (vlut) calls.
return optimize_shuffles(s, lut_alignment);
// A LUT may span at most 256 elements, whatever the element type, so that
// every index fits in the 8 bits the Hexagon codegen casts the index to.
auto max_span_query = [](const Type &t) -> std::vector<int> { return {256}; };
// 1024 is passed as native_vector_bits (one 128-byte native vector);
// `false` keeps the LUT load extent as computed rather than rounding it up
// to a whole number of native vectors.
return optimize_shuffles(s, lut_alignment, 1024, max_span_query, false);
}

Stmt scatter_gather_generator(Stmt s) {
Expand Down
78 changes: 46 additions & 32 deletions src/OptimizeShuffles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,13 @@ namespace Internal {

namespace {

using SpanQueryType = std::function<std::vector<int>(const Type &)>;

class OptimizeShuffles : public IRMutator {
int lut_alignment;
int native_vector_bits;
SpanQueryType get_max_span_sizes;
bool align_loads_with_native_vector;
Scope<Interval> bounds;
std::vector<std::pair<std::string, Expr>> lets;

Expand Down Expand Up @@ -67,7 +72,7 @@ class OptimizeShuffles : public IRMutator {
if (allocations_to_pad.count(op->name)) {
op = s.as<Allocate>();
internal_assert(op);
int padding = 128 / op->type.bytes(); // One native vector
int padding = native_vector_bits / op->type.bits(); // One native vector
return Allocate::make(op->name, op->type, op->memory_type,
op->extents, op->condition,
op->body, op->new_expr, op->free_function,
Expand Down Expand Up @@ -99,34 +104,40 @@ class OptimizeShuffles : public IRMutator {
((unaligned_index_bounds.max + align) / align) * align - 1};
ModulusRemainder alignment(align, 0);

for (const Interval &index_bounds : {aligned_index_bounds, unaligned_index_bounds}) {
Expr index_span = span_of_bounds(index_bounds);
index_span = common_subexpression_elimination(index_span);
index_span = simplify(index_span);

if (can_prove(index_span < 256)) {
// This is a lookup within an up to 256 element array. We
// can use dynamic_shuffle for this.
int const_extent = as_const_int(index_span) ? *as_const_int(index_span) + 1 : 256;
Expr base = simplify(index_bounds.min);

// Load all of the possible indices loaded from the
// LUT. Note that for clamped ramps, this loads up to 1
// vector past the max, so we will add padding to the
// allocation accordingly (if we're the one that made it).
allocations_to_pad.insert(op->name);
Expr lut = Load::make(op->type.with_lanes(const_extent), op->name,
Ramp::make(base, 1, const_extent),
op->image, op->param, const_true(const_extent), alignment);

// We know the size of the LUT is not more than 256, so we
// can safely cast the index to 8 bit, which
// dynamic_shuffle requires.
index = simplify(cast(UInt(8).with_lanes(op->type.lanes()), index - base));
return Call::make(op->type, "dynamic_shuffle", {lut, index, 0, const_extent - 1}, Call::PureIntrinsic);
const int native_vector_size = native_vector_bits / op->type.bits();

for (const auto &max_span_size : get_max_span_sizes(op->type)) {

for (const Interval &index_bounds : {aligned_index_bounds, unaligned_index_bounds}) {
Expr index_span = span_of_bounds(index_bounds);
index_span = common_subexpression_elimination(index_span);
index_span = simplify(index_span);

if (can_prove(index_span < max_span_size)) {
// This is a lookup within an array of at most max_span_size elements,
// so we can use dynamic_shuffle for it.
int const_extent = as_const_int(index_span) ? *as_const_int(index_span) + 1 : max_span_size;
if (align_loads_with_native_vector) {
const_extent = align_up(const_extent, native_vector_size);
}
Expr base = simplify(index_bounds.min);

// Load all of the possible indices loaded from the
// LUT. Note that for clamped ramps, this loads up to 1
// vector past the max, so we will add padding to the
// allocation accordingly (if we're the one that made it).
allocations_to_pad.insert(op->name);
Expr lut = Load::make(op->type.with_lanes(const_extent), op->name,
Ramp::make(base, 1, const_extent),
op->image, op->param, const_true(const_extent), alignment);

// The index is left in its original type here; each target's codegen is
// responsible for casting it to the type its shuffle instruction accepts.
index = simplify(index - base);
return Call::make(op->type, "dynamic_shuffle", {lut, index, 0, const_extent - 1}, Call::PureIntrinsic);
}
// Only the first iteration of this loop is aligned.
alignment = ModulusRemainder();
}
// Only the first iteration of this loop is aligned.
alignment = ModulusRemainder();
}
}
if (!index.same_as(op->index)) {
Expand All @@ -137,14 +148,17 @@ class OptimizeShuffles : public IRMutator {
}

public:
OptimizeShuffles(int lut_alignment)
: lut_alignment(lut_alignment) {
OptimizeShuffles(int lut_alignment, int native_vector_bits, SpanQueryType get_max_span_sizes, bool align_loads_with_native_vector)
: lut_alignment(lut_alignment),
native_vector_bits(native_vector_bits),
get_max_span_sizes(std::move(get_max_span_sizes)),
align_loads_with_native_vector(align_loads_with_native_vector) {
}
};
} // namespace

Stmt optimize_shuffles(Stmt s, int lut_alignment) {
s = OptimizeShuffles(lut_alignment)(s);
Stmt optimize_shuffles(Stmt s, int lut_alignment, int native_vector_bits, SpanQueryType get_max_span_sizes, bool align_loads_with_native_vector) {
s = OptimizeShuffles(lut_alignment, native_vector_bits, std::move(get_max_span_sizes), align_loads_with_native_vector)(s);
return s;
}

Expand Down
8 changes: 7 additions & 1 deletion src/OptimizeShuffles.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,19 @@
*/

#include "Expr.h"
#include <functional>
#include <vector>

namespace Halide {
namespace Internal {

/* Replace indirect loads with dynamic_shuffle intrinsics where
possible.

lut_alignment: alignment associated with lookup tables (presumably used to
widen the index bounds to aligned bounds; confirm in OptimizeShuffles.cpp).

native_vector_bits: width of one native vector in bits. It sizes the padding
added to allocations that are padded for LUT loads, and gives the lane count
used when align_loads_with_native_vector is set.

get_max_span_sizes: called with the type of the load being rewritten; returns
the candidate maximum LUT spans, in elements. Candidates are tried in the
order returned, and the first one the index range provably fits in is used.

align_loads_with_native_vector: if true, the number of lanes loaded for the
LUT is rounded up to a multiple of the native vector lane count. */
Stmt optimize_shuffles(Stmt s, int lut_alignment);
Stmt optimize_shuffles(Stmt s,
int lut_alignment,
int native_vector_bits,
std::function<std::vector<int>(const Type &)> get_max_span_sizes,
bool align_loads_with_native_vector);

} // namespace Internal
} // namespace Halide
Expand Down
34 changes: 33 additions & 1 deletion test/correctness/simd_op_check_sve2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -855,7 +855,7 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
if (instr_lanes < 2 || (total_lanes / vscale < 2)) continue; // bail out scalar and <vscale x 1 x ty>

AddTestFunctor add(*this, bits, total_lanes);
Expr index = clamp(cast<int>(in_im(x)), 0, W - 1);
Expr index = clamp(in_i32(x), 0, W - 1);
Func tmp;
tmp(x, y) = cast(elt, y);
tmp(x, index) = cast(elt, 1);
Expand All @@ -876,6 +876,38 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
}
}
}

// Gather load whose index range is bounded within a known span (e.g. a LUT).
// In this case, Halide tries to transform it into a contiguous load plus
// Call::dynamic_shuffle, which is lowered to a TBL instruction (see OptimizeShuffles.cpp).
if (has_sve()) {
const int width = base_vec_bits;
const int total_lanes = width / bits;
const int instr_lanes = Instruction::get_instr_lanes(bits, total_lanes, target);
if (instr_lanes < 2 || (total_lanes / vscale < 2)) continue; // bail out scalar and <vscale x 1 x ty>

AddTestFunctor add(*this, bits, total_lanes);
const std::vector<std::pair<int, int>> index_min_max{
{0, total_lanes - 1},
{1, total_lanes},
{0, total_lanes * 2 - 1},
};
for (auto &[index_min, index_max] : index_min_max) {
Expr index = cast(Int(32), in_im(x));
index = clamp(index, index_min, index_max);
Expr look_up = in_im(index);

add("tbl", look_up);
}

// Not clamped, but bounded by the range of the input image's data type (8 bit)
Expr index = cast(Int(32), in_u8(x)); // 8 bit fixed
int factor = (1 << 8) / (total_lanes * 2);
index = index / factor; // index should be within native_vector*2 range
Expr look_up = in_im(index);

add("tbl", look_up);
}
}
}

Expand Down
Loading