Skip to content

Commit 0e19522

Browse files
Revert "Adds support for accelerated sorting with x86-simd-sort (pytorch#127936)"
This reverts commit 239a9ad. Reverted pytorch#127936 on behalf of https://github.com/atalman due to test/test_sort_and_select.py::TestSortAndSelectCPU::test_sort_discontiguous_slow_cpu_float32 [GH job link](https://github.com/pytorch/pytorch/actions/runs/10994904767/job/30525578456) [HUD commit link](https://hud.pytorch.org/pytorch/pytorch/commit/239a9ad65eebf93dcf9bb108a5129d4160b12c86) ([comment](pytorch#127936 (comment)))
1 parent bae427e commit 0e19522

File tree

8 files changed

+2
-207
lines changed

8 files changed

+2
-207
lines changed

.gitmodules

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,3 @@
127127
[submodule "third_party/NVTX"]
128128
path = third_party/NVTX
129129
url = https://github.com/NVIDIA/NVTX.git
130-
[submodule "third_party/x86-simd-sort"]
131-
path = third_party/x86-simd-sort
132-
url = https://github.com/intel/x86-simd-sort.git

CMakeLists.txt

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,6 @@ else()
262262
cmake_dependent_option(USE_CUFILE "Use cuFile" OFF "USE_CUDA AND NOT WIN32" OFF)
263263
endif()
264264
option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
265-
option(USE_X86_SIMD_SORT "Use x86-simd-sort to accelerate sorting and topk for AVX2/AVX512" ON)
266265
option(USE_KINETO "Use Kineto profiling library" ON)
267266
option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
268267
option(USE_FAKELOWP "Use FakeLowp operators" OFF)
@@ -908,13 +907,6 @@ if(USE_FBGEMM)
908907
string(APPEND CMAKE_CXX_FLAGS " -DUSE_FBGEMM")
909908
endif()
910909

911-
if(USE_X86_SIMD_SORT)
912-
string(APPEND CMAKE_CXX_FLAGS " -DUSE_X86_SIMD_SORT")
913-
if(USE_XSS_OPENMP)
914-
string(APPEND CMAKE_CXX_FLAGS " -DXSS_USE_OPENMP")
915-
endif()
916-
endif()
917-
918910
if(USE_PYTORCH_QNNPACK)
919911
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
920912
endif()

NOTICE

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -454,37 +454,3 @@ and reference the following license:
454454
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
455455
OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
456456
PERFORMANCE OF THIS SOFTWARE.
457-
458-
=======================================================================
459-
x86-simd-sort BSD 3-Clause License
460-
=======================================================================
461-
462-
Code derived from implementations in x86-simd-sort should mention its
463-
derivation and reference the following license:
464-
465-
Copyright (c) 2022, Intel. All rights reserved.
466-
467-
Redistribution and use in source and binary forms, with or without
468-
modification, are permitted provided that the following conditions are met:
469-
470-
1. Redistributions of source code must retain the above copyright notice, this
471-
list of conditions and the following disclaimer.
472-
473-
2. Redistributions in binary form must reproduce the above copyright notice,
474-
this list of conditions and the following disclaimer in the documentation
475-
and/or other materials provided with the distribution.
476-
477-
3. Neither the name of the copyright holder nor the names of its
478-
contributors may be used to endorse or promote products derived from
479-
this software without specific prior written permission.
480-
481-
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
482-
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
483-
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
484-
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
485-
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
486-
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
487-
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
488-
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
489-
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
490-
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

aten/src/ATen/native/cpu/SortingKernel.cpp

Lines changed: 2 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,11 @@
1515
#include <ATen/native/CompositeRandomAccessor.h>
1616
#include <ATen/native/TopKImpl.h>
1717
#include <c10/core/WrapDimMinimal.h>
18-
#include <c10/util/SmallBuffer.h>
1918
#include <c10/util/irange.h>
20-
2119
#ifdef USE_FBGEMM
2220
#include <fbgemm/Utils.h>
2321
#endif
2422

25-
#if USE_X86_SIMD_SORT && (defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2))
26-
#define XSS_COMPILE_TIME_SUPPORTED
27-
#include <src/x86simdsort-static-incl.h>
28-
#endif
29-
3023
namespace at::native {
3124

3225
namespace {
@@ -126,7 +119,6 @@ static void parallel_sort1d_kernel(
126119
std::vector<int64_t> tmp_vals(elements);
127120
const scalar_t* sorted_keys = nullptr;
128121
const int64_t* sorted_vals = nullptr;
129-
130122
std::tie(sorted_keys, sorted_vals) = fbgemm::radix_sort_parallel(
131123
keys,
132124
vals,
@@ -175,116 +167,6 @@ static inline void sort_kernel_impl(const value_accessor_t& value_accessor,
175167
}
176168
}
177169

178-
#if defined(XSS_COMPILE_TIME_SUPPORTED)
179-
180-
#define AT_DISPATCH_CASE_XSS_TYPES(...) \
181-
AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \
182-
AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \
183-
AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
184-
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)
185-
186-
#define AT_DISPATCH_XSS_TYPES(TYPE, NAME, ...) \
187-
AT_DISPATCH_SWITCH(TYPE, NAME, AT_DISPATCH_CASE_XSS_TYPES(__VA_ARGS__))
188-
189-
static bool can_use_xss_sort(const TensorBase& values, const TensorBase& indices, int64_t dim, const bool stable) {
190-
// xss_sort is not a stable sort
191-
if (stable) return false;
192-
193-
auto type = values.scalar_type();
194-
if (! (type == ScalarType::Long || type == ScalarType::Int || type == ScalarType::Double || type == ScalarType::Float)) return false;
195-
196-
return true;
197-
}
198-
199-
static bool xss_sort_preferred(const TensorBase& values, const bool descending) {
200-
#if defined(XSS_USE_OPENMP) || !defined(USE_FBGEMM)
201-
return true;
202-
#else
203-
// Without OpenMP support for x86-simd-sort, fbgemm radix sort is faster when it can be used
204-
return !can_use_radix_sort(values, descending);
205-
#endif
206-
}
207-
208-
static void xss_sort_kernel(
209-
const TensorBase& values,
210-
const TensorBase& indices,
211-
int64_t dim,
212-
bool descending) {
213-
auto iter = TensorIteratorConfig()
214-
.check_all_same_dtype(false)
215-
.resize_outputs(false)
216-
.declare_static_shape(values.sizes(), /*squash_dims=*/dim)
217-
.add_output(values)
218-
.add_output(indices)
219-
.build();
220-
221-
using index_t = int64_t;
222-
223-
AT_DISPATCH_XSS_TYPES(values.scalar_type(), "xss_sort_kernel", [&] {
224-
225-
auto values_dim_stride = values.stride(dim);
226-
auto indices_dim_stride = indices.stride(dim);
227-
auto dim_size = values.size(dim);
228-
229-
auto loop = [&](char** data, const int64_t* strides, int64_t n) {
230-
auto* values_data_bytes = data[0];
231-
auto* indices_data_bytes = data[1];
232-
233-
if(values_data_bytes==nullptr || indices_data_bytes==nullptr){
234-
return;
235-
}
236-
237-
if (values_dim_stride == 1 && indices_dim_stride == 1){
238-
for (const auto i C10_UNUSED : c10::irange(n)) {
239-
x86simdsortStatic::keyvalue_qsort<scalar_t, index_t>(
240-
reinterpret_cast<scalar_t*>(values_data_bytes),
241-
reinterpret_cast<index_t*>(indices_data_bytes),
242-
dim_size,
243-
true,
244-
descending);
245-
246-
values_data_bytes += strides[0];
247-
indices_data_bytes += strides[1];
248-
}
249-
}else{
250-
c10::SmallBuffer<scalar_t, 0> tmp_values(dim_size);
251-
c10::SmallBuffer<index_t, 0> tmp_indices(dim_size);
252-
253-
for (const auto i : c10::irange(n)) {
254-
TensorAccessor<scalar_t, 1> mode_values_acc(
255-
reinterpret_cast<scalar_t*>(data[0] + i * strides[0]),
256-
&dim_size, &values_dim_stride);
257-
TensorAccessor<index_t, 1> mode_indices_acc(
258-
reinterpret_cast<index_t*>(data[1] + i * strides[1]),
259-
&dim_size, &indices_dim_stride);
260-
261-
for (const auto j : c10::irange(dim_size)) {
262-
tmp_values[j] = mode_values_acc[j];
263-
tmp_indices[j] = j;
264-
}
265-
266-
x86simdsortStatic::keyvalue_qsort<scalar_t, index_t>(
267-
tmp_values.data(),
268-
tmp_indices.data(),
269-
dim_size,
270-
true,
271-
descending);
272-
273-
for (const auto j : c10::irange(dim_size)) {
274-
mode_values_acc[j] = tmp_values[j];
275-
mode_indices_acc[j] = tmp_indices[j];
276-
}
277-
}
278-
}
279-
};
280-
281-
int64_t grain_size = internal::GRAIN_SIZE / std::max(int64_t{1}, dim_size);
282-
iter.for_each(loop, /*grain_size=*/grain_size);
283-
284-
});
285-
}
286-
#endif
287-
288170
static void sort_kernel(
289171
const TensorBase& self,
290172
const TensorBase& values,
@@ -299,14 +181,6 @@ static void sort_kernel(
299181
// https://github.com/pytorch/pytorch/issues/91420
300182
return;
301183
}
302-
303-
#if defined(XSS_COMPILE_TIME_SUPPORTED)
304-
if (can_use_xss_sort(values, indices, dim, stable) && xss_sort_preferred(values, descending)){
305-
xss_sort_kernel(values, indices, dim, descending);
306-
return;
307-
}
308-
#endif
309-
310184
#ifdef USE_FBGEMM
311185
if (can_use_radix_sort(values, descending)) {
312186
parallel_sort1d_kernel(values, indices);
@@ -358,7 +232,6 @@ static void topk_kernel(
358232
int64_t dim,
359233
bool largest,
360234
bool sorted) {
361-
362235
auto sizes = self.sizes();
363236
auto iter = TensorIteratorConfig()
364237
.check_all_same_dtype(false)
@@ -393,7 +266,7 @@ static void topk_kernel(
393266

394267
} // anonymous namespace
395268

396-
ALSO_REGISTER_AVX512_DISPATCH(sort_stub, &sort_kernel);
397-
ALSO_REGISTER_AVX512_DISPATCH(topk_stub, &topk_kernel);
269+
REGISTER_DISPATCH(sort_stub, &sort_kernel);
270+
REGISTER_DISPATCH(topk_stub, &topk_kernel);
398271

399272
} //at::native

cmake/Dependencies.cmake

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1328,28 +1328,6 @@ if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX)
13281328
set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS})
13291329
endif()
13301330

1331-
# --[ x86-simd-sort integration
1332-
if(USE_X86_SIMD_SORT)
1333-
if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
1334-
message(WARNING
1335-
"x64 operating system is required for x86-simd-sort. "
1336-
"Not compiling with x86-simd-sort. "
1337-
"Turn this warning off by USE_X86_SIMD_SORT=OFF.")
1338-
set(USE_X86_SIMD_SORT OFF)
1339-
endif()
1340-
1341-
if(USE_X86_SIMD_SORT)
1342-
if(USE_OPENMP AND NOT MSVC)
1343-
set(USE_XSS_OPENMP ON)
1344-
else()
1345-
set(USE_XSS_OPENMP OFF)
1346-
endif()
1347-
1348-
set(XSS_SIMD_SORT_INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/../third_party/x86-simd-sort)
1349-
include_directories(SYSTEM ${XSS_SIMD_SORT_INCLUDE_DIR})
1350-
endif()
1351-
endif()
1352-
13531331
# --[ ATen checks
13541332
set(USE_LAPACK 0)
13551333

cmake/Summary.cmake

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,6 @@ function(caffe2_print_configuration_summary)
133133
endif()
134134
message(STATUS " BUILD_NVFUSER : ${BUILD_NVFUSER}")
135135
message(STATUS " USE_EIGEN_FOR_BLAS : ${CAFFE2_USE_EIGEN_FOR_BLAS}")
136-
message(STATUS " USE_X86_SIMD_SORT : ${USE_X86_SIMD_SORT}")
137136
message(STATUS " USE_FBGEMM : ${USE_FBGEMM}")
138137
message(STATUS " USE_FAKELOWP : ${USE_FAKELOWP}")
139138
message(STATUS " USE_KINETO : ${USE_KINETO}")

test/inductor/test_torchinductor_opinfo.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -466,9 +466,6 @@ def wrapper_noop_set_seed(op, *args, **kwargs):
466466
("nn.functional.interpolate.bicubic", u8): {"atol": 1, "rtol": 0},
467467
# High atol due to precision loss
468468
("nn.functional.interpolate.bicubic", f32): {"atol": 5e-3, "rtol": 0},
469-
# reference_in_float can cause erroneous failures in sorting tests
470-
"argsort": {"reference_in_float": False},
471-
"sort": {"reference_in_float": False},
472469
}
473470

474471
inductor_override_kwargs["cuda"] = {
@@ -539,9 +536,6 @@ def wrapper_noop_set_seed(op, *args, **kwargs):
539536
("index_reduce.amax", f32): {"check_gradient": False},
540537
("index_reduce.amax", f16): {"check_gradient": False},
541538
("tanh", f16): {"atol": 1e-4, "rtol": 1e-2},
542-
# reference_in_float can cause erroneous failures in sorting tests
543-
"argsort": {"reference_in_float": False},
544-
"sort": {"reference_in_float": False},
545539
}
546540

547541
inductor_override_kwargs["xpu"] = {
@@ -661,9 +655,6 @@ def wrapper_noop_set_seed(op, *args, **kwargs):
661655
("nn.functional.embedding_bag", f64): {"check_gradient": False},
662656
("_unsafe_masked_index", f16): {"atol": 1e-5, "rtol": 2e-3},
663657
("_unsafe_masked_index_put_accumulate", f16): {"atol": 1e-5, "rtol": 5e-3},
664-
# reference_in_float can cause erroneous failures in sorting tests
665-
"argsort": {"reference_in_float": False},
666-
"sort": {"reference_in_float": False},
667658
}
668659

669660
# Test with one sample only for following ops

third_party/x86-simd-sort

Lines changed: 0 additions & 1 deletion
This file was deleted.

0 commit comments

Comments (0)