Skip to content

Commit 9fb223a

Browse files
authored
[MOD-11650] Fix Out-of-Bounds Write in Vector Preprocessing (#784)
* rename dataSize->getStoredDataSize * missing rename * format * add input blob size to AbstractIndexInitParams in VecSimIndexAbstract move data members to private when possible DONT FIX LEAK YET Factories: move NewAbstractInitParams to a general location (new file factory_utils) replace it in all factories * try sanitizer * run sanitizer * runanyway * rervt task unit test use sanitizer in pull request * fix leak in input bob size * fix int8/uint8 elementSizeEstimation tests This caused a buffer overflow because setup<TieredIndexParams> didn't set the data type to INT8, creating a float32 index instead. DataBlock::addElement() tries to copy dim * sizeof(float) bytes, but the allocated buffer is only dim * sizeof(int8) bytes, causing a read overflow. * run codecov with sanitizer (also intek) * some fixes and assertion * fix uint8 * fix again * fix possible warning abour divison by zero by checking quantBits with constexpr * TO REVERT! fix leaks that will be moved to a separate PR it was failing only with codecov becuase only there we use FP64_TESTS=1 Prevent template deduction errors in GenerateAndAddVector by making data_t parameter non-deducible Used std::type_identity<data_t>::type for the value parameter to force explicit template specification (e.g., GenerateAndAddVector<double>()) instead of allowing compiler to incorrectly deduce int from literal values, which caused buffer overflows when index expected different data types. * Revert "TO REVERT!" This reverts commit af844ec. * rever ci changes * calculate EstimateElementSize accroding to the stored vector size add tests * revert unrelated change in cmake.san * add batch itertor blob correctness to int8 tests fix getQueryBlob in tiered add getHNSWIterator to tiered batch itertor if its BUILD_TESTS * apply suggesting
1 parent f988b58 commit 9fb223a

28 files changed

+247
-162
lines changed

cmake/san.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,4 @@ if (USE_ASAN OR USE_MSAN)
2020
set(LLVM_CXX_FLAGS "-stdlib=libc++ -I${MSAN_PREFIX}/include -I${MSAN_PREFIX}/include/c++/v1")
2121
set(LLVM_LD_FLAGS "-stdlib=libc++ -Wl,-rpath=${MSAN_PREFIX}/lib -L${MSAN_PREFIX}/lib -lc++abi")
2222
endif()
23-
endif()
23+
endif()

src/VecSim/algorithms/brute_force/brute_force_multi.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ class BruteForceIndex_Multi : public BruteForceIndex<DataType, DistType> {
6363
const char *data = reinterpret_cast<const char *>(this->getDataByInternalId(id));
6464

6565
// Create a vector with the full data (including any metadata like norms)
66-
std::vector<char> vec(this->getDataSize());
67-
memcpy(vec.data(), data, this->getDataSize());
66+
std::vector<char> vec(this->getStoredDataSize());
67+
memcpy(vec.data(), data, this->getStoredDataSize());
6868
vectors_output.push_back(std::move(vec));
6969
}
7070

src/VecSim/algorithms/brute_force/brute_force_single.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ class BruteForceIndex_Single : public BruteForceIndex<DataType, DistType> {
6363
const char *data = reinterpret_cast<const char *>(this->getDataByInternalId(id));
6464

6565
// Create a vector with the full data (including any metadata like norms)
66-
std::vector<char> vec(this->getDataSize());
67-
memcpy(vec.data(), data, this->getDataSize());
66+
std::vector<char> vec(this->getStoredDataSize());
67+
memcpy(vec.data(), data, this->getStoredDataSize());
6868
vectors_output.push_back(std::move(vec));
6969

7070
return vectors_output;

src/VecSim/algorithms/hnsw/hnsw.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1182,7 +1182,7 @@ void HNSWIndex<DataType, DistType>::SwapLastIdWithDeletedId(idType element_inter
11821182
memcpy((void *)element, last_element, this->elementGraphDataSize);
11831183

11841184
auto data = getDataByInternalId(element_internal_id);
1185-
memcpy((void *)data, last_element_data, this->dataSize);
1185+
memcpy((void *)data, last_element_data, this->getStoredDataSize());
11861186

11871187
this->idToMetaData[element_internal_id] = this->idToMetaData[curElementCount];
11881188

src/VecSim/algorithms/hnsw/hnsw_multi.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,8 @@ class HNSWIndex_Multi : public HNSWIndex<DataType, DistType> {
9090
const char *data = this->getDataByInternalId(id);
9191

9292
// Create a vector with the full data (including any metadata like norms)
93-
std::vector<char> vec(this->dataSize);
94-
memcpy(vec.data(), data, this->dataSize);
93+
std::vector<char> vec(this->getStoredDataSize());
94+
memcpy(vec.data(), data, this->getStoredDataSize());
9595
vectors_output.push_back(std::move(vec));
9696
}
9797

src/VecSim/algorithms/hnsw/hnsw_single.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ class HNSWIndex_Single : public HNSWIndex<DataType, DistType> {
6262
const char *data = this->getDataByInternalId(id);
6363

6464
// Create a vector with the full data (including any metadata like norms)
65-
std::vector<char> vec(this->dataSize);
66-
memcpy(vec.data(), data, this->dataSize);
65+
std::vector<char> vec(this->getStoredDataSize());
66+
memcpy(vec.data(), data, this->getStoredDataSize());
6767
vectors_output.push_back(std::move(vec));
6868

6969
return vectors_output;

src/VecSim/algorithms/hnsw/hnsw_tiered.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,11 +179,17 @@ class TieredHNSWIndex : public VecSimTieredIndex<DataType, DistType> {
179179

180180
~TieredHNSW_BatchIterator();
181181

182+
const void *getQueryBlob() const override { return flat_iterator->getQueryBlob(); }
183+
182184
VecSimQueryReply *getNextResults(size_t n_res, VecSimQueryReply_Order order) override;
183185

184186
bool isDepleted() override;
185187

186188
void reset() override;
189+
190+
#ifdef BUILD_TESTS
191+
VecSimBatchIterator *getHNSWIterator() { return hnsw_iterator; }
192+
#endif
187193
};
188194

189195
public:
@@ -542,7 +548,7 @@ void TieredHNSWIndex<DataType, DistType>::executeInsertJob(HNSWInsertJob *job) {
542548
HNSWIndex<DataType, DistType> *hnsw_index = this->getHNSWIndex();
543549
// Copy the vector blob from the flat buffer, so we can release the flat lock while we are
544550
// indexing the vector into HNSW index.
545-
size_t data_size = this->frontendIndex->getDataSize();
551+
size_t data_size = this->frontendIndex->getStoredDataSize();
546552
auto blob_copy = this->getAllocator()->allocate_unique(data_size);
547553
// Assuming the size of the blob stored in the frontend index matches the size of the blob
548554
// stored in the HNSW index.

src/VecSim/algorithms/svs/svs.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -182,12 +182,13 @@ class SVSIndex : public VecSimIndexAbstract<svs_details::vecsim_dt<DataType>, fl
182182
return MemoryUtils::unique_blob{const_cast<void *>(original_data), [](void *) {}};
183183
}
184184

185-
const auto data_size = this->getDataSize() * n;
185+
const auto data_size = this->getStoredDataSize() * n;
186186

187187
auto processed_blob =
188188
MemoryUtils::unique_blob{this->allocator->allocate(data_size),
189189
[this](void *ptr) { this->allocator->free_allocation(ptr); }};
190190
// Assuming original data size equals to processed data size
191+
assert(this->getInputBlobSize() == this->getStoredDataSize());
191192
memcpy(processed_blob.get(), original_data, data_size);
192193
// Preprocess each vector in place
193194
for (size_t i = 0; i < n; i++) {

src/VecSim/algorithms/svs/svs_serializer_impl.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ void SVSIndex<MetricType, DataType, isMulti, QuantBits, ResidualBits, IsLeanVec>
2424
// Note: this->metric corresponds to MetricType template parameter
2525
writeBinaryPOD(output, this->dim);
2626
writeBinaryPOD(output, this->vecType); // DataType template parameter (as VecSimType enum)
27-
writeBinaryPOD(output, this->dataSize);
27+
writeBinaryPOD(output, this->getStoredDataSize());
2828
writeBinaryPOD(output, this->metric); // MetricType template parameter (as VecSimMetric enum)
2929
writeBinaryPOD(output, this->blockSize);
3030
writeBinaryPOD(output, this->isMulti);
@@ -128,7 +128,7 @@ bool SVSIndex<MetricType, DataType, isMulti, QuantBits, ResidualBits,
128128

129129
compareField(input, this->dim, "dim");
130130
compareField(input, this->vecType, "vecType");
131-
compareField(input, this->dataSize, "dataSize");
131+
compareField(input, this->getStoredDataSize(), "dataSize");
132132
compareField(input, this->metric, "metric");
133133
compareField(input, this->blockSize, "blockSize");
134134
compareField(input, this->isMulti, "isMulti");

src/VecSim/batch_iterator.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ struct VecSimBatchIterator : public VecsimBaseObject {
2929
: VecsimBaseObject(allocator), query_vector(query_vector), returned_results_count(0),
3030
timeoutCtx(tctx) {};
3131

32-
inline const void *getQueryBlob() const { return query_vector; }
32+
virtual inline const void *getQueryBlob() const { return query_vector; }
3333

3434
inline void *getTimeoutCtx() const { return timeoutCtx; }
3535

0 commit comments

Comments
 (0)