Skip to content

Commit d9ea4a0

Browse files
committed
Add nullable vector support for segment search and indexing
This commit adds nullable vector support for both growing and sealed segments, including search, retrieval, and index building operations. It introduces OffsetMapping for efficient logical-to-physical offset translation.

Core infrastructure changes:
- Add OffsetMapping class with Build()/BuildIncremental() for bitmap-based offset mapping, supporting both dense lookup and efficient valid count tracking
- Add TransformBitset() and TransformOffset() utilities for converting between logical and physical coordinate spaces during search operations
- Update SearchResult to include has_raw_data_ flag and physical offset handling

Growing segment changes:
- Update ConcurrentVector to support nullable vectors with OffsetMapping
- Add get_offset_mapping() and valid data tracking in AckSeal()
- Update FieldIndexing to handle nullable vectors with physical offset storage
- Modify SearchOnGrowing to transform bitset/offsets for nullable vector search
- Add nullable vector tests in SegmentGrowingTest covering search and retrieval

Sealed segment changes:
- Update ChunkedColumn/ChunkedColumnGroup to support OffsetMapping
- Modify ChunkedSegmentSealedImpl to handle nullable vector field data loading
- Update SearchOnSealedIndex/SearchOnSealedColumn with early return for 100% null case to prevent crash when searching empty index
- Add FilterVectorValidOffsets() for output field retrieval with nullable vectors
- Expand ChunkedSegmentSealedBinlogIndexTest with comprehensive nullable test cases

Index building changes:
- Update VectorMemIndex/VectorDiskIndex to store and serialize OffsetMapping
- Add BuildValidData()/UpdateValidData() for nullable vector index building
- Update VecIndexCreator and index_c API with valid data parameter support
- Modify InterimSealedIndexTranslator to handle nullable vector binlog index

Search and reduce changes:
- Update Reduce.cpp to transform physical offsets back to logical for output
- Update StreamReduce with nullable vector offset transformation support
- Add early return in SearchOnSealed when offset_mapping has zero valid count

Signed-off-by: marcelo-cjl <[email protected]>
1 parent 2e6c7ac commit d9ea4a0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+3614
-601
lines changed

internal/core/src/common/Chunk.h

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,11 @@ class Chunk {
126126
return data_;
127127
}
128128

129+
// Returns a mutable reference to this chunk's valid_ member, so callers can
// read or modify it in place.
// NOTE(review): presumably valid_ holds one flag per row marking non-null
// entries — confirm against the Chunk constructor, which is not visible here.
FixedVector<bool>&
130+
Valid() {
131+
return valid_;
132+
}
133+
129134
virtual bool
130135
isValid(int offset) const {
131136
if (nullable_) {
@@ -559,17 +564,32 @@ class SparseFloatVectorChunk : public Chunk {
559564
bool nullable,
560565
std::shared_ptr<ChunkMmapGuard> chunk_mmap_guard)
561566
: Chunk(row_nums, data, size, nullable, chunk_mmap_guard) {
562-
vec_.resize(row_nums);
563567
auto null_bitmap_bytes_num = nullable ? (row_nums + 7) / 8 : 0;
564568
auto offsets_ptr =
565569
reinterpret_cast<uint64_t*>(data + null_bitmap_bytes_num);
566-
for (int i = 0; i < row_nums; i++) {
567-
vec_[i] = {(offsets_ptr[i + 1] - offsets_ptr[i]) /
568-
knowhere::sparse::SparseRow<
569-
SparseValueType>::element_size(),
570-
reinterpret_cast<uint8_t*>(data + offsets_ptr[i]),
571-
false};
572-
dim_ = std::max(dim_, vec_[i].dim());
570+
571+
if (nullable_) {
572+
for (int i = 0; i < row_nums; i++) {
573+
if (isValid(i)) {
574+
vec_.emplace_back(
575+
(offsets_ptr[i + 1] - offsets_ptr[i]) /
576+
knowhere::sparse::SparseRow<
577+
SparseValueType>::element_size(),
578+
reinterpret_cast<uint8_t*>(data + offsets_ptr[i]),
579+
false);
580+
dim_ = std::max(dim_, vec_.back().dim());
581+
}
582+
}
583+
} else {
584+
vec_.resize(row_nums);
585+
for (int i = 0; i < row_nums; i++) {
586+
vec_[i] = {(offsets_ptr[i + 1] - offsets_ptr[i]) /
587+
knowhere::sparse::SparseRow<
588+
SparseValueType>::element_size(),
589+
reinterpret_cast<uint8_t*>(data + offsets_ptr[i]),
590+
false};
591+
dim_ = std::max(dim_, vec_[i].dim());
592+
}
573593
}
574594
}
575595

internal/core/src/common/ChunkWriter.cpp

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -435,8 +435,10 @@ SparseFloatVectorChunkWriter::calculate_size(
435435
for (const auto& data : array_vec) {
436436
auto array = std::dynamic_pointer_cast<arrow::BinaryArray>(data);
437437
for (int64_t i = 0; i < array->length(); ++i) {
438-
auto str = array->GetView(i);
439-
size += str.size();
438+
if (!nullable_ || !array->IsNull(i)) {
439+
auto str = array->GetView(i);
440+
size += str.size();
441+
}
440442
}
441443
row_nums_ += array->length();
442444
}
@@ -459,8 +461,10 @@ SparseFloatVectorChunkWriter::write_to_target(
459461
for (const auto& data : array_vec) {
460462
auto array = std::dynamic_pointer_cast<arrow::BinaryArray>(data);
461463
for (int64_t i = 0; i < array->length(); ++i) {
462-
auto str = array->GetView(i);
463-
strs.emplace_back(str);
464+
if (!nullable_ || !array->IsNull(i)) {
465+
auto str = array->GetView(i);
466+
strs.emplace_back(str);
467+
}
464468
}
465469
if (nullable_) {
466470
null_bitmaps.emplace_back(
@@ -478,9 +482,23 @@ SparseFloatVectorChunkWriter::write_to_target(
478482
std::vector<uint64_t> offsets;
479483
offsets.reserve(offset_num);
480484

481-
for (const auto& str : strs) {
482-
offsets.push_back(offset_start_pos);
483-
offset_start_pos += str.size();
485+
if (nullable_) {
486+
size_t str_idx = 0;
487+
for (const auto& data : array_vec) {
488+
auto array = std::dynamic_pointer_cast<arrow::BinaryArray>(data);
489+
for (int i = 0; i < array->length(); i++) {
490+
offsets.push_back(offset_start_pos);
491+
if (!array->IsNull(i)) {
492+
offset_start_pos += strs[str_idx].size();
493+
str_idx++;
494+
}
495+
}
496+
}
497+
} else {
498+
for (const auto& str : strs) {
499+
offsets.push_back(offset_start_pos);
500+
offset_start_pos += str.size();
501+
}
484502
}
485503
offsets.push_back(offset_start_pos);
486504

@@ -524,22 +542,43 @@ create_chunk_writer(const FieldMeta& field_meta) {
524542
return std::make_shared<ChunkWriter<arrow::Int64Array, int64_t>>(
525543
dim, nullable);
526544
case milvus::DataType::VECTOR_FLOAT:
545+
if (nullable) {
546+
return std::make_shared<
547+
NullableVectorChunkWriter<knowhere::fp32>>(dim, nullable);
548+
}
527549
return std::make_shared<
528550
ChunkWriter<arrow::FixedSizeBinaryArray, knowhere::fp32>>(
529551
dim, nullable);
530552
case milvus::DataType::VECTOR_BINARY:
553+
if (nullable) {
554+
return std::make_shared<
555+
NullableVectorChunkWriter<knowhere::bin1>>(dim / 8,
556+
nullable);
557+
}
531558
return std::make_shared<
532559
ChunkWriter<arrow::FixedSizeBinaryArray, knowhere::bin1>>(
533560
dim / 8, nullable);
534561
case milvus::DataType::VECTOR_FLOAT16:
562+
if (nullable) {
563+
return std::make_shared<
564+
NullableVectorChunkWriter<knowhere::fp16>>(dim, nullable);
565+
}
535566
return std::make_shared<
536567
ChunkWriter<arrow::FixedSizeBinaryArray, knowhere::fp16>>(
537568
dim, nullable);
538569
case milvus::DataType::VECTOR_BFLOAT16:
570+
if (nullable) {
571+
return std::make_shared<
572+
NullableVectorChunkWriter<knowhere::bf16>>(dim, nullable);
573+
}
539574
return std::make_shared<
540575
ChunkWriter<arrow::FixedSizeBinaryArray, knowhere::bf16>>(
541576
dim, nullable);
542577
case milvus::DataType::VECTOR_INT8:
578+
if (nullable) {
579+
return std::make_shared<
580+
NullableVectorChunkWriter<knowhere::int8>>(dim, nullable);
581+
}
543582
return std::make_shared<
544583
ChunkWriter<arrow::FixedSizeBinaryArray, knowhere::int8>>(
545584
dim, nullable);

internal/core/src/common/ChunkWriter.h

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,55 @@ class ChunkWriter final : public ChunkWriterBase {
129129
const int64_t dim_;
130130
};
131131

132+
// Chunk writer for vector columns whose element type is T and whose rows may
// be null. The output it produces is the null bitmap(s) followed by the value
// bytes of the input arrays, sized as (non-null row count) * dim_ * sizeof(T)
// per array — i.e. no space is reserved for null rows.
// NOTE(review): every input array is static_pointer_cast to arrow::BinaryArray
// with no runtime type check — confirm callers always pass BinaryArray.
template <typename T>
133+
class NullableVectorChunkWriter final : public ChunkWriterBase {
134+
public:
135+
// dim: number of T elements per vector (callers pass dim / 8 for binary
// vectors); nullable: forwarded unchanged to ChunkWriterBase.
NullableVectorChunkWriter(int64_t dim, bool nullable)
136+
: ChunkWriterBase(nullable), dim_(dim) {
137+
}
138+
139+
// Computes the number of bytes write_to_target will need for array_vec.
// Returns {total byte size, total row count} and also records the row
// count in row_nums_.
std::pair<size_t, size_t>
140+
calculate_size(const arrow::ArrayVector& array_vec) override {
141+
size_t size = 0;
142+
size_t row_nums = 0;
143+
144+
for (const auto& data : array_vec) {
145+
row_nums += data->length();
146+
auto binary_array =
147+
std::static_pointer_cast<arrow::BinaryArray>(data);
148+
// Only non-null rows contribute value bytes.
int64_t valid_count = data->length() - binary_array->null_count();
149+
size += valid_count * dim_ * sizeof(T);
150+
}
151+
152+
// null bitmap size
// One bit per row, rounded up to whole bytes; added unconditionally
// (the nullable flag is not consulted here).
153+
size += (row_nums + 7) / 8;
154+
row_nums_ = row_nums;
155+
return {size, row_nums};
156+
}
157+
158+
// Writes the null bitmaps of all arrays to target first, then the value
// bytes of each array in order.
void
159+
write_to_target(const arrow::ArrayVector& array_vec,
160+
const std::shared_ptr<ChunkTarget>& target) override {
161+
// Each tuple is (null bitmap pointer, array length, array offset).
std::vector<std::tuple<const uint8_t*, int64_t, int64_t>> null_bitmaps;
162+
for (const auto& data : array_vec) {
163+
null_bitmaps.emplace_back(
164+
data->null_bitmap_data(), data->length(), data->offset());
165+
}
166+
write_null_bit_maps(null_bitmaps, target);
167+
168+
for (const auto& data : array_vec) {
169+
auto binary_array =
170+
std::static_pointer_cast<arrow::BinaryArray>(data);
171+
// NOTE(review): value_data()->data() is the start of the values
// buffer and does not account for data->offset(); for a sliced array
// the first row's bytes presumably begin at value_offset(0) — confirm
// inputs are never sliced.
auto data_ptr = binary_array->value_data()->data();
172+
int64_t valid_count = data->length() - binary_array->null_count();
173+
// NOTE(review): assumes null rows occupy zero bytes in the values
// buffer and every valid row is exactly dim_ * sizeof(T) bytes, so
// the valid rows form one contiguous run — TODO confirm.
target->write(data_ptr, valid_count * dim_ * sizeof(T));
174+
}
175+
}
176+
177+
private:
178+
// Elements of type T per vector, as supplied to the constructor.
const int64_t dim_;
179+
};
180+
132181
template <>
133182
inline void
134183
ChunkWriter<arrow::BooleanArray, bool>::write_to_target(

internal/core/src/common/FieldMeta.cpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,12 +183,17 @@ FieldMeta::ParseFrom(const milvus::proto::schema::FieldSchema& schema_proto) {
183183
data_type,
184184
dim,
185185
std::nullopt,
186-
false,
186+
nullable,
187187
default_value};
188188
}
189189
auto metric_type = index_map.at("metric_type");
190-
return FieldMeta{
191-
name, field_id, data_type, dim, metric_type, false, default_value};
190+
return FieldMeta{name,
191+
field_id,
192+
data_type,
193+
dim,
194+
metric_type,
195+
nullable,
196+
default_value};
192197
}
193198

194199
if (IsStringDataType(data_type)) {

internal/core/src/common/FieldMeta.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,6 @@ class FieldMeta {
125125
vector_info_(VectorInfo{dim, std::move(metric_type)}),
126126
default_value_(std::move(default_value)) {
127127
Assert(IsVectorDataType(type_));
128-
Assert(!nullable);
129128
}
130129

131130
// array of vector type

0 commit comments

Comments
 (0)