Skip to content

Commit 84a5131

Browse files
authored
feat(ivf): Add numpy array support for IVF assemble functions (#287)
This enables users to build IVF indices from in-memory numpy arrays without intermediate file I/O, improving performance for dynamic workflows.

- Add `assemble_from_clustering` and `assemble_from_file` methods that accept numpy arrays directly (`py_data` parameter) in addition to file paths
- Refactor `assemble_dynamic_from_clustering` to avoid a double data copy:
  - Internal `_impl` function takes data by const reference
  - Smart dispatching detects `ImmutableMemoryDataset` and passes it by reference
- Add timing instrumentation for assemble operations
- Add comprehensive tests for numpy array assembly in both IVF and DynamicIVF
- Zero-copy path: numpy views are passed directly without data duplication
1 parent 885c574 commit 84a5131

6 files changed

Lines changed: 644 additions & 37 deletions

File tree

bindings/python/src/dynamic_ivf.cpp

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "svs/python/manager.h"
2424

2525
// svs
26+
#include "svs/core/data/simple.h"
2627
#include "svs/index/ivf/data_traits.h"
2728
#include "svs/lib/dispatcher.h"
2829
#include "svs/lib/saveload.h"
@@ -125,6 +126,69 @@ void register_ivf_assembly_from_file(Dispatcher& dispatcher) {
125126
register_uncompressed_ivf_assemble_from_file(dispatcher);
126127
}
127128

129+
/////
130+
///// Assemble from Clustering from Array
131+
/////
132+
133+
template <typename Q, typename T, size_t N>
134+
svs::DynamicIVF uncompressed_assemble_from_clustering_from_array(
135+
Clustering clustering,
136+
svs::data::ConstSimpleDataView<T, N> view,
137+
std::span<const size_t> ids,
138+
svs::DistanceType distance_type,
139+
size_t num_threads,
140+
size_t intra_query_threads = 1
141+
) {
142+
auto mutable_view = svs::data::SimpleDataView<T, N>(
143+
const_cast<T*>(view.data()), view.size(), view.dimensions()
144+
);
145+
return svs::DynamicIVF::assemble_from_clustering<Q>(
146+
std::move(clustering),
147+
mutable_view,
148+
ids,
149+
distance_type,
150+
num_threads,
151+
intra_query_threads
152+
);
153+
}
154+
155+
template <typename Dispatcher>
156+
void register_ivf_assemble_from_clustering_from_array(Dispatcher& dispatcher) {
157+
for_standard_specializations([&dispatcher]<typename Q, typename T, size_t N>() {
158+
auto method = &uncompressed_assemble_from_clustering_from_array<Q, T, N>;
159+
dispatcher.register_target(svs::lib::dispatcher_build_docs, method);
160+
});
161+
}
162+
163+
/////
164+
///// Assemble from File from Array
165+
/////
166+
167+
template <typename Q, typename T, size_t N>
168+
svs::DynamicIVF uncompressed_assemble_from_file_from_array(
169+
const std::filesystem::path& cluster_path,
170+
svs::data::ConstSimpleDataView<T, N> view,
171+
std::span<const size_t> ids,
172+
svs::DistanceType distance_type,
173+
size_t num_threads,
174+
size_t intra_query_threads = 1
175+
) {
176+
auto mutable_view = svs::data::SimpleDataView<T, N>(
177+
const_cast<T*>(view.data()), view.size(), view.dimensions()
178+
);
179+
return svs::DynamicIVF::assemble_from_file<Q, svs::BFloat16>(
180+
cluster_path, mutable_view, ids, distance_type, num_threads, intra_query_threads
181+
);
182+
}
183+
184+
template <typename Dispatcher>
185+
void register_ivf_assemble_from_file_from_array(Dispatcher& dispatcher) {
186+
for_standard_specializations([&dispatcher]<typename Q, typename T, size_t N>() {
187+
auto method = &uncompressed_assemble_from_file_from_array<Q, T, N>;
188+
dispatcher.register_target(svs::lib::dispatcher_build_docs, method);
189+
});
190+
}
191+
128192
using IVFAssembleTypes =
129193
std::variant<UnspecializedVectorDataLoader, svs::lib::SerializedObject>;
130194

@@ -210,6 +274,147 @@ svs::DynamicIVF assemble_from_file(
210274
);
211275
}
212276

277+
// Assemble from clustering from array.
278+
// Dispatcher type for assembling from an in-memory clustering plus array data.
// The first template argument is the return type; the rest are the call arguments.
using AssembleFromClusteringArrayDispatcher = svs::lib::Dispatcher<
    svs::DynamicIVF,         // result
    Clustering,              // clustering
    AnonymousVectorData,     // type-erased dataset
    std::span<const size_t>, // ids
    svs::DistanceType,       // distance
    size_t,                  // num_threads
    size_t>;                 // intra_query_threads
286+
287+
/// Build a fresh dispatcher populated by
/// `register_ivf_assemble_from_clustering_from_array`.
///
/// @returns A newly constructed dispatcher; nothing is cached between calls.
AssembleFromClusteringArrayDispatcher assemble_from_clustering_array_dispatcher() {
    AssembleFromClusteringArrayDispatcher table{};
    register_ivf_assemble_from_clustering_from_array(table);
    return table;
}
292+
293+
/// Assemble a DynamicIVF index from a clustering and a type-erased data array by
/// routing the call through the clustering/array dispatcher.
///
/// @param clustering Clustering to assemble from; moved into the dispatched call.
/// @param py_data Type-erased dataset handed to the dispatcher.
/// @param py_ids Contiguous array of IDs; viewed in place as a span, not copied.
/// @param distance_type Distance function forwarded to the index.
/// @param num_threads Thread count forwarded to the index.
/// @param intra_query_threads Intra-query thread count forwarded to the index.
/// @returns The index produced by the dispatched target.
///
/// NOTE(review): no check is made here that the number of IDs matches the number of
/// vectors in `py_data`; presumably enforced downstream - confirm.
svs::DynamicIVF assemble_from_clustering_from_array(
    Clustering clustering,
    AnonymousVectorData py_data,
    const py_contiguous_array_t<size_t>& py_ids,
    svs::DistanceType distance_type,
    size_t num_threads,
    size_t intra_query_threads = 1
) {
    // Borrow the ID buffer owned by the Python array.
    std::span<const size_t> ids(py_ids.data(), py_ids.size());

    auto dispatcher = assemble_from_clustering_array_dispatcher();
    return dispatcher.invoke(
        std::move(clustering),
        py_data,
        ids,
        distance_type,
        num_threads,
        intra_query_threads
    );
}
306+
307+
// Assemble from file from array.
308+
// Dispatcher type for assembling from a clustering on disk plus array data.
// The first template argument is the return type; the rest are the call arguments.
using AssembleFromFileArrayDispatcher = svs::lib::Dispatcher<
    svs::DynamicIVF,              // result
    const std::filesystem::path&, // clustering location
    AnonymousVectorData,          // type-erased dataset
    std::span<const size_t>,      // ids
    svs::DistanceType,            // distance
    size_t,                       // num_threads
    size_t>;                      // intra_query_threads
316+
317+
/// Build a fresh dispatcher populated by `register_ivf_assemble_from_file_from_array`.
///
/// @returns A newly constructed dispatcher; nothing is cached between calls.
AssembleFromFileArrayDispatcher assemble_from_file_array_dispatcher() {
    AssembleFromFileArrayDispatcher table{};
    register_ivf_assemble_from_file_from_array(table);
    return table;
}
322+
323+
/// Assemble a DynamicIVF index from a clustering stored on disk and a type-erased
/// data array by routing the call through the file/array dispatcher.
///
/// @param cluster_path Location of the saved clustering, as a string.
/// @param py_data Type-erased dataset handed to the dispatcher.
/// @param py_ids Contiguous array of IDs; viewed in place as a span, not copied.
/// @param distance_type Distance function forwarded to the index.
/// @param num_threads Thread count forwarded to the index.
/// @param intra_query_threads Intra-query thread count forwarded to the index.
/// @returns The index produced by the dispatched target.
///
/// NOTE(review): no check is made here that the number of IDs matches the number of
/// vectors in `py_data`; presumably enforced downstream - confirm.
svs::DynamicIVF assemble_from_file_from_array(
    const std::string& cluster_path,
    AnonymousVectorData py_data,
    const py_contiguous_array_t<size_t>& py_ids,
    svs::DistanceType distance_type,
    size_t num_threads,
    size_t intra_query_threads = 1
) {
    // Borrow the ID buffer owned by the Python array.
    std::span<const size_t> ids(py_ids.data(), py_ids.size());

    auto dispatcher = assemble_from_file_array_dispatcher();
    return dispatcher.invoke(
        cluster_path,
        py_data,
        ids,
        distance_type,
        num_threads,
        intra_query_threads
    );
}
336+
337+
// Templatize at the top level for numpy array assemble specializations.
338+
template <typename ElementType>
339+
void add_assemble_from_clustering_array_specialization(
340+
py::class_<svs::DynamicIVF>& dynamic_ivf
341+
) {
342+
dynamic_ivf.def_static(
343+
"assemble_from_clustering",
344+
[](Clustering clustering,
345+
py_contiguous_array_t<ElementType> py_data,
346+
const py_contiguous_array_t<size_t>& py_ids,
347+
svs::DistanceType distance,
348+
size_t num_threads,
349+
size_t intra_query_threads) {
350+
return assemble_from_clustering_from_array(
351+
std::move(clustering),
352+
AnonymousVectorData(py_data),
353+
py_ids,
354+
distance,
355+
num_threads,
356+
intra_query_threads
357+
);
358+
},
359+
py::arg("clustering"),
360+
py::arg("py_data"),
361+
py::arg("ids"),
362+
py::arg("distance") = svs::L2,
363+
py::arg("num_threads") = 1,
364+
py::arg("intra_query_threads") = 1,
365+
R"(
366+
Assemble a searchable DynamicIVF index from provided clustering and numpy data array.
367+
368+
Args:
369+
clustering: The clustering object (from Clustering.build or Clustering.load_clustering).
370+
py_data: The dataset as a numpy array. SVS will maintain an internal copy.
371+
ids: External IDs for the vectors. Must match dataset length and contain unique values.
372+
distance: The distance function to use. Default: L2.
373+
num_threads: The number of threads to use for queries. Default: 1.
374+
intra_query_threads: Number of threads for intra-query parallelism. Default: 1.
375+
)"
376+
);
377+
}
378+
379+
template <typename ElementType>
380+
void add_assemble_from_file_array_specialization(py::class_<svs::DynamicIVF>& dynamic_ivf) {
381+
dynamic_ivf.def_static(
382+
"assemble_from_file",
383+
[](const std::string& clustering_path,
384+
py_contiguous_array_t<ElementType> py_data,
385+
const py_contiguous_array_t<size_t>& py_ids,
386+
svs::DistanceType distance,
387+
size_t num_threads,
388+
size_t intra_query_threads) {
389+
return assemble_from_file_from_array(
390+
clustering_path,
391+
AnonymousVectorData(py_data),
392+
py_ids,
393+
distance,
394+
num_threads,
395+
intra_query_threads
396+
);
397+
},
398+
py::arg("clustering_path"),
399+
py::arg("py_data"),
400+
py::arg("ids"),
401+
py::arg("distance") = svs::L2,
402+
py::arg("num_threads") = 1,
403+
py::arg("intra_query_threads") = 1,
404+
R"(
405+
Assemble a searchable DynamicIVF index from clustering on disk and numpy data array.
406+
407+
Args:
408+
clustering_path: Path to the directory where the clustering was generated.
409+
py_data: The dataset as a numpy array. SVS will maintain an internal copy.
410+
ids: External IDs for the vectors. Must match dataset length and contain unique values.
411+
distance: The distance function to use. Default: L2.
412+
num_threads: The number of threads to use for queries. Default: 1.
413+
intra_query_threads: Number of threads for intra-query parallelism. Default: 1.
414+
)"
415+
);
416+
}
417+
213418
constexpr std::string_view ASSEMBLE_DOCSTRING_PROTO = R"(
214419
Assemble a searchable IVF index from provided clustering and data
215420
@@ -462,6 +667,10 @@ Method {}:
462667
);
463668
}
464669

670+
// Assemble from numpy array.
671+
add_assemble_from_clustering_array_specialization<float>(dynamic_ivf);
672+
add_assemble_from_file_array_specialization<float>(dynamic_ivf);
673+
465674
// Index modification.
466675
add_points_specialization<float>(dynamic_ivf);
467676

0 commit comments

Comments
 (0)