feat: add MCI hybrid workflow support #2087
Conversation
- add the MCI index, HGraph hybrid overlay, and external KNNG import path
- add eval/export tooling plus benchmark configs for filtered MCI comparisons
- document MCI in English and Chinese and add a runnable hybrid example

Signed-off-by: zhuangye.yxw <2510035537@qq.com>
Assisted-by: GitHub Copilot:GPT-5.4
Merge Protections
Your pull request matches the following merge protections and will not be merged until they are valid.
🟢 Require kind label: Wonderful, this rule succeeded.
🟢 Require version label: Wonderful, this rule succeeded.
|
There was a problem hiding this comment.
Code Review
This pull request introduces the MCI index, a dense-vector index utilizing maximal-clique candidate structures and an optional HGraph hybrid overlay for filtered searches. The implementation includes the core algorithm, parameter handling, comprehensive documentation, and benchmark configurations. Feedback identifies a duplicated key in a benchmark YAML file and suggests using YAML anchors to manage configuration duplication. Additionally, the reviewer recommends defining variables in documentation code snippets for clarity, replacing std::getenv and std::cerr with more robust configuration and logging mechanisms, allowing a limited_size of zero in range searches, and refactoring duplicated result-set creation logic into a helper function.
| MCI/WUFUFILTER_5M/FILTER_TOP20/L160_SEED3600: | ||
| datapath: "/root/data/wufufilter-5m-128-euclidean.h5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":128,"dtype":"float32","metric_type":"l2","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":15,"alpha":1.2,"build_thread_count":32}}' | ||
| search_params: '{"mci":{"ef_search":1600,"seed_count":360}}' | ||
| index_path: "/root/vsag/benchs/indexes/wufufilter/mci_sq8_self_build_m32_mcs200_c15.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 10000 | ||
| delete_index_after_search: false |
There was a problem hiding this comment.
| HGRAPH/CODEFILTER_3M/FILTER_TOP20/P1_EF50: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "hgraph" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}' | ||
| search_params: '{"hgraph":{"ef_search":50}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HGRAPH/CODEFILTER_3M/FILTER_TOP20/P2_EF100: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "hgraph" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}' | ||
| search_params: '{"hgraph":{"ef_search":100}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HGRAPH/CODEFILTER_3M/FILTER_TOP20/P3_EF200: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "hgraph" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}' | ||
| search_params: '{"hgraph":{"ef_search":200}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HGRAPH/CODEFILTER_3M/FILTER_TOP20/P4_EF400: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "hgraph" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}' | ||
| search_params: '{"hgraph":{"ef_search":400}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| MCI/CODEFILTER_3M/FILTER_TOP20/P1_L20_SEED3600: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80}}' | ||
| search_params: '{"mci":{"ef_search":20,"seed_count":3600}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| MCI/CODEFILTER_3M/FILTER_TOP20/P2_L40_SEED3600: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80}}' | ||
| search_params: '{"mci":{"ef_search":40,"seed_count":3600}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| MCI/CODEFILTER_3M/FILTER_TOP20/P3_L80_SEED3600: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80}}' | ||
| search_params: '{"mci":{"ef_search":80,"seed_count":3600}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| MCI/CODEFILTER_3M/FILTER_TOP20/P4_L160_SEED3600: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80}}' | ||
| search_params: '{"mci":{"ef_search":160,"seed_count":3600}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| MCI/CODEFILTER_3M/FILTER_TOP20/P5_L320_SEED3600: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80}}' | ||
| search_params: '{"mci":{"ef_search":320,"seed_count":3600}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| MCI/CODEFILTER_3M/FILTER_TOP20/P6_L640_SEED3600: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80}}' | ||
| search_params: '{"mci":{"ef_search":640,"seed_count":3600}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| MCI/CODEFILTER_3M/FILTER_TOP20/P7_L1280_SEED3600: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80}}' | ||
| search_params: '{"mci":{"ef_search":1280,"seed_count":3600}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.2/CODEFILTER_3M/FILTER_TOP20/P1_L20_EF50: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.2,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":20,"seed_count":3600},"hgraph":{"ef_search":50}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.2/CODEFILTER_3M/FILTER_TOP20/P2_L40_EF100: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.2,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":40,"seed_count":3600},"hgraph":{"ef_search":100}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.2/CODEFILTER_3M/FILTER_TOP20/P3_L80_EF200: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.2,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":80,"seed_count":3600},"hgraph":{"ef_search":200}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.2/CODEFILTER_3M/FILTER_TOP20/P4_L160_EF400: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.2,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":160,"seed_count":3600},"hgraph":{"ef_search":400}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.2/CODEFILTER_3M/FILTER_TOP20/P5_L320_EF800: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.2,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":320,"seed_count":3600},"hgraph":{"ef_search":800}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.2/CODEFILTER_3M/FILTER_TOP20/P6_L640_EF1600: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.2,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":640,"seed_count":3600},"hgraph":{"ef_search":1600}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.2/CODEFILTER_3M/FILTER_TOP20/P7_L1280_EF2000: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.2,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":1280,"seed_count":3600},"hgraph":{"ef_search":2000}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.05/CODEFILTER_3M/FILTER_TOP20/P1_L20_EF50: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.05,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":20,"seed_count":3600},"hgraph":{"ef_search":50}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.05/CODEFILTER_3M/FILTER_TOP20/P2_L40_EF100: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.05,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":40,"seed_count":3600},"hgraph":{"ef_search":100}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.05/CODEFILTER_3M/FILTER_TOP20/P3_L80_EF200: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.05,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":80,"seed_count":3600},"hgraph":{"ef_search":200}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.05/CODEFILTER_3M/FILTER_TOP20/P4_L160_EF400: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.05,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":160,"seed_count":3600},"hgraph":{"ef_search":400}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.05/CODEFILTER_3M/FILTER_TOP20/P5_L320_EF800: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.05,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":320,"seed_count":3600},"hgraph":{"ef_search":800}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.05/CODEFILTER_3M/FILTER_TOP20/P6_L640_EF1600: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.05,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":640,"seed_count":3600},"hgraph":{"ef_search":1600}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false | ||
|
|
||
| HYBRID0.05/CODEFILTER_3M/FILTER_TOP20/P7_L1280_EF2000: | ||
| datapath: "/tmp/codefilter-3m-384-angular-vsag-eval.hdf5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":384,"dtype":"float32","metric_type":"cosine","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":50,"alpha":1.2,"knng_path":"/root/data/codefilter-3m-384-angular/clique_index/knng200_cliqueMax50_mcs200/knng_200.bin","build_thread_count":80,"use_hgraph_hybrid":true,"hgraph_valid_ratio_threshold":0.05,"hgraph_index_path":"/root/vsag/benchs/indexes/codefilter/hgraph_fp32_cosine_odescent_m32","hgraph_ef_search":50,"hgraph_index_param":{"base_quantization_type":"fp32","graph_type":"odescent","max_degree":32,"alpha":1.2,"graph_iter_turn":20,"neighbor_sample_rate":0.2}}}' | ||
| search_params: '{"mci":{"ef_search":1280,"seed_count":3600},"hgraph":{"ef_search":2000}}' | ||
| index_path: "/root/vsag/benchs/indexes/codefilter/mci_sq8_self_build_knng200_m32_mcs200_c50.index" | ||
| topk: 20 | ||
| search_mode: "knn_filter" | ||
| search_query_count: 1278 | ||
| delete_index_after_search: false No newline at end of file |
There was a problem hiding this comment.
This configuration file has a significant amount of duplication across different test cases (e.g., HGRAPH/*, MCI/*, HYBRID*/*). Using YAML anchors and aliases would make this file much more concise and easier to maintain. You could define a base template for each index type and then merge and override specific parameters for each test case.
| auto base = vsag::Dataset::Make(); | ||
| base->NumElements(n)->Dim(128)->Ids(ids)->Float32Vectors(data)->Owner(false); | ||
| index->Build(base); |
There was a problem hiding this comment.
The variables n, ids, and data are used in this code snippet but are not defined. For clarity and to make the example easier to use, please consider adding their definitions, for example:
int64_t n = 10000;
std::vector<int64_t> ids(n);
std::vector<float> data(n * 128);
// ... populate ids and data ...
| auto query = vsag::Dataset::Make(); | ||
| query->NumElements(1)->Dim(128)->Float32Vectors(q)->Owner(false); |
| auto base = vsag::Dataset::Make(); | ||
| base->NumElements(n)->Dim(128)->Ids(ids)->Float32Vectors(data)->Owner(false); | ||
| index->Build(base); |
| auto query = vsag::Dataset::Make(); | ||
| query->NumElements(1)->Dim(128)->Float32Vectors(q)->Owner(false); |
| const auto node_clique_limit = std::max<uint32_t>(3, static_cast<uint32_t>(total / 100)); | ||
| const auto max_saved_per_seed = | ||
| std::min<uint64_t>(candidate_limit, static_cast<uint64_t>(this->max_degree_ + 2)); | ||
| const auto enable_build_stats = std::getenv("VSAG_MCI_BUILD_STATS") != nullptr; |
| if (enable_build_stats) { | ||
| std::cerr << "mci_build_round round=" << round + 1 << " alpha=" << now_alpha | ||
| << " uncovered=" << uncovered | ||
| << " round_cliques=" << cliques.size() - cliques_before_round | ||
| << " total_cliques=" << cliques.size() << std::endl; | ||
| } |
There was a problem hiding this comment.
Directly writing to std::cerr is generally discouraged in a library. It's better to use the project's logging framework (e.g., logger::info or logger::debug) to output these build statistics. This allows users of the library to control log levels and destinations. This also applies to lines 1352-1364.
| "query.dim({}) must be equal to index.dim({})", request.query_->GetDim(), dim_)); | ||
| CHECK_ARGUMENT(request.query_->GetFloat32Vectors() != nullptr, "query.float_vector is nullptr"); | ||
| CHECK_ARGUMENT(request.topk_ > 0, "mci topk must be positive"); | ||
| CHECK_ARGUMENT(request.limited_size_ != 0, "mci limited_size must not be equal to 0"); |
There was a problem hiding this comment.
This check prevents limited_size from being 0. However, a limited_size of 0 could be a valid use case for a range search if the user wants to get search statistics without retrieving any results. The current implementation seems to handle a size-0 heap correctly. Consider removing this check to allow for this possibility.
| auto [dataset_results, dists, ids] = | ||
| create_fast_dataset(static_cast<int64_t>(heap->Size()), allocator_); | ||
| for (auto i = static_cast<int64_t>(heap->Size() - 1); i >= 0; --i) { | ||
| dists[i] = heap->Top().first; | ||
| ids[i] = this->label_table_->GetLabelById(heap->Top().second); | ||
| heap->Pop(); | ||
| } |
There was a problem hiding this comment.
Pull request overview
Adds the new MCI dense-vector index type (with optional hybrid routing to an external HGraph for broad filtered searches), plus evaluation tooling/docs/examples to support benchmarking and adoption across the repo.
Changes:
- Introduces the MCI index implementation, parameter parsing, factory registration, and unit tests.
- Updates the eval pipeline for filtered searches that may return fewer than `topk`, and adds a KNNG export tool.
- Adds C++ example and English/Chinese documentation + benchmark YAML presets for MCI and MCI/HGraph hybrid workflows.
Reviewed changes
Copilot reviewed 47 out of 47 changed files in this pull request and generated 4 comments.
Show a summary per file
| File | Description |
|---|---|
| tools/eval/monitor/recall_monitor.cpp | Updates recall computation to handle filtered results returning fewer than topk. |
| tools/eval/export_knng.cpp | Adds a CLI tool to export a fixed-width KNNG binary by running HGraph searches over base vectors. |
| tools/eval/CMakeLists.txt | Builds and links the new export_knng executable and ensures OpenMP flags are applied. |
| tools/eval/case/search_eval_case.h | Adds cached valid-id lists to support Filter::GetValidIds() in eval runs. |
| tools/eval/case/search_eval_case.cpp | Enables parallel filtered KNN eval and supplies result count + valid-id hints to monitors/filters. |
| src/inner_string_params.h | Adds internal string constant for mci index type. |
| src/factory/index_creators.cpp | Wires MCI into the index factory registration. |
| src/factory/factory_test.cpp | Adds factory coverage to create an MCI index from full parameters. |
| src/constants.cpp | Exposes INDEX_MCI constant for API/config usage. |
| src/algorithm/mci.h | Declares the MCI index class and its hybrid-search hooks. |
| src/algorithm/mci.cpp | Implements MCI build/search/serialize/deserialize, clique enumeration, and hybrid routing to HGraph. |
| src/algorithm/mci_test.cpp | Adds unit tests for MCI build, filtered search, RabitQ one-bit mode, KNNG import, and hybrid overlay. |
| src/algorithm/mci_parameter.h | Defines MCI build/search parameter schemas (including hybrid overlay settings). |
| src/algorithm/mci_parameter.cpp | Implements MCI parameter parsing, validation, JSON conversion, and search param parsing. |
| src/algorithm/mci_parameter_test.cpp | Adds parameter round-trip and compatibility tests for MCI (including hybrid settings). |
| include/vsag/index.h | Extends public IndexType enum with MCI. |
| include/vsag/constants.h | Declares INDEX_MCI in the public constants header. |
| examples/cpp/CMakeLists.txt | Builds the new runnable example for MCI hybrid filtered search. |
| examples/cpp/322_feature_mci_hybrid_filter.cpp | Demonstrates exporting KNNG from HGraph, building MCI, and running hybrid filtered queries routed to HGraph. |
| docs/docs/zh/src/SUMMARY.md | Adds MCI page to Chinese docs navigation. |
| docs/docs/zh/src/resources/index_parameters.md | Documents MCI build/search parameters in Chinese. |
| docs/docs/zh/src/indexes/README.md | Adds MCI to Chinese index overview table. |
| docs/docs/zh/src/indexes/mci.md | Adds Chinese MCI documentation page (incl. hybrid overlay and KNNG format). |
| docs/docs/zh/src/guide/create_index.md | Lists mci in Chinese create-index guide. |
| docs/docs/en/src/SUMMARY.md | Adds MCI page to English docs navigation. |
| docs/docs/en/src/resources/index_parameters.md | Documents MCI build/search parameters in English. |
| docs/docs/en/src/indexes/README.md | Adds MCI to English index overview table. |
| docs/docs/en/src/indexes/mci.md | Adds English MCI documentation page (incl. hybrid overlay and KNNG format). |
| docs/docs/en/src/guide/create_index.md | Lists mci in English create-index guide. |
| benchs/datasets/wufufilter_5m_mci_self_build.yml | Adds benchmark preset for self-built MCI on WUFUFILTER 5M. |
| benchs/datasets/wufufilter_5m_mci_self_build_sq8.yml | Adds benchmark preset variant using SQ8 base quantization. |
| benchs/datasets/wufufilter_5m_mci_self_build_search_sweep.yml | Adds WUFUFILTER 5M MCI search sweep presets. |
| benchs/datasets/wufufilter_5m_mci_hgraph_knng_search_sweep.yml | Adds sweep presets for MCI built from an HGraph-derived KNNG. |
| benchs/datasets/wufufilter_5m_mci_hgraph_knng_build.yml | Adds build+search preset for MCI using an HGraph-derived KNNG. |
| benchs/datasets/wufufilter_5m_mci_hgraph_hybrid_search_sweep.yml | Adds sweep presets for hybrid MCI/HGraph routing experiments. |
| benchs/datasets/wufufilter_5m_hgraph_mci_hybrid_compare_search.yml | Adds side-by-side comparison presets (HGraph vs MCI vs hybrid thresholds). |
| benchs/datasets/wufufilter_5m_hgraph_build.yml | Adds baseline HGraph build preset for WUFUFILTER 5M. |
| benchs/datasets/wufufilter_5m_filtered_hgraph_search_sweep.yml | Adds filtered-search sweep presets for HGraph on WUFUFILTER 5M. |
| benchs/datasets/gist1m_sq8_uniform_pure_baseline.yml | Adds baseline gist1m SQ8-uniform HGraph benchmark presets. |
| benchs/datasets/gist1m_sq8_pure_baseline.yml | Adds baseline gist1m SQ8 HGraph benchmark presets. |
| benchs/datasets/gist1m_base_quantization_sq8_vs_rabitq_build_search.yml | Adds gist1m build+search comparison presets (SQ8 vs RabitQ). |
| benchs/datasets/codefilter_3m_mci_self_build.yml | Adds codefilter 3M MCI benchmark preset(s). |
| benchs/datasets/codefilter_3m_mci_self_build_search_sweep.yml | Adds codefilter 3M MCI search sweep presets. |
| benchs/datasets/codefilter_3m_ivf_build.yml | Adds codefilter 3M IVF build preset for comparison. |
| benchs/datasets/codefilter_3m_hgraph_mci_hybrid_compare_search.yml | Adds codefilter 3M comparison presets across HGraph/MCI/hybrid. |
| benchs/datasets/codefilter_3m_hgraph_build.yml | Adds baseline HGraph build preset for codefilter 3M. |
| benchs/datasets/codefilter_3m_filtered_search_sweep.yml | Adds filtered-search sweep reference presets for codefilter 3M. |
| auto [neighbors, result_count, gt_neighbors, dataset, query_data, topk] = | ||
| *(reinterpret_cast< | ||
| std::tuple<int64_t*, uint64_t, int64_t*, EvalDataset*, const void*, uint64_t>*>( | ||
| input)); |
| MCI::Serialize(StreamWriter& writer) const { | ||
| this->base_codes_->Serialize(writer); | ||
| if (this->reorder_codes_ != nullptr) { | ||
| this->reorder_codes_->Serialize(writer); | ||
| } | ||
| this->label_table_->Serialize(writer); | ||
| StreamWriter::WriteVector(writer, this->p_maxc_); | ||
| StreamWriter::WriteVector(writer, this->maxcs_); | ||
| StreamWriter::WriteVector(writer, this->p_node_to_cid_); | ||
| StreamWriter::WriteVector(writer, this->node_to_cids_); | ||
|
|
||
| uint64_t hgraph_serialized_size = 0; | ||
|
|
||
| auto metadata = std::make_shared<Metadata>(); | ||
| JsonType basic_info; | ||
| basic_info["dim"].SetInt(dim_); | ||
| basic_info["total_count"].SetInt(static_cast<int64_t>(this->total_count_.load())); | ||
| basic_info["max_capacity"].SetInt(static_cast<int64_t>(this->max_capacity_.load())); | ||
| basic_info["total_clique_count"].SetInt(static_cast<int64_t>(this->total_clique_count_)); | ||
| basic_info["hgraph_serialized_size"].SetInt(static_cast<int64_t>(hgraph_serialized_size)); | ||
| basic_info[INDEX_PARAM].SetString(this->create_param_ptr_->ToString()); | ||
| metadata->Set(BASIC_INFO, basic_info); | ||
| auto footer = std::make_shared<Footer>(metadata); | ||
| footer->Write(writer); |
| JsonType | ||
| MCIParameter::ToJson() const { | ||
| JsonType json = InnerIndexParameter::ToJson(); | ||
| json[TYPE_KEY].SetString(INDEX_TYPE_MCI); | ||
| json[BASE_CODES_KEY].SetJson(this->base_codes_param->ToJson()); | ||
| json[MCI_PARAMETER_MAX_DEGREE].SetInt(static_cast<int64_t>(this->max_degree)); | ||
| json[MCI_PARAMETER_MCS].SetInt(static_cast<int64_t>(this->mcs)); | ||
| json[MCI_PARAMETER_CLIQUE_MAX].SetInt(static_cast<int64_t>(this->clique_max)); | ||
| json[MCI_PARAMETER_ALPHA].SetFloat(this->alpha); | ||
| json[MCI_PARAMETER_KNNG_PATH].SetString(this->knng_path); | ||
| json[MCI_PARAMETER_CLIQUE_PATH].SetString(this->clique_path); | ||
| json[MCI_PARAMETER_USE_HGRAPH_HYBRID].SetBool(this->use_hgraph_hybrid); | ||
| json[MCI_PARAMETER_HGRAPH_VALID_RATIO_THRESHOLD].SetFloat(this->hgraph_valid_ratio_threshold); | ||
| if (this->hgraph_param != nullptr) { | ||
| json[MCI_PARAMETER_HGRAPH_INDEX_PARAM].SetJson(this->hgraph_param->ToJson()); | ||
| } | ||
| json[MCI_PARAMETER_HGRAPH_EF_SEARCH].SetInt(this->hgraph_ef_search); | ||
| return json; | ||
| } |
| MCI/WUFUFILTER_5M/FILTER_TOP20/L160_SEED3600: | ||
| datapath: "/root/data/wufufilter-5m-128-euclidean.h5" | ||
| type: "search" | ||
| index_name: "mci" | ||
| create_params: '{"dim":128,"dtype":"float32","metric_type":"l2","index_param":{"base_quantization_type":"sq8","base_codes_type":"flatten","max_degree":32,"mcs":200,"clique_max":15,"alpha":1.2,"build_thread_count":32}}' |
Change Type
Linked Issue
N/A (`kind/improvement`)
What Changed
`tools/eval/export_knng.cpp`, benchmark YAMLs, and evaluator fixes for filtered searches that return fewer than `topk` ids.
`examples/cpp/322_feature_mci_hybrid_filter.cpp`.
Test Evidence
- `make fmt`
- `make lint`
- `make test`
- `make cov`, run tests, and collect coverage
Test details:
Compatibility Impact
`mci` index type and related constants.
`hgraph_index_path`; eval recall handling now tolerates filtered searches that return fewer than `topk` results.
Performance and Concurrency Impact
Documentation Impact
- `README.md`
- `DEVELOPMENT.md`
- `CONTRIBUTING.md`
- `docs/docs/en/src/indexes/mci.md`, `docs/docs/zh/src/indexes/mci.md`, related nav/guide/parameter pages
Risk and Rollback
`0cd95c72`.
Checklist
`kind/bug` and `kind/feature`; see "Linked Issue" above)
`[skip ci]` prefix)