@@ -48,54 +48,71 @@ pub const ALL_VECTOR_DATASETS: &[VectorDataset] = &[
4848 VectorDataset :: LaionLarge100m ,
4949] ;
5050
51+ // NB: We can't do `#[clap(rename_all = "kebab-case")]` here because it won't put a dash in front of
52+ // any numbers.
5153/// The publicly hosted vector benchmark datasets.
5254///
5355/// Variants are named `<source><size><rowcount>`, kebab-cased on the CLI (e.g. `cohere-large-10m`).
5456///
5557/// The static metadata for each variant (dimensionality, row count, hosted layouts, etc.) is
5658/// exposed via the inherent methods below; the full table is reachable via [`ALL_VECTOR_DATASETS`].
5759#[ derive( Debug , Clone , Copy , PartialEq , Eq , Hash , ValueEnum ) ]
58- #[ clap( rename_all = "kebab-case" ) ]
5960pub enum VectorDataset {
6061 /// Cohere wiki-22-12, 100K × 768 f32, cosine. Single + SingleShuffled.
62+ #[ clap( name = "cohere-small-100k" ) ]
6163 CohereSmall100k ,
6264 /// Cohere wiki-22-12, 1M × 768 f32, cosine. Single + SingleShuffled.
65+ #[ clap( name = "cohere-medium-1m" ) ]
6366 CohereMedium1m ,
6467 /// Cohere wiki-22-12, 10M × 768 f32, cosine. Partitioned + PartitionedShuffled (10 shards).
68+ #[ clap( name = "cohere-large-10m" ) ]
6569 CohereLarge10m ,
6670
6771 /// OpenAI embeddings on C4, 50K × 1536 f64, cosine. Single + SingleShuffled.
72+ #[ clap( name = "openai-small-50k" ) ]
6873 OpenaiSmall50k ,
6974 /// OpenAI embeddings on C4, 500K × 1536 f64, cosine. Single + SingleShuffled.
75+ #[ clap( name = "openai-medium-500k" ) ]
7076 OpenaiMedium500k ,
7177 /// OpenAI embeddings on C4, 5M × 1536 f64, cosine. Partitioned + PartitionedShuffled (10
7278 /// shards).
79+ #[ clap( name = "openai-large-5m" ) ]
7380 OpenaiLarge5m ,
7481
7582 /// Bioasq biomedical, 1M × 1024 f32, cosine. SingleShuffled only.
83+ #[ clap( name = "bioasq-medium-1m" ) ]
7684 BioasqMedium1m ,
7785 /// Bioasq biomedical, 10M × 1024 f32, cosine. PartitionedShuffled only (10 shards).
86+ #[ clap( name = "bioasq-large-10m" ) ]
7887 BioasqLarge10m ,
7988
8089 /// GloVe word vectors, 100K × 200 f32, cosine. Single only. No neighbors / labels.
90+ #[ clap( name = "glove-small-100k" ) ]
8191 GloveSmall100k ,
8292 /// GloVe word vectors, 1M × 200 f32, cosine. Single only. No neighbors / labels.
93+ #[ clap( name = "glove-medium-1m" ) ]
8394 GloveMedium1m ,
8495
8596 /// GIST image features, 100K × 960 f32, L2. Single only. No neighbors / labels.
97+ #[ clap( name = "gist-small-100k" ) ]
8698 GistSmall100k ,
8799 /// GIST image features, 1M × 960 f32, L2. Single only. No neighbors / labels.
100+ #[ clap( name = "gist-medium-1m" ) ]
88101 GistMedium1m ,
89102
90103 /// SIFT image features, 500K × 128 f32, L2. Single only. No neighbors / labels.
104+ #[ clap( name = "sift-small-500k" ) ]
91105 SiftSmall500k ,
92106 /// SIFT image features, 5M × 128 f32, L2. Single only. No neighbors / labels.
107+ #[ clap( name = "sift-medium-5m" ) ]
93108 SiftMedium5m ,
94109 /// SIFT image features, 50M × 128 f32, L2. Partitioned only (50 shards). No labels.
110+ #[ clap( name = "sift-large-50m" ) ]
95111 SiftLarge50m ,
96112
97113 /// LAION image embeddings, 100M × 768 f32, L2. Partitioned only (100 shards).
98114 /// Has `neighbors.parquet` and `scalar_labels.parquet`.
115+ #[ clap( name = "laion-large-100m" ) ]
99116 LaionLarge100m ,
100117}
101118
@@ -305,12 +322,6 @@ impl VectorDataset {
305322 }
306323 }
307324 }
308-
309- /// Pick the default layout for this dataset — the first entry in [`Self::layouts`].
310- /// Stable across runs since the catalog table is statically ordered.
311- pub fn default_layout ( & self ) -> LayoutSpec {
312- self . layouts ( ) [ 0 ]
313- }
314325}
315326
316327#[ cfg( test) ]
0 commit comments