diff --git a/Cargo.toml b/Cargo.toml index dba78d8..4c20502 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "massmap" -version = "0.1.3" +version = "0.1.4" edition = "2024" authors = ["SF-Zhou "] homepage = "https://github.com/SF-Zhou/massmap" diff --git a/README.md b/README.md index 86db8b7..df52437 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ massmap convert -i examples/demo.json -o examples/demo.massmap --bucket-count 32 #> { #> "header": { #> "meta_offset": 486, -#> "meta_length": 192 +#> "meta_length": 239 #> }, #> "meta": { #> "entry_count": 47, @@ -82,7 +82,9 @@ massmap convert -i examples/demo.json -o examples/demo.massmap --bucket-count 32 #> "parameters": { #> "seed": 0 #> } -#> } +#> }, +#> "key_type": "alloc::string::String", +#> "value_type": "serde_json::value::Value" #> } #> } @@ -91,7 +93,7 @@ massmap info examples/demo.massmap -k 1999 #> { #> "header": { #> "meta_offset": 486, -#> "meta_length": 192 +#> "meta_length": 239 #> }, #> "meta": { #> "entry_count": 47, @@ -106,15 +108,17 @@ massmap info examples/demo.massmap -k 1999 #> "parameters": { #> "seed": 0 #> } -#> } +#> }, +#> "key_type": "alloc::string::String", +#> "value_type": "serde_json::value::Value" #> } #> } -#> 1999: Some(Number(7229)) +#> Get 1999: Some(Number(7229)) # 4. view the raw bytes of the massmap file hexdump -C examples/demo.massmap #> 00000000 4d 41 53 53 4d 41 50 21 00 00 00 00 00 00 01 e6 |MASSMAP!........| -#> 00000010 00 00 00 00 00 00 00 c0 92 92 a4 31 39 39 34 cd |...........1994.| +#> 00000010 00 00 00 00 00 00 00 ef 92 92 a4 31 39 39 34 cd |...........1994.| #> 00000020 0f f1 92 a4 32 30 32 30 ce 00 01 18 94 91 92 a4 |....2020........| #> 00000030 32 30 32 34 ce 00 01 76 05 92 92 a4 31 39 38 39 |2024...v....1989| #> 00000040 cd 06 00 92 a4 32 30 30 39 cd 66 44 92 92 a4 31 |.....2009.fD...1| @@ -143,20 +147,22 @@ hexdump -C examples/demo.massmap #> 000001b0 e9 91 92 a4 31 39 38 33 cd 02 4c 91 92 a4 31 39 |....1983..L...19| #> 000001c0 38 32 cd 02 15 92 92 a4 31 39 39 30 cd 06 7f 92 |82......1990....| #> 000001d0 a4 32 30 31 39 ce 00 01 11 be 91 92 a4 32 30 32 |.2019........202| -#> 000001e0 32 ce 00 01 4e c2 92 95 2f 20 1b 92 00 20 92 a8 |2...N.../ ... ..| -#> 000001f0 66 6f 6c 64 68 61 73 68 81 a4 73 65 65 64 00 dc |foldhash..seed..| -#> 00000200 00 20 93 18 15 02 93 2d 0c 01 93 39 13 02 93 4c |. .....-...9...L| -#> 00000210 13 02 93 5f 25 04 93 cc 84 0a 01 93 cc 8e 1c 03 |..._%...........| -#> 00000220 93 cc aa 13 02 93 cc bd 0a 01 93 cc c7 0a 01 93 |................| -#> 00000230 00 00 00 93 00 00 00 93 cc d1 15 02 93 cc e6 13 |................| -#> 00000240 02 93 00 00 00 93 cc f9 25 04 93 cd 01 1e 0a 01 |........%.......| -#> 00000250 93 cd 01 28 13 02 93 cd 01 3b 0a 01 93 cd 01 45 |...(.....;.....E| -#> 00000260 13 02 93 00 00 00 93 cd 01 58 13 02 93 00 00 00 |.........X......| -#> 00000270 93 cd 01 6b 0c 01 93 cd 01 77 0a 01 93 cd 01 81 |...k.....w......| -#> 00000280 13 02 93 cd 01 94 13 02 93 cd 01 a7 0a 01 93 cd |................| -#> 00000290 01 b1 0a 01 93 cd 01 bb 0a 01 93 cd 01 c5 15 02 |................| -#> 000002a0 93 cd 01 da 0c 01 |......| -#> 000002a6 +#> 000001e0 32 ce 00 01 4e c2 92 97 2f 20 1b 92 00 20 92 a8 |2...N.../ ... ..| +#> 000001f0 66 6f 6c 64 68 61 73 68 81 a4 73 65 65 64 00 b5 |foldhash..seed..| +#> 00000200 61 6c 6c 6f 63 3a 3a 73 74 72 69 6e 67 3a 3a 53 |alloc::string::S| +#> 00000210 74 72 69 6e 67 b8 73 65 72 64 65 5f 6a 73 6f 6e |tring.serde_json| +#> 00000220 3a 3a 76 61 6c 75 65 3a 3a 56 61 6c 75 65 dc 00 |::value::Value..| +#> 00000230 20 93 18 15 02 93 2d 0c 01 93 39 13 02 93 4c 13 | .....-...9...L.| +#> 00000240 02 93 5f 25 04 93 cc 84 0a 01 93 cc 8e 1c 03 93 |.._%............| +#> 00000250 cc aa 13 02 93 cc bd 0a 01 93 cc c7 0a 01 93 00 |................| +#> 00000260 00 00 93 00 00 00 93 cc d1 15 02 93 cc e6 13 02 |................| +#> 00000270 93 00 00 00 93 cc f9 25 04 93 cd 01 1e 0a 01 93 |.......%........| +#> 00000280 cd 01 28 13 02 93 cd 01 3b 0a 01 93 cd 01 45 13 |..(.....;.....E.| +#> 00000290 02 93 00 00 00 93 cd 01 58 13 02 93 00 00 00 93 |........X.......| +#> 000002a0 cd 01 6b 0c 01 93 cd 01 77 0a 01 93 cd 01 81 13 |..k.....w.......| +#> 000002b0 02 93 cd 01 94 13 02 93 cd 01 a7 0a 01 93 cd 01 |................| +#> 000002c0 b1 0a 01 93 cd 01 bb 0a 01 93 cd 01 c5 15 02 93 |................| +#> 000002d0 cd 01 da 0c 01 |.....| ``` ## Configuration diff --git a/examples/massmap.rs b/examples/massmap.rs index d55a88f..cb4a166 100644 --- a/examples/massmap.rs +++ b/examples/massmap.rs @@ -2,9 +2,11 @@ use clap::{Parser, Subcommand}; use foldhash::fast::FixedState; use massmap::{ MassMap, MassMapBuilder, MassMapDefaultHashLoader, MassMapHashConfig, MassMapHashLoader, - MassMapMerger, + MassMapInner, MassMapMerger, MassMapReader, }; +use serde::{Deserialize, Serialize}; use serde_json::Value; +use std::fmt::Display; use std::fs::File; use std::io::{BufReader, Error, ErrorKind, Result}; use std::path::{Path, PathBuf}; @@ -116,26 +118,27 @@ impl MassMapHashLoader for MassMapTolerableHashLoader { } } -fn run_info(args: InfoArgs) -> Result<()> { - let file = File::open(&args.input)?; - - let map = MassMap::::load(file)?; - - let json = serde_json::to_string_pretty(&map.info()) - .map_err(|e| Error::other(format!("Failed to format JSON: {e}")))?; - println!("{}", json); - - if let Some(key) = args.key { - println!("{}: {:?}", key, map.get(&key)?); +fn do_query( + map: MassMap, + key: Option, + bucket: Option, +) -> Result<()> +where + K: Serialize + for<'de> Deserialize<'de> + Display + std::hash::Hash + Eq, + R: MassMapReader, +{ + if let Some(key) = key { + println!("Get {}: {:?}", key, map.get(&key)?); } - if let Some(bucket_index) = args.bucket { - if bucket_index >= map.meta.bucket_count { + if let Some(bucket_index) = bucket { + if bucket_index as usize >= map.bucket_count() { return Err(Error::new( ErrorKind::InvalidInput, format!( "Bucket index {} out of range >= {}", - bucket_index, map.meta.bucket_count + bucket_index, + map.bucket_count() ), )); } @@ -148,6 +151,56 @@ fn run_info(args: InfoArgs) -> Result<()> { Ok(()) } +fn run_info(args: InfoArgs) -> Result<()> { + let file = File::open(&args.input)?; + + let map = MassMapInner::<_, MassMapTolerableHashLoader>::load(file)?; + + let json = serde_json::to_string_pretty(&map.info()) + .map_err(|e| Error::other(format!("Failed to format JSON: {e}")))?; + println!("{}", json); + + match map.meta.key_type.as_str() { + "u8" => do_query( + map.cast::(), + args.key.map(|x| x.parse().unwrap()), + args.bucket, + )?, + "u16" => do_query( + map.cast::(), + args.key.map(|x| x.parse().unwrap()), + args.bucket, + )?, + "u32" => do_query( + map.cast::(), + args.key.map(|x| x.parse().unwrap()), + args.bucket, + )?, + "u64" => do_query( + map.cast::(), + args.key.map(|x| x.parse().unwrap()), + args.bucket, + )?, + "u128" => do_query( + map.cast::(), + args.key.map(|x| x.parse().unwrap()), + args.bucket, + )?, + _ if map.meta.key_type == std::any::type_name::() => { + do_query(map.cast::(), args.key, args.bucket)? + } + _ => { + assert!( + args.key.is_none() && args.bucket.is_none(), + "Unsupported key type: {}", + map.meta.key_type + ); + } + } + + Ok(()) +} + fn run_convert(args: ConvertArgs) -> Result<()> { let entries = load_entries_from_json(&args.input)?; let writer = File::create(&args.output)?; diff --git a/src/builder.rs b/src/builder.rs index bc8e5c4..2627ead 100644 --- a/src/builder.rs +++ b/src/builder.rs @@ -212,6 +212,8 @@ impl MassMapBuilder { bucket_count: self.bucket_count, occupied_bucket_count, occupied_bucket_range, + key_type: std::any::type_name::().to_string(), + value_type: std::any::type_name::().to_string(), }; let meta_offset = offset.load(Ordering::Relaxed) + buf_writer.buffer().len() as u64; @@ -292,39 +294,39 @@ impl MassMapMerger { )); } - maps.sort_by_key(|m| m.meta.occupied_bucket_range.start); + maps.sort_by_key(|m| m.meta().occupied_bucket_range.start); let mut entry_count = 0; let mut bucket_metas = - vec![MassMapBucketMeta::default(); maps[0].meta.bucket_count as usize]; - let hash_config = maps[0].meta.hash_config.clone(); + vec![MassMapBucketMeta::default(); maps[0].meta().bucket_count as usize]; + let hash_config = maps[0].meta().hash_config.clone(); let mut occupied_bucket_count = 0; let mut occupied_bucket_range = 0..0; let mut global_offset = 0u64; for map in &maps { - if map.meta.hash_config != hash_config { + if map.meta().hash_config != hash_config { return Err(Error::new( ErrorKind::InvalidData, "Incompatible hash configurations between massmaps", )); } - if map.meta.bucket_count != bucket_metas.len() as u64 { + if map.meta().bucket_count != bucket_metas.len() as u64 { return Err(Error::new( ErrorKind::InvalidData, "Incompatible bucket counts between massmaps", )); } - if map.meta.entry_count == 0 { + if map.meta().entry_count == 0 { continue; } - occupied_bucket_count += map.meta.occupied_bucket_count; + occupied_bucket_count += map.meta().occupied_bucket_count; if occupied_bucket_range.is_empty() { - occupied_bucket_range = map.meta.occupied_bucket_range.clone(); - } else if occupied_bucket_range.end <= map.meta.occupied_bucket_range.start { - occupied_bucket_range.end = map.meta.occupied_bucket_range.end; + occupied_bucket_range = map.meta().occupied_bucket_range.clone(); + } else if occupied_bucket_range.end <= map.meta().occupied_bucket_range.start { + occupied_bucket_range.end = map.meta().occupied_bucket_range.end; } else { return Err(Error::new( ErrorKind::InvalidData, @@ -333,24 +335,24 @@ impl MassMapMerger { } // update bucket metas. - for idx in map.meta.occupied_bucket_range.clone() { + for idx in map.meta().occupied_bucket_range.clone() { let bucket_meta = &mut bucket_metas[idx as usize]; - *bucket_meta = map.bucket_metas[idx as usize]; + *bucket_meta = map.bucket_metas()[idx as usize]; if bucket_meta.count > 0 { bucket_meta.offset += global_offset; } } - entry_count += map.meta.entry_count; + entry_count += map.meta().entry_count; // copy buckets from reader to writer directly. let mut current_offset = MassMapHeader::SIZE as u64; - let finished_offset = map.header.meta_offset; + let finished_offset = map.header().meta_offset; while current_offset < finished_offset { let chunk = std::cmp::min( finished_offset - current_offset, self.writer_buffer_size as u64, ); - map.reader.read_exact_at(current_offset, chunk, |data| { + map.reader().read_exact_at(current_offset, chunk, |data| { writer.write_all_at(data, global_offset + MassMapHeader::SIZE as u64)?; Ok(()) })?; @@ -365,6 +367,8 @@ impl MassMapMerger { bucket_count: bucket_metas.len() as u64, occupied_bucket_count, occupied_bucket_range, + key_type: std::any::type_name::().to_string(), + value_type: std::any::type_name::().to_string(), }; let meta_offset = global_offset + MassMapHeader::SIZE as u64; @@ -585,9 +589,9 @@ mod tests { threads.push(std::thread::spawn(move || { let entries = (0..N).filter(|v| (v % M) / (M / P) == i).map(|v| (v, v)); let map = create_simple_map(entries, M, M); - assert_eq!(map.meta.occupied_bucket_count, M / P); - assert_eq!(map.meta.entry_count, N / P); - assert_eq!(map.meta.occupied_bucket_range.start, (M / P) * i); + assert_eq!(map.meta().occupied_bucket_count, M / P); + assert_eq!(map.meta().entry_count, N / P); + assert_eq!(map.meta().occupied_bucket_range.start, (M / P) * i); for item in map.iter() { let (k, v) = item.unwrap(); diff --git a/src/lib.rs b/src/lib.rs index 7b2ffc3..5b4fef3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -42,7 +42,7 @@ mod writer; pub use writer::MassMapWriter; mod massmap; -pub use massmap::{MassMap, MassMapIter}; +pub use massmap::{MassMap, MassMapInner, MassMapIter}; mod builder; pub use builder::{MassMapBuilder, MassMapMerger}; diff --git a/src/massmap.rs b/src/massmap.rs index ab0ab65..48a0f54 100644 --- a/src/massmap.rs +++ b/src/massmap.rs @@ -22,7 +22,7 @@ use crate::{ /// - `H`: hash loader used to reconstruct the [`BuildHasher`](BuildHasher) from /// the persisted [`MassMapHashConfig`](crate::MassMapHashConfig). #[derive(Debug)] -pub struct MassMap { +pub struct MassMapInner { /// Header serialized at the start of the massmap file. pub header: MassMapHeader, /// Metadata describing the layout and hashing strategy of the backing file. @@ -33,25 +33,24 @@ pub struct MassMap { + inner: MassMapInner, /// Phantom data to associate key and value types. phantom_data: PhantomData<(K, V)>, } -impl MassMap -where - K: for<'de> Deserialize<'de> + Eq + Hash, - V: for<'de> Deserialize<'de> + Clone, -{ - /// Constructs a massmap from a [`MassMapReader`] implementation. +impl MassMapInner { + /// Constructs an untyped massmap from a [`MassMapReader`] implementation. /// /// The method validates the leading header (magic number, metadata offset and /// length) and deserializes [`MassMapMeta`]. Any IO or deserialization errors /// are forwarded to the caller. - /// - /// # Errors - /// - /// Returns an error when the magic number is invalid, the metadata cannot be - /// read in full, or the MessagePack payload fails to deserialize. pub fn load(reader: R) -> Result { let header = reader.read_exact_at(0, MassMapHeader::SIZE as u64, MassMapHeader::deserialize)?; @@ -67,13 +66,12 @@ where })?; let build_hasher = H::load(&meta.hash_config)?; - Ok(MassMap { + Ok(MassMapInner { header, meta, bucket_metas, build_hasher, reader, - phantom_data: PhantomData, }) } @@ -95,6 +93,81 @@ where } } + /// Casts this untyped massmap into a typed view with the specified key and value types. + pub fn cast(self) -> MassMap + where + K: for<'de> Deserialize<'de> + Eq + Hash, + V: for<'de> Deserialize<'de> + Clone, + { + MassMap { + inner: self, + phantom_data: PhantomData, + } + } +} + +impl MassMap +where + K: for<'de> Deserialize<'de> + Eq + Hash, + V: for<'de> Deserialize<'de> + Clone, +{ + /// Constructs a massmap from a [`MassMapReader`] implementation. + /// + /// The method validates the leading header (magic number, metadata offset and + /// length) and deserializes [`MassMapMeta`]. Any IO or deserialization errors + /// are forwarded to the caller. + pub fn load(reader: R) -> Result { + let inner = MassMapInner::load(reader)?; + Ok(MassMap { + inner, + phantom_data: PhantomData, + }) + } + + /// Returns the number of entries written into this map. + pub fn len(&self) -> u64 { + self.inner.len() + } + + /// Returns `true` if the map contains no entries. + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + /// Returns the number of buckets in the underlying massmap. + /// + /// This is mainly intended for testing and diagnostics. + pub fn bucket_count(&self) -> usize { + self.inner.bucket_metas.len() + } + + /// Exposes a reference to the underlying immutable metadata. + /// + /// This is primarily intended for internal crate use (e.g. merging maps). + pub(crate) fn meta(&self) -> &MassMapMeta { + &self.inner.meta + } + + /// Exposes a reference to the underlying bucket metadata array. + pub(crate) fn bucket_metas(&self) -> &[MassMapBucketMeta] { + &self.inner.bucket_metas + } + + /// Exposes the underlying header for internal crate use. + pub(crate) fn header(&self) -> &MassMapHeader { + &self.inner.header + } + + /// Exposes a reference to the underlying reader for internal crate use. + pub(crate) fn reader(&self) -> &R { + &self.inner.reader + } + + /// Returns information about the map's structure and contents. + pub fn info(&self) -> MassMapInfo { + self.inner.info() + } + /// Attempts to deserialize the value associated with `k`. /// /// Keys are hashed using the stored seed and only the relevant bucket is @@ -138,11 +211,11 @@ where { let iov = keys.into_iter().map(|key| { let index = self.bucket_index(key.borrow()); - let bucket = &self.bucket_metas[index]; + let bucket = &self.inner.bucket_metas[index]; (key, bucket.offset, bucket.length as u64) }); - self.reader.batch_read_at(iov, |expected, data| { + self.inner.reader.batch_read_at(iov, |expected, data| { if data.is_empty() { return Ok(None); } @@ -204,12 +277,13 @@ where /// Returns an error if the reader fails to provide the bucket or if the /// serialized data cannot be deserialized into `(K, V)` pairs. pub fn get_bucket(&self, index: usize) -> Result> { - let bucket = &self.bucket_metas[index]; + let bucket = &self.inner.bucket_metas[index]; if bucket.count == 0 { return Ok(Vec::new()); } - self.reader + self.inner + .reader .read_exact_at(bucket.offset, bucket.length as u64, |data| { let entries: Vec<(K, V)> = rmp_serde::from_slice(data).map_err(|e| { Error::new( @@ -226,7 +300,7 @@ where K: Borrow, Q: Eq + Hash + ?Sized, { - (self.build_hasher.hash_one(k) % (self.bucket_metas.len() as u64)) as usize + (self.inner.build_hasher.hash_one(k) % (self.inner.bucket_metas.len() as u64)) as usize } } @@ -256,7 +330,7 @@ where } // Move to the next bucket - if self.bucket_index >= self.map.bucket_metas.len() { + if self.bucket_index >= self.map.inner.bucket_metas.len() { return None; } @@ -308,8 +382,11 @@ mod tests { assert_eq!(info, map.info()); assert_eq!(map.len(), 5); assert!(!map.is_empty()); - assert_eq!(map.bucket_metas.len(), 8); - assert_eq!(map.bucket_metas.iter().map(|b| b.count).sum::(), 5); + assert_eq!(map.bucket_count(), 8); + assert_eq!( + map.inner.bucket_metas.iter().map(|b| b.count).sum::(), + 5 + ); assert_eq!(map.get("apple").unwrap(), Some(1)); assert_eq!(map.get("banana").unwrap(), Some(2)); assert_eq!(map.get("steins").unwrap(), None); @@ -342,9 +419,10 @@ mod tests { let map = MassMap::::load(file).unwrap(); assert_eq!(map.len(), N as u64); - assert_eq!(map.bucket_metas.len(), N as usize); + assert_eq!(map.bucket_count(), N as usize); assert_eq!( - map.bucket_metas + map.inner + .bucket_metas .iter() .map(|b| b.count as usize) .sum::(), @@ -391,25 +469,25 @@ mod tests { file.write_all_at(b"invalid data", info.header.meta_offset) .unwrap(); let file = std::fs::File::open(&path).unwrap(); - MassMap::::load(file).unwrap_err(); + assert!(MassMap::::load(file).is_err()); } { file.set_len(info.header.meta_offset + info.header.meta_length - 8) .unwrap(); let file = std::fs::File::open(&path).unwrap(); - MassMap::::load(file).unwrap_err(); + assert!(MassMap::::load(file).is_err()); } { file.write_all_at(b"invalid data", 0).unwrap(); let file = std::fs::File::open(&path).unwrap(); - MassMap::::load(file).unwrap_err(); + assert!(MassMap::::load(file).is_err()); } { let file = std::fs::File::create(&path).unwrap(); - MassMap::::load(file).unwrap_err(); + assert!(MassMap::::load(file).is_err()); } let writer = std::fs::File::create(&path).unwrap(); @@ -592,7 +670,7 @@ mod tests { let file = std::fs::File::open(&path).unwrap(); let map = MassMap::::load(file).unwrap(); - for bucket in &map.bucket_metas { + for bucket in &map.inner.bucket_metas { if bucket.offset != 24 && bucket.count > 0 { // Corrupt the first non-empty bucket let file = std::fs::OpenOptions::new() @@ -616,4 +694,30 @@ mod tests { } assert!(found_error); } + + #[test] + fn test_massmap_cast() { + let dir = tempfile::tempdir().unwrap(); + let file = dir.path().join("massmap_cast.bin"); + let writer = std::fs::File::create(&file).unwrap(); + let entries = vec![ + ("apple", 1), + ("banana", 2), + ("cherry", 3), + ("date", 4), + ("elderberry", 5), + ]; + let builder = MassMapBuilder::default() + .with_hash_seed(42) + .with_bucket_count(8); + builder.build(&writer, entries.iter()).unwrap(); + + let file = std::fs::File::open(&file).unwrap(); + let map = MassMapInner::<_>::load(file).unwrap(); + + let casted_map: MassMap = map.cast(); + assert_eq!(casted_map.get("apple").unwrap(), Some(1i64)); + assert_eq!(casted_map.get("banana").unwrap(), Some(2i64)); + assert_eq!(casted_map.get("steins").unwrap(), None); + } } diff --git a/src/meta.rs b/src/meta.rs index 1acf6be..db6596c 100644 --- a/src/meta.rs +++ b/src/meta.rs @@ -80,6 +80,10 @@ pub struct MassMapMeta { /// Hash configuration used to derive the [`BuildHasher`](std::hash::BuildHasher) /// when reopening the map. pub hash_config: MassMapHashConfig, + /// Key type name. + pub key_type: String, + /// Value type name. + pub value_type: String, } /// Summary returned by [`MassMapBuilder::build`](crate::MassMapBuilder::build).