From 674f2b3a7e27950573ceace89f6c7953f8489c4c Mon Sep 17 00:00:00 2001 From: tison Date: Tue, 30 Dec 2025 14:42:25 +0800 Subject: [PATCH] chore: centralize default seed Signed-off-by: tison --- datasketches/src/countmin/mod.rs | 1 - datasketches/src/countmin/sketch.rs | 8 +++----- datasketches/src/hash/mod.rs | 19 +++++++++++++++++-- datasketches/src/hash/murmurhash.rs | 5 +++-- datasketches/tests/countmin_test.rs | 17 ++++++++--------- 5 files changed, 31 insertions(+), 19 deletions(-) diff --git a/datasketches/src/countmin/mod.rs b/datasketches/src/countmin/mod.rs index b7de4b8..2be9282 100644 --- a/datasketches/src/countmin/mod.rs +++ b/datasketches/src/countmin/mod.rs @@ -24,4 +24,3 @@ mod serialization; mod sketch; pub use self::sketch::CountMinSketch; -pub use self::sketch::DEFAULT_SEED; diff --git a/datasketches/src/countmin/sketch.rs b/datasketches/src/countmin/sketch.rs index e98e96e..3df99f3 100644 --- a/datasketches/src/countmin/sketch.rs +++ b/datasketches/src/countmin/sketch.rs @@ -30,13 +30,11 @@ use crate::countmin::serialization::PREAMBLE_LONGS_SHORT; use crate::countmin::serialization::SERIAL_VERSION; use crate::countmin::serialization::compute_seed_hash; use crate::error::Error; +use crate::hash::DEFAULT_UPDATE_SEED; use crate::hash::MurmurHash3X64128; const MAX_TABLE_ENTRIES: usize = 1 << 30; -/// Default seed used by the sketch. -pub const DEFAULT_SEED: u64 = 9001; - /// Count-Min sketch for estimating item frequencies. /// /// The sketch provides upper and lower bounds on estimated item frequencies @@ -59,7 +57,7 @@ impl CountMinSketch { /// Panics if `num_hashes` is 0, `num_buckets` is less than 3, or the /// total table size exceeds the supported limit. pub fn new(num_hashes: u8, num_buckets: u32) -> Self { - Self::with_seed(num_hashes, num_buckets, DEFAULT_SEED) + Self::with_seed(num_hashes, num_buckets, DEFAULT_UPDATE_SEED) } /// Creates a new Count-Min sketch with the provided seed. @@ -232,7 +230,7 @@ impl CountMinSketch { /// Deserializes a sketch from bytes using the default seed. pub fn deserialize(bytes: &[u8]) -> Result { - Self::deserialize_with_seed(bytes, DEFAULT_SEED) + Self::deserialize_with_seed(bytes, DEFAULT_UPDATE_SEED) } /// Deserializes a sketch from bytes using the provided seed. diff --git a/datasketches/src/hash/mod.rs b/datasketches/src/hash/mod.rs index ae9b280..a094f39 100644 --- a/datasketches/src/hash/mod.rs +++ b/datasketches/src/hash/mod.rs @@ -18,6 +18,21 @@ mod murmurhash; mod xxhash; -pub use murmurhash::MurmurHash3X64128; +pub(crate) use self::murmurhash::MurmurHash3X64128; #[allow(unused_imports)] -pub use xxhash::XxHash64; +pub(crate) use self::xxhash::XxHash64; + +/// The seed 9001 used in the sketch update methods is a prime number that was chosen very early +/// on in experimental testing. +/// +/// Choosing a seed is somewhat arbitrary, and the author cannot prove that this particular seed +/// is somehow superior to other seeds. There was some early Internet discussion that a seed of 0 +/// did not produce as clean avalanche diagrams as non-zero seeds, but this may have been more +/// related to the MurmurHash2 release, which did have some issues. As far as the author can +/// determine, MurmurHash3 does not have these problems. +/// +/// In order to perform set operations on two sketches it is critical that the same hash function +/// and seed are identical for both sketches, otherwise the assumed 1:1 relationship between the +/// original source key value and the hashed bit string would be violated. Once you have developed +/// a history of stored sketches you are stuck with it. +pub(crate) const DEFAULT_UPDATE_SEED: u64 = 9001; diff --git a/datasketches/src/hash/murmurhash.rs b/datasketches/src/hash/murmurhash.rs index 689ee75..e275e36 100644 --- a/datasketches/src/hash/murmurhash.rs +++ b/datasketches/src/hash/murmurhash.rs @@ -20,7 +20,8 @@ use std::hash::Hasher; use byteorder::ByteOrder; use byteorder::LE; -const DEFAULT_SEED: u64 = 9001; +use crate::hash::DEFAULT_UPDATE_SEED; + const C1: u64 = 0x87c37b91114253d5; const C2: u64 = 0x4cf5ad432745937f; @@ -122,7 +123,7 @@ impl MurmurHash3X64128 { impl Default for MurmurHash3X64128 { fn default() -> Self { - Self::with_seed(DEFAULT_SEED) + Self::with_seed(DEFAULT_UPDATE_SEED) } } diff --git a/datasketches/tests/countmin_test.rs b/datasketches/tests/countmin_test.rs index 06e1041..b4b3685 100644 --- a/datasketches/tests/countmin_test.rs +++ b/datasketches/tests/countmin_test.rs @@ -16,14 +16,13 @@ // under the License. use datasketches::countmin::CountMinSketch; -use datasketches::countmin::DEFAULT_SEED; #[test] fn test_init_defaults() { let sketch = CountMinSketch::new(3, 5); assert_eq!(sketch.num_hashes(), 3); assert_eq!(sketch.num_buckets(), 5); - assert_eq!(sketch.seed(), DEFAULT_SEED); + assert_eq!(sketch.seed(), 9001); assert!(sketch.is_empty()); assert_eq!(sketch.total_weight(), 0); assert_eq!(sketch.estimate("missing"), 0); @@ -41,7 +40,7 @@ fn test_parameter_suggestions() { assert_eq!(CountMinSketch::suggest_num_hashes(0.997300204), 6); let buckets = CountMinSketch::suggest_num_buckets(0.1); - let sketch = CountMinSketch::with_seed(3, buckets, DEFAULT_SEED); + let sketch = CountMinSketch::new(3, buckets); assert!(sketch.relative_error() <= 0.1); } @@ -71,8 +70,8 @@ fn test_negative_weights() { #[test] fn test_merge() { - let mut left = CountMinSketch::with_seed(3, 64, DEFAULT_SEED); - let mut right = CountMinSketch::with_seed(3, 64, DEFAULT_SEED); + let mut left = CountMinSketch::new(3, 64); + let mut right = CountMinSketch::new(3, 64); for _ in 0..10 { left.update("a"); } @@ -124,14 +123,14 @@ fn test_invalid_buckets() { #[test] #[should_panic(expected = "Incompatible sketch configuration.")] fn test_merge_incompatible() { - let mut left = CountMinSketch::with_seed(3, 64, DEFAULT_SEED); - let right = CountMinSketch::with_seed(2, 64, DEFAULT_SEED); + let mut left = CountMinSketch::new(3, 64); + let right = CountMinSketch::new(2, 64); left.merge(&right); } #[test] fn test_increment_single_key_like_rust_count_min_sketch() { - let mut sketch = CountMinSketch::with_seed(4, 32, DEFAULT_SEED); + let mut sketch = CountMinSketch::new(4, 32); for _ in 0..300 { sketch.update("key"); } @@ -140,7 +139,7 @@ fn test_increment_single_key_like_rust_count_min_sketch() { #[test] fn test_increment_multi_like_rust_count_min_sketch() { - let mut sketch = CountMinSketch::with_seed(6, 128, DEFAULT_SEED); + let mut sketch = CountMinSketch::new(6, 128); for i in 0..1_000_000u64 { sketch.update(i % 100); }