Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion datasketches/src/countmin/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,3 @@ mod serialization;

mod sketch;
pub use self::sketch::CountMinSketch;
pub use self::sketch::DEFAULT_SEED;
8 changes: 3 additions & 5 deletions datasketches/src/countmin/sketch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,11 @@ use crate::countmin::serialization::PREAMBLE_LONGS_SHORT;
use crate::countmin::serialization::SERIAL_VERSION;
use crate::countmin::serialization::compute_seed_hash;
use crate::error::Error;
use crate::hash::DEFAULT_UPDATE_SEED;
use crate::hash::MurmurHash3X64128;

const MAX_TABLE_ENTRIES: usize = 1 << 30;

/// Default seed used by the sketch.
pub const DEFAULT_SEED: u64 = 9001;

/// Count-Min sketch for estimating item frequencies.
///
/// The sketch provides upper and lower bounds on estimated item frequencies
Expand All @@ -59,7 +57,7 @@ impl CountMinSketch {
/// Panics if `num_hashes` is 0, `num_buckets` is less than 3, or the
/// total table size exceeds the supported limit.
pub fn new(num_hashes: u8, num_buckets: u32) -> Self {
Self::with_seed(num_hashes, num_buckets, DEFAULT_SEED)
Self::with_seed(num_hashes, num_buckets, DEFAULT_UPDATE_SEED)
}

/// Creates a new Count-Min sketch with the provided seed.
Expand Down Expand Up @@ -232,7 +230,7 @@ impl CountMinSketch {

/// Deserializes a sketch from bytes using the default seed.
pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
Self::deserialize_with_seed(bytes, DEFAULT_SEED)
Self::deserialize_with_seed(bytes, DEFAULT_UPDATE_SEED)
}

/// Deserializes a sketch from bytes using the provided seed.
Expand Down
19 changes: 17 additions & 2 deletions datasketches/src/hash/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,21 @@
mod murmurhash;
mod xxhash;

pub use murmurhash::MurmurHash3X64128;
pub(crate) use self::murmurhash::MurmurHash3X64128;
#[allow(unused_imports)]
pub use xxhash::XxHash64;
pub(crate) use self::xxhash::XxHash64;

/// The seed 9001 used in the sketch update methods is a prime number that was chosen very early
/// on in experimental testing.
///
/// Choosing a seed is somewhat arbitrary, and the author cannot prove that this particular seed
/// is somehow superior to other seeds. There was some early Internet discussion that a seed of 0
/// did not produce as clean avalanche diagrams as non-zero seeds, but this may have been more
/// related to the MurmurHash2 release, which did have some issues. As far as the author can
/// determine, MurmurHash3 does not have these problems.
///
/// In order to perform set operations on two sketches it is critical that the same hash function
/// and seed are identical for both sketches, otherwise the assumed 1:1 relationship between the
/// original source key value and the hashed bit string would be violated. Once you have developed
/// a history of stored sketches you are stuck with it.
pub(crate) const DEFAULT_UPDATE_SEED: u64 = 9001;
5 changes: 3 additions & 2 deletions datasketches/src/hash/murmurhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ use std::hash::Hasher;
use byteorder::ByteOrder;
use byteorder::LE;

const DEFAULT_SEED: u64 = 9001;
use crate::hash::DEFAULT_UPDATE_SEED;

const C1: u64 = 0x87c37b91114253d5;
const C2: u64 = 0x4cf5ad432745937f;

Expand Down Expand Up @@ -122,7 +123,7 @@ impl MurmurHash3X64128 {

impl Default for MurmurHash3X64128 {
fn default() -> Self {
Self::with_seed(DEFAULT_SEED)
Self::with_seed(DEFAULT_UPDATE_SEED)
}
}

Expand Down
17 changes: 8 additions & 9 deletions datasketches/tests/countmin_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,13 @@
// under the License.

use datasketches::countmin::CountMinSketch;
use datasketches::countmin::DEFAULT_SEED;

#[test]
fn test_init_defaults() {
let sketch = CountMinSketch::new(3, 5);
assert_eq!(sketch.num_hashes(), 3);
assert_eq!(sketch.num_buckets(), 5);
assert_eq!(sketch.seed(), DEFAULT_SEED);
assert_eq!(sketch.seed(), 9001);
assert!(sketch.is_empty());
assert_eq!(sketch.total_weight(), 0);
assert_eq!(sketch.estimate("missing"), 0);
Expand All @@ -41,7 +40,7 @@ fn test_parameter_suggestions() {
assert_eq!(CountMinSketch::suggest_num_hashes(0.997300204), 6);

let buckets = CountMinSketch::suggest_num_buckets(0.1);
let sketch = CountMinSketch::with_seed(3, buckets, DEFAULT_SEED);
let sketch = CountMinSketch::new(3, buckets);
assert!(sketch.relative_error() <= 0.1);
}

Expand Down Expand Up @@ -71,8 +70,8 @@ fn test_negative_weights() {

#[test]
fn test_merge() {
let mut left = CountMinSketch::with_seed(3, 64, DEFAULT_SEED);
let mut right = CountMinSketch::with_seed(3, 64, DEFAULT_SEED);
let mut left = CountMinSketch::new(3, 64);
let mut right = CountMinSketch::new(3, 64);
for _ in 0..10 {
left.update("a");
}
Expand Down Expand Up @@ -124,14 +123,14 @@ fn test_invalid_buckets() {
#[test]
#[should_panic(expected = "Incompatible sketch configuration.")]
fn test_merge_incompatible() {
let mut left = CountMinSketch::with_seed(3, 64, DEFAULT_SEED);
let right = CountMinSketch::with_seed(2, 64, DEFAULT_SEED);
let mut left = CountMinSketch::new(3, 64);
let right = CountMinSketch::new(2, 64);
left.merge(&right);
}

#[test]
fn test_increment_single_key_like_rust_count_min_sketch() {
let mut sketch = CountMinSketch::with_seed(4, 32, DEFAULT_SEED);
let mut sketch = CountMinSketch::new(4, 32);
for _ in 0..300 {
sketch.update("key");
}
Expand All @@ -140,7 +139,7 @@ fn test_increment_single_key_like_rust_count_min_sketch() {

#[test]
fn test_increment_multi_like_rust_count_min_sketch() {
let mut sketch = CountMinSketch::with_seed(6, 128, DEFAULT_SEED);
let mut sketch = CountMinSketch::new(6, 128);
for i in 0..1_000_000u64 {
sketch.update(i % 100);
}
Expand Down