Add bloom filter (#84)

2025-07-07 01:44:52 +08:00 · 2018-06-30 10:07:19 -07:00
parent b33f1d52dc
commit 41a6430532
5 changed files with 358 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -38,6 +38,7 @@ the data.
    * `A` [Fenwick Tree](src/data-structures/tree/fenwick-tree) (Binary Indexed Tree)
 * `A` [Graph](src/data-structures/graph) (both directed and undirected)
 * `A` [Disjoint Set](src/data-structures/disjoint-set)
 * `A` [Bloom Filter](src/data-structures/bloom-filter)
 ## Algorithms
@ -231,6 +232,7 @@ Below is the list of some of the most used Big O notations and their performance
 | **B-Tree**              | log(n)    | log(n)    | log(n)    | log(n)    |           |
 | **Red-Black Tree**      | log(n)    | log(n)    | log(n)    | log(n)    |           |
 | **AVL Tree**            | log(n)    | log(n)    | log(n)    | log(n)    |           |
 | **Bloom Filter**        |           | 1         | 1         |           |           |
 ### Array Sorting Algorithms Complexity
--- a/src/data-structures/bloom-filter/BloomFilter.js
+++ b/src/data-structures/bloom-filter/BloomFilter.js
@ -0,0 +1,127 @@
 /**
  * Probabilistic set-membership structure.
  *
  * Items can be inserted and later queried. A query answers either
  * "definitely not inserted" (false) or "possibly inserted" (true).
  * Items cannot be removed.
  */
 export default class BloomFilter {
  /**
   * Creates an empty filter backed by `size` boolean slots.
   *
   * @param {number} size - Number of slots in the filter.
   */
  constructor(size = 100) {
    // Bloom filter size directly affects the likelihood of false positives.
    // The bigger the size the lower the likelihood of false positives.
    this.size = size;
    this.storage = this.createStore(size);
  }
  /**
   * Marks every slot selected by the hash functions for `item`.
   *
   * @param {string} item
   */
  insert(item) {
    this.getHashValues(item).forEach(index => this.storage.setValue(index));
  }
  /**
   * Checks whether `item` could have been inserted before.
   *
   * @param {string} item
   * @return {boolean} `false` when the item was definitely not inserted,
   *   `true` when it may or may not have been inserted.
   */
  mayContain(item) {
    // A single unset slot proves that the item was never inserted.
    return this.getHashValues(item).every(index => this.storage.getValue(index));
  }
  /**
   * Creates the data store for our filter.
   * We use this method to generate the store in order to
   * encapsulate the data itself and only provide access
   * to the necessary methods.
   *
   * @param {number} size
   * @return {Object} Object exposing `getValue(index)` and `setValue(index)`.
   */
  createStore(size) {
    const storage = [];
    // Initialize all indexes to false
    for (let i = 0; i < size; i += 1) {
      storage.push(false);
    }
    return {
      getValue(index) {
        return storage[index];
      },
      setValue(index) {
        storage[index] = true;
      },
    };
  }
  /**
   * First hash function: multiply-by-33 accumulation starting from 0,
   * forced to a non-negative 32-bit value after every character.
   *
   * @param {string} str
   * @return {number} Slot index derived from `str`.
   */
  hash1(str) {
    let hash = 0;
    for (let i = 0; i < str.length; i += 1) {
      const char = str.charCodeAt(i);
      hash = (hash << 5) + hash + char;
      hash &= hash; // Convert to 32bit integer
      hash = Math.abs(hash);
    }
    return hash % this.size;
  }
  /**
   * Second hash function: multiply-by-33 accumulation starting from 5381.
   *
   * @param {string} str
   * @return {number} Non-negative slot index derived from `str`.
   */
  hash2(str) {
    let hash = 5381;
    for (let i = 0; i < str.length; i += 1) {
      const char = str.charCodeAt(i);
      hash = (hash << 5) + hash + char; /* hash * 33 + c */
    }
    // The shift wraps to a signed 32-bit value, so the sum can be negative
    // and `%` keeps the sign of the dividend. Fold it back into index range.
    return Math.abs(hash % this.size);
  }
  /**
   * Third hash function: multiply-by-31 accumulation starting from 0,
   * wrapped to a signed 32-bit integer after every character.
   *
   * @param {string} str
   * @return {number} Non-negative slot index derived from `str`.
   */
  hash3(str) {
    let hash = 0;
    for (let i = 0; i < str.length; i += 1) {
      const char = str.charCodeAt(i);
      hash = (hash << 5) - hash;
      hash += char;
      hash &= hash; // Convert to 32bit integer
    }
    // The 32-bit value may be negative; `%` would then yield a negative
    // remainder, which is not a valid slot index.
    return Math.abs(hash % this.size);
  }
  /**
   * Runs all 3 hash functions on the input and returns an array of results
   *
   * @param {string} item
   * @return {number[]} The three slot indexes for `item`.
   */
  getHashValues(item) {
    return [this.hash1(item), this.hash2(item), this.hash3(item)];
  }
 }
--- a/src/data-structures/bloom-filter/README.md
+++ b/src/data-structures/bloom-filter/README.md
@ -0,0 +1,104 @@
 # Bloom Filter
 A bloom filter is a data structure designed to
 test whether an element is present in a set. It
 is designed to be blazingly fast and use minimal
 memory at the cost of potential false positives.
 ![Bloom Filter](https://upload.wikimedia.org/wikipedia/commons/a/ac/Bloom_filter.svg)
 ## Operations
 There are two main operations a bloom filter can
 perform: insertion and search. Search may result in
 false positives. Deletion is not possible.
 In other words, the filter can take in items. When
 we go to check if an item has previously been
 inserted, it can tell us either "no" or "maybe".
 Both insertion and search are O(1) operations.
 ## Making the filter
 A bloom filter is created by allotting a certain size.
 In our example, we use 100 as a default length. All
 locations are initialized to `false`.
 ### Insertion
 During insertion, a number of hash functions,
 in our case 3 hash functions, are used to create
 hashes of the input. These hash functions output
 indexes. At every index received, we simply change
 the value in our bloom filter to `true`.
 ### Search
 During a search, the same hash functions are called
 and used to hash the input. We then check if the
 indexes received _all_ have a value of `true` inside
 our bloom filter. If they _all_ have a value of
 `true`, we know that the bloom filter may have had
 the value previously inserted.
 However, it's not certain, because it's possible
 that other values previously inserted flipped the
 values to `true`. The values aren't necessarily
 `true` due to the item currently being searched for.
 Absolute certainty is impossible unless only a single
 item has previously been inserted.
 While checking the bloom filter for the indexes
 returned by our hash functions, if even one of them
 has a value of `false`, we definitively know that the
 item was not previously inserted.
 ## False Positives
 The probability of false positives is determined by
 three factors: the size of the bloom filter, the
 number of hash functions we use, and the number
 of items that have been inserted into the filter.
 The formula to calculate the probability of a false positive is:
 ( 1 - e <sup>-kn/m</sup> ) <sup>k</sup>
 k = # hash functions
 m = size
 n = # items inserted
 These variables, k, m, and n, should be picked based
 on how acceptable false positives are. If the values
 are picked and the resulting probability is too high,
 the values should be tweaked and the probability
 re-calculated.
 ## Applications
 A bloom filter can be used on a blogging website. If
 the goal is to show readers only articles that they
 have never seen before, a bloom filter is perfect.
 It can store hashed values based on the articles. After
 a user reads a few articles, they can be inserted into
 the filter. The next time the user visits the site,
 those articles can be filtered out of the results.
 Some articles will inevitably be filtered out by mistake,
 but the cost is acceptable. It's ok if a user never sees
 a few articles as long as they have other, brand new ones
 to see every time they visit the site.
 The popular blog site Medium does a version of this.
 Feel free to read [their article](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff).
 ## References
 - [Wikipedia](https://en.wikipedia.org/wiki/Bloom_filter)
 - [Tutorial](http://llimllib.github.io/bloomfilter-tutorial/)
 - [Calculating false positive probability](https://hur.st/bloomfilter/?n=4&p=&m=18&k=3)
 - [Medium blog](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff)
 - [YouTube](https://www.youtube.com/watch?v=bEmBh1HtYrw)
--- a/src/data-structures/bloom-filter/test/BloomFilter.test.js
+++ b/src/data-structures/bloom-filter/test/BloomFilter.test.js
@ -0,0 +1,39 @@
 import BloomFilter from '../BloomFilter';
 // Unit tests covering the public surface of BloomFilter.
 describe('Bloom Filter', () => {
  const people = ['Bruce Wayne', 'Clark Kent', 'Barry Allen'];
  let bloomFilter;
  // Every test starts from a fresh, empty filter of the default size.
  beforeEach(() => {
    bloomFilter = new BloomFilter();
  });
  it('Should have methods named "insert" and "mayContain"', () => {
    ['insert', 'mayContain'].forEach((methodName) => {
      expect(typeof bloomFilter[methodName]).toBe('function');
    });
  });
  it('Should create a new filter store with the appropriate methods', () => {
    const { getValue, setValue } = bloomFilter.createStore(18);
    expect(typeof getValue).toBe('function');
    expect(typeof setValue).toBe('function');
  });
  it('Should hash deterministically with all 3 hash functions', () => {
    const input = 'abc';
    // Hashing the same input twice must give the same result.
    ['hash1', 'hash2', 'hash3'].forEach((hashName) => {
      const firstRun = bloomFilter[hashName](input);
      const secondRun = bloomFilter[hashName](input);
      expect(firstRun).toEqual(secondRun);
    });
  });
  it('Should create an array with 3 hash values', () => {
    const hashValues = bloomFilter.getHashValues('abc');
    expect(hashValues.length).toEqual(3);
  });
  it('Should insert strings correctly and return true when checking for inserted values', () => {
    for (let i = 0; i < people.length; i += 1) {
      bloomFilter.insert(people[i]);
    }
    expect(bloomFilter.mayContain('Bruce Wayne')).toBe(true);
    expect(bloomFilter.mayContain('Clark Kent')).toBe(true);
    expect(bloomFilter.mayContain('Barry Allen')).toBe(true);
  });
 });
--- a/src/data-structures/bloom-filter/test/BloomFilterFalsePositive.test.js
+++ b/src/data-structures/bloom-filter/test/BloomFilterFalsePositive.test.js
@ -0,0 +1,86 @@
 import BloomFilter from '../BloomFilter';
 // Adapted from http://stackoverflow.com/questions/1349404/generate-random-string-characters-in-javascript
 /**
  * Builds a random 10-character identifier made of ASCII letters.
  *
  * @return {string}
  */
 function makeID() {
  const alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
  const idLength = 10;
  const letters = [];
  while (letters.length < idLength) {
    // Pick one letter uniformly at random from the alphabet.
    const position = Math.floor(Math.random() * alphabet.length);
    letters.push(alphabet.charAt(position));
  }
  return letters.join('');
 }
 /**
  * Fills a fresh default-size filter with 10 random IDs, then probes it
  * with `numRandomTests` further random IDs.
  *
  * @param {number} numRandomTests - How many random probes to run.
  * @return {number} Number of probes for which the filter answered "maybe".
  */
 function run10kTrials(numRandomTests = 1000) {
  const filter = new BloomFilter();
  // Generate all seed IDs first, then insert them.
  const seedIDs = Array.from({ length: 10 }, () => makeID());
  for (let i = 0; i < seedIDs.length; i += 1) {
    filter.insert(seedIDs[i]);
  }
  let positives = 0;
  let remaining = numRandomTests;
  while (remaining > 0) {
    const probe = makeID();
    positives += filter.mayContain(probe) ? 1 : 0;
    remaining -= 1;
  }
  return positives;
 }
 /**
  * Repeats the random-probe experiment `numTrials` times.
  *
  * @param {number} numTrials - How many independent experiments to run.
  * @return {number} Mean number of positives per experiment.
  */
 function testFilter(numTrials = 100) {
  let total = 0;
  for (let trial = 0; trial < numTrials; trial += 1) {
    total += run10kTrials();
  }
  return total / numTrials;
 }
 // Statistical check: the measured false-positive count should stay close
 // to the value predicted by (1 - e^(-kn/m))^k for k = 3, m = 100, n = 10.
 describe('Bloom filter false positives', () => {
  const falsePositiveProbability = 0.0174;
  const expectedFalsePositives = falsePositiveProbability * 1000;
  // Computed here, outside `it`, because the test title below embeds it.
  const avgFalsePositives = testFilter();
  it(`Should keep false positives close to an expected value:
  # trials = 1000
  k = 3    (hash functions)
  m = 100  (size)
  n = 10   (items inserted)
  Using k, m, and n, plugged into https://hur.st/bloomfilter/?n=10&p=&m=100&k=3
  Chance of false positive = 0.017
  Expected false positives    =  # trials * chance of false positive
  Expected false positives    => 1000 * ${falsePositiveProbability}
  Expected false positives    => ${expectedFalsePositives}
  **************************
  EXPECTED   = ${expectedFalsePositives}
  ACTUAL AVG = ${avgFalsePositives}
  **************************
  If the expected and actual numbers are far off, something is wrong.
  Inspect manually.`, () => {
    // We give it a large range to avoid unnecessary failures.
    // If it's working correctly, the value should definitely
    // fall within this range.
    // In over 1,000 test runs, none of them ever come close
    // to falling outside of this range.
    const upperLimit = expectedFalsePositives + 5;
    const lowerLimit = expectedFalsePositives - 5;
    expect(avgFalsePositives).toBeGreaterThan(lowerLimit);
    expect(avgFalsePositives).toBeLessThan(upperLimit);
  });
 });