mirror of
https://github.com/trekhleb/javascript-algorithms.git
synced 2025-07-07 01:44:52 +08:00
Add bloom filter (#84)
This commit is contained in:

committed by
Oleksii Trekhleb

parent
b33f1d52dc
commit
41a6430532
@ -38,6 +38,7 @@ the data.
|
|||||||
* `A` [Fenwick Tree](src/data-structures/tree/fenwick-tree) (Binary Indexed Tree)
|
* `A` [Fenwick Tree](src/data-structures/tree/fenwick-tree) (Binary Indexed Tree)
|
||||||
* `A` [Graph](src/data-structures/graph) (both directed and undirected)
|
* `A` [Graph](src/data-structures/graph) (both directed and undirected)
|
||||||
* `A` [Disjoint Set](src/data-structures/disjoint-set)
|
* `A` [Disjoint Set](src/data-structures/disjoint-set)
|
||||||
|
* `A` [Bloom Filter](src/data-structures/bloom-filter)
|
||||||
|
|
||||||
## Algorithms
|
## Algorithms
|
||||||
|
|
||||||
@ -231,6 +232,7 @@ Below is the list of some of the most used Big O notations and their performance
|
|||||||
| **B-Tree** | log(n) | log(n) | log(n) | log(n) | |
|
| **B-Tree** | log(n) | log(n) | log(n) | log(n) | |
|
||||||
| **Red-Black Tree** | log(n) | log(n) | log(n) | log(n) | |
|
| **Red-Black Tree** | log(n) | log(n) | log(n) | log(n) | |
|
||||||
| **AVL Tree** | log(n) | log(n) | log(n) | log(n) | |
|
| **AVL Tree** | log(n) | log(n) | log(n) | log(n) | |
|
||||||
|
| **Bloom Filter** | | 1 | 1 | | |
|
||||||
|
|
||||||
### Array Sorting Algorithms Complexity
|
### Array Sorting Algorithms Complexity
|
||||||
|
|
||||||
|
127
src/data-structures/bloom-filter/BloomFilter.js
Normal file
127
src/data-structures/bloom-filter/BloomFilter.js
Normal file
@ -0,0 +1,127 @@
|
|||||||
|
/**
 * Probabilistic set-membership structure.
 *
 * Items can be inserted and later queried. A query answers either
 * "definitely not inserted" (false) or "possibly inserted" (true).
 * Items cannot be removed.
 */
export default class BloomFilter {
  /**
   * @param {number} size - the number of slots in the filter's store.
   */
  constructor(size = 100) {
    // Bloom filter size directly affects the likelihood of false positives.
    // The bigger the size the lower the likelihood of false positives.
    this.size = size;
    this.storage = this.createStore(size);
  }

  /**
   * Marks every slot that the item hashes to.
   *
   * @param {string} item
   */
  insert(item) {
    const hashValues = this.getHashValues(item);

    // Set each hashValue index to true.
    hashValues.forEach(val => this.storage.setValue(val));
  }

  /**
   * Checks whether the item could have been inserted.
   *
   * @param {string} item
   * @return {boolean} - false when the item was definitely not inserted,
   *   true when it may have been.
   */
  mayContain(item) {
    const hashValues = this.getHashValues(item);

    for (let hashIndex = 0; hashIndex < hashValues.length; hashIndex += 1) {
      if (!this.storage.getValue(hashValues[hashIndex])) {
        // We know that the item was definitely not inserted.
        return false;
      }
    }

    // The item may or may not have been inserted.
    return true;
  }

  /**
   * Creates the data store for our filter.
   * We use this method to generate the store in order to
   * encapsulate the data itself and only provide access
   * to the necessary methods.
   *
   * @param {number} size
   * @return {Object} - an object exposing getValue(index) and setValue(index).
   */
  createStore(size) {
    const storage = [];

    // Initialize all indexes to false.
    for (let storageCellIndex = 0; storageCellIndex < size; storageCellIndex += 1) {
      storage.push(false);
    }

    const storageInterface = {
      getValue(index) {
        return storage[index];
      },
      setValue(index) {
        storage[index] = true;
      },
    };

    return storageInterface;
  }

  /**
   * @param {string} item
   * @return {number} - a non-negative index below the filter size.
   */
  hash1(item) {
    let hash = 0;

    for (let charIndex = 0; charIndex < item.length; charIndex += 1) {
      const char = item.charCodeAt(charIndex);
      hash = (hash << 5) + hash + char;
      hash &= hash; // Convert to 32bit integer
      // Keep the running value non-negative so the final remainder is too.
      hash = Math.abs(hash);
    }

    return hash % this.size;
  }

  /**
   * @param {string} item
   * @return {number} - a non-negative index below the filter size.
   */
  hash2(item) {
    let hash = 5381;

    for (let charIndex = 0; charIndex < item.length; charIndex += 1) {
      const char = item.charCodeAt(charIndex);
      hash = (hash << 5) + hash + char; /* hash * 33 + c */
    }

    // The shift works on signed 32-bit integers, so the running value (and
    // therefore the remainder) may be negative. Normalize it here so that
    // the returned value is always usable as a store index.
    return Math.abs(hash % this.size);
  }

  /**
   * @param {string} item
   * @return {number} - a non-negative index below the filter size.
   */
  hash3(item) {
    let hash = 0;

    for (let charIndex = 0; charIndex < item.length; charIndex += 1) {
      const char = item.charCodeAt(charIndex);
      hash = (hash << 5) - hash;
      hash += char;
      hash &= hash; // Convert to 32bit integer
    }

    // The 32-bit conversion above may leave a negative value, so normalize
    // the remainder to keep the result usable as a store index.
    return Math.abs(hash % this.size);
  }

  /**
   * Runs all 3 hash functions on the input and returns an array of results.
   *
   * @param {string} item
   * @return {number[]}
   */
  getHashValues(item) {
    return [
      this.hash1(item),
      this.hash2(item),
      this.hash3(item),
    ];
  }
}
|
104
src/data-structures/bloom-filter/README.md
Normal file
104
src/data-structures/bloom-filter/README.md
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
# Bloom Filter
|
||||||
|
|
||||||
|
A bloom filter is a data structure designed to
|
||||||
|
test whether an element is present in a set. It
|
||||||
|
is designed to be blazingly fast and use minimal
|
||||||
|
memory at the cost of potential false positives.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
## Operations
|
||||||
|
|
||||||
|
There are two main operations a bloom filter can
|
||||||
|
perform: insertion and search. Search may result in
|
||||||
|
false positives. Deletion is not possible.
|
||||||
|
|
||||||
|
In other words, the filter can take in items. When
|
||||||
|
we go to check if an item has previously been
|
||||||
|
inserted, it can tell us either "no" or "maybe".
|
||||||
|
|
||||||
|
Both insertion and search are O(1) operations.
|
||||||
|
|
||||||
|
## Making the filter
|
||||||
|
|
||||||
|
A bloom filter is created by allotting a certain size.
|
||||||
|
In our example, we use 100 as a default length. All
|
||||||
|
locations are initialized to `false`.
|
||||||
|
|
||||||
|
### Insertion
|
||||||
|
|
||||||
|
During insertion, a number of hash functions,
|
||||||
|
in our case 3 hash functions, are used to create
|
||||||
|
hashes of the input. These hash functions output
|
||||||
|
indexes. At every index received, we simply change
|
||||||
|
the value in our bloom filter to `true`.
|
||||||
|
|
||||||
|
### Search
|
||||||
|
|
||||||
|
During a search, the same hash functions are called
|
||||||
|
and used to hash the input. We then check if the
|
||||||
|
indexes received _all_ have a value of `true` inside
|
||||||
|
our bloom filter. If they _all_ have a value of
|
||||||
|
`true`, we know that the bloom filter may have had
|
||||||
|
the value previously inserted.
|
||||||
|
|
||||||
|
However, it's not certain, because it's possible
|
||||||
|
that other values previously inserted flipped the
|
||||||
|
values to `true`. The values aren't necessarily
|
||||||
|
`true` due to the item currently being searched for.
|
||||||
|
Absolute certainty is impossible unless only a single
|
||||||
|
item has previously been inserted.
|
||||||
|
|
||||||
|
While checking the bloom filter for the indexes
|
||||||
|
returned by our hash functions, if even one of them
|
||||||
|
has a value of `false`, we definitively know that the
|
||||||
|
item was not previously inserted.
|
||||||
|
|
||||||
|
## False Positives
|
||||||
|
|
||||||
|
The probability of false positives is determined by
|
||||||
|
three factors: the size of the bloom filter, the
|
||||||
|
number of hash functions we use, and the number
|
||||||
|
of items that have been inserted into the filter.
|
||||||
|
|
||||||
|
The formula to calculate the probability of a false positive is:
|
||||||
|
|
||||||
|
( 1 - e <sup>-kn/m</sup> ) <sup>k</sup>
|
||||||
|
|
||||||
|
k = # hash functions
|
||||||
|
|
||||||
|
m = size
|
||||||
|
|
||||||
|
n = # items inserted
|
||||||
|
|
||||||
|
These variables, k, m, and n, should be picked based
|
||||||
|
on how acceptable false positives are. If the values
|
||||||
|
are picked and the resulting probability is too high,
|
||||||
|
the values should be tweaked and the probability
|
||||||
|
re-calculated.
|
||||||
|
|
||||||
|
## Applications
|
||||||
|
|
||||||
|
A bloom filter can be used on a blogging website. If
|
||||||
|
the goal is to show readers only articles that they
|
||||||
|
have never seen before, a bloom filter is perfect.
|
||||||
|
It can store hashed values based on the articles. After
|
||||||
|
a user reads a few articles, they can be inserted into
|
||||||
|
the filter. The next time the user visits the site,
|
||||||
|
those articles can be filtered out of the results.
|
||||||
|
|
||||||
|
Some articles will inevitably be filtered out by mistake,
|
||||||
|
but the cost is acceptable. It's ok if a user never sees
|
||||||
|
a few articles as long as they have other, brand new ones
|
||||||
|
to see every time they visit the site.
|
||||||
|
|
||||||
|
The popular blog site Medium does a version of this.
|
||||||
|
Feel free to read [their article](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff).
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [Wikipedia](https://en.wikipedia.org/wiki/Bloom_filter)
|
||||||
|
- [Tutorial](http://llimllib.github.io/bloomfilter-tutorial/)
|
||||||
|
- [Calculating false positive probability](https://hur.st/bloomfilter/?n=4&p=&m=18&k=3)
|
||||||
|
- [Medium blog](https://blog.medium.com/what-are-bloom-filters-1ec2a50c68ff)
|
||||||
|
- [YouTube](https://www.youtube.com/watch?v=bEmBh1HtYrw)
|
@ -0,0 +1,39 @@
|
|||||||
|
import BloomFilter from '../BloomFilter';
|
||||||
|
|
||||||
|
// Unit tests for the public surface of BloomFilter: method presence,
// store creation, hashing, and the insert / mayContain round trip.
describe('Bloom Filter', () => {
  let bloomFilter;
  const people = ['Bruce Wayne', 'Clark Kent', 'Barry Allen'];

  // Each test starts from a fresh, empty filter with the default size.
  beforeEach(() => {
    bloomFilter = new BloomFilter();
  });

  it('Should have methods named "insert" and "mayContain"', () => {
    expect(typeof bloomFilter.insert).toBe('function');
    expect(typeof bloomFilter.mayContain).toBe('function');
  });

  it('Should create a new filter store with the appropriate methods', () => {
    const store = bloomFilter.createStore(18);
    expect(typeof store.getValue).toBe('function');
    expect(typeof store.setValue).toBe('function');
  });

  it('Should hash deterministically with all 3 hash functions', () => {
    const str = 'abc';
    expect(bloomFilter.hash1(str)).toEqual(bloomFilter.hash1(str));
    expect(bloomFilter.hash2(str)).toEqual(bloomFilter.hash2(str));
    expect(bloomFilter.hash3(str)).toEqual(bloomFilter.hash3(str));
  });

  it('Should create an array with 3 hash values', () => {
    expect(bloomFilter.getHashValues('abc').length).toEqual(3);
  });

  it('Should only produce hash values that are valid store indexes', () => {
    people.forEach((person) => {
      bloomFilter.getHashValues(person).forEach((hashValue) => {
        expect(hashValue).toBeGreaterThanOrEqual(0);
        expect(hashValue).toBeLessThan(bloomFilter.size);
      });
    });
  });

  it('Should report that nothing is contained before any insertion', () => {
    // Every slot starts as false, so the answer must be a definite "no".
    people.forEach((person) => {
      expect(bloomFilter.mayContain(person)).toBe(false);
    });
  });

  it('Should insert strings correctly and return true when checking for inserted values', () => {
    people.forEach(person => bloomFilter.insert(person));
    expect(bloomFilter.mayContain('Bruce Wayne')).toBe(true);
    expect(bloomFilter.mayContain('Clark Kent')).toBe(true);
    expect(bloomFilter.mayContain('Barry Allen')).toBe(true);
  });
});
|
@ -0,0 +1,86 @@
|
|||||||
|
import BloomFilter from '../BloomFilter';
|
||||||
|
|
||||||
|
// Adapted from http://stackoverflow.com/questions/1349404/generate-random-string-characters-in-javascript
/**
 * Builds a random identifier made of ASCII letters.
 *
 * @param {number} [length=10] - number of characters to generate.
 * @return {string}
 */
function makeID(length = 10) {
  const possible = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
  let id = '';

  for (let charIndex = 0; charIndex < length; charIndex += 1) {
    // Math.random() is in [0, 1), so the floored index is always valid.
    const randomIndex = Math.floor(Math.random() * possible.length);
    id += possible.charAt(randomIndex);
  }

  return id;
}
|
||||||
|
|
||||||
|
/**
 * Fills a fresh default-sized filter with random IDs, then probes it with
 * further random IDs and counts how many probes are false positives.
 *
 * Note that, despite the function name, the default number of probes is 1000.
 *
 * @param {number} [numRandomTests=1000] - number of random probes to run.
 * @param {number} [numInsertedIDs=10] - number of IDs inserted before probing.
 * @return {number} - the number of probes reported as possibly present
 *   although they were never inserted.
 */
function run10kTrials(numRandomTests = 1000, numInsertedIDs = 10) {
  const bloomFilter = new BloomFilter();
  const insertedIDs = new Set();

  for (let i = 0; i < numInsertedIDs; i += 1) {
    const id = makeID();
    insertedIDs.add(id);
    bloomFilter.insert(id);
  }

  let numFalsePositives = 0;

  for (let index = 0; index < numRandomTests; index += 1) {
    const randomID = makeID();

    // A probe that happens to equal an inserted ID is a true positive,
    // so it must not be counted as a false one.
    if (!insertedIDs.has(randomID) && bloomFilter.mayContain(randomID)) {
      numFalsePositives += 1;
    }
  }

  return numFalsePositives;
}
|
||||||
|
|
||||||
|
/**
 * Repeats the false-positive experiment several times and averages it.
 *
 * @param {number} [numTrials=100] - how many independent experiments to run.
 * @return {number} - mean number of false positives per experiment.
 */
function testFilter(numTrials = 100) {
  let totalFalsePositives = 0;
  let remainingTrials = numTrials;

  // Each experiment builds its own filter, so the runs are independent.
  while (remainingTrials > 0) {
    totalFalsePositives += run10kTrials();
    remainingTrials -= 1;
  }

  return totalFalsePositives / numTrials;
}
|
||||||
|
|
||||||
|
// Statistical check: the measured false-positive count should stay close to
// the value predicted by (1 - e^(-kn/m))^k for k = 3, m = 100, n = 10.
describe('Bloom filter false positives', () => {
  const falsePositiveProbability = 0.0174;
  const expectedFalsePositives = falsePositiveProbability * 1000;
  // Computed here rather than inside the test because the test title
  // below interpolates the measured average.
  const avgFalsePositives = testFilter();

  it(`Should keep false positives close to an expected value:

    # trials = 1000
    k = 3 (hash functions)
    m = 100 (size)
    n = 10 (items inserted)

    Using k, m, and n, plugged into https://hur.st/bloomfilter/?n=10&p=&m=100&k=3
    Chance of false positive = 0.017

    Expected false positives = # trials * chance of false positive
    Expected false positives => 1000 * ${falsePositiveProbability}
    Expected false positives => ${expectedFalsePositives}

    **************************
    EXPECTED = ${expectedFalsePositives}
    ACTUAL AVG = ${avgFalsePositives}
    **************************

    If the expected and actual numbers are far off, something is wrong.
    Inspect manually.`, () => {
    // We give it a large range to avoid unnecessary failures.
    // If it's working correctly, the value should definitely
    // fall within this range.

    // In over 1,000 test runs, none of them ever come close
    // to falling outside of this range.
    const upperLimit = expectedFalsePositives + 5;
    const lowerLimit = expectedFalsePositives - 5;
    expect(avgFalsePositives).toBeGreaterThan(lowerLimit);
    expect(avgFalsePositives).toBeLessThan(upperLimit);
  });
});
|
Reference in New Issue
Block a user