mirror of
https://github.com/ipfs/kubo.git
synced 2025-07-02 03:28:25 +08:00
Merge pull request #1085 from ipfs/bloomfilter-hamming-distance
Add hamming distance calculation to bloom filters
This commit is contained in:
5
Godeps/Godeps.json
generated
5
Godeps/Godeps.json
generated
@ -217,6 +217,11 @@
|
||||
"ImportPath": "github.com/mtchavez/jenkins",
|
||||
"Rev": "5a816af6ef21ef401bff5e4b7dd255d63400f497"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/steakknife/hamming",
|
||||
"Comment": "0.0.10",
|
||||
"Rev": "8bad99011016569c05320e51be39c648679c5b73"
|
||||
},
|
||||
{
|
||||
"ImportPath": "github.com/syndtr/goleveldb/leveldb",
|
||||
"Rev": "87e4e645d80ae9c537e8f2dee52b28036a5dd75e"
|
||||
|
8
Godeps/_workspace/src/github.com/steakknife/hamming/MIT-LICENSE.txt
generated
vendored
Normal file
8
Godeps/_workspace/src/github.com/steakknife/hamming/MIT-LICENSE.txt
generated
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
The MIT License (MIT)
|
||||
Copyright © 2014, 2015 Barry Allard
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
44
Godeps/_workspace/src/github.com/steakknife/hamming/README.md
generated
vendored
Normal file
44
Godeps/_workspace/src/github.com/steakknife/hamming/README.md
generated
vendored
Normal file
@ -0,0 +1,44 @@
|
||||
# hamming distance calculations in Go
|
||||
|
||||
Copyright © 2014, 2015 Barry Allard
|
||||
|
||||
[MIT license](MIT-LICENSE.txt)
|
||||
|
||||
## Usage
|
||||
|
||||
```go
|
||||
import "github.com/steakknife/hamming"
|
||||
|
||||
// ...
|
||||
|
||||
// hamming distance between values
|
||||
hamming.Byte(0xFF, 0x00) // 8
|
||||
hamming.Byte(0x00, 0x00) // 0
|
||||
|
||||
// just count bits in a byte
|
||||
hamming.CountBitsByte(0xA5) // 4
|
||||
```
|
||||
|
||||
See help in the [docs](https://godoc.org/github.com/steakknife/hamming)
|
||||
|
||||
## Get
|
||||
|
||||
go get -u github.com/steakknife/hamming # master is always stable
|
||||
|
||||
## Source
|
||||
|
||||
- On the web: https://github.com/steakknife/hamming
|
||||
|
||||
- Git: `git clone https://github.com/steakknife/hamming`
|
||||
|
||||
## Contact
|
||||
|
||||
- [Feedback](mailto:barry.allard@gmail.com)
|
||||
|
||||
- [Issues](https://github.com/steakknife/hamming/issues)
|
||||
|
||||
## License
|
||||
|
||||
[MIT license](MIT-LICENSE.txt)
|
||||
|
||||
Copyright © 2014, 2015 Barry Allard
|
97
Godeps/_workspace/src/github.com/steakknife/hamming/hamming.go
generated
vendored
Normal file
97
Godeps/_workspace/src/github.com/steakknife/hamming/hamming.go
generated
vendored
Normal file
@ -0,0 +1,97 @@
|
||||
//
|
||||
// hamming distance calculations in Go
|
||||
//
|
||||
// https://github.com/steakknife/hamming
|
||||
//
|
||||
// Copyright © 2014, 2015 Barry Allard
|
||||
//
|
||||
// MIT license
|
||||
//
|
||||
//
|
||||
// Usage
|
||||
//
|
||||
// The functions are named (CountBits)?(Byte|Uint64)s?. The plural forms are for slices. The CountBits.+ forms are Population Count only, where the bare-type forms are Hamming distance.
|
||||
//
|
||||
// import "github.com/steakknife/hamming"
|
||||
//
|
||||
// // ...
|
||||
//
|
||||
// // hamming distance between values
|
||||
// hamming.Byte(0xFF, 0x00) // 8
|
||||
// hamming.Byte(0x00, 0x00) // 0
|
||||
//
|
||||
// // just count bits in a byte
|
||||
// hamming.CountBitsByte(0xA5) // 4
|
||||
//
|
||||
package hamming
|
||||
|
||||
// SSE4.x PopCnt is 10x slower
|
||||
// References: check out Hacker's Delight
|
||||
|
||||
// Bit masks and multipliers for word-at-a-time population counting.
// CountBitsUint64 uses m1, m2, m4 and h01; the remaining values are not
// referenced by the code visible alongside this declaration.
const (
	// m1 selects the low bit of every 2-bit group (0101...).
	m1 uint64 = 0x5555555555555555
	// m2 selects the low two bits of every 4-bit group (00110011...).
	m2 uint64 = 0x3333333333333333
	// m4 selects the low nibble of every byte (4 zeros, 4 ones, ...).
	m4 uint64 = 0x0f0f0f0f0f0f0f0f
	// m8 selects the low byte of every 16-bit group (8 zeros, 8 ones, ...).
	m8 uint64 = 0x00ff00ff00ff00ff
	// m16 selects the low half of every 32-bit group (16 zeros, 16 ones, ...).
	m16 uint64 = 0x0000ffff0000ffff
	// m32 selects the low 32 bits (32 zeros, 32 ones).
	m32 uint64 = 0x00000000ffffffff
	// hff has every bit set.
	hff uint64 = 0xffffffffffffffff
	// h01 has the lowest bit of every byte set: the sum of 256^k for k = 0..7.
	h01 uint64 = 0x0101010101010101
)
|
||||
|
||||
// table maps every byte value to the number of bits set in it.
// Each row below covers 16 consecutive byte values.
var table = [256]byte{
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, // 0x00-0x0f
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, // 0x10-0x1f
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, // 0x20-0x2f
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, // 0x30-0x3f
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, // 0x40-0x4f
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, // 0x50-0x5f
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, // 0x60-0x6f
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, // 0x70-0x7f
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, // 0x80-0x8f
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, // 0x90-0x9f
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, // 0xa0-0xaf
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, // 0xb0-0xbf
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, // 0xc0-0xcf
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, // 0xd0-0xdf
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, // 0xe0-0xef
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, // 0xf0-0xff
}
|
||||
|
||||
// hamming distance of two uint64's
|
||||
func Uint64(x, y uint64) int {
|
||||
return CountBitsUint64(x ^ y)
|
||||
}
|
||||
|
||||
// hamming distance of two uint64 buffers, of which the size of the first argument is used for both (panics if b1 is smaller than b0, does not compare b1 beyond length of b0)
|
||||
func Uint64s(b0, b1 []uint64) int {
|
||||
d := 0
|
||||
for i, x := range b0 {
|
||||
d += Uint64(x, b1[i])
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// hamming distance of two bytes
|
||||
func Byte(x, y byte) int {
|
||||
return CountBitsByte(x ^ y)
|
||||
}
|
||||
|
||||
// hamming distance of two byte buffers, of which the size of the first argument is used for both (panics if b1 is smaller than b0, does not compare b1 beyond length of b0)
|
||||
func Bytes(b0, b1 []byte) int {
|
||||
d := 0
|
||||
for i, x := range b0 {
|
||||
d += Byte(x, b1[i])
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
func CountBitsUint64(x uint64) int {
|
||||
x -= (x >> 1) & m1 // put count of each 2 bits into those 2 bits
|
||||
x = (x & m2) + ((x >> 2) & m2) // put count of each 4 bits into those 4 bits
|
||||
x = (x + (x >> 4)) & m4 // put count of each 8 bits into those 8 bits
|
||||
return int((x * h01) >> 56) // returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ...
|
||||
}
|
||||
|
||||
func CountBitsUint64s(b []uint64) int {
|
||||
c := 0
|
||||
for _, x := range b {
|
||||
c += CountBitsUint64(x)
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
func CountBitsByte(x byte) int {
|
||||
return int(table[x])
|
||||
}
|
||||
|
||||
func CountBitsBytes(b []byte) int {
|
||||
c := 0
|
||||
for _, x := range b {
|
||||
c += CountBitsByte(x)
|
||||
}
|
||||
return c
|
||||
}
|
143
Godeps/_workspace/src/github.com/steakknife/hamming/hamming_test.go
generated
vendored
Normal file
143
Godeps/_workspace/src/github.com/steakknife/hamming/hamming_test.go
generated
vendored
Normal file
@ -0,0 +1,143 @@
|
||||
//
|
||||
// hamming distance calculations in Go
|
||||
//
|
||||
// https://github.com/steakknife/hamming
|
||||
//
|
||||
// Copyright © 2014, 2015 Barry Allard
|
||||
//
|
||||
// MIT license
|
||||
//
|
||||
package hamming
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// Table-driven test case shapes. Field order matters: the case tables may be
// written as positional composite literals.
type (
	// testCountBitsUint64Case pairs a uint64 input with its expected bit count.
	testCountBitsUint64Case struct {
		x uint64
		n int
	}

	// testCountBitsByteCase pairs a byte input with its expected bit count.
	testCountBitsByteCase struct {
		x byte
		n int
	}

	// testBytesCase pairs two byte buffers with their expected Hamming distance.
	testBytesCase struct {
		b0 []byte
		b1 []byte
		n  int
	}

	// testUint64sCase pairs two uint64 buffers with their expected Hamming distance.
	testUint64sCase struct {
		b0 []uint64
		b1 []uint64
		n  int
	}
)
|
||||
|
||||
var testCountBitsByteCases = []testCountBitsByteCase{
|
||||
{0x00, 0},
|
||||
{0x01, 1},
|
||||
{0x02, 1},
|
||||
{0x03, 2},
|
||||
{0xaa, 4},
|
||||
{0x55, 4},
|
||||
{0x7f, 7},
|
||||
{0xff, 8},
|
||||
}
|
||||
|
||||
var testCountBitsUint64Cases = []testCountBitsUint64Case{
|
||||
{0x00, 0},
|
||||
{0x01, 1},
|
||||
{0x02, 1},
|
||||
{0x03, 2},
|
||||
{0xaa, 4},
|
||||
{0x55, 4},
|
||||
{0x7f, 7},
|
||||
{0xff, 8},
|
||||
{0xffff, 16},
|
||||
{0xffffffff, 32},
|
||||
{0x1ffffffff, 33},
|
||||
{0x3ffffffff, 34},
|
||||
{0x7ffffffff, 35},
|
||||
{0xfffffffff, 36},
|
||||
{0x3fffffffffffffff, 62},
|
||||
{0x7fffffffffffffff, 63},
|
||||
{0xffffffffffffffff, 64},
|
||||
}
|
||||
|
||||
var testBytesCases = []testBytesCase{
|
||||
{[]byte{}, []byte{}, 0},
|
||||
{[]byte{1}, []byte{0}, 1},
|
||||
{[]byte{1}, []byte{2}, 2},
|
||||
{[]byte{1, 0}, []byte{0, 1}, 2},
|
||||
{[]byte{1, 0}, []byte{0, 1}, 2},
|
||||
}
|
||||
|
||||
var testUint64sCases = []testUint64sCase{
|
||||
{[]uint64{}, []uint64{}, 0},
|
||||
{[]uint64{1}, []uint64{0}, 1},
|
||||
{[]uint64{1}, []uint64{2}, 2},
|
||||
{[]uint64{1, 0}, []uint64{0, 1}, 2},
|
||||
{[]uint64{1, 0}, []uint64{0, 1}, 2},
|
||||
}
|
||||
|
||||
func TestCountBitByte(t *testing.T) {
|
||||
for _, c := range testCountBitsByteCases {
|
||||
if actualN := CountBitsByte(c.x); actualN != c.n {
|
||||
t.Fatal("CountBitsByte(", c.x, ") = ", actualN, " != ", c.n)
|
||||
} else {
|
||||
t.Log("CountBitsByte(", c.x, ") == ", c.n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBytes(t *testing.T) {
|
||||
for _, c := range testBytesCases {
|
||||
if actualN := Bytes(c.b0, c.b1); actualN != c.n {
|
||||
t.Fatal("Bytes(", c.b0, ",", c.b1, ") = ", actualN, " != ", c.n)
|
||||
} else {
|
||||
t.Log("Bytes(", c.b0, ",", c.b1, ") == ", c.n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestUint64s(t *testing.T) {
|
||||
for _, c := range testUint64sCases {
|
||||
if actualN := Uint64s(c.b0, c.b1); actualN != c.n {
|
||||
t.Fatal("Uint64s(", c.b0, ",", c.b1, ") = ", actualN, " != ", c.n)
|
||||
} else {
|
||||
t.Log("Uint64s(", c.b0, ",", c.b1, ") == ", c.n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCountBitUint64(t *testing.T) {
|
||||
for _, c := range testCountBitsUint64Cases {
|
||||
if actualN := CountBitsUint64(c.x); actualN != c.n {
|
||||
t.Fatal("CountBitsUint64(", c.x, ") = ", actualN, " != ", c.n)
|
||||
} else {
|
||||
t.Log("CountBitsUint64(", c.x, ") == ", c.n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkCountBitsUint64(b *testing.B) {
|
||||
j := 0
|
||||
for i := 0; i < b.N; i++ {
|
||||
CountBitsUint64(testCountBitsUint64Cases[j].x)
|
||||
j++
|
||||
if j == len(testCountBitsUint64Cases) {
|
||||
j = 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkCountBitsByte(b *testing.B) {
|
||||
j := 0
|
||||
for i := 0; i < b.N; i++ {
|
||||
CountBitsByte(testCountBitsByteCases[j].x)
|
||||
j++
|
||||
if j == len(testCountBitsByteCases) {
|
||||
j = 0
|
||||
}
|
||||
}
|
||||
}
|
@ -6,6 +6,7 @@ import (
|
||||
"errors"
|
||||
// Non crypto hash, because speed
|
||||
"github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/mtchavez/jenkins"
|
||||
"github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/steakknife/hamming"
|
||||
"hash"
|
||||
)
|
||||
|
||||
@ -13,6 +14,7 @@ type Filter interface {
|
||||
Add([]byte)
|
||||
Find([]byte) bool
|
||||
Merge(Filter) (Filter, error)
|
||||
HammingDistance(Filter) (int, error)
|
||||
}
|
||||
|
||||
func NewFilter(size int) Filter {
|
||||
@ -100,3 +102,23 @@ func (f *filter) Merge(o Filter) (Filter, error) {
|
||||
|
||||
return nfilt, nil
|
||||
}
|
||||
|
||||
func (f *filter) HammingDistance(o Filter) (int, error) {
|
||||
casfil, ok := o.(*filter)
|
||||
if !ok {
|
||||
return 0, errors.New("Unsupported filter type")
|
||||
}
|
||||
|
||||
if len(f.filter) != len(casfil.filter) {
|
||||
return 0, errors.New("filter lengths must match!")
|
||||
}
|
||||
|
||||
acc := 0
|
||||
|
||||
// xor together
|
||||
for i := 0; i < len(f.filter); i++ {
|
||||
acc += hamming.Byte(f.filter[i], casfil.filter[i])
|
||||
}
|
||||
|
||||
return acc, nil
|
||||
}
|
||||
|
@ -78,3 +78,17 @@ func TestMerge(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestHamming(t *testing.T) {
|
||||
f1 := NewFilter(128)
|
||||
f2 := NewFilter(128)
|
||||
|
||||
f1.Add([]byte("no collision"))
|
||||
f1.Add([]byte("collision? no!"))
|
||||
|
||||
dist, _ := f1.HammingDistance(f2)
|
||||
|
||||
if dist != 6 {
|
||||
t.Fatal("Should have 6 bit difference")
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user