1
0
mirror of https://github.com/ipfs/kubo.git synced 2025-07-02 03:28:25 +08:00

Make bloom filters simpler

These did not work before, and had some unnecessary complexity.

Now the filters use only one hashing function and no bignum arithmetic, and get the additional bit positions by repeatedly hashing the result of the prior hash.

Since we're not concerned about crypto hashing here, this should be a win.

External interfaces unchanged.
This commit is contained in:
Kristoffer Ström
2015-04-06 16:45:23 +02:00
parent 07d9cdbd69
commit 3d8e96a22e
10 changed files with 355 additions and 46 deletions

12
Godeps/Godeps.json generated
View File

@ -213,6 +213,10 @@
"ImportPath": "github.com/mitchellh/go-homedir",
"Rev": "7d2d8c8a4e078ce3c58736ab521a40b37a504c52"
},
{
"ImportPath": "github.com/mtchavez/jenkins",
"Rev": "5a816af6ef21ef401bff5e4b7dd255d63400f497"
},
{
"ImportPath": "github.com/syndtr/goleveldb/leveldb",
"Rev": "87e4e645d80ae9c537e8f2dee52b28036a5dd75e"
@ -221,6 +225,10 @@
"ImportPath": "github.com/syndtr/gosnappy/snappy",
"Rev": "156a073208e131d7d2e212cb749feae7c339e846"
},
{
"ImportPath": "github.com/whyrusleeping/go-metrics",
"Rev": "1cd8009604ec2238b5a71305a0ecd974066e0e16"
},
{
"ImportPath": "golang.org/x/crypto/blowfish",
"Rev": "b7d6bf2c61544745a02f83dec90393985fc3a065"
@ -233,10 +241,6 @@
"ImportPath": "golang.org/x/net/context",
"Rev": "7dbad50ab5b31073856416cdcfeb2796d682f844"
},
{
"ImportPath": "github.com/whyrusleeping/go-metrics",
"Rev": "1cd8009604ec2238b5a71305a0ecd974066e0e16"
},
{
"ImportPath": "gopkg.in/fsnotify.v1",
"Comment": "v1.2.0",

View File

@ -0,0 +1,23 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test

View File

@ -0,0 +1,8 @@
go:
- 1.1
- tip
install:
- go get github.com/onsi/ginkgo
- go get github.com/onsi/gomega
before_script: go test -i ./...
script: go test ./...

View File

@ -0,0 +1,11 @@
build:
go build jenkins.go
run:
go run jenkins.go
test:
go test -cover
default:
go run jenkins.go

View File

@ -0,0 +1,45 @@
Jenkins
=================
Golang Jenkins hash
[![Build Status](https://travis-ci.org/mtchavez/go-jenkins-hashes.png?branch=master)](https://travis-ci.org/mtchavez/go-jenkins-hashes)
## Install
`go get -u github.com/mtchavez/jenkins`
## Usage
Jenkins follows the [Hash32](http://golang.org/pkg/hash/#Hash32) interface from the Go standard library
```go
// Create a new hash
jenkhash := New()
// Write a string of bytes to hash
key := []byte("my-random-key")
length, err := jenkhash.Write(key)
// Get uint32 sum of hash
sum := jenkhash.Sum32()
// Append the 4 hash bytes (most significant byte first) to a byte slice
sumbytes := jenkhash.Sum(key)
```
## Testing
Uses [Ginkgo](http://onsi.github.io/ginkgo/) for testing.
Run via `make test` which will run `go test -cover`
## Documentation
Docs on [godoc](http://godoc.org/github.com/mtchavez/jenkins)
## License
Written by Chavez
Released under the MIT License: http://www.opensource.org/licenses/mit-license.php

View File

@ -0,0 +1,48 @@
package jenkins
import "hash"
// jenkhash is the running, un-finalized state of Bob Jenkins'
// one-at-a-time hash. It implements hash.Hash32.
//
// Only the per-byte mixing is stored here; the final avalanche steps are
// applied on demand by Sum32. Keeping the two apart is what lets input be
// fed in several Write calls and still produce the same digest as a single
// Write of the concatenated bytes.
type jenkhash uint32

// New returns a hash.Hash32 computing the Jenkins one-at-a-time hash,
// starting from an all-zero state.
func New() hash.Hash32 {
	var j jenkhash
	return &j
}

// Write mixes every byte of key into the running state. It always consumes
// the whole slice and never fails, so it returns len(key) and a nil error.
func (j *jenkhash) Write(key []byte) (int, error) {
	state := *j
	for _, b := range key {
		state += jenkhash(b)
		state += state << 10
		state ^= state >> 6
	}
	// Deliberately no finalization here: finalizing inside Write would make
	// the digest depend on how the input happened to be chunked.
	*j = state
	return len(key), nil
}

// Reset returns the hash to its initial all-zero state.
func (j *jenkhash) Reset() {
	*j = 0
}

// Size returns the number of bytes Sum appends: 4.
func (j *jenkhash) Size() int {
	return 4
}

// BlockSize returns 1; the hash consumes its input one byte at a time.
func (j *jenkhash) BlockSize() int {
	return 1
}

// Sum32 returns the digest of everything written so far. It applies the
// final avalanche steps to a copy of the state, so the running state is not
// modified and more data may be written afterwards.
func (j *jenkhash) Sum32() uint32 {
	sum := uint32(*j)
	sum += sum << 3
	sum ^= sum >> 11
	sum += sum << 15
	return sum
}

// Sum appends the current 32-bit digest to in, most significant byte first,
// and returns the resulting slice. The running state is left untouched.
func (j *jenkhash) Sum(in []byte) []byte {
	v := j.Sum32()
	return append(in, byte(v>>24), byte(v>>16), byte(v>>8), byte(v))
}

View File

@ -0,0 +1,13 @@
package jenkins
import (
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
"testing"
)
// TestJenkins is the `go test` entry point for this package's Ginkgo specs.
// It registers Ginkgo's Fail as the Gomega failure handler and then runs the
// specs under the suite name "Jenkins Suite".
func TestJenkins(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Jenkins Suite")
}

View File

@ -0,0 +1,101 @@
package jenkins
import (
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
"hash"
)
// Specs for the jenkhash implementation returned by New. Each spec starts
// from a fresh hash and the key "Apple", set up in BeforeEach.
var _ = Describe("Jenkins", func() {
var jhash hash.Hash32
var key []byte
BeforeEach(func() {
jhash = New()
key = []byte("Apple")
})
// New must hand back a *jenkhash whose digest is 0 before any input.
Describe("New", func() {
It("returns jenkhash", func() {
var h *jenkhash
Expect(jhash).To(BeAssignableToTypeOf(h))
})
It("initializes offset to 0", func() {
Expect(jhash.Sum32()).To(Equal(uint32(0)))
})
})
// Write reports the full key length (5 bytes for "Apple") and no error.
Describe("Write", func() {
It("returns key length", func() {
length, _ := jhash.Write(key)
Expect(length).To(Equal(5))
})
It("has no error", func() {
_, err := jhash.Write(key)
Expect(err).To(BeNil())
})
})
// Reset must bring a hash that has seen input back to the zero digest.
Describe("Reset", func() {
It("sets back to 0", func() {
Expect(jhash.Sum32()).To(Equal(uint32(0)))
jhash.Write(key)
Expect(jhash.Sum32()).NotTo(Equal(uint32(0)))
jhash.Reset()
Expect(jhash.Sum32()).To(Equal(uint32(0)))
})
})
Describe("Size", func() {
It("is 4", func() {
Expect(jhash.Size()).To(Equal(4))
})
})
Describe("BlockSize", func() {
It("is 1", func() {
Expect(jhash.BlockSize()).To(Equal(1))
})
})
Describe("Sum32", func() {
It("defaults to 0", func() {
Expect(jhash.Sum32()).To(Equal(uint32(0)))
})
// 884782484 (0x34bcb594) is the expected digest of "Apple".
It("sums hash", func() {
jhash.Write(key)
Expect(jhash.Sum32()).To(Equal(uint32(884782484)))
})
})
// Sum appends the 4 digest bytes to its argument, so the expected slices
// start with the bytes of "Apple" (0x41 0x70 0x70 0x6c 0x65).
Describe("Sum", func() {
It("default 0 hash byte returned", func() {
// Nothing has been written, so the appended digest is all zero bytes.
expected := []byte{0x41, 0x70, 0x70, 0x6c, 0x65, 0x0, 0x0, 0x0, 0x0}
Expect(jhash.Sum(key)).To(Equal(expected))
})
It("returns sum byte array", func() {
jhash.Write(key)
// 0x34 0xbc 0xb5 0x94 is 884782484, most significant byte first.
expected := []byte{0x41, 0x70, 0x70, 0x6c, 0x65, 0x34, 0xbc, 0xb5, 0x94}
Expect(jhash.Sum(key)).To(Equal(expected))
})
})
})

View File

@ -2,13 +2,11 @@
package bloom
import (
"encoding/binary"
"errors"
"fmt"
// Non crypto hash, because speed
"github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/mtchavez/jenkins"
"hash"
"hash/adler32"
"hash/crc32"
"hash/fnv"
"math/big"
)
type Filter interface {
@ -17,61 +15,66 @@ type Filter interface {
Merge(Filter) (Filter, error)
}
func BasicFilter() Filter {
// Non crypto hashes, because speed
return NewFilter(2048, adler32.New(), fnv.New32(), crc32.NewIEEE())
}
func NewFilter(size int, hashes ...hash.Hash) Filter {
func NewFilter(size int) Filter {
return &filter{
hash: jenkins.New(),
filter: make([]byte, size),
hashes: hashes,
k: 3,
}
}
type filter struct {
filter []byte
hashes []hash.Hash
hash hash.Hash32
k int
}
func (f *filter) Add(k []byte) {
for _, h := range f.hashes {
i := bytesMod(h.Sum(k), int64(len(f.filter)*8))
f.setBit(i)
// BasicFilter returns a Filter built by NewFilter with a size of 2048, the
// default used when the caller has no particular size in mind.
func BasicFilter() Filter {
return NewFilter(2048)
}
// Add records bytes in the filter by setting every bit position that
// getBitIndicies derives for it.
func (f *filter) Add(bytes []byte) {
	positions := f.getBitIndicies(bytes)
	for n := range positions {
		f.setBit(positions[n])
	}
}
func (f *filter) Find(k []byte) bool {
for _, h := range f.hashes {
i := bytesMod(h.Sum(k), int64(len(f.filter)*8))
if !f.getBit(i) {
// getBitIndicies derives f.k bit positions for bytes.
//
// The input is written to the filter's hash once. Each position is the
// current 32-bit sum reduced modulo the number of bits in the filter; the
// sum is then fed back into the hash (little-endian) to produce the next
// one. The hash is reset before returning so the next call starts clean.
// Note that this uses the hash instance stored on the filter.
func (f *filter) getBitIndicies(bytes []byte) []uint32 {
	positions := make([]uint32, f.k)
	f.hash.Write(bytes)

	scratch := make([]byte, 4)
	for n := range positions {
		sum := f.hash.Sum32()
		nbits := uint32(len(f.filter)) * 8
		positions[n] = sum % nbits

		// Chain: the next position comes from hashing this sum as well.
		binary.LittleEndian.PutUint32(scratch, sum)
		f.hash.Write(scratch)
	}

	f.hash.Reset()
	return positions
}
// Find reports whether every bit position derived for bytes is set in the
// filter. It returns false as soon as one of those bits is found clear.
func (f *filter) Find(bytes []byte) bool {
	positions := f.getBitIndicies(bytes)
	for n := 0; n < len(positions); n++ {
		if f.getBit(positions[n]) {
			continue
		}
		return false
	}
	return true
}
func (f *filter) setBit(i int64) {
fmt.Printf("setting bit %d\n", i)
func (f *filter) setBit(i uint32) {
f.filter[i/8] |= (1 << byte(i%8))
}
func (f *filter) getBit(i int64) bool {
fmt.Printf("getting bit %d\n", i)
func (f *filter) getBit(i uint32) bool {
return f.filter[i/8]&(1<<byte(i%8)) != 0
}
func bytesMod(b []byte, modulo int64) int64 {
i := big.NewInt(0)
i = i.SetBytes(b)
bigmod := big.NewInt(int64(modulo))
result := big.NewInt(0)
result.Mod(i, bigmod)
return result.Int64()
}
func (f *filter) Merge(o Filter) (Filter, error) {
casfil, ok := o.(*filter)
if !ok {
@ -82,12 +85,15 @@ func (f *filter) Merge(o Filter) (Filter, error) {
return nil, errors.New("filter lengths must match!")
}
if casfil.k != f.k {
return nil, errors.New("filter k-values must match!")
}
nfilt := new(filter)
// this bit is sketchy, need a way of comparing hash functions
nfilt.hashes = f.hashes
nfilt.hash = f.hash
nfilt.filter = make([]byte, len(f.filter))
nfilt.k = f.k
for i, v := range f.filter {
nfilt.filter[i] = v | casfil.filter[i]
}

View File

@ -1,13 +1,19 @@
package bloom
import "testing"
import (
"encoding/binary"
"fmt"
"testing"
)
func TestFilter(t *testing.T) {
f := BasicFilter()
f := NewFilter(128)
keys := [][]byte{
[]byte("hello"),
[]byte("fish"),
[]byte("ipfsrocks"),
[]byte("i want ipfs socks"),
}
f.Add(keys[0])
@ -21,10 +27,54 @@ func TestFilter(t *testing.T) {
}
f.Add(keys[2])
f.Add(keys[3])
for _, k := range keys {
if !f.Find(k) {
t.Fatal("Couldnt find one of three keys")
}
}
if f.Find([]byte("beep boop")) {
t.Fatal("Got false positive! Super unlikely!")
}
fmt.Println(f)
}
// TestMerge checks two properties of Filter.Merge:
//   - merging filters of different sizes is rejected with an error;
//   - merging two equally sized filters yields a filter that finds every
//     key that was added to either input.
func TestMerge(t *testing.T) {
	f1 := NewFilter(128)
	f2 := NewFilter(128)

	// A filter of a different size must not be mergeable.
	fbork := NewFilter(32)
	if _, err := f1.Merge(fbork); err == nil {
		t.Fatal("Merge should fail on filters with different lengths")
	}

	// Keys are the little-endian encodings of 0..19: the first ten go into
	// f1, the next ten into f2.
	b := make([]byte, 4)
	var i uint32
	for i = 0; i < 10; i++ {
		binary.LittleEndian.PutUint32(b, i)
		f1.Add(b)
	}
	for i = 10; i < 20; i++ {
		binary.LittleEndian.PutUint32(b, i)
		f2.Add(b)
	}

	// Check the error instead of discarding it: if the merge failed, calling
	// Find on the returned filter would panic rather than fail the test.
	merged, err := f1.Merge(f2)
	if err != nil {
		t.Fatalf("Merge of equally sized filters failed: %v", err)
	}

	for i = 0; i < 20; i++ {
		binary.LittleEndian.PutUint32(b, i)
		if !merged.Find(b) {
			t.Fatal("Could not find all keys in merged filter")
		}
	}
}