Make bloom filters simpler

These did not work before and had some unnecessary complexity. Now the filters use only one hash function and no bignum arithmetic, and get the additional bit positions by repeatedly hashing the result of the prior hash. Since we're not concerned about cryptographic hashing here, this should be a win. External interfaces are unchanged.
2025-07-02 03:28:25 +08:00 · 2015-04-06 16:45:23 +02:00
parent 07d9cdbd69
commit 3d8e96a22e
10 changed files with 355 additions and 46 deletions
--- a/Godeps/Godeps.json
+++ b/Godeps/Godeps.json
@ -213,6 +213,10 @@
 			"ImportPath": "github.com/mitchellh/go-homedir",
 			"Rev": "7d2d8c8a4e078ce3c58736ab521a40b37a504c52"
 		},
+		{
+			"ImportPath": "github.com/mtchavez/jenkins",
+			"Rev": "5a816af6ef21ef401bff5e4b7dd255d63400f497"
+		},
 		{
 			"ImportPath": "github.com/syndtr/goleveldb/leveldb",
 			"Rev": "87e4e645d80ae9c537e8f2dee52b28036a5dd75e"
@ -221,6 +225,10 @@
 			"ImportPath": "github.com/syndtr/gosnappy/snappy",
 			"Rev": "156a073208e131d7d2e212cb749feae7c339e846"
 		},
+		{
+			"ImportPath": "github.com/whyrusleeping/go-metrics",
+			"Rev": "1cd8009604ec2238b5a71305a0ecd974066e0e16"
+		},
 		{
 			"ImportPath": "golang.org/x/crypto/blowfish",
 			"Rev": "b7d6bf2c61544745a02f83dec90393985fc3a065"
@ -233,10 +241,6 @@
 			"ImportPath": "golang.org/x/net/context",
 			"Rev": "7dbad50ab5b31073856416cdcfeb2796d682f844"
 		},
-		{
-			"ImportPath": "github.com/whyrusleeping/go-metrics",
-			"Rev": "1cd8009604ec2238b5a71305a0ecd974066e0e16"
-		},
 		{
 			"ImportPath": "gopkg.in/fsnotify.v1",
 			"Comment": "v1.2.0",
--- a/Godeps/_workspace/src/github.com/mtchavez/jenkins/.gitignore
+++ b/Godeps/_workspace/src/github.com/mtchavez/jenkins/.gitignore
@ -0,0 +1,23 @@
+# Compiled Object files, Static and Dynamic libs (Shared Objects)
+*.o
+*.a
+*.so
+
+# Folders
+_obj
+_test
+
+# Architecture specific extensions/prefixes
+*.[568vq]
+[568vq].out
+
+*.cgo1.go
+*.cgo2.c
+_cgo_defun.c
+_cgo_gotypes.go
+_cgo_export.*
+
+_testmain.go
+
+*.exe
+*.test
--- a/Godeps/_workspace/src/github.com/mtchavez/jenkins/.travis.yml
+++ b/Godeps/_workspace/src/github.com/mtchavez/jenkins/.travis.yml
@ -0,0 +1,8 @@
+go:
+  - 1.1
+  - tip
+install:
+  - go get github.com/onsi/ginkgo
+  - go get github.com/onsi/gomega
+before_script: go test -i ./...
+script: go test ./...
--- a/Godeps/_workspace/src/github.com/mtchavez/jenkins/Makefile
+++ b/Godeps/_workspace/src/github.com/mtchavez/jenkins/Makefile
@ -0,0 +1,11 @@
+build:
+	go build jenkins.go
+
+run:
+	go run jenkins.go
+
+test:
+	go test -cover
+
+default:
+	go run jenkins.go
--- a/Godeps/_workspace/src/github.com/mtchavez/jenkins/README.md
+++ b/Godeps/_workspace/src/github.com/mtchavez/jenkins/README.md
@ -0,0 +1,45 @@
+Jenkins
+=================
+
+Golang Jenkins hash
+
+[![Build Status](https://travis-ci.org/mtchavez/go-jenkins-hashes.png?branch=master)](https://travis-ci.org/mtchavez/go-jenkins-hashes)
+
+## Install
+
+`go get -u github.com/mtchavez/jenkins`
+
+## Usage
+
+Jenkins follows the [Hash32](http://golang.org/pkg/hash/#Hash32) interface from the Go standard library
+
+```go
+// Create a new hash
+jenkhash := New()
+
+// Write a string of bytes to hash
+key := []byte("my-random-key")
+length, err := jenkhash.Write(key)
+
+// Get uint32 sum of hash
+sum := jenkhash.Sum32()
+
+// Append the 4-byte hash (most significant byte first) to a byte slice
+sumbytes := jenkhash.Sum(key)
+```
+
+## Testing
+
+Uses [Ginkgo](http://onsi.github.io/ginkgo/) for testing.
+
+Run via `make test` which will run `go test -cover`
+
+## Documentation
+
+Docs on [godoc](http://godoc.org/github.com/mtchavez/jenkins)
+
+## License
+
+Written by Chavez
+
+Released under the MIT License: http://www.opensource.org/licenses/mit-license.php
--- a/Godeps/_workspace/src/github.com/mtchavez/jenkins/jenkins.go
+++ b/Godeps/_workspace/src/github.com/mtchavez/jenkins/jenkins.go
@ -0,0 +1,48 @@
+package jenkins
+
+import "hash"
+
+type jenkhash uint32
+
+func New() hash.Hash32 {
+	var j jenkhash = 0
+	return &j
+}
+
+func (j *jenkhash) Write(key []byte) (int, error) {
+	hash := *j
+
+	for _, b := range key {
+		hash += jenkhash(b)
+		hash += (hash << 10)
+		hash ^= (hash >> 6)
+	}
+
+	hash += (hash << 3)
+	hash ^= (hash >> 11)
+	hash += (hash << 15)
+
+	*j = hash
+	return len(key), nil
+}
+
+func (j *jenkhash) Reset() {
+	*j = 0
+}
+
+func (j *jenkhash) Size() int {
+	return 4
+}
+
+func (j *jenkhash) BlockSize() int {
+	return 1
+}
+
+func (j *jenkhash) Sum32() uint32 {
+	return uint32(*j)
+}
+
+func (j *jenkhash) Sum(in []byte) []byte {
+	v := j.Sum32()
+	return append(in, byte(v>>24), byte(v>>16), byte(v>>8), byte(v))
+}
--- a/Godeps/_workspace/src/github.com/mtchavez/jenkins/jenkins_suite_test.go
+++ b/Godeps/_workspace/src/github.com/mtchavez/jenkins/jenkins_suite_test.go
@ -0,0 +1,13 @@
+package jenkins
+
+import (
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+
+	"testing"
+)
+
+func TestJenkins(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Jenkins Suite")
+}
--- a/Godeps/_workspace/src/github.com/mtchavez/jenkins/jenkins_test.go
+++ b/Godeps/_workspace/src/github.com/mtchavez/jenkins/jenkins_test.go
@ -0,0 +1,101 @@
+package jenkins
+
+import (
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+	"hash"
+)
+
+var _ = Describe("Jenkins", func() {
+
+	var jhash hash.Hash32
+	var key []byte
+
+	BeforeEach(func() {
+		jhash = New()
+		key = []byte("Apple")
+	})
+
+	Describe("New", func() {
+
+		It("returns jenkhash", func() {
+			var h *jenkhash
+			Expect(jhash).To(BeAssignableToTypeOf(h))
+		})
+
+		It("initializes offset to 0", func() {
+			Expect(jhash.Sum32()).To(Equal(uint32(0)))
+		})
+	})
+
+	Describe("Write", func() {
+
+		It("returns key length", func() {
+			length, _ := jhash.Write(key)
+			Expect(length).To(Equal(5))
+		})
+
+		It("has no error", func() {
+			_, err := jhash.Write(key)
+			Expect(err).To(BeNil())
+		})
+
+	})
+
+	Describe("Reset", func() {
+
+		It("sets back to 0", func() {
+			Expect(jhash.Sum32()).To(Equal(uint32(0)))
+			jhash.Write(key)
+			Expect(jhash.Sum32()).NotTo(Equal(uint32(0)))
+			jhash.Reset()
+			Expect(jhash.Sum32()).To(Equal(uint32(0)))
+		})
+
+	})
+
+	Describe("Size", func() {
+
+		It("is 4", func() {
+			Expect(jhash.Size()).To(Equal(4))
+		})
+
+	})
+
+	Describe("BlockSize", func() {
+
+		It("is 1", func() {
+			Expect(jhash.BlockSize()).To(Equal(1))
+		})
+
+	})
+
+	Describe("Sum32", func() {
+
+		It("defaults to 0", func() {
+			Expect(jhash.Sum32()).To(Equal(uint32(0)))
+		})
+
+		It("sums hash", func() {
+			jhash.Write(key)
+			Expect(jhash.Sum32()).To(Equal(uint32(884782484)))
+		})
+
+	})
+
+	Describe("Sum", func() {
+
+		It("default 0 hash byte returned", func() {
+			expected := []byte{0x41, 0x70, 0x70, 0x6c, 0x65, 0x0, 0x0, 0x0, 0x0}
+			Expect(jhash.Sum(key)).To(Equal(expected))
+		})
+
+		It("returns sum byte array", func() {
+			jhash.Write(key)
+			expected := []byte{0x41, 0x70, 0x70, 0x6c, 0x65, 0x34, 0xbc, 0xb5, 0x94}
+			Expect(jhash.Sum(key)).To(Equal(expected))
+		})
+
+	})
+
+})
--- a/blocks/bloom/filter.go
+++ b/blocks/bloom/filter.go
@ -2,13 +2,11 @@
 package bloom

 import (
+	"encoding/binary"
 	"errors"
-	"fmt"
+	// Non crypto hash, because speed
+	"github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/mtchavez/jenkins"
 	"hash"
-	"hash/adler32"
-	"hash/crc32"
-	"hash/fnv"
-	"math/big"
 )

 type Filter interface {
@ -17,61 +15,66 @@ type Filter interface {
 	Merge(Filter) (Filter, error)
 }

-func BasicFilter() Filter {
-	// Non crypto hashes, because speed
-	return NewFilter(2048, adler32.New(), fnv.New32(), crc32.NewIEEE())
-}
-
-func NewFilter(size int, hashes ...hash.Hash) Filter {
+func NewFilter(size int) Filter {
 	return &filter{
+		hash:   jenkins.New(),
 		filter: make([]byte, size),
-		hashes: hashes,
+		k:      3,
 	}
 }

 type filter struct {
 	filter []byte
-	hashes []hash.Hash
+	hash   hash.Hash32
+	k      int
 }

-func (f *filter) Add(k []byte) {
-	for _, h := range f.hashes {
-		i := bytesMod(h.Sum(k), int64(len(f.filter)*8))
-		f.setBit(i)
+func BasicFilter() Filter {
+	return NewFilter(2048)
+}
+
+func (f *filter) Add(bytes []byte) {
+	for _, bit := range f.getBitIndicies(bytes) {
+		f.setBit(bit)
 	}
 }

-func (f *filter) Find(k []byte) bool {
-	for _, h := range f.hashes {
-		i := bytesMod(h.Sum(k), int64(len(f.filter)*8))
-		if !f.getBit(i) {
+func (f *filter) getBitIndicies(bytes []byte) []uint32 {
+	indicies := make([]uint32, f.k)
+
+	f.hash.Write(bytes)
+	b := make([]byte, 4)
+
+	for i := 0; i < f.k; i++ {
+		res := f.hash.Sum32()
+		indicies[i] = res % (uint32(len(f.filter)) * 8)
+
+		binary.LittleEndian.PutUint32(b, res)
+		f.hash.Write(b)
+	}
+
+	f.hash.Reset()
+
+	return indicies
+}
+
+func (f *filter) Find(bytes []byte) bool {
+	for _, bit := range f.getBitIndicies(bytes) {
+		if !f.getBit(bit) {
 			return false
 		}
 	}
 	return true
 }

-func (f *filter) setBit(i int64) {
-	fmt.Printf("setting bit %d\n", i)
+func (f *filter) setBit(i uint32) {
 	f.filter[i/8] |= (1 << byte(i%8))
 }

-func (f *filter) getBit(i int64) bool {
-	fmt.Printf("getting bit %d\n", i)
+func (f *filter) getBit(i uint32) bool {
 	return f.filter[i/8]&(1<<byte(i%8)) != 0
 }

-func bytesMod(b []byte, modulo int64) int64 {
-	i := big.NewInt(0)
-	i = i.SetBytes(b)
-
-	bigmod := big.NewInt(int64(modulo))
-	result := big.NewInt(0)
-	result.Mod(i, bigmod)
-
-	return result.Int64()
-}
-
 func (f *filter) Merge(o Filter) (Filter, error) {
 	casfil, ok := o.(*filter)
 	if !ok {
@ -82,12 +85,15 @@ func (f *filter) Merge(o Filter) (Filter, error) {
 		return nil, errors.New("filter lengths must match!")
 	}

+	if casfil.k != f.k {
+		return nil, errors.New("filter k-values must match!")
+	}
+
 	nfilt := new(filter)
-
-	// this bit is sketchy, need a way of comparing hash functions
-	nfilt.hashes = f.hashes
-
+	nfilt.hash = f.hash
 	nfilt.filter = make([]byte, len(f.filter))
+	nfilt.k = f.k
+
 	for i, v := range f.filter {
 		nfilt.filter[i] = v | casfil.filter[i]
 	}
--- a/blocks/bloom/filter_test.go
+++ b/blocks/bloom/filter_test.go
@ -1,13 +1,19 @@
 package bloom

-import "testing"
+import (
+	"encoding/binary"
+	"fmt"
+	"testing"
+)

 func TestFilter(t *testing.T) {
-	f := BasicFilter()
+	f := NewFilter(128)
+
 	keys := [][]byte{
 		[]byte("hello"),
 		[]byte("fish"),
 		[]byte("ipfsrocks"),
+		[]byte("i want ipfs socks"),
 	}

 	f.Add(keys[0])
@ -21,10 +27,54 @@ func TestFilter(t *testing.T) {
 	}

 	f.Add(keys[2])
+	f.Add(keys[3])

 	for _, k := range keys {
 		if !f.Find(k) {
 			t.Fatal("Couldnt find one of three keys")
 		}
 	}
+
+	if f.Find([]byte("beep boop")) {
+		t.Fatal("Got false positive! Super unlikely!")
+	}
+
+	fmt.Println(f)
+}
+
+func TestMerge(t *testing.T) {
+
+	f1 := NewFilter(128)
+	f2 := NewFilter(128)
+
+	fbork := NewFilter(32)
+
+	_, err := f1.Merge(fbork)
+
+	if err == nil {
+		t.Fatal("Merge should fail on filters with different lengths")
+	}
+
+	b := make([]byte, 4)
+
+	var i uint32
+	for i = 0; i < 10; i++ {
+		binary.LittleEndian.PutUint32(b, i)
+		f1.Add(b)
+	}
+
+	for i = 10; i < 20; i++ {
+		binary.LittleEndian.PutUint32(b, i)
+		f2.Add(b)
+	}
+
+	merged, _ := f1.Merge(f2)
+
+	for i = 0; i < 20; i++ {
+		binary.LittleEndian.PutUint32(b, i)
+
+		if !merged.Find(b) {
+			t.Fatal("Could not find all keys in merged filter")
+		}
+	}
 }