1
0
mirror of https://github.com/ipfs/kubo.git synced 2025-06-26 15:42:21 +08:00

faster hamt logic

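1. Use a custom bitfield type (go-bitfield) instead of bigints (math/big).
2. Make iterating over a hamt *significantly* faster by ranging over the children slice directly instead of testing every bit position.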

License: MIT
Signed-off-by: Steven Allen <steven@stebalien.com>
This commit is contained in:
Steven Allen
2018-03-28 17:41:28 -07:00
parent dbb2ca2ece
commit f1ae13d721
5 changed files with 35 additions and 56 deletions

View File

@ -581,6 +581,12 @@
"hash": "QmPdqSMmiwtQCBC515gFtMW2mP14HsfgnyQ2k5xPQVxMge", "hash": "QmPdqSMmiwtQCBC515gFtMW2mP14HsfgnyQ2k5xPQVxMge",
"name": "go-fs-lock", "name": "go-fs-lock",
"version": "0.1.2" "version": "0.1.2"
},
{
"author": "Stebalien",
"hash": "QmTbBs3Y3u5F69XNJzdnnc6SP5GKgcXxCDzx6w8m6piVRT",
"name": "go-bitfield",
"version": "0.1.1"
} }
], ],
"gxVersion": "0.10.0", "gxVersion": "0.10.0",

View File

@ -23,14 +23,13 @@ package hamt
import ( import (
"context" "context"
"fmt" "fmt"
"math"
"math/big"
"os" "os"
dag "github.com/ipfs/go-ipfs/merkledag" dag "github.com/ipfs/go-ipfs/merkledag"
format "github.com/ipfs/go-ipfs/unixfs" format "github.com/ipfs/go-ipfs/unixfs"
upb "github.com/ipfs/go-ipfs/unixfs/pb" upb "github.com/ipfs/go-ipfs/unixfs/pb"
bitfield "gx/ipfs/QmTbBs3Y3u5F69XNJzdnnc6SP5GKgcXxCDzx6w8m6piVRT/go-bitfield"
proto "gx/ipfs/QmZ4Qi3GaRbjcx28Sme5eMH7RQjGkt8wHxt2a65oLaeFEV/gogo-protobuf/proto" proto "gx/ipfs/QmZ4Qi3GaRbjcx28Sme5eMH7RQjGkt8wHxt2a65oLaeFEV/gogo-protobuf/proto"
cid "gx/ipfs/QmcZfnkapfECQGcLZaf9B79NRg7cRa9EnZh4LSbkCzwNvY/go-cid" cid "gx/ipfs/QmcZfnkapfECQGcLZaf9B79NRg7cRa9EnZh4LSbkCzwNvY/go-cid"
ipld "gx/ipfs/Qme5bWv7wtjUNGsK2BNGVUFPKiuxWrsqrtvYwCLRw8YFES/go-ipld-format" ipld "gx/ipfs/Qme5bWv7wtjUNGsK2BNGVUFPKiuxWrsqrtvYwCLRw8YFES/go-ipld-format"
@ -46,7 +45,7 @@ const (
type Shard struct { type Shard struct {
nd *dag.ProtoNode nd *dag.ProtoNode
bitfield *big.Int bitfield bitfield.Bitfield
children []child children []child
@ -75,22 +74,22 @@ func NewShard(dserv ipld.DAGService, size int) (*Shard, error) {
return nil, err return nil, err
} }
ds.bitfield = big.NewInt(0)
ds.nd = new(dag.ProtoNode) ds.nd = new(dag.ProtoNode)
ds.hashFunc = HashMurmur3 ds.hashFunc = HashMurmur3
return ds, nil return ds, nil
} }
func makeShard(ds ipld.DAGService, size int) (*Shard, error) { func makeShard(ds ipld.DAGService, size int) (*Shard, error) {
lg2s := int(math.Log2(float64(size))) lg2s, err := logtwo(size)
if 1<<uint(lg2s) != size { if err != nil {
return nil, fmt.Errorf("hamt size should be a power of two") return nil, err
} }
maxpadding := fmt.Sprintf("%X", size-1) maxpadding := fmt.Sprintf("%X", size-1)
return &Shard{ return &Shard{
tableSizeLg2: lg2s, tableSizeLg2: lg2s,
prefixPadStr: fmt.Sprintf("%%0%dX", len(maxpadding)), prefixPadStr: fmt.Sprintf("%%0%dX", len(maxpadding)),
maxpadlen: len(maxpadding), maxpadlen: len(maxpadding),
bitfield: bitfield.NewBitfield(size),
tableSize: size, tableSize: size,
dserv: ds, dserv: ds,
}, nil }, nil
@ -123,7 +122,7 @@ func NewHamtFromDag(dserv ipld.DAGService, nd ipld.Node) (*Shard, error) {
ds.nd = pbnd.Copy().(*dag.ProtoNode) ds.nd = pbnd.Copy().(*dag.ProtoNode)
ds.children = make([]child, len(pbnd.Links())) ds.children = make([]child, len(pbnd.Links()))
ds.bitfield = new(big.Int).SetBytes(pbd.GetData()) ds.bitfield.SetBytes(pbd.GetData())
ds.hashFunc = pbd.GetHashType() ds.hashFunc = pbd.GetHashType()
ds.prefix = &ds.nd.Prefix ds.prefix = &ds.nd.Prefix
@ -145,13 +144,13 @@ func (ds *Shard) Node() (ipld.Node, error) {
out := new(dag.ProtoNode) out := new(dag.ProtoNode)
out.SetPrefix(ds.prefix) out.SetPrefix(ds.prefix)
cindex := 0
// TODO: optimized 'for each set bit' // TODO: optimized 'for each set bit'
for i := 0; i < ds.tableSize; i++ { for i := 0; i < ds.tableSize; i++ {
if ds.bitfield.Bit(i) == 0 { if !ds.bitfield.Bit(i) {
continue continue
} }
cindex := ds.indexForBitPos(i)
ch := ds.children[cindex] ch := ds.children[cindex]
if ch != nil { if ch != nil {
clnk, err := ch.Link() clnk, err := ch.Link()
@ -173,6 +172,7 @@ func (ds *Shard) Node() (ipld.Node, error) {
return nil, err return nil, err
} }
} }
cindex++
} }
typ := upb.Data_HAMTShard typ := upb.Data_HAMTShard
@ -338,7 +338,7 @@ func (ds *Shard) insertChild(idx int, key string, lnk *ipld.Link) error {
} }
i := ds.indexForBitPos(idx) i := ds.indexForBitPos(idx)
ds.bitfield.SetBit(ds.bitfield, idx, 1) ds.bitfield.SetBit(idx)
lnk.Name = ds.linkNamePrefix(idx) + key lnk.Name = ds.linkNamePrefix(idx) + key
sv := &shardValue{ sv := &shardValue{
@ -367,7 +367,7 @@ func (ds *Shard) rmChild(i int) error {
func (ds *Shard) getValue(ctx context.Context, hv *hashBits, key string, cb func(*shardValue) error) error { func (ds *Shard) getValue(ctx context.Context, hv *hashBits, key string, cb func(*shardValue) error) error {
idx := hv.Next(ds.tableSizeLg2) idx := hv.Next(ds.tableSizeLg2)
if ds.bitfield.Bit(int(idx)) == 1 { if ds.bitfield.Bit(int(idx)) {
cindex := ds.indexForBitPos(idx) cindex := ds.indexForBitPos(idx)
child, err := ds.getChild(ctx, cindex) child, err := ds.getChild(ctx, cindex)
@ -409,14 +409,7 @@ func (ds *Shard) ForEachLink(ctx context.Context, f func(*ipld.Link) error) erro
} }
func (ds *Shard) walkTrie(ctx context.Context, cb func(*shardValue) error) error { func (ds *Shard) walkTrie(ctx context.Context, cb func(*shardValue) error) error {
for i := 0; i < ds.tableSize; i++ { for idx := range ds.children {
if ds.bitfield.Bit(i) == 0 {
continue
}
idx := ds.indexForBitPos(i)
// NOTE: an optimized version could simply iterate over each
// element in the 'children' array.
c, err := ds.getChild(ctx, idx) c, err := ds.getChild(ctx, idx)
if err != nil { if err != nil {
return err return err
@ -424,14 +417,12 @@ func (ds *Shard) walkTrie(ctx context.Context, cb func(*shardValue) error) error
switch c := c.(type) { switch c := c.(type) {
case *shardValue: case *shardValue:
err := cb(c) if err := cb(c); err != nil {
if err != nil {
return err return err
} }
case *Shard: case *Shard:
err := c.walkTrie(ctx, cb) if err := c.walkTrie(ctx, cb); err != nil {
if err != nil {
return err return err
} }
default: default:
@ -444,7 +435,7 @@ func (ds *Shard) walkTrie(ctx context.Context, cb func(*shardValue) error) error
func (ds *Shard) modifyValue(ctx context.Context, hv *hashBits, key string, val *ipld.Link) error { func (ds *Shard) modifyValue(ctx context.Context, hv *hashBits, key string, val *ipld.Link) error {
idx := hv.Next(ds.tableSizeLg2) idx := hv.Next(ds.tableSizeLg2)
if ds.bitfield.Bit(idx) != 1 { if !ds.bitfield.Bit(idx) {
return ds.insertChild(idx, key, val) return ds.insertChild(idx, key, val)
} }
@ -469,7 +460,7 @@ func (ds *Shard) modifyValue(ctx context.Context, hv *hashBits, key string, val
// Note: this shouldn't normally ever happen, but // Note: this shouldn't normally ever happen, but
// in the event that another implementation creates flawed // in the event that another implementation creates flawed
// structures, this will help to normalize them. // structures, this will help to normalize them.
ds.bitfield.SetBit(ds.bitfield, idx, 0) ds.bitfield.UnsetBit(idx)
return ds.rmChild(cindex) return ds.rmChild(cindex)
case 1: case 1:
nchild, ok := child.children[0].(*shardValue) nchild, ok := child.children[0].(*shardValue)
@ -486,7 +477,7 @@ func (ds *Shard) modifyValue(ctx context.Context, hv *hashBits, key string, val
if child.key == key { if child.key == key {
// value modification // value modification
if val == nil { if val == nil {
ds.bitfield.SetBit(ds.bitfield, idx, 0) ds.bitfield.UnsetBit(idx)
return ds.rmChild(cindex) return ds.rmChild(cindex)
} }
@ -530,15 +521,7 @@ func (ds *Shard) modifyValue(ctx context.Context, hv *hashBits, key string, val
// the given bit in the bitset. The collapsed array contains only one entry // the given bit in the bitset. The collapsed array contains only one entry
// per bit set in the bitfield, and this function is used to map the indices. // per bit set in the bitfield, and this function is used to map the indices.
func (ds *Shard) indexForBitPos(bp int) int { func (ds *Shard) indexForBitPos(bp int) int {
// TODO: an optimization could reuse the same 'mask' here and change the size
// as needed. This isn't yet done as the bitset package doesn't make it easy
// to do.
// make a bitmask (all bits set) 'bp' bits long
mask := new(big.Int).Sub(new(big.Int).Exp(big.NewInt(2), big.NewInt(int64(bp)), nil), big.NewInt(1))
mask.And(mask, ds.bitfield)
return popCount(mask)
} }
// linkNamePrefix takes in the bitfield index of an entry and returns its hex prefix // linkNamePrefix takes in the bitfield index of an entry and returns its hex prefix

View File

@ -433,7 +433,7 @@ func TestBitfieldIndexing(t *testing.T) {
s, _ := NewShard(ds, 256) s, _ := NewShard(ds, 256)
set := func(i int) { set := func(i int) {
s.bitfield.SetBit(s.bitfield, i, 1) s.bitfield.SetBit(i)
} }
assert := func(i int, val int) { assert := func(i int, val int) {

View File

@ -1,7 +1,7 @@
package hamt package hamt
import ( import (
"math/big" "fmt"
"math/bits" "math/bits"
) )
@ -40,10 +40,13 @@ func (hb *hashBits) Next(i int) int {
} }
} }
func popCount(i *big.Int) int { func logtwo(v int) (int, error) {
var n int if v <= 0 {
for _, v := range i.Bits() { return 0, fmt.Errorf("hamt size should be a power of two")
n += bits.OnesCount64(uint64(v))
} }
return n lg2 := bits.TrailingZeros(uint(v))
if 1<<uint(lg2) != v {
return 0, fmt.Errorf("hamt size should be a power of two")
}
return lg2, nil
} }

View File

@ -1,22 +1,9 @@
package hamt package hamt
import ( import (
"math/big"
"testing" "testing"
) )
func TestPopCount(t *testing.T) {
x := big.NewInt(0)
for i := 0; i < 50; i++ {
x.SetBit(x, i, 1)
}
if popCount(x) != 50 {
t.Fatal("expected popcount to be 50")
}
}
func TestHashBitsEvenSizes(t *testing.T) { func TestHashBitsEvenSizes(t *testing.T) {
buf := []byte{255, 127, 79, 45, 116, 99, 35, 17} buf := []byte{255, 127, 79, 45, 116, 99, 35, 17}
hb := hashBits{b: buf} hb := hashBits{b: buf}