1
0
mirror of https://github.com/ipfs/kubo.git synced 2025-06-29 01:12:24 +08:00

implement basic rabin fingerprint file splitting

This commit is contained in:
Jeromy
2014-08-30 10:53:26 -07:00
parent fbd611f4a4
commit 1f309b72d0
2 changed files with 102 additions and 0 deletions

50
importer/split_test.go Normal file
View File

@ -0,0 +1,50 @@
package importer
import (
"testing"
"crypto/rand"
"bytes"
)
// TestDataSplitting runs Rabin over 16 MiB of random data and checks that
// the returned blocks, laid end to end, reproduce the input exactly. It also
// logs the block count, the smallest and largest block sizes, and how many
// blocks are exactly 16384 bytes long.
func TestDataSplitting(t *testing.T) {
	buf := make([]byte, 16*1024*1024)
	if _, err := rand.Read(buf); err != nil {
		t.Fatalf("generating random input: %v", err)
	}

	split := Rabin(buf)
	if len(split) == 1 {
		t.Fatal("No split occurred!")
	}

	// smallest starts above any block size expected here so the first block
	// always lowers it.
	smallest := 2 << 15
	largest := 0
	maxSized := 0
	offset := 0
	for _, blk := range split {
		end := offset + len(blk)
		// Guard the slice expression below: without this check an
		// over-long result would panic instead of failing the test.
		if end > len(buf) {
			t.Fatal("Split produced more data than the input!")
		}
		if !bytes.Equal(blk, buf[offset:end]) {
			t.Fatal("Split lost data!")
		}
		offset = end

		if len(blk) < smallest {
			smallest = len(blk)
		}
		if len(blk) > largest {
			largest = len(blk)
		}
		if len(blk) == 16384 {
			maxSized++
		}
	}

	if offset != len(buf) {
		t.Fatal("missing some bytes!")
	}

	t.Log(len(split))
	t.Log(smallest, largest, maxSized)
}

52
importer/splitting.go Normal file
View File

@ -0,0 +1,52 @@
package importer
// BlockSplitter is the signature of a function that takes a byte slice and
// returns a slice of byte slices (presumably the input divided into blocks;
// Rabin in this file has this signature).
type BlockSplitter func([]byte) [][]byte
// Rabin splits b into consecutive chunks whose boundaries are chosen from a
// running polynomial hash of the data. The returned chunks are subslices of
// b (nothing is copied) and, concatenated in order, cover b exactly.
//
// Rules, as implemented:
//   - An input of windowSize bytes or fewer is returned as a single chunk.
//   - Scanning starts after the first windowSize bytes have been hashed. A
//     cut is made in front of index i when the current chunk has reached
//     maxChunkSize bytes, or when the hash (which already includes b[i]) is a
//     multiple of 8192 and the current chunk holds at least minChunkSize
//     bytes.
//   - Whatever remains after the last cut becomes the final chunk.
//
// NOTE(review): the hash accumulates over every byte since the start of b
// and never drops bytes that leave the window, so it is not a true
// sliding-window fingerprint; windowSize only controls the minimum input
// length and the warm-up. Changing that would move every chunk boundary, so
// it is left as is here.
//
// TODO: this should take a reader, not a byte array. what if we're splitting a 3TB file?
func Rabin(b []byte) [][]byte {
	const (
		windowSize   = 48
		maxChunkSize = 16 * 1024
		minChunkSize = 2048
		prime        = uint64(61)
		modulus      = 8192 // hash%modulus == 0 <=> low 13 bits are zero
	)

	// Smaller than a window? Nothing to split.
	if len(b) <= windowSize {
		return [][]byte{b}
	}

	// Warm up the hash on the first window; no cuts are considered here.
	// uint64 arithmetic wraps on overflow, which is intended.
	var poly uint64
	for _, c := range b[:windowSize] {
		poly = poly*prime + uint64(c)
	}

	var out [][]byte
	start := 0
	for i := windowSize; i < len(b); i++ {
		poly = poly*prime + uint64(b[i])

		size := i - start
		// The two conditions cannot both fire in one iteration: after a
		// cut the current chunk is empty, which is below minChunkSize.
		if size >= maxChunkSize || (size >= minChunkSize && poly%modulus == 0) {
			out = append(out, b[start:i])
			start = i
		}
	}

	// start is always < len(b) here, so there is always a tail to emit.
	return append(out, b[start:])
}