mirror of
https://github.com/ipfs/kubo.git
synced 2025-06-29 01:12:24 +08:00
implement basic rabin fingerprint file splitting
This commit is contained in:
50
importer/split_test.go
Normal file
50
importer/split_test.go
Normal file
@ -0,0 +1,50 @@
|
||||
package importer
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"crypto/rand"
|
||||
"bytes"
|
||||
)
|
||||
|
||||
func TestDataSplitting(t *testing.T) {
|
||||
buf := make([]byte, 16*1024*1024)
|
||||
rand.Read(buf)
|
||||
|
||||
split := Rabin(buf)
|
||||
|
||||
if len(split) == 1 {
|
||||
t.Fatal("No split occurred!")
|
||||
}
|
||||
|
||||
min := 2 << 15
|
||||
max := 0
|
||||
|
||||
mxcount := 0
|
||||
|
||||
n := 0
|
||||
for _, b := range split {
|
||||
if !bytes.Equal(b, buf[n:n+len(b)]) {
|
||||
t.Fatal("Split lost data!")
|
||||
}
|
||||
n += len(b)
|
||||
|
||||
if len(b) < min {
|
||||
min = len(b)
|
||||
}
|
||||
|
||||
if len(b) > max {
|
||||
max = len(b)
|
||||
}
|
||||
|
||||
if len(b) == 16384 {
|
||||
mxcount++
|
||||
}
|
||||
}
|
||||
|
||||
if n != len(buf) {
|
||||
t.Fatal("missing some bytes!")
|
||||
}
|
||||
t.Log(len(split))
|
||||
t.Log(min, max, mxcount)
|
||||
}
|
||||
|
52
importer/splitting.go
Normal file
52
importer/splitting.go
Normal file
@ -0,0 +1,52 @@
|
||||
package importer
|
||||
|
||||
// BlockSplitter is the signature of a function that takes a byte slice and
// returns it divided into a sequence of byte-slice blocks.
type BlockSplitter func([]byte) [][]byte
|
||||
|
||||
// Rabin splits b into consecutive chunks using a polynomial rolling hash over
// a sliding 48-byte window. A chunk boundary is placed wherever the low 13
// bits of the window hash are zero, provided the current chunk already holds
// at least 2048 bytes; a boundary is forced once a chunk reaches 16384 bytes.
// Because the hash only covers the most recent window, boundaries depend on
// local content rather than on everything that precedes them.
//
// The returned chunks are sub-slices of b (no data is copied) and,
// concatenated in order, reproduce b exactly. An input no longer than the
// window is returned unchanged as a single chunk.
//
// TODO: this should take a reader, not a byte array. what if we're splitting a 3TB file?
func Rabin(b []byte) [][]byte {
	const (
		windowSize   = 48        // bytes covered by the rolling hash
		maxChunkSize = 16 * 1024 // a boundary is forced at this size
		minChunkSize = 2048      // hash boundaries are ignored below this size
		boundaryBits = 13        // low bits of the hash that must be zero
		prime        = 61        // multiplier of the polynomial hash

		boundaryMask = 1<<boundaryBits - 1
	)

	// Smaller than a window? Nothing to roll over, hand it back as-is.
	if len(b) <= windowSize {
		return [][]byte{b}
	}

	// outWeight is prime^windowSize (mod 2^64): after the hash has been
	// multiplied by prime, this is the weight carried by the byte that is
	// about to leave the window.
	outWeight := uint64(1)
	for k := 0; k < windowSize; k++ {
		outWeight *= prime
	}

	// Prime the hash with the first full window.
	var hash uint64
	for _, c := range b[:windowSize] {
		hash = hash*prime + uint64(c)
	}

	var chunks [][]byte
	start := 0 // index in b where the current chunk begins
	for i := windowSize; i < len(b); i++ {
		// Slide the window one byte: bring in b[i], drop b[i-windowSize].
		// Unsigned wrap-around keeps the arithmetic consistent mod 2^64.
		hash = hash*prime + uint64(b[i]) - uint64(b[i-windowSize])*outWeight

		size := i - start
		if size >= maxChunkSize || (size >= minChunkSize && hash&boundaryMask == 0) {
			// push block
			chunks = append(chunks, b[start:i])
			start = i
		}
	}

	// start is always < len(b) here, so the tail is never empty.
	return append(chunks, b[start:])
}
|
Reference in New Issue
Block a user