Reformulate sparseWriter to deal with starting/ending zeroes explicitly

... instead of using a multi-variable state machine.

The net effect of this code is exactly the same as the previous implementation,
except:
- the behavior of the writer after Write() returns an error might differ
- If the file ends with zeroes, we don't Seek(-1), and
  we don't create a hole at all if the trailing run of zeroes is too small,
  preferring to save a syscall.

But this formulation is hopefully easier to prove correct.

Signed-off-by: Miloslav Trmač <mitr@redhat.com>
This commit is contained in:
Miloslav Trmač
2024-02-20 20:16:53 +01:00
parent c5434bf711
commit 5d303ca267

View File

@ -1,19 +1,12 @@
package compression package compression
import ( import (
"bytes"
"errors" "errors"
"fmt"
"io" "io"
) )
type state int const zerosThreshold = 1024
const (
zerosThreshold = 1024
stateData = iota
stateZeros
)
type WriteSeekCloser interface { type WriteSeekCloser interface {
io.Closer io.Closer
@ -21,91 +14,103 @@ type WriteSeekCloser interface {
} }
type sparseWriter struct { type sparseWriter struct {
state state file WriteSeekCloser
file WriteSeekCloser // Invariant between method calls:
zeros int64 // The contents of the file match the contents passed to Write, except that pendingZeroes trailing zeroes have not been written.
lastIsZero bool // Also, the data that _has_ been written does not end with a zero byte (i.e. pendingZeroes is the largest possible value).
pendingZeroes int64
} }
func NewSparseWriter(file WriteSeekCloser) *sparseWriter { func NewSparseWriter(file WriteSeekCloser) *sparseWriter {
return &sparseWriter{ return &sparseWriter{
file: file, file: file,
state: stateData, pendingZeroes: 0,
zeros: 0,
lastIsZero: false,
} }
} }
func (sw *sparseWriter) createHole() error { func (sw *sparseWriter) createHole(size int64) error {
zeros := sw.zeros _, err := sw.file.Seek(size, io.SeekCurrent)
if zeros == 0 {
return nil
}
sw.zeros = 0
sw.lastIsZero = true
_, err := sw.file.Seek(zeros, io.SeekCurrent)
return err return err
} }
func findFirstNotZero(b []byte) int { func zeroSpanEnd(b []byte, i int) int {
for i, v := range b { for i < len(b) && b[i] == 0 {
if v != 0 { i++
return i
}
} }
return -1 return i
}
func nonzeroSpanEnd(b []byte, i int) int {
for i < len(b) && b[i] != 0 {
i++
}
return i
} }
// Write writes data to the file, creating holes for long sequences of zeros. // Write writes data to the file, creating holes for long sequences of zeros.
func (sw *sparseWriter) Write(data []byte) (int, error) { func (sw *sparseWriter) Write(data []byte) (int, error) {
written, current := 0, 0 initialZeroSpanLength := zeroSpanEnd(data, 0)
totalLen := len(data) if initialZeroSpanLength == len(data) {
for current < len(data) { sw.pendingZeroes += int64(initialZeroSpanLength)
switch sw.state { return initialZeroSpanLength, nil
case stateData: }
nextZero := bytes.IndexByte(data[current:], 0)
if nextZero < 0 { // We have _some_ non-zero data to write.
_, err := sw.file.Write(data[written:]) // Think of the input as an alternating sequence of spans of zeroes / non-zeroes 0a0b…c0,
sw.lastIsZero = false // where the starting/ending span of zeroes may be empty.
return totalLen, err
} else { pendingWriteOffset := 0
current += nextZero // The expected condition for creating a hole would be sw.pendingZeroes + initialZeroSpanLength >= zerosThreshold; but
sw.state = stateZeros // if sw.pendingZeroes != 0, we are going to spend a syscall to deal with sw.pendingZeroes either way.
} // We might just as well make it a createHole(), even if the hole size is below zerosThreshold.
case stateZeros: if sw.pendingZeroes != 0 || initialZeroSpanLength >= zerosThreshold {
nextNonZero := findFirstNotZero(data[current:]) if err := sw.createHole(sw.pendingZeroes + int64(initialZeroSpanLength)); err != nil {
if nextNonZero < 0 { return -1, err
// finish with a zero, flush any data and keep track of the zeros }
if written != current { // We could set sw.pendingZeroes = 0 now; it would always be overwritten on successful return from this function.
if _, err := sw.file.Write(data[written:current]); err != nil { pendingWriteOffset = initialZeroSpanLength
return -1, err }
}
sw.lastIsZero = false current := initialZeroSpanLength
} for {
sw.zeros += int64(len(data) - current) // Invariant at this point of this loop:
return totalLen, nil // - pendingWriteOffset <= current < len(data)
} // - data[current] != 0
// do not bother with too short sequences // - data[pendingWriteOffset:current] has not yet been written
if sw.zeros == 0 && nextNonZero < zerosThreshold { if pendingWriteOffset > current || current >= len(data) {
sw.state = stateData return -1, fmt.Errorf("internal error: sparseWriter invariant violation: %d <= %d < %d", pendingWriteOffset, current, len(data))
current += nextNonZero }
continue if b := data[current]; b == 0 {
} return -1, fmt.Errorf("internal error: sparseWriter invariant violation: %d@%d", b, current)
if written != current { }
if _, err := sw.file.Write(data[written:current]); err != nil {
return -1, err nonzeroSpanEnd := nonzeroSpanEnd(data, current)
} if nonzeroSpanEnd == current {
sw.lastIsZero = false return -1, fmt.Errorf("internal error: sparseWriters nonzeroSpanEnd didnt advance")
} }
sw.zeros += int64(nextNonZero) zeroSpanEnd := zeroSpanEnd(data, nonzeroSpanEnd) // possibly == nonzeroSpanEnd
current += nextNonZero zeroSpanLength := zeroSpanEnd - nonzeroSpanEnd
if err := sw.createHole(); err != nil { if zeroSpanEnd < len(data) && zeroSpanLength < zerosThreshold {
return -1, err // Too small a hole, keep going
} current = zeroSpanEnd
written = current continue
} }
// We have either reached the end, or found an interesting hole. Issue a write.
if _, err := sw.file.Write(data[pendingWriteOffset:nonzeroSpanEnd]); err != nil {
return -1, err
}
if zeroSpanEnd == len(data) {
sw.pendingZeroes = int64(zeroSpanLength)
return zeroSpanEnd, nil
}
if err := sw.createHole(int64(zeroSpanLength)); err != nil {
return -1, err
}
pendingWriteOffset = zeroSpanEnd
current = zeroSpanEnd
} }
return totalLen, nil
} }
// Close closes the SparseWriter's underlying file. // Close closes the SparseWriter's underlying file.
@ -113,16 +118,16 @@ func (sw *sparseWriter) Close() error {
if sw.file == nil { if sw.file == nil {
return errors.New("file is already closed") return errors.New("file is already closed")
} }
if err := sw.createHole(); err != nil { if sw.pendingZeroes != 0 {
sw.file.Close() if holeSize := sw.pendingZeroes - 1; holeSize >= zerosThreshold {
return err if err := sw.createHole(holeSize); err != nil {
} sw.file.Close()
if sw.lastIsZero { return err
if _, err := sw.file.Seek(-1, io.SeekCurrent); err != nil { }
sw.file.Close() sw.pendingZeroes -= holeSize
return err
} }
if _, err := sw.file.Write([]byte{0}); err != nil { var zeroArray [zerosThreshold]byte
if _, err := sw.file.Write(zeroArray[:sw.pendingZeroes]); err != nil {
sw.file.Close() sw.file.Close()
return err return err
} }