From 5d303ca267fce98be6f2299fbbb50334d7bda159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miloslav=20Trma=C4=8D?= Date: Tue, 20 Feb 2024 20:16:53 +0100 Subject: [PATCH] Reformulate sparseWriter to deal with starting/ending zeroes explicitly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... instead of using a multi-variable state machine. The net effect of this code is exactly the same as the previous implementation, except: - the operation after Write() returns an error might differ - If the file ends with zeroes, we don't Seek(-1), and we don't create a hole at all if it is too small, preferring to save a syscall. But this formulation is hopefully easier to prove correct. Signed-off-by: Miloslav Trmač --- pkg/machine/compression/sparse_file_writer.go | 179 +++++++++--------- 1 file changed, 92 insertions(+), 87 deletions(-) diff --git a/pkg/machine/compression/sparse_file_writer.go b/pkg/machine/compression/sparse_file_writer.go index d8e071a3ca..c79941587e 100644 --- a/pkg/machine/compression/sparse_file_writer.go +++ b/pkg/machine/compression/sparse_file_writer.go @@ -1,19 +1,12 @@ package compression import ( - "bytes" "errors" + "fmt" "io" ) -type state int - -const ( - zerosThreshold = 1024 - - stateData = iota - stateZeros -) +const zerosThreshold = 1024 type WriteSeekCloser interface { io.Closer @@ -21,91 +14,103 @@ type WriteSeekCloser interface { } type sparseWriter struct { - state state - file WriteSeekCloser - zeros int64 - lastIsZero bool + file WriteSeekCloser + // Invariant between method calls: + // The contents of the file match the contents passed to Write, except that pendingZeroes trailing zeroes have not been written. + // Also, the data that _has_ been written does not end with a zero byte (i.e. pendingZeroes is the largest possible value. + pendingZeroes int64 } func NewSparseWriter(file WriteSeekCloser) *sparseWriter { return &sparseWriter{ - file: file, - state: stateData, - zeros: 0, - lastIsZero: false, + file: file, + pendingZeroes: 0, } } -func (sw *sparseWriter) createHole() error { - zeros := sw.zeros - if zeros == 0 { - return nil - } - sw.zeros = 0 - sw.lastIsZero = true - _, err := sw.file.Seek(zeros, io.SeekCurrent) +func (sw *sparseWriter) createHole(size int64) error { + _, err := sw.file.Seek(size, io.SeekCurrent) return err } -func findFirstNotZero(b []byte) int { - for i, v := range b { - if v != 0 { - return i - } +func zeroSpanEnd(b []byte, i int) int { + for i < len(b) && b[i] == 0 { + i++ } - return -1 + return i +} + +func nonzeroSpanEnd(b []byte, i int) int { + for i < len(b) && b[i] != 0 { + i++ + } + return i } // Write writes data to the file, creating holes for long sequences of zeros. func (sw *sparseWriter) Write(data []byte) (int, error) { - written, current := 0, 0 - totalLen := len(data) - for current < len(data) { - switch sw.state { - case stateData: - nextZero := bytes.IndexByte(data[current:], 0) - if nextZero < 0 { - _, err := sw.file.Write(data[written:]) - sw.lastIsZero = false - return totalLen, err - } else { - current += nextZero - sw.state = stateZeros - } - case stateZeros: - nextNonZero := findFirstNotZero(data[current:]) - if nextNonZero < 0 { - // finish with a zero, flush any data and keep track of the zeros - if written != current { - if _, err := sw.file.Write(data[written:current]); err != nil { - return -1, err - } - sw.lastIsZero = false - } - sw.zeros += int64(len(data) - current) - return totalLen, nil - } - // do not bother with too short sequences - if sw.zeros == 0 && nextNonZero < zerosThreshold { - sw.state = stateData - current += nextNonZero - continue - } - if written != current { - if _, err := sw.file.Write(data[written:current]); err != nil { - return -1, err - } - sw.lastIsZero = false - } - sw.zeros += int64(nextNonZero) - current += nextNonZero - if err := sw.createHole(); err != nil { - return -1, err - } - written = current - } + initialZeroSpanLength := zeroSpanEnd(data, 0) + if initialZeroSpanLength == len(data) { + sw.pendingZeroes += int64(initialZeroSpanLength) + return initialZeroSpanLength, nil + } + + // We have _some_ non-zero data to write. + // Think of the input as an alternating sequence of spans of zeroes / non-zeroes 0a0b…c0, + // where the starting/ending span of zeroes may be empty. + + pendingWriteOffset := 0 + // The expected condition for creating a hole would be sw.pendingZeroes + initialZeroSpanLength >= zerosThreshold; but + // if sw.pendingZeroes != 0, we are going to spend a syscall to deal with sw.pendingZeroes either way. + // We might just as well make it a createHole(), even if the hole size is below zeroThreshold. + if sw.pendingZeroes != 0 || initialZeroSpanLength >= zerosThreshold { + if err := sw.createHole(sw.pendingZeroes + int64(initialZeroSpanLength)); err != nil { + return -1, err + } + // We could set sw.pendingZeroes = 0 now; it would always be overwritten on successful return from this function. + pendingWriteOffset = initialZeroSpanLength + } + + current := initialZeroSpanLength + for { + // Invariant at this point of this loop: + // - pendingWriteOffset <= current < len(data) + // - data[current] != 0 + // - data[pendingWriteOffset:current] has not yet been written + if pendingWriteOffset > current || current >= len(data) { + return -1, fmt.Errorf("internal error: sparseWriter invariant violation: %d <= %d < %d", pendingWriteOffset, current, len(data)) + } + if b := data[current]; b == 0 { + return -1, fmt.Errorf("internal error: sparseWriter invariant violation: %d@%d", b, current) + } + + nonzeroSpanEnd := nonzeroSpanEnd(data, current) + if nonzeroSpanEnd == current { + return -1, fmt.Errorf("internal error: sparseWriter’s nonzeroSpanEnd didn’t advance") + } + zeroSpanEnd := zeroSpanEnd(data, nonzeroSpanEnd) // possibly == nonzeroSpanEnd + zeroSpanLength := zeroSpanEnd - nonzeroSpanEnd + if zeroSpanEnd < len(data) && zeroSpanLength < zerosThreshold { + // Too small a hole, keep going + current = zeroSpanEnd + continue + } + + // We have either reached the end, or found an interesting hole. Issue a write. + if _, err := sw.file.Write(data[pendingWriteOffset:nonzeroSpanEnd]); err != nil { + return -1, err + } + if zeroSpanEnd == len(data) { + sw.pendingZeroes = int64(zeroSpanLength) + return zeroSpanEnd, nil + } + + if err := sw.createHole(int64(zeroSpanLength)); err != nil { + return -1, err + } + pendingWriteOffset = zeroSpanEnd + current = zeroSpanEnd } - return totalLen, nil } // Close closes the SparseWriter's underlying file. @@ -113,16 +118,16 @@ func (sw *sparseWriter) Close() error { if sw.file == nil { return errors.New("file is already closed") } - if err := sw.createHole(); err != nil { - sw.file.Close() - return err - } - if sw.lastIsZero { - if _, err := sw.file.Seek(-1, io.SeekCurrent); err != nil { - sw.file.Close() - return err + if sw.pendingZeroes != 0 { + if holeSize := sw.pendingZeroes - 1; holeSize >= zerosThreshold { + if err := sw.createHole(holeSize); err != nil { + sw.file.Close() + return err + } + sw.pendingZeroes -= holeSize } - if _, err := sw.file.Write([]byte{0}); err != nil { + var zeroArray [zerosThreshold]byte + if _, err := sw.file.Write(zeroArray[:sw.pendingZeroes]); err != nil { sw.file.Close() return err }