mirror of
https://github.com/grafana/loki.git
synced 2026-03-13 09:33:58 +08:00
291 lines
8.2 KiB
Go
291 lines
8.2 KiB
Go
package columnar
|
|
|
|
import (
|
|
"github.com/grafana/loki/v3/pkg/memory"
|
|
)
|
|
|
|
// UTF8Scalar is a [Scalar] that holds a single [UTF8] value.
type UTF8Scalar struct {
	// Value holds the bytes of the scalar.
	Value []byte

	// Null is true when the scalar is null.
	Null bool
}
|
|
|
|
// Compile-time check that *UTF8Scalar implements Scalar.
var _ Scalar = (*UTF8Scalar)(nil)
|
|
|
|
// Kind implements [Datum] and returns [KindUTF8].
|
|
func (s *UTF8Scalar) Kind() Kind { return KindUTF8 }
|
|
|
|
// IsNull implements [Scalar] and returns s.Null.
|
|
func (s *UTF8Scalar) IsNull() bool { return s.Null }
|
|
|
|
func (s *UTF8Scalar) isDatum() {}
|
|
func (s *UTF8Scalar) isScalar() {}
|
|
|
|
// UTF8 is an [Array] of UTF-8 encoded strings.
|
|
type UTF8 struct {
|
|
validity memory.Bitmap // Empty when there's no nulls.
|
|
offsets []int32
|
|
data []byte
|
|
length int
|
|
nullCount int
|
|
}
|
|
|
|
// Compile-time check that *UTF8 implements Array.
var _ Array = (*UTF8)(nil)
|
|
|
|
// NewUTF8 creates a new UTF8 array from the given data, offsets, and
|
|
// optional validity bitmap.
|
|
//
|
|
// UTF8 arrays made from memory owned by a [memory.Allocator] are invalidated
|
|
// when the allocator reclaims memory.
|
|
//
|
|
// Each UTF-8 string goes from data[offsets[i] : offsets[i+1]]. Offsets must
|
|
// be monotonically increasing, even for null values. The offsets slice is not
|
|
// validated for correctness.
|
|
//
|
|
// If validity is of length zero, all elements are considered valid. Otherwise,
|
|
// NewUTF8 panics if the number of elements does not match the length of
|
|
// validity.
|
|
func NewUTF8(data []byte, offsets []int32, validity memory.Bitmap) *UTF8 {
|
|
arr := &UTF8{
|
|
validity: validity,
|
|
offsets: offsets,
|
|
data: data,
|
|
}
|
|
arr.init()
|
|
return arr
|
|
}
|
|
|
|
//go:noinline
|
|
func (arr *UTF8) init() {
|
|
// Moving initialization of additional fields to a non-inlined init method
|
|
// improved the performance of the plain bytes decoder in dataset by 10%.
|
|
|
|
numElements := max(0, len(arr.offsets)-1)
|
|
if arr.validity.Len() > 0 && arr.validity.Len() != numElements {
|
|
panic("length mismatch with validity")
|
|
}
|
|
|
|
arr.length = numElements
|
|
arr.nullCount = arr.validity.ClearCount()
|
|
}
|
|
|
|
// Len returns the total number of elements in the array.
|
|
func (arr *UTF8) Len() int { return arr.length }
|
|
|
|
// DataLen returns the total length of the data in the array.
|
|
func (arr *UTF8) DataLen() int {
|
|
if arr.length == 0 {
|
|
return 0
|
|
}
|
|
return int(arr.offsets[arr.length] - arr.offsets[0])
|
|
}
|
|
|
|
// Nulls returns the number of null elements in the array. The number of
|
|
// non-null elements can be calculated from Len() - Nulls().
|
|
func (arr *UTF8) Nulls() int { return arr.nullCount }
|
|
|
|
// Get returns the value at index i. If the element at index i is null, Get
|
|
// returns an empty string.
|
|
//
|
|
// Get panics if i is out of range.
|
|
func (arr *UTF8) Get(i int) []byte {
|
|
var (
|
|
start = arr.offsets[i]
|
|
end = arr.offsets[i+1]
|
|
)
|
|
return arr.data[start:end]
|
|
}
|
|
|
|
// IsNull returns true if the element at index i is null.
|
|
func (arr *UTF8) IsNull(i int) bool {
|
|
if arr.nullCount == 0 {
|
|
return false
|
|
}
|
|
return !arr.validity.Get(i)
|
|
}
|
|
|
|
// Data returns the underlying packed UTF8 bytes. If arr is a slice, Data may
|
|
// include bytes beyond the offset ranges from the original array.
|
|
func (arr *UTF8) Data() []byte { return arr.data }
|
|
|
|
// Offsets returns the underlying offsets array.
|
|
func (arr *UTF8) Offsets() []int32 { return arr.offsets }
|
|
|
|
// Size returns the size in bytes of the array's buffers.
|
|
func (arr *UTF8) Size() int {
|
|
var (
|
|
validitySize = arr.validity.Len() / 8
|
|
dataSize = len(arr.data)
|
|
offsetsSize = len(arr.offsets) * 4 // *4 for int32
|
|
)
|
|
return validitySize + dataSize + offsetsSize
|
|
}
|
|
|
|
// Validity returns the validity bitmap of the array. The returned bitmap
|
|
// may be of length 0 if there are no nulls.
|
|
//
|
|
// A value of 1 in the Validity bitmap indicates that the corresponding
|
|
// element at that position is valid (not null).
|
|
func (arr *UTF8) Validity() memory.Bitmap { return arr.validity }
|
|
|
|
// Kind returns the kind of Array being represented.
|
|
func (arr *UTF8) Kind() Kind { return KindUTF8 }
|
|
|
|
// Slice returns a slice of arr from i to j.
|
|
//
|
|
// A sliced UTF8 array creates a slice of the offsets and validity buffers, but
|
|
// not the data buffer, permitting the offsets to continue to be valid without
|
|
// needing to normalize them to start at 0.
|
|
//
|
|
// It is recommended to normalize the offsets and remove unused memory in the
|
|
// data buffer before serializing UTF8 for network communication.
|
|
func (arr *UTF8) Slice(i, j int) Array {
|
|
if i < 0 || j < i || j > arr.Len() {
|
|
panic(errorSliceBounds{i, j, arr.Len()})
|
|
}
|
|
|
|
// Unlike with all other Arrays, we only slice the arr.offsets buffer, which
|
|
// relative to the start of arr.data.
|
|
//
|
|
// Slicing both arr.offsets and arr.data would cause the offsets to be
|
|
// incorrect.
|
|
var (
|
|
validity = sliceValidity(arr.validity, i, j)
|
|
data = arr.data
|
|
offsets = arr.offsets[i : j+1]
|
|
)
|
|
return NewUTF8(data, offsets, validity)
|
|
}
|
|
|
|
func (arr *UTF8) isDatum() {}
|
|
func (arr *UTF8) isArray() {}
|
|
|
|
// A UTF8Builder assists with constructing a [UTF8] array. A UTF8Builder must be
|
|
// constructed by calling [NewUTF8Builder].
|
|
type UTF8Builder struct {
|
|
alloc *memory.Allocator
|
|
|
|
validity memory.Bitmap
|
|
offsets memory.Buffer[int32]
|
|
data memory.Buffer[byte]
|
|
|
|
lastOffset int32
|
|
}
|
|
|
|
// Compile-time check that *UTF8Builder implements Builder.
var _ Builder = (*UTF8Builder)(nil)
|
|
|
|
// NewUTF8Builder creates a new UTF8Builder for constructing a [UTF8] array.
|
|
func NewUTF8Builder(alloc *memory.Allocator) *UTF8Builder {
|
|
return &UTF8Builder{
|
|
alloc: alloc,
|
|
validity: memory.NewBitmap(alloc, 0),
|
|
offsets: memory.NewBuffer[int32](alloc, 0),
|
|
data: memory.NewBuffer[byte](alloc, 0),
|
|
}
|
|
}
|
|
|
|
// Grow increases b's capacity, if necessary, to guarantee space for another n
|
|
// elements. After Grow(n), at least n elements can be appended to b without
|
|
// another allocation. If n is negative or too large to allocate the memory,
|
|
// Grow panics.
|
|
func (b *UTF8Builder) Grow(n int) {
|
|
// b.offsets has an extra element for the starting offset.
|
|
if !b.needGrow(n + 1) {
|
|
return
|
|
}
|
|
|
|
b.validity.Grow(n)
|
|
b.offsets.Grow(n + 1)
|
|
}
|
|
|
|
func (b *UTF8Builder) needGrow(n int) bool {
|
|
return b.offsets.Len()+n > b.offsets.Cap()
|
|
}
|
|
|
|
// GrowData increases b's bytes capacity, if necessary, to guarantee space
|
|
// for another n bytes of data. After GrowData(n), at least n bytes can be
|
|
// appended to b (across all UTF8 values) without another allocation. If n is
|
|
// negative or too large to allocate the memory, GrowData panics.
|
|
//
|
|
// GrowData only impacts capacity of string data. Use [UTF8Builder.Grow] to
|
|
// reserve space for more elements.
|
|
func (b *UTF8Builder) GrowData(n int) {
|
|
if !b.needGrowData(n) {
|
|
return
|
|
}
|
|
b.data.Grow(n)
|
|
}
|
|
|
|
func (b *UTF8Builder) needGrowData(n int) bool {
|
|
return b.data.Len()+n > b.data.Cap()
|
|
}
|
|
|
|
// AppendNull adds a new null element to b.
|
|
func (b *UTF8Builder) AppendNull() {
|
|
if b.needGrow(1) {
|
|
b.Grow(1)
|
|
}
|
|
b.initOffsets()
|
|
|
|
b.validity.AppendUnsafe(false)
|
|
b.offsets.Push(b.lastOffset)
|
|
}
|
|
|
|
func (b *UTF8Builder) initOffsets() {
|
|
// For the first element, we need to push an initial offset of 0. All
|
|
// elements after that will push the offset where that string ends (the
|
|
// length).
|
|
if b.offsets.Len() == 0 {
|
|
b.offsets.Push(0)
|
|
}
|
|
}
|
|
|
|
// AppendNulls appends the given number of null elements to b.
|
|
func (b *UTF8Builder) AppendNulls(count int) {
|
|
if b.needGrow(count) {
|
|
b.Grow(count)
|
|
}
|
|
b.initOffsets()
|
|
|
|
b.validity.AppendCount(false, count)
|
|
b.offsets.AppendCount(b.lastOffset, count)
|
|
}
|
|
|
|
// AppendValue adds a new non-null element to b.
|
|
func (b *UTF8Builder) AppendValue(v []byte) {
|
|
dataSize := len(v)
|
|
|
|
if b.needGrow(1) {
|
|
b.Grow(1)
|
|
}
|
|
if b.needGrowData(dataSize) {
|
|
b.GrowData(dataSize)
|
|
}
|
|
b.initOffsets()
|
|
|
|
b.lastOffset += int32(dataSize)
|
|
|
|
// We can use unsafe appends here because we guarantee in the check above
|
|
// that there's enough capacity. This saves 40% of CPU time.
|
|
b.validity.AppendUnsafe(true)
|
|
b.data.Append(v...)
|
|
b.offsets.Push(b.lastOffset)
|
|
}
|
|
|
|
// BuildArray returns the constructed array. After calling Build, the builder
|
|
// is reset to an initial state.
|
|
func (b *UTF8Builder) BuildArray() Array { return b.Build() }
|
|
|
|
// Build returns the constructed [UTF8] array. After calling Build, the builder
|
|
// is reset to an initial state.
|
|
func (b *UTF8Builder) Build() *UTF8 {
|
|
// Move the original bitmaps to the constructed array, then reset the
|
|
// builder's bitmaps since they've been moved.
|
|
arr := NewUTF8(b.data.Data(), b.offsets.Data(), b.validity)
|
|
b.validity = memory.NewBitmap(b.alloc, 0)
|
|
b.offsets = memory.NewBuffer[int32](b.alloc, 0)
|
|
b.data = memory.NewBuffer[byte](b.alloc, 0)
|
|
b.lastOffset = 0
|
|
return arr
|
|
}
|