Commit bda3a3b
chore: Implement ColumnCompat pipeline stage
Signed-off-by: Christian Haudum <[email protected]>
1 parent 6a34a9d commit bda3a3b

File tree: 6 files changed, +875 −13 lines changed

Lines changed: 230 additions & 0 deletions

@@ -0,0 +1,230 @@
package executor

import (
	"cmp"
	"context"
	"slices"

	_ "github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/array"
	"github.com/apache/arrow-go/v18/arrow/memory"

	"github.com/grafana/loki/v3/pkg/engine/internal/planner/physical"
	"github.com/grafana/loki/v3/pkg/engine/internal/semconv"
)

func newColumnCompatibilityPipeline(compat *physical.ColumnCompat, input Pipeline) Pipeline {
	const extracted = "_extracted"

	return newGenericPipeline(Local, func(ctx context.Context, inputs []Pipeline) state {
		input := inputs[0]
		batch, err := input.Read(ctx)
		if err != nil {
			return failureState(err)
		}

		// Return early if the batch has zero rows, even if column names would collide.
		if batch.NumRows() == 0 {
			return successState(batch)
		}

		// First, find all fields in the schema that have colliding names,
		// based on the collision column type and the source column type.
		var (
			collisionFieldIndices []int
			collisionFieldNames   []string
			sourceFieldIndices    []int
			sourceFieldNames      []string
		)

		schema := batch.Schema()
		for idx := range schema.NumFields() {
			ident, err := semconv.ParseFQN(schema.Field(idx).Name)
			if err != nil {
				return failureState(err)
			}
			switch ident.ColumnType() {
			case compat.Collision:
				collisionFieldIndices = append(collisionFieldIndices, idx)
				collisionFieldNames = append(collisionFieldNames, ident.ShortName())
			case compat.Source:
				sourceFieldIndices = append(sourceFieldIndices, idx)
				sourceFieldNames = append(sourceFieldNames, ident.ShortName())
			}
		}

		duplicates := findDuplicates(collisionFieldNames, sourceFieldNames)

		// Return early if there are no colliding column names.
		if len(duplicates) == 0 {
			return successState(batch)
		}

		// Release batch, since it is not passed to the caller.
		defer batch.Release()

		// Next, update the schema with the new columns that have the _extracted suffix.
		newSchema := batch.Schema()
		duplicateCols := make([]duplicateColumn, 0, len(duplicates))
		r := int(batch.NumCols())
		for i, duplicate := range duplicates {
			collisionFieldIdx := collisionFieldIndices[duplicate.s1Idx]
			sourceFieldIdx := sourceFieldIndices[duplicate.s2Idx]

			sourceField := newSchema.Field(sourceFieldIdx)
			sourceIdent, err := semconv.ParseFQN(sourceField.Name)
			if err != nil {
				return failureState(err)
			}

			destinationIdent := semconv.NewIdentifier(sourceIdent.ShortName()+extracted, compat.Destination, sourceIdent.DataType())
			newSchema, err = newSchema.AddField(len(newSchema.Fields()), semconv.FieldFromIdent(destinationIdent, true))
			if err != nil {
				return failureState(err)
			}

			duplicateCols = append(duplicateCols, duplicateColumn{
				name:           duplicate.value,
				collisionIdx:   collisionFieldIdx,
				sourceIdx:      sourceFieldIdx,
				destinationIdx: r + i,
			})
		}

		// Create a new builder with the updated schema.
		builder := array.NewRecordBuilder(memory.DefaultAllocator, newSchema)
		builder.Reserve(int(batch.NumRows()))
		defer builder.Release()

		// Now, go through all fields of the old schema and append the rows to the new builder.
		for idx := range schema.NumFields() {
			col := batch.Column(idx)
			fieldBuilder := builder.Field(idx)

			duplicateIdx := slices.IndexFunc(duplicateCols, func(d duplicateColumn) bool { return d.sourceIdx == idx })

			// If not a colliding column, just copy over the column data of the original record.
			// I could not find a "batch copy" function, which I guess would be much more efficient.
			//
			// TODO(chaudum): Simplify
			if duplicateIdx < 0 {
				switch b := fieldBuilder.(type) {
				case *array.StringBuilder:
					for i := range int(batch.NumRows()) {
						if col.IsNull(i) || !col.IsValid(i) {
							b.AppendNull()
						} else {
							v := col.(*array.String).Value(i)
							b.Append(v)
						}
					}
				case *array.TimestampBuilder:
					for i := range int(batch.NumRows()) {
						if col.IsNull(i) || !col.IsValid(i) {
							b.AppendNull()
						} else {
							v := col.(*array.Timestamp).Value(i)
							b.Append(v)
						}
					}
				case *array.Float64Builder:
					for i := range int(batch.NumRows()) {
						if col.IsNull(i) || !col.IsValid(i) {
							b.AppendNull()
						} else {
							v := col.(*array.Float64).Value(i)
							b.Append(v)
						}
					}
				case *array.Int64Builder:
					for i := range int(batch.NumRows()) {
						if col.IsNull(i) || !col.IsValid(i) {
							b.AppendNull()
						} else {
							v := col.(*array.Int64).Value(i)
							b.Append(v)
						}
					}
				}
				continue
			}

			// If the currently processed column is the source field for a colliding column,
			// then write non-null values from the source column into the destination column.
			// Also, "clear" the original column value by writing a NULL instead of the original value.
			duplicate := duplicateCols[duplicateIdx]
			collisionCol := batch.Column(duplicate.collisionIdx)

			switch b := fieldBuilder.(type) {
			case *array.StringBuilder:
				newFieldBuilder := builder.Field(duplicate.destinationIdx).(*array.StringBuilder)
				for i := range int(batch.NumRows()) {
					if (col.IsNull(i) || !col.IsValid(i)) || (collisionCol.IsNull(i) || !collisionCol.IsValid(i)) {
						b.AppendNull()               // append NULL to original column
						newFieldBuilder.AppendNull() // append NULL to _extracted column
					} else {
						b.AppendNull() // append NULL to original column
						v := col.(*array.String).Value(i)
						newFieldBuilder.Append(v) // append value to _extracted column
					}
				}
			default:
				panic("invalid column type: only string columns can be checked for collisions")
			}
		}

		return successState(builder.NewRecord())
	}, input)
}
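
// For intuition (an illustration added for this write-up, not part of the
// commit): assuming a batch where a collision-type column and a source-type
// column share the short name "level", a single row is rewritten as follows:
//
//	before: label.level = "info", parsed.level = "debug"
//	after:  label.level = "info", parsed.level = NULL, parsed.level_extracted = "debug"
//
// The FQN spellings above are placeholders; the real names come from semconv.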

// duplicate holds the indexes of a value that appears in both slices.
type duplicate struct {
	value        string
	s1Idx, s2Idx int
}

// findDuplicates finds strings that appear in both slices and returns
// their indexes in each slice.
// The function assumes that the elements within each slice are unique.
func findDuplicates(s1, s2 []string) []duplicate {
	if len(s1) == 0 || len(s2) == 0 {
		return nil
	}

	set1 := make(map[string]int)
	for i, v := range s1 {
		set1[v] = i
	}

	set2 := make(map[string]int)
	for i, v := range s2 {
		set2[v] = i
	}

	// Find values that exist in both slices.
	var duplicates []duplicate
	for value, s1Idx := range set1 {
		if s2Idx, exists := set2[value]; exists {
			duplicates = append(duplicates, duplicate{
				value: value,
				s1Idx: s1Idx,
				s2Idx: s2Idx,
			})
		}
	}

	// Sort for deterministic output, since map iteration order is random.
	slices.SortStableFunc(duplicates, func(a, b duplicate) int { return cmp.Compare(a.value, b.value) })
	return duplicates
}
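
// Illustrative usage (added for this write-up, not part of the commit):
//
//	dups := findDuplicates(
//		[]string{"level", "pod"},    // collision-side short names (s1)
//		[]string{"caller", "level"}, // source-side short names (s2)
//	)
//	// dups == [{value: "level", s1Idx: 0, s2Idx: 1}]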

// duplicateColumn holds indexes to fields/columns in an [*arrow.Schema].
type duplicateColumn struct {
	// name is the duplicate column name
	name string
	// collisionIdx is the index of the collision column
	collisionIdx int
	// sourceIdx is the index of the source column
	sourceIdx int
	// destinationIdx is the index of the destination column
	destinationIdx int
}
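
For context, a minimal sketch of how this stage might be constructed from its physical plan node. The ColumnCompat fields match the ones the function reads above; the column-type constants and their package are assumptions for illustration, not taken from this commit.

    // Hypothetical wiring; the column-type constants are assumed, not from this diff.
    compat := &physical.ColumnCompat{
    	Collision:   types.ColumnTypeLabel,  // columns that keep their name on collision
    	Source:      types.ColumnTypeParsed, // columns whose values are moved away
    	Destination: types.ColumnTypeParsed, // column type of the new *_extracted columns
    }
    pipeline := newColumnCompatibilityPipeline(compat, input)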
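
And a self-contained sketch of the core record rewrite, using the same arrow-go APIs the stage relies on (Schema.AddField, array.NewRecordBuilder). The column names are placeholders standing in for semconv FQNs; this only demonstrates the move-and-null mechanics, not the real stage.

    package main

    import (
    	"fmt"

    	"github.com/apache/arrow-go/v18/arrow"
    	"github.com/apache/arrow-go/v18/arrow/array"
    	"github.com/apache/arrow-go/v18/arrow/memory"
    )

    func main() {
    	// A schema where a parsed field collides with a label
    	// (names are placeholders, not real semconv FQNs).
    	in := arrow.NewSchema([]arrow.Field{
    		{Name: "label.level", Type: arrow.BinaryTypes.String, Nullable: true},
    		{Name: "parsed.level", Type: arrow.BinaryTypes.String, Nullable: true},
    	}, nil)

    	// Extend the schema with the destination column, as the stage does.
    	out, err := in.AddField(in.NumFields(), arrow.Field{
    		Name: "parsed.level_extracted", Type: arrow.BinaryTypes.String, Nullable: true,
    	})
    	if err != nil {
    		panic(err)
    	}

    	b := array.NewRecordBuilder(memory.DefaultAllocator, out)
    	defer b.Release()

    	b.Field(0).(*array.StringBuilder).Append("info")  // collision column is copied as-is
    	b.Field(1).(*array.StringBuilder).AppendNull()    // source cell is cleared to NULL
    	b.Field(2).(*array.StringBuilder).Append("debug") // value moves to the _extracted column

    	rec := b.NewRecord()
    	defer rec.Release()
    	fmt.Println(rec)
    }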
