Fix i8/opaque pointer byte offset GEP scalarization in PrivateMemoryResolution

michalpaszkowski · igcbot · commit e8906d0679be · 2025-08-13T22:53:48.000+02:00
When LLVM IR uses opaque pointers or inserts a bitcast to i8*, a
subsequent GEP is expressed in bytes. The legacy handleGEPInst always
scalarized indices by starting from pGEP->getSourceElementType(). After
the i8* cast, the type is i8, so the algorithm mistakenly treated the
byte index as a count of elements, producing a mis-scaled (too large)
scalarized index.

Example:
%a = alloca [16 x [16 x float]], align 4
%b = bitcast [16 x [16 x float]]* %a to i8*
%c = getelementptr inbounds i8, i8* %b, i64 64

Here, 64 is a byte offset into the original aggregate. The old
implementation, seeing i8, scaled it as if it were 64 elements, not 64 bytes.

However, the meaningful base of the GEP is the alloca's aggregate type
[16 x [16 x float]], and the element index calculations should be based
on this type.

This change:
1. Introduces getFirstNonScalarSourceElementType(GEP), which
walks back from the GEP base through pointer casts to find a root
aggregate element type.
2. Adds additional handling in handleGEPInst, so that i8 GEP byte offset
is converted to an element index of the underlying base type.

This way the algorithm avoids basing element index scalarization on
incidental i8* and keeps index calculation aligned with the underlying
allocation layout.

For reference, in typed pointer mode (or without the bitcast), the GEP
would look like this:
%a = alloca [16 x [16 x float]], align 4
%c = getelementptr inbounds [16 x [16 x float]], [16 x [16 x float]]* %a, i64 0, i64 1

Here, %c is the pointer to the 2nd inner array [16 x float]*.
diff --git a/IGC/Compiler/CISACodeGen/LowerGEPForPrivMem.cpp b/IGC/Compiler/CISACodeGen/LowerGEPForPrivMem.cpp
@@ -1,6 +1,6 @@
 /*========================== begin_copyright_notice ============================
 
-Copyright (C) 2017-2024 Intel Corporation
+Copyright (C) 2017-2025 Intel Corporation
 
 SPDX-License-Identifier: MIT
 
@@ -824,6 +824,25 @@ std::pair<unsigned int, Type *> TransposeHelper::getArrSizeAndEltType(Type *T) {
   return std::make_pair(arr_sz, retTy);
 }
 
+// Picks the type that byte-offset GEP scalarization should be based on: the GEP source element type when
+// getArrSizeAndEltType reports a size above 1 for it, else the type found at the cast-stripped pointer base.
+Type *TransposeHelper::getFirstNonScalarSourceElementType(const GetElementPtrInst &GEP) {
+  Type *currTy = GEP.getSourceElementType();
+  if (getArrSizeAndEltType(currTy).first > 1)
+    return currTy;
+
+  const Value *base = GEP.getPointerOperand()->stripPointerCasts();
+  if (const auto *AI = dyn_cast<AllocaInst>(base))
+    return AI->getAllocatedType();
+  if (const auto *GV = dyn_cast<GlobalVariable>(base))
+    return GV->getValueType();
+  // NOTE(review): a load feeding a GEP yields a pointer type here rather than an aggregate - confirm intent.
+  if (const auto *LI = dyn_cast<LoadInst>(base))
+    return LI->getType();
+  // StoreInst is void-typed, so it can never be the pointer base; everything else falls back to the GEP type.
+  return currTy;
+}
+
 void TransposeHelper::handleGEPInst(llvm::GetElementPtrInst *pGEP, llvm::Value *idx) {
   // TODO: Add support for GEP attributes: nsw, nuw, inbounds. Currently, neigher the old nor the new algorithm handles
   // them.
@@ -841,13 +860,38 @@ void TransposeHelper::handleGEPInst(llvm::GetElementPtrInst *pGEP, llvm::Value *
     return;
   }
 
+  IRBuilder<> IRB(pGEP);
+  Value *pScalarizedIdx = IRB.getInt32(0);
+
+  // If the GEP is on i8, its index is a byte offset and must be converted to an element index of the underlying base
+  // type.
+  if (pGEP->getSourceElementType()->isIntegerTy(8)) {
+    // Get the non-scalar/aggregate GEP source element type.
+    Type *baseAggregateTy = getFirstNonScalarSourceElementType(*pGEP);
+    // Find the scalar element type at the bottom of the aggregate.
+    Type *elementTy = baseAggregateTy;
+    while (elementTy->isStructTy() || elementTy->isArrayTy() || elementTy->isVectorTy()) {
+      elementTy = getArrSizeAndEltType(elementTy).second;
+    }
+    elementTy = elementTy->getScalarType();
+    uint32_t elementBytes = (uint32_t)m_DL.getTypeAllocSize(elementTy);
+
+    // The 1st operand is the byte offset, convert bytes to element count.
+    Value *byteIndex = IRB.CreateZExtOrTrunc(pGEP->getOperand(1), IRB.getInt32Ty());
+    if (elementBytes > 1)
+      byteIndex = IRB.CreateUDiv(byteIndex, IRB.getInt32(elementBytes));
+
+    pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, byteIndex);
+    pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, idx);
+    HandleAllocaSources(pGEP, pScalarizedIdx);
+    return;
+  }
+
   // Given %p = getelementptr [4 x [3 x <2 x float>]]* %v, i64 0, i64 %1, i64 %2
   // compute the scalarized index with an auxiliary array [4, 3, 2]:
   //
   // Formula: index = (%1 x 3 + %2) x 2
   //
-  IRBuilder<> IRB(pGEP);
-  Value *pScalarizedIdx = IRB.getInt32(0);
   Type *T = pGEP->getSourceElementType();
   for (unsigned i = 0, e = pGEP->getNumIndices(); i < e; ++i) {
     // If T is VectorType we should be at the last loop iteration. This will break things only if m_vectorIndex == true.
diff --git a/IGC/Compiler/CISACodeGen/LowerGEPForPrivMem.hpp b/IGC/Compiler/CISACodeGen/LowerGEPForPrivMem.hpp
@@ -1,6 +1,6 @@
 /*========================== begin_copyright_notice ============================
 
-Copyright (C) 2017-2024 Intel Corporation
+Copyright (C) 2017-2025 Intel Corporation
 
 SPDX-License-Identifier: MIT
 
@@ -155,5 +155,6 @@ class TransposeHelper {
 private:
   bool m_vectorIndex;
   std::pair<unsigned int, llvm::Type *> getArrSizeAndEltType(llvm::Type *T);
+  llvm::Type *getFirstNonScalarSourceElementType(const llvm::GetElementPtrInst &GEP);
 };
 } // namespace IGC
diff --git a/IGC/Compiler/tests/PrivateMemoryResolution/i8_gep_byte_offset.ll b/IGC/Compiler/tests/PrivateMemoryResolution/i8_gep_byte_offset.ll
@@ -0,0 +1,27 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2025 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+
+; RUN: igc_opt --opaque-pointers --igc-private-mem-resolution --platformlnl -S %s | FileCheck %s
+
+; This test ensures GEP scalarization on i8*/opaque ptr offsets treats the index as bytes and converts to element index via recovered base type size.
+
+; CHECK-NOT: mul i32 64
+; CHECK: mul i32 16
+
+define spir_kernel void @test() {
+  %a = alloca [16 x [16 x float]], align 4
+  %b = getelementptr inbounds i8, ptr %a, i64 64 ; i8 GEP: the index 64 is a byte offset into %a
+  %c = getelementptr <8 x i32>, ptr %b, i32 0    ; zero-index GEP over a different source element type
+  %d = load <8 x i32>, ptr %c, align 4           ; load through the offset pointer
+  ret void
+}
+
+!igc.functions = !{!1}
+!1 = !{ptr @test, !2}
+!2 = !{!3}
+!3 = !{!"function_type", i32 0}