| From a3028239a75157d517baf17d58a1f1e3ee1c24cb Mon Sep 17 00:00:00 2001 |
| From: Christudasan Devadasan <Christudasan.Devadasan@amd.com> |
| Date: Wed, 21 Dec 2022 16:11:30 +0530 |
| Subject: [PATCH] Revert "[AMDGPU][SILowerSGPRSpills] Spill SGPRs to virtual |
| VGPRs" |
| |
| This reverts commit 40ba0942e2ab1107f83aa5a0ee5ae2980bf47b1a. |
| --- |
| .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3 - |
| llvm/lib/Target/AMDGPU/SIDefines.h | 7 - |
| llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 45 +- |
| llvm/lib/Target/AMDGPU/SIFrameLowering.h | 4 +- |
| llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 - |
| llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 58 +- |
| llvm/lib/Target/AMDGPU/SIInstrInfo.h | 5 - |
| llvm/lib/Target/AMDGPU/SIInstructions.td | 2 - |
| llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 108 +- |
| .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 32 +- |
| .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 41 +- |
| llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 64 +- |
| .../CodeGen/AMDGPU/GlobalISel/assert-align.ll | 1 - |
| .../GlobalISel/call-outgoing-stack-args.ll | 8 +- |
| .../GlobalISel/image-waterfall-loop-O0.ll | 209 ++- |
| .../CodeGen/AMDGPU/GlobalISel/localizer.ll | 3 +- |
| .../abi-attribute-hints-undefined-behavior.ll | 1 - |
| llvm/test/CodeGen/AMDGPU/bf16.ll | 156 +-- |
| .../test/CodeGen/AMDGPU/branch-relax-spill.ll | 8 +- |
| .../AMDGPU/call-alias-register-usage-agpr.ll | 6 +- |
| .../AMDGPU/call-alias-register-usage1.ll | 2 +- |
| .../test/CodeGen/AMDGPU/callee-frame-setup.ll | 31 +- |
| .../CodeGen/AMDGPU/cf-loop-on-constant.ll | 91 -- |
| llvm/test/CodeGen/AMDGPU/collapse-endcf.ll | 108 +- |
| .../AMDGPU/control-flow-fastregalloc.ll | 11 +- |
| .../AMDGPU/cross-block-use-is-not-abi-copy.ll | 4 - |
| .../AMDGPU/csr-sgpr-spill-live-ins.mir | 9 +- |
| .../AMDGPU/dwarf-multi-register-use-crash.ll | 77 +- |
| .../fix-frame-reg-in-custom-csr-spills.ll | 1 - |
| llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll | 80 +- |
| .../CodeGen/AMDGPU/fold-reload-into-exec.mir | 58 +- |
| .../CodeGen/AMDGPU/fold-reload-into-m0.mir | 16 +- |
| ...frame-setup-without-sgpr-to-vgpr-spills.ll | 1 - |
| .../CodeGen/AMDGPU/gfx-call-non-gfx-func.ll | 14 +- |
| .../AMDGPU/gfx-callable-argument-types.ll | 1180 ++++++----------- |
| .../gfx-callable-preserved-registers.ll | 286 ++-- |
| .../AMDGPU/gfx-callable-return-types.ll | 41 +- |
| llvm/test/CodeGen/AMDGPU/indirect-call.ll | 324 +++-- |
| .../kernel-vgpr-spill-mubuf-with-voffset.ll | 28 +- |
| llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 296 ++--- |
| .../CodeGen/AMDGPU/mubuf-legalize-operands.ll | 29 +- |
| .../CodeGen/AMDGPU/mul24-pass-ordering.ll | 47 +- |
| .../AMDGPU/need-fp-from-vgpr-spills.ll | 68 +- |
| .../AMDGPU/no-source-locations-in-prologue.ll | 38 +- |
| .../AMDGPU/partial-sgpr-to-vgpr-spills.ll | 1053 +++++++-------- |
| .../scc-clobbered-sgpr-to-vmem-spill.ll | 389 +----- |
| .../sgpr-spill-dead-frame-in-dbg-value.mir | 26 +- |
| ...fi-skip-processing-stack-arg-dbg-value.mir | 4 +- |
| .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll | 290 ++-- |
| .../AMDGPU/sgpr-spill-partially-undef.mir | 14 +- |
| .../sgpr-spill-update-only-slot-indexes.ll | 9 +- |
| .../AMDGPU/sgpr-spill-vmem-large-frame.mir | 4 +- |
| .../AMDGPU/sgpr-spills-split-regalloc.ll | 177 ++- |
| .../CodeGen/AMDGPU/si-spill-sgpr-stack.ll | 10 +- |
| llvm/test/CodeGen/AMDGPU/sibling-call.ll | 8 +- |
| .../AMDGPU/spill-csr-frame-ptr-reg-copy.ll | 18 +- |
| .../AMDGPU/spill-offset-calculation.ll | 19 +- |
| .../AMDGPU/spill-reg-tuple-super-reg-use.mir | 36 +- |
| .../CodeGen/AMDGPU/spill-scavenge-offset.ll | 478 ++++--- |
| .../AMDGPU/spill-sgpr-csr-live-ins.mir | 5 +- |
| .../AMDGPU/spill-sgpr-stack-no-sgpr.ll | 45 +- |
| .../AMDGPU/spill-sgpr-to-virtual-vgpr.mir | 320 ----- |
| .../spill-vgpr-to-agpr-update-regscavenger.ll | 85 -- |
| .../CodeGen/AMDGPU/spill-writelane-vgprs.ll | 9 +- |
| llvm/test/CodeGen/AMDGPU/spill192.mir | 29 +- |
| llvm/test/CodeGen/AMDGPU/spill224.mir | 33 +- |
| llvm/test/CodeGen/AMDGPU/spill288.mir | 41 +- |
| llvm/test/CodeGen/AMDGPU/spill320.mir | 45 +- |
| llvm/test/CodeGen/AMDGPU/spill352.mir | 49 +- |
| llvm/test/CodeGen/AMDGPU/spill384.mir | 53 +- |
| .../CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll | 3 +- |
| .../AMDGPU/tuple-allocation-failure.ll | 224 ++-- |
| .../AMDGPU/unstructured-cfg-def-use-issue.ll | 163 ++- |
| .../CodeGen/AMDGPU/vgpr-tuple-allocation.ll | 246 ++-- |
| .../CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll | 5 +- |
| .../wwm-register-spill-during-regalloc.ll | 166 --- |
| .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 697 +++++----- |
| .../AMDGPU/machine-function-info-after-pei.ll | 1 - |
| .../AMDGPU/machine-function-info-no-ir.mir | 29 - |
| .../MIR/AMDGPU/machine-function-info.ll | 4 - |
| .../AMDGPU/sgpr-for-exec-copy-invalid-reg.mir | 12 - |
| .../CodeGen/MIR/AMDGPU/stack-id-assert.mir | 2 +- |
| 82 files changed, 3235 insertions(+), 5175 deletions(-) |
| delete mode 100644 llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir |
| delete mode 100644 llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll |
| delete mode 100644 llvm/test/CodeGen/AMDGPU/wwm-register-spill-during-regalloc.ll |
| delete mode 100644 llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir |
| |
| diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |
| index 5300ce94462d..7bda7eda6822 100644 |
| --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |
| +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |
| @@ -1450,9 +1450,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo( |
| if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy)) |
| return true; |
| |
| - if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy)) |
| - return true; |
| - |
| auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) { |
| // Create a diagnostic for a the register string literal. |
| const MemoryBuffer &Buffer = |
| diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h |
| index 10cdad30913d..1325afb68ca9 100644 |
| --- a/llvm/lib/Target/AMDGPU/SIDefines.h |
| +++ b/llvm/lib/Target/AMDGPU/SIDefines.h |
| @@ -909,13 +909,6 @@ enum Offset_COV5 : unsigned { |
| }; |
| |
| } // namespace ImplicitArg |
| - |
| -namespace VirtRegFlag { |
| -// Virtual Register Flags. |
| -enum Register_Flag : uint8_t { WWM_REG = 0 }; |
| - |
| -} // namespace VirtRegFlag |
| - |
| } // namespace AMDGPU |
| |
| #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 |
| diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp |
| index 1d7d0dfd9a94..c2bc95930272 100644 |
| --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp |
| +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp |
| @@ -66,8 +66,7 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, |
| |
| static void getVGPRSpillLaneOrTempRegister( |
| MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR, |
| - const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass, |
| - bool IncludeScratchCopy = true) { |
| + const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass) { |
| SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
| |
| @@ -78,12 +77,9 @@ static void getVGPRSpillLaneOrTempRegister( |
| |
| // We need to save and restore the given SGPR. |
| |
| - Register ScratchSGPR; |
| // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs |
| - // should have all the callee saved registers marked as used. For certain |
| - // cases we skip copy to scratch SGPR. |
| - if (IncludeScratchCopy) |
| - ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); |
| + // should have all the callee saved registers marked as used. |
| + Register ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC); |
| |
| if (!ScratchSGPR) { |
| int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr, |
| @@ -1354,8 +1350,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( |
| TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg(); |
| if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, |
| TRI->isAGPR(MRI, VReg))) { |
| - RS->enterBasicBlockEnd(MBB); |
| - RS->backward(MI); |
| + // FIXME: change to enterBasicBlockEnd() |
| + RS->enterBasicBlock(MBB); |
| TRI->eliminateFrameIndex(MI, 0, FIOp, RS); |
| SpillFIs.set(FI); |
| continue; |
| @@ -1452,10 +1448,8 @@ void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced( |
| // The special SGPR spills like the one needed for FP, BP or any reserved |
| // registers delayed until frame lowering. |
| void SIFrameLowering::determinePrologEpilogSGPRSaves( |
| - MachineFunction &MF, BitVector &SavedVGPRs, |
| - bool NeedExecCopyReservedReg) const { |
| + MachineFunction &MF, BitVector &SavedVGPRs) const { |
| MachineFrameInfo &FrameInfo = MF.getFrameInfo(); |
| - MachineRegisterInfo &MRI = MF.getRegInfo(); |
| SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); |
| const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
| @@ -1467,27 +1461,6 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves( |
| for (unsigned I = 0; CSRegs[I]; ++I) |
| LiveRegs.addReg(CSRegs[I]); |
| |
| - if (NeedExecCopyReservedReg) { |
| - Register ReservedReg = MFI->getSGPRForEXECCopy(); |
| - assert(ReservedReg && "Should have reserved an SGPR for EXEC copy."); |
| - const TargetRegisterClass &RC = ST.isWave32() |
| - ? AMDGPU::SReg_32_XM0_XEXECRegClass |
| - : AMDGPU::SGPR_64RegClass; |
| - Register UnusedScratchReg = findUnusedRegister(MRI, LiveRegs, RC); |
| - if (UnusedScratchReg) { |
| - // If found any unused scratch SGPR, reserve the register itself for Exec |
| - // copy and there is no need for any spills in that case. |
| - MFI->setSGPRForEXECCopy(UnusedScratchReg); |
| - LiveRegs.addReg(UnusedScratchReg); |
| - } else { |
| - // Needs spill. |
| - assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) && |
| - "Re-reserving spill slot for EXEC copy register"); |
| - getVGPRSpillLaneOrTempRegister(MF, LiveRegs, ReservedReg, RC, |
| - /* IncludeScratchCopy */ false); |
| - } |
| - } |
| - |
| // hasFP only knows about stack objects that already exist. We're now |
| // determining the stack slots that will be created, so we have to predict |
| // them. Stack objects force FP usage with calls. |
| @@ -1526,8 +1499,6 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, |
| |
| const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
| - const SIInstrInfo *TII = ST.getInstrInfo(); |
| - bool NeedExecCopyReservedReg = false; |
| |
| for (MachineBasicBlock &MBB : MF) { |
| for (MachineInstr &MI : MBB) { |
| @@ -1545,8 +1516,6 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, |
| MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg()); |
| else if (MI.getOpcode() == AMDGPU::V_READLANE_B32) |
| MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg()); |
| - else if (TII->isWWMRegSpillOpcode(MI.getOpcode())) |
| - NeedExecCopyReservedReg = true; |
| } |
| } |
| |
| @@ -1559,7 +1528,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, |
| if (!ST.hasGFX90AInsts()) |
| SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask()); |
| |
| - determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg); |
| + determinePrologEpilogSGPRSaves(MF, SavedVGPRs); |
| |
| // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't |
| // allow the default insertion to handle them. |
| diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h |
| index 0060fc0be431..def07dc4b1f7 100644 |
| --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h |
| +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h |
| @@ -34,8 +34,8 @@ public: |
| RegScavenger *RS = nullptr) const override; |
| void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs, |
| RegScavenger *RS = nullptr) const; |
| - void determinePrologEpilogSGPRSaves(MachineFunction &MF, BitVector &SavedRegs, |
| - bool NeedExecCopyReservedReg) const; |
| + void determinePrologEpilogSGPRSaves(MachineFunction &MF, |
| + BitVector &SavedRegs) const; |
| void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator MBBI, DebugLoc &DL, |
| LivePhysRegs &LiveRegs, Register FrameReg, |
| diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp |
| index 76fd98a174b6..8a627fb79a0e 100644 |
| --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp |
| +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp |
| @@ -12589,14 +12589,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { |
| } |
| } |
| |
| - // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling. |
| - unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); |
| - Register SReg = |
| - ST.isWave32() |
| - ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1) |
| - : AMDGPU::SGPR_64RegClass.getRegister((MaxNumSGPRs / 2) - 1); |
| - Info->setSGPRForEXECCopy(SReg); |
| - |
| TargetLoweringBase::finalizeLowering(MF); |
| } |
| |
| diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp |
| index b29fa1ae7718..492f06c97a86 100644 |
| --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp |
| +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp |
| @@ -1567,28 +1567,6 @@ static unsigned getAVSpillSaveOpcode(unsigned Size) { |
| } |
| } |
| |
| -static unsigned getWWMRegSpillSaveOpcode(unsigned Size) { |
| - // Currently, there is only 32-bit WWM register spills needed. |
| - if (Size != 4) |
| - llvm_unreachable("unknown wwm register spill size"); |
| - |
| - return AMDGPU::SI_SPILL_WWM_V32_SAVE; |
| -} |
| - |
| -static unsigned getVectorRegSpillSaveOpcode(Register Reg, |
| - const TargetRegisterClass *RC, |
| - unsigned Size, |
| - const SIRegisterInfo &TRI, |
| - const SIMachineFunctionInfo &MFI) { |
| - // Choose the right opcode if spilling a WWM register. |
| - if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) |
| - return getWWMRegSpillSaveOpcode(Size); |
| - |
| - return TRI.isVectorSuperClass(RC) ? getAVSpillSaveOpcode(Size) |
| - : TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size) |
| - : getVGPRSpillSaveOpcode(Size); |
| -} |
| - |
| void SIInstrInfo::storeRegToStackSlot( |
| MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, |
| bool isKill, int FrameIndex, const TargetRegisterClass *RC, |
| @@ -1633,8 +1611,11 @@ void SIInstrInfo::storeRegToStackSlot( |
| return; |
| } |
| |
| - unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, |
| - SpillSize, RI, *MFI); |
| + unsigned Opcode = RI.isVectorSuperClass(RC) |
| + ? getAVSpillSaveOpcode(SpillSize) |
| + : RI.isAGPRClass(RC) |
| + ? getAGPRSpillSaveOpcode(SpillSize) |
| + : getVGPRSpillSaveOpcode(SpillSize); |
| MFI->setHasSpilledVGPRs(); |
| |
| BuildMI(MBB, MI, DL, get(Opcode)) |
| @@ -1785,27 +1766,6 @@ static unsigned getAVSpillRestoreOpcode(unsigned Size) { |
| } |
| } |
| |
| -static unsigned getWWMRegSpillRestoreOpcode(unsigned Size) { |
| - // Currently, there is only 32-bit WWM register spills needed. |
| - if (Size != 4) |
| - llvm_unreachable("unknown wwm register spill size"); |
| - |
| - return AMDGPU::SI_SPILL_WWM_V32_RESTORE; |
| -} |
| - |
| -static unsigned |
| -getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, |
| - unsigned Size, const SIRegisterInfo &TRI, |
| - const SIMachineFunctionInfo &MFI) { |
| - // Choose the right opcode if restoring a WWM register. |
| - if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) |
| - return getWWMRegSpillRestoreOpcode(Size); |
| - |
| - return TRI.isVectorSuperClass(RC) ? getAVSpillRestoreOpcode(Size) |
| - : TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size) |
| - : getVGPRSpillRestoreOpcode(Size); |
| -} |
| - |
| void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, |
| MachineBasicBlock::iterator MI, |
| Register DestReg, int FrameIndex, |
| @@ -1849,9 +1809,11 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, |
| return; |
| } |
| |
| - unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC, |
| - SpillSize, RI, *MFI); |
| - |
| + unsigned Opcode = RI.isVectorSuperClass(RC) |
| + ? getAVSpillRestoreOpcode(SpillSize) |
| + : RI.isAGPRClass(RC) |
| + ? getAGPRSpillRestoreOpcode(SpillSize) |
| + : getVGPRSpillRestoreOpcode(SpillSize); |
| BuildMI(MBB, MI, DL, get(Opcode), DestReg) |
| .addFrameIndex(FrameIndex) // vaddr |
| .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset |
| diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h |
| index bc0f44d6ac0f..4782b3a7bc20 100644 |
| --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h |
| +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h |
| @@ -627,11 +627,6 @@ public: |
| return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill; |
| } |
| |
| - static bool isWWMRegSpillOpcode(uint16_t Opcode) { |
| - return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE || |
| - Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE; |
| - } |
| - |
| static bool isDPP(const MachineInstr &MI) { |
| return MI.getDesc().TSFlags & SIInstrFlags::DPP; |
| } |
| diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td |
| index 20722f97323f..24384aeea21f 100644 |
| --- a/llvm/lib/Target/AMDGPU/SIInstructions.td |
| +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td |
| @@ -890,8 +890,6 @@ defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>; |
| defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>; |
| defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>; |
| |
| -defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>; |
| - |
| def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < |
| (outs SReg_64:$dst), |
| (ins si_ga:$ptr_lo, si_ga:$ptr_hi), |
| diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp |
| index bb81c2556bfd..3450a9f0681f 100644 |
| --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp |
| +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp |
| @@ -20,7 +20,6 @@ |
| #include "MCTargetDesc/AMDGPUMCTargetDesc.h" |
| #include "SIMachineFunctionInfo.h" |
| #include "llvm/CodeGen/LiveIntervals.h" |
| -#include "llvm/CodeGen/MachineDominators.h" |
| #include "llvm/CodeGen/MachineFrameInfo.h" |
| #include "llvm/CodeGen/RegisterScavenging.h" |
| #include "llvm/InitializePasses.h" |
| @@ -39,7 +38,6 @@ private: |
| const SIInstrInfo *TII = nullptr; |
| LiveIntervals *LIS = nullptr; |
| SlotIndexes *Indexes = nullptr; |
| - MachineDominatorTree *MDT = nullptr; |
| |
| // Save and Restore blocks of the current function. Typically there is a |
| // single save block, unless Windows EH funclets are involved. |
| @@ -53,23 +51,13 @@ public: |
| |
| void calculateSaveRestoreBlocks(MachineFunction &MF); |
| bool spillCalleeSavedRegs(MachineFunction &MF); |
| - void updateLaneVGPRDomInstr( |
| - int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt, |
| - DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr); |
| |
| bool runOnMachineFunction(MachineFunction &MF) override; |
| |
| void getAnalysisUsage(AnalysisUsage &AU) const override { |
| - AU.addRequired<MachineDominatorTree>(); |
| AU.setPreservesAll(); |
| MachineFunctionPass::getAnalysisUsage(AU); |
| } |
| - |
| - MachineFunctionProperties getClearedProperties() const override { |
| - return MachineFunctionProperties() |
| - .set(MachineFunctionProperties::Property::IsSSA) |
| - .set(MachineFunctionProperties::Property::NoVRegs); |
| - } |
| }; |
| |
| } // end anonymous namespace |
| @@ -80,7 +68,6 @@ INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE, |
| "SI lower SGPR spill instructions", false, false) |
| INITIALIZE_PASS_DEPENDENCY(LiveIntervals) |
| INITIALIZE_PASS_DEPENDENCY(VirtRegMap) |
| -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) |
| INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE, |
| "SI lower SGPR spill instructions", false, false) |
| |
| @@ -261,55 +248,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { |
| return false; |
| } |
| |
| -void SILowerSGPRSpills::updateLaneVGPRDomInstr( |
| - int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt, |
| - DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr) { |
| - // For the Def of a virtual LaneVPGR to dominate all its uses, we should |
| - // insert an IMPLICIT_DEF before the dominating spill. Switching to a |
| - // depth first order doesn't really help since the machine function can be in |
| - // the unstructured control flow post-SSA. For each virtual register, hence |
| - // finding the common dominator to get either the dominating spill or a block |
| - // dominating all spills. Is there a better way to handle it? |
| - SIMachineFunctionInfo *FuncInfo = |
| - MBB->getParent()->getInfo<SIMachineFunctionInfo>(); |
| - ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills = |
| - FuncInfo->getSGPRSpillToVGPRLanes(FI); |
| - Register PrevLaneVGPR; |
| - for (auto &Spill : VGPRSpills) { |
| - if (PrevLaneVGPR == Spill.VGPR) |
| - continue; |
| - |
| - PrevLaneVGPR = Spill.VGPR; |
| - auto I = LaneVGPRDomInstr.find(Spill.VGPR); |
| - if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) { |
| - // Initially add the spill instruction itself for Insertion point. |
| - LaneVGPRDomInstr[Spill.VGPR] = InsertPt; |
| - } else { |
| - assert(I != LaneVGPRDomInstr.end()); |
| - auto PrevInsertPt = I->second; |
| - MachineBasicBlock *DomMBB = PrevInsertPt->getParent(); |
| - if (DomMBB == MBB) { |
| - // The insertion point earlier selected in a predecessor block whose |
| - // spills are currently being lowered. The earlier InsertPt would be |
| - // the one just before the block terminator and it should be changed |
| - // if we insert any new spill in it. |
| - if (MDT->dominates(&*InsertPt, &*PrevInsertPt)) |
| - I->second = InsertPt; |
| - |
| - continue; |
| - } |
| - |
| - // Find the common dominator block between PrevInsertPt and the |
| - // current spill. |
| - DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB); |
| - if (DomMBB == MBB) |
| - I->second = InsertPt; |
| - else if (DomMBB != PrevInsertPt->getParent()) |
| - I->second = &(*DomMBB->getFirstTerminator()); |
| - } |
| - } |
| -} |
| - |
| bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { |
| const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| TII = ST.getInstrInfo(); |
| @@ -317,7 +255,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { |
| |
| LIS = getAnalysisIfAvailable<LiveIntervals>(); |
| Indexes = getAnalysisIfAvailable<SlotIndexes>(); |
| - MDT = &getAnalysis<MachineDominatorTree>(); |
| |
| assert(SaveBlocks.empty() && RestoreBlocks.empty()); |
| |
| @@ -327,6 +264,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { |
| bool HasCSRs = spillCalleeSavedRegs(MF); |
| |
| MachineFrameInfo &MFI = MF.getFrameInfo(); |
| + MachineRegisterInfo &MRI = MF.getRegInfo(); |
| SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); |
| |
| if (!MFI.hasStackObjects() && !HasCSRs) { |
| @@ -336,6 +274,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { |
| } |
| |
| bool MadeChange = false; |
| + bool NewReservedRegs = false; |
| |
| // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be |
| // handled as SpilledToReg in regular PrologEpilogInserter. |
| @@ -351,9 +290,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { |
| // To track the spill frame indices handled in this pass. |
| BitVector SpillFIs(MFI.getObjectIndexEnd(), false); |
| |
| - // To track the IMPLICIT_DEF insertion point for the lane vgprs. |
| - DenseMap<Register, MachineBasicBlock::iterator> LaneVGPRDomInstr; |
| - |
| for (MachineBasicBlock &MBB : MF) { |
| for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) { |
| if (!TII->isSGPRSpill(MI)) |
| @@ -361,32 +297,23 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { |
| |
| int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); |
| assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); |
| - MachineInstrSpan MIS(&MI, &MBB); |
| if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { |
| + NewReservedRegs = true; |
| bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( |
| MI, FI, nullptr, Indexes, LIS); |
| (void)Spilled; |
| assert(Spilled && "failed to spill SGPR to VGPR when allocated"); |
| SpillFIs.set(FI); |
| - updateLaneVGPRDomInstr(FI, &MBB, MIS.begin(), LaneVGPRDomInstr); |
| } |
| } |
| } |
| |
| - for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) { |
| - auto InsertPt = LaneVGPRDomInstr[Reg]; |
| - // Insert the IMPLICIT_DEF at the identified points. |
| - auto MIB = |
| - BuildMI(*InsertPt->getParent(), *InsertPt, InsertPt->getDebugLoc(), |
| - TII->get(AMDGPU::IMPLICIT_DEF), Reg); |
| - FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); |
| - if (LIS) { |
| - LIS->InsertMachineInstrInMaps(*MIB); |
| - LIS->createAndComputeVirtRegInterval(Reg); |
| - } |
| - } |
| - |
| + // FIXME: Adding to live-ins redundant with reserving registers. |
| for (MachineBasicBlock &MBB : MF) { |
| + for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) |
| + MBB.addLiveIn(Reg); |
| + MBB.sortUniqueLiveIns(); |
| + |
| // FIXME: The dead frame indices are replaced with a null register from |
| // the debug value instructions. We should instead, update it with the |
| // correct register value. But not sure the register value alone is |
| @@ -407,26 +334,15 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { |
| // lane". |
| FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false); |
| |
| - MachineRegisterInfo &MRI = MF.getRegInfo(); |
| - const TargetRegisterClass *RC = |
| - ST.isWave32() ? &AMDGPU::SGPR_32RegClass : &AMDGPU::SGPR_64RegClass; |
| - // Shift back the reserved SGPR for EXEC copy into the lowest range. |
| - // This SGPR is reserved to handle the whole-wave spill/copy operations |
| - // that might get inserted during vgpr regalloc. |
| - Register UnusedLowSGPR = TRI->findUnusedRegister(MRI, RC, MF); |
| - if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) < |
| - TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy())) |
| - FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR); |
| - |
| MadeChange = true; |
| - } else { |
| - // No SGPR spills and hence there won't be any WWM spills/copies. Reset the |
| - // SGPR reserved for EXEC copy. |
| - FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister); |
| } |
| |
| SaveBlocks.clear(); |
| RestoreBlocks.clear(); |
| |
| + // Updated the reserved registers with any VGPRs added for SGPR spills. |
| + if (NewReservedRegs) |
| + MRI.freezeReservedRegs(MF); |
| + |
| return MadeChange; |
| } |
| diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp |
| index ff6c4e0304b8..6eea030afb00 100644 |
| --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp |
| +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp |
| @@ -60,9 +60,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) |
| Occupancy = ST.computeOccupancy(F, getLDSSize()); |
| CallingConv::ID CC = F.getCallingConv(); |
| |
| - const_cast<MachineFunction &>(MF).getRegInfo().addDelegate(this); |
| - VRegFlags.reserve(256); |
| - |
| // FIXME: Should have analysis or something rather than attribute to detect |
| // calls. |
| const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); |
| @@ -310,11 +307,24 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs, |
| bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF, |
| int FI, |
| unsigned LaneIndex) { |
| + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| + const SIRegisterInfo *TRI = ST.getRegisterInfo(); |
| MachineRegisterInfo &MRI = MF.getRegInfo(); |
| Register LaneVGPR; |
| if (!LaneIndex) { |
| - LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); |
| + LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); |
| + if (LaneVGPR == AMDGPU::NoRegister) { |
| + // We have no VGPRs left for spilling SGPRs. Reset because we will not |
| + // partially spill the SGPR to VGPRs. |
| + SGPRSpillToVGPRLanes.erase(FI); |
| + return false; |
| + } |
| + |
| SpillVGPRs.push_back(LaneVGPR); |
| + // Add this register as live-in to all blocks to avoid machine verifier |
| + // complaining about use of an undefined physical register. |
| + for (MachineBasicBlock &BB : MF) |
| + BB.addLiveIn(LaneVGPR); |
| } else { |
| LaneVGPR = SpillVGPRs.back(); |
| } |
| @@ -522,16 +532,6 @@ MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const { |
| return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; |
| } |
| |
| -void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) { |
| - VRegFlags.grow(Reg); |
| -} |
| - |
| -void SIMachineFunctionInfo::MRI_NotecloneVirtualRegister(Register NewReg, |
| - Register SrcReg) { |
| - VRegFlags.grow(NewReg); |
| - VRegFlags[NewReg] = VRegFlags[SrcReg]; |
| -} |
| - |
| Register |
| SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const { |
| const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| @@ -639,10 +639,6 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( |
| |
| if (MFI.getVGPRForAGPRCopy()) |
| VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI); |
| - |
| - if (MFI.getSGPRForEXECCopy()) |
| - SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI); |
| - |
| auto SFI = MFI.getOptionalScavengeFI(); |
| if (SFI) |
| ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo()); |
| diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h |
| index 32e79116322b..c0cfc36e0a96 100644 |
| --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h |
| +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h |
| @@ -275,7 +275,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { |
| SIMode Mode; |
| std::optional<FrameIndex> ScavengeFI; |
| StringValue VGPRForAGPRCopy; |
| - StringValue SGPRForEXECCopy; |
| |
| SIMachineFunctionInfo() = default; |
| SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &, |
| @@ -317,8 +316,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> { |
| YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI); |
| YamlIO.mapOptional("vgprForAGPRCopy", MFI.VGPRForAGPRCopy, |
| StringValue()); // Don't print out when it's empty. |
| - YamlIO.mapOptional("sgprForEXECCopy", MFI.SGPRForEXECCopy, |
| - StringValue()); // Don't print out when it's empty. |
| } |
| }; |
| |
| @@ -355,8 +352,7 @@ public: |
| |
| /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which |
| /// tells the hardware which interpolation parameters to load. |
| -class SIMachineFunctionInfo final : public AMDGPUMachineFunction, |
| - private MachineRegisterInfo::Delegate { |
| +class SIMachineFunctionInfo final : public AMDGPUMachineFunction { |
| friend class GCNTargetMachine; |
| |
| // State of MODE register, assumed FP mode. |
| @@ -454,9 +450,6 @@ private: |
| |
| unsigned HighBitsOf32BitAddress; |
| |
| - // Flags associated with the virtual registers. |
| - IndexedMap<uint8_t, VirtReg2IndexFunctor> VRegFlags; |
| - |
| // Current recorded maximum possible occupancy. |
| unsigned Occupancy; |
| |
| @@ -466,10 +459,6 @@ private: |
| |
| MCPhysReg getNextSystemSGPR() const; |
| |
| - // MachineRegisterInfo callback functions to notify events. |
| - void MRI_NoteNewVirtualRegister(Register Reg) override; |
| - void MRI_NotecloneVirtualRegister(Register NewReg, Register SrcReg) override; |
| - |
| public: |
| struct VGPRSpillToAGPR { |
| SmallVector<MCPhysReg, 32> Lanes; |
| @@ -478,11 +467,11 @@ public: |
| }; |
| |
| private: |
| - // To track virtual VGPR + lane index for each subregister of the SGPR spilled |
| - // to frameindex key during SILowerSGPRSpills pass. |
| + // To track VGPR + lane index for each subregister of the SGPR spilled to |
| + // frameindex key during SILowerSGPRSpills pass. |
| DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> SGPRSpillToVGPRLanes; |
| - // To track physical VGPR + lane index for spilling special SGPRs like Frame |
| - // Pointer identified during PrologEpilogInserter. |
| + // To track VGPR + lane index for spilling special SGPRs like Frame Pointer |
| + // identified during PrologEpilogInserter. |
| DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> |
| PrologEpilogSGPRSpillToVGPRLanes; |
| unsigned NumVGPRSpillLanes = 0; |
| @@ -512,9 +501,6 @@ private: |
| // PrologEpilogInserter. |
| PrologEpilogSGPRSpillsMap PrologEpilogSGPRSpills; |
| |
| - // To save/restore EXEC MASK around WWM spills and copies. |
| - Register SGPRForEXECCopy; |
| - |
| DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills; |
| |
| // AGPRs used for VGPR spills. |
| @@ -638,19 +624,6 @@ public: |
| : makeArrayRef(I->second); |
| } |
| |
| - void setFlag(Register Reg, uint8_t Flag) { |
| - assert(Reg.isVirtual()); |
| - if (VRegFlags.inBounds(Reg)) |
| - VRegFlags[Reg] |= (uint8_t)1 << Flag; |
| - } |
| - |
| - bool checkFlag(Register Reg, uint8_t Flag) const { |
| - if (Reg.isPhysical()) |
| - return false; |
| - |
| - return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & ((uint8_t)1 << Flag); |
| - } |
| - |
| void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4, |
| Align Alignment = Align(4)); |
| |
| @@ -663,10 +636,6 @@ public: |
| return SpillAGPR; |
| } |
| |
| - Register getSGPRForEXECCopy() const { return SGPRForEXECCopy; } |
| - |
| - void setSGPRForEXECCopy(Register Reg) { SGPRForEXECCopy = Reg; } |
| - |
| ArrayRef<MCPhysReg> getVGPRSpillAGPRs() const { |
| return SpillVGPR; |
| } |
| diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp |
| index 9c524d7cb2e8..c5ef7bf7dd00 100644 |
| --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp |
| +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp |
| @@ -646,11 +646,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { |
| assert(!isSubRegister(ScratchRSrcReg, BasePtrReg)); |
| } |
| |
| - // SGPR used to preserve EXEC MASK around WWM spill/copy instructions. |
| - Register ExecCopyReg = MFI->getSGPRForEXECCopy(); |
| - if (ExecCopyReg) |
| - reserveRegisterTuples(Reserved, ExecCopyReg); |
| - |
| // Reserve VGPRs/AGPRs. |
| // |
| unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF); |
| @@ -716,6 +711,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { |
| for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) |
| reserveRegisterTuples(Reserved, Reg); |
| |
| + for (auto Reg : MFI->getSGPRSpillVGPRs()) |
| + reserveRegisterTuples(Reserved, Reg); |
| + |
| return Reserved; |
| } |
| |
| @@ -1067,8 +1065,6 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { |
| case AMDGPU::SI_SPILL_A32_RESTORE: |
| case AMDGPU::SI_SPILL_AV32_SAVE: |
| case AMDGPU::SI_SPILL_AV32_RESTORE: |
| - case AMDGPU::SI_SPILL_WWM_V32_SAVE: |
| - case AMDGPU::SI_SPILL_WWM_V32_RESTORE: |
| return 1; |
| default: llvm_unreachable("Invalid spill opcode"); |
| } |
| @@ -2009,40 +2005,6 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( |
| } |
| } |
| |
| -static void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, |
| - MachineBasicBlock::iterator MBBI, |
| - const DebugLoc &DL, Register Reg, |
| - RegScavenger *RS) { |
| - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| - const SIInstrInfo *TII = ST.getInstrInfo(); |
| - bool IsWave32 = ST.isWave32(); |
| - if (RS->isRegUsed(AMDGPU::SCC)) { |
| - // Insert two move instructions, one to save the original value of EXEC and |
| - // the other to turn on all bits in EXEC. This is required as we can't use |
| - // the single instruction S_OR_SAVEEXEC that clobbers SCC. |
| - unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
| - MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
| - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill); |
| - BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); |
| - } else { |
| - const unsigned OrSaveExec = |
| - IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; |
| - auto SaveExec = |
| - BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); |
| - SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. |
| - } |
| -} |
| - |
| -static void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, |
| - MachineBasicBlock::iterator MBBI, const DebugLoc &DL, |
| - Register Reg) { |
| - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); |
| - const SIInstrInfo *TII = ST.getInstrInfo(); |
| - unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; |
| - MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; |
| - BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec).addReg(Reg, RegState::Kill); |
| -} |
| - |
| bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, |
| int SPAdj, unsigned FIOperandNum, |
| RegScavenger *RS) const { |
| @@ -2141,8 +2103,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, |
| case AMDGPU::SI_SPILL_AV128_SAVE: |
| case AMDGPU::SI_SPILL_AV96_SAVE: |
| case AMDGPU::SI_SPILL_AV64_SAVE: |
| - case AMDGPU::SI_SPILL_AV32_SAVE: |
| - case AMDGPU::SI_SPILL_WWM_V32_SAVE: { |
| + case AMDGPU::SI_SPILL_AV32_SAVE: { |
| const MachineOperand *VData = TII->getNamedOperand(*MI, |
| AMDGPU::OpName::vdata); |
| assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == |
| @@ -2151,18 +2112,11 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, |
| unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR |
| : AMDGPU::BUFFER_STORE_DWORD_OFFSET; |
| auto *MBB = MI->getParent(); |
| - bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); |
| - if (IsWWMRegSpill) |
| - insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), RS); |
| - |
| buildSpillLoadStore( |
| *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, |
| TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), |
| *MI->memoperands_begin(), RS); |
| MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode())); |
| - if (IsWWMRegSpill) |
| - restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); |
| - |
| MI->eraseFromParent(); |
| return true; |
| } |
| @@ -2207,8 +2161,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, |
| case AMDGPU::SI_SPILL_AV352_RESTORE: |
| case AMDGPU::SI_SPILL_AV384_RESTORE: |
| case AMDGPU::SI_SPILL_AV512_RESTORE: |
| - case AMDGPU::SI_SPILL_AV1024_RESTORE: |
| - case AMDGPU::SI_SPILL_WWM_V32_RESTORE: { |
| + case AMDGPU::SI_SPILL_AV1024_RESTORE: { |
| const MachineOperand *VData = TII->getNamedOperand(*MI, |
| AMDGPU::OpName::vdata); |
| assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == |
| @@ -2217,17 +2170,10 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, |
| unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR |
| : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; |
| auto *MBB = MI->getParent(); |
| - bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode()); |
| - if (IsWWMRegSpill) |
| - insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), RS); |
| - |
| buildSpillLoadStore( |
| *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, |
| TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), |
| *MI->memoperands_begin(), RS); |
| - if (IsWWMRegSpill) |
| - restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy()); |
| - |
| MI->eraseFromParent(); |
| return true; |
| } |
| diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll |
| index 2b3d710fb9f2..8ae7f0520392 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll |
| @@ -13,7 +13,6 @@ define ptr addrspace(1) @call_assert_align() { |
| ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_mov_b64 exec, s[18:19] |
| -; CHECK-NEXT: ; implicit-def: $vgpr40 |
| ; CHECK-NEXT: s_addk_i32 s32, 0x400 |
| ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0 |
| diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll |
| index c4169300c4f1..28a7b1a62a70 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll |
| @@ -243,7 +243,6 @@ define void @func_caller_stack() { |
| ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 |
| ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 |
| ; MUBUF-NEXT: v_mov_b32_e32 v0, 10 |
| -; MUBUF-NEXT: ; implicit-def: $vgpr40 |
| ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 |
| ; MUBUF-NEXT: v_mov_b32_e32 v0, 11 |
| ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 |
| @@ -281,7 +280,6 @@ define void @func_caller_stack() { |
| ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 |
| ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:4 |
| ; FLATSCR-NEXT: v_mov_b32_e32 v0, 10 |
| -; FLATSCR-NEXT: ; implicit-def: $vgpr40 |
| ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:8 |
| ; FLATSCR-NEXT: v_mov_b32_e32 v0, 11 |
| ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 |
| @@ -322,9 +320,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { |
| ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen |
| ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 |
| ; MUBUF-NEXT: s_addk_i32 s32, 0x400 |
| -; MUBUF-NEXT: ; implicit-def: $vgpr40 |
| -; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 |
| ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 |
| +; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 |
| ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 |
| ; MUBUF-NEXT: s_getpc_b64 s[4:5] |
| ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4 |
| @@ -406,9 +403,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { |
| ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] |
| ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off |
| ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 |
| -; FLATSCR-NEXT: ; implicit-def: $vgpr40 |
| -; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 |
| ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 |
| +; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 |
| ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 |
| ; FLATSCR-NEXT: s_getpc_b64 s[0:1] |
| ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4 |
| diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll |
| index 157aebb25033..3a985d1b2dee 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll |
| @@ -8,83 +8,75 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill |
| -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_mov_b32 exec_lo, s4 |
| -; CHECK-NEXT: v_mov_b32_e32 v14, v1 |
| -; CHECK-NEXT: v_mov_b32_e32 v13, v2 |
| -; CHECK-NEXT: v_mov_b32_e32 v12, v3 |
| -; CHECK-NEXT: v_mov_b32_e32 v11, v4 |
| -; CHECK-NEXT: v_mov_b32_e32 v10, v5 |
| -; CHECK-NEXT: v_mov_b32_e32 v9, v6 |
| -; CHECK-NEXT: v_mov_b32_e32 v8, v7 |
| +; CHECK-NEXT: v_mov_b32_e32 v15, v1 |
| +; CHECK-NEXT: v_mov_b32_e32 v14, v2 |
| +; CHECK-NEXT: v_mov_b32_e32 v13, v3 |
| +; CHECK-NEXT: v_mov_b32_e32 v12, v4 |
| +; CHECK-NEXT: v_mov_b32_e32 v11, v5 |
| +; CHECK-NEXT: v_mov_b32_e32 v10, v6 |
| +; CHECK-NEXT: v_mov_b32_e32 v9, v7 |
| ; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec |
| -; CHECK-NEXT: v_mov_b32_e32 v1, v14 |
| -; CHECK-NEXT: v_mov_b32_e32 v2, v13 |
| -; CHECK-NEXT: v_mov_b32_e32 v3, v12 |
| -; CHECK-NEXT: v_mov_b32_e32 v4, v11 |
| -; CHECK-NEXT: v_mov_b32_e32 v5, v10 |
| -; CHECK-NEXT: v_mov_b32_e32 v6, v9 |
| -; CHECK-NEXT: v_mov_b32_e32 v7, v8 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill |
| -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill |
| -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill |
| -; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill |
| +; CHECK-NEXT: v_mov_b32_e32 v1, v15 |
| +; CHECK-NEXT: v_mov_b32_e32 v2, v14 |
| +; CHECK-NEXT: v_mov_b32_e32 v3, v13 |
| +; CHECK-NEXT: v_mov_b32_e32 v4, v12 |
| +; CHECK-NEXT: v_mov_b32_e32 v5, v11 |
| +; CHECK-NEXT: v_mov_b32_e32 v6, v10 |
| +; CHECK-NEXT: v_mov_b32_e32 v7, v9 |
| +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_mov_b32 s8, 0 |
| ; CHECK-NEXT: s_mov_b32 s4, s8 |
| ; CHECK-NEXT: s_mov_b32 s5, s8 |
| ; CHECK-NEXT: s_mov_b32 s6, s8 |
| ; CHECK-NEXT: s_mov_b32 s7, s8 |
| -; CHECK-NEXT: ; implicit-def: $vgpr0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 |
| -; CHECK-NEXT: v_writelane_b32 v0, s6, 2 |
| -; CHECK-NEXT: v_writelane_b32 v0, s7, 3 |
| +; CHECK-NEXT: v_writelane_b32 v8, s4, 0 |
| +; CHECK-NEXT: v_writelane_b32 v8, s5, 1 |
| +; CHECK-NEXT: v_writelane_b32 v8, s6, 2 |
| +; CHECK-NEXT: v_writelane_b32 v8, s7, 3 |
| ; CHECK-NEXT: s_mov_b32 s6, 0 |
| ; CHECK-NEXT: s_mov_b32 s4, s6 |
| ; CHECK-NEXT: s_mov_b32 s5, s6 |
| -; CHECK-NEXT: v_mov_b32_e32 v1, s4 |
| -; CHECK-NEXT: v_mov_b32_e32 v2, s5 |
| +; CHECK-NEXT: v_mov_b32_e32 v0, s4 |
| +; CHECK-NEXT: v_mov_b32_e32 v1, s5 |
| +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_mov_b32 s4, exec_lo |
| -; CHECK-NEXT: v_writelane_b32 v0, s4, 4 |
| -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| -; CHECK-NEXT: s_mov_b32 exec_lo, s21 |
| +; CHECK-NEXT: v_writelane_b32 v8, s4, 4 |
| ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 |
| -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_mov_b32 exec_lo, s21 |
| -; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| -; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| -; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload |
| -; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| -; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| -; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| -; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| -; CHECK-NEXT: v_mov_b32_e32 v8, v9 |
| -; CHECK-NEXT: v_mov_b32_e32 v7, v10 |
| -; CHECK-NEXT: v_mov_b32_e32 v6, v11 |
| -; CHECK-NEXT: v_mov_b32_e32 v5, v12 |
| -; CHECK-NEXT: v_mov_b32_e32 v4, v13 |
| -; CHECK-NEXT: v_mov_b32_e32 v3, v14 |
| -; CHECK-NEXT: v_mov_b32_e32 v2, v15 |
| -; CHECK-NEXT: v_mov_b32_e32 v1, v16 |
| -; CHECK-NEXT: v_readfirstlane_b32 s12, v8 |
| -; CHECK-NEXT: v_readfirstlane_b32 s10, v7 |
| -; CHECK-NEXT: v_readfirstlane_b32 s9, v6 |
| -; CHECK-NEXT: v_readfirstlane_b32 s8, v5 |
| -; CHECK-NEXT: v_readfirstlane_b32 s7, v4 |
| -; CHECK-NEXT: v_readfirstlane_b32 s6, v3 |
| -; CHECK-NEXT: v_readfirstlane_b32 s5, v2 |
| -; CHECK-NEXT: v_readfirstlane_b32 s4, v1 |
| +; CHECK-NEXT: v_mov_b32_e32 v7, v9 |
| +; CHECK-NEXT: v_mov_b32_e32 v6, v10 |
| +; CHECK-NEXT: v_mov_b32_e32 v5, v11 |
| +; CHECK-NEXT: v_mov_b32_e32 v4, v12 |
| +; CHECK-NEXT: v_mov_b32_e32 v3, v13 |
| +; CHECK-NEXT: v_mov_b32_e32 v2, v14 |
| +; CHECK-NEXT: v_mov_b32_e32 v1, v15 |
| +; CHECK-NEXT: v_mov_b32_e32 v0, v16 |
| +; CHECK-NEXT: v_readfirstlane_b32 s12, v7 |
| +; CHECK-NEXT: v_readfirstlane_b32 s10, v6 |
| +; CHECK-NEXT: v_readfirstlane_b32 s9, v5 |
| +; CHECK-NEXT: v_readfirstlane_b32 s8, v4 |
| +; CHECK-NEXT: v_readfirstlane_b32 s7, v3 |
| +; CHECK-NEXT: v_readfirstlane_b32 s6, v2 |
| +; CHECK-NEXT: v_readfirstlane_b32 s5, v1 |
| +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 |
| ; CHECK-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 |
| ; CHECK-NEXT: s_mov_b32 s13, s10 |
| ; CHECK-NEXT: s_mov_b32 s14, s9 |
| @@ -93,79 +85,68 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) { |
| ; CHECK-NEXT: s_mov_b32 s17, s6 |
| ; CHECK-NEXT: s_mov_b32 s18, s5 |
| ; CHECK-NEXT: s_mov_b32 s19, s4 |
| -; CHECK-NEXT: v_writelane_b32 v0, s12, 5 |
| -; CHECK-NEXT: v_writelane_b32 v0, s13, 6 |
| -; CHECK-NEXT: v_writelane_b32 v0, s14, 7 |
| -; CHECK-NEXT: v_writelane_b32 v0, s15, 8 |
| -; CHECK-NEXT: v_writelane_b32 v0, s16, 9 |
| -; CHECK-NEXT: v_writelane_b32 v0, s17, 10 |
| -; CHECK-NEXT: v_writelane_b32 v0, s18, 11 |
| -; CHECK-NEXT: v_writelane_b32 v0, s19, 12 |
| -; CHECK-NEXT: v_mov_b32_e32 v7, v9 |
| -; CHECK-NEXT: v_mov_b32_e32 v8, v10 |
| -; CHECK-NEXT: v_mov_b32_e32 v5, v11 |
| -; CHECK-NEXT: v_mov_b32_e32 v6, v12 |
| -; CHECK-NEXT: v_mov_b32_e32 v3, v13 |
| -; CHECK-NEXT: v_mov_b32_e32 v4, v14 |
| -; CHECK-NEXT: v_mov_b32_e32 v1, v15 |
| -; CHECK-NEXT: v_mov_b32_e32 v2, v16 |
| +; CHECK-NEXT: v_writelane_b32 v8, s12, 5 |
| +; CHECK-NEXT: v_writelane_b32 v8, s13, 6 |
| +; CHECK-NEXT: v_writelane_b32 v8, s14, 7 |
| +; CHECK-NEXT: v_writelane_b32 v8, s15, 8 |
| +; CHECK-NEXT: v_writelane_b32 v8, s16, 9 |
| +; CHECK-NEXT: v_writelane_b32 v8, s17, 10 |
| +; CHECK-NEXT: v_writelane_b32 v8, s18, 11 |
| +; CHECK-NEXT: v_writelane_b32 v8, s19, 12 |
| +; CHECK-NEXT: v_mov_b32_e32 v6, v9 |
| +; CHECK-NEXT: v_mov_b32_e32 v7, v10 |
| +; CHECK-NEXT: v_mov_b32_e32 v4, v11 |
| +; CHECK-NEXT: v_mov_b32_e32 v5, v12 |
| +; CHECK-NEXT: v_mov_b32_e32 v2, v13 |
| +; CHECK-NEXT: v_mov_b32_e32 v3, v14 |
| +; CHECK-NEXT: v_mov_b32_e32 v0, v15 |
| +; CHECK-NEXT: v_mov_b32_e32 v1, v16 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13] |
| ; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15] |
| ; CHECK-NEXT: s_mov_b64 s[8:9], s[16:17] |
| ; CHECK-NEXT: s_mov_b64 s[6:7], s[18:19] |
| -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[7:8] |
| -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[5:6] |
| +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[6:7] |
| +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[4:5] |
| ; CHECK-NEXT: s_and_b32 s4, s4, s5 |
| -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[3:4] |
| +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[2:3] |
| ; CHECK-NEXT: s_and_b32 s4, s4, s5 |
| -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2] |
| +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1] |
| ; CHECK-NEXT: s_and_b32 s4, s4, s5 |
| ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 |
| -; CHECK-NEXT: v_writelane_b32 v0, s4, 13 |
| -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| -; CHECK-NEXT: s_mov_b32 exec_lo, s21 |
| +; CHECK-NEXT: v_writelane_b32 v8, s4, 13 |
| ; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 |
| -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_mov_b32 exec_lo, s21 |
| +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| +; CHECK-NEXT: v_readlane_b32 s4, v8, 13 |
| +; CHECK-NEXT: v_readlane_b32 s8, v8, 5 |
| +; CHECK-NEXT: v_readlane_b32 s9, v8, 6 |
| +; CHECK-NEXT: v_readlane_b32 s10, v8, 7 |
| +; CHECK-NEXT: v_readlane_b32 s11, v8, 8 |
| +; CHECK-NEXT: v_readlane_b32 s12, v8, 9 |
| +; CHECK-NEXT: v_readlane_b32 s13, v8, 10 |
| +; CHECK-NEXT: v_readlane_b32 s14, v8, 11 |
| +; CHECK-NEXT: v_readlane_b32 s15, v8, 12 |
| +; CHECK-NEXT: v_readlane_b32 s16, v8, 0 |
| +; CHECK-NEXT: v_readlane_b32 s17, v8, 1 |
| +; CHECK-NEXT: v_readlane_b32 s18, v8, 2 |
| +; CHECK-NEXT: v_readlane_b32 s19, v8, 3 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| -; CHECK-NEXT: v_readlane_b32 s4, v2, 13 |
| -; CHECK-NEXT: v_readlane_b32 s8, v2, 5 |
| -; CHECK-NEXT: v_readlane_b32 s9, v2, 6 |
| -; CHECK-NEXT: v_readlane_b32 s10, v2, 7 |
| -; CHECK-NEXT: v_readlane_b32 s11, v2, 8 |
| -; CHECK-NEXT: v_readlane_b32 s12, v2, 9 |
| -; CHECK-NEXT: v_readlane_b32 s13, v2, 10 |
| -; CHECK-NEXT: v_readlane_b32 s14, v2, 11 |
| -; CHECK-NEXT: v_readlane_b32 s15, v2, 12 |
| -; CHECK-NEXT: v_readlane_b32 s16, v2, 0 |
| -; CHECK-NEXT: v_readlane_b32 s17, v2, 1 |
| -; CHECK-NEXT: v_readlane_b32 s18, v2, 2 |
| -; CHECK-NEXT: v_readlane_b32 s19, v2, 3 |
| ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4 |
| ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 |
| ; CHECK-NEXT: ; %bb.3: |
| -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_mov_b32 exec_lo, s21 |
| -; CHECK-NEXT: s_waitcnt vmcnt(0) |
| -; CHECK-NEXT: v_readlane_b32 s4, v0, 4 |
| +; CHECK-NEXT: v_readlane_b32 s4, v8, 4 |
| ; CHECK-NEXT: s_mov_b32 exec_lo, s4 |
| ; CHECK-NEXT: ; %bb.4: |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| ; CHECK-NEXT: ; implicit-def: $sgpr4 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, s4 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, s4 |
| ; CHECK-NEXT: v_mov_b32_e32 v3, s4 |
| ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload |
| -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_mov_b32 exec_lo, s4 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 |
| diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll |
| index 66ee43a7cee2..c7e5931c110a 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll |
| @@ -242,9 +242,8 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) { |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: v_writelane_b32 v41, s16, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX9-NEXT: v_writelane_b32 v41, s16, 0 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], 0 |
| diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll |
| index f234847900e7..313baf1c3282 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll |
| @@ -23,7 +23,6 @@ define void @parent_func_missing_inputs() #0 { |
| ; FIXEDABI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; FIXEDABI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; FIXEDABI-NEXT: s_mov_b64 exec, s[18:19] |
| -; FIXEDABI-NEXT: ; implicit-def: $vgpr40 |
| ; FIXEDABI-NEXT: s_addk_i32 s32, 0x400 |
| ; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0 |
| ; FIXEDABI-NEXT: v_writelane_b32 v41, s16, 0 |
| diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll |
| index d5caefda01d6..3f711da77503 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/bf16.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll |
| @@ -1417,13 +1417,12 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { |
| ; GCN-LABEL: test_call: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s10, s33 |
| +; GCN-NEXT: s_mov_b32 s8, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr2 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v2, s31, 1 |
| @@ -1442,14 +1441,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { |
| ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GCN-NEXT: s_mov_b32 s33, s10 |
| +; GCN-NEXT: s_mov_b32 s33, s8 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_call: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX7-NEXT: s_mov_b32 s10, s33 |
| +; GFX7-NEXT: s_mov_b32 s8, s33 |
| ; GFX7-NEXT: s_mov_b32 s33, s32 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1459,7 +1458,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { |
| ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 |
| ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 |
| ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX7-NEXT: ; implicit-def: $vgpr2 |
| ; GFX7-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX7-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -1473,14 +1471,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { |
| ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX7-NEXT: s_mov_b32 s33, s10 |
| +; GFX7-NEXT: s_mov_b32 s33, s8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_call: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX8-NEXT: s_mov_b32 s8, s33 |
| +; GFX8-NEXT: s_mov_b32 s6, s33 |
| ; GFX8-NEXT: s_mov_b32 s33, s32 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1490,7 +1488,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { |
| ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX8-NEXT: ; implicit-def: $vgpr2 |
| ; GFX8-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -1504,14 +1501,14 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { |
| ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX8-NEXT: s_mov_b32 s33, s8 |
| +; GFX8-NEXT: s_mov_b32 s33, s6 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_call: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX9-NEXT: s_mov_b32 s8, s33 |
| +; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1521,7 +1518,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { |
| ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr2 |
| ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -1534,7 +1530,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { |
| ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX9-NEXT: s_mov_b32 s33, s8 |
| +; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| @@ -1542,7 +1538,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX10-NEXT: s_mov_b32 s7, s33 |
| +; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1552,7 +1548,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store@gotpcrel32@hi+12 |
| -; GFX10-NEXT: ; implicit-def: $vgpr2 |
| ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 |
| @@ -1567,7 +1562,7 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 |
| -; GFX10-NEXT: s_mov_b32 s33, s7 |
| +; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| @@ -1580,13 +1575,12 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-LABEL: test_call_v2bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s10, s33 |
| +; GCN-NEXT: s_mov_b32 s8, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr3 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v3, s31, 1 |
| @@ -1609,14 +1603,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GCN-NEXT: s_mov_b32 s33, s10 |
| +; GCN-NEXT: s_mov_b32 s33, s8 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_call_v2bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX7-NEXT: s_mov_b32 s10, s33 |
| +; GFX7-NEXT: s_mov_b32 s8, s33 |
| ; GFX7-NEXT: s_mov_b32 s33, s32 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1626,7 +1620,6 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX7-NEXT: ; implicit-def: $vgpr3 |
| ; GFX7-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GFX7-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -1644,14 +1637,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX7-NEXT: s_mov_b32 s33, s10 |
| +; GFX7-NEXT: s_mov_b32 s33, s8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_call_v2bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX8-NEXT: s_mov_b32 s8, s33 |
| +; GFX8-NEXT: s_mov_b32 s6, s33 |
| ; GFX8-NEXT: s_mov_b32 s33, s32 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1661,7 +1654,6 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX8-NEXT: ; implicit-def: $vgpr2 |
| ; GFX8-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX8-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -1674,14 +1666,14 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX8-NEXT: s_mov_b32 s33, s8 |
| +; GFX8-NEXT: s_mov_b32 s33, s6 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_call_v2bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX9-NEXT: s_mov_b32 s8, s33 |
| +; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1691,7 +1683,6 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr2 |
| ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -1704,7 +1695,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX9-NEXT: s_mov_b32 s33, s8 |
| +; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| @@ -1712,7 +1703,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX10-NEXT: s_mov_b32 s7, s33 |
| +; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1722,7 +1713,6 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| -; GFX10-NEXT: ; implicit-def: $vgpr2 |
| ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 |
| @@ -1737,7 +1727,7 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 |
| -; GFX10-NEXT: s_mov_b32 s33, s7 |
| +; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| @@ -1750,13 +1740,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-LABEL: test_call_v3bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s10, s33 |
| +; GCN-NEXT: s_mov_b32 s8, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr4 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_writelane_b32 v4, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v4, s31, 1 |
| @@ -1780,14 +1769,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GCN-NEXT: s_mov_b32 s33, s10 |
| +; GCN-NEXT: s_mov_b32 s33, s8 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_call_v3bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX7-NEXT: s_mov_b32 s10, s33 |
| +; GFX7-NEXT: s_mov_b32 s8, s33 |
| ; GFX7-NEXT: s_mov_b32 s33, s32 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1797,7 +1786,6 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX7-NEXT: ; implicit-def: $vgpr4 |
| ; GFX7-NEXT: v_writelane_b32 v4, s30, 0 |
| ; GFX7-NEXT: v_writelane_b32 v4, s31, 1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -1816,14 +1804,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX7-NEXT: s_mov_b32 s33, s10 |
| +; GFX7-NEXT: s_mov_b32 s33, s8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_call_v3bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX8-NEXT: s_mov_b32 s8, s33 |
| +; GFX8-NEXT: s_mov_b32 s6, s33 |
| ; GFX8-NEXT: s_mov_b32 s33, s32 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1833,9 +1821,8 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX8-NEXT: ; implicit-def: $vgpr3 |
| -; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
| ; GFX8-NEXT: v_writelane_b32 v3, s30, 0 |
| +; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
| ; GFX8-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| @@ -1850,14 +1837,14 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX8-NEXT: s_mov_b32 s33, s8 |
| +; GFX8-NEXT: s_mov_b32 s33, s6 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_call_v3bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX9-NEXT: s_mov_b32 s8, s33 |
| +; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1870,9 +1857,8 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr3 |
| -; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
| ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 |
| +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
| ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| @@ -1886,7 +1872,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX9-NEXT: s_mov_b32 s33, s8 |
| +; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| @@ -1894,7 +1880,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX10-NEXT: s_mov_b32 s7, s33 |
| +; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1906,9 +1892,8 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX10-NEXT: ; implicit-def: $vgpr3 |
| -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
| ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 |
| +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 |
| ; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v4 |
| ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -1924,7 +1909,7 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 |
| -; GFX10-NEXT: s_mov_b32 s33, s7 |
| +; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| @@ -1937,13 +1922,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-LABEL: test_call_v4bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s10, s33 |
| +; GCN-NEXT: s_mov_b32 s8, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr5 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_writelane_b32 v5, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v5, s31, 1 |
| @@ -1974,14 +1958,14 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GCN-NEXT: s_mov_b32 s33, s10 |
| +; GCN-NEXT: s_mov_b32 s33, s8 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_call_v4bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX7-NEXT: s_mov_b32 s10, s33 |
| +; GFX7-NEXT: s_mov_b32 s8, s33 |
| ; GFX7-NEXT: s_mov_b32 s33, s32 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -1991,7 +1975,6 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX7-NEXT: ; implicit-def: $vgpr5 |
| ; GFX7-NEXT: v_writelane_b32 v5, s30, 0 |
| ; GFX7-NEXT: v_writelane_b32 v5, s31, 1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -2017,14 +2000,14 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX7-NEXT: s_mov_b32 s33, s10 |
| +; GFX7-NEXT: s_mov_b32 s33, s8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_call_v4bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX8-NEXT: s_mov_b32 s8, s33 |
| +; GFX8-NEXT: s_mov_b32 s6, s33 |
| ; GFX8-NEXT: s_mov_b32 s33, s32 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -2034,7 +2017,6 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX8-NEXT: ; implicit-def: $vgpr3 |
| ; GFX8-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GFX8-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -2058,14 +2040,14 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX8-NEXT: s_mov_b32 s33, s8 |
| +; GFX8-NEXT: s_mov_b32 s33, s6 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_call_v4bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX9-NEXT: s_mov_b32 s8, s33 |
| +; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -2075,7 +2057,6 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr3 |
| ; GFX9-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -2094,7 +2075,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX9-NEXT: s_mov_b32 s33, s8 |
| +; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| @@ -2102,7 +2083,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX10-NEXT: s_mov_b32 s7, s33 |
| +; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -2112,7 +2093,6 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| -; GFX10-NEXT: ; implicit-def: $vgpr3 |
| ; GFX10-NEXT: v_writelane_b32 v3, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v3, s31, 1 |
| @@ -2133,7 +2113,7 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 |
| -; GFX10-NEXT: s_mov_b32 s33, s7 |
| +; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| @@ -2146,13 +2126,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-LABEL: test_call_v8bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s10, s33 |
| +; GCN-NEXT: s_mov_b32 s8, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr9 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_writelane_b32 v9, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v9, s31, 1 |
| @@ -2199,14 +2178,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GCN-NEXT: s_mov_b32 s33, s10 |
| +; GCN-NEXT: s_mov_b32 s33, s8 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_call_v8bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX7-NEXT: s_mov_b32 s10, s33 |
| +; GFX7-NEXT: s_mov_b32 s8, s33 |
| ; GFX7-NEXT: s_mov_b32 s33, s32 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -2216,7 +2195,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX7-NEXT: ; implicit-def: $vgpr9 |
| ; GFX7-NEXT: v_writelane_b32 v9, s30, 0 |
| ; GFX7-NEXT: v_writelane_b32 v9, s31, 1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -2258,14 +2236,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX7-NEXT: s_mov_b32 s33, s10 |
| +; GFX7-NEXT: s_mov_b32 s33, s8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_call_v8bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX8-NEXT: s_mov_b32 s8, s33 |
| +; GFX8-NEXT: s_mov_b32 s6, s33 |
| ; GFX8-NEXT: s_mov_b32 s33, s32 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -2275,7 +2253,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX8-NEXT: ; implicit-def: $vgpr5 |
| ; GFX8-NEXT: v_writelane_b32 v5, s30, 0 |
| ; GFX8-NEXT: v_writelane_b32 v5, s31, 1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -2313,14 +2290,14 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX8-NEXT: s_mov_b32 s33, s8 |
| +; GFX8-NEXT: s_mov_b32 s33, s6 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_call_v8bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX9-NEXT: s_mov_b32 s8, s33 |
| +; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -2330,7 +2307,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr5 |
| ; GFX9-NEXT: v_writelane_b32 v5, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v5, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -2357,7 +2333,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX9-NEXT: s_mov_b32 s33, s8 |
| +; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| @@ -2365,7 +2341,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX10-NEXT: s_mov_b32 s7, s33 |
| +; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -2375,7 +2351,6 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| -; GFX10-NEXT: ; implicit-def: $vgpr5 |
| ; GFX10-NEXT: v_writelane_b32 v5, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v5, s31, 1 |
| @@ -2404,7 +2379,7 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 |
| -; GFX10-NEXT: s_mov_b32 s33, s7 |
| +; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| @@ -2417,13 +2392,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-LABEL: test_call_v16bf16: |
| ; GCN: ; %bb.0: ; %entry |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s10, s33 |
| +; GCN-NEXT: s_mov_b32 s8, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr17 |
| ; GCN-NEXT: s_waitcnt expcnt(0) |
| ; GCN-NEXT: v_writelane_b32 v17, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v17, s31, 1 |
| @@ -2502,14 +2476,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GCN-NEXT: s_mov_b32 s33, s10 |
| +; GCN-NEXT: s_mov_b32 s33, s8 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX7-LABEL: test_call_v16bf16: |
| ; GFX7: ; %bb.0: ; %entry |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX7-NEXT: s_mov_b32 s10, s33 |
| +; GFX7-NEXT: s_mov_b32 s8, s33 |
| ; GFX7-NEXT: s_mov_b32 s33, s32 |
| ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -2519,7 +2493,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX7-NEXT: ; implicit-def: $vgpr17 |
| ; GFX7-NEXT: v_writelane_b32 v17, s30, 0 |
| ; GFX7-NEXT: v_writelane_b32 v17, s31, 1 |
| ; GFX7-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -2593,14 +2566,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX7-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX7-NEXT: s_mov_b32 s33, s10 |
| +; GFX7-NEXT: s_mov_b32 s33, s8 |
| ; GFX7-NEXT: s_waitcnt vmcnt(0) |
| ; GFX7-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX8-LABEL: test_call_v16bf16: |
| ; GFX8: ; %bb.0: ; %entry |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX8-NEXT: s_mov_b32 s8, s33 |
| +; GFX8-NEXT: s_mov_b32 s6, s33 |
| ; GFX8-NEXT: s_mov_b32 s33, s32 |
| ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -2610,7 +2583,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX8-NEXT: ; implicit-def: $vgpr9 |
| ; GFX8-NEXT: v_writelane_b32 v9, s30, 0 |
| ; GFX8-NEXT: v_writelane_b32 v9, s31, 1 |
| ; GFX8-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -2676,14 +2648,14 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX8-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX8-NEXT: s_mov_b32 s33, s8 |
| +; GFX8-NEXT: s_mov_b32 s33, s6 |
| ; GFX8-NEXT: s_waitcnt vmcnt(0) |
| ; GFX8-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GFX9-LABEL: test_call_v16bf16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX9-NEXT: s_mov_b32 s8, s33 |
| +; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -2693,7 +2665,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr9 |
| ; GFX9-NEXT: v_writelane_b32 v9, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v9, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -2736,7 +2707,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX9-NEXT: s_mov_b32 s33, s8 |
| +; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| @@ -2744,7 +2715,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX10-NEXT: s_mov_b32 s7, s33 |
| +; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill |
| @@ -2754,7 +2725,6 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 |
| -; GFX10-NEXT: ; implicit-def: $vgpr9 |
| ; GFX10-NEXT: v_writelane_b32 v9, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v9, s31, 1 |
| @@ -2799,7 +2769,7 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 |
| -; GFX10-NEXT: s_mov_b32 s33, s7 |
| +; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll |
| index f0b0afa02b1a..2635edcb9d8a 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll |
| @@ -902,11 +902,6 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { |
| ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_mov_b64 exec, s[4:5] |
| -; CHECK-NEXT: ; implicit-def: $vgpr0 |
| -; CHECK-NEXT: ; implicit-def: $vgpr1 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: s_mov_b32 s0, 0 |
| -; CHECK-NEXT: ;;#ASMEND |
| ; CHECK-NEXT: s_waitcnt expcnt(1) |
| ; CHECK-NEXT: v_writelane_b32 v0, s30, 0 |
| ; CHECK-NEXT: v_writelane_b32 v0, s31, 1 |
| @@ -983,6 +978,9 @@ define void @spill_func(ptr addrspace(1) %arg) #0 { |
| ; CHECK-NEXT: v_writelane_b32 v1, s101, 6 |
| ; CHECK-NEXT: s_cmp_eq_u32 s31, 0 |
| ; CHECK-NEXT: ;;#ASMSTART |
| +; CHECK-NEXT: s_mov_b32 s0, 0 |
| +; CHECK-NEXT: ;;#ASMEND |
| +; CHECK-NEXT: ;;#ASMSTART |
| ; CHECK-NEXT: s_mov_b32 s1, 0 |
| ; CHECK-NEXT: ;;#ASMEND |
| ; CHECK-NEXT: ;;#ASMSTART |
| diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll |
| index 6c78e9a4e9b7..4d55d4974be7 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll |
| @@ -8,12 +8,12 @@ |
| @alias = hidden alias void (), ptr @aliasee_default |
| |
| ; ALL-LABEL: {{^}}kernel: |
| -; GFX908: .amdhsa_next_free_vgpr 32 |
| +; GFX908: .amdhsa_next_free_vgpr 41 |
| ; GFX908-NEXT: .amdhsa_next_free_sgpr 33 |
| |
| -; GFX90A: .amdhsa_next_free_vgpr 59 |
| +; GFX90A: .amdhsa_next_free_vgpr 71 |
| ; GFX90A-NEXT: .amdhsa_next_free_sgpr 33 |
| -; GFX90A-NEXT: .amdhsa_accum_offset 32 |
| +; GFX90A-NEXT: .amdhsa_accum_offset 44 |
| define amdgpu_kernel void @kernel() #0 { |
| bb: |
| call void @alias() #2 |
| diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll |
| index 59eae79ca122..5672cbb3f94f 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll |
| @@ -9,7 +9,7 @@ |
| ; The parent kernel has a higher VGPR usage than the possible callees. |
| |
| ; CHECK-LABEL: {{^}}kernel1: |
| -; CHECK: .amdhsa_next_free_vgpr 41 |
| +; CHECK: .amdhsa_next_free_vgpr 42 |
| ; CHECK-NEXT: .amdhsa_next_free_sgpr 33 |
| define amdgpu_kernel void @kernel1() #0 { |
| bb: |
| diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll |
| index 9451a062af6a..cde7716ab7c1 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll |
| @@ -178,7 +178,7 @@ declare hidden void @external_void_func_void() #0 |
| ; restored. No FP is required. |
| ; |
| ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls: |
| -; GCN: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| +; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill |
| ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] |
| @@ -189,7 +189,7 @@ declare hidden void @external_void_func_void() #0 |
| ; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] |
| ; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] |
| |
| -; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| +; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload |
| ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] |
| @@ -227,7 +227,6 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { |
| ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| ; GCN-NEXT: v_writelane_b32 v0, s42, 0 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; clobber s42 |
| @@ -401,9 +400,8 @@ define void @realign_stack_no_fp_elim() #1 { |
| ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 |
| ; GCN: v_writelane_b32 [[CSR_VGPR]], s30, 0 |
| +; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 |
| ; GCN: v_writelane_b32 [[CSR_VGPR]], s31, 1 |
| ; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} |
| ; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}} |
| @@ -441,11 +439,10 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { |
| ; GCN: s_waitcnt |
| ; GCN-NEXT: s_mov_b32 vcc_lo, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] |
| -; GCN-NEXT: ; implicit-def: $vgpr48 |
| |
| ; MUBUF-DAG: buffer_store_dword |
| ; FLATSCR-DAG: scratch_store_dword |
| @@ -453,7 +450,7 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { |
| ; FLATSCR: s_add_i32 s32, s32, 12{{$}} |
| |
| ; GCN: ;;#ASMSTART |
| -; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| +; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] |
| @@ -488,7 +485,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { |
| ; GCN: s_waitcnt |
| ; GCN-NEXT: s_mov_b32 vcc_lo, s33 |
| ; GCN-DAG: s_mov_b32 s33, s32 |
| -; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| +; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| ; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 |
| ; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 |
| ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill |
| @@ -500,7 +497,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { |
| ; FLATSCR-DAG: scratch_store_dword |
| |
| ; GCN: ;;#ASMSTART |
| -; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| +; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| ; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40100 |
| ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload |
| ; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004 |
| @@ -594,7 +591,7 @@ define void @callee_need_to_spill_fp_to_memory() #3 { |
| ; VGPR. |
| ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr: |
| ; MUBUF: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 |
| -; FLATSCR: s_mov_b32 s33, s2 |
| +; FLATSCR: s_mov_b32 s33, s0 |
| ; MUBUF: s_mov_b32 s33, s32 |
| ; MUBUF: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| ; MUBUF: s_mov_b64 exec, [[COPY_EXEC1]] |
| @@ -635,14 +632,14 @@ define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 { |
| ; Make sure that the FP save happens after restoring exec from the same |
| ; register. |
| ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg: |
| -; FLATSCR: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 |
| +; FLATSCR: s_mov_b32 s0, s33 |
| ; FLATSCR: s_mov_b32 s33, s32 |
| ; GCN-NOT: v_writelane_b32 v40, s33 |
| -; FLATSCR: s_xor_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| -; FLATSCR: s_mov_b64 exec, [[COPY_EXEC0]] |
| -; FLATSCR: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} |
| +; FLATSCR: s_or_saveexec_b64 s[2:3], -1 |
| +; FLATSCR: s_mov_b64 exec, s[2:3] |
| +; FLATSCR: s_or_saveexec_b64 s[2:3], -1 |
| ; GCN-NOT: v_readlane_b32 s33, v40 |
| -; FLATSCR: s_mov_b32 s33, [[FP_SCRATCH_COPY]] |
| +; FLATSCR: s_mov_b32 s33, s0 |
| ; GCN: s_setpc_b64 |
| define void @callee_need_to_spill_fp_to_reg() #1 { |
| call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", |
| @@ -675,7 +672,7 @@ define void @callee_need_to_spill_fp_to_reg() #1 { |
| ; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill |
| ; MUBUF: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]] |
| ; GCN-NOT: v_mov_b32_e32 v0, 0x100c |
| -; MUBUF: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40200 |
| +; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x40200 |
| ; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill |
| ; FLATSCR: v_mov_b32_e32 v0, 0 |
| ; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1000 |
| diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll |
| index 880a88ed9026..449d57f09e68 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll |
| @@ -30,14 +30,7 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { |
| ; |
| ; GCN_DBG-LABEL: test_loop: |
| ; GCN_DBG: ; %bb.0: ; %entry |
| -; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 |
| -; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 |
| -; GCN_DBG-NEXT: s_mov_b32 s10, -1 |
| -; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 |
| -; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 |
| -; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 |
| ; GCN_DBG-NEXT: s_load_dword s2, s[0:1], 0x9 |
| -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 |
| ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s2, 0 |
| ; GCN_DBG-NEXT: s_load_dword s1, s[0:1], 0xa |
| @@ -46,20 +39,11 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { |
| ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2 |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 |
| -; GCN_DBG-NEXT: s_mov_b64 s[4:5], exec |
| -; GCN_DBG-NEXT: s_mov_b64 exec, -1 |
| -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 |
| ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit |
| ; GCN_DBG-NEXT: s_endpgm |
| ; GCN_DBG-NEXT: .LBB0_2: ; %for.body |
| ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN_DBG-NEXT: s_waitcnt expcnt(0) |
| -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] |
| -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) |
| ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 |
| ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 |
| ; GCN_DBG-NEXT: s_mov_b32 s1, 2 |
| @@ -81,9 +65,6 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind { |
| ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 |
| ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 |
| ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock |
| ; GCN_DBG-NEXT: s_endpgm |
| @@ -124,31 +105,16 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi |
| ; |
| ; GCN_DBG-LABEL: loop_const_true: |
| ; GCN_DBG: ; %bb.0: ; %entry |
| -; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 |
| -; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 |
| -; GCN_DBG-NEXT: s_mov_b32 s10, -1 |
| -; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 |
| -; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 |
| -; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 |
| ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 |
| -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 |
| ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 |
| ; GCN_DBG-NEXT: s_mov_b32 s0, 0 |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN_DBG-NEXT: s_branch .LBB1_2 |
| ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit |
| ; GCN_DBG-NEXT: s_endpgm |
| ; GCN_DBG-NEXT: .LBB1_2: ; %for.body |
| ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN_DBG-NEXT: s_waitcnt expcnt(0) |
| -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] |
| -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) |
| ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 |
| ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 |
| ; GCN_DBG-NEXT: s_mov_b32 s1, 2 |
| @@ -170,9 +136,6 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi |
| ; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0 |
| ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1 |
| ; GCN_DBG-NEXT: s_branch .LBB1_2 |
| entry: |
| @@ -207,31 +170,16 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw |
| ; |
| ; GCN_DBG-LABEL: loop_const_false: |
| ; GCN_DBG: ; %bb.0: ; %entry |
| -; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 |
| -; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 |
| -; GCN_DBG-NEXT: s_mov_b32 s10, -1 |
| -; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 |
| -; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 |
| -; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 |
| ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 |
| -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 |
| ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 |
| ; GCN_DBG-NEXT: s_mov_b32 s0, 0 |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN_DBG-NEXT: s_branch .LBB2_2 |
| ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit |
| ; GCN_DBG-NEXT: s_endpgm |
| ; GCN_DBG-NEXT: .LBB2_2: ; %for.body |
| ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN_DBG-NEXT: s_waitcnt expcnt(0) |
| -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] |
| -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) |
| ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 |
| ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 |
| ; GCN_DBG-NEXT: s_mov_b32 s1, 2 |
| @@ -253,9 +201,6 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw |
| ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 |
| ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1 |
| ; GCN_DBG-NEXT: s_branch .LBB2_2 |
| entry: |
| @@ -291,31 +236,16 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw |
| ; |
| ; GCN_DBG-LABEL: loop_const_undef: |
| ; GCN_DBG: ; %bb.0: ; %entry |
| -; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 |
| -; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 |
| -; GCN_DBG-NEXT: s_mov_b32 s10, -1 |
| -; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 |
| -; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 |
| -; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 |
| ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 |
| -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 |
| ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 |
| ; GCN_DBG-NEXT: s_mov_b32 s0, 0 |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN_DBG-NEXT: s_branch .LBB3_2 |
| ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit |
| ; GCN_DBG-NEXT: s_endpgm |
| ; GCN_DBG-NEXT: .LBB3_2: ; %for.body |
| ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN_DBG-NEXT: s_waitcnt expcnt(0) |
| -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] |
| -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) |
| ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 |
| ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 |
| ; GCN_DBG-NEXT: s_mov_b32 s1, 2 |
| @@ -335,9 +265,6 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw |
| ; GCN_DBG-NEXT: s_mov_b32 s1, 1 |
| ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1 |
| ; GCN_DBG-NEXT: s_branch .LBB3_2 |
| entry: |
| @@ -387,14 +314,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { |
| ; |
| ; GCN_DBG-LABEL: loop_arg_0: |
| ; GCN_DBG: ; %bb.0: ; %entry |
| -; GCN_DBG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 |
| -; GCN_DBG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 |
| -; GCN_DBG-NEXT: s_mov_b32 s10, -1 |
| -; GCN_DBG-NEXT: s_mov_b32 s11, 0xe8f000 |
| -; GCN_DBG-NEXT: s_add_u32 s8, s8, s3 |
| -; GCN_DBG-NEXT: s_addc_u32 s9, s9, 0 |
| ; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9 |
| -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 |
| ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 |
| ; GCN_DBG-NEXT: v_mov_b32_e32 v1, 0 |
| @@ -411,19 +331,11 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s1, 2 |
| ; GCN_DBG-NEXT: s_mov_b32 s0, 0 |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] |
| ; GCN_DBG-NEXT: s_branch .LBB4_2 |
| ; GCN_DBG-NEXT: .LBB4_1: ; %for.exit |
| ; GCN_DBG-NEXT: s_endpgm |
| ; GCN_DBG-NEXT: .LBB4_2: ; %for.body |
| ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| -; GCN_DBG-NEXT: s_waitcnt expcnt(0) |
| -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] |
| -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) |
| ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3 |
| ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 1 |
| ; GCN_DBG-NEXT: v_readlane_b32 s3, v0, 2 |
| @@ -446,9 +358,6 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { |
| ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 |
| ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] |
| ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 |
| -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] |
| ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1 |
| ; GCN_DBG-NEXT: s_branch .LBB4_2 |
| entry: |
| diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll |
| index ee9da868068a..bf87cea9089d 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll |
| @@ -19,14 +19,14 @@ |
| ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] |
| ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: |
| ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] |
| ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: |
| @@ -84,14 +84,14 @@ bb.outer.end: ; preds = %bb.outer.then, %bb. |
| ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] |
| +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] |
| ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: |
| ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] |
| +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] |
| ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: |
| @@ -161,7 +161,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb |
| ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] |
| +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] |
| ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: |
| @@ -170,9 +170,6 @@ bb.outer.end: ; preds = %bb.inner.then, %bb |
| ; GCN-O0-NEXT: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[THEN_SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[THEN_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 |
| -; GCN-O0-NEXT: buffer_store_dword |
| -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] |
| ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[THEN_INNER:.LBB[0-9_]+]] |
| ; GCN-O0-NEXT: s_branch [[TEMP_BB:.LBB[0-9_]+]] |
| @@ -183,7 +180,7 @@ bb.outer.end: ; preds = %bb.inner.then, %bb |
| ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-DAG: s_xor_b64 exec, exec, s[{{[0-9:]+}}] |
| +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_INNER:.LBB[0-9_]+]] |
| ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: |
| ; GCN-O0: store_dword |
| @@ -263,9 +260,6 @@ bb.outer.end: ; preds = %bb, %bb.then, %b |
| ; GCN-O0-NEXT: s_xor_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 |
| -; GCN-O0-NEXT: buffer_store_dword [[VGPR]] |
| -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] |
| ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[THEN_OUTER:.LBB[0-9_]+]] |
| ; GCN-O0-NEXT: s_branch [[INNER_IF_OUTER_ELSE:.LBB[0-9_]+]] |
| @@ -276,14 +270,14 @@ bb.outer.end: ; preds = %bb, %bb.then, %b |
| ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_2_SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_2_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-DAG: s_xor_b64 exec, exec, s[{{[0-9:]+}}] |
| +; GCN-O0-NEXT: s_xor_b64 exec, exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF_OUTER:.LBB[0-9_]+]] |
| ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: |
| ; GCN-O0: store_dword |
| ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[ELSE_SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[ELSE_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[FLOW1:.LBB[0-9_]+]] |
| ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: |
| @@ -293,7 +287,7 @@ bb.outer.end: ; preds = %bb, %bb.then, %b |
| ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_IF_OUTER_ELSE_SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_IF_OUTER_ELSE_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[THEN_OUTER_FLOW:.LBB[0-9_]+]] |
| ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: |
| @@ -363,7 +357,7 @@ bb.outer.end: |
| ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR:v[0-9]+]], s{{[0-9]+}}, [[SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-DAG: s_and_b64 s[{{[0-9:]+}}] |
| +; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[ENDIF:.LBB[0-9_]+]] |
| ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: |
| @@ -425,81 +419,61 @@ bb.end: ; preds = %bb.then, %bb |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0:[0-9]+]] |
| ; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 |
| -; GCN-O0-NEXT: buffer_store_dword [[VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill |
| -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] |
| ; GCN-O0: [[INNER_LOOP:.LBB[0-9]+_[0-9]+]]: |
| -; GCN-O0: buffer_load_dword [[RESTORED_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload |
| -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] |
| -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] |
| -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] |
| -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]] |
| +; GCN-O0: buffer_load_dword |
| +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] |
| +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] |
| +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] |
| +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]] |
| ; GCN-O0: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0:[0-9]+]] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0:[0-9]+]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1:[0-9]+]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] |
| ; GCN-O0-NEXT: s_mov_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] |
| -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 |
| -; GCN-O0-NEXT: buffer_store_dword [[RESTORED_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill |
| -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] |
| ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execnz [[INNER_LOOP]] |
| ; GCN-O0-NEXT: ; %bb.{{[0-9]+}}: |
| -; GCN-O0: buffer_load_dword [[RESTORED_1_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload |
| -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_1_VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0]] |
| -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_1_VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1]] |
| +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_0]] |
| +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_OUT_EXEC_SPILL_LANE_1]] |
| ; GCN-O0-NEXT: s_or_b64 exec, exec, s[{{[0-9:]+}}] |
| ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_1_VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_0:[0-9]+]] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_1_VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 |
| -; GCN-O0-NEXT: buffer_store_dword [[RESTORED_1_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill |
| -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_0:[0-9]+]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW2_IN_EXEC_SPILL_LANE_1:[0-9]+]] |
| ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[FLOW2:.LBB[0-9_]+]] |
| ; GCN-O0: {{^}}[[FLOW2]]: |
| -; GCN-O0: buffer_load_dword [[RESTORED_2_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload |
| -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_2_VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_0]] |
| -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_2_VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_1]] |
| +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_0]] |
| +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW2_IN_EXEC_SPILL_LANE_1]] |
| ; GCN-O0: s_branch [[FLOW:.LBB[0-9_]+]] |
| ; GCN-O0: {{^}}[[FLOW]]: |
| -; GCN-O0: buffer_load_dword [[RESTORED_3_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload |
| ; GCN-O0: s_mov_b64 s[{{[0-9:]+}}], exec |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_3_VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_0:[0-9]+]] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_3_VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 |
| -; GCN-O0-NEXT: buffer_store_dword [[RESTORED_3_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill |
| -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_0:[0-9]+]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW3_IN_EXEC_SPILL_LANE_1:[0-9]+]] |
| ; GCN-O0-NEXT: s_and_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_mov_b64 exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execz [[FLOW3:.LBB[0-9_]+]] |
| ; GCN-O0: ; %bb.{{[0-9]+}}: |
| -; GCN-O0: buffer_load_dword [[RESTORED_4_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_4_VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_0:[0-9]+]] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_4_VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_1:[0-9]+]] |
| -; GCN-O0-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 |
| -; GCN-O0-NEXT: buffer_store_dword [[RESTORED_4_VGPR]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Spill |
| -; GCN-O0-NEXT: s_mov_b64 exec, [[EXEC_COPY]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_0:[0-9]+]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[FLOW1_OUT_EXEC_SPILL_LANE_1:[0-9]+]] |
| ; GCN-O0: {{^}}[[FLOW3]]: |
| ; GCN-O0-COUNT-4: buffer_load_dword |
| -; GCN-O0: buffer_load_dword [[RESTORED_5_VGPR:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload |
| -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_0]] |
| -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_1]] |
| -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_0]] |
| -; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[RESTORED_5_VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_1]] |
| +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_0]] |
| +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[OUTER_LOOP_EXEC_SPILL_LANE_1]] |
| +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_0]] |
| +; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[FLOW1_OUT_EXEC_SPILL_LANE_1]] |
| ; GCN-O0: s_and_b64 s[{{[0-9:]+}}], exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] |
| ; GCN-O0-COUNT-2: s_mov_b64 |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] |
| -; GCN-O0-DAG: v_writelane_b32 [[RESTORED_5_VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]] |
| +; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]] |
| ; GCN-O0-COUNT-4: buffer_store_dword |
| ; GCN-O0: s_andn2_b64 exec, exec, s[{{[0-9:]+}}] |
| ; GCN-O0-NEXT: s_cbranch_execnz [[INNER_LOOP]] |
| diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll |
| index 71dade0f278d..a2f83301f2a1 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll |
| @@ -10,7 +10,7 @@ |
| |
| |
| ; GCN-LABEL: {{^}}divergent_if_endif: |
| -; VGPR: workitem_private_segment_byte_size = 16{{$}} |
| +; VGPR: workitem_private_segment_byte_size = 12{{$}} |
| |
| |
| ; GCN: {{^}}; %bb.0: |
| @@ -19,7 +19,7 @@ |
| |
| ; Spill load |
| ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill |
| -; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s{{[0-9]+}} |
| +; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, s{{[0-9]+}} |
| |
| ; Spill saved exec |
| ; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec |
| @@ -82,13 +82,13 @@ endif: |
| } |
| |
| ; GCN-LABEL: {{^}}divergent_loop: |
| -; VGPR: workitem_private_segment_byte_size = 20{{$}} |
| +; VGPR: workitem_private_segment_byte_size = 16{{$}} |
| |
| ; GCN: {{^}}; %bb.0: |
| ; GCN-DAG: s_mov_b32 m0, -1 |
| ; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}} |
| ; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]] |
| -; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s{{[0-9]+}} |
| +; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v0, s{{[0-9]+}} |
| |
| ; Spill load |
| ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill |
| @@ -166,7 +166,7 @@ end: |
| ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill |
| |
| ; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0 |
| -; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, [[ZERO]] |
| +; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, [[ZERO]] |
| |
| ; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec |
| ; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]] |
| @@ -187,7 +187,6 @@ end: |
| ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]] |
| |
| ; GCN: [[FLOW]]: ; %Flow |
| -; VGPR: buffer_load_dword |
| ; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload |
| ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] |
| ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] |
| diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll |
| index 28e82208f53a..b3251e835b07 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll |
| @@ -33,7 +33,6 @@ define float @call_split_type_used_outside_block_v2f32() #0 { |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[18:19] |
| -; GCN-NEXT: ; implicit-def: $vgpr40 |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| ; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v41, s16, 0 |
| @@ -72,7 +71,6 @@ define float @call_split_type_used_outside_block_v3f32() #0 { |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[18:19] |
| -; GCN-NEXT: ; implicit-def: $vgpr40 |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| ; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v41, s16, 0 |
| @@ -111,7 +109,6 @@ define half @call_split_type_used_outside_block_v4f16() #0 { |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[18:19] |
| -; GCN-NEXT: ; implicit-def: $vgpr40 |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| ; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v41, s16, 0 |
| @@ -150,7 +147,6 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 { |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[18:19] |
| -; GCN-NEXT: ; implicit-def: $vgpr40 |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| ; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v41, s16, 0 |
| diff --git a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir |
| index 2a96f5eef8c9..aed642d1f067 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/csr-sgpr-spill-live-ins.mir |
| @@ -19,11 +19,10 @@ body: | |
| ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec |
| ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) |
| ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 |
| - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr42, 0, killed $vgpr0 |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr43, 1, killed $vgpr0 |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr46, 2, killed $vgpr0 |
| - ; CHECK-NEXT: dead renamable $vgpr0 = V_WRITELANE_B32 $sgpr47, 3, killed $vgpr0 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr42, 0, $vgpr0 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr43, 1, $vgpr0 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr46, 2, $vgpr0 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr47, 3, $vgpr0 |
| ; CHECK-NEXT: S_NOP 0 |
| ; CHECK-NEXT: {{ $}} |
| ; CHECK-NEXT: bb.1: |
| diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll |
| index 060641476d2f..0807a567a412 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll |
| @@ -19,26 +19,25 @@ define weak_odr void @test(i32 %0) !dbg !34 { |
| ; CHECK-NEXT: s_mov_b32 s16, s33 |
| ; CHECK-NEXT: s_mov_b32 s33, s32 |
| ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 |
| -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_mov_b64 exec, s[18:19] |
| -; CHECK-NEXT: ; implicit-def: $vgpr41 |
| +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 |
| +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 |
| +; CHECK-NEXT: v_writelane_b32 v40, s34, 2 |
| +; CHECK-NEXT: v_writelane_b32 v40, s35, 3 |
| +; CHECK-NEXT: v_writelane_b32 v40, s36, 4 |
| +; CHECK-NEXT: v_writelane_b32 v40, s37, 5 |
| +; CHECK-NEXT: v_writelane_b32 v40, s38, 6 |
| +; CHECK-NEXT: v_writelane_b32 v40, s39, 7 |
| +; CHECK-NEXT: v_writelane_b32 v40, s40, 8 |
| +; CHECK-NEXT: v_writelane_b32 v40, s41, 9 |
| +; CHECK-NEXT: v_writelane_b32 v40, s42, 10 |
| +; CHECK-NEXT: v_writelane_b32 v40, s43, 11 |
| +; CHECK-NEXT: v_writelane_b32 v40, s44, 12 |
| ; CHECK-NEXT: s_addk_i32 s32, 0x400 |
| -; CHECK-NEXT: v_writelane_b32 v41, s30, 0 |
| -; CHECK-NEXT: v_writelane_b32 v41, s31, 1 |
| -; CHECK-NEXT: v_writelane_b32 v41, s34, 2 |
| -; CHECK-NEXT: v_writelane_b32 v41, s35, 3 |
| -; CHECK-NEXT: v_writelane_b32 v41, s36, 4 |
| -; CHECK-NEXT: v_writelane_b32 v41, s37, 5 |
| -; CHECK-NEXT: v_writelane_b32 v41, s38, 6 |
| -; CHECK-NEXT: v_writelane_b32 v41, s39, 7 |
| -; CHECK-NEXT: v_writelane_b32 v41, s40, 8 |
| -; CHECK-NEXT: v_writelane_b32 v41, s41, 9 |
| -; CHECK-NEXT: v_writelane_b32 v41, s42, 10 |
| -; CHECK-NEXT: v_writelane_b32 v41, s43, 11 |
| -; CHECK-NEXT: v_writelane_b32 v41, s44, 12 |
| -; CHECK-NEXT: v_writelane_b32 v41, s45, 13 |
| -; CHECK-NEXT: v_writelane_b32 v41, s46, 14 |
| +; CHECK-NEXT: v_writelane_b32 v40, s45, 13 |
| +; CHECK-NEXT: v_writelane_b32 v40, s46, 14 |
| ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] |
| ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- undef |
| ; CHECK-NEXT: .Ltmp0: |
| @@ -46,12 +45,12 @@ define weak_odr void @test(i32 %0) !dbg !34 { |
| ; CHECK-NEXT: s_getpc_b64 s[4:5] |
| ; CHECK-NEXT: s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12 |
| -; CHECK-NEXT: v_writelane_b32 v41, s47, 15 |
| +; CHECK-NEXT: v_writelane_b32 v40, s47, 15 |
| ; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 |
| ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] |
| ; CHECK-NEXT: v_writelane_b32 v42, s16, 0 |
| -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; CHECK-NEXT: v_mov_b32_e32 v40, v31 |
| +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; CHECK-NEXT: v_mov_b32_e32 v41, v31 |
| ; CHECK-NEXT: s_mov_b32 s42, s15 |
| ; CHECK-NEXT: s_mov_b32 s43, s14 |
| ; CHECK-NEXT: s_mov_b32 s44, s13 |
| @@ -69,33 +68,33 @@ define weak_odr void @test(i32 %0) !dbg !34 { |
| ; CHECK-NEXT: s_mov_b32 s13, s44 |
| ; CHECK-NEXT: s_mov_b32 s14, s43 |
| ; CHECK-NEXT: s_mov_b32 s15, s42 |
| -; CHECK-NEXT: v_mov_b32_e32 v31, v40 |
| +; CHECK-NEXT: v_mov_b32_e32 v31, v41 |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47] |
| ; CHECK-NEXT: .Ltmp1: |
| ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- [$vgpr0_vgpr1+0] |
| ; CHECK-NEXT: .loc 1 0 9 is_stmt 0 ; dummy:0:9 |
| -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; CHECK-NEXT: v_mov_b32_e32 v2, 0 |
| ; CHECK-NEXT: flat_store_dword v[0:1], v2 |
| -; CHECK-NEXT: v_readlane_b32 s47, v41, 15 |
| -; CHECK-NEXT: v_readlane_b32 s46, v41, 14 |
| -; CHECK-NEXT: v_readlane_b32 s45, v41, 13 |
| -; CHECK-NEXT: v_readlane_b32 s44, v41, 12 |
| -; CHECK-NEXT: v_readlane_b32 s43, v41, 11 |
| -; CHECK-NEXT: v_readlane_b32 s42, v41, 10 |
| -; CHECK-NEXT: v_readlane_b32 s41, v41, 9 |
| -; CHECK-NEXT: v_readlane_b32 s40, v41, 8 |
| -; CHECK-NEXT: v_readlane_b32 s39, v41, 7 |
| -; CHECK-NEXT: v_readlane_b32 s38, v41, 6 |
| -; CHECK-NEXT: v_readlane_b32 s37, v41, 5 |
| -; CHECK-NEXT: v_readlane_b32 s36, v41, 4 |
| -; CHECK-NEXT: v_readlane_b32 s35, v41, 3 |
| -; CHECK-NEXT: v_readlane_b32 s34, v41, 2 |
| -; CHECK-NEXT: v_readlane_b32 s31, v41, 1 |
| -; CHECK-NEXT: v_readlane_b32 s30, v41, 0 |
| +; CHECK-NEXT: v_readlane_b32 s47, v40, 15 |
| +; CHECK-NEXT: v_readlane_b32 s46, v40, 14 |
| +; CHECK-NEXT: v_readlane_b32 s45, v40, 13 |
| +; CHECK-NEXT: v_readlane_b32 s44, v40, 12 |
| +; CHECK-NEXT: v_readlane_b32 s43, v40, 11 |
| +; CHECK-NEXT: v_readlane_b32 s42, v40, 10 |
| +; CHECK-NEXT: v_readlane_b32 s41, v40, 9 |
| +; CHECK-NEXT: v_readlane_b32 s40, v40, 8 |
| +; CHECK-NEXT: v_readlane_b32 s39, v40, 7 |
| +; CHECK-NEXT: v_readlane_b32 s38, v40, 6 |
| +; CHECK-NEXT: v_readlane_b32 s37, v40, 5 |
| +; CHECK-NEXT: v_readlane_b32 s36, v40, 4 |
| +; CHECK-NEXT: v_readlane_b32 s35, v40, 3 |
| +; CHECK-NEXT: v_readlane_b32 s34, v40, 2 |
| +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 |
| +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 |
| ; CHECK-NEXT: v_readlane_b32 s4, v42, 0 |
| ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_mov_b64 exec, s[6:7] |
| ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 |
| diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll |
| index dd8b39d11071..97383490841e 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll |
| @@ -25,7 +25,6 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 { |
| ; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 |
| ; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 |
| ; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 |
| -; GCN-NEXT: ; implicit-def: $vgpr42 |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: v_writelane_b32 v42, s30, 0 |
| diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll |
| index 45d685d7664e..f69602bdc39b 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll |
| @@ -117,15 +117,21 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { |
| ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 |
| ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 |
| ; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 |
| -; FLAT_SCR_OPT-NEXT: ; implicit-def: $vgpr0 |
| +; FLAT_SCR_OPT-NEXT: s_mov_b32 s104, exec_lo |
| +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3 |
| +; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, 0 |
| +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v72, s4 |
| ; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) |
| -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0 |
| -; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1 |
| -; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 |
| -; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 4 |
| -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill |
| +; FLAT_SCR_OPT-NEXT: v_writelane_b32 v72, s2, 0 |
| +; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, 4 |
| +; FLAT_SCR_OPT-NEXT: v_writelane_b32 v72, s3, 1 |
| +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v72, s4 ; 4-byte Folded Spill |
| ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 |
| -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 |
| +; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, 0 |
| +; FLAT_SCR_OPT-NEXT: scratch_load_dword v72, off, s4 |
| +; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) |
| +; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 |
| +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s104 |
| ; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 |
| ; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr0_sgpr1 |
| ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART |
| @@ -222,14 +228,22 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { |
| ; FLAT_SCR_OPT-NEXT: ;;#ASMEND |
| ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART |
| ; FLAT_SCR_OPT-NEXT: ;;#ASMEND |
| -; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 |
| -; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 4 |
| -; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload |
| +; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, exec_lo |
| +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3 |
| +; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0 |
| +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v2, s3 |
| +; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 |
| +; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 4 |
| +; FLAT_SCR_OPT-NEXT: scratch_load_dword v2, off, s3 ; 4-byte Folded Reload |
| ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 |
| -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 |
| +; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0 |
| ; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) |
| -; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v1, 0 |
| -; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v1, 1 |
| +; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v2, 0 |
| +; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v2, 1 |
| +; FLAT_SCR_OPT-NEXT: scratch_load_dword v2, off, s3 |
| +; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) |
| +; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 |
| +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s2 |
| ; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, 0 |
| ; FLAT_SCR_OPT-NEXT: global_store_dword v1, v0, s[0:1] |
| ; FLAT_SCR_OPT-NEXT: s_endpgm |
| @@ -237,15 +251,21 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { |
| ; FLAT_SCR_ARCH-LABEL: test: |
| ; FLAT_SCR_ARCH: ; %bb.0: |
| ; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 |
| -; FLAT_SCR_ARCH-NEXT: ; implicit-def: $vgpr0 |
| +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s104, exec_lo |
| +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3 |
| +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, 0 |
| +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v72, s4 |
| ; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) |
| -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0 |
| -; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1 |
| -; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 |
| -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 4 |
| -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill |
| +; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v72, s2, 0 |
| +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, 4 |
| +; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v72, s3, 1 |
| +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v72, s4 ; 4-byte Folded Spill |
| ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 |
| -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 |
| +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, 0 |
| +; FLAT_SCR_ARCH-NEXT: scratch_load_dword v72, off, s4 |
| +; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) |
| +; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 |
| +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s104 |
| ; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 |
| ; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr0_sgpr1 |
| ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART |
| @@ -342,14 +362,22 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { |
| ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND |
| ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART |
| ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND |
| -; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 |
| -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 4 |
| -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload |
| +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, exec_lo |
| +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3 |
| +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0 |
| +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v2, s3 |
| +; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 |
| +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 4 |
| +; FLAT_SCR_ARCH-NEXT: scratch_load_dword v2, off, s3 ; 4-byte Folded Reload |
| ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 |
| -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 |
| +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0 |
| ; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) |
| -; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v1, 0 |
| -; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v1, 1 |
| +; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v2, 0 |
| +; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v2, 1 |
| +; FLAT_SCR_ARCH-NEXT: scratch_load_dword v2, off, s3 |
| +; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) |
| +; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 |
| +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s2 |
| ; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, 0 |
| ; FLAT_SCR_ARCH-NEXT: global_store_dword v1, v0, s[0:1] |
| ; FLAT_SCR_ARCH-NEXT: s_endpgm |
| diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir |
| index f8c3629764ea..aca1351de3ef 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir |
| @@ -12,13 +12,14 @@ machineFunctionInfo: |
| body: | |
| bb.0: |
| ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo |
| - ; CHECK: S_NOP 0, implicit-def $exec_lo |
| + ; CHECK: liveins: $vgpr0 |
| + ; CHECK-NEXT: {{ $}} |
| + ; CHECK-NEXT: S_NOP 0, implicit-def $exec_lo |
| ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_lo |
| - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 |
| ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 |
| ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 |
| - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 |
| + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 |
| ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 |
| ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec |
| S_NOP 0, implicit-def $exec_lo |
| @@ -37,13 +38,14 @@ machineFunctionInfo: |
| body: | |
| bb.0: |
| ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi |
| - ; CHECK: S_NOP 0, implicit-def $exec_hi |
| + ; CHECK: liveins: $vgpr0 |
| + ; CHECK-NEXT: {{ $}} |
| + ; CHECK-NEXT: S_NOP 0, implicit-def $exec_hi |
| ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_hi |
| - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 |
| ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 |
| ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 |
| - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 |
| + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 |
| ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 |
| ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec |
| S_NOP 0, implicit-def $exec_hi |
| @@ -62,16 +64,17 @@ machineFunctionInfo: |
| body: | |
| bb.0: |
| ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec |
| - ; CHECK: S_NOP 0, implicit-def $exec |
| + ; CHECK: liveins: $vgpr0 |
| + ; CHECK-NEXT: {{ $}} |
| + ; CHECK-NEXT: S_NOP 0, implicit-def $exec |
| ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec |
| - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 |
| ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 |
| ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 |
| ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1 |
| ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 |
| - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 |
| + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 |
| ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 |
| ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec |
| S_NOP 0, implicit-def $exec |
| @@ -93,12 +96,13 @@ machineFunctionInfo: |
| body: | |
| bb.0: |
| ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo |
| - ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo |
| - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 |
| + ; CHECK: liveins: $vgpr0 |
| + ; CHECK-NEXT: {{ $}} |
| + ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 |
| ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 |
| ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 |
| - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 |
| + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 |
| ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 |
| ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec |
| S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_lo |
| @@ -116,12 +120,13 @@ machineFunctionInfo: |
| body: | |
| bb.0: |
| ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi |
| - ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi |
| - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 |
| + ; CHECK: liveins: $vgpr0 |
| + ; CHECK-NEXT: {{ $}} |
| + ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 |
| ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 |
| ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 |
| - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 |
| + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 |
| ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 |
| ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec |
| S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_hi |
| @@ -139,15 +144,16 @@ machineFunctionInfo: |
| body: | |
| bb.0: |
| ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec |
| - ; CHECK: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec |
| - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 |
| + ; CHECK: liveins: $vgpr0 |
| + ; CHECK-NEXT: {{ $}} |
| + ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 |
| ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 |
| ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 |
| ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1 |
| ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 |
| - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 |
| + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 |
| ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 |
| ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec |
| S_NOP 0, implicit-def %0:sreg_64, implicit-def %1:sreg_64, implicit-def $exec |
| diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir |
| index 5732e43b3c42..e8688d8f55d0 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir |
| @@ -13,13 +13,14 @@ body: | |
| bb.0: |
| |
| ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0 |
| - ; CHECK: S_NOP 0, implicit-def $m0 |
| + ; CHECK: liveins: $vgpr0 |
| + ; CHECK-NEXT: {{ $}} |
| + ; CHECK-NEXT: S_NOP 0, implicit-def $m0 |
| ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $m0 |
| - ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 |
| ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 |
| ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 |
| - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 |
| + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 |
| ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 |
| ; CHECK-NEXT: S_NOP 0 |
| ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec |
| @@ -43,12 +44,13 @@ body: | |
| bb.0: |
| |
| ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0 |
| - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF |
| + ; CHECK: liveins: $vgpr0 |
| + ; CHECK-NEXT: {{ $}} |
| ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0 |
| - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 |
| ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 |
| ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 |
| - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 |
| + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 |
| ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 |
| ; CHECK-NEXT: S_NOP 0 |
| ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec |
| diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll |
| index 52eab573ea44..e7cab2606aa8 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll |
| @@ -16,7 +16,6 @@ define void @callee_with_stack_and_call() #0 { |
| ; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9] |
| -; SPILL-TO-VGPR-NEXT: ; implicit-def: $vgpr40 |
| ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 |
| ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 |
| ; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 |
| diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll |
| index 1a4577ea2e1c..7c9d01db9c2c 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll |
| @@ -8,13 +8,11 @@ define amdgpu_gfx void @gfx_func() { |
| ; SDAG-LABEL: gfx_func: |
| ; SDAG: ; %bb.0: |
| ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; SDAG-NEXT: s_mov_b32 s38, s33 |
| +; SDAG-NEXT: s_mov_b32 s36, s33 |
| ; SDAG-NEXT: s_mov_b32 s33, s32 |
| ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; SDAG-NEXT: s_mov_b64 exec, s[34:35] |
| -; SDAG-NEXT: ; implicit-def: $vgpr40 |
| -; SDAG-NEXT: s_addk_i32 s32, 0x400 |
| ; SDAG-NEXT: v_writelane_b32 v40, s4, 0 |
| ; SDAG-NEXT: v_writelane_b32 v40, s5, 1 |
| ; SDAG-NEXT: v_writelane_b32 v40, s6, 2 |
| @@ -35,6 +33,7 @@ define amdgpu_gfx void @gfx_func() { |
| ; SDAG-NEXT: v_writelane_b32 v40, s21, 17 |
| ; SDAG-NEXT: v_writelane_b32 v40, s22, 18 |
| ; SDAG-NEXT: v_writelane_b32 v40, s23, 19 |
| +; SDAG-NEXT: s_addk_i32 s32, 0x400 |
| ; SDAG-NEXT: v_writelane_b32 v40, s24, 20 |
| ; SDAG-NEXT: v_writelane_b32 v40, s25, 21 |
| ; SDAG-NEXT: s_getpc_b64 s[34:35] |
| @@ -82,20 +81,18 @@ define amdgpu_gfx void @gfx_func() { |
| ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; SDAG-NEXT: s_mov_b64 exec, s[34:35] |
| ; SDAG-NEXT: s_addk_i32 s32, 0xfc00 |
| -; SDAG-NEXT: s_mov_b32 s33, s38 |
| +; SDAG-NEXT: s_mov_b32 s33, s36 |
| ; SDAG-NEXT: s_waitcnt vmcnt(0) |
| ; SDAG-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GISEL-LABEL: gfx_func: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GISEL-NEXT: s_mov_b32 s38, s33 |
| +; GISEL-NEXT: s_mov_b32 s36, s33 |
| ; GISEL-NEXT: s_mov_b32 s33, s32 |
| ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GISEL-NEXT: s_mov_b64 exec, s[34:35] |
| -; GISEL-NEXT: ; implicit-def: $vgpr40 |
| -; GISEL-NEXT: s_addk_i32 s32, 0x400 |
| ; GISEL-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GISEL-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GISEL-NEXT: v_writelane_b32 v40, s6, 2 |
| @@ -116,6 +113,7 @@ define amdgpu_gfx void @gfx_func() { |
| ; GISEL-NEXT: v_writelane_b32 v40, s21, 17 |
| ; GISEL-NEXT: v_writelane_b32 v40, s22, 18 |
| ; GISEL-NEXT: v_writelane_b32 v40, s23, 19 |
| +; GISEL-NEXT: s_addk_i32 s32, 0x400 |
| ; GISEL-NEXT: v_writelane_b32 v40, s24, 20 |
| ; GISEL-NEXT: v_writelane_b32 v40, s25, 21 |
| ; GISEL-NEXT: s_getpc_b64 s[34:35] |
| @@ -163,7 +161,7 @@ define amdgpu_gfx void @gfx_func() { |
| ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GISEL-NEXT: s_mov_b64 exec, s[34:35] |
| ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GISEL-NEXT: s_mov_b32 s33, s38 |
| +; GISEL-NEXT: s_mov_b32 s33, s36 |
| ; GISEL-NEXT: s_waitcnt vmcnt(0) |
| ; GISEL-NEXT: s_setpc_b64 s[30:31] |
| call void @extern_c_func() |
| diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll |
| index f0324ae9ff58..8da0dc3c0e0e 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll |
| @@ -103,7 +103,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| @@ -137,16 +136,15 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12 |
| -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -173,18 +171,16 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 |
| -; GFX11-NEXT: scratch_store_b8 off, v0, s32 |
| ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: scratch_store_b8 off, v0, s32 |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -209,16 +205,15 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 |
| -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -249,7 +244,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -257,7 +251,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { |
| ; GFX9-NEXT: s_getpc_b64 s[34:35] |
| ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 |
| -; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 |
| ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| @@ -286,15 +279,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| -; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 |
| ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| @@ -325,15 +316,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 |
| ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| -; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 |
| ; GFX11-NEXT: scratch_store_b8 off, v0, s32 |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| @@ -363,15 +352,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc |
| ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| -; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 |
| ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| @@ -405,7 +392,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -413,7 +399,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { |
| ; GFX9-NEXT: s_getpc_b64 s[34:35] |
| ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 |
| -; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 |
| ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| @@ -442,15 +427,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| -; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 |
| ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| @@ -481,15 +464,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 |
| ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| -; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 |
| ; GFX11-NEXT: scratch_store_b8 off, v0, s32 |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| @@ -519,15 +500,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc |
| ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| -; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 |
| ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| @@ -559,7 +538,6 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b |
| @@ -592,9 +570,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| @@ -627,9 +604,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| @@ -662,9 +638,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| @@ -701,7 +676,6 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -735,9 +709,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4 |
| @@ -771,9 +744,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_i8 v0, v[0:1], off glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 |
| @@ -807,9 +779,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc |
| ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 |
| @@ -846,7 +817,6 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -880,9 +850,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4 |
| @@ -916,9 +885,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 |
| @@ -952,9 +920,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc |
| ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 |
| @@ -989,7 +956,6 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b |
| @@ -1022,9 +988,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| @@ -1057,9 +1022,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| @@ -1092,9 +1056,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| @@ -1131,7 +1094,6 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -1165,9 +1127,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4 |
| @@ -1201,9 +1162,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 |
| @@ -1237,9 +1197,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc |
| ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 |
| @@ -1276,7 +1235,6 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -1310,9 +1268,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4 |
| @@ -1346,9 +1303,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 |
| @@ -1382,9 +1338,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc |
| ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 |
| @@ -1419,7 +1374,6 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 42 |
| @@ -1452,9 +1406,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| @@ -1487,9 +1440,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| @@ -1522,9 +1474,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| @@ -1559,7 +1510,6 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b |
| @@ -1593,16 +1543,15 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64@rel32@hi+12 |
| -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -1629,17 +1578,16 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -1664,16 +1612,15 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -1705,7 +1652,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -1739,9 +1685,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 |
| @@ -1776,9 +1721,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 |
| @@ -1813,9 +1757,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 { |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 |
| @@ -1851,7 +1794,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| @@ -1887,9 +1829,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 4 |
| @@ -1925,9 +1866,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| @@ -1961,9 +1901,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 |
| @@ -2004,7 +1943,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 1 |
| @@ -2040,9 +1978,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, 2 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| @@ -2079,9 +2016,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 |
| ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| @@ -2116,9 +2052,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 { |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| @@ -2161,7 +2096,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v4, 1 |
| @@ -2199,9 +2133,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v5, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v6, 3 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| @@ -2240,18 +2173,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 |
| ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 |
| ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -2278,9 +2210,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 { |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3 |
| ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| @@ -2321,7 +2252,6 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 |
| @@ -2354,9 +2284,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| @@ -2389,9 +2318,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| @@ -2424,9 +2352,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| @@ -2461,7 +2388,6 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 |
| @@ -2494,9 +2420,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| @@ -2529,9 +2454,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| @@ -2564,9 +2488,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| @@ -2601,7 +2524,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 |
| @@ -2635,16 +2557,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32@rel32@hi+12 |
| -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -2671,17 +2592,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -2706,16 +2626,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -2744,7 +2663,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 |
| @@ -2779,9 +2697,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| @@ -2816,18 +2733,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -2852,9 +2768,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| @@ -2891,7 +2806,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 |
| @@ -2928,9 +2842,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 |
| @@ -2967,9 +2880,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0 |
| ; GFX11-NEXT: v_mov_b32_e32 v4, 0.5 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| @@ -3004,9 +2916,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0 |
| @@ -3045,7 +2956,6 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| @@ -3079,16 +2989,15 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64@rel32@hi+12 |
| -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -3115,17 +3024,16 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -3150,16 +3058,15 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -3188,7 +3095,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| @@ -3224,9 +3130,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 |
| @@ -3262,9 +3167,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| @@ -3298,9 +3202,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 |
| @@ -3338,7 +3241,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| @@ -3376,9 +3278,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 |
| @@ -3416,9 +3317,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 |
| ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| @@ -3453,9 +3353,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 |
| @@ -3496,7 +3395,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -3529,9 +3427,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_dword v0, v[0:1], off |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4 |
| @@ -3564,9 +3461,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_b32 v0, v[0:1], off |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 |
| @@ -3599,9 +3495,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 |
| @@ -3637,7 +3532,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -3670,9 +3564,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 |
| @@ -3705,9 +3598,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 |
| @@ -3740,9 +3632,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 |
| @@ -3778,7 +3669,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -3811,9 +3701,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 |
| @@ -3846,9 +3735,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 |
| @@ -3881,9 +3769,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 |
| @@ -3918,7 +3805,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 |
| @@ -3952,16 +3838,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 3 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12 |
| -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -3988,17 +3873,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -4023,16 +3907,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -4061,7 +3944,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 |
| @@ -4095,16 +3977,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12 |
| -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -4131,18 +4012,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -4167,16 +4047,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -4206,7 +4085,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -4239,9 +4117,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 |
| @@ -4274,9 +4151,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 |
| @@ -4309,9 +4185,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 |
| @@ -4346,7 +4221,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 |
| @@ -4380,16 +4254,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12 |
| -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -4416,18 +4289,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -4452,16 +4324,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -4491,7 +4362,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_dword v0, v[0:1], off |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -4524,9 +4394,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_dword v0, v[0:1], off |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4 |
| @@ -4559,9 +4428,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_b32 v0, v[0:1], off |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 |
| @@ -4594,9 +4462,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 |
| @@ -4632,7 +4499,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -4665,9 +4531,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 |
| @@ -4700,9 +4565,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 |
| @@ -4735,9 +4599,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 |
| @@ -4772,7 +4635,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| @@ -4806,16 +4668,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12 |
| -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -4842,17 +4703,16 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -4877,16 +4737,15 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -4915,7 +4774,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 3 |
| @@ -4950,9 +4808,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 4 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 5 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| @@ -4987,18 +4844,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 |
| ; GFX11-NEXT: v_mov_b32_e32 v2, 5 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -5023,9 +4879,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| @@ -5062,7 +4917,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 3 |
| @@ -5098,9 +4952,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 4 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 5 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 6 |
| @@ -5136,9 +4989,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| @@ -5172,9 +5024,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6 |
| @@ -5213,7 +5064,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -5246,9 +5096,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 |
| @@ -5281,9 +5130,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 |
| @@ -5316,9 +5164,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 { |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 |
| @@ -5353,7 +5200,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| @@ -5389,9 +5235,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 4 |
| @@ -5427,9 +5272,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| @@ -5463,9 +5307,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 |
| @@ -5503,7 +5346,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| @@ -5540,9 +5382,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 4 |
| @@ -5579,9 +5420,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 |
| ; GFX11-NEXT: v_mov_b32_e32 v4, 5 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| @@ -5616,9 +5456,8 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 |
| @@ -5660,7 +5499,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX9-NEXT: v_mov_b32_e32 v8, 0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -5697,9 +5535,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v8, 0 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35] |
| @@ -5737,9 +5574,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: v_mov_b32_e32 v4, 0 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] |
| @@ -5777,9 +5613,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 { |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-SCRATCH-NEXT: s_clause 0x1 |
| ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] |
| @@ -5818,7 +5653,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| @@ -5858,9 +5692,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 4 |
| @@ -5900,9 +5733,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 |
| ; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6 |
| ; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8 |
| @@ -5938,9 +5770,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 |
| @@ -5985,7 +5816,6 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX9-NEXT: v_mov_b32_e32 v16, 0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -6024,9 +5854,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v16, 0 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_clause 0x3 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v16, s[34:35] |
| @@ -6066,9 +5895,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: v_mov_b32_e32 v12, 0 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_clause 0x3 |
| ; GFX11-NEXT: global_load_b128 v[0:3], v12, s[0:1] |
| @@ -6108,9 +5936,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 { |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-SCRATCH-NEXT: s_clause 0x3 |
| ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] |
| @@ -6154,7 +5981,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX9-NEXT: v_mov_b32_e32 v28, 0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -6198,9 +6024,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v32, 0 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_clause 0x7 |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v32, s[34:35] |
| @@ -6244,9 +6069,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: v_mov_b32_e32 v28, 0 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_clause 0x7 |
| ; GFX11-NEXT: global_load_b128 v[0:3], v28, s[0:1] |
| @@ -6290,9 +6114,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 { |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-SCRATCH-NEXT: s_clause 0x7 |
| ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] |
| @@ -6341,7 +6164,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX9-NEXT: v_mov_b32_e32 v28, 0 |
| ; GFX9-NEXT: global_load_dword v32, v[0:1], off |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: global_load_dwordx4 v[0:3], v28, s[34:35] |
| @@ -6387,9 +6209,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v32, 0 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: global_load_dword v33, v[0:1], off |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_clause 0x7 |
| @@ -6436,9 +6257,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: v_mov_b32_e32 v28, 0 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: global_load_b32 v32, v[0:1], off |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_clause 0x7 |
| @@ -6484,9 +6304,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: global_load_dword v33, v[0:1], off |
| ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-SCRATCH-NEXT: s_clause 0x7 |
| @@ -6532,32 +6351,31 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou |
| ; GFX9-NEXT: s_mov_b32 s34, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 |
| -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr42 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x800 |
| -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GFX9-NEXT: v_writelane_b32 v42, s30, 0 |
| -; GFX9-NEXT: v_mov_b32_e32 v40, v0 |
| +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX9-NEXT: v_mov_b32_e32 v41, v0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX9-NEXT: v_writelane_b32 v43, s34, 0 |
| -; GFX9-NEXT: v_writelane_b32 v42, s31, 1 |
| -; GFX9-NEXT: v_mov_b32_e32 v41, v1 |
| +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX9-NEXT: v_mov_b32_e32 v42, v1 |
| ; GFX9-NEXT: s_getpc_b64 s[34:35] |
| ; GFX9-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| -; GFX9-NEXT: global_store_dword v[40:41], v0, off |
| +; GFX9-NEXT: global_store_dword v[41:42], v0, off |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| -; GFX9-NEXT: v_readlane_b32 s31, v42, 1 |
| -; GFX9-NEXT: v_readlane_b32 s30, v42, 0 |
| +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s34, v43, 0 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 |
| -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xf800 |
| @@ -6572,35 +6390,34 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou |
| ; GFX10-NEXT: s_mov_b32 s34, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 |
| -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr42 |
| -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GFX10-NEXT: v_writelane_b32 v42, s30, 0 |
| -; GFX10-NEXT: v_mov_b32_e32 v40, v0 |
| +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v41, v0 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX10-NEXT: v_writelane_b32 v43, s34, 0 |
| -; GFX10-NEXT: v_writelane_b32 v42, s31, 1 |
| -; GFX10-NEXT: v_mov_b32_e32 v41, v1 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX10-NEXT: v_mov_b32_e32 v42, v1 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| -; GFX10-NEXT: global_store_dword v[40:41], v0, off |
| +; GFX10-NEXT: global_store_dword v[41:42], v0, off |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_clause 0x1 |
| -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 |
| -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 |
| -; GFX10-NEXT: v_readlane_b32 s31, v42, 1 |
| -; GFX10-NEXT: v_readlane_b32 s30, v42, 0 |
| +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 |
| +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 |
| +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s34, v43, 0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 |
| ; GFX10-NEXT: s_clause 0x1 |
| -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 |
| +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 |
| ; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| @@ -6617,35 +6434,34 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou |
| ; GFX11-NEXT: s_mov_b32 s33, s32 |
| ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 |
| ; GFX11-NEXT: s_clause 0x1 |
| -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 |
| +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 |
| ; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:12 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr42 |
| ; GFX11-NEXT: s_clause 0x1 |
| -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 |
| -; GFX11-NEXT: scratch_store_b32 off, v41, s33 |
| -; GFX11-NEXT: v_writelane_b32 v42, s30, 0 |
| -; GFX11-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0 |
| +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| +; GFX11-NEXT: scratch_store_b32 off, v42, s33 |
| +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0 |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX11-NEXT: v_writelane_b32 v43, s0, 0 |
| -; GFX11-NEXT: v_writelane_b32 v42, s31, 1 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: global_store_b32 v[40:41], v0, off dlc |
| +; GFX11-NEXT: global_store_b32 v[41:42], v0, off dlc |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: s_clause 0x1 |
| -; GFX11-NEXT: scratch_load_b32 v41, off, s33 |
| -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 |
| -; GFX11-NEXT: v_readlane_b32 s31, v42, 1 |
| -; GFX11-NEXT: v_readlane_b32 s30, v42, 0 |
| +; GFX11-NEXT: scratch_load_b32 v42, off, s33 |
| +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 |
| +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v43, 0 |
| ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 |
| ; GFX11-NEXT: s_clause 0x1 |
| -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 |
| +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 |
| ; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:12 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 |
| @@ -6660,35 +6476,34 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 |
| -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill |
| +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s33 offset:12 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr42 |
| -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill |
| -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, v0 |
| +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v43, s0, 0 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v1 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off |
| +; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-SCRATCH-NEXT: s_clause 0x1 |
| -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 |
| -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 |
| -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 |
| -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 |
| +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 |
| +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 |
| +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v43, 0 |
| ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 |
| ; GFX10-SCRATCH-NEXT: s_clause 0x1 |
| -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 |
| +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 |
| ; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s33 offset:12 |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| @@ -6714,7 +6529,6 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX9-NEXT: v_mov_b32_e32 v2, 0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -6751,9 +6565,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: global_load_ubyte v0, v2, s[34:35] |
| @@ -6791,9 +6604,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: v_mov_b32_e32 v1, 0 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: global_load_u8 v0, v1, s[0:1] |
| @@ -6831,9 +6643,8 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 { |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-SCRATCH-NEXT: s_clause 0x1 |
| ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v2, s[0:1] |
| @@ -6873,7 +6684,6 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 3 |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 8 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x800 |
| @@ -6911,17 +6721,16 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 8 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 |
| ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 |
| ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12 |
| -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -6949,20 +6758,19 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: scratch_store_b8 off, v0, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4 |
| ; GFX11-NEXT: v_mov_b32_e32 v0, s33 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -6989,17 +6797,16 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 { |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -7036,7 +6843,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 8 |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 |
| ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x800 |
| @@ -7081,9 +6887,8 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 3 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 8 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 |
| ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 |
| @@ -7135,11 +6940,10 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 |
| ; GFX11-NEXT: s_add_i32 vcc_lo, s33, 8 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: scratch_store_b8 off, v0, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4 |
| -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX11-NEXT: v_dual_mov_b32 v0, vcc_lo :: v_dual_mov_b32 v1, s33 |
| ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| @@ -7183,10 +6987,9 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 vcc_lo, s33, 8 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, vcc_lo |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s33 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| @@ -7242,7 +7045,6 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| @@ -7297,9 +7099,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35] |
| @@ -7355,9 +7156,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: v_mov_b32_e32 v0, 0 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] |
| @@ -7409,9 +7209,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 { |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] |
| @@ -7462,7 +7261,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX9-LABEL: tail_call_byval_align16: |
| ; GFX9: ; %bb.0: ; %entry |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX9-NEXT: s_mov_b32 s8, s33 |
| +; GFX9-NEXT: s_mov_b32 s6, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill |
| @@ -7470,8 +7269,6 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 |
| ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 |
| ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x800 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -7502,6 +7299,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX9-NEXT: v_writelane_b32 v40, s59, 27 |
| ; GFX9-NEXT: v_writelane_b32 v40, s60, 28 |
| ; GFX9-NEXT: v_writelane_b32 v40, s61, 29 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x800 |
| ; GFX9-NEXT: v_writelane_b32 v40, s62, 30 |
| ; GFX9-NEXT: v_writelane_b32 v40, s63, 31 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| @@ -7548,7 +7346,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xf800 |
| -; GFX9-NEXT: s_mov_b32 s33, s8 |
| +; GFX9-NEXT: s_mov_b32 s33, s6 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| @@ -7556,7 +7354,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX10: ; %bb.0: ; %entry |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX10-NEXT: s_mov_b32 s7, s33 |
| +; GFX10-NEXT: s_mov_b32 s6, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill |
| @@ -7566,9 +7364,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20 |
| ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16 |
| ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12 |
| @@ -7645,7 +7442,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GFX10-NEXT: s_mov_b32 s33, s7 |
| +; GFX10-NEXT: s_mov_b32 s33, s6 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| ; |
| @@ -7653,7 +7450,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX11-NEXT: s_mov_b32 s5, s33 |
| +; GFX11-NEXT: s_mov_b32 s4, s33 |
| ; GFX11-NEXT: s_mov_b32 s33, s32 |
| ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:24 ; 4-byte Folded Spill |
| @@ -7661,9 +7458,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 offset:16 |
| ; GFX11-NEXT: scratch_load_b32 v31, off, s33 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 |
| @@ -7737,7 +7533,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s0 |
| ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 |
| -; GFX11-NEXT: s_mov_b32 s33, s5 |
| +; GFX11-NEXT: s_mov_b32 s33, s4 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| ; |
| @@ -7745,7 +7541,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX10-SCRATCH: ; %bb.0: ; %entry |
| ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX10-SCRATCH-NEXT: s_mov_b32 s5, s33 |
| +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, s33 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:24 ; 4-byte Folded Spill |
| @@ -7754,9 +7550,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX10-SCRATCH-NEXT: s_clause 0x1 |
| ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 offset:16 |
| ; GFX10-SCRATCH-NEXT: scratch_load_dword v31, off, s33 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12 |
| @@ -7831,7 +7626,7 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 |
| ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 |
| -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s5 |
| +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s4 |
| ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| @@ -7851,7 +7646,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 1 |
| @@ -7885,16 +7679,15 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12 |
| -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -7921,18 +7714,16 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 |
| -; GFX11-NEXT: scratch_store_b8 off, v0, s32 |
| ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: scratch_store_b8 off, v0, s32 |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v41, 0 |
| @@ -7957,16 +7748,15 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 |
| ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 |
| -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 |
| ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 |
| ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 |
| @@ -7995,9 +7785,8 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7b |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -8030,10 +7819,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_movk_i32 s4, 0x7b |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4 |
| @@ -8067,10 +7855,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_movk_i32 s4, 0x7b |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 |
| @@ -8104,10 +7891,9 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 |
| @@ -8143,9 +7929,8 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7b |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -8178,10 +7963,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_movk_i32 s4, 0x7b |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4 |
| @@ -8215,10 +7999,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_movk_i32 s4, 0x7b |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 |
| @@ -8252,10 +8035,9 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 |
| @@ -8291,9 +8073,8 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX9-NEXT: s_mov_b32 s4, 42 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -8326,10 +8107,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 42 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4 |
| @@ -8363,10 +8143,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 42 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 |
| @@ -8400,10 +8179,9 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 |
| @@ -8439,10 +8217,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x7b |
| ; GFX9-NEXT: s_mov_b32 s5, 0 |
| @@ -8477,10 +8254,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_movk_i32 s4, 0x7b |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4 |
| @@ -8517,10 +8293,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_movk_i32 s4, 0x7b |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 |
| @@ -8557,10 +8332,9 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 |
| @@ -8599,10 +8373,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| ; GFX9-NEXT: s_mov_b64 s[34:35], 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 |
| @@ -8641,9 +8414,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_mov_b64 s[34:35], 0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -8685,9 +8457,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -8729,9 +8500,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -8776,12 +8546,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 |
| ; GFX9-NEXT: s_mov_b32 s4, 1 |
| ; GFX9-NEXT: s_mov_b32 s5, 2 |
| @@ -8820,10 +8589,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 1 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4 |
| @@ -8866,10 +8634,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 1 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 |
| @@ -8912,10 +8679,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 |
| @@ -8960,10 +8726,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| ; GFX9-NEXT: s_mov_b64 s[34:35], 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 |
| @@ -9008,9 +8773,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_mov_b64 s[34:35], 0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -9058,9 +8822,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -9108,9 +8871,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -9163,11 +8925,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 |
| ; GFX9-NEXT: s_mov_b64 s[34:35], 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 |
| @@ -9217,9 +8978,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_mov_b64 s[34:35], 0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -9273,9 +9033,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -9329,9 +9088,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -9389,9 +9147,8 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX9-NEXT: s_movk_i32 s4, 0x4400 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -9424,10 +9181,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_movk_i32 s4, 0x4400 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4 |
| @@ -9461,10 +9217,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_movk_i32 s4, 0x4400 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 |
| @@ -9498,10 +9253,9 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 |
| @@ -9537,9 +9291,8 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX9-NEXT: s_mov_b32 s4, 4.0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -9572,10 +9325,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 4.0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4 |
| @@ -9609,10 +9361,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 4.0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 |
| @@ -9646,10 +9397,9 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 |
| @@ -9685,10 +9435,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 |
| ; GFX9-NEXT: s_mov_b32 s4, 1.0 |
| ; GFX9-NEXT: s_mov_b32 s5, 2.0 |
| @@ -9723,10 +9472,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 1.0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4 |
| @@ -9763,10 +9511,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 1.0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 |
| @@ -9803,10 +9550,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 |
| @@ -9845,11 +9591,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 |
| ; GFX9-NEXT: s_mov_b32 s4, 1.0 |
| ; GFX9-NEXT: s_mov_b32 s5, 2.0 |
| @@ -9886,10 +9631,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 1.0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4 |
| @@ -9929,10 +9673,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 1.0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 |
| @@ -9972,10 +9715,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 |
| @@ -10017,13 +9759,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 |
| ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 |
| ; GFX9-NEXT: s_mov_b32 s4, 1.0 |
| ; GFX9-NEXT: s_mov_b32 s5, 2.0 |
| @@ -10064,10 +9805,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 1.0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 |
| @@ -10113,10 +9853,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 1.0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 |
| @@ -10162,10 +9901,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 |
| @@ -10213,10 +9951,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 |
| ; GFX9-NEXT: s_mov_b32 s4, 0 |
| ; GFX9-NEXT: s_mov_b32 s5, 0x40100000 |
| @@ -10251,10 +9988,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4 |
| @@ -10291,10 +10027,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 |
| @@ -10331,10 +10066,9 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 |
| @@ -10373,12 +10107,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 |
| ; GFX9-NEXT: s_mov_b32 s4, 0 |
| ; GFX9-NEXT: s_mov_b32 s5, 2.0 |
| @@ -10417,10 +10150,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4 |
| @@ -10463,10 +10195,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 |
| @@ -10509,10 +10240,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 |
| @@ -10557,14 +10287,13 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 |
| ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 |
| ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 6 |
| ; GFX9-NEXT: s_mov_b32 s4, 0 |
| ; GFX9-NEXT: s_mov_b32 s5, 2.0 |
| @@ -10607,10 +10336,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4 |
| @@ -10659,10 +10387,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 |
| @@ -10711,10 +10438,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 |
| @@ -10765,10 +10491,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 |
| @@ -10800,10 +10525,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4 |
| @@ -10837,10 +10561,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 |
| @@ -10874,10 +10597,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 |
| @@ -10914,11 +10636,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 |
| @@ -10951,9 +10672,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 |
| @@ -10990,9 +10710,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 |
| @@ -11029,9 +10748,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| @@ -11071,11 +10789,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 |
| @@ -11108,9 +10825,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 |
| @@ -11147,9 +10863,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 |
| @@ -11186,9 +10901,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| @@ -11228,10 +10942,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x20001 |
| ; GFX9-NEXT: s_mov_b32 s5, 3 |
| @@ -11266,10 +10979,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 0x20001 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 |
| @@ -11306,10 +11018,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 0x20001 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 |
| @@ -11346,10 +11057,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 |
| @@ -11388,10 +11098,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 |
| ; GFX9-NEXT: s_movk_i32 s5, 0x4400 |
| @@ -11426,10 +11135,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 |
| @@ -11466,10 +11174,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 |
| @@ -11506,10 +11213,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 |
| @@ -11548,11 +11254,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 |
| @@ -11585,9 +11290,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 |
| @@ -11624,9 +11328,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 |
| @@ -11663,9 +11366,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| @@ -11705,10 +11407,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 |
| ; GFX9-NEXT: s_mov_b32 s4, 0x20001 |
| ; GFX9-NEXT: s_mov_b32 s5, 0x40003 |
| @@ -11743,10 +11444,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 0x20001 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 |
| @@ -11783,10 +11483,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 0x20001 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 |
| @@ -11823,10 +11522,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 |
| @@ -11865,10 +11563,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 |
| @@ -11900,10 +11597,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4 |
| @@ -11937,10 +11633,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 |
| @@ -11974,10 +11669,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 |
| @@ -12014,11 +11708,10 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 |
| @@ -12051,9 +11744,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 |
| @@ -12090,9 +11782,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 |
| @@ -12129,9 +11820,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 |
| @@ -12171,10 +11861,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 |
| ; GFX9-NEXT: s_mov_b32 s4, 1 |
| ; GFX9-NEXT: s_mov_b32 s5, 2 |
| @@ -12209,10 +11898,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 1 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 |
| @@ -12249,10 +11937,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 1 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 |
| @@ -12289,10 +11976,9 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 |
| @@ -12331,11 +12017,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 3 |
| ; GFX9-NEXT: s_mov_b32 s4, 3 |
| ; GFX9-NEXT: s_mov_b32 s5, 4 |
| @@ -12372,10 +12057,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 3 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4 |
| @@ -12415,10 +12099,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 3 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 |
| @@ -12458,10 +12141,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 |
| @@ -12503,12 +12185,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 |
| ; GFX9-NEXT: s_mov_b32 s4, 3 |
| ; GFX9-NEXT: s_mov_b32 s5, 4 |
| @@ -12547,10 +12228,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 3 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4 |
| @@ -12593,10 +12273,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 3 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 |
| @@ -12639,10 +12318,9 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 |
| @@ -12687,13 +12365,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 |
| ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 |
| @@ -12728,9 +12405,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 |
| @@ -12771,9 +12447,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 |
| @@ -12814,9 +12489,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 |
| @@ -12860,12 +12534,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 |
| ; GFX9-NEXT: s_mov_b32 s4, 1 |
| ; GFX9-NEXT: s_mov_b32 s5, 2 |
| @@ -12904,10 +12577,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 1 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 |
| @@ -12950,10 +12622,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 1 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 |
| @@ -12996,10 +12667,9 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 |
| @@ -13044,13 +12714,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 |
| ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 5 |
| ; GFX9-NEXT: s_mov_b32 s4, 1 |
| ; GFX9-NEXT: s_mov_b32 s5, 2 |
| @@ -13091,10 +12760,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 1 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4 |
| @@ -13140,10 +12808,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 1 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 |
| @@ -13189,10 +12856,9 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 |
| @@ -13240,10 +12906,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 |
| @@ -13291,9 +12956,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -13344,9 +13008,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -13397,9 +13060,8 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -13454,8 +13116,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| @@ -13464,6 +13124,7 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { |
| ; GFX9-NEXT: v_writelane_b32 v40, s9, 5 |
| ; GFX9-NEXT: v_writelane_b32 v40, s10, 6 |
| ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 8 |
| ; GFX9-NEXT: s_mov_b32 s4, 1 |
| ; GFX9-NEXT: s_mov_b32 s5, 2 |
| @@ -13510,10 +13171,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_mov_b32 s4, 1 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 |
| @@ -13568,10 +13228,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_mov_b32 s4, 1 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 |
| @@ -13626,10 +13285,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 |
| @@ -13686,8 +13344,6 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| @@ -13698,6 +13354,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { |
| ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 |
| ; GFX9-NEXT: v_writelane_b32 v40, s12, 8 |
| ; GFX9-NEXT: v_writelane_b32 v40, s13, 9 |
| +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s14, 10 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 |
| @@ -13753,9 +13410,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -13822,9 +13478,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -13891,9 +13546,8 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -13964,8 +13618,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| @@ -13981,6 +13633,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { |
| ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 |
| ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 |
| +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 |
| @@ -14075,9 +13728,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -14189,9 +13841,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -14297,9 +13948,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -14411,8 +14061,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 |
| @@ -14427,6 +14075,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { |
| ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 |
| ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 |
| ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 |
| +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 |
| @@ -14527,9 +14176,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -14646,9 +14294,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -14757,9 +14404,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { |
| ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 |
| @@ -14879,7 +14525,6 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 |
| ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x800 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -14918,9 +14563,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d |
| ; GFX10-NEXT: s_clause 0x1 |
| ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 |
| ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 |
| @@ -14957,9 +14601,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 |
| @@ -14993,9 +14636,8 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d |
| ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] |
| ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 |
| @@ -15038,7 +14680,6 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 13 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 14 |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 15 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| @@ -15109,12 +14750,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 14 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_mov_b32_e32 v3, 15 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 |
| ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 |
| ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 |
| ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 |
| -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0 |
| @@ -15181,9 +14821,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { |
| ; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1 |
| ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 |
| ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1 |
| @@ -15236,9 +14875,8 @@ define amdgpu_gfx void @stack_12xv3i32() #0 { |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 15 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 1 |
| ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| @@ -15331,7 +14969,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 13 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 14 |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 15 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| @@ -15408,7 +15045,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 11 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 12 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 13 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 15 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 |
| @@ -15484,11 +15120,10 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { |
| ; GFX11-NEXT: v_dual_mov_b32 v4, 8 :: v_dual_mov_b32 v5, 9 |
| ; GFX11-NEXT: v_dual_mov_b32 v6, 10 :: v_dual_mov_b32 v7, 11 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 |
| ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s32 |
| -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 |
| ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1 |
| @@ -15545,10 +15180,9 @@ define amdgpu_gfx void @stack_8xv5i32() #0 { |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 10 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 11 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 |
| ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 |
| @@ -15637,7 +15271,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 |
| ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| @@ -15714,7 +15347,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { |
| ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41300000 |
| ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41400000 |
| ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| ; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 |
| @@ -15794,12 +15426,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { |
| ; GFX11-NEXT: v_mov_b32_e32 v6, 0x41200000 |
| ; GFX11-NEXT: v_mov_b32_e32 v7, 0x41300000 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX11-NEXT: s_clause 0x1 |
| ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16 |
| ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s32 |
| ; GFX11-NEXT: v_mov_b32_e32 v6, 1.0 |
| -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 |
| ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 |
| ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1.0 |
| @@ -15857,10 +15488,9 @@ define amdgpu_gfx void @stack_8xv5f32() #0 { |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41200000 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41300000 |
| ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 |
| -; GFX10-SCRATCH-NEXT: ; implicit-def: $vgpr40 |
| +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 |
| ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s32 |
| -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 |
| diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll |
| index 4eaca1701ea9..3d95bd3b6c0c 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll |
| @@ -15,10 +15,9 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 |
| @@ -54,9 +53,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| @@ -95,9 +93,8 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[4:5] |
| @@ -137,7 +134,6 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 { |
| ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[34:35] |
| -; GFX9-NEXT: ; implicit-def: $vgpr0 |
| ; GFX9-NEXT: v_writelane_b32 v0, s28, 0 |
| ; GFX9-NEXT: v_writelane_b32 v0, s29, 1 |
| ; GFX9-NEXT: v_writelane_b32 v0, s30, 2 |
| @@ -166,7 +162,6 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 { |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s34 |
| -; GFX10-NEXT: ; implicit-def: $vgpr0 |
| ; GFX10-NEXT: v_writelane_b32 v0, s28, 0 |
| ; GFX10-NEXT: v_writelane_b32 v0, s29, 1 |
| ; GFX10-NEXT: v_writelane_b32 v0, s30, 2 |
| @@ -193,10 +188,9 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 { |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 |
| +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 |
| ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill |
| -; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr0 |
| +; GFX11-NEXT: s_mov_b32 exec_lo, s0 |
| ; GFX11-NEXT: v_writelane_b32 v0, s28, 0 |
| ; GFX11-NEXT: v_writelane_b32 v0, s29, 1 |
| ; GFX11-NEXT: v_writelane_b32 v0, s30, 2 |
| @@ -212,9 +206,9 @@ define amdgpu_gfx void @void_func_void_clobber_s28_s29() #1 { |
| ; GFX11-NEXT: v_readlane_b32 s30, v0, 2 |
| ; GFX11-NEXT: v_readlane_b32 s29, v0, 1 |
| ; GFX11-NEXT: v_readlane_b32 s28, v0, 0 |
| -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 |
| +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 |
| ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload |
| -; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| +; GFX11-NEXT: s_mov_b32 exec_lo, s0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| @@ -238,9 +232,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 |
| @@ -280,9 +273,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 |
| @@ -324,9 +316,8 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 |
| @@ -368,33 +359,32 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) |
| ; GFX9-NEXT: s_mov_b32 s34, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 |
| -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr41 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| -; GFX9-NEXT: v_writelane_b32 v41, s30, 0 |
| +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v42, s34, 0 |
| -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GFX9-NEXT: v_writelane_b32 v41, s31, 1 |
| +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: ;;#ASMSTART |
| ; GFX9-NEXT: ; def v31 |
| ; GFX9-NEXT: ;;#ASMEND |
| -; GFX9-NEXT: v_mov_b32_e32 v40, v31 |
| +; GFX9-NEXT: v_mov_b32_e32 v41, v31 |
| ; GFX9-NEXT: s_getpc_b64 s[34:35] |
| ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| -; GFX9-NEXT: v_mov_b32_e32 v31, v40 |
| +; GFX9-NEXT: v_mov_b32_e32 v31, v41 |
| ; GFX9-NEXT: ;;#ASMSTART |
| ; GFX9-NEXT: ; use v31 |
| ; GFX9-NEXT: ;;#ASMEND |
| -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GFX9-NEXT: v_readlane_b32 s31, v41, 1 |
| -; GFX9-NEXT: v_readlane_b32 s30, v41, 0 |
| +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s34, v42, 0 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 |
| -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| @@ -409,35 +399,34 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) |
| ; GFX10-NEXT: s_mov_b32 s34, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 |
| -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr41 |
| +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 |
| ; GFX10-NEXT: v_writelane_b32 v42, s34, 0 |
| -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; def v31 |
| ; GFX10-NEXT: ;;#ASMEND |
| -; GFX10-NEXT: v_mov_b32_e32 v40, v31 |
| -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX10-NEXT: v_mov_b32_e32 v41, v31 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| -; GFX10-NEXT: v_mov_b32_e32 v31, v40 |
| +; GFX10-NEXT: v_mov_b32_e32 v31, v41 |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; use v31 |
| ; GFX10-NEXT: ;;#ASMEND |
| -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GFX10-NEXT: v_readlane_b32 s31, v41, 1 |
| -; GFX10-NEXT: v_readlane_b32 s30, v41, 0 |
| +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s34, v42, 0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 |
| ; GFX10-NEXT: s_clause 0x1 |
| -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 |
| +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 |
| ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| @@ -454,35 +443,34 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) |
| ; GFX11-NEXT: s_mov_b32 s33, s32 |
| ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 |
| ; GFX11-NEXT: s_clause 0x1 |
| -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 |
| ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr41 |
| +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| -; GFX11-NEXT: v_writelane_b32 v41, s30, 0 |
| ; GFX11-NEXT: v_writelane_b32 v42, s0, 0 |
| -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill |
| +; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ; def v31 |
| ; GFX11-NEXT: ;;#ASMEND |
| -; GFX11-NEXT: v_mov_b32_e32 v40, v31 |
| -; GFX11-NEXT: v_writelane_b32 v41, s31, 1 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: v_mov_b32_e32 v41, v31 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 |
| ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: v_mov_b32_e32 v31, v40 |
| +; GFX11-NEXT: v_mov_b32_e32 v31, v41 |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ; use v31 |
| ; GFX11-NEXT: ;;#ASMEND |
| -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload |
| -; GFX11-NEXT: v_readlane_b32 s31, v41, 1 |
| -; GFX11-NEXT: v_readlane_b32 s30, v41, 0 |
| +; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload |
| +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v42, 0 |
| ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 |
| ; GFX11-NEXT: s_clause 0x1 |
| -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 |
| +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 |
| ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: s_add_i32 s32, s32, -16 |
| @@ -506,9 +494,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 |
| @@ -548,9 +535,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 |
| @@ -558,8 +544,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; def s33 |
| ; GFX10-NEXT: ;;#ASMEND |
| -; GFX10-NEXT: s_mov_b32 s4, s33 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 |
| +; GFX10-NEXT: s_mov_b32 s4, s33 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: s_mov_b32 s33, s4 |
| @@ -592,9 +578,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 |
| @@ -602,8 +587,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1) |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ; def s33 |
| ; GFX11-NEXT: ;;#ASMEND |
| -; GFX11-NEXT: s_mov_b32 s4, s33 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 |
| +; GFX11-NEXT: s_mov_b32 s4, s33 |
| ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: s_mov_b32 s33, s4 |
| @@ -640,9 +625,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX9-NEXT: ;;#ASMSTART |
| @@ -682,9 +666,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[36:37] |
| ; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_void@rel32@lo+4 |
| @@ -692,8 +675,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; def s34 |
| ; GFX10-NEXT: ;;#ASMEND |
| -; GFX10-NEXT: s_mov_b32 s4, s34 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 |
| +; GFX10-NEXT: s_mov_b32 s4, s34 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] |
| ; GFX10-NEXT: s_mov_b32 s34, s4 |
| @@ -726,9 +709,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 |
| @@ -736,8 +718,8 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1) |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ; def s34 |
| ; GFX11-NEXT: ;;#ASMEND |
| -; GFX11-NEXT: s_mov_b32 s4, s34 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 |
| +; GFX11-NEXT: s_mov_b32 s4, s34 |
| ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: s_mov_b32 s34, s4 |
| @@ -774,7 +756,6 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr41 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v41, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v42, s34, 0 |
| @@ -814,18 +795,17 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) |
| ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr41 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v42, s34, 0 |
| ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; def v40 |
| ; GFX10-NEXT: ;;#ASMEND |
| +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 |
| -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; use v40 |
| @@ -856,18 +836,18 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1) |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr41 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v42, s0, 0 |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ; def v40 |
| ; GFX11-NEXT: ;;#ASMEND |
| +; GFX11-NEXT: v_writelane_b32 v41, s31, 1 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v41, s31, 1 |
| +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ; use v40 |
| @@ -895,18 +875,17 @@ define hidden void @void_func_void_clobber_s33() #1 { |
| ; GFX9-LABEL: void_func_void_clobber_s33: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 |
| +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| -; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| -; GFX9-NEXT: ; implicit-def: $vgpr0 |
| +; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v0, s33, 0 |
| ; GFX9-NEXT: ;;#ASMSTART |
| ; GFX9-NEXT: ; clobber |
| ; GFX9-NEXT: ;;#ASMEND |
| ; GFX9-NEXT: v_readlane_b32 s33, v0, 0 |
| -; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 |
| +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload |
| -; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| +; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| @@ -914,20 +893,19 @@ define hidden void @void_func_void_clobber_s33() #1 { |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 |
| +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| -; GFX10-NEXT: s_mov_b32 exec_lo, s5 |
| -; GFX10-NEXT: ; implicit-def: $vgpr0 |
| +; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v0, s33, 0 |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; clobber |
| ; GFX10-NEXT: ;;#ASMEND |
| ; GFX10-NEXT: v_readlane_b32 s33, v0, 0 |
| -; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 |
| +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| -; GFX10-NEXT: s_mov_b32 exec_lo, s5 |
| +; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| @@ -936,19 +914,18 @@ define hidden void @void_func_void_clobber_s33() #1 { |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 |
| +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 |
| ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill |
| -; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr0 |
| +; GFX11-NEXT: s_mov_b32 exec_lo, s0 |
| ; GFX11-NEXT: v_writelane_b32 v0, s33, 0 |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ; clobber |
| ; GFX11-NEXT: ;;#ASMEND |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s33, v0, 0 |
| -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 |
| +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 |
| ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload |
| -; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| +; GFX11-NEXT: s_mov_b32 exec_lo, s0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| @@ -960,18 +937,17 @@ define hidden void @void_func_void_clobber_s34() #1 { |
| ; GFX9-LABEL: void_func_void_clobber_s34: |
| ; GFX9: ; %bb.0: |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 |
| +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| -; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| -; GFX9-NEXT: ; implicit-def: $vgpr0 |
| +; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: v_writelane_b32 v0, s34, 0 |
| ; GFX9-NEXT: ;;#ASMSTART |
| ; GFX9-NEXT: ; clobber |
| ; GFX9-NEXT: ;;#ASMEND |
| ; GFX9-NEXT: v_readlane_b32 s34, v0, 0 |
| -; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 |
| +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload |
| -; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| +; GFX9-NEXT: s_mov_b64 exec, s[4:5] |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: s_setpc_b64 s[30:31] |
| ; |
| @@ -979,20 +955,19 @@ define hidden void @void_func_void_clobber_s34() #1 { |
| ; GFX10: ; %bb.0: |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 |
| +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| -; GFX10-NEXT: s_mov_b32 exec_lo, s5 |
| -; GFX10-NEXT: ; implicit-def: $vgpr0 |
| +; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: v_writelane_b32 v0, s34, 0 |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; clobber |
| ; GFX10-NEXT: ;;#ASMEND |
| ; GFX10-NEXT: v_readlane_b32 s34, v0, 0 |
| -; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 |
| +; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 |
| ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| -; GFX10-NEXT: s_mov_b32 exec_lo, s5 |
| +; GFX10-NEXT: s_mov_b32 exec_lo, s4 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX10-NEXT: s_setpc_b64 s[30:31] |
| @@ -1001,19 +976,18 @@ define hidden void @void_func_void_clobber_s34() #1 { |
| ; GFX11: ; %bb.0: |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 |
| +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 |
| ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill |
| -; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr0 |
| +; GFX11-NEXT: s_mov_b32 exec_lo, s0 |
| ; GFX11-NEXT: v_writelane_b32 v0, s34, 0 |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ; clobber |
| ; GFX11-NEXT: ;;#ASMEND |
| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) |
| ; GFX11-NEXT: v_readlane_b32 s34, v0, 0 |
| -; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 |
| +; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 |
| ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload |
| -; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| +; GFX11-NEXT: s_mov_b32 exec_lo, s0 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| @@ -1031,7 +1005,6 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -1063,9 +1036,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4 |
| @@ -1097,9 +1069,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s33@rel32@lo+4 |
| @@ -1133,7 +1104,6 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| @@ -1165,9 +1135,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4 |
| @@ -1199,9 +1168,8 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s34@rel32@lo+4 |
| @@ -1235,9 +1203,8 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { |
| ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr40 |
| -; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX9-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 |
| @@ -1276,9 +1243,8 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { |
| ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr40 |
| -; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| ; GFX10-NEXT: v_writelane_b32 v41, s34, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 |
| @@ -1286,8 +1252,8 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; def s40 |
| ; GFX10-NEXT: ;;#ASMEND |
| -; GFX10-NEXT: s_mov_b32 s4, s40 |
| ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 |
| +; GFX10-NEXT: s_mov_b32 s4, s40 |
| ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: ;;#ASMSTART |
| @@ -1319,9 +1285,8 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { |
| ; GFX11-NEXT: scratch_store_b32 off, v40, s33 |
| ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr40 |
| -; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| +; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| ; GFX11-NEXT: v_writelane_b32 v41, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 |
| @@ -1329,8 +1294,8 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 { |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ; def s40 |
| ; GFX11-NEXT: ;;#ASMEND |
| -; GFX11-NEXT: s_mov_b32 s4, s40 |
| ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 |
| +; GFX11-NEXT: s_mov_b32 s4, s40 |
| ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: ;;#ASMSTART |
| @@ -1363,16 +1328,15 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { |
| ; GFX9-NEXT: s_mov_b32 s34, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 |
| -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-NEXT: ; implicit-def: $vgpr41 |
| +; GFX9-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x400 |
| -; GFX9-NEXT: v_writelane_b32 v41, s4, 0 |
| -; GFX9-NEXT: v_writelane_b32 v41, s30, 1 |
| +; GFX9-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX9-NEXT: v_writelane_b32 v42, s34, 0 |
| -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GFX9-NEXT: v_writelane_b32 v41, s31, 2 |
| +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX9-NEXT: v_writelane_b32 v40, s31, 2 |
| ; GFX9-NEXT: ;;#ASMSTART |
| ; GFX9-NEXT: ; def s40 |
| ; GFX9-NEXT: ;;#ASMEND |
| @@ -1380,7 +1344,7 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { |
| ; GFX9-NEXT: ;;#ASMSTART |
| ; GFX9-NEXT: ; def v32 |
| ; GFX9-NEXT: ;;#ASMEND |
| -; GFX9-NEXT: v_mov_b32_e32 v40, v32 |
| +; GFX9-NEXT: v_mov_b32_e32 v41, v32 |
| ; GFX9-NEXT: s_getpc_b64 s[34:35] |
| ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 |
| @@ -1389,15 +1353,15 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { |
| ; GFX9-NEXT: ; use s4 |
| ; GFX9-NEXT: ;;#ASMEND |
| ; GFX9-NEXT: ;;#ASMSTART |
| -; GFX9-NEXT: ; use v40 |
| +; GFX9-NEXT: ; use v41 |
| ; GFX9-NEXT: ;;#ASMEND |
| -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GFX9-NEXT: v_readlane_b32 s31, v41, 2 |
| -; GFX9-NEXT: v_readlane_b32 s30, v41, 1 |
| -; GFX9-NEXT: v_readlane_b32 s4, v41, 0 |
| +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; GFX9-NEXT: v_readlane_b32 s31, v40, 2 |
| +; GFX9-NEXT: v_readlane_b32 s30, v40, 1 |
| +; GFX9-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s34, v42, 0 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 |
| -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 |
| @@ -1412,43 +1376,42 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { |
| ; GFX10-NEXT: s_mov_b32 s34, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 |
| -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| -; GFX10-NEXT: ; implicit-def: $vgpr41 |
| +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX10-NEXT: s_addk_i32 s32, 0x200 |
| -; GFX10-NEXT: v_writelane_b32 v41, s4, 0 |
| ; GFX10-NEXT: v_writelane_b32 v42, s34, 0 |
| -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; def s40 |
| ; GFX10-NEXT: ;;#ASMEND |
| +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX10-NEXT: s_mov_b32 s4, s40 |
| -; GFX10-NEXT: v_writelane_b32 v41, s30, 1 |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; def v32 |
| ; GFX10-NEXT: ;;#ASMEND |
| -; GFX10-NEXT: v_mov_b32_e32 v40, v32 |
| +; GFX10-NEXT: v_mov_b32_e32 v41, v32 |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 |
| -; GFX10-NEXT: v_writelane_b32 v41, s31, 2 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 2 |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; use s4 |
| ; GFX10-NEXT: ;;#ASMEND |
| ; GFX10-NEXT: ;;#ASMSTART |
| -; GFX10-NEXT: ; use v40 |
| +; GFX10-NEXT: ; use v41 |
| ; GFX10-NEXT: ;;#ASMEND |
| -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GFX10-NEXT: v_readlane_b32 s31, v41, 2 |
| -; GFX10-NEXT: v_readlane_b32 s30, v41, 1 |
| -; GFX10-NEXT: v_readlane_b32 s4, v41, 0 |
| +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; GFX10-NEXT: v_readlane_b32 s31, v40, 2 |
| +; GFX10-NEXT: v_readlane_b32 s30, v40, 1 |
| +; GFX10-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s34, v42, 0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 |
| ; GFX10-NEXT: s_clause 0x1 |
| -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 |
| +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 |
| ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s35 |
| @@ -1465,42 +1428,41 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { |
| ; GFX11-NEXT: s_mov_b32 s33, s32 |
| ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 |
| ; GFX11-NEXT: s_clause 0x1 |
| -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 |
| +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 |
| ; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| -; GFX11-NEXT: ; implicit-def: $vgpr41 |
| +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, 16 |
| -; GFX11-NEXT: v_writelane_b32 v41, s4, 0 |
| ; GFX11-NEXT: v_writelane_b32 v42, s0, 0 |
| -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill |
| +; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ; def s40 |
| ; GFX11-NEXT: ;;#ASMEND |
| +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 |
| ; GFX11-NEXT: s_mov_b32 s4, s40 |
| -; GFX11-NEXT: v_writelane_b32 v41, s30, 1 |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ; def v32 |
| ; GFX11-NEXT: ;;#ASMEND |
| -; GFX11-NEXT: v_mov_b32_e32 v40, v32 |
| +; GFX11-NEXT: v_mov_b32_e32 v41, v32 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 |
| -; GFX11-NEXT: v_writelane_b32 v41, s31, 2 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 2 |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ; use s4 |
| ; GFX11-NEXT: ;;#ASMEND |
| ; GFX11-NEXT: ;;#ASMSTART |
| -; GFX11-NEXT: ; use v40 |
| +; GFX11-NEXT: ; use v41 |
| ; GFX11-NEXT: ;;#ASMEND |
| -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload |
| -; GFX11-NEXT: v_readlane_b32 s31, v41, 2 |
| -; GFX11-NEXT: v_readlane_b32 s30, v41, 1 |
| -; GFX11-NEXT: v_readlane_b32 s4, v41, 0 |
| +; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload |
| +; GFX11-NEXT: v_readlane_b32 s31, v40, 2 |
| +; GFX11-NEXT: v_readlane_b32 s30, v40, 1 |
| +; GFX11-NEXT: v_readlane_b32 s4, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v42, 0 |
| ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 |
| ; GFX11-NEXT: s_clause 0x1 |
| -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 |
| +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 |
| ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: s_add_i32 s32, s32, -16 |
| diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll |
| index b55d65b0a7a3..a67a44971b64 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll |
| @@ -34,7 +34,6 @@ define amdgpu_gfx void @call_i1() #0 { |
| ; GFX9-NEXT: s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr1 |
| ; GFX9-NEXT: v_writelane_b32 v1, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v1, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -63,7 +62,6 @@ define amdgpu_gfx void @call_i1() #0 { |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, return_i1@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, return_i1@gotpcrel32@hi+12 |
| -; GFX10-NEXT: ; implicit-def: $vgpr1 |
| ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 |
| @@ -84,7 +82,7 @@ define amdgpu_gfx void @call_i1() #0 { |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX11-NEXT: s_mov_b32 s3, s33 |
| +; GFX11-NEXT: s_mov_b32 s2, s33 |
| ; GFX11-NEXT: s_mov_b32 s33, s32 |
| ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 |
| ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill |
| @@ -93,7 +91,6 @@ define amdgpu_gfx void @call_i1() #0 { |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, return_i1@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, return_i1@gotpcrel32@hi+12 |
| -; GFX11-NEXT: ; implicit-def: $vgpr1 |
| ; GFX11-NEXT: v_writelane_b32 v1, s30, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: v_writelane_b32 v1, s31, 1 |
| @@ -106,7 +103,7 @@ define amdgpu_gfx void @call_i1() #0 { |
| ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, -16 |
| -; GFX11-NEXT: s_mov_b32 s33, s3 |
| +; GFX11-NEXT: s_mov_b32 s33, s2 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| @@ -145,7 +142,6 @@ define amdgpu_gfx void @call_i16() #0 { |
| ; GFX9-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr1 |
| ; GFX9-NEXT: v_writelane_b32 v1, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v1, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -174,7 +170,6 @@ define amdgpu_gfx void @call_i16() #0 { |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, return_i16@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, return_i16@gotpcrel32@hi+12 |
| -; GFX10-NEXT: ; implicit-def: $vgpr1 |
| ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 |
| @@ -195,7 +190,7 @@ define amdgpu_gfx void @call_i16() #0 { |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX11-NEXT: s_mov_b32 s3, s33 |
| +; GFX11-NEXT: s_mov_b32 s2, s33 |
| ; GFX11-NEXT: s_mov_b32 s33, s32 |
| ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 |
| ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill |
| @@ -204,7 +199,6 @@ define amdgpu_gfx void @call_i16() #0 { |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, return_i16@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, return_i16@gotpcrel32@hi+12 |
| -; GFX11-NEXT: ; implicit-def: $vgpr1 |
| ; GFX11-NEXT: v_writelane_b32 v1, s30, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: v_writelane_b32 v1, s31, 1 |
| @@ -217,7 +211,7 @@ define amdgpu_gfx void @call_i16() #0 { |
| ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, -16 |
| -; GFX11-NEXT: s_mov_b32 s33, s3 |
| +; GFX11-NEXT: s_mov_b32 s33, s2 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| @@ -256,7 +250,6 @@ define amdgpu_gfx void @call_2xi16() #0 { |
| ; GFX9-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr1 |
| ; GFX9-NEXT: v_writelane_b32 v1, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v1, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -285,7 +278,6 @@ define amdgpu_gfx void @call_2xi16() #0 { |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, return_2xi16@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, return_2xi16@gotpcrel32@hi+12 |
| -; GFX10-NEXT: ; implicit-def: $vgpr1 |
| ; GFX10-NEXT: v_writelane_b32 v1, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v1, s31, 1 |
| @@ -306,7 +298,7 @@ define amdgpu_gfx void @call_2xi16() #0 { |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX11-NEXT: s_mov_b32 s3, s33 |
| +; GFX11-NEXT: s_mov_b32 s2, s33 |
| ; GFX11-NEXT: s_mov_b32 s33, s32 |
| ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 |
| ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill |
| @@ -315,7 +307,6 @@ define amdgpu_gfx void @call_2xi16() #0 { |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, return_2xi16@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, return_2xi16@gotpcrel32@hi+12 |
| -; GFX11-NEXT: ; implicit-def: $vgpr1 |
| ; GFX11-NEXT: v_writelane_b32 v1, s30, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: v_writelane_b32 v1, s31, 1 |
| @@ -328,7 +319,7 @@ define amdgpu_gfx void @call_2xi16() #0 { |
| ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, -16 |
| -; GFX11-NEXT: s_mov_b32 s33, s3 |
| +; GFX11-NEXT: s_mov_b32 s33, s2 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| @@ -376,7 +367,6 @@ define amdgpu_gfx void @call_3xi16() #0 { |
| ; GFX9-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr2 |
| ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| @@ -405,7 +395,6 @@ define amdgpu_gfx void @call_3xi16() #0 { |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, return_3xi16@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, return_3xi16@gotpcrel32@hi+12 |
| -; GFX10-NEXT: ; implicit-def: $vgpr2 |
| ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 |
| @@ -426,7 +415,7 @@ define amdgpu_gfx void @call_3xi16() #0 { |
| ; GFX11: ; %bb.0: ; %entry |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 |
| -; GFX11-NEXT: s_mov_b32 s3, s33 |
| +; GFX11-NEXT: s_mov_b32 s2, s33 |
| ; GFX11-NEXT: s_mov_b32 s33, s32 |
| ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 |
| ; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill |
| @@ -435,7 +424,6 @@ define amdgpu_gfx void @call_3xi16() #0 { |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, return_3xi16@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, return_3xi16@gotpcrel32@hi+12 |
| -; GFX11-NEXT: ; implicit-def: $vgpr2 |
| ; GFX11-NEXT: v_writelane_b32 v2, s30, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: v_writelane_b32 v2, s31, 1 |
| @@ -448,7 +436,7 @@ define amdgpu_gfx void @call_3xi16() #0 { |
| ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s0 |
| ; GFX11-NEXT: s_add_i32 s32, s32, -16 |
| -; GFX11-NEXT: s_mov_b32 s33, s3 |
| +; GFX11-NEXT: s_mov_b32 s33, s2 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| @@ -1661,9 +1649,8 @@ define amdgpu_gfx void @call_512xi32() #0 { |
| ; GFX9-NEXT: s_add_u32 s34, s34, return_512xi32@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s35, s35, return_512xi32@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr2 |
| -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 |
| ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 |
| +; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 |
| ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| @@ -1692,10 +1679,9 @@ define amdgpu_gfx void @call_512xi32() #0 { |
| ; GFX10-NEXT: s_getpc_b64 s[34:35] |
| ; GFX10-NEXT: s_add_u32 s34, s34, return_512xi32@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s35, s35, return_512xi32@gotpcrel32@hi+12 |
| -; GFX10-NEXT: ; implicit-def: $vgpr2 |
| -; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 |
| -; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 |
| +; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 |
| +; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 |
| ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| @@ -1725,10 +1711,9 @@ define amdgpu_gfx void @call_512xi32() #0 { |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, return_512xi32@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, return_512xi32@gotpcrel32@hi+12 |
| -; GFX11-NEXT: ; implicit-def: $vgpr5 |
| -; GFX11-NEXT: v_mov_b32_e32 v0, s33 |
| -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| ; GFX11-NEXT: v_writelane_b32 v5, s30, 0 |
| +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| +; GFX11-NEXT: v_mov_b32_e32 v0, s33 |
| ; GFX11-NEXT: v_writelane_b32 v5, s31, 1 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll |
| index cc8d85c85b0b..5040e5348aa1 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll |
| @@ -401,7 +401,6 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { |
| ; GCN-NEXT: s_mov_b64 exec, s[18:19] |
| ; GCN-NEXT: v_writelane_b32 v41, s16, 0 |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr40 |
| ; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GCN-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -488,7 +487,6 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { |
| ; GISEL-NEXT: s_mov_b64 exec, s[18:19] |
| ; GISEL-NEXT: v_writelane_b32 v41, s16, 0 |
| ; GISEL-NEXT: s_addk_i32 s32, 0x400 |
| -; GISEL-NEXT: ; implicit-def: $vgpr40 |
| ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -579,7 +577,6 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { |
| ; GCN-NEXT: s_mov_b64 exec, s[18:19] |
| ; GCN-NEXT: v_writelane_b32 v41, s16, 0 |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr40 |
| ; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GCN-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -669,7 +666,6 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { |
| ; GISEL-NEXT: s_mov_b64 exec, s[18:19] |
| ; GISEL-NEXT: v_writelane_b32 v41, s16, 0 |
| ; GISEL-NEXT: s_addk_i32 s32, 0x400 |
| -; GISEL-NEXT: ; implicit-def: $vgpr40 |
| ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -761,7 +757,6 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { |
| ; GCN-NEXT: s_mov_b64 exec, s[18:19] |
| ; GCN-NEXT: v_writelane_b32 v41, s16, 0 |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr40 |
| ; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GCN-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -850,7 +845,6 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { |
| ; GISEL-NEXT: s_mov_b64 exec, s[18:19] |
| ; GISEL-NEXT: v_writelane_b32 v41, s16, 0 |
| ; GISEL-NEXT: s_addk_i32 s32, 0x400 |
| -; GISEL-NEXT: ; implicit-def: $vgpr40 |
| ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -944,7 +938,6 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { |
| ; GCN-NEXT: s_mov_b64 exec, s[18:19] |
| ; GCN-NEXT: v_writelane_b32 v41, s16, 0 |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr40 |
| ; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GCN-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -1042,7 +1035,6 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { |
| ; GISEL-NEXT: s_mov_b64 exec, s[18:19] |
| ; GISEL-NEXT: v_writelane_b32 v41, s16, 0 |
| ; GISEL-NEXT: s_addk_i32 s32, 0x400 |
| -; GISEL-NEXT: ; implicit-def: $vgpr40 |
| ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -1149,7 +1141,6 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[6:7] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr40 |
| ; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GCN-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -1244,7 +1235,6 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { |
| ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GISEL-NEXT: s_mov_b64 exec, s[6:7] |
| ; GISEL-NEXT: s_addk_i32 s32, 0x400 |
| -; GISEL-NEXT: ; implicit-def: $vgpr40 |
| ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -1337,198 +1327,196 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { |
| ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s12, s33 |
| +; GCN-NEXT: s_mov_b32 s10, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GCN-NEXT: ; implicit-def: $vgpr41 |
| -; GCN-NEXT: v_writelane_b32 v41, s30, 0 |
| -; GCN-NEXT: v_writelane_b32 v41, s31, 1 |
| -; GCN-NEXT: v_writelane_b32 v41, s34, 2 |
| -; GCN-NEXT: v_writelane_b32 v41, s35, 3 |
| -; GCN-NEXT: v_writelane_b32 v41, s36, 4 |
| -; GCN-NEXT: v_writelane_b32 v41, s37, 5 |
| -; GCN-NEXT: v_writelane_b32 v41, s38, 6 |
| -; GCN-NEXT: v_writelane_b32 v41, s39, 7 |
| -; GCN-NEXT: v_writelane_b32 v41, s40, 8 |
| -; GCN-NEXT: v_writelane_b32 v41, s41, 9 |
| -; GCN-NEXT: v_writelane_b32 v41, s42, 10 |
| -; GCN-NEXT: v_writelane_b32 v41, s43, 11 |
| -; GCN-NEXT: v_writelane_b32 v41, s44, 12 |
| -; GCN-NEXT: v_writelane_b32 v41, s45, 13 |
| -; GCN-NEXT: v_writelane_b32 v41, s46, 14 |
| -; GCN-NEXT: v_writelane_b32 v41, s47, 15 |
| -; GCN-NEXT: v_writelane_b32 v41, s48, 16 |
| -; GCN-NEXT: v_writelane_b32 v41, s49, 17 |
| -; GCN-NEXT: v_writelane_b32 v41, s50, 18 |
| -; GCN-NEXT: v_writelane_b32 v41, s51, 19 |
| -; GCN-NEXT: v_writelane_b32 v41, s52, 20 |
| -; GCN-NEXT: v_writelane_b32 v41, s53, 21 |
| -; GCN-NEXT: v_writelane_b32 v41, s54, 22 |
| -; GCN-NEXT: v_writelane_b32 v41, s55, 23 |
| -; GCN-NEXT: v_writelane_b32 v41, s56, 24 |
| -; GCN-NEXT: v_writelane_b32 v41, s57, 25 |
| -; GCN-NEXT: v_writelane_b32 v41, s58, 26 |
| -; GCN-NEXT: v_writelane_b32 v41, s59, 27 |
| -; GCN-NEXT: v_writelane_b32 v41, s60, 28 |
| -; GCN-NEXT: v_writelane_b32 v41, s61, 29 |
| -; GCN-NEXT: v_writelane_b32 v41, s62, 30 |
| -; GCN-NEXT: v_writelane_b32 v41, s63, 31 |
| -; GCN-NEXT: v_mov_b32_e32 v40, v0 |
| +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GCN-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GCN-NEXT: v_writelane_b32 v40, s34, 2 |
| +; GCN-NEXT: v_writelane_b32 v40, s35, 3 |
| +; GCN-NEXT: v_writelane_b32 v40, s36, 4 |
| +; GCN-NEXT: v_writelane_b32 v40, s37, 5 |
| +; GCN-NEXT: v_writelane_b32 v40, s38, 6 |
| +; GCN-NEXT: v_writelane_b32 v40, s39, 7 |
| +; GCN-NEXT: v_writelane_b32 v40, s40, 8 |
| +; GCN-NEXT: v_writelane_b32 v40, s41, 9 |
| +; GCN-NEXT: v_writelane_b32 v40, s42, 10 |
| +; GCN-NEXT: v_writelane_b32 v40, s43, 11 |
| +; GCN-NEXT: v_writelane_b32 v40, s44, 12 |
| +; GCN-NEXT: v_writelane_b32 v40, s45, 13 |
| +; GCN-NEXT: v_writelane_b32 v40, s46, 14 |
| +; GCN-NEXT: v_writelane_b32 v40, s47, 15 |
| +; GCN-NEXT: v_writelane_b32 v40, s48, 16 |
| +; GCN-NEXT: v_writelane_b32 v40, s49, 17 |
| +; GCN-NEXT: v_writelane_b32 v40, s50, 18 |
| +; GCN-NEXT: v_writelane_b32 v40, s51, 19 |
| +; GCN-NEXT: v_writelane_b32 v40, s52, 20 |
| +; GCN-NEXT: v_writelane_b32 v40, s53, 21 |
| +; GCN-NEXT: v_writelane_b32 v40, s54, 22 |
| +; GCN-NEXT: v_writelane_b32 v40, s55, 23 |
| +; GCN-NEXT: v_writelane_b32 v40, s56, 24 |
| +; GCN-NEXT: v_writelane_b32 v40, s57, 25 |
| +; GCN-NEXT: v_writelane_b32 v40, s58, 26 |
| +; GCN-NEXT: v_writelane_b32 v40, s59, 27 |
| +; GCN-NEXT: v_writelane_b32 v40, s60, 28 |
| +; GCN-NEXT: v_writelane_b32 v40, s61, 29 |
| +; GCN-NEXT: v_writelane_b32 v40, s62, 30 |
| +; GCN-NEXT: v_writelane_b32 v40, s63, 31 |
| +; GCN-NEXT: v_mov_b32_e32 v41, v0 |
| ; GCN-NEXT: s_mov_b64 s[4:5], exec |
| ; GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 |
| ; GCN-NEXT: v_readfirstlane_b32 s6, v1 |
| ; GCN-NEXT: v_readfirstlane_b32 s7, v2 |
| ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] |
| ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc |
| -; GCN-NEXT: v_mov_b32_e32 v0, v40 |
| +; GCN-NEXT: v_mov_b32_e32 v0, v41 |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 |
| ; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] |
| ; GCN-NEXT: s_cbranch_execnz .LBB7_1 |
| ; GCN-NEXT: ; %bb.2: |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| -; GCN-NEXT: v_mov_b32_e32 v0, v40 |
| -; GCN-NEXT: v_readlane_b32 s63, v41, 31 |
| -; GCN-NEXT: v_readlane_b32 s62, v41, 30 |
| -; GCN-NEXT: v_readlane_b32 s61, v41, 29 |
| -; GCN-NEXT: v_readlane_b32 s60, v41, 28 |
| -; GCN-NEXT: v_readlane_b32 s59, v41, 27 |
| -; GCN-NEXT: v_readlane_b32 s58, v41, 26 |
| -; GCN-NEXT: v_readlane_b32 s57, v41, 25 |
| -; GCN-NEXT: v_readlane_b32 s56, v41, 24 |
| -; GCN-NEXT: v_readlane_b32 s55, v41, 23 |
| -; GCN-NEXT: v_readlane_b32 s54, v41, 22 |
| -; GCN-NEXT: v_readlane_b32 s53, v41, 21 |
| -; GCN-NEXT: v_readlane_b32 s52, v41, 20 |
| -; GCN-NEXT: v_readlane_b32 s51, v41, 19 |
| -; GCN-NEXT: v_readlane_b32 s50, v41, 18 |
| -; GCN-NEXT: v_readlane_b32 s49, v41, 17 |
| -; GCN-NEXT: v_readlane_b32 s48, v41, 16 |
| -; GCN-NEXT: v_readlane_b32 s47, v41, 15 |
| -; GCN-NEXT: v_readlane_b32 s46, v41, 14 |
| -; GCN-NEXT: v_readlane_b32 s45, v41, 13 |
| -; GCN-NEXT: v_readlane_b32 s44, v41, 12 |
| -; GCN-NEXT: v_readlane_b32 s43, v41, 11 |
| -; GCN-NEXT: v_readlane_b32 s42, v41, 10 |
| -; GCN-NEXT: v_readlane_b32 s41, v41, 9 |
| -; GCN-NEXT: v_readlane_b32 s40, v41, 8 |
| -; GCN-NEXT: v_readlane_b32 s39, v41, 7 |
| -; GCN-NEXT: v_readlane_b32 s38, v41, 6 |
| -; GCN-NEXT: v_readlane_b32 s37, v41, 5 |
| -; GCN-NEXT: v_readlane_b32 s36, v41, 4 |
| -; GCN-NEXT: v_readlane_b32 s35, v41, 3 |
| -; GCN-NEXT: v_readlane_b32 s34, v41, 2 |
| -; GCN-NEXT: v_readlane_b32 s31, v41, 1 |
| -; GCN-NEXT: v_readlane_b32 s30, v41, 0 |
| -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; GCN-NEXT: v_mov_b32_e32 v0, v41 |
| +; GCN-NEXT: v_readlane_b32 s63, v40, 31 |
| +; GCN-NEXT: v_readlane_b32 s62, v40, 30 |
| +; GCN-NEXT: v_readlane_b32 s61, v40, 29 |
| +; GCN-NEXT: v_readlane_b32 s60, v40, 28 |
| +; GCN-NEXT: v_readlane_b32 s59, v40, 27 |
| +; GCN-NEXT: v_readlane_b32 s58, v40, 26 |
| +; GCN-NEXT: v_readlane_b32 s57, v40, 25 |
| +; GCN-NEXT: v_readlane_b32 s56, v40, 24 |
| +; GCN-NEXT: v_readlane_b32 s55, v40, 23 |
| +; GCN-NEXT: v_readlane_b32 s54, v40, 22 |
| +; GCN-NEXT: v_readlane_b32 s53, v40, 21 |
| +; GCN-NEXT: v_readlane_b32 s52, v40, 20 |
| +; GCN-NEXT: v_readlane_b32 s51, v40, 19 |
| +; GCN-NEXT: v_readlane_b32 s50, v40, 18 |
| +; GCN-NEXT: v_readlane_b32 s49, v40, 17 |
| +; GCN-NEXT: v_readlane_b32 s48, v40, 16 |
| +; GCN-NEXT: v_readlane_b32 s47, v40, 15 |
| +; GCN-NEXT: v_readlane_b32 s46, v40, 14 |
| +; GCN-NEXT: v_readlane_b32 s45, v40, 13 |
| +; GCN-NEXT: v_readlane_b32 s44, v40, 12 |
| +; GCN-NEXT: v_readlane_b32 s43, v40, 11 |
| +; GCN-NEXT: v_readlane_b32 s42, v40, 10 |
| +; GCN-NEXT: v_readlane_b32 s41, v40, 9 |
| +; GCN-NEXT: v_readlane_b32 s40, v40, 8 |
| +; GCN-NEXT: v_readlane_b32 s39, v40, 7 |
| +; GCN-NEXT: v_readlane_b32 s38, v40, 6 |
| +; GCN-NEXT: v_readlane_b32 s37, v40, 5 |
| +; GCN-NEXT: v_readlane_b32 s36, v40, 4 |
| +; GCN-NEXT: v_readlane_b32 s35, v40, 3 |
| +; GCN-NEXT: v_readlane_b32 s34, v40, 2 |
| +; GCN-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GCN-NEXT: v_readlane_b32 s30, v40, 0 |
| +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GCN-NEXT: s_mov_b32 s33, s12 |
| +; GCN-NEXT: s_mov_b32 s33, s10 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GISEL-NEXT: s_mov_b32 s12, s33 |
| +; GISEL-NEXT: s_mov_b32 s10, s33 |
| ; GISEL-NEXT: s_mov_b32 s33, s32 |
| ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GISEL-NEXT: s_mov_b64 exec, s[4:5] |
| ; GISEL-NEXT: s_addk_i32 s32, 0x400 |
| -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GISEL-NEXT: ; implicit-def: $vgpr41 |
| -; GISEL-NEXT: v_writelane_b32 v41, s30, 0 |
| -; GISEL-NEXT: v_writelane_b32 v41, s31, 1 |
| -; GISEL-NEXT: v_writelane_b32 v41, s34, 2 |
| -; GISEL-NEXT: v_writelane_b32 v41, s35, 3 |
| -; GISEL-NEXT: v_writelane_b32 v41, s36, 4 |
| -; GISEL-NEXT: v_writelane_b32 v41, s37, 5 |
| -; GISEL-NEXT: v_writelane_b32 v41, s38, 6 |
| -; GISEL-NEXT: v_writelane_b32 v41, s39, 7 |
| -; GISEL-NEXT: v_writelane_b32 v41, s40, 8 |
| -; GISEL-NEXT: v_writelane_b32 v41, s41, 9 |
| -; GISEL-NEXT: v_writelane_b32 v41, s42, 10 |
| -; GISEL-NEXT: v_writelane_b32 v41, s43, 11 |
| -; GISEL-NEXT: v_writelane_b32 v41, s44, 12 |
| -; GISEL-NEXT: v_writelane_b32 v41, s45, 13 |
| -; GISEL-NEXT: v_writelane_b32 v41, s46, 14 |
| -; GISEL-NEXT: v_writelane_b32 v41, s47, 15 |
| -; GISEL-NEXT: v_writelane_b32 v41, s48, 16 |
| -; GISEL-NEXT: v_writelane_b32 v41, s49, 17 |
| -; GISEL-NEXT: v_writelane_b32 v41, s50, 18 |
| -; GISEL-NEXT: v_writelane_b32 v41, s51, 19 |
| -; GISEL-NEXT: v_writelane_b32 v41, s52, 20 |
| -; GISEL-NEXT: v_writelane_b32 v41, s53, 21 |
| -; GISEL-NEXT: v_writelane_b32 v41, s54, 22 |
| -; GISEL-NEXT: v_writelane_b32 v41, s55, 23 |
| -; GISEL-NEXT: v_writelane_b32 v41, s56, 24 |
| -; GISEL-NEXT: v_writelane_b32 v41, s57, 25 |
| -; GISEL-NEXT: v_writelane_b32 v41, s58, 26 |
| -; GISEL-NEXT: v_writelane_b32 v41, s59, 27 |
| -; GISEL-NEXT: v_writelane_b32 v41, s60, 28 |
| -; GISEL-NEXT: v_writelane_b32 v41, s61, 29 |
| -; GISEL-NEXT: v_writelane_b32 v41, s62, 30 |
| -; GISEL-NEXT: v_writelane_b32 v41, s63, 31 |
| -; GISEL-NEXT: v_mov_b32_e32 v40, v0 |
| +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GISEL-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GISEL-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GISEL-NEXT: v_writelane_b32 v40, s34, 2 |
| +; GISEL-NEXT: v_writelane_b32 v40, s35, 3 |
| +; GISEL-NEXT: v_writelane_b32 v40, s36, 4 |
| +; GISEL-NEXT: v_writelane_b32 v40, s37, 5 |
| +; GISEL-NEXT: v_writelane_b32 v40, s38, 6 |
| +; GISEL-NEXT: v_writelane_b32 v40, s39, 7 |
| +; GISEL-NEXT: v_writelane_b32 v40, s40, 8 |
| +; GISEL-NEXT: v_writelane_b32 v40, s41, 9 |
| +; GISEL-NEXT: v_writelane_b32 v40, s42, 10 |
| +; GISEL-NEXT: v_writelane_b32 v40, s43, 11 |
| +; GISEL-NEXT: v_writelane_b32 v40, s44, 12 |
| +; GISEL-NEXT: v_writelane_b32 v40, s45, 13 |
| +; GISEL-NEXT: v_writelane_b32 v40, s46, 14 |
| +; GISEL-NEXT: v_writelane_b32 v40, s47, 15 |
| +; GISEL-NEXT: v_writelane_b32 v40, s48, 16 |
| +; GISEL-NEXT: v_writelane_b32 v40, s49, 17 |
| +; GISEL-NEXT: v_writelane_b32 v40, s50, 18 |
| +; GISEL-NEXT: v_writelane_b32 v40, s51, 19 |
| +; GISEL-NEXT: v_writelane_b32 v40, s52, 20 |
| +; GISEL-NEXT: v_writelane_b32 v40, s53, 21 |
| +; GISEL-NEXT: v_writelane_b32 v40, s54, 22 |
| +; GISEL-NEXT: v_writelane_b32 v40, s55, 23 |
| +; GISEL-NEXT: v_writelane_b32 v40, s56, 24 |
| +; GISEL-NEXT: v_writelane_b32 v40, s57, 25 |
| +; GISEL-NEXT: v_writelane_b32 v40, s58, 26 |
| +; GISEL-NEXT: v_writelane_b32 v40, s59, 27 |
| +; GISEL-NEXT: v_writelane_b32 v40, s60, 28 |
| +; GISEL-NEXT: v_writelane_b32 v40, s61, 29 |
| +; GISEL-NEXT: v_writelane_b32 v40, s62, 30 |
| +; GISEL-NEXT: v_writelane_b32 v40, s63, 31 |
| +; GISEL-NEXT: v_mov_b32_e32 v41, v0 |
| ; GISEL-NEXT: s_mov_b64 s[4:5], exec |
| ; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 |
| ; GISEL-NEXT: v_readfirstlane_b32 s6, v1 |
| ; GISEL-NEXT: v_readfirstlane_b32 s7, v2 |
| ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] |
| ; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc |
| -; GISEL-NEXT: v_mov_b32_e32 v0, v40 |
| +; GISEL-NEXT: v_mov_b32_e32 v0, v41 |
| ; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] |
| ; GISEL-NEXT: ; implicit-def: $vgpr1 |
| ; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] |
| ; GISEL-NEXT: s_cbranch_execnz .LBB7_1 |
| ; GISEL-NEXT: ; %bb.2: |
| ; GISEL-NEXT: s_mov_b64 exec, s[4:5] |
| -; GISEL-NEXT: v_mov_b32_e32 v0, v40 |
| -; GISEL-NEXT: v_readlane_b32 s63, v41, 31 |
| -; GISEL-NEXT: v_readlane_b32 s62, v41, 30 |
| -; GISEL-NEXT: v_readlane_b32 s61, v41, 29 |
| -; GISEL-NEXT: v_readlane_b32 s60, v41, 28 |
| -; GISEL-NEXT: v_readlane_b32 s59, v41, 27 |
| -; GISEL-NEXT: v_readlane_b32 s58, v41, 26 |
| -; GISEL-NEXT: v_readlane_b32 s57, v41, 25 |
| -; GISEL-NEXT: v_readlane_b32 s56, v41, 24 |
| -; GISEL-NEXT: v_readlane_b32 s55, v41, 23 |
| -; GISEL-NEXT: v_readlane_b32 s54, v41, 22 |
| -; GISEL-NEXT: v_readlane_b32 s53, v41, 21 |
| -; GISEL-NEXT: v_readlane_b32 s52, v41, 20 |
| -; GISEL-NEXT: v_readlane_b32 s51, v41, 19 |
| -; GISEL-NEXT: v_readlane_b32 s50, v41, 18 |
| -; GISEL-NEXT: v_readlane_b32 s49, v41, 17 |
| -; GISEL-NEXT: v_readlane_b32 s48, v41, 16 |
| -; GISEL-NEXT: v_readlane_b32 s47, v41, 15 |
| -; GISEL-NEXT: v_readlane_b32 s46, v41, 14 |
| -; GISEL-NEXT: v_readlane_b32 s45, v41, 13 |
| -; GISEL-NEXT: v_readlane_b32 s44, v41, 12 |
| -; GISEL-NEXT: v_readlane_b32 s43, v41, 11 |
| -; GISEL-NEXT: v_readlane_b32 s42, v41, 10 |
| -; GISEL-NEXT: v_readlane_b32 s41, v41, 9 |
| -; GISEL-NEXT: v_readlane_b32 s40, v41, 8 |
| -; GISEL-NEXT: v_readlane_b32 s39, v41, 7 |
| -; GISEL-NEXT: v_readlane_b32 s38, v41, 6 |
| -; GISEL-NEXT: v_readlane_b32 s37, v41, 5 |
| -; GISEL-NEXT: v_readlane_b32 s36, v41, 4 |
| -; GISEL-NEXT: v_readlane_b32 s35, v41, 3 |
| -; GISEL-NEXT: v_readlane_b32 s34, v41, 2 |
| -; GISEL-NEXT: v_readlane_b32 s31, v41, 1 |
| -; GISEL-NEXT: v_readlane_b32 s30, v41, 0 |
| -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; GISEL-NEXT: v_mov_b32_e32 v0, v41 |
| +; GISEL-NEXT: v_readlane_b32 s63, v40, 31 |
| +; GISEL-NEXT: v_readlane_b32 s62, v40, 30 |
| +; GISEL-NEXT: v_readlane_b32 s61, v40, 29 |
| +; GISEL-NEXT: v_readlane_b32 s60, v40, 28 |
| +; GISEL-NEXT: v_readlane_b32 s59, v40, 27 |
| +; GISEL-NEXT: v_readlane_b32 s58, v40, 26 |
| +; GISEL-NEXT: v_readlane_b32 s57, v40, 25 |
| +; GISEL-NEXT: v_readlane_b32 s56, v40, 24 |
| +; GISEL-NEXT: v_readlane_b32 s55, v40, 23 |
| +; GISEL-NEXT: v_readlane_b32 s54, v40, 22 |
| +; GISEL-NEXT: v_readlane_b32 s53, v40, 21 |
| +; GISEL-NEXT: v_readlane_b32 s52, v40, 20 |
| +; GISEL-NEXT: v_readlane_b32 s51, v40, 19 |
| +; GISEL-NEXT: v_readlane_b32 s50, v40, 18 |
| +; GISEL-NEXT: v_readlane_b32 s49, v40, 17 |
| +; GISEL-NEXT: v_readlane_b32 s48, v40, 16 |
| +; GISEL-NEXT: v_readlane_b32 s47, v40, 15 |
| +; GISEL-NEXT: v_readlane_b32 s46, v40, 14 |
| +; GISEL-NEXT: v_readlane_b32 s45, v40, 13 |
| +; GISEL-NEXT: v_readlane_b32 s44, v40, 12 |
| +; GISEL-NEXT: v_readlane_b32 s43, v40, 11 |
| +; GISEL-NEXT: v_readlane_b32 s42, v40, 10 |
| +; GISEL-NEXT: v_readlane_b32 s41, v40, 9 |
| +; GISEL-NEXT: v_readlane_b32 s40, v40, 8 |
| +; GISEL-NEXT: v_readlane_b32 s39, v40, 7 |
| +; GISEL-NEXT: v_readlane_b32 s38, v40, 6 |
| +; GISEL-NEXT: v_readlane_b32 s37, v40, 5 |
| +; GISEL-NEXT: v_readlane_b32 s36, v40, 4 |
| +; GISEL-NEXT: v_readlane_b32 s35, v40, 3 |
| +; GISEL-NEXT: v_readlane_b32 s34, v40, 2 |
| +; GISEL-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GISEL-NEXT: v_readlane_b32 s30, v40, 0 |
| +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; GISEL-NEXT: s_mov_b64 exec, s[4:5] |
| ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GISEL-NEXT: s_mov_b32 s33, s12 |
| +; GISEL-NEXT: s_mov_b32 s33, s10 |
| ; GISEL-NEXT: s_waitcnt vmcnt(0) |
| ; GISEL-NEXT: s_setpc_b64 s[30:31] |
| call amdgpu_gfx void %fptr(i32 %i) |
| @@ -1543,13 +1531,12 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { |
| ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s12, s33 |
| +; GCN-NEXT: s_mov_b32 s10, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr40 |
| ; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GCN-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -1633,20 +1620,19 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { |
| ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GCN-NEXT: s_mov_b32 s33, s12 |
| +; GCN-NEXT: s_mov_b32 s33, s10 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GISEL-NEXT: s_mov_b32 s12, s33 |
| +; GISEL-NEXT: s_mov_b32 s10, s33 |
| ; GISEL-NEXT: s_mov_b32 s33, s32 |
| ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GISEL-NEXT: s_mov_b64 exec, s[4:5] |
| ; GISEL-NEXT: s_addk_i32 s32, 0x400 |
| -; GISEL-NEXT: ; implicit-def: $vgpr40 |
| ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -1730,7 +1716,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { |
| ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GISEL-NEXT: s_mov_b64 exec, s[4:5] |
| ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GISEL-NEXT: s_mov_b32 s33, s12 |
| +; GISEL-NEXT: s_mov_b32 s33, s10 |
| ; GISEL-NEXT: s_waitcnt vmcnt(0) |
| ; GISEL-NEXT: s_setpc_b64 s[30:31] |
| %ret = call amdgpu_gfx i32 %fptr(i32 %i) |
| @@ -1742,13 +1728,12 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { |
| ; GCN-LABEL: test_indirect_tail_call_vgpr_ptr: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s12, s33 |
| +; GCN-NEXT: s_mov_b32 s10, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: ; implicit-def: $vgpr40 |
| ; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GCN-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GCN-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -1829,20 +1814,19 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { |
| ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GCN-NEXT: s_mov_b32 s33, s12 |
| +; GCN-NEXT: s_mov_b32 s33, s10 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| ; |
| ; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr: |
| ; GISEL: ; %bb.0: |
| ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GISEL-NEXT: s_mov_b32 s12, s33 |
| +; GISEL-NEXT: s_mov_b32 s10, s33 |
| ; GISEL-NEXT: s_mov_b32 s33, s32 |
| ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GISEL-NEXT: s_mov_b64 exec, s[4:5] |
| ; GISEL-NEXT: s_addk_i32 s32, 0x400 |
| -; GISEL-NEXT: ; implicit-def: $vgpr40 |
| ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GISEL-NEXT: v_writelane_b32 v40, s34, 2 |
| @@ -1923,7 +1907,7 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { |
| ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GISEL-NEXT: s_mov_b64 exec, s[4:5] |
| ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 |
| -; GISEL-NEXT: s_mov_b32 s33, s12 |
| +; GISEL-NEXT: s_mov_b32 s33, s10 |
| ; GISEL-NEXT: s_waitcnt vmcnt(0) |
| ; GISEL-NEXT: s_setpc_b64 s[30:31] |
| tail call amdgpu_gfx void %fptr() |
| diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll |
| index f956251e4fb5..f196192f723f 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll |
| @@ -13,30 +13,17 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { |
| ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 |
| ; CHECK-NEXT: s_add_u32 s0, s0, s17 |
| ; CHECK-NEXT: s_addc_u32 s1, s1, 0 |
| -; CHECK-NEXT: ; implicit-def: $vgpr3 |
| -; CHECK-NEXT: v_writelane_b32 v3, s16, 0 |
| -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; CHECK-NEXT: s_add_i32 s12, s33, 0x100200 |
| -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s12 ; 4-byte Folded Spill |
| -; CHECK-NEXT: s_mov_b64 exec, s[34:35] |
| +; CHECK-NEXT: v_writelane_b32 v40, s16, 0 |
| ; CHECK-NEXT: s_mov_b32 s13, s15 |
| ; CHECK-NEXT: s_mov_b32 s12, s14 |
| -; CHECK-NEXT: v_readlane_b32 s14, v3, 0 |
| +; CHECK-NEXT: v_readlane_b32 s14, v40, 0 |
| ; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] |
| ; CHECK-NEXT: v_mov_b32_e32 v3, v2 |
| ; CHECK-NEXT: v_mov_b32_e32 v2, v1 |
| ; CHECK-NEXT: v_mov_b32_e32 v1, v0 |
| -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s8 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_mov_b64 exec, s[34:35] |
| ; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0 |
| -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| -; CHECK-NEXT: v_writelane_b32 v0, s8, 1 |
| -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s8 ; 4-byte Folded Spill |
| -; CHECK-NEXT: s_mov_b64 exec, s[34:35] |
| +; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| +; CHECK-NEXT: v_writelane_b32 v40, s8, 1 |
| ; CHECK-NEXT: ;;#ASMSTART |
| ; CHECK-NEXT: ; def vgpr10 |
| ; CHECK-NEXT: ;;#ASMEND |
| @@ -69,14 +56,9 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { |
| ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_mov_b64 exec, s[34:35] |
| ; CHECK-NEXT: s_add_i32 s4, s33, 0x100100 |
| ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_waitcnt vmcnt(1) |
| -; CHECK-NEXT: v_readlane_b32 s4, v0, 1 |
| +; CHECK-NEXT: v_readlane_b32 s4, v40, 1 |
| ; CHECK-NEXT: s_mov_b32 s5, 0 |
| ; CHECK-NEXT: s_cmp_eq_u32 s4, s5 |
| ; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000 |
| diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll |
| index 98641f302a6a..8946846898f8 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll |
| @@ -3037,17 +3037,17 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % |
| ; GCN-HSA-NEXT: s_lshr_b32 s35, s8, 16 |
| ; GCN-HSA-NEXT: s_lshr_b32 s37, s11, 16 |
| ; GCN-HSA-NEXT: s_lshr_b32 s38, s10, 16 |
| -; GCN-HSA-NEXT: s_lshr_b32 s40, s13, 16 |
| -; GCN-HSA-NEXT: s_lshr_b32 s41, s12, 16 |
| -; GCN-HSA-NEXT: s_lshr_b32 s42, s15, 16 |
| -; GCN-HSA-NEXT: s_lshr_b32 s43, s14, 16 |
| +; GCN-HSA-NEXT: s_lshr_b32 s39, s13, 16 |
| +; GCN-HSA-NEXT: s_lshr_b32 s40, s12, 16 |
| +; GCN-HSA-NEXT: s_lshr_b32 s41, s15, 16 |
| +; GCN-HSA-NEXT: s_lshr_b32 s42, s14, 16 |
| ; GCN-HSA-NEXT: s_and_b32 s25, s1, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s27, s0, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s29, s3, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s31, s2, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s34, s5, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s36, s4, 0xffff |
| -; GCN-HSA-NEXT: s_and_b32 s39, s7, 0xffff |
| +; GCN-HSA-NEXT: s_and_b32 s43, s7, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s44, s6, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s45, s9, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s46, s8, 0xffff |
| @@ -3172,13 +3172,13 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s52 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s18 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s43 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s42 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s50 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s51 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s42 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s41 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s40 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s49 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s40 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[20:23] |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7] |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] |
| @@ -3206,7 +3206,7 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) % |
| ; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s44 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s30 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s43 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 |
| ; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| @@ -6181,129 +6181,129 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) % |
| ; GCN-HSA-NEXT: s_lshr_b32 s33, s4, 16 |
| ; GCN-HSA-NEXT: s_lshr_b32 s34, s2, 16 |
| ; GCN-HSA-NEXT: s_lshr_b32 s18, s0, 16 |
| -; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff |
| -; GCN-HSA-NEXT: s_and_b32 s35, s2, 0xffff |
| +; GCN-HSA-NEXT: s_and_b32 s35, s0, 0xffff |
| +; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s12, s12, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s14, s14, 0xffff |
| -; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff |
| -; GCN-HSA-NEXT: s_and_b32 s36, s3, 0xffff |
| +; GCN-HSA-NEXT: s_and_b32 s36, s1, 0xffff |
| +; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s9, s9, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff |
| ; GCN-HSA-NEXT: s_and_b32 s15, s15, 0xffff |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xf0 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xd0 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xb0 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x90 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xf0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xd0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xb0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x90 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s26 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x70 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x70 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s13 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s24 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s23 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x50 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x50 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 48 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 48 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 16 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 16 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xe0 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xe0 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xc0 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xc0 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0xa0 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0xa0 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x80 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x80 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 0x60 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 0x60 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s30 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 64 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 64 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 |
| -; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 |
| +; GCN-HSA-NEXT: s_add_u32 s0, s16, 32 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s33 |
| -; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 |
| +; GCN-HSA-NEXT: s_addc_u32 s1, s17, 0 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 |
| -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 |
| +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 |
| ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 |
| ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] |
| @@ -6589,17 +6589,17 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % |
| ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s12, 16 |
| ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s10, 16 |
| ; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s8, 16 |
| -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[20:21], 0x100000 |
| -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[52:53], s[18:19], 0x100000 |
| -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s6, 16 |
| -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s4, 16 |
| -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s2, 16 |
| -; GCN-NOHSA-SI-NEXT: s_lshr_b32 s60, s0, 16 |
| +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[20:21], 0x100000 |
| +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[50:51], s[18:19], 0x100000 |
| +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s52, s6, 16 |
| +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s54, s4, 16 |
| +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s56, s2, 16 |
| +; GCN-NOHSA-SI-NEXT: s_lshr_b32 s58, s0, 16 |
| ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 |
| ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 |
| ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[4:5], 0x100000 |
| ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x100000 |
| -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[48:49], s[8:9], 0x100000 |
| +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[60:61], s[8:9], 0x100000 |
| ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[62:63], s[10:11], 0x100000 |
| ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[64:65], s[12:13], 0x100000 |
| ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[66:67], s[14:15], 0x100000 |
| @@ -6613,12 +6613,12 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % |
| ; GCN-NOHSA-SI-NEXT: s_ashr_i64 s[4:5], s[4:5], 48 |
| ; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s16 |
| ; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s17 |
| -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s52 |
| -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s53 |
| +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s50 |
| +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s51 |
| ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12 |
| ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13 |
| -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s50 |
| -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s51 |
| +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s48 |
| +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s49 |
| ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s2 |
| ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s3 |
| ; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000 |
| @@ -6647,10 +6647,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % |
| ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, s4 |
| ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, s5 |
| ; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208 |
| -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[60:61], 0x100000 |
| -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[58:59], 0x100000 |
| -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[56:57], 0x100000 |
| -; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[54:55], 0x100000 |
| +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[58:59], 0x100000 |
| +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[56:57], 0x100000 |
| +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[54:55], 0x100000 |
| +; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[52:53], 0x100000 |
| ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[30:31], 0x100000 |
| ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[28:29], 0x100000 |
| ; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[26:27], 0x100000 |
| @@ -6678,8 +6678,8 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % |
| ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s65 |
| ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s62 |
| ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s63 |
| -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s48 |
| -; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s49 |
| +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s60 |
| +; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s61 |
| ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s34 |
| ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s35 |
| ; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, s24 |
| @@ -6923,123 +6923,123 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) % |
| ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NOHSA-VI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 |
| ; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0) |
| -; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, s15 |
| -; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s13 |
| +; GCN-NOHSA-VI-NEXT: s_mov_b32 s36, s15 |
| +; GCN-NOHSA-VI-NEXT: s_mov_b32 s38, s13 |
| ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[82:83], s[14:15], 48 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 |
| -; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s11 |
| -; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s3 |
| -; GCN-NOHSA-VI-NEXT: s_mov_b32 s52, s1 |
| -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s2, 16 |
| -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s68, s0, 16 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_mov_b32 s40, s11 |
| +; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s3 |
| +; GCN-NOHSA-VI-NEXT: s_mov_b32 s50, s1 |
| +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s2, 16 |
| +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s66, s0, 16 |
| ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[0:1], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[2:3], 0x100000 |
| -; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[36:37], s[0:1], 48 |
| +; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[68:69], s[0:1], 48 |
| ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[70:71], s[2:3], 48 |
| ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[80:81], s[12:13], 48 |
| ; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000 |
| ; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1 |
| ; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s16 |
| ; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s17 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s82 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s83 |
| -; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s9 |
| +; GCN-NOHSA-VI-NEXT: s_mov_b32 s42, s9 |
| ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[78:79], s[10:11], 48 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240 |
| -; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s7 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 |
| +; GCN-NOHSA-VI-NEXT: s_mov_b32 s44, s7 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s38 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s39 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s80 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s81 |
| ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[76:77], s[8:9], 48 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208 |
| -; GCN-NOHSA-VI-NEXT: s_mov_b32 s48, s5 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 |
| +; GCN-NOHSA-VI-NEXT: s_mov_b32 s46, s5 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s40 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s41 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s78 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s79 |
| ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[74:75], s[6:7], 48 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176 |
| ; GCN-NOHSA-VI-NEXT: s_ashr_i64 s[72:73], s[4:5], 48 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s42 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s43 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s76 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s77 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s44 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s45 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s74 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s75 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112 |
| -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s14, 16 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 |
| +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s14, 16 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s46 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s47 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s73 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80 |
| -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s12, 16 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 |
| +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s12, 16 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s48 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s49 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71 |
| ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[14:15], 0x100000 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48 |
| -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 16 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s52 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s53 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37 |
| +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s68 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s69 |
| ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[12:13], 0x100000 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[56:57], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[54:55], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 |
| -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s8, 16 |
| +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s8, 16 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54 |
| -; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52 |
| +; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53 |
| ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[10:11], 0x100000 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[58:59], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[56:57], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224 |
| -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s6, 16 |
| +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s6, 16 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17 |
| ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x100000 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[60:61], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[58:59], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192 |
| -; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s4, 16 |
| +; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s4, 16 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15 |
| ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x100000 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[62:63], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160 |
| ; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[4:5], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[64:65], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[64:65], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s24 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s25 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11 |
| ; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96 |
| -; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[68:69], 0x100000 |
| +; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[66:67], 0x100000 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23 |
| ; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8 |
| diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll |
| index f5ca18d721fb..36ac9d212b68 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll |
| @@ -231,10 +231,10 @@ entry: |
| ; W64-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec |
| |
| ; W64-O0: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 |
| -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:44 ; 4-byte Folded Reload |
| +; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| +; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| +; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| +; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| ; W64-O0: s_waitcnt vmcnt(0) |
| ; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] |
| ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] |
| @@ -251,7 +251,7 @@ entry: |
| ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] |
| ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] |
| ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] |
| -; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 ; 4-byte Folded Reload |
| +; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload |
| ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen |
| ; W64-O0: s_waitcnt vmcnt(0) |
| ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill |
| @@ -264,18 +264,16 @@ entry: |
| ; W64-O0: s_cbranch_execz [[TERMBB:.LBB[0-9]+_[0-9]+]] |
| |
| ; W64-O0: ; %bb.{{[0-9]+}}: ; %bb1 |
| -; W64-O0: buffer_load_dword |
| -; W64-O0: buffer_store_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill |
| +; W64-O0-DAG: buffer_store_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF:[0-9]+]] ; 4-byte Folded Spill |
| ; W64-O0-DAG: s_mov_b64 s[[[SAVEEXEC0:[0-9]+]]:[[SAVEEXEC1:[0-9]+]]], exec |
| ; W64-O0: v_writelane_b32 [[VSAVEEXEC:v[0-9]+]], s[[SAVEEXEC0]], [[SAVEEXEC_IDX0:[0-9]+]] |
| ; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]] |
| -; W64-O0: buffer_store_dword [[VSAVEEXEC]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill |
| |
| ; W64-O0: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1 |
| -; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| -; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| -; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| -; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:20 ; 4-byte Folded Reload |
| +; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| +; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| +; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| +; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| ; W64-O0: s_waitcnt vmcnt(0) |
| ; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]] |
| ; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]] |
| @@ -292,9 +290,7 @@ entry: |
| ; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]] |
| ; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]] |
| ; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] |
| -; W64-O0: buffer_store_dword |
| ; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:[[IDX_OFF]] ; 4-byte Folded Reload |
| -; W64-O0: buffer_load_dword |
| ; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen |
| ; W64-O0: s_waitcnt vmcnt(0) |
| ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill |
| @@ -302,9 +298,8 @@ entry: |
| ; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB1]] |
| |
| ; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload |
| -; W64-O0: buffer_load_dword [[VSAVEEXEC1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Reload |
| -; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC1]], [[SAVEEXEC_IDX0]] |
| -; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC1]], [[SAVEEXEC_IDX1]] |
| +; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]] |
| +; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]] |
| ; W64-O0: s_mov_b64 exec, s[[[SAVEEXEC0]]:[[SAVEEXEC1]]] |
| ; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Spill |
| |
| diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll |
| index d5bcf685f9f0..6a14b88eb630 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll |
| @@ -191,49 +191,48 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { |
| ; GFX9-NEXT: s_mov_b32 s4, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| -; GFX9-NEXT: ; implicit-def: $vgpr42 |
| +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x800 |
| -; GFX9-NEXT: v_writelane_b32 v42, s30, 0 |
| -; GFX9-NEXT: v_writelane_b32 v42, s31, 1 |
| -; GFX9-NEXT: v_writelane_b32 v42, s34, 2 |
| +; GFX9-NEXT: v_writelane_b32 v40, s34, 2 |
| ; GFX9-NEXT: v_writelane_b32 v44, s4, 0 |
| -; GFX9-NEXT: v_writelane_b32 v42, s36, 3 |
| +; GFX9-NEXT: v_writelane_b32 v40, s36, 3 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 |
| -; GFX9-NEXT: v_writelane_b32 v42, s37, 4 |
| +; GFX9-NEXT: v_writelane_b32 v40, s37, 4 |
| ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 |
| -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GFX9-NEXT: v_mov_b32_e32 v40, v1 |
| -; GFX9-NEXT: v_mov_b32_e32 v41, v0 |
| -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 |
| +; GFX9-NEXT: v_mov_b32_e32 v41, v1 |
| +; GFX9-NEXT: v_mov_b32_e32 v42, v0 |
| +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41 |
| ; GFX9-NEXT: s_mov_b32 s34, s15 |
| -; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v40 |
| +; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] |
| -; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v43 |
| +; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43 |
| ; GFX9-NEXT: s_mov_b32 s15, s34 |
| -; GFX9-NEXT: v_mov_b32_e32 v0, v40 |
| +; GFX9-NEXT: v_mov_b32_e32 v0, v41 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] |
| -; GFX9-NEXT: v_add_u32_e32 v0, v40, v43 |
| +; GFX9-NEXT: v_add_u32_e32 v0, v41, v43 |
| ; GFX9-NEXT: s_mov_b32 s15, s34 |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] |
| ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| -; GFX9-NEXT: v_readlane_b32 s37, v42, 4 |
| -; GFX9-NEXT: v_readlane_b32 s36, v42, 3 |
| -; GFX9-NEXT: v_readlane_b32 s34, v42, 2 |
| -; GFX9-NEXT: v_readlane_b32 s31, v42, 1 |
| -; GFX9-NEXT: v_readlane_b32 s30, v42, 0 |
| +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| +; GFX9-NEXT: v_readlane_b32 s37, v40, 4 |
| +; GFX9-NEXT: v_readlane_b32 s36, v40, 3 |
| +; GFX9-NEXT: v_readlane_b32 s34, v40, 2 |
| +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s4, v44, 0 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xf800 |
| diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll |
| index f11959c53dc1..50c27d1835c9 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll |
| @@ -27,19 +27,15 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { |
| ; CHECK-LABEL: csr_vgpr_spill_fp_callee: |
| ; CHECK: ; %bb.0: ; %bb |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; CHECK-NEXT: s_mov_b32 s14, s33 |
| +; CHECK-NEXT: s_mov_b32 s6, s33 |
| ; CHECK-NEXT: s_mov_b32 s33, s32 |
| ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_mov_b64 exec, s[4:5] |
| ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 |
| ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; CHECK-NEXT: ; implicit-def: $vgpr0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s30, 0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s31, 1 |
| -; CHECK-NEXT: s_or_saveexec_b64 s[12:13], -1 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; CHECK-NEXT: s_mov_b64 exec, s[12:13] |
| +; CHECK-NEXT: v_writelane_b32 v1, s30, 0 |
| +; CHECK-NEXT: v_writelane_b32 v1, s31, 1 |
| ; CHECK-NEXT: s_getpc_b64 s[4:5] |
| ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 |
| @@ -48,21 +44,17 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { |
| ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9] |
| ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| -; CHECK-NEXT: s_or_saveexec_b64 s[12:13], -1 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_mov_b64 exec, s[12:13] |
| ; CHECK-NEXT: ;;#ASMSTART |
| ; CHECK-NEXT: ; clobber csr v40 |
| ; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: s_waitcnt vmcnt(0) |
| -; CHECK-NEXT: v_readlane_b32 s31, v0, 1 |
| -; CHECK-NEXT: v_readlane_b32 s30, v0, 0 |
| +; CHECK-NEXT: v_readlane_b32 s31, v1, 1 |
| +; CHECK-NEXT: v_readlane_b32 s30, v1, 0 |
| ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_mov_b64 exec, s[4:5] |
| ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 |
| -; CHECK-NEXT: s_mov_b32 s33, s14 |
| +; CHECK-NEXT: s_mov_b32 s33, s6 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| bb: |
| @@ -99,22 +91,21 @@ define internal fastcc void @csr_vgpr_spill_fp_tailcall_callee() #0 { |
| ; CHECK: ; %bb.0: ; %bb |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_mov_b64 exec, s[4:5] |
| ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill |
| -; CHECK-NEXT: ; implicit-def: $vgpr0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s33, 0 |
| +; CHECK-NEXT: v_writelane_b32 v1, s33, 0 |
| ; CHECK-NEXT: ;;#ASMSTART |
| ; CHECK-NEXT: ; clobber csr v40 |
| ; CHECK-NEXT: ;;#ASMEND |
| ; CHECK-NEXT: s_getpc_b64 s[4:5] |
| ; CHECK-NEXT: s_add_u32 s4, s4, callee_has_fp@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, callee_has_fp@rel32@hi+12 |
| -; CHECK-NEXT: v_readlane_b32 s33, v0, 0 |
| +; CHECK-NEXT: v_readlane_b32 s33, v1, 0 |
| ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_xor_saveexec_b64 s[8:9], -1 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_mov_b64 exec, s[8:9] |
| +; CHECK-NEXT: s_xor_saveexec_b64 s[6:7], -1 |
| +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| +; CHECK-NEXT: s_mov_b64 exec, s[6:7] |
| ; CHECK-NEXT: s_setpc_b64 s[4:5] |
| bb: |
| call void asm sideeffect "; clobber csr v40", "~{v40}"() |
| @@ -161,13 +152,12 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { |
| ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; CHECK-NEXT: s_mov_b32 s12, s33 |
| +; CHECK-NEXT: s_mov_b32 s6, s33 |
| ; CHECK-NEXT: s_mov_b32 s33, s32 |
| ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_mov_b64 exec, s[4:5] |
| ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 |
| -; CHECK-NEXT: ; implicit-def: $vgpr1 |
| ; CHECK-NEXT: v_writelane_b32 v1, s30, 0 |
| ; CHECK-NEXT: v_writelane_b32 v1, s31, 1 |
| ; CHECK-NEXT: s_getpc_b64 s[4:5] |
| @@ -184,7 +174,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 { |
| ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_mov_b64 exec, s[4:5] |
| ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 |
| -; CHECK-NEXT: s_mov_b32 s33, s12 |
| +; CHECK-NEXT: s_mov_b32 s33, s6 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| @@ -196,19 +186,14 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { |
| ; CHECK-LABEL: caller_save_vgpr_spill_fp: |
| ; CHECK: ; %bb.0: ; %entry |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; CHECK-NEXT: s_mov_b32 s13, s33 |
| +; CHECK-NEXT: s_mov_b32 s7, s33 |
| ; CHECK-NEXT: s_mov_b32 s33, s32 |
| ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_mov_b64 exec, s[4:5] |
| ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 |
| -; CHECK-NEXT: ; implicit-def: $vgpr0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s30, 0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s31, 1 |
| -; CHECK-NEXT: s_or_saveexec_b64 s[14:15], -1 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; CHECK-NEXT: s_mov_b64 exec, s[14:15] |
| +; CHECK-NEXT: v_writelane_b32 v2, s30, 0 |
| +; CHECK-NEXT: v_writelane_b32 v2, s31, 1 |
| ; CHECK-NEXT: s_getpc_b64 s[4:5] |
| ; CHECK-NEXT: s_add_u32 s4, s4, caller_save_vgpr_spill_fp_tail_call@rel32@lo+4 |
| ; CHECK-NEXT: s_addc_u32 s5, s5, caller_save_vgpr_spill_fp_tail_call@rel32@hi+12 |
| @@ -217,18 +202,13 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { |
| ; CHECK-NEXT: s_mov_b64 s[0:1], s[8:9] |
| ; CHECK-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| -; CHECK-NEXT: s_or_saveexec_b64 s[14:15], -1 |
| -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_mov_b64 exec, s[14:15] |
| -; CHECK-NEXT: s_waitcnt vmcnt(0) |
| -; CHECK-NEXT: v_readlane_b32 s31, v1, 1 |
| -; CHECK-NEXT: v_readlane_b32 s30, v1, 0 |
| +; CHECK-NEXT: v_readlane_b32 s31, v2, 1 |
| +; CHECK-NEXT: v_readlane_b32 s30, v2, 0 |
| ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_mov_b64 exec, s[4:5] |
| ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 |
| -; CHECK-NEXT: s_mov_b32 s33, s13 |
| +; CHECK-NEXT: s_mov_b32 s33, s7 |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) |
| ; CHECK-NEXT: s_setpc_b64 s[30:31] |
| entry: |
| diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll |
| index 7df0d1027738..c71938eb8188 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll |
| @@ -15,21 +15,14 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { |
| ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; CHECK-NEXT: s_mov_b32 s16, s33 |
| ; CHECK-NEXT: s_mov_b32 s33, s32 |
| -; CHECK-NEXT: s_xor_saveexec_b64 s[18:19], -1 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; CHECK-NEXT: s_mov_b64 exec, -1 |
| -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 |
| +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; CHECK-NEXT: s_mov_b64 exec, s[18:19] |
| -; CHECK-NEXT: v_writelane_b32 v40, s34, 0 |
| -; CHECK-NEXT: v_writelane_b32 v40, s35, 1 |
| -; CHECK-NEXT: v_writelane_b32 v40, s16, 2 |
| +; CHECK-NEXT: v_writelane_b32 v41, s16, 0 |
| ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 |
| -; CHECK-NEXT: ; implicit-def: $vgpr0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s30, 0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s31, 1 |
| -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; CHECK-NEXT: s_mov_b64 exec, s[34:35] |
| +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 |
| +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 |
| ; CHECK-NEXT: .Ltmp0: |
| ; CHECK-NEXT: .loc 0 31 3 prologue_end ; lane-info.cpp:31:3 |
| ; CHECK-NEXT: s_getpc_b64 s[16:17] |
| @@ -43,20 +36,13 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { |
| ; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; CHECK-NEXT: .Ltmp1: |
| -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_mov_b64 exec, s[34:35] |
| ; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1 |
| -; CHECK-NEXT: s_waitcnt vmcnt(0) |
| -; CHECK-NEXT: v_readlane_b32 s31, v0, 1 |
| -; CHECK-NEXT: v_readlane_b32 s30, v0, 0 |
| -; CHECK-NEXT: v_readlane_b32 s34, v40, 0 |
| -; CHECK-NEXT: v_readlane_b32 s35, v40, 1 |
| -; CHECK-NEXT: v_readlane_b32 s4, v40, 2 |
| -; CHECK-NEXT: s_xor_saveexec_b64 s[6:7], -1 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_mov_b64 exec, -1 |
| -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| +; CHECK-NEXT: v_readlane_b32 s31, v40, 1 |
| +; CHECK-NEXT: v_readlane_b32 s30, v40, 0 |
| +; CHECK-NEXT: v_readlane_b32 s4, v41, 0 |
| +; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; CHECK-NEXT: s_mov_b64 exec, s[6:7] |
| ; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1 |
| ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 |
| diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll |
| index bf29873ba280..54229988f2ee 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll |
| @@ -11,17 +11,10 @@ |
| define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, i32 %in) #0 { |
| ; GCN-LABEL: spill_sgprs_to_multiple_vgprs: |
| ; GCN: ; %bb.0: |
| -; GCN-NEXT: s_mov_b32 s92, SCRATCH_RSRC_DWORD0 |
| -; GCN-NEXT: s_mov_b32 s93, SCRATCH_RSRC_DWORD1 |
| -; GCN-NEXT: s_mov_b32 s94, -1 |
| -; GCN-NEXT: s_mov_b32 s95, 0xe8f000 |
| -; GCN-NEXT: s_add_u32 s92, s92, s3 |
| -; GCN-NEXT: s_addc_u32 s93, s93, 0 |
| ; GCN-NEXT: s_load_dword s0, s[0:1], 0xb |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:11] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| ; GCN-NEXT: v_writelane_b32 v0, s4, 0 |
| ; GCN-NEXT: v_writelane_b32 v0, s5, 1 |
| ; GCN-NEXT: v_writelane_b32 v0, s6, 2 |
| @@ -107,285 +100,264 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, |
| ; GCN-NEXT: v_writelane_b32 v0, s9, 61 |
| ; GCN-NEXT: v_writelane_b32 v0, s10, 62 |
| ; GCN-NEXT: v_writelane_b32 v0, s11, 63 |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:12 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:11] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 0 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 1 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 2 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 3 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 4 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 5 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 6 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 7 |
| +; GCN-NEXT: v_writelane_b32 v1, s4, 0 |
| +; GCN-NEXT: v_writelane_b32 v1, s5, 1 |
| +; GCN-NEXT: v_writelane_b32 v1, s6, 2 |
| +; GCN-NEXT: v_writelane_b32 v1, s7, 3 |
| +; GCN-NEXT: v_writelane_b32 v1, s8, 4 |
| +; GCN-NEXT: v_writelane_b32 v1, s9, 5 |
| +; GCN-NEXT: v_writelane_b32 v1, s10, 6 |
| +; GCN-NEXT: v_writelane_b32 v1, s11, 7 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:11] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 8 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 9 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 10 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 11 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 12 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 13 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 14 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 15 |
| +; GCN-NEXT: v_writelane_b32 v1, s4, 8 |
| +; GCN-NEXT: v_writelane_b32 v1, s5, 9 |
| +; GCN-NEXT: v_writelane_b32 v1, s6, 10 |
| +; GCN-NEXT: v_writelane_b32 v1, s7, 11 |
| +; GCN-NEXT: v_writelane_b32 v1, s8, 12 |
| +; GCN-NEXT: v_writelane_b32 v1, s9, 13 |
| +; GCN-NEXT: v_writelane_b32 v1, s10, 14 |
| +; GCN-NEXT: v_writelane_b32 v1, s11, 15 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:11] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 16 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 17 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 18 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 19 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 20 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 21 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 22 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 23 |
| +; GCN-NEXT: v_writelane_b32 v1, s4, 16 |
| +; GCN-NEXT: v_writelane_b32 v1, s5, 17 |
| +; GCN-NEXT: v_writelane_b32 v1, s6, 18 |
| +; GCN-NEXT: v_writelane_b32 v1, s7, 19 |
| +; GCN-NEXT: v_writelane_b32 v1, s8, 20 |
| +; GCN-NEXT: v_writelane_b32 v1, s9, 21 |
| +; GCN-NEXT: v_writelane_b32 v1, s10, 22 |
| +; GCN-NEXT: v_writelane_b32 v1, s11, 23 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:11] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 24 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 25 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 26 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 27 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 28 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 29 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 30 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 31 |
| +; GCN-NEXT: v_writelane_b32 v1, s4, 24 |
| +; GCN-NEXT: v_writelane_b32 v1, s5, 25 |
| +; GCN-NEXT: v_writelane_b32 v1, s6, 26 |
| +; GCN-NEXT: v_writelane_b32 v1, s7, 27 |
| +; GCN-NEXT: v_writelane_b32 v1, s8, 28 |
| +; GCN-NEXT: v_writelane_b32 v1, s9, 29 |
| +; GCN-NEXT: v_writelane_b32 v1, s10, 30 |
| +; GCN-NEXT: v_writelane_b32 v1, s11, 31 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:11] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 32 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 33 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 34 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 35 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 36 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 37 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 38 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 39 |
| +; GCN-NEXT: v_writelane_b32 v1, s4, 32 |
| +; GCN-NEXT: v_writelane_b32 v1, s5, 33 |
| +; GCN-NEXT: v_writelane_b32 v1, s6, 34 |
| +; GCN-NEXT: v_writelane_b32 v1, s7, 35 |
| +; GCN-NEXT: v_writelane_b32 v1, s8, 36 |
| +; GCN-NEXT: v_writelane_b32 v1, s9, 37 |
| +; GCN-NEXT: v_writelane_b32 v1, s10, 38 |
| +; GCN-NEXT: v_writelane_b32 v1, s11, 39 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:11] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 40 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 41 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 42 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 43 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 44 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 45 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 46 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 47 |
| +; GCN-NEXT: v_writelane_b32 v1, s4, 40 |
| +; GCN-NEXT: v_writelane_b32 v1, s5, 41 |
| +; GCN-NEXT: v_writelane_b32 v1, s6, 42 |
| +; GCN-NEXT: v_writelane_b32 v1, s7, 43 |
| +; GCN-NEXT: v_writelane_b32 v1, s8, 44 |
| +; GCN-NEXT: v_writelane_b32 v1, s9, 45 |
| +; GCN-NEXT: v_writelane_b32 v1, s10, 46 |
| +; GCN-NEXT: v_writelane_b32 v1, s11, 47 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:11] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 48 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 49 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 50 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 51 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 52 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 53 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 54 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 55 |
| +; GCN-NEXT: v_writelane_b32 v1, s4, 48 |
| +; GCN-NEXT: v_writelane_b32 v1, s5, 49 |
| +; GCN-NEXT: v_writelane_b32 v1, s6, 50 |
| +; GCN-NEXT: v_writelane_b32 v1, s7, 51 |
| +; GCN-NEXT: v_writelane_b32 v1, s8, 52 |
| +; GCN-NEXT: v_writelane_b32 v1, s9, 53 |
| +; GCN-NEXT: v_writelane_b32 v1, s10, 54 |
| +; GCN-NEXT: v_writelane_b32 v1, s11, 55 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:11] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 56 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 57 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 58 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 59 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 60 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 61 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 62 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 63 |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| +; GCN-NEXT: v_writelane_b32 v1, s4, 56 |
| +; GCN-NEXT: v_writelane_b32 v1, s5, 57 |
| +; GCN-NEXT: v_writelane_b32 v1, s6, 58 |
| +; GCN-NEXT: v_writelane_b32 v1, s7, 59 |
| +; GCN-NEXT: v_writelane_b32 v1, s8, 60 |
| +; GCN-NEXT: v_writelane_b32 v1, s9, 61 |
| +; GCN-NEXT: v_writelane_b32 v1, s10, 62 |
| +; GCN-NEXT: v_writelane_b32 v1, s11, 63 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:11] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 0 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 1 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 2 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 3 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 4 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 5 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 6 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 7 |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| +; GCN-NEXT: v_writelane_b32 v2, s4, 0 |
| +; GCN-NEXT: v_writelane_b32 v2, s5, 1 |
| +; GCN-NEXT: v_writelane_b32 v2, s6, 2 |
| +; GCN-NEXT: v_writelane_b32 v2, s7, 3 |
| +; GCN-NEXT: v_writelane_b32 v2, s8, 4 |
| +; GCN-NEXT: v_writelane_b32 v2, s9, 5 |
| +; GCN-NEXT: v_writelane_b32 v2, s10, 6 |
| +; GCN-NEXT: v_writelane_b32 v2, s11, 7 |
| ; GCN-NEXT: s_mov_b32 s1, 0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_cmp_lg_u32 s0, s1 |
| ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 |
| ; GCN-NEXT: ; %bb.1: ; %bb0 |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| -; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: v_readlane_b32 s8, v2, 56 |
| -; GCN-NEXT: v_readlane_b32 s9, v2, 57 |
| -; GCN-NEXT: v_readlane_b32 s10, v2, 58 |
| -; GCN-NEXT: v_readlane_b32 s11, v2, 59 |
| -; GCN-NEXT: v_readlane_b32 s12, v2, 60 |
| -; GCN-NEXT: v_readlane_b32 s13, v2, 61 |
| -; GCN-NEXT: v_readlane_b32 s14, v2, 62 |
| -; GCN-NEXT: v_readlane_b32 s15, v2, 63 |
| -; GCN-NEXT: v_readlane_b32 s16, v2, 48 |
| -; GCN-NEXT: v_readlane_b32 s17, v2, 49 |
| -; GCN-NEXT: v_readlane_b32 s18, v2, 50 |
| -; GCN-NEXT: v_readlane_b32 s19, v2, 51 |
| -; GCN-NEXT: v_readlane_b32 s20, v2, 52 |
| -; GCN-NEXT: v_readlane_b32 s21, v2, 53 |
| -; GCN-NEXT: v_readlane_b32 s22, v2, 54 |
| -; GCN-NEXT: v_readlane_b32 s23, v2, 55 |
| -; GCN-NEXT: v_readlane_b32 s24, v2, 40 |
| -; GCN-NEXT: v_readlane_b32 s25, v2, 41 |
| -; GCN-NEXT: v_readlane_b32 s26, v2, 42 |
| -; GCN-NEXT: v_readlane_b32 s27, v2, 43 |
| -; GCN-NEXT: v_readlane_b32 s28, v2, 44 |
| -; GCN-NEXT: v_readlane_b32 s29, v2, 45 |
| -; GCN-NEXT: v_readlane_b32 s30, v2, 46 |
| -; GCN-NEXT: v_readlane_b32 s31, v2, 47 |
| -; GCN-NEXT: v_readlane_b32 s36, v2, 32 |
| -; GCN-NEXT: v_readlane_b32 s37, v2, 33 |
| -; GCN-NEXT: v_readlane_b32 s38, v2, 34 |
| -; GCN-NEXT: v_readlane_b32 s39, v2, 35 |
| -; GCN-NEXT: v_readlane_b32 s40, v2, 36 |
| -; GCN-NEXT: v_readlane_b32 s41, v2, 37 |
| -; GCN-NEXT: v_readlane_b32 s42, v2, 38 |
| -; GCN-NEXT: v_readlane_b32 s43, v2, 39 |
| -; GCN-NEXT: v_readlane_b32 s44, v2, 24 |
| -; GCN-NEXT: v_readlane_b32 s45, v2, 25 |
| -; GCN-NEXT: v_readlane_b32 s46, v2, 26 |
| -; GCN-NEXT: v_readlane_b32 s47, v2, 27 |
| -; GCN-NEXT: v_readlane_b32 s48, v2, 28 |
| -; GCN-NEXT: v_readlane_b32 s49, v2, 29 |
| -; GCN-NEXT: v_readlane_b32 s50, v2, 30 |
| -; GCN-NEXT: v_readlane_b32 s51, v2, 31 |
| -; GCN-NEXT: v_readlane_b32 s52, v2, 16 |
| -; GCN-NEXT: v_readlane_b32 s53, v2, 17 |
| -; GCN-NEXT: v_readlane_b32 s54, v2, 18 |
| -; GCN-NEXT: v_readlane_b32 s55, v2, 19 |
| -; GCN-NEXT: v_readlane_b32 s56, v2, 20 |
| -; GCN-NEXT: v_readlane_b32 s57, v2, 21 |
| -; GCN-NEXT: v_readlane_b32 s58, v2, 22 |
| -; GCN-NEXT: v_readlane_b32 s59, v2, 23 |
| -; GCN-NEXT: v_readlane_b32 s60, v2, 8 |
| -; GCN-NEXT: v_readlane_b32 s61, v2, 9 |
| -; GCN-NEXT: v_readlane_b32 s62, v2, 10 |
| -; GCN-NEXT: v_readlane_b32 s63, v2, 11 |
| -; GCN-NEXT: v_readlane_b32 s64, v2, 12 |
| -; GCN-NEXT: v_readlane_b32 s65, v2, 13 |
| -; GCN-NEXT: v_readlane_b32 s66, v2, 14 |
| -; GCN-NEXT: v_readlane_b32 s67, v2, 15 |
| -; GCN-NEXT: v_readlane_b32 s68, v2, 0 |
| -; GCN-NEXT: v_readlane_b32 s69, v2, 1 |
| -; GCN-NEXT: v_readlane_b32 s70, v2, 2 |
| -; GCN-NEXT: v_readlane_b32 s71, v2, 3 |
| -; GCN-NEXT: v_readlane_b32 s72, v2, 4 |
| -; GCN-NEXT: v_readlane_b32 s73, v2, 5 |
| -; GCN-NEXT: v_readlane_b32 s74, v2, 6 |
| -; GCN-NEXT: v_readlane_b32 s75, v2, 7 |
| -; GCN-NEXT: v_readlane_b32 s76, v1, 56 |
| -; GCN-NEXT: v_readlane_b32 s77, v1, 57 |
| -; GCN-NEXT: v_readlane_b32 s78, v1, 58 |
| -; GCN-NEXT: v_readlane_b32 s79, v1, 59 |
| -; GCN-NEXT: v_readlane_b32 s80, v1, 60 |
| -; GCN-NEXT: v_readlane_b32 s81, v1, 61 |
| -; GCN-NEXT: v_readlane_b32 s82, v1, 62 |
| -; GCN-NEXT: v_readlane_b32 s83, v1, 63 |
| -; GCN-NEXT: v_readlane_b32 s84, v1, 48 |
| -; GCN-NEXT: v_readlane_b32 s85, v1, 49 |
| -; GCN-NEXT: v_readlane_b32 s86, v1, 50 |
| -; GCN-NEXT: v_readlane_b32 s87, v1, 51 |
| -; GCN-NEXT: v_readlane_b32 s88, v1, 52 |
| -; GCN-NEXT: v_readlane_b32 s89, v1, 53 |
| -; GCN-NEXT: v_readlane_b32 s90, v1, 54 |
| -; GCN-NEXT: v_readlane_b32 s91, v1, 55 |
| -; GCN-NEXT: v_readlane_b32 s0, v1, 0 |
| -; GCN-NEXT: v_readlane_b32 s1, v1, 1 |
| -; GCN-NEXT: v_readlane_b32 s2, v1, 2 |
| -; GCN-NEXT: v_readlane_b32 s3, v1, 3 |
| -; GCN-NEXT: v_readlane_b32 s4, v1, 4 |
| -; GCN-NEXT: v_readlane_b32 s5, v1, 5 |
| -; GCN-NEXT: v_readlane_b32 s6, v1, 6 |
| -; GCN-NEXT: v_readlane_b32 s7, v1, 7 |
| +; GCN-NEXT: v_readlane_b32 s8, v1, 56 |
| +; GCN-NEXT: v_readlane_b32 s9, v1, 57 |
| +; GCN-NEXT: v_readlane_b32 s10, v1, 58 |
| +; GCN-NEXT: v_readlane_b32 s11, v1, 59 |
| +; GCN-NEXT: v_readlane_b32 s12, v1, 60 |
| +; GCN-NEXT: v_readlane_b32 s13, v1, 61 |
| +; GCN-NEXT: v_readlane_b32 s14, v1, 62 |
| +; GCN-NEXT: v_readlane_b32 s15, v1, 63 |
| +; GCN-NEXT: v_readlane_b32 s16, v1, 48 |
| +; GCN-NEXT: v_readlane_b32 s17, v1, 49 |
| +; GCN-NEXT: v_readlane_b32 s18, v1, 50 |
| +; GCN-NEXT: v_readlane_b32 s19, v1, 51 |
| +; GCN-NEXT: v_readlane_b32 s20, v1, 52 |
| +; GCN-NEXT: v_readlane_b32 s21, v1, 53 |
| +; GCN-NEXT: v_readlane_b32 s22, v1, 54 |
| +; GCN-NEXT: v_readlane_b32 s23, v1, 55 |
| +; GCN-NEXT: v_readlane_b32 s24, v1, 40 |
| +; GCN-NEXT: v_readlane_b32 s25, v1, 41 |
| +; GCN-NEXT: v_readlane_b32 s26, v1, 42 |
| +; GCN-NEXT: v_readlane_b32 s27, v1, 43 |
| +; GCN-NEXT: v_readlane_b32 s28, v1, 44 |
| +; GCN-NEXT: v_readlane_b32 s29, v1, 45 |
| +; GCN-NEXT: v_readlane_b32 s30, v1, 46 |
| +; GCN-NEXT: v_readlane_b32 s31, v1, 47 |
| +; GCN-NEXT: v_readlane_b32 s36, v1, 32 |
| +; GCN-NEXT: v_readlane_b32 s37, v1, 33 |
| +; GCN-NEXT: v_readlane_b32 s38, v1, 34 |
| +; GCN-NEXT: v_readlane_b32 s39, v1, 35 |
| +; GCN-NEXT: v_readlane_b32 s40, v1, 36 |
| +; GCN-NEXT: v_readlane_b32 s41, v1, 37 |
| +; GCN-NEXT: v_readlane_b32 s42, v1, 38 |
| +; GCN-NEXT: v_readlane_b32 s43, v1, 39 |
| +; GCN-NEXT: v_readlane_b32 s44, v1, 24 |
| +; GCN-NEXT: v_readlane_b32 s45, v1, 25 |
| +; GCN-NEXT: v_readlane_b32 s46, v1, 26 |
| +; GCN-NEXT: v_readlane_b32 s47, v1, 27 |
| +; GCN-NEXT: v_readlane_b32 s48, v1, 28 |
| +; GCN-NEXT: v_readlane_b32 s49, v1, 29 |
| +; GCN-NEXT: v_readlane_b32 s50, v1, 30 |
| +; GCN-NEXT: v_readlane_b32 s51, v1, 31 |
| +; GCN-NEXT: v_readlane_b32 s52, v1, 16 |
| +; GCN-NEXT: v_readlane_b32 s53, v1, 17 |
| +; GCN-NEXT: v_readlane_b32 s54, v1, 18 |
| +; GCN-NEXT: v_readlane_b32 s55, v1, 19 |
| +; GCN-NEXT: v_readlane_b32 s56, v1, 20 |
| +; GCN-NEXT: v_readlane_b32 s57, v1, 21 |
| +; GCN-NEXT: v_readlane_b32 s58, v1, 22 |
| +; GCN-NEXT: v_readlane_b32 s59, v1, 23 |
| +; GCN-NEXT: v_readlane_b32 s60, v1, 8 |
| +; GCN-NEXT: v_readlane_b32 s61, v1, 9 |
| +; GCN-NEXT: v_readlane_b32 s62, v1, 10 |
| +; GCN-NEXT: v_readlane_b32 s63, v1, 11 |
| +; GCN-NEXT: v_readlane_b32 s64, v1, 12 |
| +; GCN-NEXT: v_readlane_b32 s65, v1, 13 |
| +; GCN-NEXT: v_readlane_b32 s66, v1, 14 |
| +; GCN-NEXT: v_readlane_b32 s67, v1, 15 |
| +; GCN-NEXT: v_readlane_b32 s68, v1, 0 |
| +; GCN-NEXT: v_readlane_b32 s69, v1, 1 |
| +; GCN-NEXT: v_readlane_b32 s70, v1, 2 |
| +; GCN-NEXT: v_readlane_b32 s71, v1, 3 |
| +; GCN-NEXT: v_readlane_b32 s72, v1, 4 |
| +; GCN-NEXT: v_readlane_b32 s73, v1, 5 |
| +; GCN-NEXT: v_readlane_b32 s74, v1, 6 |
| +; GCN-NEXT: v_readlane_b32 s75, v1, 7 |
| +; GCN-NEXT: v_readlane_b32 s76, v0, 56 |
| +; GCN-NEXT: v_readlane_b32 s77, v0, 57 |
| +; GCN-NEXT: v_readlane_b32 s78, v0, 58 |
| +; GCN-NEXT: v_readlane_b32 s79, v0, 59 |
| +; GCN-NEXT: v_readlane_b32 s80, v0, 60 |
| +; GCN-NEXT: v_readlane_b32 s81, v0, 61 |
| +; GCN-NEXT: v_readlane_b32 s82, v0, 62 |
| +; GCN-NEXT: v_readlane_b32 s83, v0, 63 |
| +; GCN-NEXT: v_readlane_b32 s84, v0, 48 |
| +; GCN-NEXT: v_readlane_b32 s85, v0, 49 |
| +; GCN-NEXT: v_readlane_b32 s86, v0, 50 |
| +; GCN-NEXT: v_readlane_b32 s87, v0, 51 |
| +; GCN-NEXT: v_readlane_b32 s88, v0, 52 |
| +; GCN-NEXT: v_readlane_b32 s89, v0, 53 |
| +; GCN-NEXT: v_readlane_b32 s90, v0, 54 |
| +; GCN-NEXT: v_readlane_b32 s91, v0, 55 |
| +; GCN-NEXT: v_readlane_b32 s0, v0, 0 |
| +; GCN-NEXT: v_readlane_b32 s1, v0, 1 |
| +; GCN-NEXT: v_readlane_b32 s2, v0, 2 |
| +; GCN-NEXT: v_readlane_b32 s3, v0, 3 |
| +; GCN-NEXT: v_readlane_b32 s4, v0, 4 |
| +; GCN-NEXT: v_readlane_b32 s5, v0, 5 |
| +; GCN-NEXT: v_readlane_b32 s6, v0, 6 |
| +; GCN-NEXT: v_readlane_b32 s7, v0, 7 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[0:7] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_readlane_b32 s0, v1, 8 |
| -; GCN-NEXT: v_readlane_b32 s1, v1, 9 |
| -; GCN-NEXT: v_readlane_b32 s2, v1, 10 |
| -; GCN-NEXT: v_readlane_b32 s3, v1, 11 |
| -; GCN-NEXT: v_readlane_b32 s4, v1, 12 |
| -; GCN-NEXT: v_readlane_b32 s5, v1, 13 |
| -; GCN-NEXT: v_readlane_b32 s6, v1, 14 |
| -; GCN-NEXT: v_readlane_b32 s7, v1, 15 |
| +; GCN-NEXT: v_readlane_b32 s0, v0, 8 |
| +; GCN-NEXT: v_readlane_b32 s1, v0, 9 |
| +; GCN-NEXT: v_readlane_b32 s2, v0, 10 |
| +; GCN-NEXT: v_readlane_b32 s3, v0, 11 |
| +; GCN-NEXT: v_readlane_b32 s4, v0, 12 |
| +; GCN-NEXT: v_readlane_b32 s5, v0, 13 |
| +; GCN-NEXT: v_readlane_b32 s6, v0, 14 |
| +; GCN-NEXT: v_readlane_b32 s7, v0, 15 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[0:7] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_readlane_b32 s0, v1, 16 |
| -; GCN-NEXT: v_readlane_b32 s1, v1, 17 |
| -; GCN-NEXT: v_readlane_b32 s2, v1, 18 |
| -; GCN-NEXT: v_readlane_b32 s3, v1, 19 |
| -; GCN-NEXT: v_readlane_b32 s4, v1, 20 |
| -; GCN-NEXT: v_readlane_b32 s5, v1, 21 |
| -; GCN-NEXT: v_readlane_b32 s6, v1, 22 |
| -; GCN-NEXT: v_readlane_b32 s7, v1, 23 |
| +; GCN-NEXT: v_readlane_b32 s0, v0, 16 |
| +; GCN-NEXT: v_readlane_b32 s1, v0, 17 |
| +; GCN-NEXT: v_readlane_b32 s2, v0, 18 |
| +; GCN-NEXT: v_readlane_b32 s3, v0, 19 |
| +; GCN-NEXT: v_readlane_b32 s4, v0, 20 |
| +; GCN-NEXT: v_readlane_b32 s5, v0, 21 |
| +; GCN-NEXT: v_readlane_b32 s6, v0, 22 |
| +; GCN-NEXT: v_readlane_b32 s7, v0, 23 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[0:7] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_readlane_b32 s0, v1, 24 |
| -; GCN-NEXT: v_readlane_b32 s1, v1, 25 |
| -; GCN-NEXT: v_readlane_b32 s2, v1, 26 |
| -; GCN-NEXT: v_readlane_b32 s3, v1, 27 |
| -; GCN-NEXT: v_readlane_b32 s4, v1, 28 |
| -; GCN-NEXT: v_readlane_b32 s5, v1, 29 |
| -; GCN-NEXT: v_readlane_b32 s6, v1, 30 |
| -; GCN-NEXT: v_readlane_b32 s7, v1, 31 |
| +; GCN-NEXT: v_readlane_b32 s0, v0, 24 |
| +; GCN-NEXT: v_readlane_b32 s1, v0, 25 |
| +; GCN-NEXT: v_readlane_b32 s2, v0, 26 |
| +; GCN-NEXT: v_readlane_b32 s3, v0, 27 |
| +; GCN-NEXT: v_readlane_b32 s4, v0, 28 |
| +; GCN-NEXT: v_readlane_b32 s5, v0, 29 |
| +; GCN-NEXT: v_readlane_b32 s6, v0, 30 |
| +; GCN-NEXT: v_readlane_b32 s7, v0, 31 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[0:7] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_readlane_b32 s0, v1, 32 |
| -; GCN-NEXT: v_readlane_b32 s1, v1, 33 |
| -; GCN-NEXT: v_readlane_b32 s2, v1, 34 |
| -; GCN-NEXT: v_readlane_b32 s3, v1, 35 |
| -; GCN-NEXT: v_readlane_b32 s4, v1, 36 |
| -; GCN-NEXT: v_readlane_b32 s5, v1, 37 |
| -; GCN-NEXT: v_readlane_b32 s6, v1, 38 |
| -; GCN-NEXT: v_readlane_b32 s7, v1, 39 |
| +; GCN-NEXT: v_readlane_b32 s0, v0, 32 |
| +; GCN-NEXT: v_readlane_b32 s1, v0, 33 |
| +; GCN-NEXT: v_readlane_b32 s2, v0, 34 |
| +; GCN-NEXT: v_readlane_b32 s3, v0, 35 |
| +; GCN-NEXT: v_readlane_b32 s4, v0, 36 |
| +; GCN-NEXT: v_readlane_b32 s5, v0, 37 |
| +; GCN-NEXT: v_readlane_b32 s6, v0, 38 |
| +; GCN-NEXT: v_readlane_b32 s7, v0, 39 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[0:7] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_readlane_b32 s0, v1, 40 |
| -; GCN-NEXT: v_readlane_b32 s1, v1, 41 |
| -; GCN-NEXT: v_readlane_b32 s2, v1, 42 |
| -; GCN-NEXT: v_readlane_b32 s3, v1, 43 |
| -; GCN-NEXT: v_readlane_b32 s4, v1, 44 |
| -; GCN-NEXT: v_readlane_b32 s5, v1, 45 |
| -; GCN-NEXT: v_readlane_b32 s6, v1, 46 |
| -; GCN-NEXT: v_readlane_b32 s7, v1, 47 |
| +; GCN-NEXT: v_readlane_b32 s0, v0, 40 |
| +; GCN-NEXT: v_readlane_b32 s1, v0, 41 |
| +; GCN-NEXT: v_readlane_b32 s2, v0, 42 |
| +; GCN-NEXT: v_readlane_b32 s3, v0, 43 |
| +; GCN-NEXT: v_readlane_b32 s4, v0, 44 |
| +; GCN-NEXT: v_readlane_b32 s5, v0, 45 |
| +; GCN-NEXT: v_readlane_b32 s6, v0, 46 |
| +; GCN-NEXT: v_readlane_b32 s7, v0, 47 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[0:7] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_readlane_b32 s0, v0, 0 |
| -; GCN-NEXT: v_readlane_b32 s1, v0, 1 |
| -; GCN-NEXT: v_readlane_b32 s2, v0, 2 |
| -; GCN-NEXT: v_readlane_b32 s3, v0, 3 |
| -; GCN-NEXT: v_readlane_b32 s4, v0, 4 |
| -; GCN-NEXT: v_readlane_b32 s5, v0, 5 |
| -; GCN-NEXT: v_readlane_b32 s6, v0, 6 |
| -; GCN-NEXT: v_readlane_b32 s7, v0, 7 |
| +; GCN-NEXT: v_readlane_b32 s0, v2, 0 |
| +; GCN-NEXT: v_readlane_b32 s1, v2, 1 |
| +; GCN-NEXT: v_readlane_b32 s2, v2, 2 |
| +; GCN-NEXT: v_readlane_b32 s3, v2, 3 |
| +; GCN-NEXT: v_readlane_b32 s4, v2, 4 |
| +; GCN-NEXT: v_readlane_b32 s5, v2, 5 |
| +; GCN-NEXT: v_readlane_b32 s6, v2, 6 |
| +; GCN-NEXT: v_readlane_b32 s7, v2, 7 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[84:91] |
| ; GCN-NEXT: ;;#ASMEND |
| @@ -470,17 +442,10 @@ ret: |
| define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %in) #1 { |
| ; GCN-LABEL: split_sgpr_spill_2_vgprs: |
| ; GCN: ; %bb.0: |
| -; GCN-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 |
| -; GCN-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 |
| -; GCN-NEXT: s_mov_b32 s54, -1 |
| -; GCN-NEXT: s_mov_b32 s55, 0xe8f000 |
| -; GCN-NEXT: s_add_u32 s52, s52, s3 |
| -; GCN-NEXT: s_addc_u32 s53, s53, 0 |
| ; GCN-NEXT: s_load_dword s0, s[0:1], 0xb |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:19] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| ; GCN-NEXT: v_writelane_b32 v0, s4, 0 |
| ; GCN-NEXT: v_writelane_b32 v0, s5, 1 |
| ; GCN-NEXT: v_writelane_b32 v0, s6, 2 |
| @@ -554,41 +519,27 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 % |
| ; GCN-NEXT: v_writelane_b32 v0, s17, 61 |
| ; GCN-NEXT: v_writelane_b32 v0, s18, 62 |
| ; GCN-NEXT: v_writelane_b32 v0, s19, 63 |
| -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[28:29] |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:11] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 0 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 1 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 2 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 3 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 4 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 5 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 6 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 7 |
| +; GCN-NEXT: v_writelane_b32 v1, s4, 0 |
| +; GCN-NEXT: v_writelane_b32 v1, s5, 1 |
| +; GCN-NEXT: v_writelane_b32 v1, s6, 2 |
| +; GCN-NEXT: v_writelane_b32 v1, s7, 3 |
| +; GCN-NEXT: v_writelane_b32 v1, s8, 4 |
| +; GCN-NEXT: v_writelane_b32 v1, s9, 5 |
| +; GCN-NEXT: v_writelane_b32 v1, s10, 6 |
| +; GCN-NEXT: v_writelane_b32 v1, s11, 7 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[2:3] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s2, 8 |
| -; GCN-NEXT: v_writelane_b32 v0, s3, 9 |
| -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[28:29] |
| +; GCN-NEXT: v_writelane_b32 v1, s2, 8 |
| +; GCN-NEXT: v_writelane_b32 v1, s3, 9 |
| ; GCN-NEXT: s_mov_b32 s1, 0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_cmp_lg_u32 s0, s1 |
| ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 |
| ; GCN-NEXT: ; %bb.1: ; %bb0 |
| -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[28:29] |
| -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 |
| -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[28:29] |
| -; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_readlane_b32 s16, v1, 8 |
| ; GCN-NEXT: v_readlane_b32 s17, v1, 9 |
| ; GCN-NEXT: v_readlane_b32 s20, v1, 0 |
| @@ -734,176 +685,176 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 % |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:19] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 0 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 1 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 2 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 3 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 4 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 5 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 6 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 7 |
| -; GCN-NEXT: v_writelane_b32 v0, s12, 8 |
| -; GCN-NEXT: v_writelane_b32 v0, s13, 9 |
| -; GCN-NEXT: v_writelane_b32 v0, s14, 10 |
| -; GCN-NEXT: v_writelane_b32 v0, s15, 11 |
| -; GCN-NEXT: v_writelane_b32 v0, s16, 12 |
| -; GCN-NEXT: v_writelane_b32 v0, s17, 13 |
| -; GCN-NEXT: v_writelane_b32 v0, s18, 14 |
| -; GCN-NEXT: v_writelane_b32 v0, s19, 15 |
| +; GCN-NEXT: v_writelane_b32 v31, s4, 0 |
| +; GCN-NEXT: v_writelane_b32 v31, s5, 1 |
| +; GCN-NEXT: v_writelane_b32 v31, s6, 2 |
| +; GCN-NEXT: v_writelane_b32 v31, s7, 3 |
| +; GCN-NEXT: v_writelane_b32 v31, s8, 4 |
| +; GCN-NEXT: v_writelane_b32 v31, s9, 5 |
| +; GCN-NEXT: v_writelane_b32 v31, s10, 6 |
| +; GCN-NEXT: v_writelane_b32 v31, s11, 7 |
| +; GCN-NEXT: v_writelane_b32 v31, s12, 8 |
| +; GCN-NEXT: v_writelane_b32 v31, s13, 9 |
| +; GCN-NEXT: v_writelane_b32 v31, s14, 10 |
| +; GCN-NEXT: v_writelane_b32 v31, s15, 11 |
| +; GCN-NEXT: v_writelane_b32 v31, s16, 12 |
| +; GCN-NEXT: v_writelane_b32 v31, s17, 13 |
| +; GCN-NEXT: v_writelane_b32 v31, s18, 14 |
| +; GCN-NEXT: v_writelane_b32 v31, s19, 15 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:19] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 16 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 17 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 18 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 19 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 20 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 21 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 22 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 23 |
| -; GCN-NEXT: v_writelane_b32 v0, s12, 24 |
| -; GCN-NEXT: v_writelane_b32 v0, s13, 25 |
| -; GCN-NEXT: v_writelane_b32 v0, s14, 26 |
| -; GCN-NEXT: v_writelane_b32 v0, s15, 27 |
| -; GCN-NEXT: v_writelane_b32 v0, s16, 28 |
| -; GCN-NEXT: v_writelane_b32 v0, s17, 29 |
| -; GCN-NEXT: v_writelane_b32 v0, s18, 30 |
| -; GCN-NEXT: v_writelane_b32 v0, s19, 31 |
| +; GCN-NEXT: v_writelane_b32 v31, s4, 16 |
| +; GCN-NEXT: v_writelane_b32 v31, s5, 17 |
| +; GCN-NEXT: v_writelane_b32 v31, s6, 18 |
| +; GCN-NEXT: v_writelane_b32 v31, s7, 19 |
| +; GCN-NEXT: v_writelane_b32 v31, s8, 20 |
| +; GCN-NEXT: v_writelane_b32 v31, s9, 21 |
| +; GCN-NEXT: v_writelane_b32 v31, s10, 22 |
| +; GCN-NEXT: v_writelane_b32 v31, s11, 23 |
| +; GCN-NEXT: v_writelane_b32 v31, s12, 24 |
| +; GCN-NEXT: v_writelane_b32 v31, s13, 25 |
| +; GCN-NEXT: v_writelane_b32 v31, s14, 26 |
| +; GCN-NEXT: v_writelane_b32 v31, s15, 27 |
| +; GCN-NEXT: v_writelane_b32 v31, s16, 28 |
| +; GCN-NEXT: v_writelane_b32 v31, s17, 29 |
| +; GCN-NEXT: v_writelane_b32 v31, s18, 30 |
| +; GCN-NEXT: v_writelane_b32 v31, s19, 31 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:19] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 32 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 33 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 34 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 35 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 36 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 37 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 38 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 39 |
| -; GCN-NEXT: v_writelane_b32 v0, s12, 40 |
| -; GCN-NEXT: v_writelane_b32 v0, s13, 41 |
| -; GCN-NEXT: v_writelane_b32 v0, s14, 42 |
| -; GCN-NEXT: v_writelane_b32 v0, s15, 43 |
| -; GCN-NEXT: v_writelane_b32 v0, s16, 44 |
| -; GCN-NEXT: v_writelane_b32 v0, s17, 45 |
| -; GCN-NEXT: v_writelane_b32 v0, s18, 46 |
| -; GCN-NEXT: v_writelane_b32 v0, s19, 47 |
| +; GCN-NEXT: v_writelane_b32 v31, s4, 32 |
| +; GCN-NEXT: v_writelane_b32 v31, s5, 33 |
| +; GCN-NEXT: v_writelane_b32 v31, s6, 34 |
| +; GCN-NEXT: v_writelane_b32 v31, s7, 35 |
| +; GCN-NEXT: v_writelane_b32 v31, s8, 36 |
| +; GCN-NEXT: v_writelane_b32 v31, s9, 37 |
| +; GCN-NEXT: v_writelane_b32 v31, s10, 38 |
| +; GCN-NEXT: v_writelane_b32 v31, s11, 39 |
| +; GCN-NEXT: v_writelane_b32 v31, s12, 40 |
| +; GCN-NEXT: v_writelane_b32 v31, s13, 41 |
| +; GCN-NEXT: v_writelane_b32 v31, s14, 42 |
| +; GCN-NEXT: v_writelane_b32 v31, s15, 43 |
| +; GCN-NEXT: v_writelane_b32 v31, s16, 44 |
| +; GCN-NEXT: v_writelane_b32 v31, s17, 45 |
| +; GCN-NEXT: v_writelane_b32 v31, s18, 46 |
| +; GCN-NEXT: v_writelane_b32 v31, s19, 47 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:19] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 48 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 49 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 50 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 51 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 52 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 53 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 54 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 55 |
| -; GCN-NEXT: v_writelane_b32 v0, s12, 56 |
| -; GCN-NEXT: v_writelane_b32 v0, s13, 57 |
| -; GCN-NEXT: v_writelane_b32 v0, s14, 58 |
| -; GCN-NEXT: v_writelane_b32 v0, s15, 59 |
| -; GCN-NEXT: v_writelane_b32 v0, s16, 60 |
| -; GCN-NEXT: v_writelane_b32 v0, s17, 61 |
| -; GCN-NEXT: v_writelane_b32 v0, s18, 62 |
| -; GCN-NEXT: v_writelane_b32 v0, s19, 63 |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| +; GCN-NEXT: v_writelane_b32 v31, s4, 48 |
| +; GCN-NEXT: v_writelane_b32 v31, s5, 49 |
| +; GCN-NEXT: v_writelane_b32 v31, s6, 50 |
| +; GCN-NEXT: v_writelane_b32 v31, s7, 51 |
| +; GCN-NEXT: v_writelane_b32 v31, s8, 52 |
| +; GCN-NEXT: v_writelane_b32 v31, s9, 53 |
| +; GCN-NEXT: v_writelane_b32 v31, s10, 54 |
| +; GCN-NEXT: v_writelane_b32 v31, s11, 55 |
| +; GCN-NEXT: v_writelane_b32 v31, s12, 56 |
| +; GCN-NEXT: v_writelane_b32 v31, s13, 57 |
| +; GCN-NEXT: v_writelane_b32 v31, s14, 58 |
| +; GCN-NEXT: v_writelane_b32 v31, s15, 59 |
| +; GCN-NEXT: v_writelane_b32 v31, s16, 60 |
| +; GCN-NEXT: v_writelane_b32 v31, s17, 61 |
| +; GCN-NEXT: v_writelane_b32 v31, s18, 62 |
| +; GCN-NEXT: v_writelane_b32 v31, s19, 63 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[2:3] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| +; GCN-NEXT: s_mov_b64 s[4:5], exec |
| +; GCN-NEXT: s_mov_b64 exec, 3 |
| +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 |
| ; GCN-NEXT: v_writelane_b32 v0, s2, 0 |
| ; GCN-NEXT: v_writelane_b32 v0, s3, 1 |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| +; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_mov_b32 s1, 0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_cmp_lg_u32 s0, s1 |
| ; GCN-NEXT: s_cbranch_scc1 .LBB2_2 |
| ; GCN-NEXT: ; %bb.1: ; %bb0 |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| -; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: v_readlane_b32 s36, v1, 32 |
| -; GCN-NEXT: v_readlane_b32 s37, v1, 33 |
| -; GCN-NEXT: v_readlane_b32 s38, v1, 34 |
| -; GCN-NEXT: v_readlane_b32 s39, v1, 35 |
| -; GCN-NEXT: v_readlane_b32 s40, v1, 36 |
| -; GCN-NEXT: v_readlane_b32 s41, v1, 37 |
| -; GCN-NEXT: v_readlane_b32 s42, v1, 38 |
| -; GCN-NEXT: v_readlane_b32 s43, v1, 39 |
| -; GCN-NEXT: v_readlane_b32 s44, v1, 40 |
| -; GCN-NEXT: v_readlane_b32 s45, v1, 41 |
| -; GCN-NEXT: v_readlane_b32 s46, v1, 42 |
| -; GCN-NEXT: v_readlane_b32 s47, v1, 43 |
| -; GCN-NEXT: v_readlane_b32 s48, v1, 44 |
| -; GCN-NEXT: v_readlane_b32 s49, v1, 45 |
| -; GCN-NEXT: v_readlane_b32 s50, v1, 46 |
| -; GCN-NEXT: v_readlane_b32 s51, v1, 47 |
| -; GCN-NEXT: v_readlane_b32 s0, v1, 16 |
| -; GCN-NEXT: v_readlane_b32 s1, v1, 17 |
| -; GCN-NEXT: v_readlane_b32 s2, v1, 18 |
| -; GCN-NEXT: v_readlane_b32 s3, v1, 19 |
| -; GCN-NEXT: v_readlane_b32 s4, v1, 20 |
| -; GCN-NEXT: v_readlane_b32 s5, v1, 21 |
| -; GCN-NEXT: v_readlane_b32 s6, v1, 22 |
| -; GCN-NEXT: v_readlane_b32 s7, v1, 23 |
| -; GCN-NEXT: v_readlane_b32 s8, v1, 24 |
| -; GCN-NEXT: v_readlane_b32 s9, v1, 25 |
| -; GCN-NEXT: v_readlane_b32 s10, v1, 26 |
| -; GCN-NEXT: v_readlane_b32 s11, v1, 27 |
| -; GCN-NEXT: v_readlane_b32 s12, v1, 28 |
| -; GCN-NEXT: v_readlane_b32 s13, v1, 29 |
| -; GCN-NEXT: v_readlane_b32 s14, v1, 30 |
| -; GCN-NEXT: v_readlane_b32 s15, v1, 31 |
| -; GCN-NEXT: v_readlane_b32 s16, v1, 0 |
| -; GCN-NEXT: v_readlane_b32 s17, v1, 1 |
| -; GCN-NEXT: v_readlane_b32 s18, v1, 2 |
| -; GCN-NEXT: v_readlane_b32 s19, v1, 3 |
| -; GCN-NEXT: v_readlane_b32 s20, v1, 4 |
| -; GCN-NEXT: v_readlane_b32 s21, v1, 5 |
| -; GCN-NEXT: v_readlane_b32 s22, v1, 6 |
| -; GCN-NEXT: v_readlane_b32 s23, v1, 7 |
| -; GCN-NEXT: v_readlane_b32 s24, v1, 8 |
| -; GCN-NEXT: v_readlane_b32 s25, v1, 9 |
| -; GCN-NEXT: v_readlane_b32 s26, v1, 10 |
| -; GCN-NEXT: v_readlane_b32 s27, v1, 11 |
| -; GCN-NEXT: v_readlane_b32 s28, v1, 12 |
| -; GCN-NEXT: v_readlane_b32 s29, v1, 13 |
| -; GCN-NEXT: v_readlane_b32 s30, v1, 14 |
| -; GCN-NEXT: v_readlane_b32 s31, v1, 15 |
| +; GCN-NEXT: v_readlane_b32 s36, v31, 32 |
| +; GCN-NEXT: v_readlane_b32 s37, v31, 33 |
| +; GCN-NEXT: v_readlane_b32 s38, v31, 34 |
| +; GCN-NEXT: v_readlane_b32 s39, v31, 35 |
| +; GCN-NEXT: v_readlane_b32 s40, v31, 36 |
| +; GCN-NEXT: v_readlane_b32 s41, v31, 37 |
| +; GCN-NEXT: v_readlane_b32 s42, v31, 38 |
| +; GCN-NEXT: v_readlane_b32 s43, v31, 39 |
| +; GCN-NEXT: v_readlane_b32 s44, v31, 40 |
| +; GCN-NEXT: v_readlane_b32 s45, v31, 41 |
| +; GCN-NEXT: v_readlane_b32 s46, v31, 42 |
| +; GCN-NEXT: v_readlane_b32 s47, v31, 43 |
| +; GCN-NEXT: v_readlane_b32 s48, v31, 44 |
| +; GCN-NEXT: v_readlane_b32 s49, v31, 45 |
| +; GCN-NEXT: v_readlane_b32 s50, v31, 46 |
| +; GCN-NEXT: v_readlane_b32 s51, v31, 47 |
| +; GCN-NEXT: v_readlane_b32 s0, v31, 16 |
| +; GCN-NEXT: v_readlane_b32 s1, v31, 17 |
| +; GCN-NEXT: v_readlane_b32 s2, v31, 18 |
| +; GCN-NEXT: v_readlane_b32 s3, v31, 19 |
| +; GCN-NEXT: v_readlane_b32 s4, v31, 20 |
| +; GCN-NEXT: v_readlane_b32 s5, v31, 21 |
| +; GCN-NEXT: v_readlane_b32 s6, v31, 22 |
| +; GCN-NEXT: v_readlane_b32 s7, v31, 23 |
| +; GCN-NEXT: v_readlane_b32 s8, v31, 24 |
| +; GCN-NEXT: v_readlane_b32 s9, v31, 25 |
| +; GCN-NEXT: v_readlane_b32 s10, v31, 26 |
| +; GCN-NEXT: v_readlane_b32 s11, v31, 27 |
| +; GCN-NEXT: v_readlane_b32 s12, v31, 28 |
| +; GCN-NEXT: v_readlane_b32 s13, v31, 29 |
| +; GCN-NEXT: v_readlane_b32 s14, v31, 30 |
| +; GCN-NEXT: v_readlane_b32 s15, v31, 31 |
| +; GCN-NEXT: v_readlane_b32 s16, v31, 0 |
| +; GCN-NEXT: v_readlane_b32 s17, v31, 1 |
| +; GCN-NEXT: v_readlane_b32 s18, v31, 2 |
| +; GCN-NEXT: v_readlane_b32 s19, v31, 3 |
| +; GCN-NEXT: v_readlane_b32 s20, v31, 4 |
| +; GCN-NEXT: v_readlane_b32 s21, v31, 5 |
| +; GCN-NEXT: v_readlane_b32 s22, v31, 6 |
| +; GCN-NEXT: v_readlane_b32 s23, v31, 7 |
| +; GCN-NEXT: v_readlane_b32 s24, v31, 8 |
| +; GCN-NEXT: v_readlane_b32 s25, v31, 9 |
| +; GCN-NEXT: v_readlane_b32 s26, v31, 10 |
| +; GCN-NEXT: v_readlane_b32 s27, v31, 11 |
| +; GCN-NEXT: v_readlane_b32 s28, v31, 12 |
| +; GCN-NEXT: v_readlane_b32 s29, v31, 13 |
| +; GCN-NEXT: v_readlane_b32 s30, v31, 14 |
| +; GCN-NEXT: v_readlane_b32 s31, v31, 15 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[16:31] |
| ; GCN-NEXT: ;;#ASMEND |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[0:15] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_readlane_b32 s4, v1, 48 |
| -; GCN-NEXT: v_readlane_b32 s5, v1, 49 |
| -; GCN-NEXT: v_readlane_b32 s6, v1, 50 |
| -; GCN-NEXT: v_readlane_b32 s7, v1, 51 |
| -; GCN-NEXT: v_readlane_b32 s8, v1, 52 |
| -; GCN-NEXT: v_readlane_b32 s9, v1, 53 |
| -; GCN-NEXT: v_readlane_b32 s10, v1, 54 |
| -; GCN-NEXT: v_readlane_b32 s11, v1, 55 |
| -; GCN-NEXT: v_readlane_b32 s12, v1, 56 |
| -; GCN-NEXT: v_readlane_b32 s13, v1, 57 |
| -; GCN-NEXT: v_readlane_b32 s14, v1, 58 |
| -; GCN-NEXT: v_readlane_b32 s15, v1, 59 |
| -; GCN-NEXT: v_readlane_b32 s16, v1, 60 |
| -; GCN-NEXT: v_readlane_b32 s17, v1, 61 |
| -; GCN-NEXT: v_readlane_b32 s18, v1, 62 |
| -; GCN-NEXT: v_readlane_b32 s19, v1, 63 |
| +; GCN-NEXT: v_readlane_b32 s4, v31, 48 |
| +; GCN-NEXT: v_readlane_b32 s5, v31, 49 |
| +; GCN-NEXT: v_readlane_b32 s6, v31, 50 |
| +; GCN-NEXT: v_readlane_b32 s7, v31, 51 |
| +; GCN-NEXT: v_readlane_b32 s8, v31, 52 |
| +; GCN-NEXT: v_readlane_b32 s9, v31, 53 |
| +; GCN-NEXT: v_readlane_b32 s10, v31, 54 |
| +; GCN-NEXT: v_readlane_b32 s11, v31, 55 |
| +; GCN-NEXT: v_readlane_b32 s12, v31, 56 |
| +; GCN-NEXT: v_readlane_b32 s13, v31, 57 |
| +; GCN-NEXT: v_readlane_b32 s14, v31, 58 |
| +; GCN-NEXT: v_readlane_b32 s15, v31, 59 |
| +; GCN-NEXT: v_readlane_b32 s16, v31, 60 |
| +; GCN-NEXT: v_readlane_b32 s17, v31, 61 |
| +; GCN-NEXT: v_readlane_b32 s18, v31, 62 |
| +; GCN-NEXT: v_readlane_b32 s19, v31, 63 |
| +; GCN-NEXT: s_mov_b64 s[2:3], exec |
| +; GCN-NEXT: s_mov_b64 exec, 3 |
| +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 |
| +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_readlane_b32 s0, v0, 0 |
| ; GCN-NEXT: v_readlane_b32 s1, v0, 1 |
| +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| +; GCN-NEXT: s_mov_b64 exec, s[2:3] |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[36:51] |
| ; GCN-NEXT: ;;#ASMEND |
| @@ -969,152 +920,144 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:19] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 0 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 1 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 2 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 3 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 4 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 5 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 6 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 7 |
| -; GCN-NEXT: v_writelane_b32 v0, s12, 8 |
| -; GCN-NEXT: v_writelane_b32 v0, s13, 9 |
| -; GCN-NEXT: v_writelane_b32 v0, s14, 10 |
| -; GCN-NEXT: v_writelane_b32 v0, s15, 11 |
| -; GCN-NEXT: v_writelane_b32 v0, s16, 12 |
| -; GCN-NEXT: v_writelane_b32 v0, s17, 13 |
| -; GCN-NEXT: v_writelane_b32 v0, s18, 14 |
| -; GCN-NEXT: v_writelane_b32 v0, s19, 15 |
| +; GCN-NEXT: v_writelane_b32 v31, s4, 0 |
| +; GCN-NEXT: v_writelane_b32 v31, s5, 1 |
| +; GCN-NEXT: v_writelane_b32 v31, s6, 2 |
| +; GCN-NEXT: v_writelane_b32 v31, s7, 3 |
| +; GCN-NEXT: v_writelane_b32 v31, s8, 4 |
| +; GCN-NEXT: v_writelane_b32 v31, s9, 5 |
| +; GCN-NEXT: v_writelane_b32 v31, s10, 6 |
| +; GCN-NEXT: v_writelane_b32 v31, s11, 7 |
| +; GCN-NEXT: v_writelane_b32 v31, s12, 8 |
| +; GCN-NEXT: v_writelane_b32 v31, s13, 9 |
| +; GCN-NEXT: v_writelane_b32 v31, s14, 10 |
| +; GCN-NEXT: v_writelane_b32 v31, s15, 11 |
| +; GCN-NEXT: v_writelane_b32 v31, s16, 12 |
| +; GCN-NEXT: v_writelane_b32 v31, s17, 13 |
| +; GCN-NEXT: v_writelane_b32 v31, s18, 14 |
| +; GCN-NEXT: v_writelane_b32 v31, s19, 15 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:19] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 16 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 17 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 18 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 19 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 20 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 21 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 22 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 23 |
| -; GCN-NEXT: v_writelane_b32 v0, s12, 24 |
| -; GCN-NEXT: v_writelane_b32 v0, s13, 25 |
| -; GCN-NEXT: v_writelane_b32 v0, s14, 26 |
| -; GCN-NEXT: v_writelane_b32 v0, s15, 27 |
| -; GCN-NEXT: v_writelane_b32 v0, s16, 28 |
| -; GCN-NEXT: v_writelane_b32 v0, s17, 29 |
| -; GCN-NEXT: v_writelane_b32 v0, s18, 30 |
| -; GCN-NEXT: v_writelane_b32 v0, s19, 31 |
| +; GCN-NEXT: v_writelane_b32 v31, s4, 16 |
| +; GCN-NEXT: v_writelane_b32 v31, s5, 17 |
| +; GCN-NEXT: v_writelane_b32 v31, s6, 18 |
| +; GCN-NEXT: v_writelane_b32 v31, s7, 19 |
| +; GCN-NEXT: v_writelane_b32 v31, s8, 20 |
| +; GCN-NEXT: v_writelane_b32 v31, s9, 21 |
| +; GCN-NEXT: v_writelane_b32 v31, s10, 22 |
| +; GCN-NEXT: v_writelane_b32 v31, s11, 23 |
| +; GCN-NEXT: v_writelane_b32 v31, s12, 24 |
| +; GCN-NEXT: v_writelane_b32 v31, s13, 25 |
| +; GCN-NEXT: v_writelane_b32 v31, s14, 26 |
| +; GCN-NEXT: v_writelane_b32 v31, s15, 27 |
| +; GCN-NEXT: v_writelane_b32 v31, s16, 28 |
| +; GCN-NEXT: v_writelane_b32 v31, s17, 29 |
| +; GCN-NEXT: v_writelane_b32 v31, s18, 30 |
| +; GCN-NEXT: v_writelane_b32 v31, s19, 31 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:19] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 32 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 33 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 34 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 35 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 36 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 37 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 38 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 39 |
| -; GCN-NEXT: v_writelane_b32 v0, s12, 40 |
| -; GCN-NEXT: v_writelane_b32 v0, s13, 41 |
| -; GCN-NEXT: v_writelane_b32 v0, s14, 42 |
| -; GCN-NEXT: v_writelane_b32 v0, s15, 43 |
| -; GCN-NEXT: v_writelane_b32 v0, s16, 44 |
| -; GCN-NEXT: v_writelane_b32 v0, s17, 45 |
| -; GCN-NEXT: v_writelane_b32 v0, s18, 46 |
| -; GCN-NEXT: v_writelane_b32 v0, s19, 47 |
| +; GCN-NEXT: v_writelane_b32 v31, s4, 32 |
| +; GCN-NEXT: v_writelane_b32 v31, s5, 33 |
| +; GCN-NEXT: v_writelane_b32 v31, s6, 34 |
| +; GCN-NEXT: v_writelane_b32 v31, s7, 35 |
| +; GCN-NEXT: v_writelane_b32 v31, s8, 36 |
| +; GCN-NEXT: v_writelane_b32 v31, s9, 37 |
| +; GCN-NEXT: v_writelane_b32 v31, s10, 38 |
| +; GCN-NEXT: v_writelane_b32 v31, s11, 39 |
| +; GCN-NEXT: v_writelane_b32 v31, s12, 40 |
| +; GCN-NEXT: v_writelane_b32 v31, s13, 41 |
| +; GCN-NEXT: v_writelane_b32 v31, s14, 42 |
| +; GCN-NEXT: v_writelane_b32 v31, s15, 43 |
| +; GCN-NEXT: v_writelane_b32 v31, s16, 44 |
| +; GCN-NEXT: v_writelane_b32 v31, s17, 45 |
| +; GCN-NEXT: v_writelane_b32 v31, s18, 46 |
| +; GCN-NEXT: v_writelane_b32 v31, s19, 47 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[4:19] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 48 |
| -; GCN-NEXT: v_writelane_b32 v0, s5, 49 |
| -; GCN-NEXT: v_writelane_b32 v0, s6, 50 |
| -; GCN-NEXT: v_writelane_b32 v0, s7, 51 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 52 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 53 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 54 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 55 |
| -; GCN-NEXT: v_writelane_b32 v0, s12, 56 |
| -; GCN-NEXT: v_writelane_b32 v0, s13, 57 |
| -; GCN-NEXT: v_writelane_b32 v0, s14, 58 |
| -; GCN-NEXT: v_writelane_b32 v0, s15, 59 |
| -; GCN-NEXT: v_writelane_b32 v0, s16, 60 |
| -; GCN-NEXT: v_writelane_b32 v0, s17, 61 |
| -; GCN-NEXT: v_writelane_b32 v0, s18, 62 |
| -; GCN-NEXT: v_writelane_b32 v0, s19, 63 |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| +; GCN-NEXT: v_writelane_b32 v31, s4, 48 |
| +; GCN-NEXT: v_writelane_b32 v31, s5, 49 |
| +; GCN-NEXT: v_writelane_b32 v31, s6, 50 |
| +; GCN-NEXT: v_writelane_b32 v31, s7, 51 |
| +; GCN-NEXT: v_writelane_b32 v31, s8, 52 |
| +; GCN-NEXT: v_writelane_b32 v31, s9, 53 |
| +; GCN-NEXT: v_writelane_b32 v31, s10, 54 |
| +; GCN-NEXT: v_writelane_b32 v31, s11, 55 |
| +; GCN-NEXT: v_writelane_b32 v31, s12, 56 |
| +; GCN-NEXT: v_writelane_b32 v31, s13, 57 |
| +; GCN-NEXT: v_writelane_b32 v31, s14, 58 |
| +; GCN-NEXT: v_writelane_b32 v31, s15, 59 |
| +; GCN-NEXT: v_writelane_b32 v31, s16, 60 |
| +; GCN-NEXT: v_writelane_b32 v31, s17, 61 |
| +; GCN-NEXT: v_writelane_b32 v31, s18, 62 |
| +; GCN-NEXT: v_writelane_b32 v31, s19, 63 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[2:3] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| +; GCN-NEXT: s_mov_b64 s[4:5], exec |
| +; GCN-NEXT: s_mov_b64 exec, 3 |
| +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 |
| ; GCN-NEXT: v_writelane_b32 v0, s2, 0 |
| ; GCN-NEXT: v_writelane_b32 v0, s3, 1 |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| +; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_mov_b32 s1, 0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_cmp_lg_u32 s0, s1 |
| ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 |
| ; GCN-NEXT: ; %bb.1: ; %bb0 |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| -; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: v_readlane_b32 s36, v2, 32 |
| -; GCN-NEXT: v_readlane_b32 s37, v2, 33 |
| -; GCN-NEXT: v_readlane_b32 s38, v2, 34 |
| -; GCN-NEXT: v_readlane_b32 s39, v2, 35 |
| -; GCN-NEXT: v_readlane_b32 s40, v2, 36 |
| -; GCN-NEXT: v_readlane_b32 s41, v2, 37 |
| -; GCN-NEXT: v_readlane_b32 s42, v2, 38 |
| -; GCN-NEXT: v_readlane_b32 s43, v2, 39 |
| -; GCN-NEXT: v_readlane_b32 s44, v2, 40 |
| -; GCN-NEXT: v_readlane_b32 s45, v2, 41 |
| -; GCN-NEXT: v_readlane_b32 s46, v2, 42 |
| -; GCN-NEXT: v_readlane_b32 s47, v2, 43 |
| -; GCN-NEXT: v_readlane_b32 s48, v2, 44 |
| -; GCN-NEXT: v_readlane_b32 s49, v2, 45 |
| -; GCN-NEXT: v_readlane_b32 s50, v2, 46 |
| -; GCN-NEXT: v_readlane_b32 s51, v2, 47 |
| -; GCN-NEXT: v_readlane_b32 s0, v2, 16 |
| -; GCN-NEXT: v_readlane_b32 s1, v2, 17 |
| -; GCN-NEXT: v_readlane_b32 s2, v2, 18 |
| -; GCN-NEXT: v_readlane_b32 s3, v2, 19 |
| -; GCN-NEXT: v_readlane_b32 s4, v2, 20 |
| -; GCN-NEXT: v_readlane_b32 s5, v2, 21 |
| -; GCN-NEXT: v_readlane_b32 s6, v2, 22 |
| -; GCN-NEXT: v_readlane_b32 s7, v2, 23 |
| -; GCN-NEXT: v_readlane_b32 s8, v2, 24 |
| -; GCN-NEXT: v_readlane_b32 s9, v2, 25 |
| -; GCN-NEXT: v_readlane_b32 s10, v2, 26 |
| -; GCN-NEXT: v_readlane_b32 s11, v2, 27 |
| -; GCN-NEXT: v_readlane_b32 s12, v2, 28 |
| -; GCN-NEXT: v_readlane_b32 s13, v2, 29 |
| -; GCN-NEXT: v_readlane_b32 s14, v2, 30 |
| -; GCN-NEXT: v_readlane_b32 s15, v2, 31 |
| -; GCN-NEXT: v_readlane_b32 s16, v2, 0 |
| -; GCN-NEXT: v_readlane_b32 s17, v2, 1 |
| -; GCN-NEXT: v_readlane_b32 s18, v2, 2 |
| -; GCN-NEXT: v_readlane_b32 s19, v2, 3 |
| -; GCN-NEXT: v_readlane_b32 s20, v2, 4 |
| -; GCN-NEXT: v_readlane_b32 s21, v2, 5 |
| -; GCN-NEXT: v_readlane_b32 s22, v2, 6 |
| -; GCN-NEXT: v_readlane_b32 s23, v2, 7 |
| -; GCN-NEXT: v_readlane_b32 s24, v2, 8 |
| -; GCN-NEXT: v_readlane_b32 s25, v2, 9 |
| -; GCN-NEXT: v_readlane_b32 s26, v2, 10 |
| -; GCN-NEXT: v_readlane_b32 s27, v2, 11 |
| -; GCN-NEXT: v_readlane_b32 s28, v2, 12 |
| -; GCN-NEXT: v_readlane_b32 s29, v2, 13 |
| -; GCN-NEXT: v_readlane_b32 s30, v2, 14 |
| -; GCN-NEXT: v_readlane_b32 s31, v2, 15 |
| +; GCN-NEXT: v_readlane_b32 s36, v31, 32 |
| +; GCN-NEXT: v_readlane_b32 s37, v31, 33 |
| +; GCN-NEXT: v_readlane_b32 s38, v31, 34 |
| +; GCN-NEXT: v_readlane_b32 s39, v31, 35 |
| +; GCN-NEXT: v_readlane_b32 s40, v31, 36 |
| +; GCN-NEXT: v_readlane_b32 s41, v31, 37 |
| +; GCN-NEXT: v_readlane_b32 s42, v31, 38 |
| +; GCN-NEXT: v_readlane_b32 s43, v31, 39 |
| +; GCN-NEXT: v_readlane_b32 s44, v31, 40 |
| +; GCN-NEXT: v_readlane_b32 s45, v31, 41 |
| +; GCN-NEXT: v_readlane_b32 s46, v31, 42 |
| +; GCN-NEXT: v_readlane_b32 s47, v31, 43 |
| +; GCN-NEXT: v_readlane_b32 s48, v31, 44 |
| +; GCN-NEXT: v_readlane_b32 s49, v31, 45 |
| +; GCN-NEXT: v_readlane_b32 s50, v31, 46 |
| +; GCN-NEXT: v_readlane_b32 s51, v31, 47 |
| +; GCN-NEXT: v_readlane_b32 s0, v31, 16 |
| +; GCN-NEXT: v_readlane_b32 s1, v31, 17 |
| +; GCN-NEXT: v_readlane_b32 s2, v31, 18 |
| +; GCN-NEXT: v_readlane_b32 s3, v31, 19 |
| +; GCN-NEXT: v_readlane_b32 s4, v31, 20 |
| +; GCN-NEXT: v_readlane_b32 s5, v31, 21 |
| +; GCN-NEXT: v_readlane_b32 s6, v31, 22 |
| +; GCN-NEXT: v_readlane_b32 s7, v31, 23 |
| +; GCN-NEXT: v_readlane_b32 s8, v31, 24 |
| +; GCN-NEXT: v_readlane_b32 s9, v31, 25 |
| +; GCN-NEXT: v_readlane_b32 s10, v31, 26 |
| +; GCN-NEXT: v_readlane_b32 s11, v31, 27 |
| +; GCN-NEXT: v_readlane_b32 s12, v31, 28 |
| +; GCN-NEXT: v_readlane_b32 s13, v31, 29 |
| +; GCN-NEXT: v_readlane_b32 s14, v31, 30 |
| +; GCN-NEXT: v_readlane_b32 s15, v31, 31 |
| +; GCN-NEXT: v_readlane_b32 s16, v31, 0 |
| +; GCN-NEXT: v_readlane_b32 s17, v31, 1 |
| +; GCN-NEXT: v_readlane_b32 s18, v31, 2 |
| +; GCN-NEXT: v_readlane_b32 s19, v31, 3 |
| +; GCN-NEXT: v_readlane_b32 s20, v31, 4 |
| +; GCN-NEXT: v_readlane_b32 s21, v31, 5 |
| +; GCN-NEXT: v_readlane_b32 s22, v31, 6 |
| +; GCN-NEXT: v_readlane_b32 s23, v31, 7 |
| +; GCN-NEXT: v_readlane_b32 s24, v31, 8 |
| +; GCN-NEXT: v_readlane_b32 s25, v31, 9 |
| +; GCN-NEXT: v_readlane_b32 s26, v31, 10 |
| +; GCN-NEXT: v_readlane_b32 s27, v31, 11 |
| +; GCN-NEXT: v_readlane_b32 s28, v31, 12 |
| +; GCN-NEXT: v_readlane_b32 s29, v31, 13 |
| +; GCN-NEXT: v_readlane_b32 s30, v31, 14 |
| +; GCN-NEXT: v_readlane_b32 s31, v31, 15 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def v0 |
| ; GCN-NEXT: ;;#ASMEND |
| @@ -1124,24 +1067,32 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 { |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[0:15] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_readlane_b32 s4, v2, 48 |
| -; GCN-NEXT: v_readlane_b32 s5, v2, 49 |
| -; GCN-NEXT: v_readlane_b32 s6, v2, 50 |
| -; GCN-NEXT: v_readlane_b32 s7, v2, 51 |
| -; GCN-NEXT: v_readlane_b32 s8, v2, 52 |
| -; GCN-NEXT: v_readlane_b32 s9, v2, 53 |
| -; GCN-NEXT: v_readlane_b32 s10, v2, 54 |
| -; GCN-NEXT: v_readlane_b32 s11, v2, 55 |
| -; GCN-NEXT: v_readlane_b32 s12, v2, 56 |
| -; GCN-NEXT: v_readlane_b32 s13, v2, 57 |
| -; GCN-NEXT: v_readlane_b32 s14, v2, 58 |
| -; GCN-NEXT: v_readlane_b32 s15, v2, 59 |
| -; GCN-NEXT: v_readlane_b32 s16, v2, 60 |
| -; GCN-NEXT: v_readlane_b32 s17, v2, 61 |
| -; GCN-NEXT: v_readlane_b32 s18, v2, 62 |
| -; GCN-NEXT: v_readlane_b32 s19, v2, 63 |
| +; GCN-NEXT: v_readlane_b32 s4, v31, 48 |
| +; GCN-NEXT: v_readlane_b32 s5, v31, 49 |
| +; GCN-NEXT: v_readlane_b32 s6, v31, 50 |
| +; GCN-NEXT: v_readlane_b32 s7, v31, 51 |
| +; GCN-NEXT: v_readlane_b32 s8, v31, 52 |
| +; GCN-NEXT: v_readlane_b32 s9, v31, 53 |
| +; GCN-NEXT: v_readlane_b32 s10, v31, 54 |
| +; GCN-NEXT: v_readlane_b32 s11, v31, 55 |
| +; GCN-NEXT: v_readlane_b32 s12, v31, 56 |
| +; GCN-NEXT: v_readlane_b32 s13, v31, 57 |
| +; GCN-NEXT: v_readlane_b32 s14, v31, 58 |
| +; GCN-NEXT: v_readlane_b32 s15, v31, 59 |
| +; GCN-NEXT: v_readlane_b32 s16, v31, 60 |
| +; GCN-NEXT: v_readlane_b32 s17, v31, 61 |
| +; GCN-NEXT: v_readlane_b32 s18, v31, 62 |
| +; GCN-NEXT: v_readlane_b32 s19, v31, 63 |
| +; GCN-NEXT: s_mov_b64 s[2:3], exec |
| +; GCN-NEXT: s_mov_b64 exec, 3 |
| +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 |
| +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_readlane_b32 s0, v1, 0 |
| ; GCN-NEXT: v_readlane_b32 s1, v1, 1 |
| +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| +; GCN-NEXT: s_mov_b64 exec, s[2:3] |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[36:51] |
| ; GCN-NEXT: ;;#ASMEND |
| diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll |
| index 2cbb505cd55d..959bc7f33426 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll |
| @@ -1,377 +1,22 @@ |
| -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s |
| +; RUN: not --crash llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s |
| |
| -; This was a negative test to catch an extreme case when all options are exhausted |
| -; while trying to spill SGPRs to memory. After we enabled SGPR spills into virtual VGPRs |
| -; the edge case won't arise and the test would always compile. |
| +; This ends up needing to spill SGPRs to memory, and also does not |
| +; have any free SGPRs available to save the exec mask when doing so. |
| +; The register scavenger also needs to use the emergency stack slot, |
| +; which tries to place the scavenged register restore instruction as |
| +; far down the block as possible, near the terminator. This places a |
| +; restore instruction between the condition and the conditional |
| +; branch, which gets expanded into a sequence involving s_not_b64 on |
| +; the exec mask, clobbering SCC value before the branch. We probably |
| +; have to stop relying on being able to flip and restore the exec |
| +; mask, and always require a free SGPR for saving exec. |
| |
| -define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { |
| -; CHECK-LABEL: kernel0: |
| -; CHECK: ; %bb.0: |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[2:3] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ; implicit-def: $vgpr0 |
| -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 |
| -; CHECK-NEXT: v_writelane_b32 v0, s2, 0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s3, 1 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[4:7] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_writelane_b32 v0, s4, 2 |
| -; CHECK-NEXT: v_writelane_b32 v0, s5, 3 |
| -; CHECK-NEXT: v_writelane_b32 v0, s6, 4 |
| -; CHECK-NEXT: v_writelane_b32 v0, s7, 5 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[4:11] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_writelane_b32 v0, s4, 6 |
| -; CHECK-NEXT: v_writelane_b32 v0, s5, 7 |
| -; CHECK-NEXT: v_writelane_b32 v0, s6, 8 |
| -; CHECK-NEXT: v_writelane_b32 v0, s7, 9 |
| -; CHECK-NEXT: v_writelane_b32 v0, s8, 10 |
| -; CHECK-NEXT: v_writelane_b32 v0, s9, 11 |
| -; CHECK-NEXT: v_writelane_b32 v0, s10, 12 |
| -; CHECK-NEXT: v_writelane_b32 v0, s11, 13 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[4:19] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_writelane_b32 v0, s4, 14 |
| -; CHECK-NEXT: v_writelane_b32 v0, s5, 15 |
| -; CHECK-NEXT: v_writelane_b32 v0, s6, 16 |
| -; CHECK-NEXT: v_writelane_b32 v0, s7, 17 |
| -; CHECK-NEXT: v_writelane_b32 v0, s8, 18 |
| -; CHECK-NEXT: v_writelane_b32 v0, s9, 19 |
| -; CHECK-NEXT: v_writelane_b32 v0, s10, 20 |
| -; CHECK-NEXT: v_writelane_b32 v0, s11, 21 |
| -; CHECK-NEXT: v_writelane_b32 v0, s12, 22 |
| -; CHECK-NEXT: v_writelane_b32 v0, s13, 23 |
| -; CHECK-NEXT: v_writelane_b32 v0, s14, 24 |
| -; CHECK-NEXT: v_writelane_b32 v0, s15, 25 |
| -; CHECK-NEXT: v_writelane_b32 v0, s16, 26 |
| -; CHECK-NEXT: v_writelane_b32 v0, s17, 27 |
| -; CHECK-NEXT: v_writelane_b32 v0, s18, 28 |
| -; CHECK-NEXT: v_writelane_b32 v0, s19, 29 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[2:3] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_writelane_b32 v0, s2, 30 |
| -; CHECK-NEXT: v_writelane_b32 v0, s3, 31 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[4:7] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_writelane_b32 v0, s4, 32 |
| -; CHECK-NEXT: v_writelane_b32 v0, s5, 33 |
| -; CHECK-NEXT: v_writelane_b32 v0, s6, 34 |
| -; CHECK-NEXT: v_writelane_b32 v0, s7, 35 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[4:11] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_writelane_b32 v0, s4, 36 |
| -; CHECK-NEXT: v_writelane_b32 v0, s5, 37 |
| -; CHECK-NEXT: v_writelane_b32 v0, s6, 38 |
| -; CHECK-NEXT: v_writelane_b32 v0, s7, 39 |
| -; CHECK-NEXT: v_writelane_b32 v0, s8, 40 |
| -; CHECK-NEXT: v_writelane_b32 v0, s9, 41 |
| -; CHECK-NEXT: v_writelane_b32 v0, s10, 42 |
| -; CHECK-NEXT: v_writelane_b32 v0, s11, 43 |
| -; CHECK-NEXT: s_waitcnt lgkmcnt(0) |
| -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[16:31] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[52:53] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[48:51] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[36:43] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[0:15] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_writelane_b32 v0, s0, 44 |
| -; CHECK-NEXT: v_writelane_b32 v0, s1, 45 |
| -; CHECK-NEXT: v_writelane_b32 v0, s2, 46 |
| -; CHECK-NEXT: v_writelane_b32 v0, s3, 47 |
| -; CHECK-NEXT: v_writelane_b32 v0, s4, 48 |
| -; CHECK-NEXT: v_writelane_b32 v0, s5, 49 |
| -; CHECK-NEXT: v_writelane_b32 v0, s6, 50 |
| -; CHECK-NEXT: v_writelane_b32 v0, s7, 51 |
| -; CHECK-NEXT: v_writelane_b32 v0, s8, 52 |
| -; CHECK-NEXT: v_writelane_b32 v0, s9, 53 |
| -; CHECK-NEXT: v_writelane_b32 v0, s10, 54 |
| -; CHECK-NEXT: v_writelane_b32 v0, s11, 55 |
| -; CHECK-NEXT: v_writelane_b32 v0, s12, 56 |
| -; CHECK-NEXT: v_writelane_b32 v0, s13, 57 |
| -; CHECK-NEXT: v_writelane_b32 v0, s14, 58 |
| -; CHECK-NEXT: v_writelane_b32 v0, s15, 59 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[34:35] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[44:47] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[0:7] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ; implicit-def: $vgpr1 |
| -; CHECK-NEXT: v_writelane_b32 v0, s0, 60 |
| -; CHECK-NEXT: v_writelane_b32 v1, s4, 0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s1, 61 |
| -; CHECK-NEXT: v_writelane_b32 v1, s5, 1 |
| -; CHECK-NEXT: v_writelane_b32 v0, s2, 62 |
| -; CHECK-NEXT: v_writelane_b32 v1, s6, 2 |
| -; CHECK-NEXT: v_writelane_b32 v0, s3, 63 |
| -; CHECK-NEXT: v_writelane_b32 v1, s7, 3 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[0:15] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_writelane_b32 v1, s0, 4 |
| -; CHECK-NEXT: v_writelane_b32 v1, s1, 5 |
| -; CHECK-NEXT: v_writelane_b32 v1, s2, 6 |
| -; CHECK-NEXT: v_writelane_b32 v1, s3, 7 |
| -; CHECK-NEXT: v_writelane_b32 v1, s4, 8 |
| -; CHECK-NEXT: v_writelane_b32 v1, s5, 9 |
| -; CHECK-NEXT: v_writelane_b32 v1, s6, 10 |
| -; CHECK-NEXT: v_writelane_b32 v1, s7, 11 |
| -; CHECK-NEXT: v_writelane_b32 v1, s8, 12 |
| -; CHECK-NEXT: v_writelane_b32 v1, s9, 13 |
| -; CHECK-NEXT: v_writelane_b32 v1, s10, 14 |
| -; CHECK-NEXT: v_writelane_b32 v1, s11, 15 |
| -; CHECK-NEXT: v_writelane_b32 v1, s12, 16 |
| -; CHECK-NEXT: v_writelane_b32 v1, s13, 17 |
| -; CHECK-NEXT: v_writelane_b32 v1, s14, 18 |
| -; CHECK-NEXT: v_writelane_b32 v1, s15, 19 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[54:55] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[0:3] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_writelane_b32 v1, s0, 20 |
| -; CHECK-NEXT: v_writelane_b32 v1, s1, 21 |
| -; CHECK-NEXT: v_writelane_b32 v1, s2, 22 |
| -; CHECK-NEXT: v_writelane_b32 v1, s3, 23 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[0:7] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_writelane_b32 v1, s0, 24 |
| -; CHECK-NEXT: v_writelane_b32 v1, s1, 25 |
| -; CHECK-NEXT: v_writelane_b32 v1, s2, 26 |
| -; CHECK-NEXT: v_writelane_b32 v1, s3, 27 |
| -; CHECK-NEXT: v_writelane_b32 v1, s4, 28 |
| -; CHECK-NEXT: v_writelane_b32 v1, s5, 29 |
| -; CHECK-NEXT: v_writelane_b32 v1, s6, 30 |
| -; CHECK-NEXT: v_writelane_b32 v1, s7, 31 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; def s[0:15] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_writelane_b32 v1, s0, 32 |
| -; CHECK-NEXT: v_writelane_b32 v1, s1, 33 |
| -; CHECK-NEXT: v_writelane_b32 v1, s2, 34 |
| -; CHECK-NEXT: v_writelane_b32 v1, s3, 35 |
| -; CHECK-NEXT: v_writelane_b32 v1, s4, 36 |
| -; CHECK-NEXT: v_writelane_b32 v1, s5, 37 |
| -; CHECK-NEXT: v_writelane_b32 v1, s6, 38 |
| -; CHECK-NEXT: v_writelane_b32 v1, s7, 39 |
| -; CHECK-NEXT: v_writelane_b32 v1, s8, 40 |
| -; CHECK-NEXT: v_writelane_b32 v1, s9, 41 |
| -; CHECK-NEXT: v_writelane_b32 v1, s10, 42 |
| -; CHECK-NEXT: v_writelane_b32 v1, s11, 43 |
| -; CHECK-NEXT: v_writelane_b32 v1, s12, 44 |
| -; CHECK-NEXT: v_writelane_b32 v1, s13, 45 |
| -; CHECK-NEXT: v_writelane_b32 v1, s14, 46 |
| -; CHECK-NEXT: v_writelane_b32 v1, s15, 47 |
| -; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 |
| -; CHECK-NEXT: ; %bb.1: ; %ret |
| -; CHECK-NEXT: s_endpgm |
| -; CHECK-NEXT: .LBB0_2: ; %bb0 |
| -; CHECK-NEXT: v_readlane_b32 s0, v0, 0 |
| -; CHECK-NEXT: v_readlane_b32 s1, v0, 1 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:1] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s0, v0, 2 |
| -; CHECK-NEXT: v_readlane_b32 s1, v0, 3 |
| -; CHECK-NEXT: v_readlane_b32 s2, v0, 4 |
| -; CHECK-NEXT: v_readlane_b32 s3, v0, 5 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:3] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s0, v0, 6 |
| -; CHECK-NEXT: v_readlane_b32 s1, v0, 7 |
| -; CHECK-NEXT: v_readlane_b32 s2, v0, 8 |
| -; CHECK-NEXT: v_readlane_b32 s3, v0, 9 |
| -; CHECK-NEXT: v_readlane_b32 s4, v0, 10 |
| -; CHECK-NEXT: v_readlane_b32 s5, v0, 11 |
| -; CHECK-NEXT: v_readlane_b32 s6, v0, 12 |
| -; CHECK-NEXT: v_readlane_b32 s7, v0, 13 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:7] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s0, v0, 14 |
| -; CHECK-NEXT: v_readlane_b32 s1, v0, 15 |
| -; CHECK-NEXT: v_readlane_b32 s2, v0, 16 |
| -; CHECK-NEXT: v_readlane_b32 s3, v0, 17 |
| -; CHECK-NEXT: v_readlane_b32 s4, v0, 18 |
| -; CHECK-NEXT: v_readlane_b32 s5, v0, 19 |
| -; CHECK-NEXT: v_readlane_b32 s6, v0, 20 |
| -; CHECK-NEXT: v_readlane_b32 s7, v0, 21 |
| -; CHECK-NEXT: v_readlane_b32 s8, v0, 22 |
| -; CHECK-NEXT: v_readlane_b32 s9, v0, 23 |
| -; CHECK-NEXT: v_readlane_b32 s10, v0, 24 |
| -; CHECK-NEXT: v_readlane_b32 s11, v0, 25 |
| -; CHECK-NEXT: v_readlane_b32 s12, v0, 26 |
| -; CHECK-NEXT: v_readlane_b32 s13, v0, 27 |
| -; CHECK-NEXT: v_readlane_b32 s14, v0, 28 |
| -; CHECK-NEXT: v_readlane_b32 s15, v0, 29 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:15] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s0, v0, 30 |
| -; CHECK-NEXT: v_readlane_b32 s1, v0, 31 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:1] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s0, v0, 32 |
| -; CHECK-NEXT: v_readlane_b32 s1, v0, 33 |
| -; CHECK-NEXT: v_readlane_b32 s2, v0, 34 |
| -; CHECK-NEXT: v_readlane_b32 s3, v0, 35 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:3] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s0, v0, 36 |
| -; CHECK-NEXT: v_readlane_b32 s1, v0, 37 |
| -; CHECK-NEXT: v_readlane_b32 s2, v0, 38 |
| -; CHECK-NEXT: v_readlane_b32 s3, v0, 39 |
| -; CHECK-NEXT: v_readlane_b32 s4, v0, 40 |
| -; CHECK-NEXT: v_readlane_b32 s5, v0, 41 |
| -; CHECK-NEXT: v_readlane_b32 s6, v0, 42 |
| -; CHECK-NEXT: v_readlane_b32 s7, v0, 43 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:7] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s0, v0, 44 |
| -; CHECK-NEXT: v_readlane_b32 s1, v0, 45 |
| -; CHECK-NEXT: v_readlane_b32 s2, v0, 46 |
| -; CHECK-NEXT: v_readlane_b32 s3, v0, 47 |
| -; CHECK-NEXT: v_readlane_b32 s4, v0, 48 |
| -; CHECK-NEXT: v_readlane_b32 s5, v0, 49 |
| -; CHECK-NEXT: v_readlane_b32 s6, v0, 50 |
| -; CHECK-NEXT: v_readlane_b32 s7, v0, 51 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[16:31] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[52:53] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[48:51] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[36:43] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s8, v0, 52 |
| -; CHECK-NEXT: v_readlane_b32 s9, v0, 53 |
| -; CHECK-NEXT: v_readlane_b32 s10, v0, 54 |
| -; CHECK-NEXT: v_readlane_b32 s11, v0, 55 |
| -; CHECK-NEXT: v_readlane_b32 s12, v0, 56 |
| -; CHECK-NEXT: v_readlane_b32 s13, v0, 57 |
| -; CHECK-NEXT: v_readlane_b32 s14, v0, 58 |
| -; CHECK-NEXT: v_readlane_b32 s15, v0, 59 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:15] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s0, v0, 60 |
| -; CHECK-NEXT: v_readlane_b32 s1, v0, 61 |
| -; CHECK-NEXT: v_readlane_b32 s2, v0, 62 |
| -; CHECK-NEXT: v_readlane_b32 s3, v0, 63 |
| -; CHECK-NEXT: v_readlane_b32 s4, v1, 0 |
| -; CHECK-NEXT: v_readlane_b32 s5, v1, 1 |
| -; CHECK-NEXT: v_readlane_b32 s6, v1, 2 |
| -; CHECK-NEXT: v_readlane_b32 s7, v1, 3 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[34:35] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[44:47] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:7] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s0, v1, 4 |
| -; CHECK-NEXT: v_readlane_b32 s1, v1, 5 |
| -; CHECK-NEXT: v_readlane_b32 s2, v1, 6 |
| -; CHECK-NEXT: v_readlane_b32 s3, v1, 7 |
| -; CHECK-NEXT: v_readlane_b32 s4, v1, 8 |
| -; CHECK-NEXT: v_readlane_b32 s5, v1, 9 |
| -; CHECK-NEXT: v_readlane_b32 s6, v1, 10 |
| -; CHECK-NEXT: v_readlane_b32 s7, v1, 11 |
| -; CHECK-NEXT: v_readlane_b32 s8, v1, 12 |
| -; CHECK-NEXT: v_readlane_b32 s9, v1, 13 |
| -; CHECK-NEXT: v_readlane_b32 s10, v1, 14 |
| -; CHECK-NEXT: v_readlane_b32 s11, v1, 15 |
| -; CHECK-NEXT: v_readlane_b32 s12, v1, 16 |
| -; CHECK-NEXT: v_readlane_b32 s13, v1, 17 |
| -; CHECK-NEXT: v_readlane_b32 s14, v1, 18 |
| -; CHECK-NEXT: v_readlane_b32 s15, v1, 19 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:15] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s0, v1, 20 |
| -; CHECK-NEXT: v_readlane_b32 s1, v1, 21 |
| -; CHECK-NEXT: v_readlane_b32 s2, v1, 22 |
| -; CHECK-NEXT: v_readlane_b32 s3, v1, 23 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[54:55] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:3] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s0, v1, 24 |
| -; CHECK-NEXT: v_readlane_b32 s1, v1, 25 |
| -; CHECK-NEXT: v_readlane_b32 s2, v1, 26 |
| -; CHECK-NEXT: v_readlane_b32 s3, v1, 27 |
| -; CHECK-NEXT: v_readlane_b32 s4, v1, 28 |
| -; CHECK-NEXT: v_readlane_b32 s5, v1, 29 |
| -; CHECK-NEXT: v_readlane_b32 s6, v1, 30 |
| -; CHECK-NEXT: v_readlane_b32 s7, v1, 31 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:7] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: v_readlane_b32 s0, v1, 32 |
| -; CHECK-NEXT: v_readlane_b32 s1, v1, 33 |
| -; CHECK-NEXT: v_readlane_b32 s2, v1, 34 |
| -; CHECK-NEXT: v_readlane_b32 s3, v1, 35 |
| -; CHECK-NEXT: v_readlane_b32 s4, v1, 36 |
| -; CHECK-NEXT: v_readlane_b32 s5, v1, 37 |
| -; CHECK-NEXT: v_readlane_b32 s6, v1, 38 |
| -; CHECK-NEXT: v_readlane_b32 s7, v1, 39 |
| -; CHECK-NEXT: v_readlane_b32 s8, v1, 40 |
| -; CHECK-NEXT: v_readlane_b32 s9, v1, 41 |
| -; CHECK-NEXT: v_readlane_b32 s10, v1, 42 |
| -; CHECK-NEXT: v_readlane_b32 s11, v1, 43 |
| -; CHECK-NEXT: v_readlane_b32 s12, v1, 44 |
| -; CHECK-NEXT: v_readlane_b32 s13, v1, 45 |
| -; CHECK-NEXT: v_readlane_b32 s14, v1, 46 |
| -; CHECK-NEXT: v_readlane_b32 s15, v1, 47 |
| -; CHECK-NEXT: ;;#ASMSTART |
| -; CHECK-NEXT: ; use s[0:15] |
| -; CHECK-NEXT: ;;#ASMEND |
| -; CHECK-NEXT: s_endpgm |
| +; CHECK: *** Bad machine code: Using an undefined physical register *** |
| +; CHECK-NEXT: - function: kernel0 |
| +; CHECK-NEXT: - basic block: %bb.0 |
| +; CHECK-NEXT: - instruction: S_CBRANCH_SCC1 %bb.2, implicit killed $scc |
| +; CHECK-NEXT: - operand 1: implicit killed $scc |
| +define amdgpu_kernel void @kernel0(i32 addrspace(1)* %out, i32 %in) #1 { |
| call void asm sideeffect "", "~{v[0:7]}" () #0 |
| call void asm sideeffect "", "~{v[8:15]}" () #0 |
| call void asm sideeffect "", "~{v[16:19]}"() #0 |
| diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir |
| index 3ce4c6c67718..687adc69bd14 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir |
| @@ -1,6 +1,4 @@ |
| -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py |
| -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s |
| -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck -check-prefix=PEI %s |
| +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s |
| |
| # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, replace the dead frame index in the DBG_VALUE instruction with reg 0. |
| # Otherwise, the test would crash during PEI while trying to replace the dead frame index. |
| @@ -41,21 +39,13 @@ machineFunctionInfo: |
| workGroupIDX: { reg: '$sgpr8' } |
| privateSegmentWaveByteOffset: { reg: '$sgpr9' } |
| body: | |
| - ; SGPR_SPILL-LABEL: name: test |
| - ; SGPR_SPILL: bb.0: |
| - ; SGPR_SPILL: [[VGPR:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; SGPR_SPILL: [[VGPR]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[VGPR]] |
| - ; SGPR_SPILL: DBG_VALUE $noreg, 0 |
| - ; SGPR_SPILL: bb.1: |
| - ; SGPR_SPILL: $sgpr10 = V_READLANE_B32 [[VGPR]], 0 |
| - ; SGPR_SPILL: S_ENDPGM 0 |
| - ; PEI-LABEL: name: test |
| - ; PEI: bb.0: |
| - ; PEI: renamable $[[VGPR:vgpr[0-9]+]] = IMPLICIT_DEF |
| - ; PEI: renamable $[[VGPR]] = V_WRITELANE_B32 killed $sgpr10, 0, killed $[[VGPR]] |
| - ; PEI: bb.1: |
| - ; PEI: $sgpr10 = V_READLANE_B32 killed $[[VGPR]], 0 |
| - ; PEI: S_ENDPGM 0 |
| + ; CHECK-LABEL: name: test |
| + ; CHECK: bb.0: |
| + ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, $vgpr0 |
| + ; CHECK: DBG_VALUE $noreg, 0 |
| + ; CHECK: bb.1: |
| + ; CHECK: $sgpr10 = V_READLANE_B32 $vgpr0, 0 |
| + ; CHECK: S_ENDPGM 0 |
| bb.0: |
| renamable $sgpr10 = IMPLICIT_DEF |
| SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 |
| diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir |
| index a6cb7d4af764..4694810379fe 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir |
| @@ -1,4 +1,4 @@ |
| -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s |
| +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s |
| |
| # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, we replace the dead frame index in the DBG_VALUE instruction with reg 0. |
| # Skip looking for frame indices in the debug value instruction for incoming arguments passed via stack. The test would crash otherwise. |
| @@ -45,7 +45,7 @@ machineFunctionInfo: |
| body: | |
| ; CHECK-LABEL: name: test |
| ; CHECK: bb.0: |
| - ; CHECK: DBG_VALUE |
| + ; CHECK: DBG_VALUE $noreg, 0 |
| bb.0: |
| renamable $sgpr10 = IMPLICIT_DEF |
| SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 |
| diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll |
| index 804779d5a63f..16aadade906e 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll |
| @@ -1,10 +1,8 @@ |
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s |
| |
| -; This test was originally written when SGPRs are spilled directly to physical VGPRs and |
| -; stressed a case when there wasn't enough VGPRs to accommodate all spills. |
| -; When we started spilling them into virtual VGPR lanes, we always succeed in doing so. |
| -; The regalloc pass later takes care of allocating VGPRs to these virtual registers. |
| +; The first 64 SGPR spills can go to a VGPR, but there isn't a second |
| +; so some spills must be to memory. The last 16 element spill runs out of lanes at the 15th element. |
| |
| define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 { |
| ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill: |
| @@ -25,179 +23,179 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[8:23] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 0 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 1 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 2 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 3 |
| -; GCN-NEXT: v_writelane_b32 v0, s12, 4 |
| -; GCN-NEXT: v_writelane_b32 v0, s13, 5 |
| -; GCN-NEXT: v_writelane_b32 v0, s14, 6 |
| -; GCN-NEXT: v_writelane_b32 v0, s15, 7 |
| -; GCN-NEXT: v_writelane_b32 v0, s16, 8 |
| -; GCN-NEXT: v_writelane_b32 v0, s17, 9 |
| -; GCN-NEXT: v_writelane_b32 v0, s18, 10 |
| -; GCN-NEXT: v_writelane_b32 v0, s19, 11 |
| -; GCN-NEXT: v_writelane_b32 v0, s20, 12 |
| -; GCN-NEXT: v_writelane_b32 v0, s21, 13 |
| -; GCN-NEXT: v_writelane_b32 v0, s22, 14 |
| -; GCN-NEXT: v_writelane_b32 v0, s23, 15 |
| +; GCN-NEXT: v_writelane_b32 v23, s8, 0 |
| +; GCN-NEXT: v_writelane_b32 v23, s9, 1 |
| +; GCN-NEXT: v_writelane_b32 v23, s10, 2 |
| +; GCN-NEXT: v_writelane_b32 v23, s11, 3 |
| +; GCN-NEXT: v_writelane_b32 v23, s12, 4 |
| +; GCN-NEXT: v_writelane_b32 v23, s13, 5 |
| +; GCN-NEXT: v_writelane_b32 v23, s14, 6 |
| +; GCN-NEXT: v_writelane_b32 v23, s15, 7 |
| +; GCN-NEXT: v_writelane_b32 v23, s16, 8 |
| +; GCN-NEXT: v_writelane_b32 v23, s17, 9 |
| +; GCN-NEXT: v_writelane_b32 v23, s18, 10 |
| +; GCN-NEXT: v_writelane_b32 v23, s19, 11 |
| +; GCN-NEXT: v_writelane_b32 v23, s20, 12 |
| +; GCN-NEXT: v_writelane_b32 v23, s21, 13 |
| +; GCN-NEXT: v_writelane_b32 v23, s22, 14 |
| +; GCN-NEXT: v_writelane_b32 v23, s23, 15 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[8:23] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 16 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 17 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 18 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 19 |
| -; GCN-NEXT: v_writelane_b32 v0, s12, 20 |
| -; GCN-NEXT: v_writelane_b32 v0, s13, 21 |
| -; GCN-NEXT: v_writelane_b32 v0, s14, 22 |
| -; GCN-NEXT: v_writelane_b32 v0, s15, 23 |
| -; GCN-NEXT: v_writelane_b32 v0, s16, 24 |
| -; GCN-NEXT: v_writelane_b32 v0, s17, 25 |
| -; GCN-NEXT: v_writelane_b32 v0, s18, 26 |
| -; GCN-NEXT: v_writelane_b32 v0, s19, 27 |
| -; GCN-NEXT: v_writelane_b32 v0, s20, 28 |
| -; GCN-NEXT: v_writelane_b32 v0, s21, 29 |
| -; GCN-NEXT: v_writelane_b32 v0, s22, 30 |
| -; GCN-NEXT: v_writelane_b32 v0, s23, 31 |
| +; GCN-NEXT: v_writelane_b32 v23, s8, 16 |
| +; GCN-NEXT: v_writelane_b32 v23, s9, 17 |
| +; GCN-NEXT: v_writelane_b32 v23, s10, 18 |
| +; GCN-NEXT: v_writelane_b32 v23, s11, 19 |
| +; GCN-NEXT: v_writelane_b32 v23, s12, 20 |
| +; GCN-NEXT: v_writelane_b32 v23, s13, 21 |
| +; GCN-NEXT: v_writelane_b32 v23, s14, 22 |
| +; GCN-NEXT: v_writelane_b32 v23, s15, 23 |
| +; GCN-NEXT: v_writelane_b32 v23, s16, 24 |
| +; GCN-NEXT: v_writelane_b32 v23, s17, 25 |
| +; GCN-NEXT: v_writelane_b32 v23, s18, 26 |
| +; GCN-NEXT: v_writelane_b32 v23, s19, 27 |
| +; GCN-NEXT: v_writelane_b32 v23, s20, 28 |
| +; GCN-NEXT: v_writelane_b32 v23, s21, 29 |
| +; GCN-NEXT: v_writelane_b32 v23, s22, 30 |
| +; GCN-NEXT: v_writelane_b32 v23, s23, 31 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[8:23] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 32 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 33 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 34 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 35 |
| -; GCN-NEXT: v_writelane_b32 v0, s12, 36 |
| -; GCN-NEXT: v_writelane_b32 v0, s13, 37 |
| -; GCN-NEXT: v_writelane_b32 v0, s14, 38 |
| -; GCN-NEXT: v_writelane_b32 v0, s15, 39 |
| -; GCN-NEXT: v_writelane_b32 v0, s16, 40 |
| -; GCN-NEXT: v_writelane_b32 v0, s17, 41 |
| -; GCN-NEXT: v_writelane_b32 v0, s18, 42 |
| -; GCN-NEXT: v_writelane_b32 v0, s19, 43 |
| -; GCN-NEXT: v_writelane_b32 v0, s20, 44 |
| -; GCN-NEXT: v_writelane_b32 v0, s21, 45 |
| -; GCN-NEXT: v_writelane_b32 v0, s22, 46 |
| -; GCN-NEXT: v_writelane_b32 v0, s23, 47 |
| +; GCN-NEXT: v_writelane_b32 v23, s8, 32 |
| +; GCN-NEXT: v_writelane_b32 v23, s9, 33 |
| +; GCN-NEXT: v_writelane_b32 v23, s10, 34 |
| +; GCN-NEXT: v_writelane_b32 v23, s11, 35 |
| +; GCN-NEXT: v_writelane_b32 v23, s12, 36 |
| +; GCN-NEXT: v_writelane_b32 v23, s13, 37 |
| +; GCN-NEXT: v_writelane_b32 v23, s14, 38 |
| +; GCN-NEXT: v_writelane_b32 v23, s15, 39 |
| +; GCN-NEXT: v_writelane_b32 v23, s16, 40 |
| +; GCN-NEXT: v_writelane_b32 v23, s17, 41 |
| +; GCN-NEXT: v_writelane_b32 v23, s18, 42 |
| +; GCN-NEXT: v_writelane_b32 v23, s19, 43 |
| +; GCN-NEXT: v_writelane_b32 v23, s20, 44 |
| +; GCN-NEXT: v_writelane_b32 v23, s21, 45 |
| +; GCN-NEXT: v_writelane_b32 v23, s22, 46 |
| +; GCN-NEXT: v_writelane_b32 v23, s23, 47 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[8:23] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_writelane_b32 v0, s8, 48 |
| -; GCN-NEXT: v_writelane_b32 v0, s9, 49 |
| -; GCN-NEXT: v_writelane_b32 v0, s10, 50 |
| -; GCN-NEXT: v_writelane_b32 v0, s11, 51 |
| -; GCN-NEXT: v_writelane_b32 v0, s12, 52 |
| -; GCN-NEXT: v_writelane_b32 v0, s13, 53 |
| -; GCN-NEXT: v_writelane_b32 v0, s14, 54 |
| -; GCN-NEXT: v_writelane_b32 v0, s15, 55 |
| -; GCN-NEXT: v_writelane_b32 v0, s16, 56 |
| -; GCN-NEXT: v_writelane_b32 v0, s17, 57 |
| -; GCN-NEXT: v_writelane_b32 v0, s18, 58 |
| -; GCN-NEXT: v_writelane_b32 v0, s19, 59 |
| -; GCN-NEXT: v_writelane_b32 v0, s20, 60 |
| -; GCN-NEXT: v_writelane_b32 v0, s21, 61 |
| -; GCN-NEXT: v_writelane_b32 v0, s22, 62 |
| -; GCN-NEXT: v_writelane_b32 v0, s23, 63 |
| -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[24:25] |
| +; GCN-NEXT: v_writelane_b32 v23, s8, 48 |
| +; GCN-NEXT: v_writelane_b32 v23, s9, 49 |
| +; GCN-NEXT: v_writelane_b32 v23, s10, 50 |
| +; GCN-NEXT: v_writelane_b32 v23, s11, 51 |
| +; GCN-NEXT: v_writelane_b32 v23, s12, 52 |
| +; GCN-NEXT: v_writelane_b32 v23, s13, 53 |
| +; GCN-NEXT: v_writelane_b32 v23, s14, 54 |
| +; GCN-NEXT: v_writelane_b32 v23, s15, 55 |
| +; GCN-NEXT: v_writelane_b32 v23, s16, 56 |
| +; GCN-NEXT: v_writelane_b32 v23, s17, 57 |
| +; GCN-NEXT: v_writelane_b32 v23, s18, 58 |
| +; GCN-NEXT: v_writelane_b32 v23, s19, 59 |
| +; GCN-NEXT: v_writelane_b32 v23, s20, 60 |
| +; GCN-NEXT: v_writelane_b32 v23, s21, 61 |
| +; GCN-NEXT: v_writelane_b32 v23, s22, 62 |
| +; GCN-NEXT: v_writelane_b32 v23, s23, 63 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s[6:7] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| +; GCN-NEXT: s_mov_b64 s[8:9], exec |
| +; GCN-NEXT: s_mov_b64 exec, 3 |
| +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| ; GCN-NEXT: v_writelane_b32 v0, s6, 0 |
| ; GCN-NEXT: v_writelane_b32 v0, s7, 1 |
| -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 |
| ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[24:25] |
| +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| +; GCN-NEXT: s_mov_b64 exec, s[8:9] |
| ; GCN-NEXT: s_mov_b32 s5, 0 |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_cmp_lg_u32 s4, s5 |
| ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 |
| ; GCN-NEXT: ; %bb.1: ; %bb0 |
| -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[24:25] |
| -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 |
| -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[24:25] |
| -; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: v_readlane_b32 s4, v1, 0 |
| -; GCN-NEXT: v_readlane_b32 s5, v1, 1 |
| -; GCN-NEXT: v_readlane_b32 s6, v1, 2 |
| -; GCN-NEXT: v_readlane_b32 s7, v1, 3 |
| -; GCN-NEXT: v_readlane_b32 s8, v1, 4 |
| -; GCN-NEXT: v_readlane_b32 s9, v1, 5 |
| -; GCN-NEXT: v_readlane_b32 s10, v1, 6 |
| -; GCN-NEXT: v_readlane_b32 s11, v1, 7 |
| -; GCN-NEXT: v_readlane_b32 s12, v1, 8 |
| -; GCN-NEXT: v_readlane_b32 s13, v1, 9 |
| -; GCN-NEXT: v_readlane_b32 s14, v1, 10 |
| -; GCN-NEXT: v_readlane_b32 s15, v1, 11 |
| -; GCN-NEXT: v_readlane_b32 s16, v1, 12 |
| -; GCN-NEXT: v_readlane_b32 s17, v1, 13 |
| -; GCN-NEXT: v_readlane_b32 s18, v1, 14 |
| -; GCN-NEXT: v_readlane_b32 s19, v1, 15 |
| +; GCN-NEXT: v_readlane_b32 s4, v23, 0 |
| +; GCN-NEXT: v_readlane_b32 s5, v23, 1 |
| +; GCN-NEXT: v_readlane_b32 s6, v23, 2 |
| +; GCN-NEXT: v_readlane_b32 s7, v23, 3 |
| +; GCN-NEXT: v_readlane_b32 s8, v23, 4 |
| +; GCN-NEXT: v_readlane_b32 s9, v23, 5 |
| +; GCN-NEXT: v_readlane_b32 s10, v23, 6 |
| +; GCN-NEXT: v_readlane_b32 s11, v23, 7 |
| +; GCN-NEXT: v_readlane_b32 s12, v23, 8 |
| +; GCN-NEXT: v_readlane_b32 s13, v23, 9 |
| +; GCN-NEXT: v_readlane_b32 s14, v23, 10 |
| +; GCN-NEXT: v_readlane_b32 s15, v23, 11 |
| +; GCN-NEXT: v_readlane_b32 s16, v23, 12 |
| +; GCN-NEXT: v_readlane_b32 s17, v23, 13 |
| +; GCN-NEXT: v_readlane_b32 s18, v23, 14 |
| +; GCN-NEXT: v_readlane_b32 s19, v23, 15 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[4:19] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_readlane_b32 s4, v1, 16 |
| -; GCN-NEXT: v_readlane_b32 s5, v1, 17 |
| -; GCN-NEXT: v_readlane_b32 s6, v1, 18 |
| -; GCN-NEXT: v_readlane_b32 s7, v1, 19 |
| -; GCN-NEXT: v_readlane_b32 s8, v1, 20 |
| -; GCN-NEXT: v_readlane_b32 s9, v1, 21 |
| -; GCN-NEXT: v_readlane_b32 s10, v1, 22 |
| -; GCN-NEXT: v_readlane_b32 s11, v1, 23 |
| -; GCN-NEXT: v_readlane_b32 s12, v1, 24 |
| -; GCN-NEXT: v_readlane_b32 s13, v1, 25 |
| -; GCN-NEXT: v_readlane_b32 s14, v1, 26 |
| -; GCN-NEXT: v_readlane_b32 s15, v1, 27 |
| -; GCN-NEXT: v_readlane_b32 s16, v1, 28 |
| -; GCN-NEXT: v_readlane_b32 s17, v1, 29 |
| -; GCN-NEXT: v_readlane_b32 s18, v1, 30 |
| -; GCN-NEXT: v_readlane_b32 s19, v1, 31 |
| +; GCN-NEXT: v_readlane_b32 s4, v23, 16 |
| +; GCN-NEXT: v_readlane_b32 s5, v23, 17 |
| +; GCN-NEXT: v_readlane_b32 s6, v23, 18 |
| +; GCN-NEXT: v_readlane_b32 s7, v23, 19 |
| +; GCN-NEXT: v_readlane_b32 s8, v23, 20 |
| +; GCN-NEXT: v_readlane_b32 s9, v23, 21 |
| +; GCN-NEXT: v_readlane_b32 s10, v23, 22 |
| +; GCN-NEXT: v_readlane_b32 s11, v23, 23 |
| +; GCN-NEXT: v_readlane_b32 s12, v23, 24 |
| +; GCN-NEXT: v_readlane_b32 s13, v23, 25 |
| +; GCN-NEXT: v_readlane_b32 s14, v23, 26 |
| +; GCN-NEXT: v_readlane_b32 s15, v23, 27 |
| +; GCN-NEXT: v_readlane_b32 s16, v23, 28 |
| +; GCN-NEXT: v_readlane_b32 s17, v23, 29 |
| +; GCN-NEXT: v_readlane_b32 s18, v23, 30 |
| +; GCN-NEXT: v_readlane_b32 s19, v23, 31 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[4:19] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_readlane_b32 s4, v1, 32 |
| -; GCN-NEXT: v_readlane_b32 s5, v1, 33 |
| -; GCN-NEXT: v_readlane_b32 s6, v1, 34 |
| -; GCN-NEXT: v_readlane_b32 s7, v1, 35 |
| -; GCN-NEXT: v_readlane_b32 s8, v1, 36 |
| -; GCN-NEXT: v_readlane_b32 s9, v1, 37 |
| -; GCN-NEXT: v_readlane_b32 s10, v1, 38 |
| -; GCN-NEXT: v_readlane_b32 s11, v1, 39 |
| -; GCN-NEXT: v_readlane_b32 s12, v1, 40 |
| -; GCN-NEXT: v_readlane_b32 s13, v1, 41 |
| -; GCN-NEXT: v_readlane_b32 s14, v1, 42 |
| -; GCN-NEXT: v_readlane_b32 s15, v1, 43 |
| -; GCN-NEXT: v_readlane_b32 s16, v1, 44 |
| -; GCN-NEXT: v_readlane_b32 s17, v1, 45 |
| -; GCN-NEXT: v_readlane_b32 s18, v1, 46 |
| -; GCN-NEXT: v_readlane_b32 s19, v1, 47 |
| +; GCN-NEXT: v_readlane_b32 s4, v23, 32 |
| +; GCN-NEXT: v_readlane_b32 s5, v23, 33 |
| +; GCN-NEXT: v_readlane_b32 s6, v23, 34 |
| +; GCN-NEXT: v_readlane_b32 s7, v23, 35 |
| +; GCN-NEXT: v_readlane_b32 s8, v23, 36 |
| +; GCN-NEXT: v_readlane_b32 s9, v23, 37 |
| +; GCN-NEXT: v_readlane_b32 s10, v23, 38 |
| +; GCN-NEXT: v_readlane_b32 s11, v23, 39 |
| +; GCN-NEXT: v_readlane_b32 s12, v23, 40 |
| +; GCN-NEXT: v_readlane_b32 s13, v23, 41 |
| +; GCN-NEXT: v_readlane_b32 s14, v23, 42 |
| +; GCN-NEXT: v_readlane_b32 s15, v23, 43 |
| +; GCN-NEXT: v_readlane_b32 s16, v23, 44 |
| +; GCN-NEXT: v_readlane_b32 s17, v23, 45 |
| +; GCN-NEXT: v_readlane_b32 s18, v23, 46 |
| +; GCN-NEXT: v_readlane_b32 s19, v23, 47 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[4:19] |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: v_readlane_b32 s8, v1, 48 |
| -; GCN-NEXT: v_readlane_b32 s9, v1, 49 |
| -; GCN-NEXT: v_readlane_b32 s10, v1, 50 |
| -; GCN-NEXT: v_readlane_b32 s11, v1, 51 |
| -; GCN-NEXT: v_readlane_b32 s12, v1, 52 |
| -; GCN-NEXT: v_readlane_b32 s13, v1, 53 |
| -; GCN-NEXT: v_readlane_b32 s14, v1, 54 |
| -; GCN-NEXT: v_readlane_b32 s15, v1, 55 |
| -; GCN-NEXT: v_readlane_b32 s16, v1, 56 |
| -; GCN-NEXT: v_readlane_b32 s17, v1, 57 |
| -; GCN-NEXT: v_readlane_b32 s18, v1, 58 |
| -; GCN-NEXT: v_readlane_b32 s19, v1, 59 |
| -; GCN-NEXT: v_readlane_b32 s20, v1, 60 |
| -; GCN-NEXT: v_readlane_b32 s21, v1, 61 |
| -; GCN-NEXT: v_readlane_b32 s22, v1, 62 |
| -; GCN-NEXT: v_readlane_b32 s23, v1, 63 |
| +; GCN-NEXT: v_readlane_b32 s8, v23, 48 |
| +; GCN-NEXT: v_readlane_b32 s9, v23, 49 |
| +; GCN-NEXT: v_readlane_b32 s10, v23, 50 |
| +; GCN-NEXT: v_readlane_b32 s11, v23, 51 |
| +; GCN-NEXT: v_readlane_b32 s12, v23, 52 |
| +; GCN-NEXT: v_readlane_b32 s13, v23, 53 |
| +; GCN-NEXT: v_readlane_b32 s14, v23, 54 |
| +; GCN-NEXT: v_readlane_b32 s15, v23, 55 |
| +; GCN-NEXT: v_readlane_b32 s16, v23, 56 |
| +; GCN-NEXT: v_readlane_b32 s17, v23, 57 |
| +; GCN-NEXT: v_readlane_b32 s18, v23, 58 |
| +; GCN-NEXT: v_readlane_b32 s19, v23, 59 |
| +; GCN-NEXT: v_readlane_b32 s20, v23, 60 |
| +; GCN-NEXT: v_readlane_b32 s21, v23, 61 |
| +; GCN-NEXT: v_readlane_b32 s22, v23, 62 |
| +; GCN-NEXT: v_readlane_b32 s23, v23, 63 |
| +; GCN-NEXT: s_mov_b64 s[6:7], exec |
| +; GCN-NEXT: s_mov_b64 exec, 3 |
| +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: v_readlane_b32 s4, v0, 0 |
| ; GCN-NEXT: v_readlane_b32 s5, v0, 1 |
| +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| +; GCN-NEXT: s_mov_b64 exec, s[6:7] |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s[8:23] |
| ; GCN-NEXT: ;;#ASMEND |
| diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir |
| index 26a5eedc3eca..9596d3b7f635 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir |
| @@ -20,11 +20,10 @@ body: | |
| liveins: $sgpr4 |
| |
| ; CHECK-LABEL: name: sgpr_spill_s64_undef_high32 |
| - ; CHECK: liveins: $sgpr4 |
| + ; CHECK: liveins: $sgpr4, $vgpr0 |
| ; CHECK-NEXT: {{ $}} |
| - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 |
| - ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5 |
| SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) |
| |
| ... |
| @@ -46,11 +45,10 @@ body: | |
| liveins: $sgpr5 |
| |
| ; CHECK-LABEL: name: sgpr_spill_s64_undef_low32 |
| - ; CHECK: liveins: $sgpr5 |
| + ; CHECK: liveins: $sgpr5, $vgpr0 |
| ; CHECK-NEXT: {{ $}} |
| - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 |
| - ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5 |
| SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) |
| |
| ... |
| diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll |
| index c0364b4d0e90..b20f540cf247 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll |
| @@ -12,17 +12,16 @@ define amdgpu_kernel void @kernel() { |
| ; GCN-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 |
| ; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 |
| ; GCN-NEXT: s_mov_b32 s38, -1 |
| -; GCN-NEXT: ; implicit-def: $vgpr3 |
| ; GCN-NEXT: s_mov_b32 s39, 0xe00000 |
| -; GCN-NEXT: v_writelane_b32 v3, s4, 0 |
| +; GCN-NEXT: v_writelane_b32 v40, s4, 0 |
| ; GCN-NEXT: s_add_u32 s36, s36, s11 |
| -; GCN-NEXT: v_writelane_b32 v3, s5, 1 |
| +; GCN-NEXT: v_writelane_b32 v40, s5, 1 |
| ; GCN-NEXT: s_addc_u32 s37, s37, 0 |
| ; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] |
| -; GCN-NEXT: v_readlane_b32 s0, v3, 0 |
| +; GCN-NEXT: v_readlane_b32 s0, v40, 0 |
| ; GCN-NEXT: s_mov_b32 s13, s9 |
| ; GCN-NEXT: s_mov_b32 s12, s8 |
| -; GCN-NEXT: v_readlane_b32 s1, v3, 1 |
| +; GCN-NEXT: v_readlane_b32 s1, v40, 1 |
| ; GCN-NEXT: s_add_u32 s8, s0, 36 |
| ; GCN-NEXT: s_addc_u32 s9, s1, 0 |
| ; GCN-NEXT: s_getpc_b64 s[0:1] |
| diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir |
| index 542c756a6757..aadd9e79ff61 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir |
| @@ -1,5 +1,5 @@ |
| # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py |
| -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -start-before=si-lower-sgpr-spills -stop-after=prologepilog -o - %s | FileCheck %s |
| +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s |
| |
| # Check that we allocate 2 emergency stack slots if we're spilling |
| # SGPRs to memory and potentially have an offset larger than fits in |
| @@ -29,7 +29,7 @@ body: | |
| ; CHECK-NEXT: $sgpr6_sgpr7 = S_MOV_B64 $exec |
| ; CHECK-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr2 |
| ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) |
| - ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 $sgpr10, 0, undef $vgpr2 |
| + ; CHECK-NEXT: $vgpr2 = V_WRITELANE_B32 killed $sgpr10, 0, undef $vgpr2 |
| ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) |
| ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) |
| ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7, implicit killed $vgpr2 |
| diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll |
| index 03a538e975be..fc28fd757504 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll |
| @@ -16,10 +16,10 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { |
| ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s14, s33 |
| +; GCN-NEXT: s_mov_b32 s6, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill |
| +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| +; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill |
| @@ -133,20 +133,13 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { |
| ; GCN-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-NEXT: v_writelane_b32 v0, s30, 0 |
| -; GCN-NEXT: v_writelane_b32 v0, s31, 1 |
| -; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[12:13] |
| +; GCN-NEXT: v_writelane_b32 v255, s30, 0 |
| +; GCN-NEXT: v_writelane_b32 v255, s31, 1 |
| ; GCN-NEXT: v_mov_b32_e32 v0, 0 |
| ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:444 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[12:13] |
| ; GCN-NEXT: s_getpc_b64 s[4:5] |
| ; GCN-NEXT: s_add_u32 s4, s4, child_function@gotpcrel32@lo+4 |
| ; GCN-NEXT: s_addc_u32 s5, s5, child_function@gotpcrel32@hi+12 |
| @@ -157,8 +150,8 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { |
| ; GCN-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| -; GCN-NEXT: v_readlane_b32 s31, v0, 1 |
| -; GCN-NEXT: v_readlane_b32 s30, v0, 0 |
| +; GCN-NEXT: v_readlane_b32 s31, v255, 1 |
| +; GCN-NEXT: v_readlane_b32 s30, v255, 0 |
| ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| @@ -270,11 +263,11 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { |
| ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload |
| +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| +; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 |
| -; GCN-NEXT: s_mov_b32 s33, s14 |
| +; GCN-NEXT: s_mov_b32 s33, s6 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| %alloca = alloca i32, align 4, addrspace(5) |
| @@ -315,10 +308,10 @@ define void @spill_to_lowest_available_vgpr() #0 { |
| ; GCN-LABEL: spill_to_lowest_available_vgpr: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s14, s33 |
| +; GCN-NEXT: s_mov_b32 s6, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill |
| +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| +; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill |
| @@ -431,20 +424,13 @@ define void @spill_to_lowest_available_vgpr() #0 { |
| ; GCN-NEXT: buffer_store_dword v251, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v252, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-NEXT: v_writelane_b32 v0, s30, 0 |
| -; GCN-NEXT: v_writelane_b32 v0, s31, 1 |
| -; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[12:13] |
| +; GCN-NEXT: v_writelane_b32 v254, s30, 0 |
| +; GCN-NEXT: v_writelane_b32 v254, s31, 1 |
| ; GCN-NEXT: v_mov_b32_e32 v0, 0 |
| ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:440 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[12:13] |
| ; GCN-NEXT: s_getpc_b64 s[4:5] |
| ; GCN-NEXT: s_add_u32 s4, s4, child_function@gotpcrel32@lo+4 |
| ; GCN-NEXT: s_addc_u32 s5, s5, child_function@gotpcrel32@hi+12 |
| @@ -455,8 +441,8 @@ define void @spill_to_lowest_available_vgpr() #0 { |
| ; GCN-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| -; GCN-NEXT: v_readlane_b32 s31, v0, 1 |
| -; GCN-NEXT: v_readlane_b32 s30, v0, 0 |
| +; GCN-NEXT: v_readlane_b32 s31, v254, 1 |
| +; GCN-NEXT: v_readlane_b32 s30, v254, 0 |
| ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v251, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| @@ -567,11 +553,11 @@ define void @spill_to_lowest_available_vgpr() #0 { |
| ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:428 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload |
| +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| +; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 |
| -; GCN-NEXT: s_mov_b32 s33, s14 |
| +; GCN-NEXT: s_mov_b32 s33, s6 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| %alloca = alloca i32, align 4, addrspace(5) |
| @@ -612,8 +598,8 @@ define void @spill_sgpr_with_sgpr_uses() #0 { |
| ; GCN-LABEL: spill_sgpr_with_sgpr_uses: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill |
| +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| +; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill |
| @@ -733,18 +719,10 @@ define void @spill_sgpr_with_sgpr_uses() #0 { |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; def s4 |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-NEXT: v_writelane_b32 v0, s4, 0 |
| -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[8:9] |
| +; GCN-NEXT: v_writelane_b32 v254, s4, 0 |
| ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 |
| ; GCN-NEXT: ; %bb.1: ; %bb0 |
| -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[8:9] |
| -; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: v_readlane_b32 s4, v0, 0 |
| +; GCN-NEXT: v_readlane_b32 s4, v254, 0 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ; use s4 |
| ; GCN-NEXT: ;;#ASMEND |
| @@ -859,8 +837,8 @@ define void @spill_sgpr_with_sgpr_uses() #0 { |
| ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload |
| +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 |
| +; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| @@ -1183,8 +1161,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Spill |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill |
| @@ -1298,54 +1275,45 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in |
| ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s32 ; 4-byte Folded Spill |
| -; GCN-NEXT: ; implicit-def: $vgpr4 |
| ; GCN-NEXT: v_writelane_b32 v4, s34, 0 |
| ; GCN-NEXT: v_writelane_b32 v4, s35, 1 |
| ; GCN-NEXT: v_writelane_b32 v4, s36, 2 |
| ; GCN-NEXT: v_writelane_b32 v4, s37, 3 |
| -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 |
| -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[8:9] |
| ; GCN-NEXT: v_mov_b32_e32 v5, v3 |
| -; GCN-NEXT: v_mov_b32_e32 v3, v2 |
| -; GCN-NEXT: v_mov_b32_e32 v4, v1 |
| -; GCN-NEXT: v_mov_b32_e32 v1, v0 |
| -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[8:9] |
| +; GCN-NEXT: v_mov_b32_e32 v3, v1 |
| ; GCN-NEXT: ; implicit-def: $sgpr4 |
| ; GCN-NEXT: ; implicit-def: $sgpr4 |
| -; GCN-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $exec |
| -; GCN-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec |
| -; GCN-NEXT: v_mov_b32_e32 v2, v4 |
| +; GCN-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $exec |
| +; GCN-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec |
| +; GCN-NEXT: v_mov_b32_e32 v1, v3 |
| ; GCN-NEXT: ; implicit-def: $sgpr4 |
| ; GCN-NEXT: ; implicit-def: $sgpr4 |
| ; GCN-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec |
| -; GCN-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec |
| -; GCN-NEXT: v_mov_b32_e32 v4, v5 |
| +; GCN-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec |
| +; GCN-NEXT: v_mov_b32_e32 v3, v5 |
| ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| -; GCN-NEXT: flat_load_dwordx4 v[3:6], v[3:4] |
| +; GCN-NEXT: flat_load_dwordx4 v[5:8], v[2:3] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill |
| -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill |
| -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ;;#ASMEND |
| -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload |
| -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload |
| -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload |
| -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload |
| +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload |
| +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload |
| +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload |
| +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ;;#ASMEND |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: flat_store_dwordx4 v[1:2], v[3:6] |
| -; GCN-NEXT: v_readlane_b32 s37, v0, 3 |
| -; GCN-NEXT: v_readlane_b32 s36, v0, 2 |
| -; GCN-NEXT: v_readlane_b32 s35, v0, 1 |
| -; GCN-NEXT: v_readlane_b32 s34, v0, 0 |
| +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[5:8] |
| +; GCN-NEXT: v_readlane_b32 s37, v4, 3 |
| +; GCN-NEXT: v_readlane_b32 s36, v4, 2 |
| +; GCN-NEXT: v_readlane_b32 s35, v4, 1 |
| +; GCN-NEXT: v_readlane_b32 s34, v4, 0 |
| ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s32 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| @@ -1459,8 +1427,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in |
| ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:468 ; 4-byte Folded Reload |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:472 ; 4-byte Folded Reload |
| +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| @@ -1541,11 +1508,8 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { |
| ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s14, s33 |
| +; GCN-NEXT: s_mov_b32 s6, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 |
| ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill |
| @@ -1659,11 +1623,21 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { |
| ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-NEXT: v_writelane_b32 v0, s30, 0 |
| -; GCN-NEXT: v_writelane_b32 v0, s31, 1 |
| -; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill |
| +; GCN-NEXT: s_mov_b64 s[14:15], exec |
| +; GCN-NEXT: s_mov_b64 exec, 1 |
| +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:456 |
| +; GCN-NEXT: v_writelane_b32 v1, s30, 0 |
| +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:456 |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| +; GCN-NEXT: s_mov_b64 exec, s[14:15] |
| +; GCN-NEXT: s_mov_b64 s[12:13], exec |
| +; GCN-NEXT: s_mov_b64 exec, 1 |
| +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456 |
| +; GCN-NEXT: v_writelane_b32 v0, s31, 0 |
| +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456 |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: s_mov_b64 exec, s[12:13] |
| ; GCN-NEXT: s_getpc_b64 s[4:5] |
| ; GCN-NEXT: s_add_u32 s4, s4, child_function_ipra@rel32@lo+4 |
| @@ -1673,12 +1647,24 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { |
| ; GCN-NEXT: s_mov_b64 s[0:1], s[8:9] |
| ; GCN-NEXT: s_mov_b64 s[2:3], s[10:11] |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| -; GCN-NEXT: s_or_saveexec_b64 s[12:13], -1 |
| +; GCN-NEXT: s_mov_b64 s[8:9], exec |
| +; GCN-NEXT: s_mov_b64 exec, 1 |
| +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:456 |
| +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| +; GCN-NEXT: v_readlane_b32 s31, v1, 0 |
| +; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:456 |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| +; GCN-NEXT: s_mov_b64 exec, s[8:9] |
| +; GCN-NEXT: s_mov_b64 s[4:5], exec |
| +; GCN-NEXT: s_mov_b64 exec, 1 |
| +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:456 |
| ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[12:13] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: v_readlane_b32 s31, v0, 1 |
| ; GCN-NEXT: v_readlane_b32 s30, v0, 0 |
| +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:456 |
| +; GCN-NEXT: s_waitcnt vmcnt(0) |
| +; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| @@ -1791,11 +1777,8 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { |
| ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 |
| -; GCN-NEXT: s_mov_b32 s33, s14 |
| +; GCN-NEXT: s_mov_b32 s33, s6 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| call void @child_function_ipra() |
| diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll |
| index 5eb0ec734cf2..f82b9e4637ba 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll |
| @@ -6,16 +6,16 @@ |
| ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 |
| |
| ; Make sure we are handling hazards correctly. |
| -; SGPR: v_mov_b32_e32 v0, vcc_lo |
| -; SGPR-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 |
| -; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; 4-byte Folded Reload |
| -; SGPR-NEXT: s_mov_b64 exec, [[EXEC_COPY]] |
| +; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 |
| ; SGPR-NEXT: s_waitcnt vmcnt(0) |
| ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0 |
| ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 |
| ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 |
| ; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 |
| -; SGPR-NEXT: s_nop 4 |
| +; SGPR-NEXT: buffer_load_dword [[VHI]], off, s[96:99], 0 |
| +; SGPR-NEXT: s_waitcnt vmcnt(0) |
| +; SGPR-NEXT: s_mov_b64 exec, s[4:5] |
| +; SGPR-NEXT: s_nop 1 |
| ; SGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 |
| |
| ; ALL: s_endpgm |
| diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll |
| index c178fa917476..1c426e281df3 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll |
| @@ -212,15 +212,15 @@ entry: |
| ; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 |
| |
| ; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0 |
| -; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1 |
| |
| |
| ; GCN: s_swappc_b64 |
| |
| -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| |
| ; GCN: s_getpc_b64 s[4:5] |
| ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll |
| index 601c36f3146a..8d49223215ca 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll |
| @@ -2,22 +2,20 @@ |
| |
| ; GCN-LABEL: {{^}}spill_csr_s5_copy: |
| ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 |
| -; GCN: s_xor_saveexec_b64 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, -1 |
| -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| +; GCN: s_or_saveexec_b64 |
| +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec |
| -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2 |
| +; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 |
| ; GCN: s_swappc_b64 |
| |
| ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 |
| ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} |
| |
| -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 |
| -; GCN: s_xor_saveexec_b64 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, -1 |
| -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 |
| +; GCN: s_or_saveexec_b64 |
| +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| ; GCN: s_mov_b64 exec |
| ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] |
| ; GCN: s_setpc_b64 |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll |
| index 65292eb96c69..67ed07878b7f 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll |
| @@ -117,16 +117,16 @@ define void @test_sgpr_offset_function_scavenge_fail_func() #2 { |
| ; MUBUF-NEXT: ;;#ASMEND |
| ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 glc |
| ; MUBUF-NEXT: s_waitcnt vmcnt(0) |
| -; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100 |
| -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill |
| +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 |
| +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill |
| ; MUBUF-NEXT: ;;#ASMSTART |
| ; MUBUF-NEXT: ;;#ASMEND |
| ; MUBUF-NEXT: ;;#ASMSTART |
| ; MUBUF-NEXT: ;;#ASMEND |
| ; MUBUF-NEXT: ;;#ASMSTART |
| ; MUBUF-NEXT: ;;#ASMEND |
| -; MUBUF-NEXT: s_add_i32 s10, s32, 0x40100 |
| -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload |
| +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 |
| +; MUBUF-NEXT: buffer_load_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Reload |
| ; MUBUF-NEXT: s_waitcnt vmcnt(0) |
| ; MUBUF-NEXT: ;;#ASMSTART |
| ; MUBUF-NEXT: ;;#ASMEND |
| @@ -199,15 +199,16 @@ define amdgpu_kernel void @test_sgpr_offset_function_scavenge_fail_kernel() #3 { |
| ; MUBUF-NEXT: ;;#ASMEND |
| ; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 glc |
| ; MUBUF-NEXT: s_waitcnt vmcnt(0) |
| -; MUBUF-NEXT: s_mov_b32 s10, 0x40100 |
| -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s10 ; 4-byte Folded Spill |
| +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 |
| +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; 4-byte Folded Spill |
| ; MUBUF-NEXT: ;;#ASMSTART |
| ; MUBUF-NEXT: ;;#ASMEND |
| ; MUBUF-NEXT: ;;#ASMSTART |
| ; MUBUF-NEXT: ;;#ASMEND |
| ; MUBUF-NEXT: ;;#ASMSTART |
| ; MUBUF-NEXT: ;;#ASMEND |
| -; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s10 ; 4-byte Folded Reload |
| +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1004 |
| +; MUBUF-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen ; 4-byte Folded Reload |
| ; MUBUF-NEXT: s_waitcnt vmcnt(0) |
| ; MUBUF-NEXT: ;;#ASMSTART |
| ; MUBUF-NEXT: ;;#ASMEND |
| @@ -638,5 +639,5 @@ entry: |
| |
| attributes #0 = { nounwind } |
| attributes #1 = { nounwind "amdgpu-num-sgpr"="17" "amdgpu-num-vgpr"="8" } |
| -attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } |
| -attributes #3 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" } |
| +attributes #2 = { nounwind "amdgpu-num-sgpr"="14" "amdgpu-num-vgpr"="8" } |
| +attributes #3 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir |
| index c16342579401..a4400670ab55 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir |
| @@ -1,5 +1,5 @@ |
| # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py |
| -# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s |
| +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-lower-sgpr-spills,prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s |
| |
| # Make sure the initial first $sgpr1 = COPY $sgpr2 copy is not deleted |
| # by the copy propagation after lowering the spill. |
| @@ -26,12 +26,11 @@ body: | |
| ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) |
| ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 |
| ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 |
| - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF |
| - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| - ; GCN-NEXT: dead renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| - ; GCN-NEXT: renamable $sgpr8 = COPY renamable $sgpr1 |
| + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| + ; GCN-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr1 |
| ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec |
| ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) |
| ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 |
| @@ -64,11 +63,10 @@ body: | |
| ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) |
| ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 |
| ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 |
| - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF |
| - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| - ; GCN-NEXT: dead renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr3, 3, $vgpr0, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 |
| ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec |
| ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) |
| ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 |
| @@ -95,12 +93,12 @@ body: | |
| ; GCN-LABEL: name: spill_vgpr128_use_subreg |
| ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 |
| ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec |
| + ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2 |
| ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) |
| ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 4, addrspace 5) |
| ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 8, addrspace 5) |
| ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) |
| - ; GCN-NEXT: renamable $vgpr8 = COPY $vgpr2, implicit $exec |
| + ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr1 |
| ; GCN-NEXT: S_ENDPGM 0, implicit $vgpr8 |
| renamable $vgpr1 = COPY $vgpr2 |
| SI_SPILL_V128_SAVE renamable $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) |
| @@ -125,11 +123,11 @@ body: | |
| ; GCN-LABEL: name: spill_vgpr128_use_kill |
| ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 |
| ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec |
| - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) |
| - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 4, addrspace 5) |
| - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 8, addrspace 5) |
| - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) |
| + ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2 |
| + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) |
| + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 4, addrspace 5) |
| + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 8, addrspace 5) |
| + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) |
| ; GCN-NEXT: S_ENDPGM 0 |
| renamable $vgpr1 = COPY $vgpr2 |
| SI_SPILL_V128_SAVE renamable killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll |
| index 2aba719a03a5..db8b2c4371c3 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll |
| @@ -10085,25 +10085,16 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 |
| ; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v5, -1, v0 |
| ; GFX6-NEXT: v_mov_b32_e32 v6, 0 |
| -; GFX6-NEXT: s_mov_b32 s6, 0 |
| -; GFX6-NEXT: s_mov_b32 s7, 0xf000 |
| +; GFX6-NEXT: s_mov_b32 s38, 0 |
| +; GFX6-NEXT: s_mov_b32 s39, 0xf000 |
| ; GFX6-NEXT: s_waitcnt lgkmcnt(0) |
| -; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] |
| +; GFX6-NEXT: s_mov_b64 s[36:37], s[2:3] |
| ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 8, v5 |
| ; GFX6-NEXT: v_mov_b32_e32 v8, v6 |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:240 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:240 |
| ; GFX6-NEXT: s_addc_u32 s41, s41, 0 |
| -; GFX6-NEXT: s_mov_b32 s2, 0x83800 |
| -; GFX6-NEXT: s_mov_b64 s[34:35], exec |
| -; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| -; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill |
| -; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| -; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| -; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:224 |
| ; GFX6-NEXT: s_mov_b32 s2, 0x83400 |
| +; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| @@ -10111,7 +10102,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:208 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:224 |
| ; GFX6-NEXT: s_mov_b32 s2, 0x83000 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| @@ -10120,7 +10111,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:192 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:208 |
| ; GFX6-NEXT: s_mov_b32 s2, 0x82c00 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| @@ -10129,7 +10120,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:176 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:192 |
| ; GFX6-NEXT: s_mov_b32 s2, 0x82800 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| @@ -10138,7 +10129,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:160 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:176 |
| ; GFX6-NEXT: s_mov_b32 s2, 0x82400 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| @@ -10147,7 +10138,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:144 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:160 |
| ; GFX6-NEXT: s_mov_b32 s2, 0x82000 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| @@ -10156,7 +10147,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:128 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:144 |
| ; GFX6-NEXT: s_mov_b32 s2, 0x81c00 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| @@ -10165,7 +10156,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:112 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:128 |
| ; GFX6-NEXT: s_mov_b32 s2, 0x81800 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| @@ -10174,7 +10165,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:96 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:112 |
| ; GFX6-NEXT: s_mov_b32 s2, 0x81400 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| @@ -10183,7 +10174,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:80 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:96 |
| ; GFX6-NEXT: s_mov_b32 s2, 0x81000 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| @@ -10192,8 +10183,8 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 offset:64 |
| -; GFX6-NEXT: s_mov_b32 s2, 0x80800 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:80 |
| +; GFX6-NEXT: s_mov_b32 s2, 0x80c00 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| @@ -10201,32 +10192,26 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[4:7], 0 addr64 |
| -; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[4:7], 0 addr64 offset:16 |
| -; GFX6-NEXT: s_mov_b32 s2, 0x80c00 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 offset:64 |
| +; GFX6-NEXT: s_mov_b32 s2, 0x80400 |
| +; GFX6-NEXT: s_waitcnt vmcnt(0) |
| +; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| +; GFX6-NEXT: s_waitcnt vmcnt(0) |
| +; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill |
| +; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| +; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| +; GFX6-NEXT: s_waitcnt expcnt(0) |
| +; GFX6-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[36:39], 0 addr64 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[9:12], v[7:8], s[36:39], 0 addr64 offset:16 |
| +; GFX6-NEXT: s_mov_b32 s2, 0x80800 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s2 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v11, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v12, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| -; GFX6-NEXT: buffer_load_dwordx4 v[13:16], v[7:8], s[4:7], 0 addr64 offset:32 |
| -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] |
| -; GFX6-NEXT: s_waitcnt expcnt(3) |
| -; GFX6-NEXT: s_mov_b64 exec, 15 |
| -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 |
| -; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: v_writelane_b32 v9, s0, 0 |
| -; GFX6-NEXT: v_writelane_b32 v9, s1, 1 |
| -; GFX6-NEXT: v_writelane_b32 v9, s2, 2 |
| -; GFX6-NEXT: v_writelane_b32 v9, s3, 3 |
| -; GFX6-NEXT: s_mov_b32 s8, 0x80400 |
| -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s8 ; 4-byte Folded Spill |
| -; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 |
| -; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: s_mov_b64 exec, s[34:35] |
| -; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[4:7], 0 addr64 offset:48 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[13:16], v[7:8], s[36:39], 0 addr64 offset:32 |
| +; GFX6-NEXT: buffer_load_dwordx4 v[17:20], v[7:8], s[36:39], 0 addr64 offset:48 |
| ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 13, v0 |
| ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 16, v4 |
| ; GFX6-NEXT: v_mov_b32_e32 v7, 1 |
| @@ -10234,7 +10219,6 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: ;;#ASMSTART |
| ; GFX6-NEXT: ; def s[4:11] |
| ; GFX6-NEXT: ;;#ASMEND |
| -; GFX6-NEXT: s_mov_b64 s[36:37], exec |
| ; GFX6-NEXT: s_mov_b64 exec, 0xff |
| ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| @@ -10246,12 +10230,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: v_writelane_b32 v4, s9, 5 |
| ; GFX6-NEXT: v_writelane_b32 v4, s10, 6 |
| ; GFX6-NEXT: v_writelane_b32 v4, s11, 7 |
| -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 |
| +; GFX6-NEXT: s_mov_b32 s2, 0x83800 |
| ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s2 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: s_mov_b64 exec, s[36:37] |
| +; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 |
| ; GFX6-NEXT: ;;#ASMSTART |
| ; GFX6-NEXT: ; def s[8:15] |
| @@ -10269,211 +10253,272 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: ; def s[2:3] |
| ; GFX6-NEXT: ;;#ASMEND |
| ; GFX6-NEXT: ;;#ASMSTART |
| +; GFX6-NEXT: ; def s[36:37] |
| +; GFX6-NEXT: ;;#ASMEND |
| +; GFX6-NEXT: ;;#ASMSTART |
| ; GFX6-NEXT: ; def s33 |
| ; GFX6-NEXT: ;;#ASMEND |
| ; GFX6-NEXT: s_and_saveexec_b64 s[34:35], vcc |
| ; GFX6-NEXT: s_cbranch_execz .LBB1_2 |
| ; GFX6-NEXT: ; %bb.1: ; %bb0 |
| -; GFX6-NEXT: s_mov_b64 s[38:39], exec |
| +; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| +; GFX6-NEXT: s_mov_b64 exec, 0xff |
| +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 |
| +; GFX6-NEXT: s_waitcnt expcnt(0) |
| +; GFX6-NEXT: v_writelane_b32 v9, s8, 0 |
| +; GFX6-NEXT: v_writelane_b32 v9, s9, 1 |
| +; GFX6-NEXT: v_writelane_b32 v9, s10, 2 |
| +; GFX6-NEXT: v_writelane_b32 v9, s11, 3 |
| +; GFX6-NEXT: v_writelane_b32 v9, s12, 4 |
| +; GFX6-NEXT: v_writelane_b32 v9, s13, 5 |
| +; GFX6-NEXT: v_writelane_b32 v9, s14, 6 |
| +; GFX6-NEXT: v_writelane_b32 v9, s15, 7 |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2100 |
| +; GFX6-NEXT: buffer_store_dword v9, v4, s[40:43], 0 offen ; 4-byte Folded Spill |
| +; GFX6-NEXT: s_waitcnt expcnt(0) |
| +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 |
| +; GFX6-NEXT: s_waitcnt vmcnt(0) |
| +; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| +; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| +; GFX6-NEXT: s_mov_b64 exec, 0xff |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x20e0 |
| +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 |
| +; GFX6-NEXT: s_waitcnt expcnt(0) |
| +; GFX6-NEXT: buffer_load_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Reload |
| +; GFX6-NEXT: s_waitcnt vmcnt(0) |
| +; GFX6-NEXT: v_readlane_b32 s8, v8, 0 |
| +; GFX6-NEXT: v_readlane_b32 s9, v8, 1 |
| +; GFX6-NEXT: v_readlane_b32 s10, v8, 2 |
| +; GFX6-NEXT: v_readlane_b32 s11, v8, 3 |
| +; GFX6-NEXT: v_readlane_b32 s12, v8, 4 |
| +; GFX6-NEXT: v_readlane_b32 s13, v8, 5 |
| +; GFX6-NEXT: v_readlane_b32 s14, v8, 6 |
| +; GFX6-NEXT: v_readlane_b32 s15, v8, 7 |
| +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 |
| +; GFX6-NEXT: s_waitcnt vmcnt(0) |
| +; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| +; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| ; GFX6-NEXT: s_mov_b64 exec, 0xff |
| ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: v_writelane_b32 v7, s8, 0 |
| -; GFX6-NEXT: v_writelane_b32 v7, s9, 1 |
| -; GFX6-NEXT: v_writelane_b32 v7, s10, 2 |
| -; GFX6-NEXT: v_writelane_b32 v7, s11, 3 |
| -; GFX6-NEXT: v_writelane_b32 v7, s12, 4 |
| -; GFX6-NEXT: v_writelane_b32 v7, s13, 5 |
| -; GFX6-NEXT: v_writelane_b32 v7, s14, 6 |
| -; GFX6-NEXT: v_writelane_b32 v7, s15, 7 |
| -; GFX6-NEXT: s_mov_b32 s36, 0x84400 |
| -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s36 ; 4-byte Folded Spill |
| +; GFX6-NEXT: v_writelane_b32 v7, s16, 0 |
| +; GFX6-NEXT: v_writelane_b32 v7, s17, 1 |
| +; GFX6-NEXT: v_writelane_b32 v7, s18, 2 |
| +; GFX6-NEXT: v_writelane_b32 v7, s19, 3 |
| +; GFX6-NEXT: v_writelane_b32 v7, s20, 4 |
| +; GFX6-NEXT: v_writelane_b32 v7, s21, 5 |
| +; GFX6-NEXT: v_writelane_b32 v7, s22, 6 |
| +; GFX6-NEXT: v_writelane_b32 v7, s23, 7 |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2120 |
| +; GFX6-NEXT: buffer_store_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: s_mov_b64 exec, s[38:39] |
| +; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| ; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| ; GFX6-NEXT: s_mov_b64 exec, 0xff |
| -; GFX6-NEXT: s_mov_b32 s36, 0x83c00 |
| -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2100 |
| +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s36 ; 4-byte Folded Reload |
| +; GFX6-NEXT: buffer_load_dword v9, v4, s[40:43], 0 offen ; 4-byte Folded Reload |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: v_readlane_b32 s8, v4, 0 |
| -; GFX6-NEXT: v_readlane_b32 s9, v4, 1 |
| -; GFX6-NEXT: v_readlane_b32 s10, v4, 2 |
| -; GFX6-NEXT: v_readlane_b32 s11, v4, 3 |
| -; GFX6-NEXT: v_readlane_b32 s12, v4, 4 |
| -; GFX6-NEXT: v_readlane_b32 s13, v4, 5 |
| -; GFX6-NEXT: v_readlane_b32 s14, v4, 6 |
| -; GFX6-NEXT: v_readlane_b32 s15, v4, 7 |
| -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 |
| +; GFX6-NEXT: v_readlane_b32 s16, v9, 0 |
| +; GFX6-NEXT: v_readlane_b32 s17, v9, 1 |
| +; GFX6-NEXT: v_readlane_b32 s18, v9, 2 |
| +; GFX6-NEXT: v_readlane_b32 s19, v9, 3 |
| +; GFX6-NEXT: v_readlane_b32 s20, v9, 4 |
| +; GFX6-NEXT: v_readlane_b32 s21, v9, 5 |
| +; GFX6-NEXT: v_readlane_b32 s22, v9, 6 |
| +; GFX6-NEXT: v_readlane_b32 s23, v9, 7 |
| +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| -; GFX6-NEXT: s_mov_b64 s[38:39], exec |
| +; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| ; GFX6-NEXT: s_mov_b64 exec, 0xff |
| ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: v_writelane_b32 v8, s16, 0 |
| -; GFX6-NEXT: v_writelane_b32 v8, s17, 1 |
| -; GFX6-NEXT: v_writelane_b32 v8, s18, 2 |
| -; GFX6-NEXT: v_writelane_b32 v8, s19, 3 |
| -; GFX6-NEXT: v_writelane_b32 v8, s20, 4 |
| -; GFX6-NEXT: v_writelane_b32 v8, s21, 5 |
| -; GFX6-NEXT: v_writelane_b32 v8, s22, 6 |
| -; GFX6-NEXT: v_writelane_b32 v8, s23, 7 |
| -; GFX6-NEXT: s_mov_b32 s36, 0x84c00 |
| -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s36 ; 4-byte Folded Spill |
| +; GFX6-NEXT: v_writelane_b32 v8, s24, 0 |
| +; GFX6-NEXT: v_writelane_b32 v8, s25, 1 |
| +; GFX6-NEXT: v_writelane_b32 v8, s26, 2 |
| +; GFX6-NEXT: v_writelane_b32 v8, s27, 3 |
| +; GFX6-NEXT: v_writelane_b32 v8, s28, 4 |
| +; GFX6-NEXT: v_writelane_b32 v8, s29, 5 |
| +; GFX6-NEXT: v_writelane_b32 v8, s30, 6 |
| +; GFX6-NEXT: v_writelane_b32 v8, s31, 7 |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2140 |
| +; GFX6-NEXT: buffer_store_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: s_mov_b64 exec, s[38:39] |
| +; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| ; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| ; GFX6-NEXT: s_mov_b64 exec, 0xff |
| -; GFX6-NEXT: s_mov_b32 s36, 0x84400 |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2120 |
| ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s36 ; 4-byte Folded Reload |
| +; GFX6-NEXT: buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: v_readlane_b32 s16, v7, 0 |
| -; GFX6-NEXT: v_readlane_b32 s17, v7, 1 |
| -; GFX6-NEXT: v_readlane_b32 s18, v7, 2 |
| -; GFX6-NEXT: v_readlane_b32 s19, v7, 3 |
| -; GFX6-NEXT: v_readlane_b32 s20, v7, 4 |
| -; GFX6-NEXT: v_readlane_b32 s21, v7, 5 |
| -; GFX6-NEXT: v_readlane_b32 s22, v7, 6 |
| -; GFX6-NEXT: v_readlane_b32 s23, v7, 7 |
| +; GFX6-NEXT: v_readlane_b32 s24, v7, 0 |
| +; GFX6-NEXT: v_readlane_b32 s25, v7, 1 |
| +; GFX6-NEXT: v_readlane_b32 s26, v7, 2 |
| +; GFX6-NEXT: v_readlane_b32 s27, v7, 3 |
| +; GFX6-NEXT: v_readlane_b32 s28, v7, 4 |
| +; GFX6-NEXT: v_readlane_b32 s29, v7, 5 |
| +; GFX6-NEXT: v_readlane_b32 s30, v7, 6 |
| +; GFX6-NEXT: v_readlane_b32 s31, v7, 7 |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| -; GFX6-NEXT: s_mov_b64 s[38:39], exec |
| -; GFX6-NEXT: s_mov_b64 exec, 0xff |
| -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 |
| +; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| +; GFX6-NEXT: s_mov_b64 exec, 15 |
| +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: v_writelane_b32 v4, s24, 0 |
| -; GFX6-NEXT: v_writelane_b32 v4, s25, 1 |
| -; GFX6-NEXT: v_writelane_b32 v4, s26, 2 |
| -; GFX6-NEXT: v_writelane_b32 v4, s27, 3 |
| -; GFX6-NEXT: v_writelane_b32 v4, s28, 4 |
| -; GFX6-NEXT: v_writelane_b32 v4, s29, 5 |
| -; GFX6-NEXT: v_writelane_b32 v4, s30, 6 |
| -; GFX6-NEXT: v_writelane_b32 v4, s31, 7 |
| -; GFX6-NEXT: s_mov_b32 s36, 0x85400 |
| -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s36 ; 4-byte Folded Spill |
| +; GFX6-NEXT: v_writelane_b32 v10, s0, 0 |
| +; GFX6-NEXT: v_writelane_b32 v10, s1, 1 |
| +; GFX6-NEXT: v_writelane_b32 v10, s2, 2 |
| +; GFX6-NEXT: v_writelane_b32 v10, s3, 3 |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2160 |
| +; GFX6-NEXT: buffer_store_dword v10, v4, s[40:43], 0 offen ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 |
| +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: s_mov_b64 exec, s[38:39] |
| +; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| ; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| -; GFX6-NEXT: s_mov_b64 exec, 0xff |
| -; GFX6-NEXT: s_mov_b32 s36, 0x84c00 |
| -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 |
| +; GFX6-NEXT: s_mov_b64 exec, 15 |
| +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 |
| +; GFX6-NEXT: s_waitcnt expcnt(0) |
| +; GFX6-NEXT: v_writelane_b32 v8, s4, 0 |
| +; GFX6-NEXT: v_writelane_b32 v8, s5, 1 |
| +; GFX6-NEXT: v_writelane_b32 v8, s6, 2 |
| +; GFX6-NEXT: v_writelane_b32 v8, s7, 3 |
| +; GFX6-NEXT: s_mov_b32 s0, 0x85c00 |
| +; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s0 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s36 ; 4-byte Folded Reload |
| +; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: v_readlane_b32 s24, v9, 0 |
| -; GFX6-NEXT: v_readlane_b32 s25, v9, 1 |
| -; GFX6-NEXT: v_readlane_b32 s26, v9, 2 |
| -; GFX6-NEXT: v_readlane_b32 s27, v9, 3 |
| -; GFX6-NEXT: v_readlane_b32 s28, v9, 4 |
| -; GFX6-NEXT: v_readlane_b32 s29, v9, 5 |
| -; GFX6-NEXT: v_readlane_b32 s30, v9, 6 |
| -; GFX6-NEXT: v_readlane_b32 s31, v9, 7 |
| +; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| +; GFX6-NEXT: s_mov_b64 s[0:1], exec |
| +; GFX6-NEXT: s_mov_b64 exec, 3 |
| +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 |
| +; GFX6-NEXT: s_waitcnt expcnt(0) |
| +; GFX6-NEXT: v_writelane_b32 v9, s2, 0 |
| +; GFX6-NEXT: v_writelane_b32 v9, s3, 1 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x86600 |
| +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], s4 ; 4-byte Folded Spill |
| +; GFX6-NEXT: s_waitcnt expcnt(0) |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| +; GFX6-NEXT: s_mov_b64 exec, s[0:1] |
| +; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| +; GFX6-NEXT: s_mov_b64 exec, 0xff |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2140 |
| +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 |
| +; GFX6-NEXT: s_waitcnt expcnt(0) |
| +; GFX6-NEXT: buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload |
| +; GFX6-NEXT: s_waitcnt vmcnt(0) |
| +; GFX6-NEXT: v_readlane_b32 s0, v7, 0 |
| +; GFX6-NEXT: v_readlane_b32 s1, v7, 1 |
| +; GFX6-NEXT: v_readlane_b32 s2, v7, 2 |
| +; GFX6-NEXT: v_readlane_b32 s3, v7, 3 |
| +; GFX6-NEXT: v_readlane_b32 s4, v7, 4 |
| +; GFX6-NEXT: v_readlane_b32 s5, v7, 5 |
| +; GFX6-NEXT: v_readlane_b32 s6, v7, 6 |
| +; GFX6-NEXT: v_readlane_b32 s7, v7, 7 |
| +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 |
| +; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| -; GFX6-NEXT: s_mov_b64 s[36:37], exec |
| +; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| ; GFX6-NEXT: s_mov_b64 exec, 15 |
| ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: v_writelane_b32 v8, s0, 0 |
| -; GFX6-NEXT: v_writelane_b32 v8, s1, 1 |
| -; GFX6-NEXT: v_writelane_b32 v8, s2, 2 |
| -; GFX6-NEXT: v_writelane_b32 v8, s3, 3 |
| -; GFX6-NEXT: s_mov_b32 s38, 0x85c00 |
| -; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], s38 ; 4-byte Folded Spill |
| +; GFX6-NEXT: v_writelane_b32 v8, s36, 0 |
| +; GFX6-NEXT: v_writelane_b32 v8, s37, 1 |
| +; GFX6-NEXT: v_writelane_b32 v8, s38, 2 |
| +; GFX6-NEXT: v_writelane_b32 v8, s39, 3 |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2180 |
| +; GFX6-NEXT: buffer_store_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: s_mov_b64 exec, s[36:37] |
| +; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| ; GFX6-NEXT: s_mov_b64 s[38:39], exec |
| -; GFX6-NEXT: s_mov_b64 exec, 15 |
| -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 |
| +; GFX6-NEXT: s_mov_b64 exec, 3 |
| +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: v_writelane_b32 v4, s4, 0 |
| -; GFX6-NEXT: v_writelane_b32 v4, s5, 1 |
| -; GFX6-NEXT: v_writelane_b32 v4, s6, 2 |
| -; GFX6-NEXT: v_writelane_b32 v4, s7, 3 |
| -; GFX6-NEXT: s_mov_b32 s0, 0x86000 |
| -; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], s0 ; 4-byte Folded Spill |
| +; GFX6-NEXT: v_writelane_b32 v10, s36, 0 |
| +; GFX6-NEXT: v_writelane_b32 v10, s37, 1 |
| +; GFX6-NEXT: s_mov_b32 s44, 0x86400 |
| +; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], s44 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 |
| +; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: s_mov_b64 exec, s[38:39] |
| ; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| -; GFX6-NEXT: s_mov_b64 exec, 3 |
| -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 |
| -; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: v_writelane_b32 v7, s2, 0 |
| -; GFX6-NEXT: v_writelane_b32 v7, s3, 1 |
| -; GFX6-NEXT: s_mov_b32 s0, 0x86400 |
| -; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], s0 ; 4-byte Folded Spill |
| +; GFX6-NEXT: s_mov_b64 exec, 15 |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2170 |
| +; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 |
| +; GFX6-NEXT: buffer_load_dword v9, v4, s[40:43], 0 offen ; 4-byte Folded Reload |
| +; GFX6-NEXT: s_waitcnt vmcnt(0) |
| +; GFX6-NEXT: v_readlane_b32 s36, v9, 0 |
| +; GFX6-NEXT: v_readlane_b32 s37, v9, 1 |
| +; GFX6-NEXT: v_readlane_b32 s38, v9, 2 |
| +; GFX6-NEXT: v_readlane_b32 s39, v9, 3 |
| +; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| -; GFX6-NEXT: s_mov_b64 s[36:37], exec |
| -; GFX6-NEXT: s_mov_b64 exec, 0xff |
| -; GFX6-NEXT: s_mov_b32 s38, 0x85400 |
| -; GFX6-NEXT: buffer_store_dword v9, off, s[40:43], 0 |
| +; GFX6-NEXT: s_not_b64 exec, exec |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2190 |
| +; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s38 ; 4-byte Folded Reload |
| +; GFX6-NEXT: buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload |
| +; GFX6-NEXT: s_not_b64 exec, exec |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2190 |
| +; GFX6-NEXT: buffer_load_dword v7, v4, s[40:43], 0 offen ; 4-byte Folded Reload |
| +; GFX6-NEXT: s_not_b64 exec, exec |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: v_readlane_b32 s0, v9, 0 |
| -; GFX6-NEXT: v_readlane_b32 s1, v9, 1 |
| -; GFX6-NEXT: v_readlane_b32 s2, v9, 2 |
| -; GFX6-NEXT: v_readlane_b32 s3, v9, 3 |
| -; GFX6-NEXT: v_readlane_b32 s4, v9, 4 |
| -; GFX6-NEXT: v_readlane_b32 s5, v9, 5 |
| -; GFX6-NEXT: v_readlane_b32 s6, v9, 6 |
| -; GFX6-NEXT: v_readlane_b32 s7, v9, 7 |
| -; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], 0 |
| +; GFX6-NEXT: v_readlane_b32 s44, v7, 0 |
| +; GFX6-NEXT: v_readlane_b32 s45, v7, 1 |
| +; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| -; GFX6-NEXT: s_mov_b64 exec, 15 |
| -; GFX6-NEXT: v_mov_b32_e32 v4, 0x2180 |
| +; GFX6-NEXT: s_not_b64 exec, exec |
| +; GFX6-NEXT: s_mov_b64 vcc, s[34:35] |
| +; GFX6-NEXT: s_not_b64 exec, exec |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2198 |
| ; GFX6-NEXT: buffer_store_dword v8, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| ; GFX6-NEXT: buffer_load_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Reload |
| +; GFX6-NEXT: s_not_b64 exec, exec |
| +; GFX6-NEXT: v_mov_b32_e32 v4, 0x2198 |
| +; GFX6-NEXT: buffer_load_dword v8, v4, s[40:43], 0 offen ; 4-byte Folded Reload |
| +; GFX6-NEXT: s_not_b64 exec, exec |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: v_readlane_b32 s36, v8, 0 |
| -; GFX6-NEXT: v_readlane_b32 s37, v8, 1 |
| -; GFX6-NEXT: v_readlane_b32 s38, v8, 2 |
| -; GFX6-NEXT: v_readlane_b32 s39, v8, 3 |
| +; GFX6-NEXT: v_readlane_b32 s34, v8, 0 |
| +; GFX6-NEXT: v_readlane_b32 s35, v8, 1 |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| -; GFX6-NEXT: s_mov_b64 vcc, s[34:35] |
| -; GFX6-NEXT: s_mov_b64 s[44:45], exec |
| -; GFX6-NEXT: s_mov_b64 exec, 3 |
| -; GFX6-NEXT: v_mov_b32_e32 v7, 0x2190 |
| +; GFX6-NEXT: s_not_b64 exec, exec |
| +; GFX6-NEXT: ;;#ASMSTART |
| +; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35],s[44:45] |
| +; GFX6-NEXT: ;;#ASMEND |
| +; GFX6-NEXT: s_mov_b64 s[34:35], vcc |
| +; GFX6-NEXT: s_mov_b64 s[8:9], exec |
| +; GFX6-NEXT: s_mov_b64 exec, 15 |
| +; GFX6-NEXT: s_mov_b32 s0, 0x86000 |
| ; GFX6-NEXT: buffer_store_dword v4, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dword v4, v7, s[40:43], 0 offen ; 4-byte Folded Reload |
| +; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], s0 ; 4-byte Folded Reload |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: v_readlane_b32 s34, v4, 0 |
| -; GFX6-NEXT: v_readlane_b32 s35, v4, 1 |
| +; GFX6-NEXT: v_readlane_b32 s36, v4, 0 |
| +; GFX6-NEXT: v_readlane_b32 s37, v4, 1 |
| +; GFX6-NEXT: v_readlane_b32 s38, v4, 2 |
| +; GFX6-NEXT: v_readlane_b32 s39, v4, 3 |
| ; GFX6-NEXT: buffer_load_dword v4, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: s_mov_b64 exec, s[44:45] |
| -; GFX6-NEXT: ;;#ASMSTART |
| -; GFX6-NEXT: ; use s[8:15],s[16:23],s[24:31],s[0:7],s[36:39],s[34:35] |
| -; GFX6-NEXT: ;;#ASMEND |
| -; GFX6-NEXT: s_mov_b64 s[34:35], vcc |
| +; GFX6-NEXT: s_mov_b64 exec, s[8:9] |
| ; GFX6-NEXT: s_mov_b64 s[4:5], exec |
| ; GFX6-NEXT: s_mov_b64 exec, 15 |
| -; GFX6-NEXT: s_mov_b32 s6, 0x85c00 |
| +; GFX6-NEXT: s_mov_b32 s6, 0x85800 |
| ; GFX6-NEXT: buffer_store_dword v7, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s6 ; 4-byte Folded Reload |
| @@ -10485,19 +10530,19 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], 0 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: s_mov_b64 exec, s[4:5] |
| -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 |
| +; GFX6-NEXT: s_mov_b32 s2, 0x83800 |
| ; GFX6-NEXT: buffer_store_dword v0, off, s[40:43], s2 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v3, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| -; GFX6-NEXT: s_mov_b32 s2, 0x84400 |
| +; GFX6-NEXT: s_mov_b32 s2, 0x84000 |
| ; GFX6-NEXT: buffer_store_dword v13, off, s[40:43], s2 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Spill |
| ; GFX6-NEXT: buffer_store_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Spill |
| -; GFX6-NEXT: s_mov_b32 s2, 0x84c00 |
| +; GFX6-NEXT: s_mov_b32 s2, 0x84800 |
| ; GFX6-NEXT: buffer_store_dword v17, off, s[40:43], s2 ; 4-byte Folded Spill |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Spill |
| @@ -10510,12 +10555,12 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v18, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v19, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v20, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s2, 0x84400 |
| +; GFX6-NEXT: s_mov_b32 s2, 0x84000 |
| ; GFX6-NEXT: buffer_load_dword v13, off, s[40:43], s2 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v14, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v15, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v16, off, s[40:43], s2 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s2, 0x83c00 |
| +; GFX6-NEXT: s_mov_b32 s2, 0x83800 |
| ; GFX6-NEXT: buffer_load_dword v0, off, s[40:43], s2 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v1, off, s[40:43], s2 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v2, off, s[40:43], s2 offset:8 ; 4-byte Folded Reload |
| @@ -10534,28 +10579,14 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: ;;#ASMEND |
| ; GFX6-NEXT: .LBB1_2: ; %ret |
| ; GFX6-NEXT: s_or_b64 exec, exec, s[34:35] |
| -; GFX6-NEXT: s_mov_b64 s[8:9], exec |
| -; GFX6-NEXT: s_mov_b64 exec, 15 |
| -; GFX6-NEXT: s_mov_b32 s2, 0x80400 |
| -; GFX6-NEXT: buffer_store_dword v10, off, s[40:43], 0 |
| -; GFX6-NEXT: s_waitcnt expcnt(0) |
| -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], s2 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: v_readlane_b32 s4, v10, 0 |
| -; GFX6-NEXT: v_readlane_b32 s5, v10, 1 |
| -; GFX6-NEXT: v_readlane_b32 s6, v10, 2 |
| -; GFX6-NEXT: v_readlane_b32 s7, v10, 3 |
| -; GFX6-NEXT: buffer_load_dword v10, off, s[40:43], 0 |
| -; GFX6-NEXT: s_waitcnt vmcnt(0) |
| -; GFX6-NEXT: s_mov_b64 exec, s[8:9] |
| -; GFX6-NEXT: s_mov_b32 s4, 0x83800 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x83400 |
| ; GFX6-NEXT: v_lshl_b64 v[4:5], v[5:6], 8 |
| ; GFX6-NEXT: buffer_load_dword v6, off, s[40:43], s4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7] |
| -; GFX6-NEXT: s_mov_b32 s4, 0x83400 |
| +; GFX6-NEXT: s_mov_b64 s[2:3], s[38:39] |
| +; GFX6-NEXT: s_mov_b32 s4, 0x83000 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:240 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| @@ -10563,7 +10594,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s4, 0x83000 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x82c00 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:224 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| @@ -10571,7 +10602,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s4, 0x82c00 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x82800 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:208 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| @@ -10579,7 +10610,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s4, 0x82800 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x82400 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:192 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| @@ -10587,7 +10618,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s4, 0x82400 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x82000 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:176 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| @@ -10595,7 +10626,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s4, 0x82000 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x81c00 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:160 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| @@ -10603,7 +10634,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s4, 0x81c00 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x81800 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:144 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| @@ -10611,7 +10642,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s4, 0x81800 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x81400 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:128 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| @@ -10619,7 +10650,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s4, 0x81400 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x81000 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:112 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| @@ -10627,7 +10658,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s4, 0x81000 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x80c00 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:96 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| @@ -10635,7 +10666,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s4, 0x80800 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x80400 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:80 |
| ; GFX6-NEXT: s_waitcnt expcnt(0) |
| @@ -10643,7 +10674,7 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX6-NEXT: buffer_load_dword v7, off, s[40:43], s4 offset:4 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v8, off, s[40:43], s4 offset:8 ; 4-byte Folded Reload |
| ; GFX6-NEXT: buffer_load_dword v9, off, s[40:43], s4 offset:12 ; 4-byte Folded Reload |
| -; GFX6-NEXT: s_mov_b32 s4, 0x80c00 |
| +; GFX6-NEXT: s_mov_b32 s4, 0x80800 |
| ; GFX6-NEXT: s_waitcnt vmcnt(0) |
| ; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[0:3], 0 addr64 offset:64 |
| ; GFX6-NEXT: buffer_store_dwordx4 v[17:20], v[4:5], s[0:3], 0 addr64 offset:48 |
| @@ -10745,13 +10776,16 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX9-FLATSCR-NEXT: ; def s[38:39] |
| ; GFX9-FLATSCR-NEXT: ;;#ASMEND |
| ; GFX9-FLATSCR-NEXT: ;;#ASMSTART |
| +; GFX9-FLATSCR-NEXT: ; def s[44:45] |
| +; GFX9-FLATSCR-NEXT: ;;#ASMEND |
| +; GFX9-FLATSCR-NEXT: ;;#ASMSTART |
| ; GFX9-FLATSCR-NEXT: ; def s33 |
| ; GFX9-FLATSCR-NEXT: ;;#ASMEND |
| ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[34:35], vcc |
| ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 |
| ; GFX9-FLATSCR-NEXT: ; %bb.1: ; %bb0 |
| ; GFX9-FLATSCR-NEXT: ;;#ASMSTART |
| -; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39] |
| +; GFX9-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[38:39],s[44:45] |
| ; GFX9-FLATSCR-NEXT: ;;#ASMEND |
| ; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x20d0 |
| ; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill |
| @@ -10896,13 +10930,16 @@ define amdgpu_kernel void @test_limited_sgpr(ptr addrspace(1) %out, ptr addrspac |
| ; GFX10-FLATSCR-NEXT: ; def s[34:35] |
| ; GFX10-FLATSCR-NEXT: ;;#ASMEND |
| ; GFX10-FLATSCR-NEXT: ;;#ASMSTART |
| -; GFX10-FLATSCR-NEXT: ; def s38 |
| +; GFX10-FLATSCR-NEXT: ; def s[38:39] |
| +; GFX10-FLATSCR-NEXT: ;;#ASMEND |
| +; GFX10-FLATSCR-NEXT: ;;#ASMSTART |
| +; GFX10-FLATSCR-NEXT: ; def s44 |
| ; GFX10-FLATSCR-NEXT: ;;#ASMEND |
| ; GFX10-FLATSCR-NEXT: v_cmpx_eq_u32_e32 0, v0 |
| ; GFX10-FLATSCR-NEXT: s_cbranch_execz .LBB1_2 |
| ; GFX10-FLATSCR-NEXT: ; %bb.1: ; %bb0 |
| ; GFX10-FLATSCR-NEXT: ;;#ASMSTART |
| -; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35] |
| +; GFX10-FLATSCR-NEXT: ; use s[0:7],s[8:15],s[16:23],s[24:31],s[40:43],s[34:35],s[38:39] |
| ; GFX10-FLATSCR-NEXT: ;;#ASMEND |
| ; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x2010 |
| ; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v88, v59 |
| @@ -11079,14 +11116,15 @@ entry: |
| %sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () |
| %sgpr4 = call <4 x i32> asm sideeffect "; def $0", "=s" () |
| %sgpr5 = call <2 x i32> asm sideeffect "; def $0", "=s" () |
| - %sgpr6 = call i32 asm sideeffect "; def $0", "=s" () |
| + %sgpr6 = call <2 x i32> asm sideeffect "; def $0", "=s" () |
| + %sgpr7 = call i32 asm sideeffect "; def $0", "=s" () |
| |
| %cmp = icmp eq i32 %x, 0 |
| br i1 %cmp, label %bb0, label %ret |
| |
| bb0: |
| ; create SGPR pressure |
| - call void asm sideeffect "; use $0,$1,$2,$3,$4,$5", "s,s,s,s,s,s,s"(<8 x i32> %sgpr0, <8 x i32> %sgpr1, <8 x i32> %sgpr2, <8 x i32> %sgpr3, <4 x i32> %sgpr4, <2 x i32> %sgpr5, i32 %sgpr6) |
| + call void asm sideeffect "; use $0,$1,$2,$3,$4,$5,$6", "s,s,s,s,s,s,s,s"(<8 x i32> %sgpr0, <8 x i32> %sgpr1, <8 x i32> %sgpr2, <8 x i32> %sgpr3, <4 x i32> %sgpr4, <2 x i32> %sgpr5, <2 x i32> %sgpr6, i32 %sgpr7) |
| |
| ; mark most VGPR registers as used to increase register pressure |
| call void asm sideeffect "", "~{v4},~{v8},~{v12},~{v16},~{v20},~{v24},~{v28},~{v32}" () |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir |
| index 32710d1be223..89c305b82b45 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir |
| @@ -10,10 +10,9 @@ body: | |
| bb.0: |
| liveins: $sgpr50 |
| ; CHECK-LABEL: name: spill_csr_sgpr_argument |
| - ; CHECK: liveins: $sgpr50 |
| + ; CHECK: liveins: $sgpr50, $vgpr0 |
| ; CHECK-NEXT: {{ $}} |
| - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr50, 0, [[V_WRITELANE_B32_]] |
| + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr50, 0, $vgpr0 |
| ; CHECK-NEXT: S_NOP 0, implicit $sgpr50 |
| ; CHECK-NEXT: $sgpr50 = S_MOV_B32 0 |
| S_NOP 0, implicit $sgpr50 |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll |
| index a636abe38c69..91d2ec82c81e 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-stack-no-sgpr.ll |
| @@ -1,50 +1,59 @@ |
| ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s |
| |
| -; The test was originally written to spill an SGPR to scratch without having spare SGPRs available to save exec. |
| -; This scenario no longer exists when we enabled SGPR spill into virtual VGPRs. |
| +; Spill an SGPR to scratch without having spare SGPRs available to save exec |
| |
| define amdgpu_kernel void @test() #1 { |
| ; GFX10-LABEL: test: |
| ; GFX10: ; %bb.0: |
| -; GFX10-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 |
| -; GFX10-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 |
| -; GFX10-NEXT: s_mov_b32 s14, -1 |
| -; GFX10-NEXT: s_mov_b32 s15, 0x31e16000 |
| -; GFX10-NEXT: s_add_u32 s12, s12, s1 |
| -; GFX10-NEXT: s_addc_u32 s13, s13, 0 |
| +; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 |
| +; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 |
| +; GFX10-NEXT: s_mov_b32 s10, -1 |
| +; GFX10-NEXT: s_mov_b32 s11, 0x31e16000 |
| +; GFX10-NEXT: s_add_u32 s8, s8, s1 |
| +; GFX10-NEXT: s_addc_u32 s9, s9, 0 |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; def s[0:7] |
| ; GFX10-NEXT: ;;#ASMEND |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; def s[8:12] |
| ; GFX10-NEXT: ;;#ASMEND |
| -; GFX10-NOT: s_not_b64 exec, exec |
| -; GFX10-NEXT: ; implicit-def: $vgpr0 |
| +; GFX10-NEXT: s_not_b64 exec, exec |
| +; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 |
| ; GFX10-NEXT: v_writelane_b32 v0, s8, 0 |
| ; GFX10-NEXT: v_writelane_b32 v0, s9, 1 |
| ; GFX10-NEXT: v_writelane_b32 v0, s10, 2 |
| ; GFX10-NEXT: v_writelane_b32 v0, s11, 3 |
| ; GFX10-NEXT: v_writelane_b32 v0, s12, 4 |
| -; GFX10-NEXT: s_or_saveexec_b64 s[14:15], -1 |
| -; GFX10-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| -; GFX10-NEXT: s_mov_b64 exec, s[14:15] |
| +; GFX10-NEXT: s_not_b64 exec, exec |
| +; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Spill |
| +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| +; GFX10-NEXT: s_not_b64 exec, exec |
| +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 |
| +; GFX10-NEXT: s_waitcnt vmcnt(0) |
| +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| +; GFX10-NEXT: s_not_b64 exec, exec |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ;;#ASMEND |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; use s[0:7] |
| ; GFX10-NEXT: ;;#ASMEND |
| -; GFX10-NEXT: s_or_saveexec_b64 s[14:15], -1 |
| -; GFX10-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload |
| -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| -; GFX10-NEXT: s_mov_b64 exec, s[14:15] |
| +; GFX10-NEXT: s_mov_b64 s[6:7], exec |
| +; GFX10-NEXT: s_mov_b64 exec, 31 |
| +; GFX10-NEXT: buffer_store_dword v0, off, s[8:11], 0 |
| +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:4 ; 4-byte Folded Reload |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: v_readlane_b32 s0, v0, 0 |
| ; GFX10-NEXT: v_readlane_b32 s1, v0, 1 |
| ; GFX10-NEXT: v_readlane_b32 s2, v0, 2 |
| ; GFX10-NEXT: v_readlane_b32 s3, v0, 3 |
| ; GFX10-NEXT: v_readlane_b32 s4, v0, 4 |
| +; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 |
| +; GFX10-NEXT: s_waitcnt vmcnt(0) |
| +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| +; GFX10-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ; use s[0:4] |
| ; GFX10-NEXT: ;;#ASMEND |
| @@ -58,4 +67,4 @@ define amdgpu_kernel void @test() #1 { |
| } |
| |
| attributes #0 = { nounwind } |
| -attributes #1 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" } |
| +attributes #1 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" } |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir |
| deleted file mode 100644 |
| index c5ad6d4dffe9..000000000000 |
| --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir |
| +++ /dev/null |
| @@ -1,320 +0,0 @@ |
| -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py |
| -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=si-lower-sgpr-spills -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s |
| - |
| -# A simple SGPR spill. Implicit def for lane VGPR should be inserted just before the spill instruction. |
| ---- |
| -name: sgpr32_spill |
| -tracksRegLiveness: true |
| -frameInfo: |
| - maxAlignment: 4 |
| -stack: |
| - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } |
| -machineFunctionInfo: |
| - isEntryFunction: false |
| - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' |
| - stackPtrOffsetReg: '$sgpr32' |
| - frameOffsetReg: '$sgpr33' |
| - hasSpilledSGPRs: true |
| -body: | |
| - bb.0: |
| - liveins: $sgpr30_sgpr31, $sgpr10 |
| - ; GCN-LABEL: name: sgpr32_spill |
| - ; GCN: liveins: $sgpr30_sgpr31, $sgpr10 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: S_NOP 0 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; GCN-NEXT: [[V_WRITELANE_B32_]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] |
| - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_]], 0 |
| - ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 |
| - S_NOP 0 |
| - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 |
| - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 |
| - S_SETPC_B64 $sgpr30_sgpr31 |
| -... |
| - |
| -# Needed an additional virtual lane register as the lanes of current register are fully occupied while spilling a wide SGPR tuple. |
| -# There must be two implicit def for the two lane VGPRs. |
| - |
| ---- |
| -name: sgpr_spill_lane_crossover |
| -tracksRegLiveness: true |
| -frameInfo: |
| - maxAlignment: 4 |
| -stack: |
| - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } |
| - - { id: 1, type: spill-slot, size: 128, alignment: 4, stack-id: sgpr-spill } |
| -machineFunctionInfo: |
| - isEntryFunction: false |
| - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' |
| - stackPtrOffsetReg: '$sgpr32' |
| - frameOffsetReg: '$sgpr33' |
| - hasSpilledSGPRs: true |
| -body: | |
| - bb.0: |
| - liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-LABEL: name: sgpr_spill_lane_crossover |
| - ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, $sgpr30_sgpr31 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr64, 0, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr65, 1, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr66, 2, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr67, 3, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr68, 4, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr69, 5, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr70, 6, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr71, 7, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr72, 8, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr73, 9, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr74, 10, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr75, 11, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr76, 12, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr77, 13, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr78, 14, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr79, 15, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr80, 16, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr81, 17, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr82, 18, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr83, 19, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr84, 20, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr85, 21, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr86, 22, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr87, 23, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr88, 24, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr89, 25, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr90, 26, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr91, 27, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr92, 28, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr93, 29, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr94, 30, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr95, 31, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: S_NOP 0 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 32, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: [[V_WRITELANE_B32_2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr64, 33, [[V_WRITELANE_B32_1]], implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr65, 34, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr66, 35, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr67, 36, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr68, 37, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr69, 38, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr70, 39, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr71, 40, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr72, 41, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr73, 42, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr74, 43, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr75, 44, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr76, 45, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr77, 46, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr78, 47, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr79, 48, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr80, 49, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr81, 50, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr82, 51, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr83, 52, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr84, 53, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr85, 54, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr86, 55, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr87, 56, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr88, 57, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr89, 58, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr90, 59, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr91, 60, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr92, 61, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr93, 62, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1]]:vgpr_32 = V_WRITELANE_B32 $sgpr94, 63, [[V_WRITELANE_B32_1]], implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_2]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr95, 0, [[V_WRITELANE_B32_2]], implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: S_NOP 0 |
| - ; GCN-NEXT: $sgpr64 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 33, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 |
| - ; GCN-NEXT: $sgpr65 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 34 |
| - ; GCN-NEXT: $sgpr66 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 35 |
| - ; GCN-NEXT: $sgpr67 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 36 |
| - ; GCN-NEXT: $sgpr68 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 37 |
| - ; GCN-NEXT: $sgpr69 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 38 |
| - ; GCN-NEXT: $sgpr70 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 39 |
| - ; GCN-NEXT: $sgpr71 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 40 |
| - ; GCN-NEXT: $sgpr72 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 41 |
| - ; GCN-NEXT: $sgpr73 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 42 |
| - ; GCN-NEXT: $sgpr74 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 43 |
| - ; GCN-NEXT: $sgpr75 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 44 |
| - ; GCN-NEXT: $sgpr76 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 45 |
| - ; GCN-NEXT: $sgpr77 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 46 |
| - ; GCN-NEXT: $sgpr78 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 47 |
| - ; GCN-NEXT: $sgpr79 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 48 |
| - ; GCN-NEXT: $sgpr80 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 49 |
| - ; GCN-NEXT: $sgpr81 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 50 |
| - ; GCN-NEXT: $sgpr82 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 51 |
| - ; GCN-NEXT: $sgpr83 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 52 |
| - ; GCN-NEXT: $sgpr84 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 53 |
| - ; GCN-NEXT: $sgpr85 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 54 |
| - ; GCN-NEXT: $sgpr86 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 55 |
| - ; GCN-NEXT: $sgpr87 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 56 |
| - ; GCN-NEXT: $sgpr88 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 57 |
| - ; GCN-NEXT: $sgpr89 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 58 |
| - ; GCN-NEXT: $sgpr90 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 59 |
| - ; GCN-NEXT: $sgpr91 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 60 |
| - ; GCN-NEXT: $sgpr92 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 61 |
| - ; GCN-NEXT: $sgpr93 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 62 |
| - ; GCN-NEXT: $sgpr94 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 63 |
| - ; GCN-NEXT: $sgpr95 = V_READLANE_B32 [[V_WRITELANE_B32_2]], 0 |
| - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 32 |
| - ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 |
| - S_NOP 0 |
| - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 |
| - SI_SPILL_S1024_SAVE killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 |
| - S_NOP 0 |
| - renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 |
| - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 |
| - S_SETPC_B64 $sgpr30_sgpr31 |
| -... |
| - |
| -# The implicit def for the lane VGPR should be inserted at the common dominator block (the entry block here). |
| - |
| ---- |
| -name: lane_vgpr_implicit_def_at_common_dominator_block |
| -tracksRegLiveness: true |
| -frameInfo: |
| - maxAlignment: 4 |
| -stack: |
| - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } |
| -machineFunctionInfo: |
| - isEntryFunction: false |
| - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' |
| - stackPtrOffsetReg: '$sgpr32' |
| - frameOffsetReg: '$sgpr33' |
| - hasSpilledSGPRs: true |
| -body: | |
| - ; GCN-LABEL: name: lane_vgpr_implicit_def_at_common_dominator_block |
| - ; GCN: bb.0: |
| - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) |
| - ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: S_NOP 0 |
| - ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc |
| - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: bb.1: |
| - ; GCN-NEXT: successors: %bb.3(0x80000000) |
| - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] |
| - ; GCN-NEXT: S_BRANCH %bb.3 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: bb.2: |
| - ; GCN-NEXT: successors: %bb.3(0x80000000) |
| - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: $sgpr10 = S_MOV_B32 20 |
| - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_1]] |
| - ; GCN-NEXT: S_BRANCH %bb.3 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: bb.3: |
| - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0 |
| - ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 |
| - bb.0: |
| - liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 |
| - S_NOP 0 |
| - S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc |
| - S_CBRANCH_SCC1 %bb.2, implicit killed $scc |
| - bb.1: |
| - liveins: $sgpr10, $sgpr30_sgpr31 |
| - $sgpr10 = S_MOV_B32 10 |
| - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 |
| - S_BRANCH %bb.3 |
| - bb.2: |
| - liveins: $sgpr10, $sgpr30_sgpr31 |
| - $sgpr10 = S_MOV_B32 20 |
| - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 |
| - S_BRANCH %bb.3 |
| - bb.3: |
| - liveins: $sgpr10, $sgpr30_sgpr31 |
| - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 |
| - S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 |
| -... |
| - |
| -# The common dominator block is visited only at the end. The insertion point was initially identified to the |
| -# terminator instruction in the dominator block which later becomes the point where a spill get inserted in the same block. |
| - |
| ---- |
| -name: dominator_block_follows_the_successors_bbs |
| -tracksRegLiveness: true |
| -frameInfo: |
| - maxAlignment: 4 |
| -stack: |
| - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } |
| -machineFunctionInfo: |
| - isEntryFunction: false |
| - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' |
| - stackPtrOffsetReg: '$sgpr32' |
| - frameOffsetReg: '$sgpr33' |
| - hasSpilledSGPRs: true |
| -body: | |
| - ; GCN-LABEL: name: dominator_block_follows_the_successors_bbs |
| - ; GCN: bb.0: |
| - ; GCN-NEXT: successors: %bb.3(0x80000000) |
| - ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: S_NOP 0 |
| - ; GCN-NEXT: S_BRANCH %bb.3 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: bb.1: |
| - ; GCN-NEXT: successors: %bb.2(0x80000000) |
| - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 %0, 0 |
| - ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc |
| - ; GCN-NEXT: S_BRANCH %bb.2 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: bb.2: |
| - ; GCN-NEXT: successors: %bb.3(0x80000000) |
| - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 %0, 0 |
| - ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc |
| - ; GCN-NEXT: S_BRANCH %bb.3 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: bb.3: |
| - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) |
| - ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 |
| - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] |
| - ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc |
| - ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc |
| - ; GCN-NEXT: S_BRANCH %bb.1 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: bb.4: |
| - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 |
| - ; GCN-NEXT: {{ $}} |
| - ; GCN-NEXT: S_NOP 0 |
| - ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 |
| - bb.0: |
| - liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 |
| - S_NOP 0 |
| - S_BRANCH %bb.3 |
| - bb.1: |
| - liveins: $sgpr10, $sgpr30_sgpr31 |
| - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 |
| - $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc |
| - S_BRANCH %bb.2 |
| - bb.2: |
| - liveins: $sgpr10, $sgpr30_sgpr31 |
| - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 |
| - $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc |
| - S_BRANCH %bb.3 |
| - bb.3: |
| - liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 |
| - $sgpr10 = S_MOV_B32 10 |
| - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 |
| - S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc |
| - S_CBRANCH_SCC1 %bb.2, implicit killed $scc |
| - S_BRANCH %bb.1 |
| - bb.4: |
| - liveins: $sgpr10, $sgpr30_sgpr31 |
| - S_NOP 0 |
| - S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 |
| -... |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll |
| deleted file mode 100644 |
| index 71cd094f743c..000000000000 |
| --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll |
| +++ /dev/null |
| @@ -1,85 +0,0 @@ |
| -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
| -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs -o - %s | FileCheck %s |
| - |
| -; Regression test for `processFunctionBeforeFrameFinalized`: |
| -; Check that it correctly updates RegisterScavenger so we |
| -; don't end up with bad machine code due to using undefined |
| -; physical registers. |
| - |
| -define void @test() { |
| -; CHECK-LABEL: test: |
| -; CHECK: ; %bb.0: ; %bb.0 |
| -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| -; CHECK-NEXT: s_mov_b64 exec, s[4:5] |
| -; CHECK-NEXT: .LBB0_1: ; %bb.1 |
| -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 |
| -; CHECK-NEXT: s_cbranch_scc1 .LBB0_3 |
| -; CHECK-NEXT: ; %bb.2: ; %bb.2 |
| -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 |
| -; CHECK-NEXT: .LBB0_3: ; %bb.3 |
| -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 |
| -; CHECK-NEXT: ; implicit-def: $sgpr4 |
| -; CHECK-NEXT: v_mov_b32_e32 v0, s4 |
| -; CHECK-NEXT: v_readfirstlane_b32 s6, v0 |
| -; CHECK-NEXT: s_mov_b64 s[4:5], -1 |
| -; CHECK-NEXT: s_mov_b32 s7, 0 |
| -; CHECK-NEXT: s_cmp_eq_u32 s6, s7 |
| -; CHECK-NEXT: ; implicit-def: $vgpr0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 |
| -; CHECK-NEXT: s_mov_b64 s[10:11], exec |
| -; CHECK-NEXT: s_mov_b64 exec, -1 |
| -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse |
| -; CHECK-NEXT: s_mov_b64 exec, s[10:11] |
| -; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 |
| -; CHECK-NEXT: ; %bb.4: ; %bb.4 |
| -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 |
| -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 |
| -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse |
| -; CHECK-NEXT: s_mov_b64 exec, s[10:11] |
| -; CHECK-NEXT: s_mov_b64 s[4:5], 0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 |
| -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 |
| -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 |
| -; CHECK-NEXT: s_nop 0 |
| -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse |
| -; CHECK-NEXT: s_mov_b64 exec, s[10:11] |
| -; CHECK-NEXT: .LBB0_5: ; %Flow |
| -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 |
| -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 |
| -; CHECK-NEXT: s_nop 0 |
| -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse |
| -; CHECK-NEXT: s_mov_b64 exec, s[10:11] |
| -; CHECK-NEXT: v_readlane_b32 s4, v0, 0 |
| -; CHECK-NEXT: v_readlane_b32 s5, v0, 1 |
| -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] |
| -; CHECK-NEXT: s_mov_b32 s4, 1 |
| -; CHECK-NEXT: ; implicit-def: $sgpr5 |
| -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4 |
| -; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] |
| -; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 |
| -; CHECK-NEXT: ; %bb.6: ; %bb.5 |
| -; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload |
| -; CHECK-NEXT: s_mov_b64 exec, s[4:5] |
| -; CHECK-NEXT: s_waitcnt vmcnt(0) |
| -; CHECK-NEXT: s_setpc_b64 s[30:31] |
| -bb.0: |
| - br label %bb.1 |
| -bb.1: ; preds = %bb.4, %bb.0 |
| - br i1 poison, label %bb.2, label %bb.3 |
| -bb.2: ; preds = %bb.1 |
| - br label %bb.3 |
| -bb.3: ; preds = %bb.2, %bb.1 |
| - %call = tail call i32 @llvm.amdgcn.readfirstlane(i32 poison) |
| - %cmp = icmp eq i32 %call, 0 |
| - br i1 %cmp, label %bb.5, label %bb.4 |
| -bb.4: ; preds = %bb.3 |
| - br label %bb.1 |
| -bb.5: ; preds = %bb.3 |
| - ret void |
| -} |
| - |
| -declare i32 @llvm.amdgcn.readfirstlane(i32) |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll |
| index e8a46bd72aec..c1fc297d4564 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll |
| @@ -10,17 +10,16 @@ define void @sgpr_spill_writelane() { |
| ; GCN-LABEL: sgpr_spill_writelane: |
| ; GCN: ; %bb.0: |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 |
| +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[6:7] |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| +; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: v_writelane_b32 v0, s35, 0 |
| ; GCN-NEXT: ;;#ASMSTART |
| ; GCN-NEXT: ;;#ASMEND |
| ; GCN-NEXT: v_readlane_b32 s35, v0, 0 |
| -; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 |
| +; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 |
| ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[6:7] |
| +; GCN-NEXT: s_mov_b64 exec, s[4:5] |
| ; GCN-NEXT: s_waitcnt vmcnt(0) |
| ; GCN-NEXT: s_setpc_b64 s[30:31] |
| call void asm sideeffect "", "~{s35}"() |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill192.mir b/llvm/test/CodeGen/AMDGPU/spill192.mir |
| index 6fd96da2318b..df0c836b556e 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill192.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/spill192.mir |
| @@ -32,29 +32,32 @@ body: | |
| ; EXPANDED-LABEL: name: spill_restore_sgpr192 |
| ; EXPANDED: bb.0: |
| ; EXPANDED-NEXT: successors: %bb.1(0x80000000) |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr9, 5, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: bb.1: |
| ; EXPANDED-NEXT: successors: %bb.2(0x80000000) |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: S_NOP 1 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: bb.2: |
| - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 |
| - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 |
| - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 |
| - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 |
| - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| + ; EXPANDED-NEXT: {{ $}} |
| + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 |
| + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 |
| + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 |
| + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 |
| + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 |
| ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 |
| bb.0: |
| S_NOP 0, implicit-def %0:sgpr_192 |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill224.mir b/llvm/test/CodeGen/AMDGPU/spill224.mir |
| index 7f4402deadd8..09f208246995 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill224.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/spill224.mir |
| @@ -30,31 +30,34 @@ body: | |
| ; EXPANDED-LABEL: name: spill_restore_sgpr224 |
| ; EXPANDED: bb.0: |
| ; EXPANDED-NEXT: successors: %bb.1(0x80000000) |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 6, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: bb.1: |
| ; EXPANDED-NEXT: successors: %bb.2(0x80000000) |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: S_NOP 1 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: bb.2: |
| - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 |
| - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 |
| - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 |
| - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 |
| - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 |
| - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| + ; EXPANDED-NEXT: {{ $}} |
| + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 |
| + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 |
| + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 |
| + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 |
| + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 |
| + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 |
| ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 |
| bb.0: |
| S_NOP 0, implicit-def %0:sgpr_224 |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill288.mir b/llvm/test/CodeGen/AMDGPU/spill288.mir |
| index 646d2be19e8a..6e6b7f8bd7e3 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill288.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/spill288.mir |
| @@ -30,35 +30,38 @@ body: | |
| ; EXPANDED-LABEL: name: spill_restore_sgpr288 |
| ; EXPANDED: bb.0: |
| ; EXPANDED-NEXT: successors: %bb.1(0x80000000) |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr12, 8, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr12, 8, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: bb.1: |
| ; EXPANDED-NEXT: successors: %bb.2(0x80000000) |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: S_NOP 1 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: bb.2: |
| - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 |
| - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 |
| - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 |
| - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 |
| - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 |
| - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 |
| - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 |
| - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| + ; EXPANDED-NEXT: {{ $}} |
| + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 |
| + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 |
| + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 |
| + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 |
| + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 |
| + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 |
| + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 |
| + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 |
| ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 |
| bb.0: |
| S_NOP 0, implicit-def %0:sgpr_288 |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill320.mir b/llvm/test/CodeGen/AMDGPU/spill320.mir |
| index 7c866cd15f90..a5a1074d1e7d 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill320.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/spill320.mir |
| @@ -30,37 +30,40 @@ body: | |
| ; EXPANDED-LABEL: name: spill_restore_sgpr320 |
| ; EXPANDED: bb.0: |
| ; EXPANDED-NEXT: successors: %bb.1(0x80000000) |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr13, 9, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr13, 9, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: bb.1: |
| ; EXPANDED-NEXT: successors: %bb.2(0x80000000) |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: S_NOP 1 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: bb.2: |
| - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 |
| - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 |
| - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 |
| - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 |
| - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 |
| - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 |
| - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 |
| - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 |
| - ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| + ; EXPANDED-NEXT: {{ $}} |
| + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 |
| + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 |
| + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 |
| + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 |
| + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 |
| + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 |
| + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 |
| + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 |
| + ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 9 |
| ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 |
| bb.0: |
| S_NOP 0, implicit-def %0:sgpr_320 |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill352.mir b/llvm/test/CodeGen/AMDGPU/spill352.mir |
| index ded934f94d32..12a15152e6a1 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill352.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/spill352.mir |
| @@ -30,39 +30,42 @@ body: | |
| ; EXPANDED-LABEL: name: spill_restore_sgpr352 |
| ; EXPANDED: bb.0: |
| ; EXPANDED-NEXT: successors: %bb.1(0x80000000) |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr13, 9, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr14, 10, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 9, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr14, 10, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: bb.1: |
| ; EXPANDED-NEXT: successors: %bb.2(0x80000000) |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: S_NOP 1 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: bb.2: |
| - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 |
| - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 |
| - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 |
| - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 |
| - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 |
| - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 |
| - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 |
| - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 |
| - ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 |
| - ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10 |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| + ; EXPANDED-NEXT: {{ $}} |
| + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 |
| + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 |
| + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 |
| + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 |
| + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 |
| + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 |
| + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 |
| + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 |
| + ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 9 |
| + ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 10 |
| ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 |
| bb.0: |
| S_NOP 0, implicit-def %0:sgpr_352 |
| diff --git a/llvm/test/CodeGen/AMDGPU/spill384.mir b/llvm/test/CodeGen/AMDGPU/spill384.mir |
| index b929391f67c0..6029ff52e9b5 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/spill384.mir |
| +++ b/llvm/test/CodeGen/AMDGPU/spill384.mir |
| @@ -30,41 +30,44 @@ body: | |
| ; EXPANDED-LABEL: name: spill_restore_sgpr384 |
| ; EXPANDED: bb.0: |
| ; EXPANDED-NEXT: successors: %bb.1(0x80000000) |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr13, 9, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr14, 10, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr15, 11, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 9, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 10, $vgpr0, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr15, 11, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: bb.1: |
| ; EXPANDED-NEXT: successors: %bb.2(0x80000000) |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: S_NOP 1 |
| ; EXPANDED-NEXT: {{ $}} |
| ; EXPANDED-NEXT: bb.2: |
| - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 |
| - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 |
| - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 |
| - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 |
| - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 |
| - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 |
| - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 |
| - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 |
| - ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 |
| - ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10 |
| - ; EXPANDED-NEXT: $sgpr15 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 11 |
| + ; EXPANDED-NEXT: liveins: $vgpr0 |
| + ; EXPANDED-NEXT: {{ $}} |
| + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 |
| + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 |
| + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 |
| + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 |
| + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 |
| + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 |
| + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 |
| + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 |
| + ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 9 |
| + ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 10 |
| + ; EXPANDED-NEXT: $sgpr15 = V_READLANE_B32 $vgpr0, 11 |
| ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 |
| bb.0: |
| S_NOP 0, implicit-def %0:sgpr_384 |
| diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll |
| index d209c02dccc8..e2aaa47c3c0b 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/tail-call-amdgpu-gfx.ll |
| @@ -22,9 +22,8 @@ define amdgpu_gfx float @caller(float %arg0) { |
| ; GCN-NEXT: s_xor_saveexec_b64 s[34:35], -1 |
| ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| -; GCN-NEXT: ; implicit-def: $vgpr1 |
| -; GCN-NEXT: s_addk_i32 s32, 0x400 |
| ; GCN-NEXT: v_writelane_b32 v1, s4, 0 |
| +; GCN-NEXT: s_addk_i32 s32, 0x400 |
| ; GCN-NEXT: v_writelane_b32 v1, s30, 1 |
| ; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0 |
| ; GCN-NEXT: s_mov_b32 s4, 2.0 |
| diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll |
| index d9b0106ccc4e..6297b4136fb4 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll |
| @@ -10,20 +10,18 @@ define internal fastcc void @widget() { |
| ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX90A-NEXT: s_mov_b32 s16, s33 |
| ; GFX90A-NEXT: s_mov_b32 s33, s32 |
| -; GFX90A-NEXT: s_xor_saveexec_b64 s[18:19], -1 |
| -; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GFX90A-NEXT: s_mov_b64 exec, -1 |
| -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX90A-NEXT: s_or_saveexec_b64 s[18:19], -1 |
| +; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX90A-NEXT: s_mov_b64 exec, s[18:19] |
| ; GFX90A-NEXT: s_addk_i32 s32, 0x400 |
| -; GFX90A-NEXT: v_writelane_b32 v40, s16, 0 |
| +; GFX90A-NEXT: v_writelane_b32 v41, s16, 0 |
| ; GFX90A-NEXT: s_getpc_b64 s[16:17] |
| ; GFX90A-NEXT: s_add_u32 s16, s16, wobble@gotpcrel32@lo+4 |
| ; GFX90A-NEXT: s_addc_u32 s17, s17, wobble@gotpcrel32@hi+12 |
| ; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 |
| -; GFX90A-NEXT: ; implicit-def: $vgpr0 |
| -; GFX90A-NEXT: v_writelane_b32 v0, s30, 0 |
| -; GFX90A-NEXT: v_writelane_b32 v0, s31, 1 |
| +; GFX90A-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX90A-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX90A-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| bb: |
| @@ -37,12 +35,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[54:55], s[6:7] |
| ; GLOBALNESS1-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 |
| ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 |
| -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 |
| -; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 |
| +; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v0 |
| +; GLOBALNESS1-NEXT: v_mov_b32_e32 v40, 0 |
| ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 |
| -; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off |
| +; GLOBALNESS1-NEXT: global_store_dword v[0:1], v40, off |
| ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) |
| -; GLOBALNESS1-NEXT: global_load_dword v0, v42, s[36:37] |
| +; GLOBALNESS1-NEXT: global_load_dword v0, v40, s[36:37] |
| ; GLOBALNESS1-NEXT: s_add_u32 flat_scratch_lo, s12, s17 |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[64:65], s[4:5] |
| ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 |
| @@ -50,11 +48,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 |
| ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 |
| ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 |
| -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, 0x40994400 |
| +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, 0x40994400 |
| ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s38, 0 |
| ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) |
| -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], v[42:43] |
| -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], 0 |
| +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[40:41] |
| +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0 |
| ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 |
| ; GLOBALNESS1-NEXT: s_xor_b64 s[94:95], s[4:5], -1 |
| ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 |
| @@ -67,34 +65,33 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS1-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 |
| ; GLOBALNESS1-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 |
| ; GLOBALNESS1-NEXT: s_xor_b64 s[86:87], s[4:5], -1 |
| -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr40 |
| ; GLOBALNESS1-NEXT: s_load_dwordx2 s[66:67], s[6:7], 0x0 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s98, s16 |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[62:63], s[8:9] |
| ; GLOBALNESS1-NEXT: s_mov_b32 s99, s15 |
| -; GLOBALNESS1-NEXT: s_mov_b32 s56, s14 |
| +; GLOBALNESS1-NEXT: s_mov_b32 s100, s14 |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[34:35], s[10:11] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[92:93], 0x80 |
| -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[36:37], 1, v1 |
| +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s69, 0x3ff00000 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s32, 0 |
| ; GLOBALNESS1-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 |
| ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) |
| ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 |
| -; GLOBALNESS1-NEXT: v_writelane_b32 v40, s4, 0 |
| -; GLOBALNESS1-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 0 |
| +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 1 |
| ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 |
| -; GLOBALNESS1-NEXT: v_writelane_b32 v40, s4, 2 |
| -; GLOBALNESS1-NEXT: v_writelane_b32 v40, s5, 3 |
| +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 2 |
| +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 3 |
| ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 |
| -; GLOBALNESS1-NEXT: v_writelane_b32 v40, s4, 4 |
| +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s4, 4 |
| ; GLOBALNESS1-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0 |
| -; GLOBALNESS1-NEXT: v_writelane_b32 v40, s5, 5 |
| +; GLOBALNESS1-NEXT: v_writelane_b32 v42, s5, 5 |
| ; GLOBALNESS1-NEXT: s_branch .LBB1_4 |
| ; GLOBALNESS1-NEXT: .LBB1_1: ; %bb70.i |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v40, 4 |
| -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v40, 5 |
| +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 4 |
| +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 5 |
| ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] |
| ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_29 |
| ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow6 |
| @@ -144,19 +141,19 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1] |
| ; GLOBALNESS1-NEXT: flat_load_dword v44, v[0:1] |
| ; GLOBALNESS1-NEXT: s_add_u32 s8, s62, 40 |
| -; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0 |
| +; GLOBALNESS1-NEXT: buffer_store_dword v40, off, s[0:3], 0 |
| ; GLOBALNESS1-NEXT: flat_load_dword v45, v[0:1] |
| ; GLOBALNESS1-NEXT: s_addc_u32 s9, s63, 0 |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] |
| -; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 |
| +; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 |
| -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 |
| +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 |
| ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) |
| ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] |
| -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[36:37] |
| +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[42:43] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 |
| ; GLOBALNESS1-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_8 |
| @@ -223,23 +220,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS1-NEXT: ; %bb.10: ; %bb33.i |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[32:33], off |
| -; GLOBALNESS1-NEXT: v_readlane_b32 s4, v40, 0 |
| -; GLOBALNESS1-NEXT: v_readlane_b32 s5, v40, 1 |
| -; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], s[36:37] |
| -; GLOBALNESS1-NEXT: s_mov_b32 s75, s39 |
| +; GLOBALNESS1-NEXT: v_readlane_b32 s4, v42, 0 |
| +; GLOBALNESS1-NEXT: v_readlane_b32 s5, v42, 1 |
| ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] |
| ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_12 |
| ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb39.i |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 |
| +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 |
| ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 |
| -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off |
| +; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off |
| ; GLOBALNESS1-NEXT: .LBB1_12: ; %bb44.lr.ph.i |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 |
| ; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc |
| +; GLOBALNESS1-NEXT: s_mov_b64 s[72:73], s[42:43] |
| +; GLOBALNESS1-NEXT: s_mov_b32 s75, s39 |
| ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) |
| -; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[36:37], 0, v[0:1] |
| +; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1] |
| ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 |
| ; GLOBALNESS1-NEXT: s_branch .LBB1_15 |
| ; GLOBALNESS1-NEXT: .LBB1_13: ; %Flow7 |
| @@ -260,15 +257,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_14 |
| ; GLOBALNESS1-NEXT: ; %bb.17: ; %bb50.i |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 |
| -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[40:41] |
| +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[36:37] |
| ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20 |
| ; GLOBALNESS1-NEXT: ; %bb.18: ; %bb3.i.i |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 |
| -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[42:43] |
| +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[40:41] |
| ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_20 |
| ; GLOBALNESS1-NEXT: ; %bb.19: ; %bb6.i.i |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 |
| -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[36:37] |
| +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[56:57] |
| ; GLOBALNESS1-NEXT: .LBB1_20: ; %spam.exit.i |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 |
| ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[90:91] |
| @@ -281,28 +278,28 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] |
| -; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 |
| +; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 |
| -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 |
| +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 |
| ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] |
| ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[44:45], 0, 0 |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], s[60:61] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] |
| -; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 |
| +; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 |
| -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 |
| +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 |
| ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], a[32:33], off |
| ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[66:67] |
| ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] |
| ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13 |
| ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 |
| -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 |
| -; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off |
| +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 |
| +; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[40:41], off |
| ; GLOBALNESS1-NEXT: s_branch .LBB1_13 |
| ; GLOBALNESS1-NEXT: .LBB1_23: ; %LeafBlock |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| @@ -317,14 +314,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS1-NEXT: s_branch .LBB1_3 |
| ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow14 |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[36:37] |
| ; GLOBALNESS1-NEXT: s_mov_b32 s36, s93 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s37, s93 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s38, s93 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s39, s93 |
| -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] |
| +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[40:41] |
| ; GLOBALNESS1-NEXT: s_mov_b32 s40, s93 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s41, s93 |
| -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[42:43] |
| ; GLOBALNESS1-NEXT: s_mov_b32 s42, s93 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s43, s93 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s44, s93 |
| @@ -353,10 +350,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[26:27], s[62:63], s[62:63] op_sel:[0,1] |
| ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[28:29], s[64:65], s[64:65] op_sel:[0,1] |
| ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[30:31], s[66:67], s[66:67] op_sel:[0,1] |
| -; GLOBALNESS1-NEXT: s_mov_b64 s[42:43], s[6:7] |
| -; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] |
| +; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[6:7] |
| +; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[4:5] |
| ; GLOBALNESS1-NEXT: s_mov_b32 s39, s75 |
| -; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[72:73] |
| +; GLOBALNESS1-NEXT: s_mov_b64 s[42:43], s[72:73] |
| ; GLOBALNESS1-NEXT: .LBB1_26: ; %Flow15 |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[70:71] |
| @@ -364,21 +361,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_2 |
| ; GLOBALNESS1-NEXT: ; %bb.27: ; %bb67.i |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| -; GLOBALNESS1-NEXT: v_readlane_b32 s6, v40, 2 |
| -; GLOBALNESS1-NEXT: v_readlane_b32 s7, v40, 3 |
| +; GLOBALNESS1-NEXT: v_readlane_b32 s6, v42, 2 |
| +; GLOBALNESS1-NEXT: v_readlane_b32 s7, v42, 3 |
| ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] |
| ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 |
| ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 |
| +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 |
| ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 |
| -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[42:43], off |
| +; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off |
| ; GLOBALNESS1-NEXT: s_branch .LBB1_1 |
| ; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i |
| ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 |
| +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 |
| ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[32:33], 0, 0 |
| -; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[42:43], off |
| +; GLOBALNESS1-NEXT: global_store_dwordx2 v[32:33], v[40:41], off |
| ; GLOBALNESS1-NEXT: s_branch .LBB1_2 |
| ; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard |
| ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] |
| @@ -390,10 +387,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] |
| -; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 |
| +; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 |
| -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 |
| +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 |
| ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] |
| ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 |
| ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 |
| @@ -408,10 +405,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[64:65] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[54:55] |
| ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] |
| -; GLOBALNESS1-NEXT: s_mov_b32 s12, s56 |
| +; GLOBALNESS1-NEXT: s_mov_b32 s12, s100 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s13, s99 |
| ; GLOBALNESS1-NEXT: s_mov_b32 s14, s98 |
| -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 |
| +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v43 |
| ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] |
| ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 |
| ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 |
| @@ -423,12 +420,12 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[54:55], s[6:7] |
| ; GLOBALNESS0-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0 |
| ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 |
| -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 |
| -; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 |
| +; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v0 |
| +; GLOBALNESS0-NEXT: v_mov_b32_e32 v40, 0 |
| ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 |
| -; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off |
| +; GLOBALNESS0-NEXT: global_store_dword v[0:1], v40, off |
| ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) |
| -; GLOBALNESS0-NEXT: global_load_dword v0, v42, s[36:37] |
| +; GLOBALNESS0-NEXT: global_load_dword v0, v40, s[36:37] |
| ; GLOBALNESS0-NEXT: s_add_u32 flat_scratch_lo, s12, s17 |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[62:63], s[4:5] |
| ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 |
| @@ -436,11 +433,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 |
| ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 |
| ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 |
| -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, 0x40994400 |
| +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, 0x40994400 |
| ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s38, 0 |
| ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) |
| -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], v[42:43] |
| -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[42:43], s[4:5], 0 |
| +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[36:37], s[4:5], v[40:41] |
| +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[40:41], s[4:5], 0 |
| ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 |
| ; GLOBALNESS0-NEXT: s_xor_b64 s[94:95], s[4:5], -1 |
| ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 |
| @@ -453,34 +450,33 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS0-NEXT: s_add_u32 s6, s6, wobble@gotpcrel32@lo+4 |
| ; GLOBALNESS0-NEXT: s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12 |
| ; GLOBALNESS0-NEXT: s_xor_b64 s[86:87], s[4:5], -1 |
| -; GLOBALNESS0-NEXT: ; implicit-def: $vgpr40 |
| ; GLOBALNESS0-NEXT: s_load_dwordx2 s[66:67], s[6:7], 0x0 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s98, s16 |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[60:61], s[8:9] |
| ; GLOBALNESS0-NEXT: s_mov_b32 s99, s15 |
| -; GLOBALNESS0-NEXT: s_mov_b32 s56, s14 |
| +; GLOBALNESS0-NEXT: s_mov_b32 s100, s14 |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[34:35], s[10:11] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[92:93], 0x80 |
| -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[36:37], 1, v1 |
| +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[42:43], 1, v1 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s69, 0x3ff00000 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s32, 0 |
| ; GLOBALNESS0-NEXT: ; implicit-def: $agpr32_agpr33_agpr34_agpr35_agpr36_agpr37_agpr38_agpr39_agpr40_agpr41_agpr42_agpr43_agpr44_agpr45_agpr46_agpr47_agpr48_agpr49_agpr50_agpr51_agpr52_agpr53_agpr54_agpr55_agpr56_agpr57_agpr58_agpr59_agpr60_agpr61_agpr62_agpr63 |
| ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) |
| ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v0 |
| -; GLOBALNESS0-NEXT: v_writelane_b32 v40, s4, 0 |
| -; GLOBALNESS0-NEXT: v_writelane_b32 v40, s5, 1 |
| +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 0 |
| +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 1 |
| ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 |
| -; GLOBALNESS0-NEXT: v_writelane_b32 v40, s4, 2 |
| -; GLOBALNESS0-NEXT: v_writelane_b32 v40, s5, 3 |
| +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 2 |
| +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 3 |
| ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 |
| -; GLOBALNESS0-NEXT: v_writelane_b32 v40, s4, 4 |
| +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s4, 4 |
| ; GLOBALNESS0-NEXT: v_cmp_gt_i32_e64 s[90:91], 1, v0 |
| -; GLOBALNESS0-NEXT: v_writelane_b32 v40, s5, 5 |
| +; GLOBALNESS0-NEXT: v_writelane_b32 v42, s5, 5 |
| ; GLOBALNESS0-NEXT: s_branch .LBB1_4 |
| ; GLOBALNESS0-NEXT: .LBB1_1: ; %bb70.i |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v40, 4 |
| -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v40, 5 |
| +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 4 |
| +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 5 |
| ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] |
| ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_29 |
| ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow6 |
| @@ -530,19 +526,19 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[92:93], s[92:93] op_sel:[0,1] |
| ; GLOBALNESS0-NEXT: flat_load_dword v44, v[0:1] |
| ; GLOBALNESS0-NEXT: s_add_u32 s8, s60, 40 |
| -; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0 |
| +; GLOBALNESS0-NEXT: buffer_store_dword v40, off, s[0:3], 0 |
| ; GLOBALNESS0-NEXT: flat_load_dword v45, v[0:1] |
| ; GLOBALNESS0-NEXT: s_addc_u32 s9, s61, 0 |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] |
| -; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 |
| +; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 |
| -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 |
| +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 |
| ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) |
| ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] |
| -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[36:37] |
| +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[42:43] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 |
| ; GLOBALNESS0-NEXT: ; implicit-def: $sgpr4_sgpr5 |
| ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_8 |
| @@ -609,23 +605,23 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS0-NEXT: ; %bb.10: ; %bb33.i |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[32:33], off |
| -; GLOBALNESS0-NEXT: v_readlane_b32 s4, v40, 0 |
| -; GLOBALNESS0-NEXT: v_readlane_b32 s5, v40, 1 |
| -; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], s[36:37] |
| -; GLOBALNESS0-NEXT: s_mov_b32 s75, s39 |
| +; GLOBALNESS0-NEXT: v_readlane_b32 s4, v42, 0 |
| +; GLOBALNESS0-NEXT: v_readlane_b32 s5, v42, 1 |
| ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] |
| ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_12 |
| ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb39.i |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 |
| +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 |
| ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 |
| -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off |
| +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off |
| ; GLOBALNESS0-NEXT: .LBB1_12: ; %bb44.lr.ph.i |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v45 |
| ; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v44, vcc |
| +; GLOBALNESS0-NEXT: s_mov_b64 s[72:73], s[42:43] |
| +; GLOBALNESS0-NEXT: s_mov_b32 s75, s39 |
| ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) |
| -; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[36:37], 0, v[0:1] |
| +; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1] |
| ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[58:59], 0, v2 |
| ; GLOBALNESS0-NEXT: s_branch .LBB1_15 |
| ; GLOBALNESS0-NEXT: .LBB1_13: ; %Flow7 |
| @@ -646,15 +642,15 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_14 |
| ; GLOBALNESS0-NEXT: ; %bb.17: ; %bb50.i |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 |
| -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[40:41] |
| +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[36:37] |
| ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20 |
| ; GLOBALNESS0-NEXT: ; %bb.18: ; %bb3.i.i |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 |
| -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[42:43] |
| +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[40:41] |
| ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_20 |
| ; GLOBALNESS0-NEXT: ; %bb.19: ; %bb6.i.i |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 |
| -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[36:37] |
| +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[56:57] |
| ; GLOBALNESS0-NEXT: .LBB1_20: ; %spam.exit.i |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 |
| ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[90:91] |
| @@ -667,28 +663,28 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] |
| -; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 |
| +; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 |
| -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 |
| +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 |
| ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] |
| ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[44:45], 0, 0 |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], s[64:65] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] |
| -; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 |
| +; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 |
| -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 |
| +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 |
| ; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], a[32:33], off |
| ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[66:67] |
| ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[58:59] |
| ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13 |
| ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 |
| -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 |
| -; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off |
| +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 |
| +; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[40:41], off |
| ; GLOBALNESS0-NEXT: s_branch .LBB1_13 |
| ; GLOBALNESS0-NEXT: .LBB1_23: ; %LeafBlock |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| @@ -703,14 +699,14 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS0-NEXT: s_branch .LBB1_3 |
| ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow14 |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[36:37] |
| ; GLOBALNESS0-NEXT: s_mov_b32 s36, s93 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s37, s93 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s38, s93 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s39, s93 |
| -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] |
| +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[40:41] |
| ; GLOBALNESS0-NEXT: s_mov_b32 s40, s93 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s41, s93 |
| -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[42:43] |
| ; GLOBALNESS0-NEXT: s_mov_b32 s42, s93 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s43, s93 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s44, s93 |
| @@ -739,10 +735,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[26:27], s[62:63], s[62:63] op_sel:[0,1] |
| ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[28:29], s[64:65], s[64:65] op_sel:[0,1] |
| ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[30:31], s[66:67], s[66:67] op_sel:[0,1] |
| -; GLOBALNESS0-NEXT: s_mov_b64 s[42:43], s[6:7] |
| -; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] |
| +; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[6:7] |
| +; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[4:5] |
| ; GLOBALNESS0-NEXT: s_mov_b32 s39, s75 |
| -; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[72:73] |
| +; GLOBALNESS0-NEXT: s_mov_b64 s[42:43], s[72:73] |
| ; GLOBALNESS0-NEXT: .LBB1_26: ; %Flow15 |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[70:71] |
| @@ -750,21 +746,21 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_2 |
| ; GLOBALNESS0-NEXT: ; %bb.27: ; %bb67.i |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| -; GLOBALNESS0-NEXT: v_readlane_b32 s6, v40, 2 |
| -; GLOBALNESS0-NEXT: v_readlane_b32 s7, v40, 3 |
| +; GLOBALNESS0-NEXT: v_readlane_b32 s6, v42, 2 |
| +; GLOBALNESS0-NEXT: v_readlane_b32 s7, v42, 3 |
| ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] |
| ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 |
| ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 |
| +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 |
| ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 |
| -; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[42:43], off |
| +; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off |
| ; GLOBALNESS0-NEXT: s_branch .LBB1_1 |
| ; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i |
| ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 |
| -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 |
| +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 |
| ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[32:33], 0, 0 |
| -; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[42:43], off |
| +; GLOBALNESS0-NEXT: global_store_dwordx2 v[32:33], v[40:41], off |
| ; GLOBALNESS0-NEXT: s_branch .LBB1_2 |
| ; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard |
| ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] |
| @@ -776,10 +772,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] |
| -; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 |
| +; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 |
| -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 |
| +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 |
| ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] |
| ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 |
| ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 |
| @@ -794,10 +790,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[62:63] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[54:55] |
| ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] |
| -; GLOBALNESS0-NEXT: s_mov_b32 s12, s56 |
| +; GLOBALNESS0-NEXT: s_mov_b32 s12, s100 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s13, s99 |
| ; GLOBALNESS0-NEXT: s_mov_b32 s14, s98 |
| -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 |
| +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v43 |
| ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] |
| ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 |
| ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 |
| diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll |
| index 0988921a2452..6dc9b9d0f7e9 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll |
| @@ -9,30 +9,29 @@ define hidden void @widget() { |
| ; GCN-NEXT: s_mov_b32 s16, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 |
| -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[18:19] |
| ; GCN-NEXT: v_writelane_b32 v42, s16, 0 |
| ; GCN-NEXT: s_addk_i32 s32, 0x400 |
| -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GCN-NEXT: ; implicit-def: $vgpr41 |
| -; GCN-NEXT: v_writelane_b32 v41, s30, 0 |
| -; GCN-NEXT: v_writelane_b32 v41, s31, 1 |
| -; GCN-NEXT: v_writelane_b32 v41, s34, 2 |
| -; GCN-NEXT: v_writelane_b32 v41, s35, 3 |
| -; GCN-NEXT: v_writelane_b32 v41, s36, 4 |
| -; GCN-NEXT: v_writelane_b32 v41, s37, 5 |
| -; GCN-NEXT: v_writelane_b32 v41, s38, 6 |
| -; GCN-NEXT: v_writelane_b32 v41, s39, 7 |
| -; GCN-NEXT: v_writelane_b32 v41, s40, 8 |
| -; GCN-NEXT: v_writelane_b32 v41, s41, 9 |
| -; GCN-NEXT: v_writelane_b32 v41, s42, 10 |
| -; GCN-NEXT: v_writelane_b32 v41, s43, 11 |
| -; GCN-NEXT: v_writelane_b32 v41, s44, 12 |
| -; GCN-NEXT: v_writelane_b32 v41, s45, 13 |
| -; GCN-NEXT: v_writelane_b32 v41, s46, 14 |
| -; GCN-NEXT: v_writelane_b32 v41, s47, 15 |
| -; GCN-NEXT: v_mov_b32_e32 v40, v31 |
| +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GCN-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GCN-NEXT: v_writelane_b32 v40, s34, 2 |
| +; GCN-NEXT: v_writelane_b32 v40, s35, 3 |
| +; GCN-NEXT: v_writelane_b32 v40, s36, 4 |
| +; GCN-NEXT: v_writelane_b32 v40, s37, 5 |
| +; GCN-NEXT: v_writelane_b32 v40, s38, 6 |
| +; GCN-NEXT: v_writelane_b32 v40, s39, 7 |
| +; GCN-NEXT: v_writelane_b32 v40, s40, 8 |
| +; GCN-NEXT: v_writelane_b32 v40, s41, 9 |
| +; GCN-NEXT: v_writelane_b32 v40, s42, 10 |
| +; GCN-NEXT: v_writelane_b32 v40, s43, 11 |
| +; GCN-NEXT: v_writelane_b32 v40, s44, 12 |
| +; GCN-NEXT: v_writelane_b32 v40, s45, 13 |
| +; GCN-NEXT: v_writelane_b32 v40, s46, 14 |
| +; GCN-NEXT: v_writelane_b32 v40, s47, 15 |
| +; GCN-NEXT: v_mov_b32_e32 v41, v31 |
| ; GCN-NEXT: s_mov_b32 s42, s15 |
| ; GCN-NEXT: s_mov_b32 s43, s14 |
| ; GCN-NEXT: s_mov_b32 s44, s13 |
| @@ -69,7 +68,7 @@ define hidden void @widget() { |
| ; GCN-NEXT: s_mov_b32 s13, s44 |
| ; GCN-NEXT: s_mov_b32 s14, s43 |
| ; GCN-NEXT: s_mov_b32 s15, s42 |
| -; GCN-NEXT: v_mov_b32_e32 v31, v40 |
| +; GCN-NEXT: v_mov_b32_e32 v31, v41 |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 |
| ; GCN-NEXT: s_mov_b64 s[4:5], 0 |
| @@ -101,29 +100,29 @@ define hidden void @widget() { |
| ; GCN-NEXT: s_mov_b32 s13, s44 |
| ; GCN-NEXT: s_mov_b32 s14, s43 |
| ; GCN-NEXT: s_mov_b32 s15, s42 |
| -; GCN-NEXT: v_mov_b32_e32 v31, v40 |
| +; GCN-NEXT: v_mov_b32_e32 v31, v41 |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| ; GCN-NEXT: .LBB0_8: ; %UnifiedReturnBlock |
| -; GCN-NEXT: v_readlane_b32 s47, v41, 15 |
| -; GCN-NEXT: v_readlane_b32 s46, v41, 14 |
| -; GCN-NEXT: v_readlane_b32 s45, v41, 13 |
| -; GCN-NEXT: v_readlane_b32 s44, v41, 12 |
| -; GCN-NEXT: v_readlane_b32 s43, v41, 11 |
| -; GCN-NEXT: v_readlane_b32 s42, v41, 10 |
| -; GCN-NEXT: v_readlane_b32 s41, v41, 9 |
| -; GCN-NEXT: v_readlane_b32 s40, v41, 8 |
| -; GCN-NEXT: v_readlane_b32 s39, v41, 7 |
| -; GCN-NEXT: v_readlane_b32 s38, v41, 6 |
| -; GCN-NEXT: v_readlane_b32 s37, v41, 5 |
| -; GCN-NEXT: v_readlane_b32 s36, v41, 4 |
| -; GCN-NEXT: v_readlane_b32 s35, v41, 3 |
| -; GCN-NEXT: v_readlane_b32 s34, v41, 2 |
| -; GCN-NEXT: v_readlane_b32 s31, v41, 1 |
| -; GCN-NEXT: v_readlane_b32 s30, v41, 0 |
| -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; GCN-NEXT: v_readlane_b32 s47, v40, 15 |
| +; GCN-NEXT: v_readlane_b32 s46, v40, 14 |
| +; GCN-NEXT: v_readlane_b32 s45, v40, 13 |
| +; GCN-NEXT: v_readlane_b32 s44, v40, 12 |
| +; GCN-NEXT: v_readlane_b32 s43, v40, 11 |
| +; GCN-NEXT: v_readlane_b32 s42, v40, 10 |
| +; GCN-NEXT: v_readlane_b32 s41, v40, 9 |
| +; GCN-NEXT: v_readlane_b32 s40, v40, 8 |
| +; GCN-NEXT: v_readlane_b32 s39, v40, 7 |
| +; GCN-NEXT: v_readlane_b32 s38, v40, 6 |
| +; GCN-NEXT: v_readlane_b32 s37, v40, 5 |
| +; GCN-NEXT: v_readlane_b32 s36, v40, 4 |
| +; GCN-NEXT: v_readlane_b32 s35, v40, 3 |
| +; GCN-NEXT: v_readlane_b32 s34, v40, 2 |
| +; GCN-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GCN-NEXT: v_readlane_b32 s30, v40, 0 |
| +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GCN-NEXT: v_readlane_b32 s4, v42, 0 |
| ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| ; GCN-NEXT: s_mov_b64 exec, s[6:7] |
| ; GCN-NEXT: s_addk_i32 s32, 0xfc00 |
| @@ -268,38 +267,36 @@ define hidden void @blam() { |
| ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GCN-NEXT: s_mov_b32 s16, s33 |
| ; GCN-NEXT: s_mov_b32 s33, s32 |
| -; GCN-NEXT: s_xor_saveexec_b64 s[18:19], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, -1 |
| -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill |
| +; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 |
| +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill |
| ; GCN-NEXT: s_mov_b64 exec, s[18:19] |
| -; GCN-NEXT: v_writelane_b32 v45, s16, 0 |
| +; GCN-NEXT: v_writelane_b32 v46, s16, 0 |
| ; GCN-NEXT: s_addk_i32 s32, 0x800 |
| -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-NEXT: v_writelane_b32 v0, s30, 0 |
| -; GCN-NEXT: v_writelane_b32 v0, s31, 1 |
| -; GCN-NEXT: v_writelane_b32 v0, s34, 2 |
| -; GCN-NEXT: v_writelane_b32 v0, s35, 3 |
| -; GCN-NEXT: v_writelane_b32 v0, s36, 4 |
| -; GCN-NEXT: v_writelane_b32 v0, s37, 5 |
| -; GCN-NEXT: v_writelane_b32 v0, s38, 6 |
| -; GCN-NEXT: v_writelane_b32 v0, s39, 7 |
| -; GCN-NEXT: v_writelane_b32 v0, s40, 8 |
| -; GCN-NEXT: v_writelane_b32 v0, s41, 9 |
| -; GCN-NEXT: v_writelane_b32 v0, s42, 10 |
| -; GCN-NEXT: v_writelane_b32 v0, s43, 11 |
| -; GCN-NEXT: v_writelane_b32 v0, s44, 12 |
| -; GCN-NEXT: v_writelane_b32 v0, s45, 13 |
| -; GCN-NEXT: v_writelane_b32 v0, s46, 14 |
| -; GCN-NEXT: v_writelane_b32 v0, s47, 15 |
| -; GCN-NEXT: v_writelane_b32 v0, s48, 16 |
| -; GCN-NEXT: v_writelane_b32 v0, s49, 17 |
| -; GCN-NEXT: v_mov_b32_e32 v40, v31 |
| +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GCN-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GCN-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GCN-NEXT: v_writelane_b32 v40, s34, 2 |
| +; GCN-NEXT: v_writelane_b32 v40, s35, 3 |
| +; GCN-NEXT: v_writelane_b32 v40, s36, 4 |
| +; GCN-NEXT: v_writelane_b32 v40, s37, 5 |
| +; GCN-NEXT: v_writelane_b32 v40, s38, 6 |
| +; GCN-NEXT: v_writelane_b32 v40, s39, 7 |
| +; GCN-NEXT: v_writelane_b32 v40, s40, 8 |
| +; GCN-NEXT: v_writelane_b32 v40, s41, 9 |
| +; GCN-NEXT: v_writelane_b32 v40, s42, 10 |
| +; GCN-NEXT: v_writelane_b32 v40, s43, 11 |
| +; GCN-NEXT: v_writelane_b32 v40, s44, 12 |
| +; GCN-NEXT: v_writelane_b32 v40, s45, 13 |
| +; GCN-NEXT: v_writelane_b32 v40, s46, 14 |
| +; GCN-NEXT: v_writelane_b32 v40, s47, 15 |
| +; GCN-NEXT: v_writelane_b32 v40, s48, 16 |
| +; GCN-NEXT: v_writelane_b32 v40, s49, 17 |
| +; GCN-NEXT: v_mov_b32_e32 v41, v31 |
| ; GCN-NEXT: s_mov_b32 s44, s15 |
| ; GCN-NEXT: s_mov_b32 s45, s14 |
| ; GCN-NEXT: s_mov_b32 s46, s13 |
| @@ -311,24 +308,24 @@ define hidden void @blam() { |
| ; GCN-NEXT: s_mov_b64 s[4:5], 0 |
| ; GCN-NEXT: v_mov_b32_e32 v0, 0 |
| ; GCN-NEXT: v_mov_b32_e32 v1, 0 |
| -; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40 |
| -; GCN-NEXT: v_mov_b32_e32 v42, 0 |
| -; GCN-NEXT: flat_load_dword v43, v[0:1] |
| -; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000 |
| +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v41 |
| +; GCN-NEXT: v_mov_b32_e32 v43, 0 |
| +; GCN-NEXT: flat_load_dword v44, v[0:1] |
| +; GCN-NEXT: v_mov_b32_e32 v45, 0x7fc00000 |
| ; GCN-NEXT: s_getpc_b64 s[48:49] |
| ; GCN-NEXT: s_add_u32 s48, s48, spam@rel32@lo+4 |
| ; GCN-NEXT: s_addc_u32 s49, s49, spam@rel32@hi+12 |
| -; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v2 |
| +; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2 |
| ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: v_cmp_eq_f32_e64 s[42:43], 0, v43 |
| +; GCN-NEXT: v_cmp_eq_f32_e64 s[42:43], 0, v44 |
| ; GCN-NEXT: s_branch .LBB1_3 |
| ; GCN-NEXT: .LBB1_1: ; %bb10 |
| ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 |
| ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] |
| -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 |
| +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 |
| ; GCN-NEXT: .LBB1_2: ; %bb18 |
| ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 |
| -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 |
| +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 |
| ; GCN-NEXT: s_mov_b64 s[4:5], 0 |
| ; GCN-NEXT: .LBB1_3: ; %bb2 |
| ; GCN-NEXT: ; =>This Loop Header: Depth=1 |
| @@ -337,8 +334,8 @@ define hidden void @blam() { |
| ; GCN-NEXT: .LBB1_4: ; %bb2 |
| ; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 |
| ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 |
| -; GCN-NEXT: flat_load_dword v0, v[41:42] |
| -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0 |
| +; GCN-NEXT: flat_load_dword v0, v[42:43] |
| +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 |
| ; GCN-NEXT: s_waitcnt vmcnt(1) |
| ; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 3, v0 |
| ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc |
| @@ -370,7 +367,7 @@ define hidden void @blam() { |
| ; GCN-NEXT: s_mov_b32 s13, s46 |
| ; GCN-NEXT: s_mov_b32 s14, s45 |
| ; GCN-NEXT: s_mov_b32 s15, s44 |
| -; GCN-NEXT: v_mov_b32_e32 v31, v40 |
| +; GCN-NEXT: v_mov_b32_e32 v31, v41 |
| ; GCN-NEXT: s_swappc_b64 s[30:31], s[48:49] |
| ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 0, v0 |
| ; GCN-NEXT: s_mov_b64 s[4:5], 0 |
| @@ -385,10 +382,10 @@ define hidden void @blam() { |
| ; GCN-NEXT: ; %bb.9: ; %bb16 |
| ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 |
| ; GCN-NEXT: s_or_b64 exec, exec, s[4:5] |
| -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 |
| +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 |
| ; GCN-NEXT: .LBB1_10: ; %bb17 |
| ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 |
| -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 |
| +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 |
| ; GCN-NEXT: s_branch .LBB1_2 |
| bb: |
| %tmp = load float, ptr null, align 16 |
| diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll |
| index a97c436d303b..7285510a5f89 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll |
| @@ -14,7 +14,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, |
| ; GFX9-NEXT: s_mov_b32 s4, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: v_mov_b32_e32 v36, v16 |
| @@ -22,10 +22,10 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, |
| ; GFX9-NEXT: v_mov_b32_e32 v34, v14 |
| ; GFX9-NEXT: v_mov_b32_e32 v33, v13 |
| ; GFX9-NEXT: v_mov_b32_e32 v32, v12 |
| -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX9-NEXT: ;;#ASMSTART |
| ; GFX9-NEXT: ;;#ASMEND |
| ; GFX9-NEXT: ;;#ASMSTART |
| @@ -34,31 +34,30 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, |
| ; GFX9-NEXT: ;;#ASMEND |
| ; GFX9-NEXT: ;;#ASMSTART |
| ; GFX9-NEXT: ;;#ASMEND |
| -; GFX9-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 |
| +; GFX9-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x800 |
| ; GFX9-NEXT: v_writelane_b32 v45, s4, 0 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr44 |
| -; GFX9-NEXT: v_writelane_b32 v44, s30, 0 |
| -; GFX9-NEXT: v_writelane_b32 v44, s31, 1 |
| +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| -; GFX9-NEXT: v_mov_b32_e32 v0, v40 |
| -; GFX9-NEXT: v_mov_b32_e32 v1, v41 |
| -; GFX9-NEXT: v_mov_b32_e32 v2, v42 |
| -; GFX9-NEXT: v_mov_b32_e32 v3, v43 |
| -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| -; GFX9-NEXT: v_readlane_b32 s31, v44, 1 |
| -; GFX9-NEXT: v_readlane_b32 s30, v44, 0 |
| +; GFX9-NEXT: v_mov_b32_e32 v0, v41 |
| +; GFX9-NEXT: v_mov_b32_e32 v1, v42 |
| +; GFX9-NEXT: v_mov_b32_e32 v2, v43 |
| +; GFX9-NEXT: v_mov_b32_e32 v3, v44 |
| +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s4, v45, 0 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload |
| ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xf800 |
| @@ -73,7 +72,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, |
| ; GFX10-NEXT: s_mov_b32 s4, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 |
| -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s5 |
| @@ -82,10 +81,10 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, |
| ; GFX10-NEXT: v_mov_b32_e32 v34, v14 |
| ; GFX10-NEXT: v_mov_b32_e32 v33, v13 |
| ; GFX10-NEXT: v_mov_b32_e32 v32, v12 |
| -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ;;#ASMEND |
| ; GFX10-NEXT: ;;#ASMSTART |
| @@ -94,33 +93,32 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, |
| ; GFX10-NEXT: ;;#ASMEND |
| ; GFX10-NEXT: ;;#ASMSTART |
| ; GFX10-NEXT: ;;#ASMEND |
| -; GFX10-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D |
| +; GFX10-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D |
| ; GFX10-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX10-NEXT: v_writelane_b32 v45, s4, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 |
| -; GFX10-NEXT: ; implicit-def: $vgpr44 |
| +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX10-NEXT: v_writelane_b32 v44, s30, 0 |
| -; GFX10-NEXT: v_writelane_b32 v44, s31, 1 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| -; GFX10-NEXT: v_mov_b32_e32 v0, v40 |
| -; GFX10-NEXT: v_mov_b32_e32 v1, v41 |
| -; GFX10-NEXT: v_mov_b32_e32 v2, v42 |
| -; GFX10-NEXT: v_mov_b32_e32 v3, v43 |
| +; GFX10-NEXT: v_mov_b32_e32 v0, v41 |
| +; GFX10-NEXT: v_mov_b32_e32 v1, v42 |
| +; GFX10-NEXT: v_mov_b32_e32 v2, v43 |
| +; GFX10-NEXT: v_mov_b32_e32 v3, v44 |
| ; GFX10-NEXT: s_clause 0x3 |
| -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 |
| -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 |
| -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 |
| -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 |
| -; GFX10-NEXT: v_readlane_b32 s31, v44, 1 |
| -; GFX10-NEXT: v_readlane_b32 s30, v44, 0 |
| +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 |
| +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 |
| +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 |
| +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 |
| +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s4, v45, 0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 |
| ; GFX10-NEXT: s_clause 0x1 |
| -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 |
| +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 |
| ; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s5 |
| @@ -137,17 +135,17 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, |
| ; GFX11-NEXT: s_mov_b32 s33, s32 |
| ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 |
| ; GFX11-NEXT: s_clause 0x1 |
| -; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:16 |
| +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 |
| ; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15 |
| ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 |
| ; GFX11-NEXT: v_mov_b32_e32 v32, v12 |
| ; GFX11-NEXT: s_clause 0x3 |
| -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:12 |
| -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:8 |
| -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:4 |
| -; GFX11-NEXT: scratch_store_b32 off, v43, s33 |
| +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 |
| +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 |
| +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4 |
| +; GFX11-NEXT: scratch_store_b32 off, v44, s33 |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ;;#ASMEND |
| ; GFX11-NEXT: ;;#ASMSTART |
| @@ -156,31 +154,30 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, |
| ; GFX11-NEXT: ;;#ASMEND |
| ; GFX11-NEXT: ;;#ASMSTART |
| ; GFX11-NEXT: ;;#ASMEND |
| -; GFX11-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D |
| +; GFX11-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D |
| ; GFX11-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX11-NEXT: v_writelane_b32 v45, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 |
| -; GFX11-NEXT: ; implicit-def: $vgpr44 |
| +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| -; GFX11-NEXT: v_writelane_b32 v44, s30, 0 |
| -; GFX11-NEXT: v_writelane_b32 v44, s31, 1 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: v_dual_mov_b32 v0, v40 :: v_dual_mov_b32 v1, v41 |
| -; GFX11-NEXT: v_dual_mov_b32 v2, v42 :: v_dual_mov_b32 v3, v43 |
| +; GFX11-NEXT: v_dual_mov_b32 v0, v41 :: v_dual_mov_b32 v1, v42 |
| +; GFX11-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_mov_b32 v3, v44 |
| ; GFX11-NEXT: s_clause 0x3 |
| -; GFX11-NEXT: scratch_load_b32 v43, off, s33 |
| -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:4 |
| -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:8 |
| -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 |
| -; GFX11-NEXT: v_readlane_b32 s31, v44, 1 |
| -; GFX11-NEXT: v_readlane_b32 s30, v44, 0 |
| +; GFX11-NEXT: scratch_load_b32 v44, off, s33 |
| +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 |
| +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 |
| +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 |
| +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v45, 0 |
| ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 |
| ; GFX11-NEXT: s_clause 0x1 |
| -; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 |
| +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 |
| ; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 |
| @@ -218,45 +215,44 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, |
| ; GFX9-NEXT: s_mov_b32 s4, s33 |
| ; GFX9-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill |
| ; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GFX9-NEXT: v_mov_b32_e32 v44, v16 |
| -; GFX9-NEXT: v_mov_b32_e32 v43, v15 |
| -; GFX9-NEXT: v_mov_b32_e32 v42, v14 |
| -; GFX9-NEXT: v_mov_b32_e32 v41, v13 |
| -; GFX9-NEXT: v_mov_b32_e32 v40, v12 |
| -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1 |
| +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX9-NEXT: v_mov_b32_e32 v45, v16 |
| +; GFX9-NEXT: v_mov_b32_e32 v44, v15 |
| +; GFX9-NEXT: v_mov_b32_e32 v43, v14 |
| +; GFX9-NEXT: v_mov_b32_e32 v42, v13 |
| +; GFX9-NEXT: v_mov_b32_e32 v41, v12 |
| +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 |
| ; GFX9-NEXT: s_addk_i32 s32, 0x800 |
| ; GFX9-NEXT: v_writelane_b32 v46, s4, 0 |
| ; GFX9-NEXT: s_getpc_b64 s[4:5] |
| ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 |
| ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 |
| ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX9-NEXT: ; implicit-def: $vgpr45 |
| -; GFX9-NEXT: v_writelane_b32 v45, s30, 0 |
| -; GFX9-NEXT: v_writelane_b32 v45, s31, 1 |
| +; GFX9-NEXT: v_writelane_b32 v40, s30, 0 |
| +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 |
| ; GFX9-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off |
| ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1 |
| +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 |
| ; GFX9-NEXT: s_nop 0 |
| -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload |
| -; GFX9-NEXT: v_readlane_b32 s31, v45, 1 |
| -; GFX9-NEXT: v_readlane_b32 s30, v45, 0 |
| +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload |
| +; GFX9-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX9-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX9-NEXT: v_readlane_b32 s4, v46, 0 |
| ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 |
| -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload |
| +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload |
| ; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload |
| ; GFX9-NEXT: s_mov_b64 exec, s[6:7] |
| ; GFX9-NEXT: s_addk_i32 s32, 0xf800 |
| @@ -271,47 +267,46 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, |
| ; GFX10-NEXT: s_mov_b32 s4, s33 |
| ; GFX10-NEXT: s_mov_b32 s33, s32 |
| ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 |
| -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill |
| ; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s5 |
| -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill |
| ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D |
| ; GFX10-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX10-NEXT: v_writelane_b32 v46, s4, 0 |
| ; GFX10-NEXT: s_getpc_b64 s[4:5] |
| ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 |
| ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 |
| -; GFX10-NEXT: ; implicit-def: $vgpr45 |
| -; GFX10-NEXT: v_mov_b32_e32 v40, v16 |
| +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 |
| -; GFX10-NEXT: v_writelane_b32 v45, s30, 0 |
| -; GFX10-NEXT: v_mov_b32_e32 v41, v15 |
| -; GFX10-NEXT: v_mov_b32_e32 v42, v14 |
| -; GFX10-NEXT: v_mov_b32_e32 v43, v13 |
| -; GFX10-NEXT: v_mov_b32_e32 v44, v12 |
| -; GFX10-NEXT: v_writelane_b32 v45, s31, 1 |
| +; GFX10-NEXT: v_mov_b32_e32 v41, v16 |
| +; GFX10-NEXT: v_mov_b32_e32 v42, v15 |
| +; GFX10-NEXT: v_mov_b32_e32 v43, v14 |
| +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX10-NEXT: v_mov_b32_e32 v44, v13 |
| +; GFX10-NEXT: v_mov_b32_e32 v45, v12 |
| ; GFX10-NEXT: s_waitcnt vmcnt(0) |
| ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off |
| ; GFX10-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] |
| -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D |
| +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D |
| ; GFX10-NEXT: s_clause 0x4 |
| -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 |
| -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 |
| -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 |
| -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 |
| -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 |
| -; GFX10-NEXT: v_readlane_b32 s31, v45, 1 |
| -; GFX10-NEXT: v_readlane_b32 s30, v45, 0 |
| +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 |
| +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 |
| +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 |
| +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 |
| +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 |
| +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX10-NEXT: v_readlane_b32 s4, v46, 0 |
| ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 |
| ; GFX10-NEXT: s_clause 0x1 |
| -; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 |
| +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 |
| ; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 |
| ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 |
| ; GFX10-NEXT: s_mov_b32 exec_lo, s5 |
| @@ -328,45 +323,44 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, |
| ; GFX11-NEXT: s_mov_b32 s33, s32 |
| ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 |
| ; GFX11-NEXT: s_clause 0x1 |
| -; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 |
| +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:20 |
| ; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:24 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: s_clause 0x4 |
| -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 |
| -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 |
| -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 |
| -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4 |
| -; GFX11-NEXT: scratch_store_b32 off, v44, s33 |
| +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 |
| +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:12 |
| +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:8 |
| +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:4 |
| +; GFX11-NEXT: scratch_store_b32 off, v45, s33 |
| ; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D |
| ; GFX11-NEXT: s_add_i32 s32, s32, 32 |
| ; GFX11-NEXT: v_writelane_b32 v46, s0, 0 |
| ; GFX11-NEXT: s_getpc_b64 s[0:1] |
| ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 |
| ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 |
| -; GFX11-NEXT: ; implicit-def: $vgpr45 |
| -; GFX11-NEXT: v_dual_mov_b32 v40, v16 :: v_dual_mov_b32 v41, v15 |
| +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 |
| ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 |
| -; GFX11-NEXT: v_writelane_b32 v45, s30, 0 |
| -; GFX11-NEXT: v_dual_mov_b32 v42, v14 :: v_dual_mov_b32 v43, v13 |
| -; GFX11-NEXT: v_mov_b32_e32 v44, v12 |
| -; GFX11-NEXT: v_writelane_b32 v45, s31, 1 |
| +; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15 |
| +; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13 |
| +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 |
| +; GFX11-NEXT: v_mov_b32_e32 v45, v12 |
| ; GFX11-NEXT: s_waitcnt vmcnt(0) |
| ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off |
| ; GFX11-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] |
| -; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D |
| +; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D |
| ; GFX11-NEXT: s_clause 0x4 |
| -; GFX11-NEXT: scratch_load_b32 v44, off, s33 |
| -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 |
| -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 |
| -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 |
| -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 |
| -; GFX11-NEXT: v_readlane_b32 s31, v45, 1 |
| -; GFX11-NEXT: v_readlane_b32 s30, v45, 0 |
| +; GFX11-NEXT: scratch_load_b32 v45, off, s33 |
| +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:4 |
| +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:8 |
| +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:12 |
| +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16 |
| +; GFX11-NEXT: v_readlane_b32 s31, v40, 1 |
| +; GFX11-NEXT: v_readlane_b32 s30, v40, 0 |
| ; GFX11-NEXT: v_readlane_b32 s0, v46, 0 |
| ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 |
| ; GFX11-NEXT: s_clause 0x1 |
| -; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 |
| +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:20 |
| ; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:24 |
| ; GFX11-NEXT: s_mov_b32 exec_lo, s1 |
| ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 |
| diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll |
| index d721bf48e445..09106f0da591 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll |
| @@ -16,11 +16,10 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id |
| ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 |
| ; CHECK-NEXT: s_add_u32 s0, s0, s17 |
| ; CHECK-NEXT: s_addc_u32 s1, s1, 0 |
| -; CHECK-NEXT: ; implicit-def: $vgpr3 |
| -; CHECK-NEXT: v_writelane_b32 v3, s16, 0 |
| +; CHECK-NEXT: v_writelane_b32 v40, s16, 0 |
| ; CHECK-NEXT: s_mov_b32 s13, s15 |
| ; CHECK-NEXT: s_mov_b32 s12, s14 |
| -; CHECK-NEXT: v_readlane_b32 s14, v3, 0 |
| +; CHECK-NEXT: v_readlane_b32 s14, v40, 0 |
| ; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] |
| ; CHECK-NEXT: s_load_dwordx2 s[8:9], s[16:17], 0x0 |
| ; CHECK-NEXT: v_mov_b32_e32 v5, 42 |
| diff --git a/llvm/test/CodeGen/AMDGPU/wwm-register-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/wwm-register-spill-during-regalloc.ll |
| deleted file mode 100644 |
| index d3144b2648fc..000000000000 |
| --- a/llvm/test/CodeGen/AMDGPU/wwm-register-spill-during-regalloc.ll |
| +++ /dev/null |
| @@ -1,166 +0,0 @@ |
| -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=virtregrewriter,1 --verify-machineinstrs -o - %s | FileCheck -check-prefix=WWM-SPILL %s |
| -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -stop-after=regallocfast,1 --verify-machineinstrs -o - %s | FileCheck -check-prefix=WWM-SPILL-O0 %s |
| -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s |
| -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-O0 %s |
| - |
| -; Test whole-wave register spilling. |
| - |
| -; In the testcase, the return address registers (SGPR30_SGPR31) should be preserved across the call. |
| -; Since the test limits the VGPR numbers, they are all in the call-clobber (scratch) range and RA should |
| -; spill any VGPR borrowed for spilling SGPRs. The writelane/readlane instructions that spill/restore |
| -; SGPRs into/from VGPR are whole-wave operations and hence the VGPRs involved in such operations require |
| -; whole-wave spilling. |
| - |
| -define void @test() #0 { |
| -; WWM-SPILL-LABEL: name: test |
| -; WWM-SPILL: bb.0 (%ir-block.0): |
| -; WWM-SPILL-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 |
| -; WWM-SPILL-NEXT: {{ $}} |
| -; WWM-SPILL-NEXT: renamable $vgpr0 = IMPLICIT_DEF |
| -; WWM-SPILL-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr30, 0, killed $vgpr0 |
| -; WWM-SPILL-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr31, 1, killed $vgpr0 |
| -; WWM-SPILL-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) |
| -; WWM-SPILL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 |
| -; WWM-SPILL-NEXT: renamable $sgpr16_sgpr17 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @ext_func + 4, target-flags(amdgpu-gotprel32-hi) @ext_func + 12, implicit-def dead $scc |
| -; WWM-SPILL-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) |
| -; WWM-SPILL-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @ext_func, csr_amdgpu, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| -; WWM-SPILL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 |
| -; WWM-SPILL-NEXT: renamable $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) |
| -; WWM-SPILL-NEXT: $sgpr31 = V_READLANE_B32 $vgpr0, 1 |
| -; WWM-SPILL-NEXT: $sgpr30 = V_READLANE_B32 killed $vgpr0, 0 |
| -; WWM-SPILL-NEXT: SI_RETURN |
| -; |
| -; WWM-SPILL-O0-LABEL: name: test |
| -; WWM-SPILL-O0: bb.0 (%ir-block.0): |
| -; WWM-SPILL-O0-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr30, $sgpr31, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 |
| -; WWM-SPILL-O0-NEXT: {{ $}} |
| -; WWM-SPILL-O0-NEXT: renamable $vgpr0 = IMPLICIT_DEF |
| -; WWM-SPILL-O0-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr0 |
| -; WWM-SPILL-O0-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr0 |
| -; WWM-SPILL-O0-NEXT: SI_SPILL_WWM_V32_SAVE $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) |
| -; WWM-SPILL-O0-NEXT: renamable $vgpr0 = COPY $vgpr31 |
| -; WWM-SPILL-O0-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 |
| -; WWM-SPILL-O0-NEXT: renamable $sgpr16_sgpr17 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @ext_func + 4, target-flags(amdgpu-gotprel32-hi) @ext_func + 12, implicit-def dead $scc |
| -; WWM-SPILL-O0-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) |
| -; WWM-SPILL-O0-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 |
| -; WWM-SPILL-O0-NEXT: $vgpr31 = COPY killed renamable $vgpr0 |
| -; WWM-SPILL-O0-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed renamable $sgpr20_sgpr21_sgpr22_sgpr23 |
| -; WWM-SPILL-O0-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @ext_func, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 |
| -; WWM-SPILL-O0-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) |
| -; WWM-SPILL-O0-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 |
| -; WWM-SPILL-O0-NEXT: dead $sgpr31 = V_READLANE_B32 $vgpr0, 1 |
| -; WWM-SPILL-O0-NEXT: dead $sgpr30 = V_READLANE_B32 killed $vgpr0, 0 |
| -; WWM-SPILL-O0-NEXT: SI_RETURN |
| -; |
| -; GCN-LABEL: test: |
| -; GCN: ; %bb.0: |
| -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-NEXT: s_mov_b32 s16, s33 |
| -; GCN-NEXT: s_mov_b32 s33, s32 |
| -; GCN-NEXT: s_xor_saveexec_b64 s[18:19], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[18:19] |
| -; GCN-NEXT: v_mov_b32_e32 v1, s34 |
| -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| -; GCN-NEXT: v_mov_b32_e32 v1, s35 |
| -; GCN-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| -; GCN-NEXT: v_mov_b32_e32 v1, s16 |
| -; GCN-NEXT: v_writelane_b32 v0, s30, 0 |
| -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_addk_i32 s32, 0x800 |
| -; GCN-NEXT: v_writelane_b32 v0, s31, 1 |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| -; GCN-NEXT: s_getpc_b64 s[16:17] |
| -; GCN-NEXT: s_add_u32 s16, s16, ext_func@gotpcrel32@lo+4 |
| -; GCN-NEXT: s_addc_u32 s17, s17, ext_func@gotpcrel32@hi+12 |
| -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 |
| -; GCN-NEXT: s_waitcnt lgkmcnt(0) |
| -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[34:35] |
| -; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: v_readlane_b32 s31, v0, 1 |
| -; GCN-NEXT: v_readlane_b32 s30, v0, 0 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: v_readfirstlane_b32 s34, v0 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: v_readfirstlane_b32 s35, v0 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: v_readfirstlane_b32 s4, v0 |
| -; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 |
| -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| -; GCN-NEXT: s_mov_b64 exec, s[6:7] |
| -; GCN-NEXT: s_addk_i32 s32, 0xf800 |
| -; GCN-NEXT: s_mov_b32 s33, s4 |
| -; GCN-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-NEXT: s_setpc_b64 s[30:31] |
| -; |
| -; GCN-O0-LABEL: test: |
| -; GCN-O0: ; %bb.0: |
| -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GCN-O0-NEXT: s_mov_b32 s16, s33 |
| -; GCN-O0-NEXT: s_mov_b32 s33, s32 |
| -; GCN-O0-NEXT: s_xor_saveexec_b64 s[18:19], -1 |
| -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; GCN-O0-NEXT: s_mov_b64 exec, s[18:19] |
| -; GCN-O0-NEXT: v_mov_b32_e32 v1, s34 |
| -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| -; GCN-O0-NEXT: v_mov_b32_e32 v1, s35 |
| -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| -; GCN-O0-NEXT: v_mov_b32_e32 v1, s16 |
| -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| -; GCN-O0-NEXT: s_add_i32 s32, s32, 0x800 |
| -; GCN-O0-NEXT: ; implicit-def: $vgpr0 |
| -; GCN-O0-NEXT: v_writelane_b32 v0, s30, 0 |
| -; GCN-O0-NEXT: v_writelane_b32 v0, s31, 1 |
| -; GCN-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GCN-O0-NEXT: s_mov_b64 exec, s[34:35] |
| -; GCN-O0-NEXT: v_mov_b32_e32 v0, v31 |
| -; GCN-O0-NEXT: s_getpc_b64 s[16:17] |
| -; GCN-O0-NEXT: s_add_u32 s16, s16, ext_func@gotpcrel32@lo+4 |
| -; GCN-O0-NEXT: s_addc_u32 s17, s17, ext_func@gotpcrel32@hi+12 |
| -; GCN-O0-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 |
| -; GCN-O0-NEXT: s_mov_b64 s[22:23], s[2:3] |
| -; GCN-O0-NEXT: s_mov_b64 s[20:21], s[0:1] |
| -; GCN-O0-NEXT: v_mov_b32_e32 v31, v0 |
| -; GCN-O0-NEXT: s_mov_b64 s[0:1], s[20:21] |
| -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[22:23] |
| -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) |
| -; GCN-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] |
| -; GCN-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GCN-O0-NEXT: s_mov_b64 exec, s[34:35] |
| -; GCN-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-O0-NEXT: v_readlane_b32 s31, v0, 1 |
| -; GCN-O0-NEXT: v_readlane_b32 s30, v0, 0 |
| -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| -; GCN-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-O0-NEXT: v_readfirstlane_b32 s34, v0 |
| -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| -; GCN-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-O0-NEXT: v_readfirstlane_b32 s35, v0 |
| -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload |
| -; GCN-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-O0-NEXT: v_readfirstlane_b32 s4, v0 |
| -; GCN-O0-NEXT: s_xor_saveexec_b64 s[6:7], -1 |
| -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] |
| -; GCN-O0-NEXT: s_add_i32 s32, s32, 0xfffff800 |
| -; GCN-O0-NEXT: s_mov_b32 s33, s4 |
| -; GCN-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GCN-O0-NEXT: s_setpc_b64 s[30:31] |
| - call void @ext_func() |
| - ret void |
| -} |
| - |
| -declare void @ext_func(); |
| - |
| -attributes #0 = { nounwind "amdgpu-num-vgpr"="4" } |
| diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll |
| index 92e929c5df10..f4765a328618 100644 |
| --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll |
| +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll |
| @@ -133,12 +133,10 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { |
| ; GFX9-O0: ; %bb.0: ; %entry |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 |
| -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 |
| ; GFX9-O0-NEXT: s_mov_b32 s36, s4 |
| ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 |
| ; GFX9-O0-NEXT: s_mov_b32 s37, s5 |
| @@ -146,17 +144,16 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { |
| ; GFX9-O0-NEXT: s_mov_b32 s39, s7 |
| ; GFX9-O0-NEXT: s_mov_b64 s[42:43], s[38:39] |
| ; GFX9-O0-NEXT: s_mov_b64 s[40:41], s[36:37] |
| -; GFX9-O0-NEXT: ; implicit-def: $vgpr0 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s42, 2 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s43, 3 |
| +; GFX9-O0-NEXT: v_writelane_b32 v3, s40, 0 |
| +; GFX9-O0-NEXT: v_writelane_b32 v3, s41, 1 |
| +; GFX9-O0-NEXT: v_writelane_b32 v3, s42, 2 |
| +; GFX9-O0-NEXT: v_writelane_b32 v3, s43, 3 |
| ; GFX9-O0-NEXT: s_mov_b32 s34, 0 |
| ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34 |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 |
| ; GFX9-O0-NEXT: s_not_b64 exec, exec |
| @@ -169,25 +166,22 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { |
| ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 |
| -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, s34 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 |
| -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 |
| -; GFX9-O0-NEXT: s_or_saveexec_b64 s[44:45], -1 |
| +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, s34 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 |
| ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: s_mov_b64 exec, s[44:45] |
| +; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec |
| +; GFX9-O0-NEXT: v_writelane_b32 v3, s34, 4 |
| +; GFX9-O0-NEXT: v_writelane_b32 v3, s35, 5 |
| ; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37] |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] |
| ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 |
| ; GFX9-O0-NEXT: ; %bb.1: ; %if |
| -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 |
| ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] |
| @@ -200,23 +194,20 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { |
| ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 |
| -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: .LBB1_2: ; %merge |
| -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: s_or_saveexec_b64 s[44:45], -1 |
| ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: s_mov_b64 exec, s[44:45] |
| -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_readlane_b32 s34, v4, 4 |
| -; GFX9-O0-NEXT: v_readlane_b32 s35, v4, 5 |
| +; GFX9-O0-NEXT: v_readlane_b32 s34, v3, 4 |
| +; GFX9-O0-NEXT: v_readlane_b32 s35, v3, 5 |
| ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35] |
| -; GFX9-O0-NEXT: v_readlane_b32 s36, v4, 0 |
| -; GFX9-O0-NEXT: v_readlane_b32 s37, v4, 1 |
| -; GFX9-O0-NEXT: v_readlane_b32 s38, v4, 2 |
| -; GFX9-O0-NEXT: v_readlane_b32 s39, v4, 3 |
| -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v3 |
| +; GFX9-O0-NEXT: v_readlane_b32 s36, v3, 0 |
| +; GFX9-O0-NEXT: v_readlane_b32 s37, v3, 1 |
| +; GFX9-O0-NEXT: v_readlane_b32 s38, v3, 2 |
| +; GFX9-O0-NEXT: v_readlane_b32 s39, v3, 3 |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v4 |
| ; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35] |
| ; GFX9-O0-NEXT: s_mov_b32 s34, 1 |
| ; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0 |
| @@ -225,10 +216,9 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) { |
| ; GFX9-O0-NEXT: s_mov_b32 s34, 0 |
| ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 |
| ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 |
| -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] |
| @@ -345,17 +335,13 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) |
| ; GFX9-O0-NEXT: s_mov_b32 s35, s33 |
| ; GFX9-O0-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[36:37], -1 |
| -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x800 |
| -; GFX9-O0-NEXT: ; implicit-def: $vgpr0 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s30, 0 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s31, 1 |
| -; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 |
| -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] |
| +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 |
| +; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 |
| +; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX9-O0-NEXT: s_mov_b32 s36, s4 |
| ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 |
| ; GFX9-O0-NEXT: s_mov_b32 s37, s5 |
| @@ -378,22 +364,18 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 |
| ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[42:43] |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 |
| -; GFX9-O0-NEXT: s_or_saveexec_b64 s[48:49], -1 |
| -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: s_mov_b64 exec, s[48:49] |
| ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 |
| -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[36:39], s34 offset:4 |
| -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) |
| -; GFX9-O0-NEXT: v_readlane_b32 s31, v0, 1 |
| -; GFX9-O0-NEXT: v_readlane_b32 s30, v0, 0 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 |
| +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 |
| +; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 |
| +; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 |
| ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[36:37], -1 |
| -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] |
| -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff800 |
| +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 |
| ; GFX9-O0-NEXT: s_mov_b32 s33, s35 |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] |
| @@ -408,9 +390,8 @@ define amdgpu_gfx void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) |
| ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] |
| -; GFX9-O3-NEXT: ; implicit-def: $vgpr3 |
| -; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 |
| +; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 |
| ; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 |
| ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 |
| ; GFX9-O3-NEXT: s_not_b64 exec, exec |
| @@ -535,39 +516,37 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a |
| ; GFX9-O0-LABEL: strict_wwm_call_i64: |
| ; GFX9-O0: ; %bb.0: |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| -; GFX9-O0-NEXT: s_mov_b32 s44, s33 |
| +; GFX9-O0-NEXT: s_mov_b32 s42, s33 |
| ; GFX9-O0-NEXT: s_mov_b32 s33, s32 |
| ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 |
| -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] |
| -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000 |
| -; GFX9-O0-NEXT: ; implicit-def: $vgpr0 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s30, 0 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s31, 1 |
| +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 |
| +; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 |
| +; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 |
| ; GFX9-O0-NEXT: s_mov_b32 s34, s8 |
| ; GFX9-O0-NEXT: s_mov_b32 s36, s4 |
| ; GFX9-O0-NEXT: ; kill: def $sgpr36 killed $sgpr36 def $sgpr36_sgpr37_sgpr38_sgpr39 |
| ; GFX9-O0-NEXT: s_mov_b32 s37, s5 |
| ; GFX9-O0-NEXT: s_mov_b32 s38, s6 |
| ; GFX9-O0-NEXT: s_mov_b32 s39, s7 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s36, 2 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s37, 3 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s38, 4 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s39, 5 |
| +; GFX9-O0-NEXT: v_writelane_b32 v10, s36, 2 |
| +; GFX9-O0-NEXT: v_writelane_b32 v10, s37, 3 |
| +; GFX9-O0-NEXT: v_writelane_b32 v10, s38, 4 |
| +; GFX9-O0-NEXT: v_writelane_b32 v10, s39, 5 |
| ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 |
| ; GFX9-O0-NEXT: s_mov_b32 s35, s9 |
| ; GFX9-O0-NEXT: ; kill: def $sgpr40_sgpr41 killed $sgpr34_sgpr35 |
| @@ -579,11 +558,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 |
| ; GFX9-O0-NEXT: s_not_b64 exec, exec |
| ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 6 |
| -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 7 |
| -; GFX9-O0-NEXT: s_or_saveexec_b64 s[42:43], -1 |
| -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: s_mov_b64 exec, s[42:43] |
| +; GFX9-O0-NEXT: v_writelane_b32 v10, s34, 6 |
| +; GFX9-O0-NEXT: v_writelane_b32 v10, s35, 7 |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 |
| ; GFX9-O0-NEXT: s_mov_b32 s34, 32 |
| ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 |
| @@ -600,20 +576,13 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 |
| ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) |
| ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] |
| -; GFX9-O0-NEXT: s_or_saveexec_b64 s[42:43], -1 |
| -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: s_mov_b64 exec, s[42:43] |
| -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 6 |
| -; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 7 |
| -; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 2 |
| -; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 3 |
| -; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 4 |
| -; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 5 |
| +; GFX9-O0-NEXT: v_readlane_b32 s34, v10, 6 |
| +; GFX9-O0-NEXT: v_readlane_b32 s35, v10, 7 |
| +; GFX9-O0-NEXT: v_readlane_b32 s36, v10, 2 |
| +; GFX9-O0-NEXT: v_readlane_b32 s37, v10, 3 |
| +; GFX9-O0-NEXT: v_readlane_b32 s38, v10, 4 |
| +; GFX9-O0-NEXT: v_readlane_b32 s39, v10, 5 |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 |
| -; GFX9-O0-NEXT: s_or_saveexec_b64 s[42:43], -1 |
| -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: s_mov_b64 exec, s[42:43] |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 |
| ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 |
| ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 |
| @@ -623,38 +592,36 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a |
| ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 |
| ; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41] |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 |
| ; GFX9-O0-NEXT: s_mov_b32 s34, 0 |
| -; GFX9-O0-NEXT: buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4 |
| -; GFX9-O0-NEXT: s_waitcnt vmcnt(1) |
| -; GFX9-O0-NEXT: v_readlane_b32 s31, v0, 1 |
| -; GFX9-O0-NEXT: v_readlane_b32 s30, v0, 0 |
| +; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 |
| +; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 |
| +; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 |
| ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 |
| -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] |
| -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 |
| -; GFX9-O0-NEXT: s_mov_b32 s33, s44 |
| +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400 |
| +; GFX9-O0-NEXT: s_mov_b32 s33, s42 |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] |
| ; |
| @@ -674,9 +641,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %a |
| ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill |
| ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] |
| -; GFX9-O3-NEXT: ; implicit-def: $vgpr8 |
| -; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 |
| ; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 |
| +; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 |
| ; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 |
| ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 |
| ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] |
| @@ -912,81 +878,64 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt |
| ; GFX9-O0: ; %bb.0: |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 |
| -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: s_mov_b64 exec, -1 |
| +; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s30 |
| -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s31 |
| -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: ; implicit-def: $vgpr11 |
| -; GFX9-O0-NEXT: v_writelane_b32 v11, s64, 0 |
| -; GFX9-O0-NEXT: v_writelane_b32 v11, s65, 1 |
| -; GFX9-O0-NEXT: v_writelane_b32 v11, s66, 2 |
| -; GFX9-O0-NEXT: v_writelane_b32 v11, s67, 3 |
| -; GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1 |
| -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: s_mov_b64 exec, s[30:31] |
| -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:20 |
| +; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_writelane_b32 v42, s64, 0 |
| +; GFX9-O0-NEXT: v_writelane_b32 v42, s65, 1 |
| +; GFX9-O0-NEXT: v_writelane_b32 v42, s66, 2 |
| +; GFX9-O0-NEXT: v_writelane_b32 v42, s67, 3 |
| +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:16 |
| +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:16 |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:8 |
| +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:8 |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:4 |
| +; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:4 |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 |
| +; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 |
| ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 |
| ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 |
| @@ -994,114 +943,161 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt |
| ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 |
| ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v50, s5 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 |
| -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 |
| -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 |
| -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 |
| -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 |
| -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 |
| -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 |
| -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 |
| -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v53, s14 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v63, s15 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v62, s16 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v61, s17 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v60, s18 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v59, s19 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v58, s20 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v57, s21 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v56, s22 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v47, s23 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v46, s24 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v45, s25 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v44, s26 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v43, s27 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v42, s28 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v41, s29 |
| -; GFX9-O0-NEXT: ; kill: def $vgpr40 killed $vgpr40 killed $exec |
| -; GFX9-O0-NEXT: ; kill: def $vgpr55 killed $vgpr55 killed $exec |
| -; GFX9-O0-NEXT: ; kill: def $vgpr54 killed $vgpr54 killed $exec |
| -; GFX9-O0-NEXT: ; kill: def $vgpr52 killed $vgpr52 killed $exec |
| -; GFX9-O0-NEXT: ; kill: def $vgpr51 killed $vgpr51 killed $exec |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v43, s5 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s7 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s9 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s10 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s11 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s12 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s13 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s14 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s15 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s16 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s17 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s18 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s19 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s20 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s21 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s22 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s23 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s24 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s25 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s26 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s27 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s28 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s29 |
| +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill |
| +; GFX9-O0-NEXT: ; kill: def $vgpr45 killed $vgpr45 killed $exec |
| +; GFX9-O0-NEXT: ; kill: def $vgpr47 killed $vgpr47 killed $exec |
| +; GFX9-O0-NEXT: ; kill: def $vgpr46 killed $vgpr46 killed $exec |
| +; GFX9-O0-NEXT: ; kill: def $vgpr44 killed $vgpr44 killed $exec |
| +; GFX9-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec |
| +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill |
| ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v50 |
| -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v50 |
| -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v50 |
| -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v50 |
| -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v50 |
| -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v50 |
| -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v50 |
| -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v50 |
| -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v50 |
| -; GFX9-O0-NEXT: s_or_saveexec_b64 s[30:31], -1 |
| -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: s_mov_b64 exec, s[30:31] |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v53 |
| -; GFX9-O0-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v63 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v62 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v61 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v60 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v59 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v58 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v57 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v56 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v47 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v46 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v45 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v44 |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v43 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v42 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v41 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v26, v40 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v27, v55 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v28, v54 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v29, v53 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v30, v52 |
| -; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr51 killed $exec |
| -; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v43 |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v45 |
| +; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v27, v47 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v46 |
| +; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v29, v45 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v30, v44 |
| +; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr43 killed $exec |
| +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 |
| ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 |
| ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $exec |
| @@ -1160,11 +1156,11 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v38, s34 |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v39, s35 |
| ; GFX9-O0-NEXT: s_not_b64 exec, exec |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v48, v2 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v49, v3 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v40, v2 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v41, v3 |
| ; GFX9-O0-NEXT: s_not_b64 exec, exec |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v48, s34 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v49, s35 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v40, s34 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v41, s35 |
| ; GFX9-O0-NEXT: s_not_b64 exec, exec |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v33 |
| ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 |
| @@ -1182,9 +1178,9 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt |
| ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28 |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v38 |
| ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v49 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v41 |
| ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36 |
| -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v48 |
| +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v40 |
| ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32 |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 |
| @@ -1212,55 +1208,39 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v23, s27 |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v24, s28 |
| ; GFX9-O0-NEXT: v_mov_b32_e32 v25, s29 |
| -; GFX9-O0-NEXT: v_readlane_b32 s67, v50, 3 |
| -; GFX9-O0-NEXT: v_readlane_b32 s66, v50, 2 |
| -; GFX9-O0-NEXT: v_readlane_b32 s65, v50, 1 |
| -; GFX9-O0-NEXT: v_readlane_b32 s64, v50, 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_readfirstlane_b32 s30, v50 |
| -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O0-NEXT: v_readfirstlane_b32 s31, v50 |
| +; GFX9-O0-NEXT: v_readlane_b32 s67, v42, 3 |
| +; GFX9-O0-NEXT: v_readlane_b32 s66, v42, 2 |
| +; GFX9-O0-NEXT: v_readlane_b32 s65, v42, 1 |
| +; GFX9-O0-NEXT: v_readlane_b32 s64, v42, 0 |
| +; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 |
| -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload |
| -; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_nop 0 |
| -; GFX9-O0-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload |
| +; GFX9-O0-NEXT: s_mov_b64 exec, -1 |
| +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload |
| ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] |
| ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] |
| @@ -1269,22 +1249,24 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt |
| ; GFX9-O3: ; %bb.0: |
| ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) |
| ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 |
| -; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| +; GFX9-O3-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| -; GFX9-O3-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill |
| +; GFX9-O3-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill |
| +; GFX9-O3-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill |
| ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O3-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill |
| -; GFX9-O3-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill |
| +; GFX9-O3-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill |
| +; GFX9-O3-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill |
| ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O3-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill |
| -; GFX9-O3-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill |
| +; GFX9-O3-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill |
| +; GFX9-O3-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill |
| ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O3-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill |
| -; GFX9-O3-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill |
| +; GFX9-O3-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill |
| +; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill |
| ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) |
| -; GFX9-O3-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill |
| +; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill |
| ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] |
| +; GFX9-O3-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill |
| +; GFX9-O3-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill |
| ; GFX9-O3-NEXT: buffer_load_dword v26, off, s[0:3], s32 |
| ; GFX9-O3-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:4 |
| ; GFX9-O3-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:8 |
| @@ -1315,11 +1297,11 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt |
| ; GFX9-O3-NEXT: v_mov_b32_e32 v38, 0 |
| ; GFX9-O3-NEXT: v_mov_b32_e32 v39, 0 |
| ; GFX9-O3-NEXT: s_not_b64 exec, exec |
| -; GFX9-O3-NEXT: v_mov_b32_e32 v48, v9 |
| -; GFX9-O3-NEXT: v_mov_b32_e32 v49, v10 |
| +; GFX9-O3-NEXT: v_mov_b32_e32 v40, v9 |
| +; GFX9-O3-NEXT: v_mov_b32_e32 v41, v10 |
| ; GFX9-O3-NEXT: s_not_b64 exec, exec |
| -; GFX9-O3-NEXT: v_mov_b32_e32 v48, 0 |
| -; GFX9-O3-NEXT: v_mov_b32_e32 v49, 0 |
| +; GFX9-O3-NEXT: v_mov_b32_e32 v40, 0 |
| +; GFX9-O3-NEXT: v_mov_b32_e32 v41, 0 |
| ; GFX9-O3-NEXT: s_not_b64 exec, exec |
| ; GFX9-O3-NEXT: buffer_store_dword v33, v0, s[0:3], 0 offen offset:4 |
| ; GFX9-O3-NEXT: buffer_store_dword v32, v0, s[0:3], 0 offen |
| @@ -1329,8 +1311,11 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt |
| ; GFX9-O3-NEXT: buffer_store_dword v36, v0, s[0:3], 0 offen offset:16 |
| ; GFX9-O3-NEXT: buffer_store_dword v39, v0, s[0:3], 0 offen offset:28 |
| ; GFX9-O3-NEXT: buffer_store_dword v38, v0, s[0:3], 0 offen offset:24 |
| -; GFX9-O3-NEXT: buffer_store_dword v49, v0, s[0:3], 0 offen offset:36 |
| -; GFX9-O3-NEXT: buffer_store_dword v48, v0, s[0:3], 0 offen offset:32 |
| +; GFX9-O3-NEXT: buffer_store_dword v41, v0, s[0:3], 0 offen offset:36 |
| +; GFX9-O3-NEXT: buffer_store_dword v40, v0, s[0:3], 0 offen offset:32 |
| +; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| +; GFX9-O3-NEXT: s_nop 0 |
| +; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| ; GFX9-O3-NEXT: v_mov_b32_e32 v0, s4 |
| ; GFX9-O3-NEXT: v_mov_b32_e32 v1, s5 |
| ; GFX9-O3-NEXT: v_mov_b32_e32 v2, s6 |
| @@ -1358,25 +1343,25 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt |
| ; GFX9-O3-NEXT: v_mov_b32_e32 v24, s28 |
| ; GFX9-O3-NEXT: v_mov_b32_e32 v25, s29 |
| ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 |
| -; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload |
| +; GFX9-O3-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| ; GFX9-O3-NEXT: s_nop 0 |
| -; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload |
| +; GFX9-O3-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| ; GFX9-O3-NEXT: s_nop 0 |
| -; GFX9-O3-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload |
| +; GFX9-O3-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| ; GFX9-O3-NEXT: s_nop 0 |
| -; GFX9-O3-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload |
| +; GFX9-O3-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload |
| ; GFX9-O3-NEXT: s_nop 0 |
| -; GFX9-O3-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload |
| +; GFX9-O3-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload |
| ; GFX9-O3-NEXT: s_nop 0 |
| -; GFX9-O3-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload |
| +; GFX9-O3-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload |
| ; GFX9-O3-NEXT: s_nop 0 |
| -; GFX9-O3-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload |
| +; GFX9-O3-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload |
| ; GFX9-O3-NEXT: s_nop 0 |
| -; GFX9-O3-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload |
| +; GFX9-O3-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload |
| ; GFX9-O3-NEXT: s_nop 0 |
| -; GFX9-O3-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload |
| +; GFX9-O3-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload |
| ; GFX9-O3-NEXT: s_nop 0 |
| -; GFX9-O3-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload |
| +; GFX9-O3-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload |
| ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] |
| ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) |
| ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] |
| @@ -1411,4 +1396,4 @@ declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i3 |
| declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) |
| declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) |
| |
| -attributes #0 = { "amdgpu-waves-per-eu"="4,4" } |
| +attributes #0 = { "amdgpu-waves-per-eu"="5,5" } |
| diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll |
| index 3e652170c970..acff981f9850 100644 |
| --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll |
| +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll |
| @@ -37,7 +37,6 @@ |
| ; AFTER-PEI-NEXT: occupancy: 5 |
| ; AFTER-PEI-NEXT: scavengeFI: '%fixed-stack.0' |
| ; AFTER-PEI-NEXT: vgprForAGPRCopy: '' |
| -; AFTER-PEI-NEXT: sgprForEXECCopy: '' |
| ; AFTER-PEI-NEXT: body: |
| define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 { |
| %wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0 |
| diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir |
| index c82f5f23a893..172744e060cb 100644 |
| --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir |
| +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir |
| @@ -46,7 +46,6 @@ |
| # FULL-NEXT: highBitsOf32BitAddress: 0 |
| # FULL-NEXT: occupancy: 10 |
| # FULL-NEXT: vgprForAGPRCopy: '' |
| -# FULL-NEXT: sgprForEXECCopy: '' |
| # FULL-NEXT: body: |
| |
| # SIMPLE: machineFunctionInfo: |
| @@ -145,7 +144,6 @@ body: | |
| # FULL-NEXT: highBitsOf32BitAddress: 0 |
| # FULL-NEXT: occupancy: 10 |
| # FULL-NEXT: vgprForAGPRCopy: '' |
| -# FULL-NEXT: sgprForEXECCopy: '' |
| # FULL-NEXT: body: |
| |
| # SIMPLE: machineFunctionInfo: |
| @@ -215,7 +213,6 @@ body: | |
| # FULL-NEXT: highBitsOf32BitAddress: 0 |
| # FULL-NEXT: occupancy: 10 |
| # FULL-NEXT: vgprForAGPRCopy: '' |
| -# FULL-NEXT: sgprForEXECCopy: '' |
| # FULL-NEXT: body: |
| |
| # SIMPLE: machineFunctionInfo: |
| @@ -286,7 +283,6 @@ body: | |
| # FULL-NEXT: highBitsOf32BitAddress: 0 |
| # FULL-NEXT: occupancy: 10 |
| # FULL-NEXT: vgprForAGPRCopy: '' |
| -# FULL-NEXT: sgprForEXECCopy: '' |
| # FULL-NEXT: body: |
| |
| # SIMPLE: machineFunctionInfo: |
| @@ -533,28 +529,3 @@ body: | |
| SI_RETURN |
| |
| ... |
| - |
| ---- |
| -# ALL-LABEL: name: sgpr_for_exec_copy |
| -# ALL: sgprForEXECCopy: '$sgpr2_sgpr3' |
| -name: sgpr_for_exec_copy |
| -machineFunctionInfo: |
| - sgprForEXECCopy: '$sgpr2_sgpr3' |
| -body: | |
| - bb.0: |
| - SI_RETURN |
| - |
| -... |
| - |
| ---- |
| -# ALL-LABEL: name: sgpr_for_exec_copy_noreg |
| -# FULL: sgprForEXECCopy: '' |
| -# SIMPLE-NOT: sgprForEXECCopy |
| -name: sgpr_for_exec_copy_noreg |
| -machineFunctionInfo: |
| - sgprForEXECCopy: '$noreg' |
| -body: | |
| - bb.0: |
| - SI_RETURN |
| - |
| -... |
| diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll |
| index 3dc9b8b70db5..26a35113dae4 100644 |
| --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll |
| +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll |
| @@ -40,7 +40,6 @@ |
| ; CHECK-NEXT: highBitsOf32BitAddress: 0 |
| ; CHECK-NEXT: occupancy: 10 |
| ; CHECK-NEXT: vgprForAGPRCopy: '' |
| -; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' |
| ; CHECK-NEXT: body: |
| define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { |
| %gep = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0 |
| @@ -81,7 +80,6 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { |
| ; CHECK-NEXT: highBitsOf32BitAddress: 0 |
| ; CHECK-NEXT: occupancy: 10 |
| ; CHECK-NEXT: vgprForAGPRCopy: '' |
| -; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' |
| ; CHECK-NEXT: body: |
| define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { |
| %gep = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0 |
| @@ -136,7 +134,6 @@ define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 { |
| ; CHECK-NEXT: highBitsOf32BitAddress: 0 |
| ; CHECK-NEXT: occupancy: 10 |
| ; CHECK-NEXT: vgprForAGPRCopy: '' |
| -; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' |
| ; CHECK-NEXT: body: |
| define void @function() { |
| ret void |
| @@ -183,7 +180,6 @@ define void @function() { |
| ; CHECK-NEXT: highBitsOf32BitAddress: 0 |
| ; CHECK-NEXT: occupancy: 10 |
| ; CHECK-NEXT: vgprForAGPRCopy: '' |
| -; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101' |
| ; CHECK-NEXT: body: |
| define void @function_nsz() #0 { |
| ret void |
| diff --git a/llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir b/llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir |
| deleted file mode 100644 |
| index 172c388e7cb1..000000000000 |
| --- a/llvm/test/CodeGen/MIR/AMDGPU/sgpr-for-exec-copy-invalid-reg.mir |
| +++ /dev/null |
| @@ -1,12 +0,0 @@ |
| -# RUN: not llc -mtriple=amdgcn-amd-amdhsa -run-pass=none -verify-machineinstrs %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s |
| - |
| ---- |
| -name: invalid_reg |
| -machineFunctionInfo: |
| -# ERR: [[@LINE+1]]:21: unknown register name 'srst' |
| - sgprForEXECCopy: '$srst' |
| -body: | |
| - bb.0: |
| - S_ENDPGM 0 |
| - |
| -... |
| diff --git a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir |
| index 240c60e72db2..156891fef362 100644 |
| --- a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir |
| +++ b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir |
| @@ -3,7 +3,7 @@ |
| # contains not dead objects only. So using objects IDs as offset in the storage |
| # caused out of bounds access. |
| |
| -# RUN: llc -march=amdgcn -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs -o - %s | FileCheck %s |
| +# RUN: llc -march=amdgcn -run-pass=si-lower-sgpr-spills,prologepilog -verify-machineinstrs -o - %s | FileCheck %s |
| |
| # CHECK-LABEL: name: foo |
| # CHECK: {{^}}fixedStack: [] |
| -- |
| 2.40.0.348.gf938b09366-goog |
| |