; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10-SCRATCH %s

; Callees taking a single i1 argument: plain, signext, and zeroext.
declare hidden amdgpu_gfx void @external_void_func_i1(i1) #0
declare hidden amdgpu_gfx void @external_void_func_i1_signext(i1 signext) #0
declare hidden amdgpu_gfx void @external_void_func_i1_zeroext(i1 zeroext) #0

; Callees taking an i8 scalar (plain, signext, zeroext) or an <N x i8> vector.
; The element count of every vector parameter matches the vNi8 suffix of the
; callee's name.
declare hidden amdgpu_gfx void @external_void_func_i8(i8) #0
declare hidden amdgpu_gfx void @external_void_func_i8_signext(i8 signext) #0
declare hidden amdgpu_gfx void @external_void_func_i8_zeroext(i8 zeroext) #0
declare hidden amdgpu_gfx void @external_void_func_v2i8(<2 x i8>) #0
declare hidden amdgpu_gfx void @external_void_func_v3i8(<3 x i8>) #0
declare hidden amdgpu_gfx void @external_void_func_v4i8(<4 x i8>) #0
declare hidden amdgpu_gfx void @external_void_func_v5i8(<5 x i8>) #0
declare hidden amdgpu_gfx void @external_void_func_v8i8(<8 x i8>) #0
declare hidden amdgpu_gfx void @external_void_func_v16i8(<16 x i8>) #0
declare hidden amdgpu_gfx void @external_void_func_v32i8(<32 x i8>) #0

; Callees that return the same i8 / <N x i8> type they take as an argument.
; The names keep the external_void_func_ prefix even though the return type
; is not void.
declare hidden amdgpu_gfx i8 @external_void_func_i8_ret(i8) #0
declare hidden amdgpu_gfx <2 x i8> @external_void_func_v2i8_ret(<2 x i8>) #0
declare hidden amdgpu_gfx <3 x i8> @external_void_func_v3i8_ret(<3 x i8>) #0
declare hidden amdgpu_gfx <4 x i8> @external_void_func_v4i8_ret(<4 x i8>) #0
declare hidden amdgpu_gfx <5 x i8> @external_void_func_v5i8_ret(<5 x i8>) #0
declare hidden amdgpu_gfx <8 x i8> @external_void_func_v8i8_ret(<8 x i8>) #0
declare hidden amdgpu_gfx <32 x i8> @external_void_func_v32i8_ret(<32 x i8>) #0

; Callees taking an i16 scalar: plain, signext, and zeroext.
declare hidden amdgpu_gfx void @external_void_func_i16(i16) #0
declare hidden amdgpu_gfx void @external_void_func_i16_signext(i16 signext) #0
declare hidden amdgpu_gfx void @external_void_func_i16_zeroext(i16 zeroext) #0

; Callees taking i32 / i64 scalars and <N x i64> vectors.
declare hidden amdgpu_gfx void @external_void_func_i32(i32) #0
declare hidden amdgpu_gfx void @external_void_func_i64(i64) #0
declare hidden amdgpu_gfx void @external_void_func_v2i64(<2 x i64>) #0
declare hidden amdgpu_gfx void @external_void_func_v3i64(<3 x i64>) #0
declare hidden amdgpu_gfx void @external_void_func_v4i64(<4 x i64>) #0

; Callees taking half / float / double scalars and float / double vectors.
declare hidden amdgpu_gfx void @external_void_func_f16(half) #0
declare hidden amdgpu_gfx void @external_void_func_f32(float) #0
declare hidden amdgpu_gfx void @external_void_func_f64(double) #0
declare hidden amdgpu_gfx void @external_void_func_v2f32(<2 x float>) #0
declare hidden amdgpu_gfx void @external_void_func_v2f64(<2 x double>) #0
declare hidden amdgpu_gfx void @external_void_func_v3f32(<3 x float>) #0
declare hidden amdgpu_gfx void @external_void_func_v3f64(<3 x double>) #0
declare hidden amdgpu_gfx void @external_void_func_v5f32(<5 x float>) #0

; Callees taking two-, three- and four-element i16 and half vectors.
declare hidden amdgpu_gfx void @external_void_func_v2i16(<2 x i16>) #0
declare hidden amdgpu_gfx void @external_void_func_v2f16(<2 x half>) #0
declare hidden amdgpu_gfx void @external_void_func_v3i16(<3 x i16>) #0
declare hidden amdgpu_gfx void @external_void_func_v3f16(<3 x half>) #0
declare hidden amdgpu_gfx void @external_void_func_v4i16(<4 x i16>) #0
declare hidden amdgpu_gfx void @external_void_func_v4f16(<4 x half>) #0

; Callees taking a bfloat scalar or an <N x bfloat> vector.
declare hidden amdgpu_gfx void @external_void_func_bf16(bfloat) #0
declare hidden amdgpu_gfx void @external_void_func_v1bf16(<1 x bfloat>) #0
declare hidden amdgpu_gfx void @external_void_func_v2bf16(<2 x bfloat>) #0
declare hidden amdgpu_gfx void @external_void_func_v3bf16(<3 x bfloat>) #0
declare hidden amdgpu_gfx void @external_void_func_v4bf16(<4 x bfloat>) #0
declare hidden amdgpu_gfx void @external_void_func_v8bf16(<8 x bfloat>) #0
declare hidden amdgpu_gfx void @external_void_func_v16bf16(<16 x bfloat>) #0

; The same bfloat callees with the argument marked inreg.
declare hidden amdgpu_gfx void @external_void_func_bf16_inreg(bfloat inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v1bf16_inreg(<1 x bfloat> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v3bf16_inreg(<3 x bfloat> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v4bf16_inreg(<4 x bfloat> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v8bf16_inreg(<8 x bfloat> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v16bf16_inreg(<16 x bfloat> inreg) #0

; Callees taking <N x i32> vectors; the _i32 variants take an additional
; trailing i32 argument.
declare hidden amdgpu_gfx void @external_void_func_v2i32(<2 x i32>) #0
declare hidden amdgpu_gfx void @external_void_func_v3i32(<3 x i32>) #0
declare hidden amdgpu_gfx void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
declare hidden amdgpu_gfx void @external_void_func_v4i32(<4 x i32>) #0
declare hidden amdgpu_gfx void @external_void_func_v5i32(<5 x i32>) #0
declare hidden amdgpu_gfx void @external_void_func_v8i32(<8 x i32>) #0
declare hidden amdgpu_gfx void @external_void_func_v16i32(<16 x i32>) #0
declare hidden amdgpu_gfx void @external_void_func_v32i32(<32 x i32>) #0
declare hidden amdgpu_gfx void @external_void_func_v32i32_i32(<32 x i32>, i32) #0

; inreg callees taking integer scalars (i1 through i64) and <N x i64> vectors.
declare hidden amdgpu_gfx void @external_void_func_i1_inreg(i1 inreg) #0
declare hidden amdgpu_gfx void @external_void_func_i8_inreg(i8 inreg) #0
declare hidden amdgpu_gfx void @external_void_func_i16_inreg(i16 inreg) #0
declare hidden amdgpu_gfx void @external_void_func_i32_inreg(i32 inreg) #0
declare hidden amdgpu_gfx void @external_void_func_i64_inreg(i64 inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v3i64_inreg(<3 x i64> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> inreg) #0

; inreg callees taking half / float / double scalars and float / double vectors.
declare hidden amdgpu_gfx void @external_void_func_f16_inreg(half inreg) #0
declare hidden amdgpu_gfx void @external_void_func_f32_inreg(float inreg) #0
declare hidden amdgpu_gfx void @external_void_func_f64_inreg(double inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> inreg) #0

; inreg callees taking two-, three- and four-element i16 and half vectors.
declare hidden amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v4f16_inreg(<4 x half> inreg) #0

; inreg callees taking <N x i32> vectors; in the _i32 variants the trailing
; i32 argument is inreg as well.
declare hidden amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> inreg, i32 inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v5i32_inreg(<5 x i32> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> inreg) #0
declare hidden amdgpu_gfx void @external_void_func_v32i32_i32_inreg(<32 x i32> inreg, i32 inreg) #0

; Callee with both an i32 argument and an i32 return value.
declare hidden amdgpu_gfx i32 @external_i32_func_i32(i32) #0

; Struct callees: { i8, i32 } passed directly, passed byval through an
; addrspace(5) pointer, and passed byval together with an sret result pointer.
declare hidden amdgpu_gfx void @external_void_func_struct_i8_i32({ i8, i32 }) #0
declare hidden amdgpu_gfx void @external_void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 })) #0
declare hidden amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }), ptr addrspace(5) byval({ i8, i32 })) #0

; Calls @external_void_func_i1 with the constant `i1 true`.
; In all four checked configurations the expected assembly moves 1 into v0 and
; writes it with a byte store at s32 immediately before the s_swappc_b64 call.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_i1_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i1@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i1@abs32@lo
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i1_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i1@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i1@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i1_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_mov_b32_e32 v0, 1
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i1@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i1@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    scratch_store_b8 off, v0, s32
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i1@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i1@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_i1(i1 true)
  ret void
}

; Loads a volatile i1 through an undef addrspace(1) pointer and passes it to
; @external_void_func_i1_signext with the signext attribute. The unnamed i32
; parameter is not used by the body.
; In all four checked configurations the expected assembly loads an unsigned
; byte, masks it with 1 (v_and_b32), and writes it with a byte store at s32
; before the s_swappc_b64 call.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_i1_signext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i1_signext@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i1_signext@abs32@lo
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i1_signext:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i1_signext@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i1_signext@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i1_signext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i1_signext@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i1_signext@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX11-NEXT:    scratch_store_b8 off, v0, s32
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_signext:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i1_signext@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i1_signext@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %var = load volatile i1, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_i1_signext(i1 signext %var)
  ret void
}

; Loads a volatile i1 through an undef addrspace(1) pointer and passes it to
; @external_void_func_i1_zeroext with the zeroext attribute. The unnamed i32
; parameter is not used by the body.
; In all four checked configurations the expected assembly loads an unsigned
; byte, masks it with 1 (v_and_b32), and writes it with a byte store at s32
; before the s_swappc_b64 call.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_i1_zeroext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i1_zeroext@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i1_zeroext@abs32@lo
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i1_zeroext:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i1_zeroext@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i1_zeroext@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i1_zeroext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i1_zeroext@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i1_zeroext@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX11-NEXT:    scratch_store_b8 off, v0, s32
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_zeroext:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i1_zeroext@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i1_zeroext@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %var = load volatile i1, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_i1_zeroext(i1 zeroext %var)
  ret void
}

; Calls @external_void_func_i8 with the constant `i8 123`. The unnamed i32
; parameter is not used by the body.
; In all four checked configurations the expected assembly moves 0x7b into v0
; before the s_swappc_b64 call; unlike the i1 tests, no byte store of the
; argument at s32 is expected.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_i8_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i8@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i8@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i8_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i8@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i8@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i8_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i8@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i8@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_i8(i8 123)
  ret void
}

; Passes a loaded (non-immediate) i8 as a `signext` argument through the
; amdgpu_gfx calling convention. The value comes from a volatile load of an
; undef addrspace(1) pointer; the unnamed i32 parameter is not used by the body.
; In the expected output the argument is produced by a sign-extending byte load
; (global_load_sbyte, or global_load_i8 on the newest target) and no separate
; extend instruction appears before the call.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_i8_signext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_sbyte v0, v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i8_signext@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i8_signext@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i8_signext:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_sbyte v0, v[0:1], off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i8_signext@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i8_signext@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i8_signext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_i8 v0, v[0:1], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8_signext@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8_signext@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_signext:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_sbyte v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i8_signext@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i8_signext@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %var = load volatile i8, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_i8_signext(i8 signext %var)
  ret void
}

; Passes a loaded (non-immediate) i8 as a `zeroext` argument through the
; amdgpu_gfx calling convention. The value comes from a volatile load of an
; undef addrspace(1) pointer; the unnamed i32 parameter is not used by the body.
; In the expected output the argument is produced by a zero-extending byte load
; (global_load_ubyte, or global_load_u8 on the newest target) and no separate
; extend instruction appears before the call.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_i8_zeroext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i8_zeroext@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i8_zeroext@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i8_zeroext:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i8_zeroext@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i8_zeroext@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i8_zeroext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_u8 v0, v[0:1], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_zeroext:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i8_zeroext@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i8_zeroext@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %var = load volatile i8, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_i8_zeroext(i8 zeroext %var)
  ret void
}

; Calls @external_void_func_i16 through the amdgpu_gfx calling convention with
; the constant i16 123. In the expected output the constant is materialized in
; v0 with a single v_mov_b32 of 0x7b (123 in hex) before the call.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_i16_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i16@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i16_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i16_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_i16(i16 123)
  ret void
}

; Passes a loaded (non-immediate) i16 as a `signext` argument through the
; amdgpu_gfx calling convention. The value comes from a volatile load of an
; undef addrspace(1) pointer; the unnamed i32 parameter is not used by the body.
; NOTE(review): the expected output uses an unsigned 16-bit load
; (global_load_ushort / global_load_u16) with no separate sign-extend before
; the call, i.e. the same load as the zeroext variant of this test. Presumably
; that is intended for i16 arguments on these targets -- confirm against the
; calling-convention lowering before relying on it.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_i16_signext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_ushort v0, v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i16_signext@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i16_signext@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i16_signext:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_ushort v0, v[0:1], off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i16_signext@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i16_signext@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i16_signext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_u16 v0, v[0:1], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_signext:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_ushort v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i16_signext@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i16_signext@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %var = load volatile i16, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_i16_signext(i16 signext %var)
  ret void
}

; Passes a loaded (non-immediate) i16 as a `zeroext` argument through the
; amdgpu_gfx calling convention. The value comes from a volatile load of an
; undef addrspace(1) pointer; the unnamed i32 parameter is not used by the body.
; In the expected output the argument is produced by an unsigned 16-bit load
; (global_load_ushort, or global_load_u16 on the newest target) and no separate
; extend instruction appears before the call.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_i16_zeroext:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_ushort v0, v[0:1], off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i16_zeroext@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i16_zeroext@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i16_zeroext:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_ushort v0, v[0:1], off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i16_zeroext@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i16_zeroext@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i16_zeroext:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_u16 v0, v[0:1], off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_zeroext:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_ushort v0, v[0:1], off glc dlc
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i16_zeroext@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i16_zeroext@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %var = load volatile i16, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_i16_zeroext(i16 zeroext %var)
  ret void
}

; Calls @external_void_func_i32 through the amdgpu_gfx calling convention with
; the constant i32 42. The unnamed i32 parameter is not used by the body.
; In the expected output the constant is materialized in v0 with a single
; v_mov_b32 of 42 before the call.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 42
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i32_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 42
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i32@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_mov_b32_e32 v0, 42
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i32@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i32_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 42
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i32@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_i32(i32 42)
  ret void
}

; Passes the constant i64 123 to an external amdgpu_gfx callee. The checks
; below expect the 64-bit immediate to be split across two 32-bit VGPRs:
; v0 = 0x7b (low half) and v1 = 0 (high half), on every tested subtarget.
; The CHECK lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_i64_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i64@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i64@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i64_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i64@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i64@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i64_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i64@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i64@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i64_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x7b
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i64@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i64@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_i64(i64 123)
  ret void
}

; Loads a <2 x i64> from the null addrspace(1) pointer and passes it to an
; external amdgpu_gfx callee. The checks below expect one 128-bit global load
; whose destination is v[0:3], with no extra register copies before the call.
; The CHECK lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
; GFX9-LABEL: test_call_external_void_func_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i64@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i64@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i64@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i64@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i64@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i64@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i64@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i64@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <2 x i64>, ptr addrspace(1) null
  call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> %val)
  ret void
}

; Passes a constant <2 x i64> to an external amdgpu_gfx callee. The element
; values are chosen so each 32-bit half is distinct:
;   8589934593  = 0x0000000200000001 -> v0 = 1, v1 = 2
;   17179869187 = 0x0000000400000003 -> v2 = 3, v3 = 4
; which is the v0..v3 assignment the checks below expect.
; The CHECK lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v2i64_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i64@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i64@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, 2
; GFX9-NEXT:    v_mov_b32_e32 v2, 3
; GFX9-NEXT:    v_mov_b32_e32 v3, 4
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2i64_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-NEXT:    v_mov_b32_e32 v1, 2
; GFX10-NEXT:    v_mov_b32_e32 v2, 3
; GFX10-NEXT:    v_mov_b32_e32 v3, 4
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i64@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i64@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i64_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i64@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i64@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i64@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i64@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>)
  ret void
}

; Builds a <3 x i64> argument from a loaded <2 x i64> (elements 0 and 1) plus
; one constant element, then passes it to an external amdgpu_gfx callee.
; The constant 8589934593 = 0x0000000200000001, so the checks below expect
; the loaded data in v[0:3] and the third element as v4 = 1, v5 = 2.
; The CHECK lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
; GFX9-LABEL: test_call_external_void_func_v3i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i64@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i64@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v4, 1
; GFX9-NEXT:    v_mov_b32_e32 v5, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v4, 1
; GFX10-NEXT:    v_mov_b32_e32 v5, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i64@abs32@hi
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i64@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i64@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i64@abs32@lo
; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i64:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i64@abs32@hi
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i64@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %load = load <2 x i64>, ptr addrspace(1) null
  ; The mask selects lanes 0-1 from %load and lane 2 (the first lane of the
  ; constant operand). The constant's second lane is never selected, so it is
  ; spelled as poison instead of the discouraged undef.
  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 poison>, <3 x i32> <i32 0, i32 1, i32 2>

  call amdgpu_gfx void @external_void_func_v3i64(<3 x i64> %val)
  ret void
}

; Builds a <4 x i64> argument by concatenating a loaded <2 x i64> (elements 0
; and 1) with a constant <2 x i64> (elements 2 and 3), then passes it to an
; external amdgpu_gfx callee. The constants decompose as
;   8589934593  = 0x0000000200000001 -> v4 = 1, v5 = 2
;   17179869187 = 0x0000000400000003 -> v6 = 3, v7 = 4
; and the checks below expect the loaded data in v[0:3].
; The CHECK lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
; GFX9-LABEL: test_call_external_void_func_v4i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i64@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i64@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v4, 1
; GFX9-NEXT:    v_mov_b32_e32 v5, 2
; GFX9-NEXT:    v_mov_b32_e32 v6, 3
; GFX9-NEXT:    v_mov_b32_e32 v7, 4
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v4, 1
; GFX10-NEXT:    v_mov_b32_e32 v5, 2
; GFX10-NEXT:    v_mov_b32_e32 v6, 3
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_mov_b32_e32 v7, 4
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i64@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i64@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i64:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4
; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i64@abs32@hi
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i64@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i64:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 3
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i64@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i64@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %load = load <2 x i64>, ptr addrspace(1) null
  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call amdgpu_gfx void @external_void_func_v4i64(<4 x i64> %val)
  ret void
}

; Passes the constant half 4.0 to an external amdgpu_gfx function.
; All four run configurations are expected to place the argument in v0 as the
; 32-bit literal 0x4400 before the s_swappc_b64 call.
define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_f16_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_f16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_f16@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4400
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_f16_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x4400
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_f16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_f16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_f16_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x4400
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_f16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_f16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_f16_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x4400
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_f16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_f16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_f16(half 4.0)
  ret void
}

; Passes the constant float 4.0 to an external amdgpu_gfx function.
; All four run configurations are expected to move the value into v0 using the
; 4.0 operand directly (no hexadecimal literal) before the s_swappc_b64 call.
define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_f32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_f32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_f32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 4.0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_f32_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 4.0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_f32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_f32@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_f32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_mov_b32_e32 v0, 4.0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_f32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_f32@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_f32_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 4.0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_f32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_f32@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_f32(float 4.0)
  ret void
}

; Passes the constant vector <2 x float> <1.0, 2.0> to an external amdgpu_gfx
; function. The checks expect element 0 in v0 and element 1 in v1; the gfx1100
; run expects both moves fused into a single v_dual_mov_b32.
define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v2f32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2f32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2f32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2f32_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2f32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2f32@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2f32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2f32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2f32@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f32_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2f32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2f32@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>)
  ret void
}

; Passes the constant vector <3 x float> <1.0, 2.0, 4.0> to an external
; amdgpu_gfx function. The checks expect the elements in v0, v1 and v2 in
; order; the gfx1100 run expects v0/v1 set by one v_dual_mov_b32 and v2 by a
; separate v_mov_b32.
define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v3f32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3f32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3f32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3f32_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX10-NEXT:    v_mov_b32_e32 v2, 4.0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3f32@abs32@hi
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3f32@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3f32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT:    v_mov_b32_e32 v2, 4.0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3f32@abs32@hi
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3f32@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f32_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 4.0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3f32@abs32@hi
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3f32@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>)
  ret void
}

; Passes the constant vector <5 x float> <1.0, 2.0, 4.0, -1.0, 0.5> to an
; external amdgpu_gfx function. The checks expect the elements in v0 through
; v4 in order; the gfx1100 run expects two v_dual_mov_b32 pairs (v0/v1, v2/v3)
; followed by a separate v_mov_b32 for v4.
define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v5f32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v5f32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v5f32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v3, -1.0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0.5
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v5f32_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX10-NEXT:    v_mov_b32_e32 v2, 4.0
; GFX10-NEXT:    v_mov_b32_e32 v3, -1.0
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0.5
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v5f32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v5f32@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v5f32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_mov_b32_e32 v4, 0.5
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5f32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5f32@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5f32_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 4.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, -1.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0.5
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v5f32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v5f32@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
  ret void
}

; Passes the constant double 4.0 to an external amdgpu_gfx function.
; The checks expect the 64-bit value split across two VGPRs: v0 is set to 0
; and v1 to the literal 0x40100000. The gfx1100 run expects both moves fused
; into a single v_dual_mov_b32.
define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_f64_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_f64@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_f64@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40100000
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_f64_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x40100000
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_f64@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_f64@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_f64_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_f64@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_f64@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_f64_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x40100000
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_f64@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_f64@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_f64(double 4.0)
  ret void
}

; Calls the external amdgpu_gfx function @external_void_func_v2f64 with the
; constant vector <2 x double> <2.0, 4.0>.
;
; In the expected output each double is materialized with two 32-bit moves into
; consecutive VGPRs: v0/v1 hold 2.0 and v2/v3 hold 4.0. The low dword is 0 and
; the high dword is written as 2.0 (bit pattern 0x40000000) or 0x40100000.
; The gfx1100 run pairs the moves into v_dual_mov_b32 instructions.
;
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v2f64_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2f64@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2f64@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2f64_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x40100000
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2f64@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2f64@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2f64_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2f64@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2f64@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f64_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0x40100000
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2f64@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2f64@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The whole test input: one call with an immediate vector argument.
  call amdgpu_gfx void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>)
  ret void
}

; Calls the external amdgpu_gfx function @external_void_func_v3f64 with the
; constant vector <3 x double> <2.0, 4.0, 8.0>.
;
; In the expected output each double is materialized with two 32-bit moves into
; consecutive VGPRs: v0/v1 hold 2.0, v2/v3 hold 4.0 and v4/v5 hold 8.0. The low
; dword is always 0; the high dwords are written as 2.0 (bit pattern
; 0x40000000), 0x40100000 and 0x40200000.
; The gfx1100 run pairs the moves into v_dual_mov_b32 instructions.
;
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v3f64_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3f64@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3f64@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    v_mov_b32_e32 v5, 0x40200000
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3f64_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x40100000
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    v_mov_b32_e32 v5, 0x40200000
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3f64@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3f64@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3f64_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3f64@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3f64@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f64_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0x40100000
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 0x40200000
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3f64@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3f64@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The whole test input: one call with an immediate vector argument.
  call amdgpu_gfx void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>)
  ret void
}

; Loads a <2 x i8> from the null pointer in addrspace(1) and passes it to the
; external amdgpu_gfx function @external_void_func_v2i8.
;
; In the expected output the vector is fetched with a single 16-bit global load
; into v0 (address registers v[0:1] are both zeroed first), and the second
; element is produced in v1 by a 16-bit logical shift right by 8 before the
; call.
;
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
; GFX9-LABEL: test_call_external_void_func_v2i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    global_load_ushort v0, v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i8@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i8@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i8@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i8@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v0
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i8@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i8@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b16 v1, 8, v0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i8:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i8@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i8@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_ushort v0, v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b16 v1, 8, v0
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The argument comes from a load of address 0 in addrspace(1), then is
  ; forwarded unchanged to the callee.
  %val = load <2 x i8>, ptr addrspace(1) null
  call amdgpu_gfx void @external_void_func_v2i8(<2 x i8> %val)
  ret void
}

; Loads a <3 x i8> from the null pointer in addrspace(1) and passes it to the
; external amdgpu_gfx function @external_void_func_v3i8.
;
; In the expected output the vector is fetched with a single 32-bit global load
; into v0 (address registers v[0:1] are both zeroed first). The second and
; third elements are then produced in v1 and v2 by 32-bit logical shifts right
; of v0 by 8 and 16 before the call.
;
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
; GFX9-LABEL: test_call_external_void_func_v3i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i8@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i8@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i8@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i8@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i8@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i8@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i8:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i8@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i8@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The argument comes from a load of address 0 in addrspace(1), then is
  ; forwarded unchanged to the callee.
  %val = load <3 x i8>, ptr addrspace(1) null
  call amdgpu_gfx void @external_void_func_v3i8(<3 x i8> %val)
  ret void
}

; Loads a <4 x i8> from the null pointer in addrspace(1) and passes it to the
; external amdgpu_gfx function @external_void_func_v4i8.
;
; In the expected output the vector is fetched with a single 32-bit global load
; into v0 (address registers v[0:1] are both zeroed first). Elements 1..3 are
; then produced in v1, v2 and v3 by 32-bit logical shifts right of v0 by 8, 16
; and 24 before the call. The gfx900 run emits the 16-bit shift first; the
; other runs emit the shifts in ascending order.
;
; The assertion lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with the update script instead of editing by hand.
define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
; GFX9-LABEL: test_call_external_void_func_v4i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i8@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i8@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i8@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i8@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i8@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i8@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i8:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i8@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i8@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The argument comes from a load of address 0 in addrspace(1), then is
  ; forwarded unchanged to the callee.
  %val = load <4 x i8>, ptr addrspace(1) null
  call amdgpu_gfx void @external_void_func_v4i8(<4 x i8> %val)
  ret void
}

; Passes a <5 x i8> loaded from the null global (addrspace 1) pointer to the
; external amdgpu_gfx function @external_void_func_v5i8.
;
; Per the checks below, the vector is fetched with one 64-bit global load into
; v[5:6] and then split so that each element gets its own argument VGPR:
;   v0 = v5, v1 = v5 >> 8, v2 = v5 >> 16,
;   v3 = low dword of (v[5:6] >> 24), v4 = v6
; (the final move into v4 overwrites the high half written by the 64-bit shift).
; The incoming s33 and the return address in s[30:31] are kept in lanes 2, 0
; and 1 of v40 across the call; v40 itself is stored to the stack in the
; prologue and reloaded in the epilogue.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
; GFX9-LABEL: test_call_external_void_func_v5i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v5i8@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v5i8@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
; GFX9-NEXT:    v_mov_b32_e32 v0, v5
; GFX9-NEXT:    v_mov_b32_e32 v4, v6
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v5i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v5i8@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v5i8@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
; GFX10-NEXT:    v_mov_b32_e32 v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v4, v6
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v5i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5i8@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5i8@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_b64 v[5:6], v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v5
; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
; GFX11-NEXT:    v_mov_b32_e32 v4, v6
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i8:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v5i8@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v5i8@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[5:6], v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, v5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v6
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The argument is read from address 0 of the global address space so that
  ; the test needs no incoming parameters.
  %val = load <5 x i8>, ptr addrspace(1) null
  call amdgpu_gfx void @external_void_func_v5i8(<5 x i8> %val)
  ret void
}

; Passes a <8 x i8> loaded from the null global (addrspace 1) pointer to the
; external amdgpu_gfx function @external_void_func_v8i8.
;
; Per the checks below, the vector is fetched with one 64-bit global load into
; v[0:1] and unpacked into eight argument VGPRs v0..v7, one per element:
; v0 keeps the first dword, v2/v3 are v0 shifted right by 16/24, v4 is the
; second dword and v5/v6/v7 are that dword shifted right by 8/16/24. Element 1
; (v0 >> 8) is staged in v8 and only moved into v1 after the original v1 has
; been copied to v4; on gfx1100 those two moves are emitted as one
; v_dual_mov_b32.
; The incoming s33 and the return address in s[30:31] are kept in lanes 2, 0
; and 1 of v40 across the call; v40 itself is stored to the stack in the
; prologue and reloaded in the epilogue.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
; GFX9-LABEL: test_call_external_void_func_v8i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8i8@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8i8@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v8
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v8i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8i8@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8i8@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, v8
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v8i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8i8@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8i8@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i8:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8i8@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8i8@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, v8
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The argument is read from address 0 of the global address space so that
  ; the test needs no incoming parameters.
  %val = load <8 x i8>, ptr addrspace(1) null
  call amdgpu_gfx void @external_void_func_v8i8(<8 x i8> %val)
  ret void
}

; Passes a <32 x i8> loaded from the null global (addrspace 1) pointer to the
; external amdgpu_gfx function @external_void_func_v32i8.
;
; Per the checks below, the 32 bytes are fetched with two 128-bit global loads,
; from address 0 into v[0:3] and from address 16 (v[4:5] = 16) into v[16:19],
; and then unpacked into 32 argument VGPRs v0..v31, one per element. For each
; loaded dword the element registers are the dword itself and the dword
; shifted right by 8, 16 and 24. Elements 1-3 and 17-19 would overwrite loaded
; dwords that are still needed, so they are first computed into v35..v37 and
; v32..v34 and moved into place only after v1..v3 and v17..v19 have been
; copied to their final registers.
; The incoming s33 and the return address in s[30:31] are kept in lanes 2, 0
; and 1 of v40 across the call; v40 itself is stored to the stack in the
; prologue and reloaded in the epilogue.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
; GFX9-LABEL: test_call_external_void_func_v32i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v4, 16
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v5, 0
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[4:5], off
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v32i8@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v32i8@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mov_b32_e32 v8, v2
; GFX9-NEXT:    v_mov_b32_e32 v12, v3
; GFX9-NEXT:    v_mov_b32_e32 v20, v17
; GFX9-NEXT:    v_mov_b32_e32 v24, v18
; GFX9-NEXT:    v_mov_b32_e32 v28, v19
; GFX9-NEXT:    v_mov_b32_e32 v1, v35
; GFX9-NEXT:    v_mov_b32_e32 v2, v36
; GFX9-NEXT:    v_mov_b32_e32 v3, v37
; GFX9-NEXT:    v_mov_b32_e32 v17, v32
; GFX9-NEXT:    v_mov_b32_e32 v18, v33
; GFX9-NEXT:    v_mov_b32_e32 v19, v34
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v32i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 16
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v5, 0
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v32i8@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i8@abs32@lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[4:5], off
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
; GFX10-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-NEXT:    v_mov_b32_e32 v8, v2
; GFX10-NEXT:    v_mov_b32_e32 v12, v3
; GFX10-NEXT:    v_mov_b32_e32 v20, v17
; GFX10-NEXT:    v_mov_b32_e32 v24, v18
; GFX10-NEXT:    v_mov_b32_e32 v28, v19
; GFX10-NEXT:    v_mov_b32_e32 v1, v35
; GFX10-NEXT:    v_mov_b32_e32 v2, v36
; GFX10-NEXT:    v_mov_b32_e32 v3, v37
; GFX10-NEXT:    v_mov_b32_e32 v17, v32
; GFX10-NEXT:    v_mov_b32_e32 v18, v33
; GFX10-NEXT:    v_mov_b32_e32 v19, v34
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v32i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 16
; GFX11-NEXT:    v_mov_b32_e32 v5, 0
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v32i8@abs32@hi
; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v32i8@abs32@lo
; GFX11-NEXT:    global_load_b128 v[16:19], v[4:5], off
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v35
; GFX11-NEXT:    v_mov_b32_e32 v8, v2
; GFX11-NEXT:    v_mov_b32_e32 v12, v3
; GFX11-NEXT:    v_mov_b32_e32 v20, v17
; GFX11-NEXT:    v_mov_b32_e32 v24, v18
; GFX11-NEXT:    v_dual_mov_b32 v28, v19 :: v_dual_mov_b32 v19, v34
; GFX11-NEXT:    v_dual_mov_b32 v2, v36 :: v_dual_mov_b32 v3, v37
; GFX11-NEXT:    v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i8:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 16
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v32i8@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v32i8@abs32@lo
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[16:19], v[4:5], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v8, v2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v12, v3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v20, v17
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v24, v18
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v28, v19
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, v35
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, v36
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, v37
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v17, v32
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v18, v33
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v19, v34
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The argument is read from address 0 of the global address space so that
  ; the test needs no incoming parameters.
  %val = load <32 x i8>, ptr addrspace(1) null
  call amdgpu_gfx void @external_void_func_v32i8(<32 x i8> %val)
  ret void
}


; Round-trips an i8 through an external amdgpu_gfx function that returns a
; value: the i8 is loaded from the null global (addrspace 1) pointer, passed
; to @external_void_func_i8_ret (which, despite "void" in its name, is
; declared as returning i8), and the result is stored back to the same null
; global pointer.
;
; Per the checks below, the argument and the return value both travel in v0.
; The 64-bit zero address is materialized in v[40:41] and reused for the store
; after the call, so v40 and v41 are stored to the stack before being
; overwritten and reloaded afterwards. The incoming s33 and the return address
; in s[30:31] are kept in lanes 2, 0 and 1 of v42, which is itself stored to
; the stack in the prologue and reloaded in the epilogue.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
; GFX9-LABEL: test_call_external_void_func_i8_ret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    v_mov_b32_e32 v40, 0
; GFX9-NEXT:    v_mov_b32_e32 v41, 0
; GFX9-NEXT:    global_load_ubyte v0, v[40:41], off
; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i8_ret@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i8_ret@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    global_store_byte v[40:41], v0, off
; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i8_ret:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i8_ret@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i8_ret@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_ubyte v0, v[40:41], off
; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    global_store_byte v[40:41], v0, off
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i8_ret:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
; GFX11-NEXT:    scratch_store_b32 off, v41, s33
; GFX11-NEXT:    v_mov_b32_e32 v40, 0
; GFX11-NEXT:    v_mov_b32_e32 v41, 0
; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_u8 v0, v[40:41], off
; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    global_store_b8 v[40:41], v0, off
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_load_b32 v41, off, s33
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_ret:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i8_ret@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i8_ret@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v[40:41], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    global_store_byte v[40:41], v0, off
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Both the input and the output use address 0 of the global address space,
  ; so the address has to stay available across the call.
  %val = load i8, ptr addrspace(1) null
  %tmp = call amdgpu_gfx i8 @external_void_func_i8_ret(i8 %val)
  store i8 %tmp, ptr addrspace(1) null
  ret void
}


; Checks passing a <2 x i8> argument to, and receiving a <2 x i8> result from,
; an external amdgpu_gfx callee. The value is loaded from the null pointer in
; addrspace(1), forwarded to @external_void_func_v2i8_ret, and the returned
; vector is stored back to the same address.
; NOTE(review): the null global address looks like a placeholder so the test
; needs no arguments of its own - confirm before relying on it.
; The check lines in the body are autogenerated (see the NOTE on the first line
; of this file); regenerate them with the script instead of editing by hand.
define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 {
; GFX9-LABEL: test_call_external_void_func_v2i8_ret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    v_mov_b32_e32 v40, 0
; GFX9-NEXT:    v_mov_b32_e32 v41, 0
; GFX9-NEXT:    global_load_ushort v0, v[40:41], off
; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i8_ret@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i8_ret@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_short v[40:41], v0, off
; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2i8_ret:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i8_ret@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i8_ret@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_ushort v0, v[40:41], off
; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v0
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_short v[40:41], v0, off
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i8_ret:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
; GFX11-NEXT:    scratch_store_b32 off, v41, s33
; GFX11-NEXT:    v_mov_b32_e32 v40, 0
; GFX11-NEXT:    v_mov_b32_e32 v41, 0
; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_u16 v0, v[40:41], off
; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b16 v1, 8, v0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX11-NEXT:    global_store_b16 v[40:41], v0, off
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_load_b32 v41, off, s33
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i8_ret:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_ushort v0, v[40:41], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b16 v1, 8, v0
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    global_store_short v[40:41], v0, off
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <2 x i8>, ptr addrspace(1) null
  %tmp = call amdgpu_gfx <2 x i8> @external_void_func_v2i8_ret(<2 x i8> %val)
  store <2 x i8> %tmp, ptr addrspace(1) null
  ret void
}


; Checks passing a <3 x i8> argument to, and receiving a <3 x i8> result from,
; an external amdgpu_gfx callee. The value is loaded from the null pointer in
; addrspace(1), forwarded to @external_void_func_v3i8_ret, and the returned
; vector is stored back to the same address.
; NOTE(review): the null global address looks like a placeholder so the test
; needs no arguments of its own - confirm before relying on it.
; The check lines in the body are autogenerated (see the NOTE on the first line
; of this file); regenerate them with the script instead of editing by hand.
define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 {
; GFX9-LABEL: test_call_external_void_func_v3i8_ret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    v_mov_b32_e32 v40, 0
; GFX9-NEXT:    v_mov_b32_e32 v41, 0
; GFX9-NEXT:    global_load_dword v0, v[40:41], off
; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_mov_b32_e32 v3, 2
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_byte v[3:4], v2, off
; GFX9-NEXT:    global_store_short v[40:41], v0, off
; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3i8_ret:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_dword v0, v[40:41], off
; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT:    v_mov_b32_e32 v3, 2
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
; GFX10-NEXT:    global_store_byte v[3:4], v2, off
; GFX10-NEXT:    global_store_short v[40:41], v0, off
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i8_ret:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
; GFX11-NEXT:    scratch_store_b32 off, v41, s33
; GFX11-NEXT:    v_mov_b32_e32 v40, 0
; GFX11-NEXT:    v_mov_b32_e32 v41, 0
; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_b32 v0, v[40:41], off
; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, 2
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
; GFX11-NEXT:    v_or_b32_e32 v3, v4, v3
; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b8 v[0:1], v2, off
; GFX11-NEXT:    global_store_b16 v[40:41], v3, off
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_load_b32 v41, off, s33
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i8_ret:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[40:41], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT:    global_store_byte v[3:4], v2, off
; GFX10-SCRATCH-NEXT:    global_store_short v[40:41], v0, off
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <3 x i8>, ptr addrspace(1) null
  %tmp = call amdgpu_gfx <3 x i8> @external_void_func_v3i8_ret(<3 x i8> %val)
  store <3 x i8> %tmp, ptr addrspace(1) null
  ret void
}


; Checks passing a <4 x i8> argument to, and receiving a <4 x i8> result from,
; an external amdgpu_gfx callee. The value is loaded from the null pointer in
; addrspace(1), forwarded to @external_void_func_v4i8_ret, and the returned
; vector is stored back to the same address.
; NOTE(review): the null global address looks like a placeholder so the test
; needs no arguments of its own - confirm before relying on it.
; The check lines in the body are autogenerated (see the NOTE on the first line
; of this file); regenerate them with the script instead of editing by hand.
define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX9-LABEL: test_call_external_void_func_v4i8_ret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    v_mov_b32_e32 v40, 0
; GFX9-NEXT:    v_mov_b32_e32 v41, 0
; GFX9-NEXT:    global_load_dword v0, v[40:41], off
; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i8_ret@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i8_ret@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dword v[40:41], v0, off
; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4i8_ret:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i8_ret@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i8_ret@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_dword v0, v[40:41], off
; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dword v[40:41], v0, off
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i8_ret:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
; GFX11-NEXT:    scratch_store_b32 off, v41, s33
; GFX11-NEXT:    v_mov_b32_e32 v40, 0
; GFX11-NEXT:    v_mov_b32_e32 v41, 0
; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i8_ret@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i8_ret@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_b32 v0, v[40:41], off
; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX11-NEXT:    global_store_b32 v[40:41], v0, off
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_load_b32 v41, off, s33
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i8_ret:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i8_ret@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i8_ret@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[40:41], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    global_store_dword v[40:41], v0, off
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <4 x i8>, ptr addrspace(1) null
  %tmp = call amdgpu_gfx <4 x i8> @external_void_func_v4i8_ret(<4 x i8> %val)
  store <4 x i8> %tmp, ptr addrspace(1) null
  ret void
}


define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX9-LABEL: test_call_external_void_func_v5i8_ret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    v_mov_b32_e32 v40, 0
; GFX9-NEXT:    v_mov_b32_e32 v41, 0
; GFX9-NEXT:    global_load_dwordx2 v[5:6], v[40:41], off
; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v5i8_ret@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v5i8_ret@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
; GFX9-NEXT:    v_mov_b32_e32 v0, v5
; GFX9-NEXT:    v_mov_b32_e32 v4, v6
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_mov_b32_e32 v0, 4
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_byte v[0:1], v4, off
; GFX9-NEXT:    global_store_dword v[40:41], v2, off
; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v5i8_ret:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v5i8_ret@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v5i8_ret@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_dwordx2 v[5:6], v[40:41], off
; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
; GFX10-NEXT:    v_mov_b32_e32 v0, v5
; GFX10-NEXT:    v_mov_b32_e32 v4, v6
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
; GFX10-NEXT:    v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_mov_b32_e32 v0, 4
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_byte v[0:1], v4, off
; GFX10-NEXT:    global_store_dword v[40:41], v2, off
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v5i8_ret:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
; GFX11-NEXT:    scratch_store_b32 off, v41, s33
; GFX11-NEXT:    v_mov_b32_e32 v40, 0
; GFX11-NEXT:    v_mov_b32_e32 v41, 0
; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5i8_ret@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5i8_ret@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_b64 v[5:6], v[40:41], off
; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v0, v5
; GFX11-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
; GFX11-NEXT:    v_mov_b32_e32 v4, v6
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v0
; GFX11-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_lshlrev_b32 v3, 16, v1
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
; GFX11-NEXT:    global_store_b32 v[40:41], v2, off
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_load_b32 v41, off, s33
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i8_ret:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v5i8_ret@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v5i8_ret@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[5:6], v[40:41], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b64 v[3:4], 24, v[5:6]
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v1, 8, v5
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, v5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v6
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    global_store_byte v[0:1], v4, off
; GFX10-SCRATCH-NEXT:    global_store_dword v[40:41], v2, off
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <5 x i8>, ptr addrspace(1) null
  %tmp = call amdgpu_gfx <5 x i8> @external_void_func_v5i8_ret(<5 x i8> %val)
  store <5 x i8> %tmp, ptr addrspace(1) null
  ret void
}


; Codegen test: call an external amdgpu_gfx function that both takes and
; returns an <8 x i8> vector.
;
; The IR body loads the vector from the null addrspace(1) pointer, passes it to
; @external_void_func_v8i8_ret and stores the returned vector back to the same
; address. In the expected output below, the loaded 64-bit value is split with
; 8/16/24-bit right shifts so that v0..v7 each hold one element before
; s_swappc_b64, and v0..v7 are re-packed into two dwords after the call for a
; single 64-bit global store.
;
; NOTE: the GFX9/GFX10/GFX11/GFX10-SCRATCH check lines are autogenerated (see
; the note at the top of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX9-LABEL: test_call_external_void_func_v8i8_ret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    v_mov_b32_e32 v40, 0
; GFX9-NEXT:    v_mov_b32_e32 v41, 0
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[40:41], off
; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8i8_ret@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8i8_ret@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mov_b32_e32 v1, v8
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 8, v7
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
; GFX9-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dwordx2 v[40:41], v[3:4], off
; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v8i8_ret:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8i8_ret@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8i8_ret@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[40:41], off
; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-NEXT:    v_mov_b32_e32 v1, v8
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_lshlrev_b16 v5, 8, v5
; GFX10-NEXT:    v_lshlrev_b16 v7, 8, v7
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
; GFX10-NEXT:    v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dwordx2 v[40:41], v[0:1], off
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v8i8_ret:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
; GFX11-NEXT:    scratch_store_b32 off, v41, s33
; GFX11-NEXT:    v_mov_b32_e32 v40, 0
; GFX11-NEXT:    v_mov_b32_e32 v41, 0
; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8i8_ret@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8i8_ret@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_b64 v[0:1], v[40:41], off
; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_mov_b32_e32 v4, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX11-NEXT:    v_mov_b32_e32 v1, v8
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
; GFX11-NEXT:    v_or_b32_e32 v5, v6, v7
; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff, v4
; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v1
; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX11-NEXT:    global_store_b64 v[40:41], v[0:1], off
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_load_b32 v41, off, s33
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i8_ret:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8i8_ret@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8i8_ret@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[0:1], v[40:41], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, v8
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v5, 8, v5
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v7, 8, v7
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    global_store_dwordx2 v[40:41], v[0:1], off
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Round-trip an <8 x i8> through the external callee: the same null
  ; addrspace(1) pointer is used as both the load source and the store
  ; destination.
  %val = load <8 x i8>, ptr addrspace(1) null
  %tmp = call amdgpu_gfx <8 x i8> @external_void_func_v8i8_ret(<8 x i8> %val)
  store <8 x i8> %tmp, ptr addrspace(1) null
  ret void
}


define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX9-LABEL: test_call_external_void_func_v32i8_ret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    v_mov_b32_e32 v40, 0
; GFX9-NEXT:    v_mov_b32_e32 v42, 16
; GFX9-NEXT:    v_mov_b32_e32 v41, 0
; GFX9-NEXT:    v_mov_b32_e32 v43, 0
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[40:41], off
; GFX9-NEXT:    global_load_dwordx4 v[16:19], v[42:43], off
; GFX9-NEXT:    v_writelane_b32 v44, s34, 2
; GFX9-NEXT:    v_writelane_b32 v44, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x800
; GFX9-NEXT:    v_writelane_b32 v44, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
; GFX9-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
; GFX9-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX9-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
; GFX9-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
; GFX9-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
; GFX9-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
; GFX9-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
; GFX9-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
; GFX9-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
; GFX9-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
; GFX9-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mov_b32_e32 v8, v2
; GFX9-NEXT:    v_mov_b32_e32 v12, v3
; GFX9-NEXT:    v_mov_b32_e32 v20, v17
; GFX9-NEXT:    v_mov_b32_e32 v24, v18
; GFX9-NEXT:    v_mov_b32_e32 v28, v19
; GFX9-NEXT:    v_mov_b32_e32 v1, v35
; GFX9-NEXT:    v_mov_b32_e32 v2, v36
; GFX9-NEXT:    v_mov_b32_e32 v3, v37
; GFX9-NEXT:    v_mov_b32_e32 v17, v32
; GFX9-NEXT:    v_mov_b32_e32 v18, v33
; GFX9-NEXT:    v_mov_b32_e32 v19, v34
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 8, v5
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v3
; GFX9-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v5, 8, v7
; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v5, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v6, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v29
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v31
; GFX9-NEXT:    v_or_b32_sdwa v0, v28, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v30, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v25
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v27
; GFX9-NEXT:    v_or_b32_sdwa v0, v24, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v26, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v21
; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v23
; GFX9-NEXT:    v_or_b32_sdwa v0, v20, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v22, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 8, v13
; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 8, v9
; GFX9-NEXT:    v_or_b32_sdwa v7, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v17
; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 8, v19
; GFX9-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v13, 8, v15
; GFX9-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_lshlrev_b16_e32 v9, 8, v11
; GFX9-NEXT:    v_or_b32_sdwa v0, v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v4, v18, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v10, v10, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v9, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT:    global_store_dwordx4 v[42:43], v[0:3], off
; GFX9-NEXT:    global_store_dwordx4 v[40:41], v[6:9], off
; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT:    v_readlane_b32 s31, v44, 1
; GFX9-NEXT:    v_readlane_b32 s30, v44, 0
; GFX9-NEXT:    v_readlane_b32 s34, v44, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xf800
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v32i8_ret:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-NEXT:    v_mov_b32_e32 v42, 16
; GFX10-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-NEXT:    v_mov_b32_e32 v43, 0
; GFX10-NEXT:    v_writelane_b32 v44, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[40:41], off
; GFX10-NEXT:    global_load_dwordx4 v[16:19], v[42:43], off
; GFX10-NEXT:    v_writelane_b32 v44, s30, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x400
; GFX10-NEXT:    v_writelane_b32 v44, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
; GFX10-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
; GFX10-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX10-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
; GFX10-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
; GFX10-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
; GFX10-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
; GFX10-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
; GFX10-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
; GFX10-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
; GFX10-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
; GFX10-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
; GFX10-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-NEXT:    v_mov_b32_e32 v8, v2
; GFX10-NEXT:    v_mov_b32_e32 v12, v3
; GFX10-NEXT:    v_mov_b32_e32 v20, v17
; GFX10-NEXT:    v_mov_b32_e32 v24, v18
; GFX10-NEXT:    v_mov_b32_e32 v28, v19
; GFX10-NEXT:    v_mov_b32_e32 v1, v35
; GFX10-NEXT:    v_mov_b32_e32 v2, v36
; GFX10-NEXT:    v_mov_b32_e32 v3, v37
; GFX10-NEXT:    v_mov_b32_e32 v17, v32
; GFX10-NEXT:    v_mov_b32_e32 v18, v33
; GFX10-NEXT:    v_mov_b32_e32 v19, v34
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_lshlrev_b16 v13, 8, v13
; GFX10-NEXT:    v_lshlrev_b16 v9, 8, v9
; GFX10-NEXT:    v_lshlrev_b16 v11, 8, v11
; GFX10-NEXT:    v_lshlrev_b16 v5, 8, v5
; GFX10-NEXT:    v_lshlrev_b16 v7, 8, v7
; GFX10-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_lshlrev_b16 v13, 8, v15
; GFX10-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v9, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v7, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-NEXT:    v_or_b32_sdwa v5, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_lshlrev_b16 v8, 8, v31
; GFX10-NEXT:    v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_lshlrev_b16 v7, 8, v29
; GFX10-NEXT:    v_lshlrev_b16 v9, 8, v25
; GFX10-NEXT:    v_or_b32_sdwa v6, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_lshlrev_b16 v1, 8, v3
; GFX10-NEXT:    v_or_b32_sdwa v3, v28, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v7, v30, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v8, v24, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_lshlrev_b16 v9, 8, v27
; GFX10-NEXT:    v_lshlrev_b16 v10, 8, v21
; GFX10-NEXT:    v_lshlrev_b16 v11, 8, v23
; GFX10-NEXT:    v_lshlrev_b16 v12, 8, v17
; GFX10-NEXT:    v_lshlrev_b16 v13, 8, v19
; GFX10-NEXT:    v_or_b32_sdwa v9, v26, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v14, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v11, v22, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v13, v18, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v9, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v8, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v7, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT:    global_store_dwordx4 v[42:43], v[7:10], off
; GFX10-NEXT:    global_store_dwordx4 v[40:41], v[3:6], off
; GFX10-NEXT:    s_clause 0x3
; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33
; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:4
; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12
; GFX10-NEXT:    v_readlane_b32 s31, v44, 1
; GFX10-NEXT:    v_readlane_b32 s30, v44, 0
; GFX10-NEXT:    v_readlane_b32 s34, v44, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v32i8_ret:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:12
; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:8
; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:4
; GFX11-NEXT:    scratch_store_b32 off, v43, s33
; GFX11-NEXT:    v_mov_b32_e32 v40, 0
; GFX11-NEXT:    v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 16
; GFX11-NEXT:    v_mov_b32_e32 v43, 0
; GFX11-NEXT:    v_writelane_b32 v44, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
; GFX11-NEXT:    global_load_b128 v[0:3], v[40:41], off
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
; GFX11-NEXT:    global_load_b128 v[16:19], v[42:43], off
; GFX11-NEXT:    v_writelane_b32 v44, s30, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 32
; GFX11-NEXT:    v_writelane_b32 v44, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
; GFX11-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
; GFX11-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
; GFX11-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
; GFX11-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
; GFX11-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
; GFX11-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
; GFX11-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
; GFX11-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v35
; GFX11-NEXT:    v_mov_b32_e32 v8, v2
; GFX11-NEXT:    v_mov_b32_e32 v12, v3
; GFX11-NEXT:    v_mov_b32_e32 v20, v17
; GFX11-NEXT:    v_mov_b32_e32 v24, v18
; GFX11-NEXT:    v_dual_mov_b32 v28, v19 :: v_dual_mov_b32 v19, v34
; GFX11-NEXT:    v_dual_mov_b32 v2, v36 :: v_dual_mov_b32 v3, v37
; GFX11-NEXT:    v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_lshlrev_b16 v9, 8, v9
; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
; GFX11-NEXT:    v_lshlrev_b16 v11, 8, v11
; GFX11-NEXT:    v_and_b32_e32 v10, 0xff, v10
; GFX11-NEXT:    v_lshlrev_b16 v5, 8, v5
; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
; GFX11-NEXT:    v_lshlrev_b16 v7, 8, v7
; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v13
; GFX11-NEXT:    v_and_b32_e32 v12, 0xff, v12
; GFX11-NEXT:    v_or_b32_e32 v8, v8, v9
; GFX11-NEXT:    v_or_b32_e32 v9, v10, v11
; GFX11-NEXT:    v_or_b32_e32 v4, v4, v5
; GFX11-NEXT:    v_or_b32_e32 v5, v6, v7
; GFX11-NEXT:    v_or_b32_e32 v12, v12, v13
; GFX11-NEXT:    v_lshlrev_b16 v13, 8, v15
; GFX11-NEXT:    v_and_b32_e32 v14, 0xff, v14
; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v8
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
; GFX11-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; GFX11-NEXT:    v_lshlrev_b32_e32 v9, 16, v5
; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX11-NEXT:    v_or_b32_e32 v13, v14, v13
; GFX11-NEXT:    v_or_b32_e32 v5, v7, v8
; GFX11-NEXT:    v_or_b32_e32 v4, v4, v9
; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v28
; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v29
; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v31
; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v30
; GFX11-NEXT:    v_lshlrev_b16 v10, 8, v25
; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v24
; GFX11-NEXT:    v_and_b32_e32 v12, 0xffff, v12
; GFX11-NEXT:    v_lshlrev_b32_e32 v6, 16, v13
; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v3
; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
; GFX11-NEXT:    v_or_b32_e32 v1, v7, v1
; GFX11-NEXT:    v_or_b32_e32 v7, v9, v8
; GFX11-NEXT:    v_or_b32_e32 v8, v11, v10
; GFX11-NEXT:    v_or_b32_e32 v6, v12, v6
; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX11-NEXT:    v_and_b32_e32 v9, 0xff, v26
; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
; GFX11-NEXT:    v_and_b32_e32 v7, 0xffff, v8
; GFX11-NEXT:    v_lshlrev_b16 v8, 8, v27
; GFX11-NEXT:    v_lshlrev_b16 v10, 8, v21
; GFX11-NEXT:    v_and_b32_e32 v11, 0xff, v20
; GFX11-NEXT:    v_lshlrev_b16 v12, 8, v23
; GFX11-NEXT:    v_and_b32_e32 v13, 0xff, v22
; GFX11-NEXT:    v_lshlrev_b16 v14, 8, v17
; GFX11-NEXT:    v_and_b32_e32 v15, 0xff, v16
; GFX11-NEXT:    v_lshlrev_b16 v16, 8, v19
; GFX11-NEXT:    v_and_b32_e32 v17, 0xff, v18
; GFX11-NEXT:    v_or_b32_e32 v8, v9, v8
; GFX11-NEXT:    v_or_b32_e32 v9, v11, v10
; GFX11-NEXT:    v_or_b32_e32 v10, v13, v12
; GFX11-NEXT:    v_or_b32_e32 v11, v15, v14
; GFX11-NEXT:    v_or_b32_e32 v12, v17, v16
; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
; GFX11-NEXT:    v_and_b32_e32 v13, 0xffff, v9
; GFX11-NEXT:    v_lshlrev_b32_e32 v14, 16, v10
; GFX11-NEXT:    v_and_b32_e32 v11, 0xffff, v11
; GFX11-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT:    v_or_b32_e32 v10, v1, v3
; GFX11-NEXT:    v_or_b32_e32 v9, v7, v8
; GFX11-NEXT:    v_or_b32_e32 v8, v13, v14
; GFX11-NEXT:    v_or_b32_e32 v7, v11, v12
; GFX11-NEXT:    v_or_b32_e32 v3, v0, v2
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_store_b128 v[42:43], v[7:10], off
; GFX11-NEXT:    global_store_b128 v[40:41], v[3:6], off
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    scratch_load_b32 v43, off, s33
; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:4
; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:8
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:12
; GFX11-NEXT:    v_readlane_b32 s31, v44, 1
; GFX11-NEXT:    v_readlane_b32 s30, v44, 0
; GFX11-NEXT:    v_readlane_b32 s0, v44, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i8_ret:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v44, s33 offset:16 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:12 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v43, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v42, 16
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v43, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v44, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[40:41], off
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[16:19], v[42:43], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v44, s30, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v44, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v8, v2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v12, v3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v20, v17
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v24, v18
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v28, v19
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, v35
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, v36
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, v37
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v17, v32
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v18, v33
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v19, v34
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v13, 8, v13
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v9, 8, v9
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v11, 8, v11
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v5, 8, v5
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v7, 8, v7
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v12, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v13, 8, v15
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v8, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v9, v10, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v7, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v13, v14, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v1
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v5, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v8, 8, v31
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v4, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v7, 8, v29
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v9, 8, v25
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v6, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v1, 8, v3
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v3, v28, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v7, v30, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v8, v24, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v9, 8, v27
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v10, 8, v21
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v11, 8, v23
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v12, 8, v17
; GFX10-SCRATCH-NEXT:    v_lshlrev_b16 v13, 8, v19
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v9, v26, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v14, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v11, v22, v11 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v12, v16, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v13, v18, v13 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v9, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v8, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v7, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-SCRATCH-NEXT:    global_store_dwordx4 v[42:43], v[7:10], off
; GFX10-SCRATCH-NEXT:    global_store_dwordx4 v[40:41], v[3:6], off
; GFX10-SCRATCH-NEXT:    s_clause 0x3
; GFX10-SCRATCH-NEXT:    scratch_load_dword v43, off, s33
; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:4
; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33 offset:8
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:12
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v44, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v44, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v44, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v44, off, s33 offset:16 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <32 x i8>, ptr addrspace(1) null
  %tmp = call amdgpu_gfx <32 x i8> @external_void_func_v3i8_ret(<32 x i8> %val)
  store <32 x i8> %tmp, ptr addrspace(1) null
  ret void
}



; Calls the external amdgpu_gfx function external_void_func_v2i16 with a
; <2 x i16> argument loaded from global memory (addrspace 1).
; For every run configuration the expected code loads one dword straight into
; v0 and issues the call with no repacking of the value in between.
; The assertion lines below are autogenerated (see the note at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 {
; GFX9-LABEL: test_call_external_void_func_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i16:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The load address is undef, so the expected code above reads the pointer
  ; from v[0:1] without ever initializing those registers.
  %val = load <2 x i16>, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_v2i16(<2 x i16> %val)
  ret void
}

; Calls the external amdgpu_gfx function external_void_func_v3i16 with a
; <3 x i16> argument loaded from global memory (addrspace 1).
; For every run configuration the expected code loads two dwords straight into
; v[0:1] and issues the call with no repacking of the value in between.
; The assertion lines below are autogenerated (see the note at the top of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 {
; GFX9-LABEL: test_call_external_void_func_v3i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The load address is undef, so the expected code above reads the pointer
  ; from v[0:1] without ever initializing those registers.
  %val = load <3 x i16>, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> %val)
  ret void
}

define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 {
; GFX9-LABEL: test_call_external_void_func_v3f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3f16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3f16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3f16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3f16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3f16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3f16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3f16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3f16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <3 x half>, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_v3f16(<3 x half> %val)
  ret void
}

; Passes the constant vector <3 x i16> <i16 1, i16 2, i16 3> to the external
; amdgpu_gfx callee @external_void_func_v3i16.
; The checks expect the first two elements packed into v0 (0x20001: element 0
; in the low half, element 1 in the high half) and the third element in v1 (3).
; On gfx1100 the two register moves are expected as one v_dual_mov_b32.
define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v3i16_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i16@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT:    v_mov_b32_e32 v1, 3
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3i16_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT:    v_mov_b32_e32 v1, 3
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i16@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i16_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i16@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i16@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
  ret void
}

; Passes the constant vector <3 x half> <1.0, 2.0, 4.0> to the external
; amdgpu_gfx callee @external_void_func_v3f16.
; The checks expect the first two elements packed into v0 (0x40003c00: half 1.0
; = 0x3c00 in the low half, half 2.0 = 0x4000 in the high half) and the third
; element in v1 (half 4.0 = 0x4400).
define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v3f16_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3f16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3f16@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x40003c00
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4400
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3f16_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x40003c00
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4400
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3f16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3f16@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3f16_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40003c00
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x4400
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3f16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3f16@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x40003c00
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x4400
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3f16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3f16@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>)
  ret void
}

; Loads a <4 x i16> from an undef addrspace(1) pointer and passes it to the
; external amdgpu_gfx callee @external_void_func_v4i16.
; The checks expect one 64-bit global load into v[0:1] (global_load_dwordx2, or
; global_load_b64 on gfx1100), with no address set up before the load and no
; repacking of the loaded value before the call.
define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 {
; GFX9-LABEL: test_call_external_void_func_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <4 x i16>, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> %val)
  ret void
}

; Passes the constant vector <4 x i16> <i16 1, i16 2, i16 3, i16 4> to the
; external amdgpu_gfx callee @external_void_func_v4i16.
; The checks expect the elements packed pairwise into two 32-bit registers:
; v0 = 0x20001 (elements 0 and 1) and v1 = 0x40003 (elements 2 and 3).
define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v4i16_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i16@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40003
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4i16_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x40003
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i16@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i16_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x40003
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i16@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x20001
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x40003
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i16@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>)
  ret void
}

; Loads a <2 x half> from an undef addrspace(1) pointer and passes it to the
; external amdgpu_gfx callee @external_void_func_v2f16.
; The checks expect one 32-bit global load into v0 (global_load_dword, or
; global_load_b32 on gfx1100), with no address set up before the load and no
; repacking of the loaded value before the call.
define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 {
; GFX9-LABEL: test_call_external_void_func_v2f16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_dword v0, v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2f16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2f16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2f16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2f16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2f16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2f16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_b32 v0, v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2f16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2f16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f16:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_dword v0, v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2f16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2f16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <2 x half>, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_v2f16(<2 x half> %val)
  ret void
}

define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 {
; GFX9-LABEL: test_call_external_void_func_v2i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i32@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i32@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i32@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i32@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <2 x i32>, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> %val)
  ret void
}

; Calls @external_void_func_v2i32 with the constant vector <i32 1, i32 2>.
; The checks expect the two elements to be materialized as immediates in v0 and
; v1 (a single v_dual_mov_b32 in the gfx1100 run) ahead of the s_swappc_b64
; call, with s30, s31 and the incoming s33 value held in lanes 0-2 of v40
; across the call.
define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v2i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2i32_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-NEXT:    v_mov_b32_e32 v1, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i32@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i32@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i32@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>)
  ret void
}

; Calls @external_void_func_v3i32 with the constant vector
; <i32 3, i32 4, i32 5>. The checks expect the three elements to be
; materialized as immediates in v0, v1 and v2 before the s_swappc_b64 call.
; The function's unnamed i32 parameter is never referenced in the body.
define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_v3i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 3
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    v_mov_b32_e32 v2, 5
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3i32_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 3
; GFX10-NEXT:    v_mov_b32_e32 v1, 4
; GFX10-NEXT:    v_mov_b32_e32 v2, 5
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i32@abs32@hi
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i32@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
; GFX11-NEXT:    v_mov_b32_e32 v2, 5
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i32@abs32@hi
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i32@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 5
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i32@abs32@hi
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i32@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
  ret void
}

; Calls @external_void_func_v3i32_i32 with the constant vector
; <i32 3, i32 4, i32 5> followed by the scalar i32 6. The checks expect the
; vector elements in v0-v2 and the trailing scalar in v3 before the
; s_swappc_b64 call. The function's unnamed i32 parameter is never referenced
; in the body.
define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_v3i32_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i32_i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i32_i32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 3
; GFX9-NEXT:    v_mov_b32_e32 v1, 4
; GFX9-NEXT:    v_mov_b32_e32 v2, 5
; GFX9-NEXT:    v_mov_b32_e32 v3, 6
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3i32_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 3
; GFX10-NEXT:    v_mov_b32_e32 v1, 4
; GFX10-NEXT:    v_mov_b32_e32 v2, 5
; GFX10-NEXT:    v_mov_b32_e32 v3, 6
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i32_i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i32_i32@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i32_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
; GFX11-NEXT:    v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i32_i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i32_i32@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_i32:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 6
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i32_i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i32_i32@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
  ret void
}

; Loads a <4 x i32> through an undef addrspace(1) pointer and passes the loaded
; value to @external_void_func_v4i32. The checks expect one 128-bit global load
; straight into v[0:3] (global_load_dwordx4, or global_load_b128 in the gfx1100
; run) and no further moves of the value before the s_swappc_b64 call.
; NOTE(review): the pointer operand is undef, so the address is arbitrary; the
; test only pins the shape of the generated code.
define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
; GFX9-LABEL: test_call_external_void_func_v4i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i32@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i32@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i32@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i32@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <4 x i32>, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> %val)
  ret void
}

; Calls @external_void_func_v4i32 with the constant vector
; <i32 1, i32 2, i32 3, i32 4>. The checks expect the four elements to be
; materialized as immediates in v0-v3 (two v_dual_mov_b32 instructions in the
; gfx1100 run) before the s_swappc_b64 call.
define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v4i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, 2
; GFX9-NEXT:    v_mov_b32_e32 v2, 3
; GFX9-NEXT:    v_mov_b32_e32 v3, 4
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4i32_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-NEXT:    v_mov_b32_e32 v1, 2
; GFX10-NEXT:    v_mov_b32_e32 v2, 3
; GFX10-NEXT:    v_mov_b32_e32 v3, 4
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i32@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i32@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i32@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
  ret void
}

; Passes the constant vector <1, 2, 3, 4, 5> to external_void_func_v5i32.
; The checks expect the five elements to be materialized as immediates in
; v0-v4 ahead of the s_swappc_b64. In every run, v40 is spilled around the
; call and carries s30, s31 and the incoming s33 value in lanes 0, 1 and 2.
define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v5i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v5i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v5i32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, 2
; GFX9-NEXT:    v_mov_b32_e32 v2, 3
; GFX9-NEXT:    v_mov_b32_e32 v3, 4
; GFX9-NEXT:    v_mov_b32_e32 v4, 5
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v5i32_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-NEXT:    v_mov_b32_e32 v1, 2
; GFX10-NEXT:    v_mov_b32_e32 v2, 3
; GFX10-NEXT:    v_mov_b32_e32 v3, 4
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 5
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v5i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v5i32@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v5i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_mov_b32_e32 v4, 5
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5i32@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i32_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 5
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v5i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v5i32@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>)
  ret void
}

; Loads a <8 x i32> through a global pointer (itself loaded from an undef
; addrspace(4) address) and passes it to external_void_func_v8i32.
; The checks expect two 128-bit global loads at offsets 0 and 16 filling
; v[0:3] and v[4:7] before the s_swappc_b64; the gfx1010 and gfx1100 runs
; additionally expect them grouped under s_clause 0x1.
define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
; GFX9-LABEL: test_call_external_void_func_v8i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v8, 0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[34:35]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[34:35] offset:16
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8i32@abs32@lo
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v8i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT:    v_mov_b32_e32 v8, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[34:35]
; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[34:35] offset:16
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8i32@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v8i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v4, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[0:1]
; GFX11-NEXT:    global_load_b128 v[4:7], v4, s[0:1] offset:16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8i32@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v8, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8i32@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %ptr = load ptr addrspace(1), ptr addrspace(4) undef
  %val = load <8 x i32>, ptr addrspace(1) %ptr
  call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> %val)
  ret void
}

; Passes the constant vector <1, 2, 3, 4, 5, 6, 7, 8> to
; external_void_func_v8i32. The checks expect the eight elements to be
; materialized as immediates in v0-v7 ahead of the s_swappc_b64; the gfx1100
; run expects them emitted as four v_dual_mov_b32 pairs.
define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
; GFX9-LABEL: test_call_external_void_func_v8i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8i32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    v_mov_b32_e32 v1, 2
; GFX9-NEXT:    v_mov_b32_e32 v2, 3
; GFX9-NEXT:    v_mov_b32_e32 v3, 4
; GFX9-NEXT:    v_mov_b32_e32 v4, 5
; GFX9-NEXT:    v_mov_b32_e32 v5, 6
; GFX9-NEXT:    v_mov_b32_e32 v6, 7
; GFX9-NEXT:    v_mov_b32_e32 v7, 8
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v8i32_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-NEXT:    v_mov_b32_e32 v1, 2
; GFX10-NEXT:    v_mov_b32_e32 v2, 3
; GFX10-NEXT:    v_mov_b32_e32 v3, 4
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 5
; GFX10-NEXT:    v_mov_b32_e32 v5, 6
; GFX10-NEXT:    v_mov_b32_e32 v6, 7
; GFX10-NEXT:    v_mov_b32_e32 v7, 8
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8i32@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v8i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6
; GFX11-NEXT:    v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8i32@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 6
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 7
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 8
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8i32@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
  ret void
}

; Loads a <16 x i32> through a global pointer (itself loaded from an undef
; addrspace(4) address) and passes it to external_void_func_v16i32.
; The checks expect four 128-bit global loads at offsets 0, 16, 32 and 48
; filling v[0:15] before the s_swappc_b64; the gfx1010 and gfx1100 runs
; additionally expect them grouped under s_clause 0x3.
define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
; GFX9-LABEL: test_call_external_void_func_v16i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v16, 0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v16, s[34:35]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v16, s[34:35] offset:16
; GFX9-NEXT:    global_load_dwordx4 v[8:11], v16, s[34:35] offset:32
; GFX9-NEXT:    global_load_dwordx4 v[12:15], v16, s[34:35] offset:48
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v16i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v16i32@abs32@lo
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v16i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT:    v_mov_b32_e32 v16, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x3
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v16, s[34:35]
; GFX10-NEXT:    global_load_dwordx4 v[4:7], v16, s[34:35] offset:16
; GFX10-NEXT:    global_load_dwordx4 v[8:11], v16, s[34:35] offset:32
; GFX10-NEXT:    global_load_dwordx4 v[12:15], v16, s[34:35] offset:48
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v16i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16i32@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v16i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v12, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x3
; GFX11-NEXT:    global_load_b128 v[0:3], v12, s[0:1]
; GFX11-NEXT:    global_load_b128 v[4:7], v12, s[0:1] offset:16
; GFX11-NEXT:    global_load_b128 v[8:11], v12, s[0:1] offset:32
; GFX11-NEXT:    global_load_b128 v[12:15], v12, s[0:1] offset:48
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v16i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v16i32@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i32:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v16, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_clause 0x3
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v16, s[0:1]
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v16, s[0:1] offset:16
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[8:11], v16, s[0:1] offset:32
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[12:15], v16, s[0:1] offset:48
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v16i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v16i32@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %ptr = load ptr addrspace(1), ptr addrspace(4) undef
  %val = load <16 x i32>, ptr addrspace(1) %ptr
  call amdgpu_gfx void @external_void_func_v16i32(<16 x i32> %val)
  ret void
}

; Loads a <32 x i32> through a global pointer (itself loaded from an undef
; addrspace(4) address) and passes it to external_void_func_v32i32.
; The checks expect eight 128-bit global loads at offsets 0 through 112
; filling v[0:31] before the s_swappc_b64, with no stores other than the v40
; spill. The gfx1010 and gfx1100 runs expect the loads grouped under
; s_clause 0x7. The gfx900 run instead expects an s_nop 0 ahead of the final
; load, whose destination v[28:31] includes its own address register v28.
define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
; GFX9-LABEL: test_call_external_void_func_v32i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v28, 0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[34:35]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[34:35] offset:16
; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[34:35] offset:32
; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[34:35] offset:48
; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[34:35] offset:64
; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[34:35] offset:80
; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[34:35] offset:96
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[34:35] offset:112
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v32i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v32i32@abs32@lo
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v32i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT:    v_mov_b32_e32 v32, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x7
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v32, s[34:35]
; GFX10-NEXT:    global_load_dwordx4 v[4:7], v32, s[34:35] offset:16
; GFX10-NEXT:    global_load_dwordx4 v[8:11], v32, s[34:35] offset:32
; GFX10-NEXT:    global_load_dwordx4 v[12:15], v32, s[34:35] offset:48
; GFX10-NEXT:    global_load_dwordx4 v[16:19], v32, s[34:35] offset:64
; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[34:35] offset:80
; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[34:35] offset:96
; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[34:35] offset:112
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v32i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v32i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v28, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x7
; GFX11-NEXT:    global_load_b128 v[0:3], v28, s[0:1]
; GFX11-NEXT:    global_load_b128 v[4:7], v28, s[0:1] offset:16
; GFX11-NEXT:    global_load_b128 v[8:11], v28, s[0:1] offset:32
; GFX11-NEXT:    global_load_b128 v[12:15], v28, s[0:1] offset:48
; GFX11-NEXT:    global_load_b128 v[16:19], v28, s[0:1] offset:64
; GFX11-NEXT:    global_load_b128 v[20:23], v28, s[0:1] offset:80
; GFX11-NEXT:    global_load_b128 v[24:27], v28, s[0:1] offset:96
; GFX11-NEXT:    global_load_b128 v[28:31], v28, s[0:1] offset:112
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v32i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v32i32@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v32, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_clause 0x7
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v32i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v32i32@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %ptr = load ptr addrspace(1), ptr addrspace(4) undef
  %val = load <32 x i32>, ptr addrspace(1) %ptr
  call amdgpu_gfx void @external_void_func_v32i32(<32 x i32> %val)
  ret void
}

; Passes a <32 x i32> loaded from memory together with one extra i32 to an
; external amdgpu_gfx callee. In the checks below the vector occupies v0-v31
; (eight 128-bit loads) and the extra i32 is written to the stack at s32 just
; before the call.
define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_v32i32_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v28, 0
; GFX9-NEXT:    global_load_dword v32, v[0:1], off
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v28, s[34:35]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v28, s[34:35] offset:16
; GFX9-NEXT:    global_load_dwordx4 v[8:11], v28, s[34:35] offset:32
; GFX9-NEXT:    global_load_dwordx4 v[12:15], v28, s[34:35] offset:48
; GFX9-NEXT:    global_load_dwordx4 v[16:19], v28, s[34:35] offset:64
; GFX9-NEXT:    global_load_dwordx4 v[20:23], v28, s[34:35] offset:80
; GFX9-NEXT:    global_load_dwordx4 v[24:27], v28, s[34:35] offset:96
; GFX9-NEXT:    s_nop 0
; GFX9-NEXT:    global_load_dwordx4 v[28:31], v28, s[34:35] offset:112
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v32i32_i32@abs32@lo
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(8)
; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v32i32_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT:    v_mov_b32_e32 v32, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    global_load_dword v33, v[0:1], off
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x7
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v32, s[34:35]
; GFX10-NEXT:    global_load_dwordx4 v[4:7], v32, s[34:35] offset:16
; GFX10-NEXT:    global_load_dwordx4 v[8:11], v32, s[34:35] offset:32
; GFX10-NEXT:    global_load_dwordx4 v[12:15], v32, s[34:35] offset:48
; GFX10-NEXT:    global_load_dwordx4 v[16:19], v32, s[34:35] offset:64
; GFX10-NEXT:    global_load_dwordx4 v[20:23], v32, s[34:35] offset:80
; GFX10-NEXT:    global_load_dwordx4 v[24:27], v32, s[34:35] offset:96
; GFX10-NEXT:    global_load_dwordx4 v[28:31], v32, s[34:35] offset:112
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32_i32@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_waitcnt vmcnt(8)
; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v32i32_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v28, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    global_load_b32 v32, v[0:1], off
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x7
; GFX11-NEXT:    global_load_b128 v[0:3], v28, s[0:1]
; GFX11-NEXT:    global_load_b128 v[4:7], v28, s[0:1] offset:16
; GFX11-NEXT:    global_load_b128 v[8:11], v28, s[0:1] offset:32
; GFX11-NEXT:    global_load_b128 v[12:15], v28, s[0:1] offset:48
; GFX11-NEXT:    global_load_b128 v[16:19], v28, s[0:1] offset:64
; GFX11-NEXT:    global_load_b128 v[20:23], v28, s[0:1] offset:80
; GFX11-NEXT:    global_load_b128 v[24:27], v28, s[0:1] offset:96
; GFX11-NEXT:    global_load_b128 v[28:31], v28, s[0:1] offset:112
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v32i32_i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v32i32_i32@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(8)
; GFX11-NEXT:    scratch_store_b32 off, v32, s32
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_i32:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v32, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    global_load_dword v33, v[0:1], off
; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_clause 0x7
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v32i32_i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v32i32_i32@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(8)
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v33, s32
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
  %val0 = load <32 x i32>, ptr addrspace(1) %ptr0
  ; %val1 is the trailing i32 argument; the checks show it being stored at s32
  ; rather than handed over in a VGPR.
  %val1 = load i32, ptr addrspace(1) undef
  call amdgpu_gfx void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1)
  ret void
}

; Calls external_i32_func_i32 with the immediate 42 and volatile-stores the
; returned i32 to %out. The checks show %out (incoming v0/v1) being copied to
; v40/v41 before the call, v40/v41 being saved to and restored from the stack,
; and the result in v0 being stored through v[40:41] afterwards.
define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %out) #0 {
; GFX9-LABEL: test_call_external_i32_func_i32_imm:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v42, s34, 2
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    v_writelane_b32 v42, s30, 0
; GFX9-NEXT:    v_mov_b32_e32 v40, v0
; GFX9-NEXT:    s_mov_b32 s35, external_i32_func_i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_i32_func_i32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 42
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v42, s31, 1
; GFX9-NEXT:    v_mov_b32_e32 v41, v1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    global_store_dword v[40:41], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT:    v_readlane_b32 s31, v42, 1
; GFX9-NEXT:    v_readlane_b32 s30, v42, 0
; GFX9-NEXT:    v_readlane_b32 s34, v42, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_i32_func_i32_imm:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v42, s34, 2
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    v_mov_b32_e32 v40, v0
; GFX10-NEXT:    v_mov_b32_e32 v0, 42
; GFX10-NEXT:    s_mov_b32 s35, external_i32_func_i32@abs32@hi
; GFX10-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-NEXT:    s_mov_b32 s34, external_i32_func_i32@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_mov_b32_e32 v41, v1
; GFX10-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    global_store_dword v[40:41], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4
; GFX10-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-NEXT:    v_readlane_b32 s34, v42, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_i32_func_i32_imm:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v42, s0, 2
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:4
; GFX11-NEXT:    scratch_store_b32 off, v41, s33
; GFX11-NEXT:    v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0
; GFX11-NEXT:    v_mov_b32_e32 v0, 42
; GFX11-NEXT:    v_writelane_b32 v42, s30, 0
; GFX11-NEXT:    s_mov_b32 s1, external_i32_func_i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_i32_func_i32@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v42, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    global_store_b32 v[40:41], v0, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_load_b32 v41, off, s33
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:4
; GFX11-NEXT:    v_readlane_b32 s31, v42, 1
; GFX11-NEXT:    v_readlane_b32 s30, v42, 0
; GFX11-NEXT:    v_readlane_b32 s0, v42, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_i32_func_i32_imm:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s0, 2
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v40, v0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 42
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_i32_func_i32@abs32@hi
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_i32_func_i32@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v41, v1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v42, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    global_store_dword v[40:41], v0, off
; GFX10-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v41, off, s33
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v42, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v42, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v42, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = call amdgpu_gfx i32 @external_i32_func_i32(i32 42)
  ; The store is volatile, so it stays in the output as a use of the call result.
  store volatile i32 %val, ptr addrspace(1) %out
  ret void
}

; Loads a { i8, i32 } value from a global pointer and passes it directly to an
; external amdgpu_gfx callee. The checks show the i8 field fetched with a byte
; load into v0 and the i32 field fetched from offset 4 into v1.
define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
; GFX9-LABEL: test_call_external_void_func_struct_i8_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_ubyte v0, v2, s[34:35]
; GFX9-NEXT:    global_load_dword v1, v2, s[34:35] offset:4
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_struct_i8_i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_struct_i8_i32@abs32@lo
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_struct_i8_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_ubyte v0, v2, s[34:35]
; GFX10-NEXT:    global_load_dword v1, v2, s[34:35] offset:4
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_struct_i8_i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_struct_i8_i32@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_struct_i8_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    global_load_u8 v0, v1, s[0:1]
; GFX11-NEXT:    global_load_b32 v1, v1, s[0:1] offset:4
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_struct_i8_i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_struct_i8_i32:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v2, s[0:1]
; GFX10-SCRATCH-NEXT:    global_load_dword v1, v2, s[0:1] offset:4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_struct_i8_i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_struct_i8_i32@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
  ; The aggregate is loaded as a whole and handed to the callee as a single
  ; first-class struct argument (no byval/sret attribute on this call).
  %val = load { i8, i32 }, ptr addrspace(1) %ptr0
  call amdgpu_gfx void @external_void_func_struct_i8_i32({ i8, i32 } %val)
  ret void
}

; Fills an addrspace(5) alloca of type { i8, i32 } with { 3, 8 } and passes it
; byval to an external amdgpu_gfx callee. The checks show the two fields being
; stored at s33 and s33 offset 4, and v0 being written from s33 just before
; the call (shifted right by 6 on GFX9 and by 5 on GFX10, copied unshifted in
; the two scratch_store-based outputs).
define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v0, 3
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s33
; GFX9-NEXT:    v_mov_b32_e32 v0, 8
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_byval_struct_i8_i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_byval_struct_i8_i32@abs32@lo
; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 3
; GFX10-NEXT:    v_mov_b32_e32 v1, 8
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_byval_struct_i8_i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_byval_struct_i8_i32@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s33
; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4
; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_byval_struct_i8_i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_byval_struct_i8_i32@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_store_b8 off, v0, s33
; GFX11-NEXT:    scratch_store_b32 off, v1, s33 offset:4
; GFX11-NEXT:    v_mov_b32_e32 v0, s33
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 8
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_byval_struct_i8_i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_byval_struct_i8_i32@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s33
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v1, s33 offset:4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s33
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = alloca { i8, i32 }, align 4, addrspace(5)
  ; Address each field separately so the two constants can be stored on their own.
  %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 0
  %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 1
  store i8 3, ptr addrspace(5) %gep0
  store i32 8, ptr addrspace(5) %gep1
  call amdgpu_gfx void @external_void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %val)
  ret void
}

; Calls a callee that takes both an sret result slot and a byval argument, each
; a { i8, i32 } struct held in an addrspace(5) alloca. The byval struct is
; initialised to { 3, 8 } before the call; afterwards both fields of the sret
; struct are loaded and written out with volatile stores.
; The unnamed i32 parameter is not referenced anywhere in the body.
; The assertion lines inside the function are autogenerated (see the note on
; the first line of this file): regenerate them instead of editing by hand.
define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_mov_b32_e32 v0, 3
; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s33
; GFX9-NEXT:    v_mov_b32_e32 v0, 8
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX9-NEXT:    v_lshrrev_b32_e64 v0, 6, s33
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@lo
; GFX9-NEXT:    v_add_u32_e32 v0, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e64 v1, 6, s33
; GFX9-NEXT:    s_addk_i32 s32, 0x800
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_byte v[0:1], v0, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    global_store_dword v[0:1], v1, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xf800
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_mov_b32_e32 v0, 3
; GFX10-NEXT:    v_mov_b32_e32 v1, 8
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@lo
; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s33
; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s33 offset:4
; GFX10-NEXT:    v_lshrrev_b32_e64 v0, 5, s33
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_lshrrev_b32_e64 v1, 5, s33
; GFX10-NEXT:    s_addk_i32 s32, 0x400
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 8, v0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_ubyte v0, off, s[0:3], s33 offset:8
; GFX10-NEXT:    buffer_load_dword v1, off, s[0:3], s33 offset:12
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_store_dword v[0:1], v1, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
; GFX11-NEXT:    s_add_i32 s32, s32, 32
; GFX11-NEXT:    s_add_i32 s2, s33, 8
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_store_b8 off, v0, s33
; GFX11-NEXT:    scratch_store_b32 off, v1, s33 offset:4
; GFX11-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s33
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_load_u8 v0, off, s33 offset:8
; GFX11-NEXT:    scratch_load_b32 v1, off, s33 offset:12
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    global_store_b8 v[0:1], v0, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:16 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 8
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s33, 8
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s33
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v1, s33 offset:4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    scratch_load_ubyte v0, off, s33 offset:8
; GFX10-SCRATCH-NEXT:    scratch_load_dword v1, off, s33 offset:12
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    global_store_byte v[0:1], v0, off
; GFX10-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT:    global_store_dword v[0:1], v1, off
; GFX10-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:16 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Stack slots for the byval input and the sret output.
  %in.val = alloca { i8, i32 }, align 4, addrspace(5)
  %out.val = alloca { i8, i32 }, align 4, addrspace(5)
  ; Fill the input struct with { i8 3, i32 8 }.
  %in.gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %in.val, i32 0, i32 0
  %in.gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %in.val, i32 0, i32 1
  store i8 3, ptr addrspace(5) %in.gep0
  store i32 8, ptr addrspace(5) %in.gep1
  call amdgpu_gfx void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %out.val, ptr addrspace(5) byval({ i8, i32 }) %in.val)
  ; Read both fields of the struct the callee was given as its sret slot.
  %out.gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %out.val, i32 0, i32 0
  %out.gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %out.val, i32 0, i32 1
  %out.val0 = load i8, ptr addrspace(5) %out.gep0
  %out.val1 = load i32, ptr addrspace(5) %out.gep1

  ; Volatile stores to an undef global pointer; presumably here only so the
  ; loads above have a use that cannot be optimised away.
  store volatile i8 %out.val0, ptr addrspace(1) undef
  store volatile i32 %out.val1, ptr addrspace(1) undef
  ret void
}

; Loads a <16 x i8> value through a pointer that is itself loaded from an
; undef addrspace(4) pointer, then passes the vector by value to
; @external_void_func_v16i8. In the expected output the four loaded dwords are
; spread over v0-v15 with shifts by 8/16/24 and register moves before the call.
; NOTE(review): the declaration of @external_void_func_v16i8 near the top of
; this file takes <8 x i8>, while this call passes <16 x i8> - confirm which
; one is intended.
; The assertion lines inside the function are autogenerated (see the note on
; the first line of this file): regenerate them instead of editing by hand.
define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
; GFX9-LABEL: test_call_external_void_func_v16i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[34:35]
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v16i8@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v16i8@abs32@lo
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX9-NEXT:    v_mov_b32_e32 v4, v1
; GFX9-NEXT:    v_mov_b32_e32 v8, v2
; GFX9-NEXT:    v_mov_b32_e32 v12, v3
; GFX9-NEXT:    v_mov_b32_e32 v1, v16
; GFX9-NEXT:    v_mov_b32_e32 v2, v17
; GFX9-NEXT:    v_mov_b32_e32 v3, v18
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v16i8:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[34:35]
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v16i8@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16i8@abs32@lo
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX10-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX10-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-NEXT:    v_mov_b32_e32 v8, v2
; GFX10-NEXT:    v_mov_b32_e32 v12, v3
; GFX10-NEXT:    v_mov_b32_e32 v1, v16
; GFX10-NEXT:    v_mov_b32_e32 v2, v17
; GFX10-NEXT:    v_mov_b32_e32 v3, v18
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v16i8:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    v_mov_b32_e32 v0, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[0:1]
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v16i8@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v16i8@abs32@lo
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX11-NEXT:    v_mov_b32_e32 v4, v1
; GFX11-NEXT:    v_mov_b32_e32 v8, v2
; GFX11-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18
; GFX11-NEXT:    v_dual_mov_b32 v1, v16 :: v_dual_mov_b32 v2, v17
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i8:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v16i8@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v16i8@abs32@lo
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
; GFX10-SCRATCH-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, v1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v8, v2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v12, v3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, v16
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, v17
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, v18
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The source pointer comes from a load through an undef address, so the
  ; vector value is opaque to the optimiser.
  %ptr = load ptr addrspace(1), ptr addrspace(4) undef
  %val = load <16 x i8>, ptr addrspace(1) %ptr
  call amdgpu_gfx void @external_void_func_v16i8(<16 x i8> %val)
  ret void
}

define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-LABEL: tail_call_byval_align16:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s6, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:20
; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:16
; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s33
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s35, 3
; GFX9-NEXT:    v_writelane_b32 v40, s36, 4
; GFX9-NEXT:    v_writelane_b32 v40, s37, 5
; GFX9-NEXT:    v_writelane_b32 v40, s38, 6
; GFX9-NEXT:    v_writelane_b32 v40, s39, 7
; GFX9-NEXT:    v_writelane_b32 v40, s40, 8
; GFX9-NEXT:    v_writelane_b32 v40, s41, 9
; GFX9-NEXT:    v_writelane_b32 v40, s42, 10
; GFX9-NEXT:    v_writelane_b32 v40, s43, 11
; GFX9-NEXT:    v_writelane_b32 v40, s44, 12
; GFX9-NEXT:    v_writelane_b32 v40, s45, 13
; GFX9-NEXT:    v_writelane_b32 v40, s46, 14
; GFX9-NEXT:    v_writelane_b32 v40, s47, 15
; GFX9-NEXT:    v_writelane_b32 v40, s48, 16
; GFX9-NEXT:    v_writelane_b32 v40, s49, 17
; GFX9-NEXT:    v_writelane_b32 v40, s50, 18
; GFX9-NEXT:    v_writelane_b32 v40, s51, 19
; GFX9-NEXT:    v_writelane_b32 v40, s52, 20
; GFX9-NEXT:    v_writelane_b32 v40, s53, 21
; GFX9-NEXT:    v_writelane_b32 v40, s54, 22
; GFX9-NEXT:    v_writelane_b32 v40, s55, 23
; GFX9-NEXT:    v_writelane_b32 v40, s56, 24
; GFX9-NEXT:    v_writelane_b32 v40, s57, 25
; GFX9-NEXT:    v_writelane_b32 v40, s58, 26
; GFX9-NEXT:    v_writelane_b32 v40, s59, 27
; GFX9-NEXT:    v_writelane_b32 v40, s60, 28
; GFX9-NEXT:    v_writelane_b32 v40, s61, 29
; GFX9-NEXT:    s_addk_i32 s32, 0x800
; GFX9-NEXT:    v_writelane_b32 v40, s62, 30
; GFX9-NEXT:    s_mov_b32 s5, byval_align16_f64_arg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
; GFX9-NEXT:    v_writelane_b32 v40, s63, 31
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(2)
; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT:    v_readlane_b32 s63, v40, 31
; GFX9-NEXT:    v_readlane_b32 s62, v40, 30
; GFX9-NEXT:    v_readlane_b32 s61, v40, 29
; GFX9-NEXT:    v_readlane_b32 s60, v40, 28
; GFX9-NEXT:    v_readlane_b32 s59, v40, 27
; GFX9-NEXT:    v_readlane_b32 s58, v40, 26
; GFX9-NEXT:    v_readlane_b32 s57, v40, 25
; GFX9-NEXT:    v_readlane_b32 s56, v40, 24
; GFX9-NEXT:    v_readlane_b32 s55, v40, 23
; GFX9-NEXT:    v_readlane_b32 s54, v40, 22
; GFX9-NEXT:    v_readlane_b32 s53, v40, 21
; GFX9-NEXT:    v_readlane_b32 s52, v40, 20
; GFX9-NEXT:    v_readlane_b32 s51, v40, 19
; GFX9-NEXT:    v_readlane_b32 s50, v40, 18
; GFX9-NEXT:    v_readlane_b32 s49, v40, 17
; GFX9-NEXT:    v_readlane_b32 s48, v40, 16
; GFX9-NEXT:    v_readlane_b32 s47, v40, 15
; GFX9-NEXT:    v_readlane_b32 s46, v40, 14
; GFX9-NEXT:    v_readlane_b32 s45, v40, 13
; GFX9-NEXT:    v_readlane_b32 s44, v40, 12
; GFX9-NEXT:    v_readlane_b32 s43, v40, 11
; GFX9-NEXT:    v_readlane_b32 s42, v40, 10
; GFX9-NEXT:    v_readlane_b32 s41, v40, 9
; GFX9-NEXT:    v_readlane_b32 s40, v40, 8
; GFX9-NEXT:    v_readlane_b32 s39, v40, 7
; GFX9-NEXT:    v_readlane_b32 s38, v40, 6
; GFX9-NEXT:    v_readlane_b32 s37, v40, 5
; GFX9-NEXT:    v_readlane_b32 s36, v40, 4
; GFX9-NEXT:    v_readlane_b32 s35, v40, 3
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
; GFX9-NEXT:    s_addk_i32 s32, 0xf800
; GFX9-NEXT:    s_mov_b32 s33, s6
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: tail_call_byval_align16:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s6, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s4
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33 offset:20
; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:16
; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s33
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    s_addk_i32 s32, 0x400
; GFX10-NEXT:    s_mov_b32 s5, byval_align16_f64_arg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
; GFX10-NEXT:    s_waitcnt vmcnt(2)
; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_writelane_b32 v40, s35, 3
; GFX10-NEXT:    v_writelane_b32 v40, s36, 4
; GFX10-NEXT:    v_writelane_b32 v40, s37, 5
; GFX10-NEXT:    v_writelane_b32 v40, s38, 6
; GFX10-NEXT:    v_writelane_b32 v40, s39, 7
; GFX10-NEXT:    v_writelane_b32 v40, s40, 8
; GFX10-NEXT:    v_writelane_b32 v40, s41, 9
; GFX10-NEXT:    v_writelane_b32 v40, s42, 10
; GFX10-NEXT:    v_writelane_b32 v40, s43, 11
; GFX10-NEXT:    v_writelane_b32 v40, s44, 12
; GFX10-NEXT:    v_writelane_b32 v40, s45, 13
; GFX10-NEXT:    v_writelane_b32 v40, s46, 14
; GFX10-NEXT:    v_writelane_b32 v40, s47, 15
; GFX10-NEXT:    v_writelane_b32 v40, s48, 16
; GFX10-NEXT:    v_writelane_b32 v40, s49, 17
; GFX10-NEXT:    v_writelane_b32 v40, s50, 18
; GFX10-NEXT:    v_writelane_b32 v40, s51, 19
; GFX10-NEXT:    v_writelane_b32 v40, s52, 20
; GFX10-NEXT:    v_writelane_b32 v40, s53, 21
; GFX10-NEXT:    v_writelane_b32 v40, s54, 22
; GFX10-NEXT:    v_writelane_b32 v40, s55, 23
; GFX10-NEXT:    v_writelane_b32 v40, s56, 24
; GFX10-NEXT:    v_writelane_b32 v40, s57, 25
; GFX10-NEXT:    v_writelane_b32 v40, s58, 26
; GFX10-NEXT:    v_writelane_b32 v40, s59, 27
; GFX10-NEXT:    v_writelane_b32 v40, s60, 28
; GFX10-NEXT:    v_writelane_b32 v40, s61, 29
; GFX10-NEXT:    v_writelane_b32 v40, s62, 30
; GFX10-NEXT:    v_writelane_b32 v40, s63, 31
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT:    v_readlane_b32 s63, v40, 31
; GFX10-NEXT:    v_readlane_b32 s62, v40, 30
; GFX10-NEXT:    v_readlane_b32 s61, v40, 29
; GFX10-NEXT:    v_readlane_b32 s60, v40, 28
; GFX10-NEXT:    v_readlane_b32 s59, v40, 27
; GFX10-NEXT:    v_readlane_b32 s58, v40, 26
; GFX10-NEXT:    v_readlane_b32 s57, v40, 25
; GFX10-NEXT:    v_readlane_b32 s56, v40, 24
; GFX10-NEXT:    v_readlane_b32 s55, v40, 23
; GFX10-NEXT:    v_readlane_b32 s54, v40, 22
; GFX10-NEXT:    v_readlane_b32 s53, v40, 21
; GFX10-NEXT:    v_readlane_b32 s52, v40, 20
; GFX10-NEXT:    v_readlane_b32 s51, v40, 19
; GFX10-NEXT:    v_readlane_b32 s50, v40, 18
; GFX10-NEXT:    v_readlane_b32 s49, v40, 17
; GFX10-NEXT:    v_readlane_b32 s48, v40, 16
; GFX10-NEXT:    v_readlane_b32 s47, v40, 15
; GFX10-NEXT:    v_readlane_b32 s46, v40, 14
; GFX10-NEXT:    v_readlane_b32 s45, v40, 13
; GFX10-NEXT:    v_readlane_b32 s44, v40, 12
; GFX10-NEXT:    v_readlane_b32 s43, v40, 11
; GFX10-NEXT:    v_readlane_b32 s42, v40, 10
; GFX10-NEXT:    v_readlane_b32 s41, v40, 9
; GFX10-NEXT:    v_readlane_b32 s40, v40, 8
; GFX10-NEXT:    v_readlane_b32 s39, v40, 7
; GFX10-NEXT:    v_readlane_b32 s38, v40, 6
; GFX10-NEXT:    v_readlane_b32 s37, v40, 5
; GFX10-NEXT:    v_readlane_b32 s36, v40, 4
; GFX10-NEXT:    v_readlane_b32 s35, v40, 3
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s4
; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
; GFX10-NEXT:    s_mov_b32 s33, s6
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: tail_call_byval_align16:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s4, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:24 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s0
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    scratch_load_b64 v[32:33], off, s33 offset:16
; GFX11-NEXT:    scratch_load_b32 v31, off, s33
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s32, s32, 32
; GFX11-NEXT:    s_mov_b32 s1, byval_align16_f64_arg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, byval_align16_f64_arg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    v_writelane_b32 v40, s34, 2
; GFX11-NEXT:    v_writelane_b32 v40, s35, 3
; GFX11-NEXT:    v_writelane_b32 v40, s36, 4
; GFX11-NEXT:    v_writelane_b32 v40, s37, 5
; GFX11-NEXT:    v_writelane_b32 v40, s38, 6
; GFX11-NEXT:    v_writelane_b32 v40, s39, 7
; GFX11-NEXT:    v_writelane_b32 v40, s40, 8
; GFX11-NEXT:    v_writelane_b32 v40, s41, 9
; GFX11-NEXT:    v_writelane_b32 v40, s42, 10
; GFX11-NEXT:    v_writelane_b32 v40, s43, 11
; GFX11-NEXT:    v_writelane_b32 v40, s44, 12
; GFX11-NEXT:    v_writelane_b32 v40, s45, 13
; GFX11-NEXT:    v_writelane_b32 v40, s46, 14
; GFX11-NEXT:    v_writelane_b32 v40, s47, 15
; GFX11-NEXT:    v_writelane_b32 v40, s48, 16
; GFX11-NEXT:    v_writelane_b32 v40, s49, 17
; GFX11-NEXT:    v_writelane_b32 v40, s50, 18
; GFX11-NEXT:    v_writelane_b32 v40, s51, 19
; GFX11-NEXT:    v_writelane_b32 v40, s52, 20
; GFX11-NEXT:    v_writelane_b32 v40, s53, 21
; GFX11-NEXT:    v_writelane_b32 v40, s54, 22
; GFX11-NEXT:    v_writelane_b32 v40, s55, 23
; GFX11-NEXT:    v_writelane_b32 v40, s56, 24
; GFX11-NEXT:    v_writelane_b32 v40, s57, 25
; GFX11-NEXT:    v_writelane_b32 v40, s58, 26
; GFX11-NEXT:    v_writelane_b32 v40, s59, 27
; GFX11-NEXT:    v_writelane_b32 v40, s60, 28
; GFX11-NEXT:    v_writelane_b32 v40, s61, 29
; GFX11-NEXT:    v_writelane_b32 v40, s62, 30
; GFX11-NEXT:    v_writelane_b32 v40, s63, 31
; GFX11-NEXT:    s_waitcnt vmcnt(1)
; GFX11-NEXT:    scratch_store_b64 off, v[32:33], s32
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_readlane_b32 s63, v40, 31
; GFX11-NEXT:    v_readlane_b32 s62, v40, 30
; GFX11-NEXT:    v_readlane_b32 s61, v40, 29
; GFX11-NEXT:    v_readlane_b32 s60, v40, 28
; GFX11-NEXT:    v_readlane_b32 s59, v40, 27
; GFX11-NEXT:    v_readlane_b32 s58, v40, 26
; GFX11-NEXT:    v_readlane_b32 s57, v40, 25
; GFX11-NEXT:    v_readlane_b32 s56, v40, 24
; GFX11-NEXT:    v_readlane_b32 s55, v40, 23
; GFX11-NEXT:    v_readlane_b32 s54, v40, 22
; GFX11-NEXT:    v_readlane_b32 s53, v40, 21
; GFX11-NEXT:    v_readlane_b32 s52, v40, 20
; GFX11-NEXT:    v_readlane_b32 s51, v40, 19
; GFX11-NEXT:    v_readlane_b32 s50, v40, 18
; GFX11-NEXT:    v_readlane_b32 s49, v40, 17
; GFX11-NEXT:    v_readlane_b32 s48, v40, 16
; GFX11-NEXT:    v_readlane_b32 s47, v40, 15
; GFX11-NEXT:    v_readlane_b32 s46, v40, 14
; GFX11-NEXT:    v_readlane_b32 s45, v40, 13
; GFX11-NEXT:    v_readlane_b32 s44, v40, 12
; GFX11-NEXT:    v_readlane_b32 s43, v40, 11
; GFX11-NEXT:    v_readlane_b32 s42, v40, 10
; GFX11-NEXT:    v_readlane_b32 s41, v40, 9
; GFX11-NEXT:    v_readlane_b32 s40, v40, 8
; GFX11-NEXT:    v_readlane_b32 s39, v40, 7
; GFX11-NEXT:    v_readlane_b32 s38, v40, 6
; GFX11-NEXT:    v_readlane_b32 s37, v40, 5
; GFX11-NEXT:    v_readlane_b32 s36, v40, 4
; GFX11-NEXT:    v_readlane_b32 s35, v40, 3
; GFX11-NEXT:    v_readlane_b32 s34, v40, 2
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s0
; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
; GFX11-NEXT:    s_mov_b32 s33, s4
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: tail_call_byval_align16:
; GFX10-SCRATCH:       ; %bb.0: ; %entry
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:24 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    scratch_load_dwordx2 v[32:33], off, s33 offset:16
; GFX10-SCRATCH-NEXT:    scratch_load_dword v31, off, s33
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 32
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, byval_align16_f64_arg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, byval_align16_f64_arg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s35, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 5
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 6
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 7
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s40, 8
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s41, 9
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s42, 10
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s43, 11
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s44, 12
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s45, 13
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s46, 14
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s47, 15
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s48, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 17
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 18
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 19
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s52, 20
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s53, 21
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s54, 22
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s55, 23
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s56, 24
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s57, 25
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s58, 26
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s59, 27
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s60, 28
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s61, 29
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s62, 30
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s63, 31
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s63, v40, 31
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s62, v40, 30
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s61, v40, 29
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s60, v40, 28
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s59, v40, 27
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s58, v40, 26
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s57, v40, 25
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s56, v40, 24
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s55, v40, 23
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s54, v40, 22
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s53, v40, 21
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s52, v40, 20
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s51, v40, 19
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s50, v40, 18
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s49, v40, 17
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s48, v40, 16
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s47, v40, 15
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s46, v40, 14
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s45, v40, 13
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s44, v40, 12
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s43, v40, 11
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s42, v40, 10
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s41, v40, 9
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s40, v40, 8
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s39, v40, 7
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s38, v40, 6
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s37, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s36, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s35, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s0, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:24 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
; GFX10-SCRATCH-NEXT:    s_addk_i32 s32, 0xffe0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s4
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
entry:
  %alloca = alloca double, align 8, addrspace(5)
  tail call amdgpu_gfx void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca)
  ret void
}

; inreg arguments are put in sgprs
;
; Calls @external_void_func_i1_inreg with the constant `i1 inreg true`.
; NOTE(review): the checks below do not show the i1 value in an SGPR. In every
; run configuration the constant 1 is materialized in v0 and written as a
; single byte at s32 (buffer_store_byte, scratch_store_b8 or
; scratch_store_byte) just before s_swappc_b64 - confirm this is the intended
; lowering for an inreg i1.
; Around the call, s30/s31 and the previous s33 value are kept in lanes 0-2 of
; v40, which is itself spilled and reloaded with all lanes enabled.
define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_i1_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 1
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i1_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i1_inreg@abs32@lo
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], s32
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i1_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i1_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i1_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    buffer_store_byte v0, off, s[0:3], s32
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i1_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_mov_b32_e32 v0, 1
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i1_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i1_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    scratch_store_b8 off, v0, s32
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i1_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i1_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i1_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_i1_inreg(i1 inreg true)
  ret void
}

; Calls @external_void_func_i8_inreg with the constant `i8 inreg 123`.
; The checks show the constant loaded into s4 (s_movk_i32 s4, 0x7b) before
; s_swappc_b64. The incoming s4 value is first written to lane 0 of v40 and
; read back after the call, alongside s30/s31 (lanes 1-2) and the previous s33
; value (lane 3).
; The function also takes an unnamed i32 parameter that the body never
; references.
define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_i8_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i8_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i8_inreg@abs32@lo
; GFX9-NEXT:    s_movk_i32 s4, 0x7b
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i8_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i8_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i8_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_movk_i32 s4, 0x7b
; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i8_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i8_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i8_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_movk_i32 s4, 0x7b
; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i8_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i8_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i8_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_i8_inreg(i8 inreg 123)
  ret void
}

; Calls @external_void_func_i16_inreg with the constant `i16 inreg 123`.
; The checks show the constant loaded into s4 (s_movk_i32 s4, 0x7b) before
; s_swappc_b64. The incoming s4 value is first written to lane 0 of v40 and
; read back after the call, alongside s30/s31 (lanes 1-2) and the previous s33
; value (lane 3).
define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_i16_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i16_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i16_inreg@abs32@lo
; GFX9-NEXT:    s_movk_i32 s4, 0x7b
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i16_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i16_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i16_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_movk_i32 s4, 0x7b
; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i16_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i16_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i16_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_movk_i32 s4, 0x7b
; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i16_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i16_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_i16_inreg(i16 inreg 123)
  ret void
}

; Calls @external_void_func_i32_inreg with the constant `i32 inreg 42`.
; The checks show the constant loaded into s4 (s_mov_b32 s4, 42) before
; s_swappc_b64. The incoming s4 value is first written to lane 0 of v40 and
; read back after the call, alongside s30/s31 (lanes 1-2) and the previous s33
; value (lane 3).
; The function also takes an unnamed i32 parameter that the body never
; references.
define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_i32_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 42
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i32_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i32_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 42
; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i32_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i32_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 42
; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i32_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 42
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_i32_inreg(i32 inreg 42)
  ret void
}

; Calls @external_void_func_i64_inreg with the constant `i64 inreg 123`.
; The checks show the 64-bit constant split across two SGPRs before
; s_swappc_b64: s4 receives 0x7b and s5 receives 0. The incoming s4/s5 values
; are first written to lanes 0-1 of v40 and read back after the call, alongside
; s30/s31 (lanes 2-3) and the previous s33 value (lane 4).
define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_i64_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_i64_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_i64_inreg@abs32@lo
; GFX9-NEXT:    s_movk_i32 s4, 0x7b
; GFX9-NEXT:    s_mov_b32 s5, 0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_i64_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_i64_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_i64_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_movk_i32 s4, 0x7b
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 0
; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_i64_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_i64_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_i64_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_movk_i32 s4, 0x7b
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 0
; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_i64_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_i64_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_i64_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x7b
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_i64_inreg(i64 inreg 123)
  ret void
}

; Passes a loaded <2 x i64> value as an inreg argument to
; @external_void_func_v2i64_inreg. In the expected output below the value is
; fetched with one 128-bit scalar load straight into s[4:7]; s4-s7 and the
; return address in s[30:31] are spilled to lanes of v40 before the call and
; read back afterwards.
define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v2i64_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    s_mov_b64 s[34:35], 0
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2i64_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
; GFX10-NEXT:    s_mov_b64 s[34:35], 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i64_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
; GFX10-SCRATCH-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The argument is read from the null addrspace(4) pointer, so it is not a
  ; compile-time constant.
  %val = load <2 x i64>, ptr addrspace(4) null
  call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg %val)
  ret void
}

; Passes a constant <2 x i64> as an inreg argument to
; @external_void_func_v2i64_inreg. In the expected output below the constant
; is materialized as four 32-bit immediates in s4-s7; s4-s7 and the return
; address in s[30:31] are spilled to lanes of v40 before the call and read
; back afterwards.
define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v2i64_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 1
; GFX9-NEXT:    s_mov_b32 s5, 2
; GFX9-NEXT:    s_mov_b32 s6, 3
; GFX9-NEXT:    s_mov_b32 s7, 4
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2i64_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 1
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 2
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    s_mov_b32 s6, 3
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    s_mov_b32 s7, 4
; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i64_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 1
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 2
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    s_mov_b32 s6, 3
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    s_mov_b32 s7, 4
; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i64_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; 8589934593 = (2 << 32) | 1 and 17179869187 = (4 << 32) | 3, so the four
  ; 32-bit halves in register order are 1, 2, 3, 4.
  call amdgpu_gfx void @external_void_func_v2i64_inreg(<2 x i64> inreg <i64 8589934593, i64 17179869187>)
  ret void
}

; Passes a <3 x i64> inreg argument to @external_void_func_v3i64_inreg, built
; from a loaded <2 x i64> plus one constant element. In the expected output
; below the loaded part lands in s[4:7] via one 128-bit scalar load and the
; constant element is materialized as immediates in s8/s9; s4-s9 and the
; return address in s[30:31] are spilled to lanes of v40 before the call and
; read back afterwards.
define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v3i64_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 8
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    s_mov_b64 s[34:35], 0
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
; GFX9-NEXT:    v_writelane_b32 v40, s30, 6
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s8, 1
; GFX9-NEXT:    s_mov_b32 s9, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 8
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3i64_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 8
; GFX10-NEXT:    s_mov_b64 s[34:35], 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-NEXT:    s_mov_b32 s8, 1
; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-NEXT:    s_mov_b32 s9, 2
; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 8
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i64_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 8
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i64_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i64_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
; GFX11-NEXT:    s_mov_b32 s8, 1
; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
; GFX11-NEXT:    s_mov_b32 s9, 2
; GFX11-NEXT:    v_writelane_b32 v40, s30, 6
; GFX11-NEXT:    v_writelane_b32 v40, s31, 7
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
; GFX11-NEXT:    v_readlane_b32 s30, v40, 6
; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 8
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i64_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 8
; GFX10-SCRATCH-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i64_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i64_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 8
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %load = load <2 x i64>, ptr addrspace(4) null
  ; Mask <0, 1, 2> keeps both loaded elements and appends element 0 of the
  ; constant operand: 8589934593 = (2 << 32) | 1, i.e. 32-bit halves 1 and 2.
  ; The undef second element of the constant operand is never selected.
  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>

  call amdgpu_gfx void @external_void_func_v3i64_inreg(<3 x i64> inreg %val)
  ret void
}

; Passes a <4 x i64> inreg argument to @external_void_func_v4i64_inreg, built
; by concatenating a loaded <2 x i64> with a constant <2 x i64>. In the
; expected output below the loaded half lands in s[4:7] via one 128-bit scalar
; load and the constant half is materialized as immediates in s8-s11; s4-s11
; and the return address in s[30:31] are spilled to lanes of v40 before the
; call and read back afterwards.
define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v4i64_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 10
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    s_mov_b64 s[34:35], 0
; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
; GFX9-NEXT:    v_writelane_b32 v40, s30, 8
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s8, 1
; GFX9-NEXT:    s_mov_b32 s9, 2
; GFX9-NEXT:    s_mov_b32 s10, 3
; GFX9-NEXT:    s_mov_b32 s11, 4
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 10
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4i64_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 10
; GFX10-NEXT:    s_mov_b64 s[34:35], 0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-NEXT:    s_mov_b32 s8, 1
; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-NEXT:    s_mov_b32 s9, 2
; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
; GFX10-NEXT:    s_mov_b32 s10, 3
; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
; GFX10-NEXT:    s_mov_b32 s11, 4
; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 10
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i64_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 10
; GFX11-NEXT:    s_mov_b64 s[0:1], 0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i64_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i64_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
; GFX11-NEXT:    s_mov_b32 s8, 1
; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
; GFX11-NEXT:    s_mov_b32 s9, 2
; GFX11-NEXT:    v_writelane_b32 v40, s10, 6
; GFX11-NEXT:    s_mov_b32 s10, 3
; GFX11-NEXT:    v_writelane_b32 v40, s11, 7
; GFX11-NEXT:    s_mov_b32 s11, 4
; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 10
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i64_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 10
; GFX10-SCRATCH-NEXT:    s_mov_b64 s[0:1], 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i64_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i64_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
; GFX10-SCRATCH-NEXT:    s_mov_b32 s10, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
; GFX10-SCRATCH-NEXT:    s_mov_b32 s11, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 10
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %load = load <2 x i64>, ptr addrspace(4) null
  ; Mask <0, 1, 2, 3> concatenates the loaded vector with the constant one.
  ; 8589934593 = (2 << 32) | 1 and 17179869187 = (4 << 32) | 3, so the
  ; constant half contributes 32-bit values 1, 2, 3, 4.
  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call amdgpu_gfx void @external_void_func_v4i64_inreg(<4 x i64> inreg %val)
  ret void
}

; Calls an external amdgpu_gfx function with a half immediate (4.0) passed as
; an inreg argument. The check lines inside the body are autogenerated (see the
; note at the top of this file) and should be regenerated with
; utils/update_llc_test_checks.py rather than edited by hand.
define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_f16_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_f16_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_f16_inreg@abs32@lo
; GFX9-NEXT:    s_movk_i32 s4, 0x4400
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_f16_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_f16_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_f16_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_movk_i32 s4, 0x4400
; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_f16_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_f16_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_f16_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_movk_i32 s4, 0x4400
; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_f16_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_f16_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_f16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_movk_i32 s4, 0x4400
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; In every checked configuration the previous s4 is parked in lane 0 of v40,
  ; the argument is materialized with s_movk_i32 s4, 0x4400 (the half bit
  ; pattern of 4.0), and s4 is read back from lane 0 once the callee returns.
  call amdgpu_gfx void @external_void_func_f16_inreg(half inreg 4.0)
  ret void
}

; Calls an external amdgpu_gfx function with a float immediate (4.0) passed as
; an inreg argument. The check lines inside the body are autogenerated (see the
; note at the top of this file) and should be regenerated with
; utils/update_llc_test_checks.py rather than edited by hand.
define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_f32_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_f32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_f32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 4.0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_f32_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_f32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_f32_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 4.0
; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_f32_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_f32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_f32_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 4.0
; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_f32_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_f32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_f32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 4.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; In every checked configuration the previous s4 is parked in lane 0 of v40,
  ; the argument is materialized with s_mov_b32 s4, 4.0, and s4 is read back
  ; from lane 0 once the callee returns.
  call amdgpu_gfx void @external_void_func_f32_inreg(float inreg 4.0)
  ret void
}

; Calls an external amdgpu_gfx function with a constant <2 x float> vector
; (1.0, 2.0) passed as an inreg argument. The check lines inside the body are
; autogenerated (see the note at the top of this file) and should be
; regenerated with utils/update_llc_test_checks.py rather than edited by hand.
define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v2f32_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2f32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2f32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 1.0
; GFX9-NEXT:    s_mov_b32 s5, 2.0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2f32_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2f32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2f32_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 1.0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 2.0
; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2f32_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2f32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2f32_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 1.0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 2.0
; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f32_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2f32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2f32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; In every checked configuration the previous s4 and s5 are parked in lanes
  ; 0 and 1 of v40, the two elements are materialized as s4 = 1.0 and
  ; s5 = 2.0, and both registers are read back once the callee returns.
  call amdgpu_gfx void @external_void_func_v2f32_inreg(<2 x float> inreg <float 1.0, float 2.0>)
  ret void
}

; Calls an external amdgpu_gfx function with a constant <3 x float> vector
; (1.0, 2.0, 4.0) passed as an inreg argument. The check lines inside the body
; are autogenerated (see the note at the top of this file) and should be
; regenerated with utils/update_llc_test_checks.py rather than edited by hand.
define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v3f32_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 5
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 3
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 1.0
; GFX9-NEXT:    s_mov_b32 s5, 2.0
; GFX9-NEXT:    s_mov_b32 s6, 4.0
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 5
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3f32_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 5
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 1.0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 2.0
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    s_mov_b32 s6, 4.0
; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 5
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3f32_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 5
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3f32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3f32_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 1.0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 2.0
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    s_mov_b32 s6, 4.0
; GFX11-NEXT:    v_writelane_b32 v40, s30, 3
; GFX11-NEXT:    v_writelane_b32 v40, s31, 4
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
; GFX11-NEXT:    v_readlane_b32 s30, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 5
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f32_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 5
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3f32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3f32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 4.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 5
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; In every checked configuration the previous s4, s5 and s6 are parked in
  ; lanes 0 through 2 of v40, the three elements are materialized as s4 = 1.0,
  ; s5 = 2.0 and s6 = 4.0, and the registers are read back after the call.
  call amdgpu_gfx void @external_void_func_v3f32_inreg(<3 x float> inreg <float 1.0, float 2.0, float 4.0>)
  ret void
}

; Calls an external amdgpu_gfx function with a constant <5 x float> vector
; (1.0, 2.0, 4.0, -1.0, 0.5) passed as an inreg argument. The check lines
; inside the body are autogenerated (see the note at the top of this file) and
; should be regenerated with utils/update_llc_test_checks.py rather than edited
; by hand.
define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v5f32_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 7
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
; GFX9-NEXT:    v_writelane_b32 v40, s30, 5
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 1.0
; GFX9-NEXT:    s_mov_b32 s5, 2.0
; GFX9-NEXT:    s_mov_b32 s6, 4.0
; GFX9-NEXT:    s_mov_b32 s7, -1.0
; GFX9-NEXT:    s_mov_b32 s8, 0.5
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 7
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v5f32_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 7
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 1.0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 2.0
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    s_mov_b32 s6, 4.0
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    s_mov_b32 s7, -1.0
; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-NEXT:    s_mov_b32 s8, 0.5
; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 7
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v5f32_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 7
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5f32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5f32_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 1.0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 2.0
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    s_mov_b32 s6, 4.0
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    s_mov_b32 s7, -1.0
; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
; GFX11-NEXT:    s_mov_b32 s8, 0.5
; GFX11-NEXT:    v_writelane_b32 v40, s30, 5
; GFX11-NEXT:    v_writelane_b32 v40, s31, 6
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
; GFX11-NEXT:    v_readlane_b32 s30, v40, 5
; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 7
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5f32_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 7
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v5f32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v5f32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 4.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, -1.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 0.5
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 7
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; In every checked configuration the previous s4 through s8 are parked in
  ; lanes 0 through 4 of v40, the five elements are materialized in order into
  ; s4 through s8, and the registers are read back after the call.
  call amdgpu_gfx void @external_void_func_v5f32_inreg(<5 x float> inreg <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
  ret void
}

; Passes the double immediate 4.0 as an inreg argument to an amdgpu_gfx callee.
; In the expected output of every RUN configuration the argument is placed in
; s4 (low dword, 0) and s5 (high dword, 0x40100000). The incoming s4/s5 values
; are written to lanes 0-1 of v40 before being overwritten and read back after
; the call; lanes 2-3 hold the return address s[30:31] and lane 4 holds the
; previous value of s33.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_f64_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_f64_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_f64_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 0
; GFX9-NEXT:    s_mov_b32 s5, 0x40100000
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_f64_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_f64_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_f64_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 0x40100000
; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_f64_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_f64_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_f64_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 0x40100000
; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_f64_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_f64_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_f64_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0x40100000
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_f64_inreg(double inreg 4.0)
  ret void
}

; Passes the constant vector <2 x double> <2.0, 4.0> as an inreg argument to an
; amdgpu_gfx callee. In the expected output of every RUN configuration the
; vector is placed in s4-s7: s4 and s6 are the zero low dwords, s5 (printed as
; the inline constant 2.0, bit pattern 0x40000000) and s7 (0x40100000) are the
; high dwords. The incoming s4-s7 values are written to lanes 0-3 of v40 before
; being overwritten and read back after the call; lanes 4-5 hold the return
; address s[30:31] and lane 6 holds the previous value of s33.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v2f64_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 0
; GFX9-NEXT:    s_mov_b32 s5, 2.0
; GFX9-NEXT:    s_mov_b32 s6, 0
; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2f64_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 2.0
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    s_mov_b32 s6, 0
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    s_mov_b32 s7, 0x40100000
; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2f64_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2f64_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2f64_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 2.0
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    s_mov_b32 s6, 0
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    s_mov_b32 s7, 0x40100000
; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f64_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2f64_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2f64_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 0x40100000
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v2f64_inreg(<2 x double> inreg <double 2.0, double 4.0>)
  ret void
}

; Passes the constant vector <3 x double> <2.0, 4.0, 8.0> as an inreg argument
; to an amdgpu_gfx callee. In the expected output of every RUN configuration
; the vector is placed in s4-s9: s4, s6 and s8 are the zero low dwords; s5
; (printed as the inline constant 2.0, bit pattern 0x40000000), s7 (0x40100000)
; and s9 (0x40200000) are the high dwords. The incoming s4-s9 values are
; written to lanes 0-5 of v40 before being overwritten and read back after the
; call; lanes 6-7 hold the return address s[30:31] and lane 8 holds the
; previous value of s33.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v3f64_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 8
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
; GFX9-NEXT:    v_writelane_b32 v40, s30, 6
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 0
; GFX9-NEXT:    s_mov_b32 s5, 2.0
; GFX9-NEXT:    s_mov_b32 s6, 0
; GFX9-NEXT:    s_mov_b32 s7, 0x40100000
; GFX9-NEXT:    s_mov_b32 s8, 0
; GFX9-NEXT:    s_mov_b32 s9, 0x40200000
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 7
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 7
; GFX9-NEXT:    v_readlane_b32 s30, v40, 6
; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 8
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3f64_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 8
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 2.0
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    s_mov_b32 s6, 0
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    s_mov_b32 s7, 0x40100000
; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-NEXT:    s_mov_b32 s8, 0
; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-NEXT:    s_mov_b32 s9, 0x40200000
; GFX10-NEXT:    v_writelane_b32 v40, s30, 6
; GFX10-NEXT:    v_writelane_b32 v40, s31, 7
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 7
; GFX10-NEXT:    v_readlane_b32 s30, v40, 6
; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 8
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3f64_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 8
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3f64_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3f64_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 2.0
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    s_mov_b32 s6, 0
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    s_mov_b32 s7, 0x40100000
; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
; GFX11-NEXT:    s_mov_b32 s8, 0
; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
; GFX11-NEXT:    s_mov_b32 s9, 0x40200000
; GFX11-NEXT:    v_writelane_b32 v40, s30, 6
; GFX11-NEXT:    v_writelane_b32 v40, s31, 7
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 7
; GFX11-NEXT:    v_readlane_b32 s30, v40, 6
; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 8
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f64_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 8
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3f64_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3f64_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2.0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 0x40100000
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 0x40200000
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 6
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 7
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 7
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 6
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 8
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v3f64_inreg(<3 x double> inreg <double 2.0, double 4.0, double 8.0>)
  ret void
}

; Loads a <2 x i16> through an undef addrspace(4) pointer and passes it as an
; inreg argument to an amdgpu_gfx callee. In the expected output of every RUN
; configuration the value is fetched with a single one-dword scalar load
; directly into s4. The incoming s4 is written to lane 0 of v40 before the load
; and read back after the call; lanes 1-2 hold the return address s[30:31] and
; lane 3 holds the previous value of s33.
; NOTE(review): because the pointer is undef, the base register pair named in
; the s_load line is presumably arbitrary and may change on regeneration.
; The assertions below are autogenerated: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v2i16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    s_load_dword s4, s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i16_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i16_inreg@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2i16_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_load_dword s4, s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i16_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i16_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i16_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i16_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i16_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i16_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <2 x i16>, ptr addrspace(4) undef
  call amdgpu_gfx void @external_void_func_v2i16_inreg(<2 x i16> inreg %val)
  ret void
}

define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v3i16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3i16_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <3 x i16>, ptr addrspace(4) undef
  call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg %val)
  ret void
}

; Calls external_void_func_v3f16_inreg with a <3 x half> value that is loaded
; from an undef addrspace(4) pointer and passed as an inreg argument.
; In the expected output the value is produced directly in s[4:5] by a single
; 64-bit scalar load before the call. The incoming s4/s5, the return address
; s[30:31] and the old s33 are written to lanes 0-4 of v40 before the call and
; read back afterwards; v40 itself is spilled and reloaded with all lanes
; enabled in exec.
; NOTE(review): because the pointer is undef, the address registers shown in the
; expected s_load are presumably arbitrary rather than meaningful - confirm
; before relying on them.
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v3f16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3f16_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3f16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <3 x half>, ptr addrspace(4) undef
  call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg %val)
  ret void
}

; Calls external_void_func_v3i16_inreg with the constant vector
; <3 x i16> <1, 2, 3> passed as an inreg argument.
; In the expected output the constant is materialized as two SGPR immediates
; before the call: s4 = 0x20001 (element 0 in the low 16 bits, element 1 in the
; high 16 bits) and s5 = 3 (element 2). No memory load is expected.
; The incoming s4/s5, the return address s[30:31] and the old s33 are written to
; lanes 0-4 of v40 before the call and read back afterwards.
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v3i16_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 0x20001
; GFX9-NEXT:    s_mov_b32 s5, 3
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3i16_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 0x20001
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 3
; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i16_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 0x20001
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 3
; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i16_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x20001
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v3i16_inreg(<3 x i16> inreg <i16 1, i16 2, i16 3>)
  ret void
}

; Calls external_void_func_v3f16_inreg with the constant vector
; <3 x half> <1.0, 2.0, 4.0> passed as an inreg argument.
; In the expected output the constant is materialized as two SGPR immediates
; before the call: s4 = 0x40003c00 (half 1.0 = 0x3c00 in the low 16 bits,
; half 2.0 = 0x4000 in the high 16 bits) and s5 = 0x4400 (half 4.0), the latter
; via s_movk_i32. No memory load is expected.
; The incoming s4/s5, the return address s[30:31] and the old s33 are written to
; lanes 0-4 of v40 before the call and read back afterwards.
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v3f16_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 0x40003c00
; GFX9-NEXT:    s_movk_i32 s5, 0x4400
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3f16_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 0x40003c00
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_movk_i32 s5, 0x4400
; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3f16_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 0x40003c00
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_movk_i32 s5, 0x4400
; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3f16_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x40003c00
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_movk_i32 s5, 0x4400
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v3f16_inreg(<3 x half> inreg <half 1.0, half 2.0, half 4.0>)
  ret void
}

; Calls external_void_func_v4i16_inreg with a <4 x i16> value that is loaded
; from an undef addrspace(4) pointer and passed as an inreg argument.
; In the expected output the value is produced directly in s[4:5] by a single
; 64-bit scalar load before the call. The incoming s4/s5, the return address
; s[30:31] and the old s33 are written to lanes 0-4 of v40 before the call and
; read back afterwards; v40 itself is spilled and reloaded with all lanes
; enabled in exec.
; NOTE(review): because the pointer is undef, the address registers shown in the
; expected s_load are presumably arbitrary rather than meaningful - confirm
; before relying on them.
; The check lines below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v4i16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4i16_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <4 x i16>, ptr addrspace(4) undef
  call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg %val)
  ret void
}

define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v4i16_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 0x20001
; GFX9-NEXT:    s_mov_b32 s5, 0x40003
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4i16_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 0x20001
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 0x40003
; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i16_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 0x20001
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 0x40003
; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i16_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 0x20001
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 0x40003
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v4i16_inreg(<4 x i16> inreg <i16 1, i16 2, i16 3, i16 4>)
  ret void
}

; Calls external_void_func_v2f16_inreg with a single <2 x half> argument marked
; inreg. The argument is loaded from an undef addrspace(4) pointer. In the
; expected output for every run line the incoming s4 is first saved to lane 0
; of v40, the loaded value is placed in s4, and s4 is restored after the call.
define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v2f16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 3
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    s_load_dword s4, s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 1
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2f16_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2f16_inreg@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 2
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 2
; GFX9-NEXT:    v_readlane_b32 s30, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 3
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2f16_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 3
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_load_dword s4, s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2f16_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2f16_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 3
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2f16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 3
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2f16_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2f16_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 1
; GFX11-NEXT:    v_writelane_b32 v40, s31, 2
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 2
; GFX11-NEXT:    v_readlane_b32 s30, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 3
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2f16_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 3
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_load_dword s4, s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2f16_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2f16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 3
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The pointer operand is undef: the test only needs some non-constant
  ; <2 x half> value to hand to the callee as an inreg argument.
  %val = load <2 x half>, ptr addrspace(4) undef
  call amdgpu_gfx void @external_void_func_v2f16_inreg(<2 x half> inreg %val)
  ret void
}

; Calls external_void_func_v2i32_inreg with a single <2 x i32> argument marked
; inreg. The argument is loaded from an undef addrspace(4) pointer. In the
; expected output for every run line the incoming s4 and s5 are saved to lanes
; 0 and 1 of v40, the loaded value is placed in s[4:5], and both registers are
; restored after the call.
define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v2i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2i32_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The pointer operand is undef: the test only needs some non-constant
  ; <2 x i32> value to hand to the callee as an inreg argument.
  %val = load <2 x i32>, ptr addrspace(4) undef
  call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg %val)
  ret void
}

; Calls external_void_func_v2i32_inreg with the constant vector
; <i32 1, i32 2> as an inreg argument. In the expected output for every run
; line the incoming s4 and s5 are saved to lanes 0 and 1 of v40, the constants
; are written with s_mov_b32 (s4 = 1, s5 = 2), and both registers are restored
; after the call.
define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v2i32_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 4
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s30, 2
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 1
; GFX9-NEXT:    s_mov_b32 s5, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 3
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 3
; GFX9-NEXT:    v_readlane_b32 s30, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 4
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2i32_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 4
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 1
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 2
; GFX10-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 4
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2i32_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 4
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 1
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 2
; GFX11-NEXT:    v_writelane_b32 v40, s30, 2
; GFX11-NEXT:    v_writelane_b32 v40, s31, 3
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 3
; GFX11-NEXT:    v_readlane_b32 s30, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 4
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2i32_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 3
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Immediate variant: the inreg argument is a compile-time constant vector,
  ; so no load is involved.
  call amdgpu_gfx void @external_void_func_v2i32_inreg(<2 x i32> inreg <i32 1, i32 2>)
  ret void
}

; Calls external_void_func_v3i32_inreg with the constant vector
; <i32 3, i32 4, i32 5> as an inreg argument. The caller itself takes one
; unnamed i32 parameter that the body never references. In the expected output
; for every run line the incoming s4, s5 and s6 are saved to lanes 0-2 of v40,
; the constants are written with s_mov_b32 (s4 = 3, s5 = 4, s6 = 5), and all
; three registers are restored after the call.
define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_v3i32_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 5
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 3
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 3
; GFX9-NEXT:    s_mov_b32 s5, 4
; GFX9-NEXT:    s_mov_b32 s6, 5
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 4
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 4
; GFX9-NEXT:    v_readlane_b32 s30, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 5
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3i32_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 5
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 3
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 4
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    s_mov_b32 s6, 5
; GFX10-NEXT:    v_writelane_b32 v40, s30, 3
; GFX10-NEXT:    v_writelane_b32 v40, s31, 4
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 4
; GFX10-NEXT:    v_readlane_b32 s30, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 5
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i32_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 5
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i32_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 3
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 4
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    s_mov_b32 s6, 5
; GFX11-NEXT:    v_writelane_b32 v40, s30, 3
; GFX11-NEXT:    v_writelane_b32 v40, s31, 4
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 4
; GFX11-NEXT:    v_readlane_b32 s30, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 5
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 5
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 5
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 4
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 5
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Immediate variant: the inreg argument is a compile-time constant vector.
  ; The caller's own i32 parameter is not forwarded to the callee.
  call amdgpu_gfx void @external_void_func_v3i32_inreg(<3 x i32> inreg <i32 3, i32 4, i32 5>)
  ret void
}

define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_v3i32_i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 3
; GFX9-NEXT:    s_mov_b32 s5, 4
; GFX9-NEXT:    s_mov_b32 s6, 5
; GFX9-NEXT:    s_mov_b32 s7, 6
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3i32_i32_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 3
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 4
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    s_mov_b32 s6, 5
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    s_mov_b32 s7, 6
; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3i32_i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3i32_i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3i32_i32_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 3
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 4
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    s_mov_b32 s6, 5
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    s_mov_b32 s7, 6
; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3i32_i32_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3i32_i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3i32_i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 5
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 6
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v3i32_i32_inreg(<3 x i32> inreg <i32 3, i32 4, i32 5>, i32 inreg 6)
  ret void
}

; Loads a <4 x i32> from an undef addrspace(4) pointer and passes it as a single
; inreg argument to external_void_func_v4i32_inreg using the amdgpu_gfx calling
; convention. The expected code loads the vector into s[4:7] after saving the
; incoming s4-s7 to lanes 0-3 of v40; lanes 4-5 hold s30/s31 and lane 6 holds a
; copy of the previous s33 value, all of which are read back after the call.
; NOTE(review): the load address is undef, so the base register pair used by
; the expected scalar load is not meaningful; confirm whether a real pointer
; argument is preferred before regenerating these assertions.
define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v4i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4i32_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = load <4 x i32>, ptr addrspace(4) undef
  call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg %val)
  ret void
}

; Passes the constant vector <i32 1, i32 2, i32 3, i32 4> as a single inreg
; argument to external_void_func_v4i32_inreg using the amdgpu_gfx calling
; convention. The expected code materializes the four elements with s_mov_b32
; into s4-s7 after saving the incoming s4-s7 to lanes 0-3 of v40; lanes 4-5
; hold s30/s31 and lane 6 holds a copy of the previous s33 value, all of which
; are read back after the call.
define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v4i32_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 6
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    v_writelane_b32 v40, s30, 4
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 1
; GFX9-NEXT:    s_mov_b32 s5, 2
; GFX9-NEXT:    s_mov_b32 s6, 3
; GFX9-NEXT:    s_mov_b32 s7, 4
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 5
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 5
; GFX9-NEXT:    v_readlane_b32 s30, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 6
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4i32_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 6
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 1
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 2
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    s_mov_b32 s6, 3
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    s_mov_b32 s7, 4
; GFX10-NEXT:    v_writelane_b32 v40, s30, 4
; GFX10-NEXT:    v_writelane_b32 v40, s31, 5
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 5
; GFX10-NEXT:    v_readlane_b32 s30, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 6
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4i32_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 6
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 1
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 2
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    s_mov_b32 s6, 3
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    s_mov_b32 s7, 4
; GFX11-NEXT:    v_writelane_b32 v40, s30, 4
; GFX11-NEXT:    v_writelane_b32 v40, s31, 5
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 5
; GFX11-NEXT:    v_readlane_b32 s30, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 6
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4i32_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 6
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 5
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 6
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v4i32_inreg(<4 x i32> inreg <i32 1, i32 2, i32 3, i32 4>)
  ret void
}

; Passes the constant vector <i32 1, i32 2, i32 3, i32 4, i32 5> as a single
; inreg argument to external_void_func_v5i32_inreg using the amdgpu_gfx calling
; convention. The expected code materializes the five elements with s_mov_b32
; into s4-s8 after saving the incoming s4-s8 to lanes 0-4 of v40; lanes 5-6
; hold s30/s31 and lane 7 holds a copy of the previous s33 value, all of which
; are read back after the call.
define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v5i32_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 7
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
; GFX9-NEXT:    v_writelane_b32 v40, s30, 5
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 1
; GFX9-NEXT:    s_mov_b32 s5, 2
; GFX9-NEXT:    s_mov_b32 s6, 3
; GFX9-NEXT:    s_mov_b32 s7, 4
; GFX9-NEXT:    s_mov_b32 s8, 5
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 6
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 6
; GFX9-NEXT:    v_readlane_b32 s30, v40, 5
; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 7
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v5i32_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 7
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 1
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 2
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    s_mov_b32 s6, 3
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    s_mov_b32 s7, 4
; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-NEXT:    s_mov_b32 s8, 5
; GFX10-NEXT:    v_writelane_b32 v40, s30, 5
; GFX10-NEXT:    v_writelane_b32 v40, s31, 6
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 6
; GFX10-NEXT:    v_readlane_b32 s30, v40, 5
; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 7
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v5i32_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 7
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v5i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v5i32_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 1
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 2
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    s_mov_b32 s6, 3
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    s_mov_b32 s7, 4
; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
; GFX11-NEXT:    s_mov_b32 s8, 5
; GFX11-NEXT:    v_writelane_b32 v40, s30, 5
; GFX11-NEXT:    v_writelane_b32 v40, s31, 6
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 6
; GFX11-NEXT:    v_readlane_b32 s30, v40, 5
; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 7
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v5i32_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 7
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v5i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v5i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 5
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 5
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 6
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 6
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 7
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v5i32_inreg(<5 x i32> inreg <i32 1, i32 2, i32 3, i32 4, i32 5>)
  ret void
}

define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v8i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 10
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 8
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 10
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v8i32_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 10
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 10
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v8i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 10
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
; GFX11-NEXT:    v_writelane_b32 v40, s10, 6
; GFX11-NEXT:    v_writelane_b32 v40, s11, 7
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 10
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 10
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 10
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %ptr = load ptr addrspace(4), ptr addrspace(4) undef
  %val = load <8 x i32>, ptr addrspace(4) %ptr
  call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg %val)
  ret void
}

; Passes a constant <8 x i32> vector <1, 2, 3, 4, 5, 6, 7, 8> as an inreg
; argument to external_void_func_v8i32_inreg.
;
; In every RUN configuration the expected code:
;   * spills v40 and uses its lanes as save slots: the incoming s4-s11 go to
;     lanes 0-7, the return address s30/s31 to lanes 8-9, and the previous
;     value of s33 to lane 10;
;   * materializes the eight constants into s4-s11 with s_mov_b32 before the
;     s_swappc_b64 call;
;   * reads the lanes back in reverse order after the call and reloads v40.
;
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v8i32_imm_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 10
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
; GFX9-NEXT:    v_writelane_b32 v40, s30, 8
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s4, 1
; GFX9-NEXT:    s_mov_b32 s5, 2
; GFX9-NEXT:    s_mov_b32 s6, 3
; GFX9-NEXT:    s_mov_b32 s7, 4
; GFX9-NEXT:    s_mov_b32 s8, 5
; GFX9-NEXT:    s_mov_b32 s9, 6
; GFX9-NEXT:    s_mov_b32 s10, 7
; GFX9-NEXT:    s_mov_b32 s11, 8
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 9
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 9
; GFX9-NEXT:    v_readlane_b32 s30, v40, 8
; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 10
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v8i32_imm_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 10
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    s_mov_b32 s4, 1
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    s_mov_b32 s5, 2
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    s_mov_b32 s6, 3
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    s_mov_b32 s7, 4
; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-NEXT:    s_mov_b32 s8, 5
; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-NEXT:    s_mov_b32 s9, 6
; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
; GFX10-NEXT:    s_mov_b32 s10, 7
; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
; GFX10-NEXT:    s_mov_b32 s11, 8
; GFX10-NEXT:    v_writelane_b32 v40, s30, 8
; GFX10-NEXT:    v_writelane_b32 v40, s31, 9
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 9
; GFX10-NEXT:    v_readlane_b32 s30, v40, 8
; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 10
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v8i32_imm_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 10
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    s_mov_b32 s4, 1
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    s_mov_b32 s5, 2
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    s_mov_b32 s6, 3
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    s_mov_b32 s7, 4
; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
; GFX11-NEXT:    s_mov_b32 s8, 5
; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
; GFX11-NEXT:    s_mov_b32 s9, 6
; GFX11-NEXT:    v_writelane_b32 v40, s10, 6
; GFX11-NEXT:    s_mov_b32 s10, 7
; GFX11-NEXT:    v_writelane_b32 v40, s11, 7
; GFX11-NEXT:    s_mov_b32 s11, 8
; GFX11-NEXT:    v_writelane_b32 v40, s30, 8
; GFX11-NEXT:    v_writelane_b32 v40, s31, 9
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 9
; GFX11-NEXT:    v_readlane_b32 s30, v40, 8
; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 10
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8i32_imm_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 10
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s4, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    s_mov_b32 s7, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-SCRATCH-NEXT:    s_mov_b32 s8, 5
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-SCRATCH-NEXT:    s_mov_b32 s9, 6
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
; GFX10-SCRATCH-NEXT:    s_mov_b32 s10, 7
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
; GFX10-SCRATCH-NEXT:    s_mov_b32 s11, 8
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 8
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 9
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 9
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 8
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 10
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  call amdgpu_gfx void @external_void_func_v8i32_inreg(<8 x i32> inreg <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
  ret void
}

; Loads a <16 x i32> vector from memory and passes it as an inreg argument to
; external_void_func_v16i32_inreg.
;
; In every RUN configuration the expected code:
;   * spills v40 and uses its lanes as save slots: the incoming s4-s19 go to
;     lanes 0-15, the return address s30/s31 to lanes 16-17, and the previous
;     value of s33 to lane 18;
;   * loads the vector pointer with a 2-dword scalar load, waits on lgkmcnt,
;     then loads all 16 dwords straight into s4-s19 with one scalar load;
;   * reads the lanes back in reverse order after the call and reloads v40.
;
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v16i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 18
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
; GFX9-NEXT:    v_writelane_b32 v40, s12, 8
; GFX9-NEXT:    v_writelane_b32 v40, s13, 9
; GFX9-NEXT:    v_writelane_b32 v40, s14, 10
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s15, 11
; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s30, 16
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 17
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 17
; GFX9-NEXT:    v_readlane_b32 s30, v40, 16
; GFX9-NEXT:    v_readlane_b32 s19, v40, 15
; GFX9-NEXT:    v_readlane_b32 s18, v40, 14
; GFX9-NEXT:    v_readlane_b32 s17, v40, 13
; GFX9-NEXT:    v_readlane_b32 s16, v40, 12
; GFX9-NEXT:    v_readlane_b32 s15, v40, 11
; GFX9-NEXT:    v_readlane_b32 s14, v40, 10
; GFX9-NEXT:    v_readlane_b32 s13, v40, 9
; GFX9-NEXT:    v_readlane_b32 s12, v40, 8
; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 18
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v16i32_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 18
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
; GFX10-NEXT:    v_writelane_b32 v40, s12, 8
; GFX10-NEXT:    v_writelane_b32 v40, s13, 9
; GFX10-NEXT:    v_writelane_b32 v40, s14, 10
; GFX10-NEXT:    v_writelane_b32 v40, s15, 11
; GFX10-NEXT:    v_writelane_b32 v40, s16, 12
; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s30, 16
; GFX10-NEXT:    v_writelane_b32 v40, s31, 17
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 17
; GFX10-NEXT:    v_readlane_b32 s30, v40, 16
; GFX10-NEXT:    v_readlane_b32 s19, v40, 15
; GFX10-NEXT:    v_readlane_b32 s18, v40, 14
; GFX10-NEXT:    v_readlane_b32 s17, v40, 13
; GFX10-NEXT:    v_readlane_b32 s16, v40, 12
; GFX10-NEXT:    v_readlane_b32 s15, v40, 11
; GFX10-NEXT:    v_readlane_b32 s14, v40, 10
; GFX10-NEXT:    v_readlane_b32 s13, v40, 9
; GFX10-NEXT:    v_readlane_b32 s12, v40, 8
; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 18
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v16i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 18
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
; GFX11-NEXT:    v_writelane_b32 v40, s10, 6
; GFX11-NEXT:    v_writelane_b32 v40, s11, 7
; GFX11-NEXT:    v_writelane_b32 v40, s12, 8
; GFX11-NEXT:    v_writelane_b32 v40, s13, 9
; GFX11-NEXT:    v_writelane_b32 v40, s14, 10
; GFX11-NEXT:    v_writelane_b32 v40, s15, 11
; GFX11-NEXT:    v_writelane_b32 v40, s16, 12
; GFX11-NEXT:    v_writelane_b32 v40, s17, 13
; GFX11-NEXT:    v_writelane_b32 v40, s18, 14
; GFX11-NEXT:    v_writelane_b32 v40, s19, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v16i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v16i32_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 16
; GFX11-NEXT:    v_writelane_b32 v40, s31, 17
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 17
; GFX11-NEXT:    v_readlane_b32 s30, v40, 16
; GFX11-NEXT:    v_readlane_b32 s19, v40, 15
; GFX11-NEXT:    v_readlane_b32 s18, v40, 14
; GFX11-NEXT:    v_readlane_b32 s17, v40, 13
; GFX11-NEXT:    v_readlane_b32 s16, v40, 12
; GFX11-NEXT:    v_readlane_b32 s15, v40, 11
; GFX11-NEXT:    v_readlane_b32 s14, v40, 10
; GFX11-NEXT:    v_readlane_b32 s13, v40, 9
; GFX11-NEXT:    v_readlane_b32 s12, v40, 8
; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 18
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16i32_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 18
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s12, 8
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s13, 9
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s14, 10
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s15, 11
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s16, 12
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s17, 13
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s18, 14
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s19, 15
; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v16i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v16i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 17
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 17
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 16
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s19, v40, 15
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s18, v40, 14
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s17, v40, 13
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s16, v40, 12
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s15, v40, 11
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s14, v40, 10
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s13, v40, 9
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s12, v40, 8
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 18
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; The address operand is undef, so the location %ptr is read from is
  ; unspecified; presumably only the argument-passing sequence matters here.
  ; NOTE(review): replacing undef (e.g. with a function argument) would change
  ; the generated code and require regenerating every assertion above.
  %ptr = load ptr addrspace(4), ptr addrspace(4) undef
  %val = load <16 x i32>, ptr addrspace(4) %ptr
  call amdgpu_gfx void @external_void_func_v16i32_inreg(<16 x i32> inreg %val)
  ret void
}

define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX9-LABEL: test_call_external_void_func_v32i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 28
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
; GFX9-NEXT:    v_writelane_b32 v40, s12, 8
; GFX9-NEXT:    v_writelane_b32 v40, s13, 9
; GFX9-NEXT:    v_writelane_b32 v40, s14, 10
; GFX9-NEXT:    v_writelane_b32 v40, s15, 11
; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s20, 16
; GFX9-NEXT:    v_writelane_b32 v40, s21, 17
; GFX9-NEXT:    v_writelane_b32 v40, s22, 18
; GFX9-NEXT:    v_writelane_b32 v40, s23, 19
; GFX9-NEXT:    v_writelane_b32 v40, s24, 20
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s25, 21
; GFX9-NEXT:    v_writelane_b32 v40, s26, 22
; GFX9-NEXT:    v_writelane_b32 v40, s27, 23
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s28, 24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s46
; GFX9-NEXT:    v_writelane_b32 v40, s29, 25
; GFX9-NEXT:    v_mov_b32_e32 v1, s47
; GFX9-NEXT:    v_mov_b32_e32 v2, s48
; GFX9-NEXT:    v_mov_b32_e32 v3, s49
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
; GFX9-NEXT:    v_mov_b32_e32 v0, s50
; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT:    v_mov_b32_e32 v0, s51
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s20, s36
; GFX9-NEXT:    s_mov_b32 s21, s37
; GFX9-NEXT:    s_mov_b32 s22, s38
; GFX9-NEXT:    s_mov_b32 s23, s39
; GFX9-NEXT:    s_mov_b32 s24, s40
; GFX9-NEXT:    s_mov_b32 s25, s41
; GFX9-NEXT:    s_mov_b32 s26, s42
; GFX9-NEXT:    s_mov_b32 s27, s43
; GFX9-NEXT:    s_mov_b32 s28, s44
; GFX9-NEXT:    s_mov_b32 s29, s45
; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
; GFX9-NEXT:    v_readlane_b32 s28, v40, 24
; GFX9-NEXT:    v_readlane_b32 s27, v40, 23
; GFX9-NEXT:    v_readlane_b32 s26, v40, 22
; GFX9-NEXT:    v_readlane_b32 s25, v40, 21
; GFX9-NEXT:    v_readlane_b32 s24, v40, 20
; GFX9-NEXT:    v_readlane_b32 s23, v40, 19
; GFX9-NEXT:    v_readlane_b32 s22, v40, 18
; GFX9-NEXT:    v_readlane_b32 s21, v40, 17
; GFX9-NEXT:    v_readlane_b32 s20, v40, 16
; GFX9-NEXT:    v_readlane_b32 s19, v40, 15
; GFX9-NEXT:    v_readlane_b32 s18, v40, 14
; GFX9-NEXT:    v_readlane_b32 s17, v40, 13
; GFX9-NEXT:    v_readlane_b32 s16, v40, 12
; GFX9-NEXT:    v_readlane_b32 s15, v40, 11
; GFX9-NEXT:    v_readlane_b32 s14, v40, 10
; GFX9-NEXT:    v_readlane_b32 s13, v40, 9
; GFX9-NEXT:    v_readlane_b32 s12, v40, 8
; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 28
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v32i32_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 28
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
; GFX10-NEXT:    v_writelane_b32 v40, s12, 8
; GFX10-NEXT:    v_writelane_b32 v40, s13, 9
; GFX10-NEXT:    v_writelane_b32 v40, s14, 10
; GFX10-NEXT:    v_writelane_b32 v40, s15, 11
; GFX10-NEXT:    v_writelane_b32 v40, s16, 12
; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s20, 16
; GFX10-NEXT:    v_writelane_b32 v40, s21, 17
; GFX10-NEXT:    v_writelane_b32 v40, s22, 18
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s46
; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
; GFX10-NEXT:    v_mov_b32_e32 v1, s47
; GFX10-NEXT:    v_mov_b32_e32 v2, s48
; GFX10-NEXT:    v_mov_b32_e32 v3, s49
; GFX10-NEXT:    s_mov_b32 s20, s36
; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
; GFX10-NEXT:    s_mov_b32 s21, s37
; GFX10-NEXT:    s_mov_b32 s22, s38
; GFX10-NEXT:    s_mov_b32 s23, s39
; GFX10-NEXT:    s_mov_b32 s24, s40
; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
; GFX10-NEXT:    s_mov_b32 s25, s41
; GFX10-NEXT:    v_mov_b32_e32 v4, s50
; GFX10-NEXT:    v_mov_b32_e32 v5, s51
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
; GFX10-NEXT:    s_mov_b32 s26, s42
; GFX10-NEXT:    v_writelane_b32 v40, s27, 23
; GFX10-NEXT:    s_mov_b32 s27, s43
; GFX10-NEXT:    v_writelane_b32 v40, s28, 24
; GFX10-NEXT:    s_mov_b32 s28, s44
; GFX10-NEXT:    v_writelane_b32 v40, s29, 25
; GFX10-NEXT:    s_mov_b32 s29, s45
; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
; GFX10-NEXT:    v_readlane_b32 s29, v40, 25
; GFX10-NEXT:    v_readlane_b32 s28, v40, 24
; GFX10-NEXT:    v_readlane_b32 s27, v40, 23
; GFX10-NEXT:    v_readlane_b32 s26, v40, 22
; GFX10-NEXT:    v_readlane_b32 s25, v40, 21
; GFX10-NEXT:    v_readlane_b32 s24, v40, 20
; GFX10-NEXT:    v_readlane_b32 s23, v40, 19
; GFX10-NEXT:    v_readlane_b32 s22, v40, 18
; GFX10-NEXT:    v_readlane_b32 s21, v40, 17
; GFX10-NEXT:    v_readlane_b32 s20, v40, 16
; GFX10-NEXT:    v_readlane_b32 s19, v40, 15
; GFX10-NEXT:    v_readlane_b32 s18, v40, 14
; GFX10-NEXT:    v_readlane_b32 s17, v40, 13
; GFX10-NEXT:    v_readlane_b32 s16, v40, 12
; GFX10-NEXT:    v_readlane_b32 s15, v40, 11
; GFX10-NEXT:    v_readlane_b32 s14, v40, 10
; GFX10-NEXT:    v_readlane_b32 s13, v40, 9
; GFX10-NEXT:    v_readlane_b32 s12, v40, 8
; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 28
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v32i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 28
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_add_i32 s2, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
; GFX11-NEXT:    v_writelane_b32 v40, s10, 6
; GFX11-NEXT:    v_writelane_b32 v40, s11, 7
; GFX11-NEXT:    v_writelane_b32 v40, s12, 8
; GFX11-NEXT:    v_writelane_b32 v40, s13, 9
; GFX11-NEXT:    v_writelane_b32 v40, s14, 10
; GFX11-NEXT:    v_writelane_b32 v40, s15, 11
; GFX11-NEXT:    v_writelane_b32 v40, s16, 12
; GFX11-NEXT:    v_writelane_b32 v40, s17, 13
; GFX11-NEXT:    v_writelane_b32 v40, s18, 14
; GFX11-NEXT:    v_writelane_b32 v40, s19, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x1
; GFX11-NEXT:    s_load_b512 s[36:51], s[0:1], 0x40
; GFX11-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v32i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v32i32_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s20, 16
; GFX11-NEXT:    v_writelane_b32 v40, s21, 17
; GFX11-NEXT:    v_writelane_b32 v40, s22, 18
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v5, s51
; GFX11-NEXT:    v_writelane_b32 v40, s23, 19
; GFX11-NEXT:    v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v1, s47
; GFX11-NEXT:    v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49
; GFX11-NEXT:    v_writelane_b32 v40, s24, 20
; GFX11-NEXT:    s_mov_b32 s20, s36
; GFX11-NEXT:    s_mov_b32 s21, s37
; GFX11-NEXT:    s_mov_b32 s22, s38
; GFX11-NEXT:    s_mov_b32 s23, s39
; GFX11-NEXT:    v_writelane_b32 v40, s25, 21
; GFX11-NEXT:    s_mov_b32 s24, s40
; GFX11-NEXT:    s_mov_b32 s25, s41
; GFX11-NEXT:    scratch_store_b64 off, v[4:5], s2
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    v_writelane_b32 v40, s26, 22
; GFX11-NEXT:    s_mov_b32 s26, s42
; GFX11-NEXT:    v_writelane_b32 v40, s27, 23
; GFX11-NEXT:    s_mov_b32 s27, s43
; GFX11-NEXT:    v_writelane_b32 v40, s28, 24
; GFX11-NEXT:    s_mov_b32 s28, s44
; GFX11-NEXT:    v_writelane_b32 v40, s29, 25
; GFX11-NEXT:    s_mov_b32 s29, s45
; GFX11-NEXT:    v_writelane_b32 v40, s30, 26
; GFX11-NEXT:    v_writelane_b32 v40, s31, 27
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
; GFX11-NEXT:    v_readlane_b32 s30, v40, 26
; GFX11-NEXT:    v_readlane_b32 s29, v40, 25
; GFX11-NEXT:    v_readlane_b32 s28, v40, 24
; GFX11-NEXT:    v_readlane_b32 s27, v40, 23
; GFX11-NEXT:    v_readlane_b32 s26, v40, 22
; GFX11-NEXT:    v_readlane_b32 s25, v40, 21
; GFX11-NEXT:    v_readlane_b32 s24, v40, 20
; GFX11-NEXT:    v_readlane_b32 s23, v40, 19
; GFX11-NEXT:    v_readlane_b32 s22, v40, 18
; GFX11-NEXT:    v_readlane_b32 s21, v40, 17
; GFX11-NEXT:    v_readlane_b32 s20, v40, 16
; GFX11-NEXT:    v_readlane_b32 s19, v40, 15
; GFX11-NEXT:    v_readlane_b32 s18, v40, 14
; GFX11-NEXT:    v_readlane_b32 s17, v40, 13
; GFX11-NEXT:    v_readlane_b32 s16, v40, 12
; GFX11-NEXT:    v_readlane_b32 s15, v40, 11
; GFX11-NEXT:    v_readlane_b32 s14, v40, 10
; GFX11-NEXT:    v_readlane_b32 s13, v40, 9
; GFX11-NEXT:    v_readlane_b32 s12, v40, 8
; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 28
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 28
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s12, 8
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s13, 9
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s14, 10
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s15, 11
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s16, 12
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s17, 13
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s18, 14
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s19, 15
; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_clause 0x1
; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x40
; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v32i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v32i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s20, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s21, 17
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s22, 18
; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s50
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s23, 19
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s51
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s46
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s47
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, s48
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s24, 20
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, s49
; GFX10-SCRATCH-NEXT:    s_mov_b32 s20, s36
; GFX10-SCRATCH-NEXT:    s_mov_b32 s21, s37
; GFX10-SCRATCH-NEXT:    s_mov_b32 s22, s38
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s25, 21
; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, s39
; GFX10-SCRATCH-NEXT:    s_mov_b32 s24, s40
; GFX10-SCRATCH-NEXT:    s_mov_b32 s25, s41
; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s2
; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s26, 22
; GFX10-SCRATCH-NEXT:    s_mov_b32 s26, s42
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s27, 23
; GFX10-SCRATCH-NEXT:    s_mov_b32 s27, s43
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s28, 24
; GFX10-SCRATCH-NEXT:    s_mov_b32 s28, s44
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s29, 25
; GFX10-SCRATCH-NEXT:    s_mov_b32 s29, s45
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s28, v40, 24
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s27, v40, 23
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s26, v40, 22
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s25, v40, 21
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s24, v40, 20
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s23, v40, 19
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s22, v40, 18
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s21, v40, 17
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s20, v40, 16
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s19, v40, 15
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s18, v40, 14
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s17, v40, 13
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s16, v40, 12
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s15, v40, 11
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s14, v40, 10
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s13, v40, 9
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s12, v40, 8
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 28
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %ptr = load ptr addrspace(4), ptr addrspace(4) undef
  %val = load <32 x i32>, ptr addrspace(4) %ptr
  call amdgpu_gfx void @external_void_func_v32i32_inreg(<32 x i32> inreg %val)
  ret void
}

; Calls an amdgpu_gfx function with a <32 x i32> vector followed by an i32,
; both marked inreg. The incoming i32 parameter of this function is unnamed
; and unused; the vector is loaded through a pointer that is itself loaded
; from an undef addrspace(4) pointer, and the i32 is loaded from an undef
; addrspace(4) pointer.
; In the checked output below, vector elements 0-25 are passed in s4-s29
; (s[4:19] loaded directly, s20-s29 copied from s36-s45), while elements
; 26-31 (s46-s51) and the extra i32 are written to the outgoing stack area
; at byte offsets 0-20 and 24 from s32.
; The assertions that follow are autogenerated (see the note on the first
; line of the file); regenerate them rather than editing them by hand.
define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX9-LABEL: test_call_external_void_func_v32i32_i32_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 28
; GFX9-NEXT:    v_writelane_b32 v40, s4, 0
; GFX9-NEXT:    v_writelane_b32 v40, s5, 1
; GFX9-NEXT:    v_writelane_b32 v40, s6, 2
; GFX9-NEXT:    v_writelane_b32 v40, s7, 3
; GFX9-NEXT:    v_writelane_b32 v40, s8, 4
; GFX9-NEXT:    v_writelane_b32 v40, s9, 5
; GFX9-NEXT:    v_writelane_b32 v40, s10, 6
; GFX9-NEXT:    v_writelane_b32 v40, s11, 7
; GFX9-NEXT:    v_writelane_b32 v40, s12, 8
; GFX9-NEXT:    v_writelane_b32 v40, s13, 9
; GFX9-NEXT:    v_writelane_b32 v40, s14, 10
; GFX9-NEXT:    v_writelane_b32 v40, s15, 11
; GFX9-NEXT:    v_writelane_b32 v40, s16, 12
; GFX9-NEXT:    v_writelane_b32 v40, s17, 13
; GFX9-NEXT:    v_writelane_b32 v40, s18, 14
; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s19, 15
; GFX9-NEXT:    v_writelane_b32 v40, s20, 16
; GFX9-NEXT:    v_writelane_b32 v40, s21, 17
; GFX9-NEXT:    v_writelane_b32 v40, s22, 18
; GFX9-NEXT:    v_writelane_b32 v40, s23, 19
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_load_dword s52, s[34:35], 0x0
; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
; GFX9-NEXT:    ; kill: killed $sgpr34_sgpr35
; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
; GFX9-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX9-NEXT:    v_writelane_b32 v40, s24, 20
; GFX9-NEXT:    v_writelane_b32 v40, s25, 21
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s26, 22
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s52
; GFX9-NEXT:    v_writelane_b32 v40, s27, 23
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX9-NEXT:    v_mov_b32_e32 v0, s46
; GFX9-NEXT:    v_writelane_b32 v40, s28, 24
; GFX9-NEXT:    v_mov_b32_e32 v1, s47
; GFX9-NEXT:    v_mov_b32_e32 v2, s48
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, s49
; GFX9-NEXT:    v_writelane_b32 v40, s29, 25
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT:    v_mov_b32_e32 v0, s50
; GFX9-NEXT:    v_writelane_b32 v40, s30, 26
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT:    v_mov_b32_e32 v0, s51
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo
; GFX9-NEXT:    s_mov_b32 s20, s36
; GFX9-NEXT:    s_mov_b32 s21, s37
; GFX9-NEXT:    s_mov_b32 s22, s38
; GFX9-NEXT:    s_mov_b32 s23, s39
; GFX9-NEXT:    s_mov_b32 s24, s40
; GFX9-NEXT:    s_mov_b32 s25, s41
; GFX9-NEXT:    s_mov_b32 s26, s42
; GFX9-NEXT:    s_mov_b32 s27, s43
; GFX9-NEXT:    s_mov_b32 s28, s44
; GFX9-NEXT:    s_mov_b32 s29, s45
; GFX9-NEXT:    v_writelane_b32 v40, s31, 27
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 27
; GFX9-NEXT:    v_readlane_b32 s30, v40, 26
; GFX9-NEXT:    v_readlane_b32 s29, v40, 25
; GFX9-NEXT:    v_readlane_b32 s28, v40, 24
; GFX9-NEXT:    v_readlane_b32 s27, v40, 23
; GFX9-NEXT:    v_readlane_b32 s26, v40, 22
; GFX9-NEXT:    v_readlane_b32 s25, v40, 21
; GFX9-NEXT:    v_readlane_b32 s24, v40, 20
; GFX9-NEXT:    v_readlane_b32 s23, v40, 19
; GFX9-NEXT:    v_readlane_b32 s22, v40, 18
; GFX9-NEXT:    v_readlane_b32 s21, v40, 17
; GFX9-NEXT:    v_readlane_b32 s20, v40, 16
; GFX9-NEXT:    v_readlane_b32 s19, v40, 15
; GFX9-NEXT:    v_readlane_b32 s18, v40, 14
; GFX9-NEXT:    v_readlane_b32 s17, v40, 13
; GFX9-NEXT:    v_readlane_b32 s16, v40, 12
; GFX9-NEXT:    v_readlane_b32 s15, v40, 11
; GFX9-NEXT:    v_readlane_b32 s14, v40, 10
; GFX9-NEXT:    v_readlane_b32 s13, v40, 9
; GFX9-NEXT:    v_readlane_b32 s12, v40, 8
; GFX9-NEXT:    v_readlane_b32 s11, v40, 7
; GFX9-NEXT:    v_readlane_b32 s10, v40, 6
; GFX9-NEXT:    v_readlane_b32 s9, v40, 5
; GFX9-NEXT:    v_readlane_b32 s8, v40, 4
; GFX9-NEXT:    v_readlane_b32 s7, v40, 3
; GFX9-NEXT:    v_readlane_b32 s6, v40, 2
; GFX9-NEXT:    v_readlane_b32 s5, v40, 1
; GFX9-NEXT:    v_readlane_b32 s4, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 28
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v32i32_i32_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 28
; GFX10-NEXT:    s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-NEXT:    v_writelane_b32 v40, s10, 6
; GFX10-NEXT:    v_writelane_b32 v40, s11, 7
; GFX10-NEXT:    v_writelane_b32 v40, s12, 8
; GFX10-NEXT:    v_writelane_b32 v40, s13, 9
; GFX10-NEXT:    v_writelane_b32 v40, s14, 10
; GFX10-NEXT:    v_writelane_b32 v40, s15, 11
; GFX10-NEXT:    v_writelane_b32 v40, s16, 12
; GFX10-NEXT:    v_writelane_b32 v40, s17, 13
; GFX10-NEXT:    v_writelane_b32 v40, s18, 14
; GFX10-NEXT:    v_writelane_b32 v40, s19, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dword s52, s[34:35], 0x0
; GFX10-NEXT:    ; meta instruction
; GFX10-NEXT:    ; meta instruction
; GFX10-NEXT:    s_load_dwordx16 s[36:51], s[34:35], 0x40
; GFX10-NEXT:    s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s20, 16
; GFX10-NEXT:    v_writelane_b32 v40, s21, 17
; GFX10-NEXT:    v_writelane_b32 v40, s22, 18
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, s52
; GFX10-NEXT:    v_mov_b32_e32 v1, s47
; GFX10-NEXT:    v_writelane_b32 v40, s23, 19
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX10-NEXT:    v_mov_b32_e32 v0, s46
; GFX10-NEXT:    v_mov_b32_e32 v2, s48
; GFX10-NEXT:    v_mov_b32_e32 v3, s49
; GFX10-NEXT:    v_writelane_b32 v40, s24, 20
; GFX10-NEXT:    s_mov_b32 s20, s36
; GFX10-NEXT:    s_mov_b32 s21, s37
; GFX10-NEXT:    s_mov_b32 s22, s38
; GFX10-NEXT:    s_mov_b32 s23, s39
; GFX10-NEXT:    v_writelane_b32 v40, s25, 21
; GFX10-NEXT:    s_mov_b32 s24, s40
; GFX10-NEXT:    s_mov_b32 s25, s41
; GFX10-NEXT:    v_mov_b32_e32 v4, s50
; GFX10-NEXT:    v_mov_b32_e32 v5, s51
; GFX10-NEXT:    v_writelane_b32 v40, s26, 22
; GFX10-NEXT:    s_mov_b32 s26, s42
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
; GFX10-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
; GFX10-NEXT:    v_writelane_b32 v40, s27, 23
; GFX10-NEXT:    s_mov_b32 s27, s43
; GFX10-NEXT:    v_writelane_b32 v40, s28, 24
; GFX10-NEXT:    s_mov_b32 s28, s44
; GFX10-NEXT:    v_writelane_b32 v40, s29, 25
; GFX10-NEXT:    s_mov_b32 s29, s45
; GFX10-NEXT:    v_writelane_b32 v40, s30, 26
; GFX10-NEXT:    v_writelane_b32 v40, s31, 27
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 27
; GFX10-NEXT:    v_readlane_b32 s30, v40, 26
; GFX10-NEXT:    v_readlane_b32 s29, v40, 25
; GFX10-NEXT:    v_readlane_b32 s28, v40, 24
; GFX10-NEXT:    v_readlane_b32 s27, v40, 23
; GFX10-NEXT:    v_readlane_b32 s26, v40, 22
; GFX10-NEXT:    v_readlane_b32 s25, v40, 21
; GFX10-NEXT:    v_readlane_b32 s24, v40, 20
; GFX10-NEXT:    v_readlane_b32 s23, v40, 19
; GFX10-NEXT:    v_readlane_b32 s22, v40, 18
; GFX10-NEXT:    v_readlane_b32 s21, v40, 17
; GFX10-NEXT:    v_readlane_b32 s20, v40, 16
; GFX10-NEXT:    v_readlane_b32 s19, v40, 15
; GFX10-NEXT:    v_readlane_b32 s18, v40, 14
; GFX10-NEXT:    v_readlane_b32 s17, v40, 13
; GFX10-NEXT:    v_readlane_b32 s16, v40, 12
; GFX10-NEXT:    v_readlane_b32 s15, v40, 11
; GFX10-NEXT:    v_readlane_b32 s14, v40, 10
; GFX10-NEXT:    v_readlane_b32 s13, v40, 9
; GFX10-NEXT:    v_readlane_b32 s12, v40, 8
; GFX10-NEXT:    v_readlane_b32 s11, v40, 7
; GFX10-NEXT:    v_readlane_b32 s10, v40, 6
; GFX10-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 28
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v32i32_i32_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 28
; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT:    s_add_i32 s3, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s4, 0
; GFX11-NEXT:    v_writelane_b32 v40, s5, 1
; GFX11-NEXT:    v_writelane_b32 v40, s6, 2
; GFX11-NEXT:    v_writelane_b32 v40, s7, 3
; GFX11-NEXT:    v_writelane_b32 v40, s8, 4
; GFX11-NEXT:    v_writelane_b32 v40, s9, 5
; GFX11-NEXT:    v_writelane_b32 v40, s10, 6
; GFX11-NEXT:    v_writelane_b32 v40, s11, 7
; GFX11-NEXT:    v_writelane_b32 v40, s12, 8
; GFX11-NEXT:    v_writelane_b32 v40, s13, 9
; GFX11-NEXT:    v_writelane_b32 v40, s14, 10
; GFX11-NEXT:    v_writelane_b32 v40, s15, 11
; GFX11-NEXT:    v_writelane_b32 v40, s16, 12
; GFX11-NEXT:    v_writelane_b32 v40, s17, 13
; GFX11-NEXT:    v_writelane_b32 v40, s18, 14
; GFX11-NEXT:    v_writelane_b32 v40, s19, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    s_clause 0x2
; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x0
; GFX11-NEXT:    s_load_b512 s[36:51], s[0:1], 0x40
; GFX11-NEXT:    s_load_b512 s[4:19], s[0:1], 0x0
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v32i32_i32_inreg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v32i32_i32_inreg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s20, 16
; GFX11-NEXT:    v_writelane_b32 v40, s21, 17
; GFX11-NEXT:    v_writelane_b32 v40, s22, 18
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v5, s51
; GFX11-NEXT:    v_writelane_b32 v40, s23, 19
; GFX11-NEXT:    v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v1, s47
; GFX11-NEXT:    v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v3, s49
; GFX11-NEXT:    v_writelane_b32 v40, s24, 20
; GFX11-NEXT:    v_mov_b32_e32 v2, s48
; GFX11-NEXT:    s_add_i32 s2, s32, 24
; GFX11-NEXT:    s_mov_b32 s20, s36
; GFX11-NEXT:    s_mov_b32 s21, s37
; GFX11-NEXT:    v_writelane_b32 v40, s25, 21
; GFX11-NEXT:    s_mov_b32 s22, s38
; GFX11-NEXT:    s_mov_b32 s23, s39
; GFX11-NEXT:    s_mov_b32 s24, s40
; GFX11-NEXT:    s_mov_b32 s25, s41
; GFX11-NEXT:    v_writelane_b32 v40, s26, 22
; GFX11-NEXT:    s_mov_b32 s26, s42
; GFX11-NEXT:    scratch_store_b32 off, v6, s2
; GFX11-NEXT:    scratch_store_b64 off, v[4:5], s3
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    v_writelane_b32 v40, s27, 23
; GFX11-NEXT:    s_mov_b32 s27, s43
; GFX11-NEXT:    v_writelane_b32 v40, s28, 24
; GFX11-NEXT:    s_mov_b32 s28, s44
; GFX11-NEXT:    v_writelane_b32 v40, s29, 25
; GFX11-NEXT:    s_mov_b32 s29, s45
; GFX11-NEXT:    v_writelane_b32 v40, s30, 26
; GFX11-NEXT:    v_writelane_b32 v40, s31, 27
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 27
; GFX11-NEXT:    v_readlane_b32 s30, v40, 26
; GFX11-NEXT:    v_readlane_b32 s29, v40, 25
; GFX11-NEXT:    v_readlane_b32 s28, v40, 24
; GFX11-NEXT:    v_readlane_b32 s27, v40, 23
; GFX11-NEXT:    v_readlane_b32 s26, v40, 22
; GFX11-NEXT:    v_readlane_b32 s25, v40, 21
; GFX11-NEXT:    v_readlane_b32 s24, v40, 20
; GFX11-NEXT:    v_readlane_b32 s23, v40, 19
; GFX11-NEXT:    v_readlane_b32 s22, v40, 18
; GFX11-NEXT:    v_readlane_b32 s21, v40, 17
; GFX11-NEXT:    v_readlane_b32 s20, v40, 16
; GFX11-NEXT:    v_readlane_b32 s19, v40, 15
; GFX11-NEXT:    v_readlane_b32 s18, v40, 14
; GFX11-NEXT:    v_readlane_b32 s17, v40, 13
; GFX11-NEXT:    v_readlane_b32 s16, v40, 12
; GFX11-NEXT:    v_readlane_b32 s15, v40, 11
; GFX11-NEXT:    v_readlane_b32 s14, v40, 10
; GFX11-NEXT:    v_readlane_b32 s13, v40, 9
; GFX11-NEXT:    v_readlane_b32 s12, v40, 8
; GFX11-NEXT:    v_readlane_b32 s11, v40, 7
; GFX11-NEXT:    v_readlane_b32 s10, v40, 6
; GFX11-NEXT:    v_readlane_b32 s9, v40, 5
; GFX11-NEXT:    v_readlane_b32 s8, v40, 4
; GFX11-NEXT:    v_readlane_b32 s7, v40, 3
; GFX11-NEXT:    v_readlane_b32 s6, v40, 2
; GFX11-NEXT:    v_readlane_b32 s5, v40, 1
; GFX11-NEXT:    v_readlane_b32 s4, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 28
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v32i32_i32_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 28
; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    s_add_i32 s3, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s7, 3
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s8, 4
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s9, 5
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s10, 6
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s11, 7
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s12, 8
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s13, 9
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s14, 10
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s15, 11
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s16, 12
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s17, 13
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s18, 14
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s19, 15
; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_clause 0x2
; GFX10-SCRATCH-NEXT:    s_load_dword s2, s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    ; meta instruction
; GFX10-SCRATCH-NEXT:    ; meta instruction
; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x40
; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v32i32_i32_inreg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v32i32_i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s20, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s21, 17
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s22, 18
; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, s2
; GFX10-SCRATCH-NEXT:    s_add_i32 s2, s32, 24
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s50
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s23, 19
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s51
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s46
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s47
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, s48
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s24, 20
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, s49
; GFX10-SCRATCH-NEXT:    s_mov_b32 s20, s36
; GFX10-SCRATCH-NEXT:    s_mov_b32 s21, s37
; GFX10-SCRATCH-NEXT:    s_mov_b32 s22, s38
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s25, 21
; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, s39
; GFX10-SCRATCH-NEXT:    s_mov_b32 s24, s40
; GFX10-SCRATCH-NEXT:    s_mov_b32 s25, s41
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v6, s2
; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s3
; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s26, 22
; GFX10-SCRATCH-NEXT:    s_mov_b32 s26, s42
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s27, 23
; GFX10-SCRATCH-NEXT:    s_mov_b32 s27, s43
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s28, 24
; GFX10-SCRATCH-NEXT:    s_mov_b32 s28, s44
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s29, 25
; GFX10-SCRATCH-NEXT:    s_mov_b32 s29, s45
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 26
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 27
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 27
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 26
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s29, v40, 25
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s28, v40, 24
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s27, v40, 23
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s26, v40, 22
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s25, v40, 21
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s24, v40, 20
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s23, v40, 19
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s22, v40, 18
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s21, v40, 17
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s20, v40, 16
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s19, v40, 15
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s18, v40, 14
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s17, v40, 13
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s16, v40, 12
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s15, v40, 11
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s14, v40, 10
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s13, v40, 9
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s12, v40, 8
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s11, v40, 7
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s10, v40, 6
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s9, v40, 5
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s8, v40, 4
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 28
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %ptr0 = load ptr addrspace(4), ptr addrspace(4) undef
  %val0 = load <32 x i32>, ptr addrspace(4) %ptr0
  %val1 = load i32, ptr addrspace(4) undef
  call amdgpu_gfx void @external_void_func_v32i32_i32_inreg(<32 x i32> inreg %val0, i32 inreg %val1)
  ret void
}

; Forwards a <32 x i32> and a double, unchanged, to @stack_passed_f64_arg.
; In the expected output below, the incoming double is read from offsets 0 and 4
; relative to s33 into v32/v33 and written to offsets 0 and 4 relative to s32
; before the call. The buffer-based configurations use two dword accesses; the
; scratch-based configurations use a single 64-bit access.
; NOTE(review): presumably the <32 x i32> occupies v0-v31, which is why the
; double travels through memory - confirm against the amdgpu_gfx convention.
define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s33
; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:4
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, stack_passed_f64_arg@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, stack_passed_f64_arg@abs32@lo
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: stack_passed_arg_alignment_v32i32_f64:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    buffer_load_dword v32, off, s[0:3], s33
; GFX10-NEXT:    buffer_load_dword v33, off, s[0:3], s33 offset:4
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    s_mov_b32 s35, stack_passed_f64_arg@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, stack_passed_f64_arg@abs32@lo
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    buffer_store_dword v32, off, s[0:3], s32
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    scratch_load_b64 v[32:33], off, s33
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    s_mov_b32 s1, stack_passed_f64_arg@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, stack_passed_f64_arg@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    scratch_store_b64 off, v[32:33], s32
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: stack_passed_arg_alignment_v32i32_f64:
; GFX10-SCRATCH:       ; %bb.0: ; %entry
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    scratch_load_dwordx2 v[32:33], off, s33
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, stack_passed_f64_arg@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, stack_passed_f64_arg@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; Pass both incoming arguments straight through to the callee.
  call amdgpu_gfx void @stack_passed_f64_arg(<32 x i32> %val, double %tmp)
  ret void
}

; Calls @external_void_func_12xv3i32 with twelve constant <3 x i32> arguments
; (36 i32 elements in total). In the expected output below, the first 32
; elements are materialized in v0-v31 (splats of 0 through 9, then 10 and 11),
; and the remaining four elements (12, 13, 14, 15) are stored at offsets
; 0, 4, 8 and 12 relative to s32 before the call. The scratch-based
; configurations do this with a single 128-bit store.
define amdgpu_gfx void @stack_12xv3i32() #0 {
; GFX9-LABEL: stack_12xv3i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_mov_b32_e32 v0, 12
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 14
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_12xv3i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_12xv3i32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 1
; GFX9-NEXT:    v_mov_b32_e32 v4, 1
; GFX9-NEXT:    v_mov_b32_e32 v5, 1
; GFX9-NEXT:    v_mov_b32_e32 v6, 2
; GFX9-NEXT:    v_mov_b32_e32 v7, 2
; GFX9-NEXT:    v_mov_b32_e32 v8, 2
; GFX9-NEXT:    v_mov_b32_e32 v9, 3
; GFX9-NEXT:    v_mov_b32_e32 v10, 3
; GFX9-NEXT:    v_mov_b32_e32 v11, 3
; GFX9-NEXT:    v_mov_b32_e32 v12, 4
; GFX9-NEXT:    v_mov_b32_e32 v13, 4
; GFX9-NEXT:    v_mov_b32_e32 v14, 4
; GFX9-NEXT:    v_mov_b32_e32 v15, 5
; GFX9-NEXT:    v_mov_b32_e32 v16, 5
; GFX9-NEXT:    v_mov_b32_e32 v17, 5
; GFX9-NEXT:    v_mov_b32_e32 v18, 6
; GFX9-NEXT:    v_mov_b32_e32 v19, 6
; GFX9-NEXT:    v_mov_b32_e32 v20, 6
; GFX9-NEXT:    v_mov_b32_e32 v21, 7
; GFX9-NEXT:    v_mov_b32_e32 v22, 7
; GFX9-NEXT:    v_mov_b32_e32 v23, 7
; GFX9-NEXT:    v_mov_b32_e32 v24, 8
; GFX9-NEXT:    v_mov_b32_e32 v25, 8
; GFX9-NEXT:    v_mov_b32_e32 v26, 8
; GFX9-NEXT:    v_mov_b32_e32 v27, 9
; GFX9-NEXT:    v_mov_b32_e32 v28, 9
; GFX9-NEXT:    v_mov_b32_e32 v29, 9
; GFX9-NEXT:    v_mov_b32_e32 v30, 10
; GFX9-NEXT:    v_mov_b32_e32 v31, 11
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: stack_12xv3i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    v_mov_b32_e32 v0, 12
; GFX10-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-NEXT:    v_mov_b32_e32 v2, 14
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 1
; GFX10-NEXT:    v_mov_b32_e32 v4, 1
; GFX10-NEXT:    v_mov_b32_e32 v5, 1
; GFX10-NEXT:    v_mov_b32_e32 v6, 2
; GFX10-NEXT:    v_mov_b32_e32 v7, 2
; GFX10-NEXT:    v_mov_b32_e32 v8, 2
; GFX10-NEXT:    v_mov_b32_e32 v9, 3
; GFX10-NEXT:    v_mov_b32_e32 v10, 3
; GFX10-NEXT:    v_mov_b32_e32 v11, 3
; GFX10-NEXT:    v_mov_b32_e32 v12, 4
; GFX10-NEXT:    v_mov_b32_e32 v13, 4
; GFX10-NEXT:    v_mov_b32_e32 v14, 4
; GFX10-NEXT:    v_mov_b32_e32 v15, 5
; GFX10-NEXT:    v_mov_b32_e32 v16, 5
; GFX10-NEXT:    v_mov_b32_e32 v17, 5
; GFX10-NEXT:    v_mov_b32_e32 v18, 6
; GFX10-NEXT:    v_mov_b32_e32 v19, 6
; GFX10-NEXT:    v_mov_b32_e32 v20, 6
; GFX10-NEXT:    v_mov_b32_e32 v21, 7
; GFX10-NEXT:    v_mov_b32_e32 v22, 7
; GFX10-NEXT:    v_mov_b32_e32 v23, 7
; GFX10-NEXT:    v_mov_b32_e32 v24, 8
; GFX10-NEXT:    v_mov_b32_e32 v25, 8
; GFX10-NEXT:    v_mov_b32_e32 v26, 8
; GFX10-NEXT:    v_mov_b32_e32 v27, 9
; GFX10-NEXT:    v_mov_b32_e32 v28, 9
; GFX10-NEXT:    v_mov_b32_e32 v29, 9
; GFX10-NEXT:    v_mov_b32_e32 v30, 10
; GFX10-NEXT:    v_mov_b32_e32 v31, 11
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_12xv3i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_12xv3i32@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: stack_12xv3i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13
; GFX11-NEXT:    v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 1
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1
; GFX11-NEXT:    v_dual_mov_b32 v6, 2 :: v_dual_mov_b32 v7, 2
; GFX11-NEXT:    v_dual_mov_b32 v8, 2 :: v_dual_mov_b32 v9, 3
; GFX11-NEXT:    v_dual_mov_b32 v10, 3 :: v_dual_mov_b32 v11, 3
; GFX11-NEXT:    v_dual_mov_b32 v12, 4 :: v_dual_mov_b32 v13, 4
; GFX11-NEXT:    v_dual_mov_b32 v14, 4 :: v_dual_mov_b32 v15, 5
; GFX11-NEXT:    v_dual_mov_b32 v16, 5 :: v_dual_mov_b32 v17, 5
; GFX11-NEXT:    v_dual_mov_b32 v18, 6 :: v_dual_mov_b32 v19, 6
; GFX11-NEXT:    v_dual_mov_b32 v20, 6 :: v_dual_mov_b32 v21, 7
; GFX11-NEXT:    v_dual_mov_b32 v22, 7 :: v_dual_mov_b32 v23, 7
; GFX11-NEXT:    v_dual_mov_b32 v24, 8 :: v_dual_mov_b32 v25, 8
; GFX11-NEXT:    v_dual_mov_b32 v26, 8 :: v_dual_mov_b32 v27, 9
; GFX11-NEXT:    v_dual_mov_b32 v28, 9 :: v_dual_mov_b32 v29, 9
; GFX11-NEXT:    v_dual_mov_b32 v30, 10 :: v_dual_mov_b32 v31, 11
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_12xv3i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_12xv3i32@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: stack_12xv3i32:
; GFX10-SCRATCH:       ; %bb.0: ; %entry
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 12
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 13
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 14
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 15
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 1
; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v8, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v9, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v10, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v11, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v12, 4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v13, 4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v14, 4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v15, 5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v16, 5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v17, 5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v18, 6
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v19, 6
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v20, 6
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v21, 7
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v22, 7
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v23, 7
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v24, 8
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v25, 8
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v26, 8
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v27, 9
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v28, 9
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v29, 9
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v30, 10
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v31, 11
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_12xv3i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_12xv3i32@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; Arguments 0-9 are splats of their own index; the last two arguments carry
  ; the distinct values 10..15 so each trailing element can be told apart in
  ; the expected output.
  call amdgpu_gfx void @external_void_func_12xv3i32(
      <3 x i32><i32 0, i32 0, i32 0>,
      <3 x i32><i32 1, i32 1, i32 1>,
      <3 x i32><i32 2, i32 2, i32 2>,
      <3 x i32><i32 3, i32 3, i32 3>,
      <3 x i32><i32 4, i32 4, i32 4>,
      <3 x i32><i32 5, i32 5, i32 5>,
      <3 x i32><i32 6, i32 6, i32 6>,
      <3 x i32><i32 7, i32 7, i32 7>,
      <3 x i32><i32 8, i32 8, i32 8>,
      <3 x i32><i32 9, i32 9, i32 9>,
      <3 x i32><i32 10, i32 11, i32 12>,
      <3 x i32><i32 13, i32 14, i32 15>)
  ret void
}

; Calls @external_void_func_8xv5i32 with eight constant <5 x i32> arguments
; (40 i32 elements in total). In the expected output below, the first 32
; elements are materialized in v0-v31 (splats of 0 through 5, then 6 and 7),
; and the remaining eight elements (8 through 15) are stored at offsets 0
; through 28 relative to s32 before the call. The scratch-based configurations
; do this with two 128-bit stores, the second addressed through s32 + 16.
define amdgpu_gfx void @stack_8xv5i32() #0 {
; GFX9-LABEL: stack_8xv5i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_mov_b32_e32 v0, 8
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT:    v_mov_b32_e32 v0, 9
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 10
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, 11
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT:    v_mov_b32_e32 v0, 12
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT:    v_mov_b32_e32 v0, 14
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_8xv5i32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_8xv5i32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    v_mov_b32_e32 v5, 1
; GFX9-NEXT:    v_mov_b32_e32 v6, 1
; GFX9-NEXT:    v_mov_b32_e32 v7, 1
; GFX9-NEXT:    v_mov_b32_e32 v8, 1
; GFX9-NEXT:    v_mov_b32_e32 v9, 1
; GFX9-NEXT:    v_mov_b32_e32 v10, 2
; GFX9-NEXT:    v_mov_b32_e32 v11, 2
; GFX9-NEXT:    v_mov_b32_e32 v12, 2
; GFX9-NEXT:    v_mov_b32_e32 v13, 2
; GFX9-NEXT:    v_mov_b32_e32 v14, 2
; GFX9-NEXT:    v_mov_b32_e32 v15, 3
; GFX9-NEXT:    v_mov_b32_e32 v16, 3
; GFX9-NEXT:    v_mov_b32_e32 v17, 3
; GFX9-NEXT:    v_mov_b32_e32 v18, 3
; GFX9-NEXT:    v_mov_b32_e32 v19, 3
; GFX9-NEXT:    v_mov_b32_e32 v20, 4
; GFX9-NEXT:    v_mov_b32_e32 v21, 4
; GFX9-NEXT:    v_mov_b32_e32 v22, 4
; GFX9-NEXT:    v_mov_b32_e32 v23, 4
; GFX9-NEXT:    v_mov_b32_e32 v24, 4
; GFX9-NEXT:    v_mov_b32_e32 v25, 5
; GFX9-NEXT:    v_mov_b32_e32 v26, 5
; GFX9-NEXT:    v_mov_b32_e32 v27, 5
; GFX9-NEXT:    v_mov_b32_e32 v28, 5
; GFX9-NEXT:    v_mov_b32_e32 v29, 5
; GFX9-NEXT:    v_mov_b32_e32 v30, 6
; GFX9-NEXT:    v_mov_b32_e32 v31, 7
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: stack_8xv5i32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_mov_b32_e32 v0, 8
; GFX10-NEXT:    v_mov_b32_e32 v1, 9
; GFX10-NEXT:    v_mov_b32_e32 v2, 10
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
; GFX10-NEXT:    v_mov_b32_e32 v0, 11
; GFX10-NEXT:    v_mov_b32_e32 v1, 12
; GFX10-NEXT:    v_mov_b32_e32 v2, 13
; GFX10-NEXT:    v_mov_b32_e32 v3, 14
; GFX10-NEXT:    v_mov_b32_e32 v4, 15
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16
; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:24
; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    v_mov_b32_e32 v5, 1
; GFX10-NEXT:    v_mov_b32_e32 v6, 1
; GFX10-NEXT:    v_mov_b32_e32 v7, 1
; GFX10-NEXT:    v_mov_b32_e32 v8, 1
; GFX10-NEXT:    v_mov_b32_e32 v9, 1
; GFX10-NEXT:    v_mov_b32_e32 v10, 2
; GFX10-NEXT:    v_mov_b32_e32 v11, 2
; GFX10-NEXT:    v_mov_b32_e32 v12, 2
; GFX10-NEXT:    v_mov_b32_e32 v13, 2
; GFX10-NEXT:    v_mov_b32_e32 v14, 2
; GFX10-NEXT:    v_mov_b32_e32 v15, 3
; GFX10-NEXT:    v_mov_b32_e32 v16, 3
; GFX10-NEXT:    v_mov_b32_e32 v17, 3
; GFX10-NEXT:    v_mov_b32_e32 v18, 3
; GFX10-NEXT:    v_mov_b32_e32 v19, 3
; GFX10-NEXT:    v_mov_b32_e32 v20, 4
; GFX10-NEXT:    v_mov_b32_e32 v21, 4
; GFX10-NEXT:    v_mov_b32_e32 v22, 4
; GFX10-NEXT:    v_mov_b32_e32 v23, 4
; GFX10-NEXT:    v_mov_b32_e32 v24, 4
; GFX10-NEXT:    v_mov_b32_e32 v25, 5
; GFX10-NEXT:    v_mov_b32_e32 v26, 5
; GFX10-NEXT:    v_mov_b32_e32 v27, 5
; GFX10-NEXT:    v_mov_b32_e32 v28, 5
; GFX10-NEXT:    v_mov_b32_e32 v29, 5
; GFX10-NEXT:    v_mov_b32_e32 v30, 6
; GFX10-NEXT:    v_mov_b32_e32 v31, 7
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_8xv5i32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_8xv5i32@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: stack_8xv5i32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 9
; GFX11-NEXT:    v_dual_mov_b32 v2, 10 :: v_dual_mov_b32 v3, 11
; GFX11-NEXT:    v_dual_mov_b32 v4, 12 :: v_dual_mov_b32 v5, 13
; GFX11-NEXT:    v_dual_mov_b32 v6, 14 :: v_dual_mov_b32 v7, 15
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s0, s32, 16
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1
; GFX11-NEXT:    v_dual_mov_b32 v6, 1 :: v_dual_mov_b32 v7, 1
; GFX11-NEXT:    v_dual_mov_b32 v8, 1 :: v_dual_mov_b32 v9, 1
; GFX11-NEXT:    v_dual_mov_b32 v10, 2 :: v_dual_mov_b32 v11, 2
; GFX11-NEXT:    v_dual_mov_b32 v12, 2 :: v_dual_mov_b32 v13, 2
; GFX11-NEXT:    v_dual_mov_b32 v14, 2 :: v_dual_mov_b32 v15, 3
; GFX11-NEXT:    v_dual_mov_b32 v16, 3 :: v_dual_mov_b32 v17, 3
; GFX11-NEXT:    v_dual_mov_b32 v18, 3 :: v_dual_mov_b32 v19, 3
; GFX11-NEXT:    v_dual_mov_b32 v20, 4 :: v_dual_mov_b32 v21, 4
; GFX11-NEXT:    v_dual_mov_b32 v22, 4 :: v_dual_mov_b32 v23, 4
; GFX11-NEXT:    v_dual_mov_b32 v24, 4 :: v_dual_mov_b32 v25, 5
; GFX11-NEXT:    v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v27, 5
; GFX11-NEXT:    v_dual_mov_b32 v28, 5 :: v_dual_mov_b32 v29, 5
; GFX11-NEXT:    v_dual_mov_b32 v30, 6 :: v_dual_mov_b32 v31, 7
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_8xv5i32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_8xv5i32@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: stack_8xv5i32:
; GFX10-SCRATCH:       ; %bb.0: ; %entry
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 8
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 9
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 10
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 11
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 12
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 13
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 14
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 15
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s0, s32, 16
; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v8, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v9, 1
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v10, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v11, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v12, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v13, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v14, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v15, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v16, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v17, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v18, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v19, 3
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v20, 4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v21, 4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v22, 4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v23, 4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v24, 4
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v25, 5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v26, 5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v27, 5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v28, 5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v29, 5
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v30, 6
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v31, 7
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_8xv5i32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_8xv5i32@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; Arguments 0-5 are splats of their own index; the last two arguments carry
  ; the distinct values 6..15 so each trailing element can be told apart in
  ; the expected output.
  call amdgpu_gfx void @external_void_func_8xv5i32(
      <5 x i32><i32 0, i32 0, i32 0, i32 0, i32 0>,
      <5 x i32><i32 1, i32 1, i32 1, i32 1, i32 1>,
      <5 x i32><i32 2, i32 2, i32 2, i32 2, i32 2>,
      <5 x i32><i32 3, i32 3, i32 3, i32 3, i32 3>,
      <5 x i32><i32 4, i32 4, i32 4, i32 4, i32 4>,
      <5 x i32><i32 5, i32 5, i32 5, i32 5, i32 5>,
      <5 x i32><i32 6, i32 7, i32 8, i32 9, i32 10>,
      <5 x i32><i32 11, i32 12, i32 13, i32 14, i32 15>)
  ret void
}

; Calls an external amdgpu_gfx function with eight constant <5 x float>
; arguments, i.e. 40 dwords in total. Per the assertions below, the first 32
; dwords are materialized in v0-v31 (0.0 through 5.0 five times each, then 6.0
; and 7.0), and the remaining eight dwords (8.0 through 15.0, bit patterns
; 0x41000000 through 0x41700000) are stored to the 32 bytes of memory
; addressed from s32 before the call.
define amdgpu_gfx void @stack_8xv5f32() #0 {
; GFX9-LABEL: stack_8xv5f32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41000000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41100000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41200000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41300000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41400000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41500000
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41600000
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41700000
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_8xv5f32@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_8xv5f32@abs32@lo
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    v_mov_b32_e32 v4, 0
; GFX9-NEXT:    v_mov_b32_e32 v5, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v6, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v7, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v8, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v9, 1.0
; GFX9-NEXT:    v_mov_b32_e32 v10, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v11, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v12, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v13, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v14, 2.0
; GFX9-NEXT:    v_mov_b32_e32 v15, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v16, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v17, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v18, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v19, 0x40400000
; GFX9-NEXT:    v_mov_b32_e32 v20, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v21, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v22, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v23, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v24, 4.0
; GFX9-NEXT:    v_mov_b32_e32 v25, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v26, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v27, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v28, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v29, 0x40a00000
; GFX9-NEXT:    v_mov_b32_e32 v30, 0x40c00000
; GFX9-NEXT:    v_mov_b32_e32 v31, 0x40e00000
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: stack_8xv5f32:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41000000
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41100000
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x41200000
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32
; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
; GFX10-NEXT:    v_mov_b32_e32 v0, 0x41300000
; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41400000
; GFX10-NEXT:    v_mov_b32_e32 v2, 0x41500000
; GFX10-NEXT:    v_mov_b32_e32 v3, 0x41600000
; GFX10-NEXT:    v_mov_b32_e32 v4, 0x41700000
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
; GFX10-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16
; GFX10-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
; GFX10-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:24
; GFX10-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-NEXT:    v_mov_b32_e32 v5, 1.0
; GFX10-NEXT:    v_mov_b32_e32 v6, 1.0
; GFX10-NEXT:    v_mov_b32_e32 v7, 1.0
; GFX10-NEXT:    v_mov_b32_e32 v8, 1.0
; GFX10-NEXT:    v_mov_b32_e32 v9, 1.0
; GFX10-NEXT:    v_mov_b32_e32 v10, 2.0
; GFX10-NEXT:    v_mov_b32_e32 v11, 2.0
; GFX10-NEXT:    v_mov_b32_e32 v12, 2.0
; GFX10-NEXT:    v_mov_b32_e32 v13, 2.0
; GFX10-NEXT:    v_mov_b32_e32 v14, 2.0
; GFX10-NEXT:    v_mov_b32_e32 v15, 0x40400000
; GFX10-NEXT:    v_mov_b32_e32 v16, 0x40400000
; GFX10-NEXT:    v_mov_b32_e32 v17, 0x40400000
; GFX10-NEXT:    v_mov_b32_e32 v18, 0x40400000
; GFX10-NEXT:    v_mov_b32_e32 v19, 0x40400000
; GFX10-NEXT:    v_mov_b32_e32 v20, 4.0
; GFX10-NEXT:    v_mov_b32_e32 v21, 4.0
; GFX10-NEXT:    v_mov_b32_e32 v22, 4.0
; GFX10-NEXT:    v_mov_b32_e32 v23, 4.0
; GFX10-NEXT:    v_mov_b32_e32 v24, 4.0
; GFX10-NEXT:    v_mov_b32_e32 v25, 0x40a00000
; GFX10-NEXT:    v_mov_b32_e32 v26, 0x40a00000
; GFX10-NEXT:    v_mov_b32_e32 v27, 0x40a00000
; GFX10-NEXT:    v_mov_b32_e32 v28, 0x40a00000
; GFX10-NEXT:    v_mov_b32_e32 v29, 0x40a00000
; GFX10-NEXT:    v_mov_b32_e32 v30, 0x40c00000
; GFX10-NEXT:    v_mov_b32_e32 v31, 0x40e00000
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_8xv5f32@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_8xv5f32@abs32@lo
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: stack_8xv5f32:
; GFX11:       ; %bb.0: ; %entry
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    v_mov_b32_e32 v0, 0x41000000
; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41100000
; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41200000
; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41300000
; GFX11-NEXT:    v_mov_b32_e32 v4, 0x41400000
; GFX11-NEXT:    v_mov_b32_e32 v5, 0x41500000
; GFX11-NEXT:    v_mov_b32_e32 v6, 0x41600000
; GFX11-NEXT:    v_mov_b32_e32 v7, 0x41700000
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    s_add_i32 s0, s32, 16
; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s0
; GFX11-NEXT:    v_mov_b32_e32 v6, 1.0
; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1.0
; GFX11-NEXT:    v_dual_mov_b32 v7, 1.0 :: v_dual_mov_b32 v8, 1.0
; GFX11-NEXT:    v_dual_mov_b32 v9, 1.0 :: v_dual_mov_b32 v10, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v11, 2.0 :: v_dual_mov_b32 v12, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v13, 2.0 :: v_dual_mov_b32 v14, 2.0
; GFX11-NEXT:    v_dual_mov_b32 v15, 0x40400000 :: v_dual_mov_b32 v16, 0x40400000
; GFX11-NEXT:    v_dual_mov_b32 v17, 0x40400000 :: v_dual_mov_b32 v18, 0x40400000
; GFX11-NEXT:    v_dual_mov_b32 v19, 0x40400000 :: v_dual_mov_b32 v20, 4.0
; GFX11-NEXT:    v_dual_mov_b32 v21, 4.0 :: v_dual_mov_b32 v22, 4.0
; GFX11-NEXT:    v_dual_mov_b32 v23, 4.0 :: v_dual_mov_b32 v24, 4.0
; GFX11-NEXT:    v_dual_mov_b32 v25, 0x40a00000 :: v_dual_mov_b32 v26, 0x40a00000
; GFX11-NEXT:    v_dual_mov_b32 v27, 0x40a00000 :: v_dual_mov_b32 v28, 0x40a00000
; GFX11-NEXT:    v_mov_b32_e32 v29, 0x40a00000
; GFX11-NEXT:    v_mov_b32_e32 v30, 0x40c00000
; GFX11-NEXT:    v_mov_b32_e32 v31, 0x40e00000
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_8xv5f32@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_8xv5f32@abs32@lo
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: stack_8xv5f32:
; GFX10-SCRATCH:       ; %bb.0: ; %entry
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x41000000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x41100000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0x41200000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0x41300000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0x41400000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 0x41500000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 0x41600000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 0x41700000
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    s_add_i32 s0, s32, 16
; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 1.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 1.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 1.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v8, 1.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v9, 1.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v10, 2.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v11, 2.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v12, 2.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v13, 2.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v14, 2.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v15, 0x40400000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v16, 0x40400000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v17, 0x40400000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v18, 0x40400000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v19, 0x40400000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v20, 4.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v21, 4.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v22, 4.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v23, 4.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v24, 4.0
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v25, 0x40a00000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v26, 0x40a00000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v27, 0x40a00000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v28, 0x40a00000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v29, 0x40a00000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v30, 0x40c00000
; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v31, 0x40e00000
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_8xv5f32@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_8xv5f32@abs32@lo
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; The first six vectors are splats of 0.0 through 5.0; the last two carry the
  ; distinct values 6.0 through 15.0. NOTE(review): presumably the distinct
  ; values are there so each dword that lands on the stack can be told apart in
  ; the expected output - confirm with the test author.
  call amdgpu_gfx void @external_void_func_8xv5f32(
      <5 x float><float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>,
      <5 x float><float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>,
      <5 x float><float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>,
      <5 x float><float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>,
      <5 x float><float 4.0, float 4.0, float 4.0, float 4.0, float 4.0>,
      <5 x float><float 5.0, float 5.0, float 5.0, float 5.0, float 5.0>,
      <5 x float><float 6.0, float 7.0, float 8.0, float 9.0, float 10.0>,
      <5 x float><float 11.0, float 12.0, float 13.0, float 14.0, float 15.0>)
  ret void
}

; Bitcasts the incoming i16 argument to bfloat and passes it to the external
; amdgpu_gfx function @external_void_func_bf16. The assertions below contain
; no instruction for the bitcast itself: only the frame setup, the call through
; s_swappc_b64 and the matching teardown are expected on every target.
define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_bf16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_bf16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_bf16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_bf16:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Reinterpret the 16-bit integer argument as a bfloat and forward it.
  %val = bitcast i16 %arg to bfloat
  call amdgpu_gfx void @external_void_func_bf16(bfloat %val)
  ret void
}

; Bitcasts the incoming i16 argument to <1 x bfloat> and passes it to the
; external amdgpu_gfx function @external_void_func_v1bf16. The assertions below
; contain no instruction for the bitcast itself: only the frame setup, the call
; through s_swappc_b64 and the matching teardown are expected on every target.
define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v1bf16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v1bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v1bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v1bf16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v1bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v1bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v1bf16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v1bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v1bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v1bf16:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v1bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v1bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Reinterpret the 16-bit integer argument as a one-element bfloat vector and
  ; forward it.
  %val = bitcast i16 %arg to <1 x bfloat>
  call amdgpu_gfx void @external_void_func_v1bf16(<1 x bfloat> %val)
  ret void
}

; Bitcasts the incoming i32 argument to <2 x bfloat> and passes it to the
; external amdgpu_gfx function @external_void_func_v2bf16. The assertions below
; contain no instruction for the bitcast itself: only the frame setup, the call
; through s_swappc_b64 and the matching teardown are expected on every target.
define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v2bf16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2bf16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2bf16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2bf16:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Reinterpret the 32-bit integer argument as a two-element bfloat vector and
  ; forward it.
  %val = bitcast i32 %arg to <2 x bfloat>
  call amdgpu_gfx void @external_void_func_v2bf16(<2 x bfloat> %val)
  ret void
}

define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v3bf16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3bf16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3bf16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3bf16:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast <3 x i16> %arg to <3 x bfloat>
  call amdgpu_gfx void @external_void_func_v3bf16(<3 x bfloat> %val)
  ret void
}

; Calls @external_void_func_v4bf16 (amdgpu_gfx calling convention) with a
; <4 x bfloat> value obtained by bitcasting the incoming <4 x i16> argument.
; For every llc configuration exercised by this file the expected code is just
; the call frame: v40 is spilled to scratch at s33, the return address s30/s31
; and the previous s33 are kept in its lanes 0-2, the callee is reached through
; s_swappc_b64, and everything is restored before s_setpc_b64. No instruction
; in the expected output moves the argument itself.
; NOTE(review): presumably the incoming and outgoing argument registers
; coincide, which is why no copies appear - confirm against the calling
; convention documentation.
define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v4bf16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4bf16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4bf16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4bf16:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Reinterpret the integer lanes as bfloat; the expected output above contains
  ; no instruction for this conversion.
  %val = bitcast <4 x i16> %arg to <4 x bfloat>
  call amdgpu_gfx void @external_void_func_v4bf16(<4 x bfloat> %val)
  ret void
}

; Calls @external_void_func_v8bf16 (amdgpu_gfx calling convention) with an
; <8 x bfloat> value obtained by bitcasting the incoming <8 x i16> argument.
; For every llc configuration exercised by this file the expected code is just
; the call frame: v40 is spilled to scratch at s33, the return address s30/s31
; and the previous s33 are kept in its lanes 0-2, the callee is reached through
; s_swappc_b64, and everything is restored before s_setpc_b64. No instruction
; in the expected output moves the argument itself.
; NOTE(review): presumably the incoming and outgoing argument registers
; coincide, which is why no copies appear - confirm against the calling
; convention documentation.
define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v8bf16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v8bf16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v8bf16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8bf16:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Reinterpret the integer lanes as bfloat; the expected output above contains
  ; no instruction for this conversion.
  %val = bitcast <8 x i16> %arg to <8 x bfloat>
  call amdgpu_gfx void @external_void_func_v8bf16(<8 x bfloat> %val)
  ret void
}

; Calls @external_void_func_v16bf16 (amdgpu_gfx calling convention) with a
; <16 x bfloat> value obtained by bitcasting the incoming <16 x i16> argument.
; For every llc configuration exercised by this file the expected code is just
; the call frame: v40 is spilled to scratch at s33, the return address s30/s31
; and the previous s33 are kept in its lanes 0-2, the callee is reached through
; s_swappc_b64, and everything is restored before s_setpc_b64. No instruction
; in the expected output moves the argument itself.
; NOTE(review): presumably the incoming and outgoing argument registers
; coincide, which is why no copies appear - confirm against the calling
; convention documentation.
define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v16bf16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v16bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v16bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v16bf16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v16bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v16bf16:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v16bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v16bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16bf16:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v16bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v16bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Reinterpret the integer lanes as bfloat; the expected output above contains
  ; no instruction for this conversion.
  %val = bitcast <16 x i16> %arg to <16 x bfloat>
  call amdgpu_gfx void @external_void_func_v16bf16(<16 x bfloat> %val)
  ret void
}

; Calls @external_void_func_bf16 (amdgpu_gfx calling convention) with a scalar
; bfloat marked inreg at the call site; the value is a bitcast of the incoming
; inreg i16 argument. For every llc configuration exercised by this file the
; expected code is just the call frame: v40 is spilled to scratch at s33, the
; return address s30/s31 and the previous s33 are kept in its lanes 0-2, the
; callee is reached through s_swappc_b64, and everything is restored before
; s_setpc_b64. No instruction in the expected output moves the argument itself.
; NOTE(review): presumably the inreg argument arrives in the same register the
; callee expects it in, which is why no copies appear - confirm against the
; calling convention documentation.
define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_bf16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_bf16_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_bf16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_bf16_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Reinterpret the 16-bit integer as bfloat; the expected output above
  ; contains no instruction for this conversion.
  %val = bitcast i16 %arg to bfloat
  call amdgpu_gfx void @external_void_func_bf16(bfloat inreg %val)
  ret void
}

; Calls @external_void_func_v1bf16 (amdgpu_gfx calling convention) with a
; single-element <1 x bfloat> vector marked inreg at the call site; the value
; is a bitcast of the incoming inreg i16 argument. For every llc configuration
; exercised by this file the expected code is just the call frame: v40 is
; spilled to scratch at s33, the return address s30/s31 and the previous s33
; are kept in its lanes 0-2, the callee is reached through s_swappc_b64, and
; everything is restored before s_setpc_b64. No instruction in the expected
; output moves the argument itself.
; NOTE(review): presumably the inreg argument arrives in the same register the
; callee expects it in, which is why no copies appear - confirm against the
; calling convention documentation.
define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v1bf16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v1bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v1bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v1bf16_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v1bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v1bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v1bf16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v1bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v1bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v1bf16_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v1bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v1bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  ; Reinterpret the 16-bit integer as a one-element bfloat vector; the expected
  ; output above contains no instruction for this conversion.
  %val = bitcast i16 %arg to <1 x bfloat>
  call amdgpu_gfx void @external_void_func_v1bf16(<1 x bfloat> inreg %val)
  ret void
}

define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v2bf16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v2bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v2bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v2bf16_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v2bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v2bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v2bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v2bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2bf16_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v2bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v2bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast i32 %arg to <2 x bfloat>
  call amdgpu_gfx void @external_void_func_v2bf16(<2 x bfloat> inreg %val)
  ret void
}

; Calls @external_void_func_v3bf16 with a <3 x bfloat> argument marked inreg.
; The <3 x i16> inreg parameter is bitcast to <3 x bfloat> right before the
; call, so the IR body performs no bfloat arithmetic of its own. The
; autogenerated assertions below expect only frame setup, the call sequence and
; frame teardown; no instruction copying the argument appears for any of the
; four run configurations.
; NOTE(review): presumably the argument already sits in the SGPRs the callee
; expects -- confirm against the amdgpu_gfx calling convention.
define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v3bf16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v3bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v3bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v3bf16_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v3bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v3bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v3bf16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v3bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v3bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3bf16_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v3bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v3bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast <3 x i16> %arg to <3 x bfloat>
  call amdgpu_gfx void @external_void_func_v3bf16(<3 x bfloat> inreg %val)
  ret void
}

; Calls @external_void_func_v4bf16 with a <4 x bfloat> argument marked inreg.
; The <4 x i16> inreg parameter is bitcast to <4 x bfloat> right before the
; call, so the IR body performs no bfloat arithmetic of its own. The
; autogenerated assertions below expect only frame setup, the call sequence and
; frame teardown; no instruction copying the argument appears for any of the
; four run configurations.
; NOTE(review): presumably the argument already sits in the SGPRs the callee
; expects -- confirm against the amdgpu_gfx calling convention.
define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v4bf16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v4bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v4bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v4bf16_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v4bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v4bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v4bf16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v4bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v4bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4bf16_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v4bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v4bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast <4 x i16> %arg to <4 x bfloat>
  call amdgpu_gfx void @external_void_func_v4bf16(<4 x bfloat> inreg %val)
  ret void
}

; Calls @external_void_func_v8bf16 with an <8 x bfloat> argument marked inreg.
; The <8 x i16> inreg parameter is bitcast to <8 x bfloat> right before the
; call, so the IR body performs no bfloat arithmetic of its own. The
; autogenerated assertions below expect only frame setup, the call sequence and
; frame teardown; no instruction copying the argument appears for any of the
; four run configurations.
; NOTE(review): presumably the argument already sits in the SGPRs the callee
; expects -- confirm against the amdgpu_gfx calling convention.
define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v8bf16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v8bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v8bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v8bf16_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v8bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v8bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v8bf16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v8bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v8bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8bf16_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v8bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v8bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast <8 x i16> %arg to <8 x bfloat>
  call amdgpu_gfx void @external_void_func_v8bf16(<8 x bfloat> inreg %val)
  ret void
}

; Calls @external_void_func_v16bf16 with a <16 x bfloat> argument marked inreg.
; The <16 x i16> inreg parameter is bitcast to <16 x bfloat> right before the
; call, so the IR body performs no bfloat arithmetic of its own. The
; autogenerated assertions below expect only frame setup, the call sequence and
; frame teardown; no instruction copying the argument appears for any of the
; four run configurations.
; NOTE(review): presumably the argument already sits in the SGPRs the callee
; expects -- confirm against the amdgpu_gfx calling convention.
define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_v16bf16_inreg:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s34, s33
; GFX9-NEXT:    s_mov_b32 s33, s32
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    v_writelane_b32 v40, s34, 2
; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
; GFX9-NEXT:    s_mov_b32 s35, external_void_func_v16bf16@abs32@hi
; GFX9-NEXT:    s_mov_b32 s34, external_void_func_v16bf16@abs32@lo
; GFX9-NEXT:    s_addk_i32 s32, 0x400
; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
; GFX9-NEXT:    v_readlane_b32 s34, v40, 2
; GFX9-NEXT:    s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT:    s_mov_b64 exec, s[36:37]
; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
; GFX9-NEXT:    s_mov_b32 s33, s34
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_call_external_void_func_v16bf16_inreg:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s34, s33
; GFX10-NEXT:    s_mov_b32 s33, s32
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    v_writelane_b32 v40, s34, 2
; GFX10-NEXT:    s_mov_b32 s35, external_void_func_v16bf16@abs32@hi
; GFX10-NEXT:    s_mov_b32 s34, external_void_func_v16bf16@abs32@lo
; GFX10-NEXT:    s_addk_i32 s32, 0x200
; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-NEXT:    v_readlane_b32 s34, v40, 2
; GFX10-NEXT:    s_or_saveexec_b32 s35, -1
; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-NEXT:    s_mov_b32 exec_lo, s35
; GFX10-NEXT:    s_addk_i32 s32, 0xfe00
; GFX10-NEXT:    s_mov_b32 s33, s34
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: test_call_external_void_func_v16bf16_inreg:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_mov_b32 s0, s33
; GFX11-NEXT:    s_mov_b32 s33, s32
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    v_writelane_b32 v40, s0, 2
; GFX11-NEXT:    s_mov_b32 s1, external_void_func_v16bf16@abs32@hi
; GFX11-NEXT:    s_mov_b32 s0, external_void_func_v16bf16@abs32@lo
; GFX11-NEXT:    s_add_i32 s32, s32, 16
; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
; GFX11-NEXT:    v_readlane_b32 s0, v40, 2
; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
; GFX11-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT:    s_mov_b32 exec_lo, s1
; GFX11-NEXT:    s_add_i32 s32, s32, -16
; GFX11-NEXT:    s_mov_b32 s33, s0
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16bf16_inreg:
; GFX10-SCRATCH:       ; %bb.0:
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, s33
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, external_void_func_v16bf16@abs32@hi
; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, external_void_func_v16bf16@abs32@lo
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT:    s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT:    s_add_i32 s32, s32, -16
; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s0
; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
; GFX10-SCRATCH-NEXT:    s_setpc_b64 s[30:31]
  %val = bitcast <16 x i16> %arg to <16 x bfloat>
  call amdgpu_gfx void @external_void_func_v16bf16(<16 x bfloat> inreg %val)
  ret void
}

; Declarations of external amdgpu_gfx callees and the attribute groups used by
; this file.

; Takes a <32 x i32> followed by a 16-byte-aligned byval(double) pointer in
; address space 5.
declare hidden amdgpu_gfx void @byval_align16_f64_arg(<32 x i32>, ptr addrspace(5) byval(double) align 16) #0
; Takes a <32 x i32> followed by a double.
; NOTE(review): the name suggests the double is meant to be passed on the
; stack after the <32 x i32> argument -- confirm against the calling convention.
declare hidden amdgpu_gfx void @stack_passed_f64_arg(<32 x i32>, double) #0
; Takes twelve <3 x i32> parameters.
declare hidden amdgpu_gfx void @external_void_func_12xv3i32(<3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>,
    <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>, <3 x i32>) #0
; Takes eight <5 x i32> parameters.
declare hidden amdgpu_gfx void @external_void_func_8xv5i32(<5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>,
    <5 x i32>, <5 x i32>, <5 x i32>, <5 x i32>) #0
; Takes twelve <3 x float> parameters.
declare hidden amdgpu_gfx void @external_void_func_12xv3f32(<3 x float>, <3 x float>, <3 x float>, <3 x float>,
    <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>) #0
; Takes eight <5 x float> parameters.
declare hidden amdgpu_gfx void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>,
    <5 x float>, <5 x float>, <5 x float>, <5 x float>) #0
; Attribute group referenced as #0 by the declarations above.
attributes #0 = { nounwind }
; NOTE(review): no reference to #1 is visible near these declarations --
; confirm it is still used elsewhere in the file.
attributes #1 = { nounwind noinline }
