// Listing metadata: 121 lines, 2.9 KiB, plaintext.
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: UNKNOWN
// Unknown Toolkit Version
// Based on NVVM 7.0.1
//

// PTX ISA version this module is written against.
.version 8.5
// Code is generated for sm_86; texmode_independent selects the texturing
// mode in which textures and samplers are referenced independently.
.target sm_86, texmode_independent
// All addresses in this module are 64 bits wide.
.address_size 64

// .globl ConcatVectorsKernel

//-----------------------------------------------------------------------
// ConcatVectorsKernel
//
// Each thread handles one element (gid_x, gid_y):
//   1. Returns immediately if gid_x > 2047 or gid_y > 2047 (signed compare).
//   2. Loads one 4 x f32 vector from param_0 at element index
//      idx = gid_y * 2048 + gid_x   (16 bytes per element).
//   3. If ANY lane is Inf or NaN, the whole vector is treated as zero.
//   4. Every lane is clamped from below: lane = max(0, lane).
//   5. For each of the low four bits of param_5 (bit i selects lane i),
//      in order, the lane is stored as one f32 to param_4. The first
//      selected lane lands at byte offset idx * 4; every further selected
//      lane lands 16777216 bytes (= 2048 * 2048 * 4) after the previous one.
//
// Parameters read here:
//   param_0  global pointer, 16-byte aligned : input vectors
//   param_4  global pointer,  4-byte aligned : f32 output
//   param_5  u32                             : lane-selection bit mask
// param_1..3 and param_6..9 are declared but never loaded in this body.
//-----------------------------------------------------------------------
.entry ConcatVectorsKernel(
    .param .u64 .ptr .global .align 16 ConcatVectorsKernel_param_0,
    .param .u32 ConcatVectorsKernel_param_1,
    .param .u32 ConcatVectorsKernel_param_2,
    .param .u32 ConcatVectorsKernel_param_3,
    .param .u64 .ptr .global .align 4 ConcatVectorsKernel_param_4,
    .param .u32 ConcatVectorsKernel_param_5,
    .param .u32 ConcatVectorsKernel_param_6,
    .param .u32 ConcatVectorsKernel_param_7,
    .param .u32 ConcatVectorsKernel_param_8,
    .param .u32 ConcatVectorsKernel_param_9
)
{
    .reg .pred  %oob_x, %oob_y, %oob;
    .reg .pred  %bad0, %bad1, %bad2, %bad3, %bad01, %bad23, %bad_any;
    .reg .pred  %all_finite, %skip;
    .reg .f32   %in0, %in1, %in2, %in3;
    .reg .f32   %abs0, %abs1, %abs2, %abs3;
    .reg .f32   %sel0, %sel1, %sel2, %sel3;
    .reg .f32   %out0, %out1, %out2, %out3;
    .reg .f32   %zero;
    .reg .b32   %lane_mask, %bit;
    .reg .b32   %off_x, %blk_x, %dim_x, %lid_x, %lid_x_off, %gid_x;
    .reg .b32   %off_y, %blk_y, %dim_y, %lid_y, %lid_y_off, %gid_y;
    .reg .b32   %row_off, %idx;
    .reg .b64   %in_base, %out_base, %in_off, %out_off, %in_ptr, %out_ptr;

    ld.param.u64    %in_base,   [ConcatVectorsKernel_param_0];
    ld.param.u64    %out_base,  [ConcatVectorsKernel_param_4];
    ld.param.u32    %lane_mask, [ConcatVectorsKernel_param_5];

    // gid_x = ntid.x * ctaid.x + tid.x + envreg3
    // NOTE(review): %envreg3 / %envreg4 are driver-defined; presumably the
    // per-dimension global work offset -- confirm against the runtime.
    mov.b32         %off_x, %envreg3;
    mov.u32         %blk_x, %ctaid.x;
    mov.u32         %dim_x, %ntid.x;
    mov.u32         %lid_x, %tid.x;
    add.s32         %lid_x_off, %lid_x, %off_x;
    mad.lo.s32      %gid_x, %dim_x, %blk_x, %lid_x_off;

    // gid_y = ntid.y * ctaid.y + tid.y + envreg4
    mov.u32         %blk_y, %ctaid.y;
    mov.u32         %dim_y, %ntid.y;
    mov.u32         %lid_y, %tid.y;
    mov.b32         %off_y, %envreg4;
    add.s32         %lid_y_off, %lid_y, %off_y;
    mad.lo.s32      %gid_y, %dim_y, %blk_y, %lid_y_off;

    // Threads with either id above 2047 have nothing to do.
    setp.gt.s32     %oob_x, %gid_x, 2047;
    setp.gt.s32     %oob_y, %gid_y, 2047;
    or.pred         %oob, %oob_x, %oob_y;
    @%oob bra       $L__exit;

    // idx = gid_y * 2048 + gid_x   (shift by 11 == multiply by 2048)
    shl.b32         %row_off, %gid_y, 11;
    add.s32         %idx, %row_off, %gid_x;

    // out_ptr = out_base + idx * 4   (one f32 per element)
    mul.wide.s32    %out_off, %idx, 4;
    add.s64         %out_ptr, %out_base, %out_off;

    // in_ptr = in_base + idx * 16    (one 4 x f32 vector per element)
    mul.wide.s32    %in_off, %idx, 16;
    add.s64         %in_ptr, %in_base, %in_off;
    ld.global.nc.v4.f32 {%in0, %in1, %in2, %in3}, [%in_ptr];

    // badN = |laneN| >= +Inf or unordered, i.e. laneN is Inf or NaN.
    abs.ftz.f32     %abs0, %in0;
    abs.ftz.f32     %abs1, %in1;
    abs.ftz.f32     %abs2, %in2;
    abs.ftz.f32     %abs3, %in3;
    setp.geu.ftz.f32 %bad0, %abs0, 0f7F800000;
    setp.geu.ftz.f32 %bad1, %abs1, 0f7F800000;
    setp.geu.ftz.f32 %bad2, %abs2, 0f7F800000;
    setp.geu.ftz.f32 %bad3, %abs3, 0f7F800000;

    // all_finite = none of the four lanes is Inf/NaN.
    or.pred         %bad01, %bad0, %bad1;
    or.pred         %bad23, %bad2, %bad3;
    or.pred         %bad_any, %bad01, %bad23;
    not.pred        %all_finite, %bad_any;

    // One non-finite lane zeroes the whole vector; then clamp to >= 0.
    mov.f32         %zero, 0f00000000;
    selp.f32        %sel0, %in0, 0f00000000, %all_finite;
    max.f32         %out0, %zero, %sel0;
    selp.f32        %sel1, %in1, 0f00000000, %all_finite;
    max.f32         %out1, %zero, %sel1;
    selp.f32        %sel2, %in2, 0f00000000, %all_finite;
    max.f32         %out2, %zero, %sel2;
    selp.f32        %sel3, %in3, 0f00000000, %all_finite;
    max.f32         %out3, %zero, %sel3;

    // Lane 0: stored only when mask bit 0 is set. After each store the
    // output pointer advances by 16777216 bytes (2048 * 2048 * 4), so the
    // selected lanes end up in consecutive planes.
    and.b32         %bit, %lane_mask, 1;
    setp.eq.s32     %skip, %bit, 0;
    @%skip bra      $L__lane1;

    st.global.f32   [%out_ptr], %out0;
    add.s64         %out_ptr, %out_ptr, 16777216;

$L__lane1:
    // Lane 1: mask bit 1.
    and.b32         %bit, %lane_mask, 2;
    setp.eq.s32     %skip, %bit, 0;
    @%skip bra      $L__lane2;

    st.global.f32   [%out_ptr], %out1;
    add.s64         %out_ptr, %out_ptr, 16777216;

$L__lane2:
    // Lane 2: mask bit 2.
    and.b32         %bit, %lane_mask, 4;
    setp.eq.s32     %skip, %bit, 0;
    @%skip bra      $L__lane3;

    st.global.f32   [%out_ptr], %out2;
    add.s64         %out_ptr, %out_ptr, 16777216;

$L__lane3:
    // Lane 3: mask bit 3. Last lane, so the pointer is not advanced.
    and.b32         %bit, %lane_mask, 8;
    setp.eq.s32     %skip, %bit, 0;
    @%skip bra      $L__exit;

    st.global.f32   [%out_ptr], %out3;

$L__exit:
    ret;

}