// VirtualFramework/kernelsCache/ConcatImageFromPlanar-4e4df6c2.v.01070000BBCFD912.NVIDIA GeForce RTX 3070 Ti.560.94_64.bin
// 2025-03-07 17:23:20 +08:00
//
// 86 lines
// 2.2 KiB
// Plaintext
// Raw Permalink Blame History
//
// This file contains invisible Unicode characters
//
// This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: UNKNOWN
// Unknown Toolkit Version
// Based on NVVM 7.0.1
//
// PTX ISA version this module is written against.
.version 8.5
// Code is generated for the sm_86 architecture; texmode_independent selects
// the independent texture/sampler mode.
.target sm_86, texmode_independent
// All addresses in this module are 64 bits wide.
.address_size 64
// .globl ConcatImageFromPlanarKernel
// ---------------------------------------------------------------------------
// ConcatImageFromPlanarKernel
//
// Per-thread behaviour, as written in this listing:
//   x = %tid.x + %envreg3 + %ntid.x * %ctaid.x
//   y = %tid.y + %envreg4 + %ntid.y * %ctaid.y
//   if (x > 2047 || y > 2047) return;            (signed compares)
//   i = y * 2048 + x
//   c0 = f32 at param_0 + 4*i
//   c1 = f32 at param_0 + 4*i + 16777216         (16777216 = 2048*2048*4 bytes)
//   c2 = f32 at param_0 + 4*i + 33554432         (33554432 = 2 * 16777216 bytes)
//   v  = 4 x f32 at param_1 + 16 * ((clamp(y,0,2047) << 11) | clamp(x,0,2047))
//   w  = v.w if all four components of v are finite, otherwise 0.0
//   store {c0, c1, c2, w} as 4 x f32 at param_2 + 16*i
//
// Parameters (all 64-bit pointers into the global state space):
//   param_0 : 4-byte aligned;  read as scalar f32 at three offsets.
//   param_1 : 16-byte aligned; read as one 4 x f32 vector.
//   param_2 : 16-byte aligned; written as one 4 x f32 vector.
//
// NOTE(review): %envreg3 / %envreg4 are driver-defined special registers;
//   presumably they carry the per-dimension global work offset - confirm
//   against the runtime that produced this cache entry.
// NOTE(review): only the upper bound of x and y is tested. A negative x or y
//   would pass the guard and be used unclamped for the param_0 loads and the
//   param_2 store (only the param_1 index is clamped to >= 0). Verify that the
//   launch configuration cannot produce negative coordinates.
// ---------------------------------------------------------------------------
.entry ConcatImageFromPlanarKernel(
.param .u64 .ptr .global .align 4 ConcatImageFromPlanarKernel_param_0,
.param .u64 .ptr .global .align 16 ConcatImageFromPlanarKernel_param_1,
.param .u64 .ptr .global .align 16 ConcatImageFromPlanarKernel_param_2
)
{
// Virtual register pools: predicates, f32 values, 32-bit and 64-bit integers.
.reg .pred %p<11>;
.reg .f32 %f<17>;
.reg .b32 %r<23>;
.reg .b64 %rd<10>;
// Fetch the three buffer base addresses from the parameter space.
ld.param.u64 %rd1, [ConcatImageFromPlanarKernel_param_0];
ld.param.u64 %rd2, [ConcatImageFromPlanarKernel_param_1];
ld.param.u64 %rd3, [ConcatImageFromPlanarKernel_param_2];
// %r1 = x = tid.x + envreg3 + ntid.x * ctaid.x
mov.b32 %r3, %envreg3;
mov.u32 %r4, %ctaid.x;
mov.u32 %r5, %ntid.x;
mov.u32 %r6, %tid.x;
add.s32 %r7, %r6, %r3;
mad.lo.s32 %r1, %r5, %r4, %r7;
// %r2 = y = tid.y + envreg4 + ntid.y * ctaid.y
mov.u32 %r8, %ctaid.y;
mov.u32 %r9, %ntid.y;
mov.u32 %r10, %tid.y;
mov.b32 %r11, %envreg4;
add.s32 %r12, %r10, %r11;
mad.lo.s32 %r2, %r9, %r8, %r12;
// Guard: leave immediately when x > 2047 or y > 2047 (signed, upper bound only).
setp.gt.s32 %p1, %r1, 2047;
setp.gt.s32 %p2, %r2, 2047;
or.pred %p3, %p1, %p2;
@%p3 bra $L__BB0_2;
// %r14 = i = (y << 11) + x, i.e. y * 2048 + x.
shl.b32 %r13, %r2, 11;
add.s32 %r14, %r13, %r1;
// %rd5 = param_0 + 4*i (sign-extended index, 4-byte elements).
mul.wide.s32 %rd4, %r14, 4;
add.s64 %rd5, %rd1, %rd4;
// Clamp x and y to [0, 2047] for the param_1 access only.
max.s32 %r15, %r1, 0;
min.s32 %r16, %r15, 2047;
max.s32 %r17, %r2, 0;
min.s32 %r18, %r17, 2047;
// Combine clamped coordinates: (cy << 11) | cx. The low 11 bits of the shifted
// value are zero and cx fits in 11 bits, so the OR acts as an addition here.
shl.b32 %r19, %r18, 11;
or.b32 %r20, %r19, %r16;
// %rd7 = param_1 + 16 * index (zero-extended index, 16-byte elements).
mul.wide.u32 %rd6, %r20, 16;
add.s64 %rd7, %rd2, %rd6;
// Load the 4 x f32 vector through the non-coherent (.nc) global load path.
ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd7];
// Finiteness test: |c| >= +Inf (0f7F800000) with an unordered-aware compare
// (geu) is true for +/-Inf and for NaN.
abs.ftz.f32 %f6, %f1;
abs.ftz.f32 %f8, %f2;
abs.ftz.f32 %f10, %f3;
abs.ftz.f32 %f12, %f4;
setp.geu.ftz.f32 %p4, %f10, 0f7F800000;
setp.geu.ftz.f32 %p5, %f12, 0f7F800000;
setp.geu.ftz.f32 %p6, %f8, 0f7F800000;
setp.geu.ftz.f32 %p7, %f6, 0f7F800000;
// %r21 = -1 if component 1 or 2 is non-finite, else 0.
or.pred %p8, %p7, %p6;
selp.b32 %r21, -1, 0, %p8;
// %r22 = -1 if component 3 or 4 is non-finite, else %r21.
or.pred %p9, %p5, %p4;
selp.b32 %r22, -1, %r21, %p9;
// %p10 is true only when %r22 == 0, i.e. all four components are finite.
setp.gt.s32 %p10, %r22, -1;
// %rd9 = param_2 + 16*i (sign-extended index, 16-byte elements).
mul.wide.s32 %rd8, %r14, 16;
add.s64 %rd9, %rd3, %rd8;
// Fourth output component: original 4th input component if everything is
// finite, otherwise 0.0.
selp.f32 %f13, %f4, 0f00000000, %p10;
// Three scalar loads from param_0, spaced 16777216 bytes (2048*2048 f32) apart.
// They are issued from the highest offset to the lowest.
ld.global.nc.f32 %f14, [%rd5+33554432];
ld.global.nc.f32 %f15, [%rd5+16777216];
ld.global.nc.f32 %f16, [%rd5];
// Store {offset 0, offset +16777216, offset +33554432, selected 4th component}.
st.global.v4.f32 [%rd9], {%f16, %f15, %f14, %f13};
// Common exit for in-range and out-of-range threads.
$L__BB0_2:
ret;
}