New hat/docs/ptx.md

  1 Here is the Java source code for a kernel
  2 ```Java
  3 public static void mandel(KernelContext kc, S32Array2D s32Array2D, S32Array pallette, float offsetx, float offsety, float scale) {
  4         if (kc.x < kc.maxX) {
  5             float width = s32Array2D.width();
  6             float height = s32Array2D.height();
  7             float x = ((kc.x % s32Array2D.width()) * scale - (scale / 2f * width)) / width + offsetx;
  8             float y = ((kc.x / s32Array2D.width()) * scale - (scale / 2f * height)) / height + offsety;
  9             float zx = x;
 10             float zy = y;
 11             float new_zx;
 12             int colorIdx = 0;
 13             while ((colorIdx < pallette.length()) && (((zx * zx) + (zy * zy)) < 4f)) {
 14                 new_zx = ((zx * zx) - (zy * zy)) + x;
 15                 zy = (2f * zx * zy) + y;
 16                 zx = new_zx;
 17                 colorIdx++;
 18             }
 19             int color = colorIdx < pallette.length() ? pallette.array(colorIdx) : 0;
 20             s32Array2D.array(kc.x, color);
 21         }
 22     }
 23 
 24 ```
 25 
 26 And here is the babylon code model
 27 
 28 ```
 29 func @"mandel" @loc="39:5:file:/Users/grfrost/orahub/hat/examples/mandel/src/java/mandel/MandelCompute.java"
 30  (%0 : hat.KernelContext, %1 : hat.buffer.S32Array2D, %2 : hat.buffer.S32Array, %3 : float, %4 : float, %5 : float)void -> {
 31     %6 : Var<hat.KernelContext> = var %0 @"kc" ;
 32     %7 : Var<hat.buffer.S32Array2D> = var %1 @"s32Array2D" ;
 33     %8 : Var<hat.buffer.S32Array> = var %2 @"pallette" ;
 34     %9 : Var<float> = var %3 @"offsetx" ;
 35     %10 : Var<float> = var %4 @"offsety" ;
 36     %11 : Var<float> = var %5 @"scale" ;
 37     java.if
 38         ()boolean -> {
 39             %12 : hat.KernelContext = var.load %6 ;
 40             %13 : int = field.load %12 @"hat.KernelContext::x()int" ;
 41             %14 : hat.KernelContext = var.load %6 ;
 42             %15 : int = field.load %14 @"hat.KernelContext::maxX()int" ;
 43             %16 : boolean = lt %13 %15 ;
 44             yield %16 ;
 45         }
 46         ()void -> {
 47             %17 : hat.buffer.S32Array2D = var.load %7 ;
 48             %18 : int = invoke %17 @"hat.buffer.S32Array2D::width()int" ;
 49             %19 : float = conv %18 ;
 50             %20 : Var<float> = var %19 @"width" ;
 51             %21 : hat.buffer.S32Array2D = var.load %7 ;
 52             %22 : int = invoke %21 @"hat.buffer.S32Array2D::height()int" ;
 53             %23 : float = conv %22 ;
 54             %24 : Var<float> = var %23 @"height" ;
 55             %25 : hat.KernelContext = var.load %6 ;
 56             %26 : int = field.load %25 @"hat.KernelContext::x()int" ;
 57             %27 : hat.buffer.S32Array2D = var.load %7 ;
 58             %28 : int = invoke %27 @"hat.buffer.S32Array2D::width()int" ;
 59             %29 : int = mod %26 %28 ;
 60             %30 : float = conv %29 ;
 61             %31 : float = var.load %11 ;
 62             %32 : float = mul %30 %31 ;
 63             %33 : float = var.load %11 ;
 64             %34 : float = constant @"2.0" ;
 65             %35 : float = div %33 %34 ;
 66             %36 : float = var.load %20 ;
 67             %37 : float = mul %35 %36 ;
 68             %38 : float = sub %32 %37 ;
 69             %39 : float = var.load %20 ;
 70             %40 : float = div %38 %39 ;
 71             %41 : float = var.load %9 ;
 72             %42 : float = add %40 %41 ;
 73             %43 : Var<float> = var %42 @"x" ;
 74             %44 : hat.KernelContext = var.load %6 ;
 75             %45 : int = field.load %44 @"hat.KernelContext::x()int" ;
 76             %46 : hat.buffer.S32Array2D = var.load %7 ;
 77             %47 : int = invoke %46 @"hat.buffer.S32Array2D::width()int" ;
 78             %48 : int = div %45 %47 ;
 79             %49 : float = conv %48 ;
 80             %50 : float = var.load %11 ;
 81             %51 : float = mul %49 %50 ;
 82             %52 : float = var.load %11 ;
 83             %53 : float = constant @"2.0" ;
 84             %54 : float = div %52 %53 ;
 85             %55 : float = var.load %24 ;
 86             %56 : float = mul %54 %55 ;
 87             %57 : float = sub %51 %56 ;
 88             %58 : float = var.load %24 ;
 89             %59 : float = div %57 %58 ;
 90             %60 : float = var.load %10 ;
 91             %61 : float = add %59 %60 ;
 92             %62 : Var<float> = var %61 @"y" ;
 93             %63 : float = var.load %43 ;
 94             %64 : Var<float> = var %63 @"zx" ;
 95             %65 : float = var.load %62 ;
 96             %66 : Var<float> = var %65 @"zy" ;
 97             %67 : float = constant @"0.0" ;
 98             %68 : Var<float> = var %67 @"new_zx" ;
 99             %69 : int = constant @"0" ;
100             %70 : Var<int> = var %69 @"colorIdx" ;
101             java.while
102                 ()boolean -> {
103                     %71 : boolean = java.cand
104                         ()boolean -> {
105                             %72 : int = var.load %70 ;
106                             %73 : hat.buffer.S32Array = var.load %8 ;
107                             %74 : int = invoke %73 @"hat.buffer.S32Array::length()int" ;
108                             %75 : boolean = lt %72 %74 ;
109                             yield %75 ;
110                         }
111                         ()boolean -> {
112                             %76 : float = var.load %64 ;
113                             %77 : float = var.load %64 ;
114                             %78 : float = mul %76 %77 ;
115                             %79 : float = var.load %66 ;
116                             %80 : float = var.load %66 ;
117                             %81 : float = mul %79 %80 ;
118                             %82 : float = add %78 %81 ;
119                             %83 : float = constant @"4.0" ;
120                             %84 : boolean = lt %82 %83 ;
121                             yield %84 ;
122                         };
123                     yield %71 ;
124                 }
125                 ()void -> {
126                     %85 : float = var.load %64 ;
127                     %86 : float = var.load %64 ;
128                     %87 : float = mul %85 %86 ;
129                     %88 : float = var.load %66 ;
130                     %89 : float = var.load %66 ;
131                     %90 : float = mul %88 %89 ;
132                     %91 : float = sub %87 %90 ;
133                     %92 : float = var.load %43 ;
134                     %93 : float = add %91 %92 ;
135                     var.store %68 %93 ;
136                     %94 : float = constant @"2.0" ;
137                     %95 : float = var.load %64 ;
138                     %96 : float = mul %94 %95 ;
139                     %97 : float = var.load %66 ;
140                     %98 : float = mul %96 %97 ;
141                     %99 : float = var.load %62 ;
142                     %100 : float = add %98 %99 ;
143                     var.store %66 %100 ;
144                     %101 : float = var.load %68 ;
145                     var.store %64 %101 ;
146                     %102 : int = var.load %70 ;
147                     %103 : int = constant @"1" ;
148                     %104 : int = add %102 %103 ;
149                     var.store %70 %104 ;
150                     java.continue ;
151                 };
152             %105 : int = java.cexpression
153                 ()boolean -> {
154                     %106 : int = var.load %70 ;
155                     %107 : hat.buffer.S32Array = var.load %8 ;
156                     %108 : int = invoke %107 @"hat.buffer.S32Array::length()int" ;
157                     %109 : boolean = lt %106 %108 ;
158                     yield %109 ;
159                 }
160                 ()int -> {
161                     %110 : hat.buffer.S32Array = var.load %8 ;
162                     %111 : int = var.load %70 ;
163                     %112 : long = conv %111 ;
164                     %113 : int = invoke %110 %112 @"hat.buffer.S32Array::array(long)int" ;
165                     yield %113 ;
166                 }
167                 ()int -> {
168                     %114 : int = constant @"0" ;
169                     yield %114 ;
170                 };
171             %115 : Var<int> = var %105 @"color" ;
172             %116 : hat.buffer.S32Array2D = var.load %7 ;
173             %117 : hat.KernelContext = var.load %6 ;
174             %118 : int = field.load %117 @"hat.KernelContext::x()int" ;
175             %119 : long = conv %118 ;
176             %120 : int = var.load %115 ;
177             invoke %116 %119 %120 @"hat.buffer.S32Array2D::array(long, int)void" ;
178             yield ;
179         }
180         ()void -> {
181             yield;
182         };
183     return ;
184 };
185 
186 ```
187 From the above we can generate C99 style CUDA code
188 
189 
190 Here is the Cuda C99 code generated
191 
192 ```C
193 typedef struct KernelContext_s{
194     int x;
195     int maxX;
196 }KernelContext_t;
197 
198 typedef struct S32Array2D_s{
199     int width;
200     int height;
201     int array[0];
202 }S32Array2D_t;
203 
204 typedef struct S32Array_s{
205     int length;
206     int array[0];
207 }S32Array_t;
208 
209 
210 extern "C" __global__ void mandel(
211      S32Array2D_t* s32Array2D,  S32Array_t* pallette, float offsetx, float offsety, float scale
212 ){
213     KernelContext_t kc;
214     kc.x=blockIdx.x*blockDim.x+threadIdx.x;
215     kc.maxX=gridDim.x*blockDim.x;
216     if(kc.x<kc.maxX){
217         float width = (float)s32Array2D->width;
218         float height = (float)s32Array2D->height;
219         float x = ((float)(kc.x%s32Array2D->width)*scale-scale/2.0*width)/width+offsetx;
220         float y = ((float)(kc.x/s32Array2D->width)*scale-scale/2.0*height)/height+offsety;
221         float zx = x;
222         float zy = y;
223         float new_zx = 0.0;
224         int colorIdx = 0;
225         while(colorIdx<pallette->length && zx*zx+zy*zy<4.0){
226             new_zx=zx*zx-zy*zy+x;
227             zy=2.0*zx*zy+y;
228             zx=new_zx;
229             colorIdx=colorIdx+1;
230         }
231         int color = colorIdx<pallette->length?pallette->array[(long)colorIdx]:0;
232         s32Array2D->array[(long)kc.x]=color;
233     }
234     return;
235 }
236 ```
237 
238 Which we can convert to ptx using the nvcc compiler (on a NVDIA platform)
239 
240 `nvcc -ptx mandel.cu -o mandel.ptx`
241 
242 ```
243 //
244 // Generated by NVIDIA NVVM Compiler
245 //
246 // Compiler Build ID: CL-33191640
247 // Cuda compilation tools, release 12.2, V12.2.140
248 // Based on NVVM 7.0.1
249 //
250 
251 .version 8.2
252 .target sm_52
253 .address_size 64
254 
255         // .globl       mandel
256 
257 .visible .entry mandel(
258         .param .u64 mandel_param_0,
259         .param .u64 mandel_param_1,
260         .param .f32 mandel_param_2,
261         .param .f32 mandel_param_3,
262         .param .f32 mandel_param_4
263 )
264 {
265         .reg .pred      %p<9>;
266         .reg .f32       %f<29>;
267         .reg .b32       %r<24>;
268         .reg .f64       %fd<22>;
269         .reg .b64       %rd<9>;
270 
271 
272         ld.param.u64    %rd3, [mandel_param_0];
273         ld.param.u64    %rd4, [mandel_param_1];
274         ld.param.f32    %f13, [mandel_param_2];
275         ld.param.f32    %f14, [mandel_param_3];
276         ld.param.f32    %f15, [mandel_param_4];
277         cvta.to.global.u64      %rd1, %rd4;
278         cvta.to.global.u64      %rd2, %rd3;
279         mov.u32         %r8, %ntid.x;
280         mov.u32         %r9, %ctaid.x;
281         mov.u32         %r10, %tid.x;
282         mad.lo.s32      %r1, %r9, %r8, %r10;
283         mov.u32         %r11, %nctaid.x;
284         mul.lo.s32      %r12, %r11, %r8;
285         setp.ge.s32     %p1, %r1, %r12;
286         @%p1 bra        $L__BB0_7;
287 
288         ld.global.u32   %r14, [%rd2];
289         cvt.rn.f32.s32  %f16, %r14;
290         ld.global.u32   %r15, [%rd2+4];
291         cvt.rn.f32.s32  %f17, %r15;
292         div.s32         %r16, %r1, %r14;
293         mul.lo.s32      %r17, %r16, %r14;
294         sub.s32         %r18, %r1, %r17;
295         cvt.rn.f32.s32  %f18, %r18;
296         mul.f32         %f19, %f18, %f15;
297         cvt.f64.f32     %fd2, %f19;
298         cvt.f64.f32     %fd3, %f15;
299         mul.f64         %fd4, %fd3, 0d3FE0000000000000;
300         cvt.f64.f32     %fd5, %f16;
301         mul.f64         %fd6, %fd4, %fd5;
302         sub.f64         %fd7, %fd2, %fd6;
303         div.rn.f64      %fd8, %fd7, %fd5;
304         cvt.f64.f32     %fd9, %f13;
305         add.f64         %fd10, %fd8, %fd9;
306         cvt.rn.f32.f64  %f1, %fd10;
307         cvt.rn.f32.s32  %f20, %r16;
308         mul.f32         %f21, %f20, %f15;
309         cvt.f64.f32     %fd11, %f21;
310         cvt.f64.f32     %fd12, %f17;
311         mul.f64         %fd13, %fd4, %fd12;
312         sub.f64         %fd14, %fd11, %fd13;
313         div.rn.f64      %fd15, %fd14, %fd12;
314         cvt.f64.f32     %fd16, %f14;
315         add.f64         %fd17, %fd15, %fd16;
316         cvt.rn.f32.f64  %f27, %fd17;
317         ld.global.u32   %r2, [%rd1];
318         setp.lt.s32     %p2, %r2, 1;
319         mov.u32         %r23, 0;
320         mul.f32         %f26, %f1, %f1;
321         mul.f32         %f25, %f27, %f27;
322         add.f32         %f22, %f26, %f25;
323         setp.geu.f32    %p3, %f22, 0f40800000;
324         or.pred         %p4, %p2, %p3;
325         mov.u32         %r22, %r23;
326         @%p4 bra        $L__BB0_4;
327 
328         cvt.f64.f32     %fd1, %f27;
329         mov.f32         %f28, %f1;
330 
331 $L__BB0_3:
332         sub.f32         %f23, %f26, %f25;
333         add.f32         %f9, %f23, %f1;
334         cvt.f64.f32     %fd18, %f28;
335         add.f64         %fd19, %fd18, %fd18;
336         cvt.f64.f32     %fd20, %f27;
337         fma.rn.f64      %fd21, %fd19, %fd20, %fd1;
338         cvt.rn.f32.f64  %f27, %fd21;
339         add.s32         %r22, %r22, 1;
340         setp.lt.s32     %p5, %r22, %r2;
341         mul.f32         %f26, %f9, %f9;
342         mul.f32         %f25, %f27, %f27;
343         add.f32         %f24, %f26, %f25;
344         setp.lt.f32     %p6, %f24, 0f40800000;
345         and.pred        %p7, %p5, %p6;
346         mov.f32         %f28, %f9;
347         @%p7 bra        $L__BB0_3;
348 
349 $L__BB0_4:
350         setp.ge.s32     %p8, %r22, %r2;
351         @%p8 bra        $L__BB0_6;
352 
353         mul.wide.s32    %rd5, %r22, 4;
354         add.s64         %rd6, %rd1, %rd5;
355         ld.global.u32   %r23, [%rd6+4];
356 
357 $L__BB0_6:
358         mul.wide.s32    %rd7, %r1, 4;
359         add.s64         %rd8, %rd2, %rd7;
360         st.global.u32   [%rd8+8], %r23;
361 
362 $L__BB0_7:
363         ret;
364 
365 }
366 ```
367 But we would like to create the ptx directly from the babylon model, without being on an NVidia platform.  Probably the lowered model
368 
369 We will probably use the lowered babylon model
370 
371 ```
372 func @"mandel" @loc="39:5:file:/Users/grfrost/orahub/hat/examples/mandel/src/java/mandel/MandelCompute.java"
373   (%0 : hat.KernelContext, %1 : hat.buffer.S32Array2D, %2 : hat.buffer.S32Array, %3 : float, %4 : float, %5 : float)void -> {
374     %6 : Var<hat.KernelContext> = var %0 @"kc" ;
375     %7 : Var<hat.buffer.S32Array2D> = var %1 @"s32Array2D" ;
376     %8 : Var<hat.buffer.S32Array> = var %2 @"pallette" ;
377     %9 : Var<float> = var %3 @"offsetx" ;
378     %10 : Var<float> = var %4 @"offsety" ;
379     %11 : Var<float> = var %5 @"scale" ;
380     %12 : hat.KernelContext = var.load %6 ;
381     %13 : int = field.load %12 @"hat.KernelContext::x()int" ;
382     %14 : hat.KernelContext = var.load %6 ;
383     %15 : int = field.load %14 @"hat.KernelContext::maxX()int" ;
384     %16 : boolean = lt %13 %15 ;
385     cbranch %16 ^block_0 ^block_1;
386 
387   ^block_0:
388     %17 : hat.buffer.S32Array2D = var.load %7 ;
389     %18 : int = invoke %17 @"hat.buffer.S32Array2D::width()int" ;
390     %19 : float = conv %18 ;
391     %20 : Var<float> = var %19 @"width" ;
392     %21 : hat.buffer.S32Array2D = var.load %7 ;
393     %22 : int = invoke %21 @"hat.buffer.S32Array2D::height()int" ;
394     %23 : float = conv %22 ;
395     %24 : Var<float> = var %23 @"height" ;
396     %25 : hat.KernelContext = var.load %6 ;
397     %26 : int = field.load %25 @"hat.KernelContext::x()int" ;
398     %27 : hat.buffer.S32Array2D = var.load %7 ;
399     %28 : int = invoke %27 @"hat.buffer.S32Array2D::width()int" ;
400     %29 : int = mod %26 %28 ;
401     %30 : float = conv %29 ;
402     %31 : float = var.load %11 ;
403     %32 : float = mul %30 %31 ;
404     %33 : float = var.load %11 ;
405     %34 : float = constant @"2.0" ;
406     %35 : float = div %33 %34 ;
407     %36 : float = var.load %20 ;
408     %37 : float = mul %35 %36 ;
409     %38 : float = sub %32 %37 ;
410     %39 : float = var.load %20 ;
411     %40 : float = div %38 %39 ;
412     %41 : float = var.load %9 ;
413     %42 : float = add %40 %41 ;
414     %43 : Var<float> = var %42 @"x" ;
415     %44 : hat.KernelContext = var.load %6 ;
416     %45 : int = field.load %44 @"hat.KernelContext::x()int" ;
417     %46 : hat.buffer.S32Array2D = var.load %7 ;
418     %47 : int = invoke %46 @"hat.buffer.S32Array2D::width()int" ;
419     %48 : int = div %45 %47 ;
420     %49 : float = conv %48 ;
421     %50 : float = var.load %11 ;
422     %51 : float = mul %49 %50 ;
423     %52 : float = var.load %11 ;
424     %53 : float = constant @"2.0" ;
425     %54 : float = div %52 %53 ;
426     %55 : float = var.load %24 ;
427     %56 : float = mul %54 %55 ;
428     %57 : float = sub %51 %56 ;
429     %58 : float = var.load %24 ;
430     %59 : float = div %57 %58 ;
431     %60 : float = var.load %10 ;
432     %61 : float = add %59 %60 ;
433     %62 : Var<float> = var %61 @"y" ;
434     %63 : float = var.load %43 ;
435     %64 : Var<float> = var %63 @"zx" ;
436     %65 : float = var.load %62 ;
437     %66 : Var<float> = var %65 @"zy" ;
438     %67 : float = constant @"0.0" ;
439     %68 : Var<float> = var %67 @"new_zx" ;
440     %69 : int = constant @"0" ;
441     %70 : Var<int> = var %69 @"colorIdx" ;
442     branch ^block_2;
443 
444   ^block_2:
445     %71 : int = var.load %70 ;
446     %72 : hat.buffer.S32Array = var.load %8 ;
447     %73 : int = invoke %72 @"hat.buffer.S32Array::length()int" ;
448     %74 : boolean = lt %71 %73 ;
449     cbranch %74 ^block_3 ^block_4(%74);
450 
451   ^block_3:
452     %75 : float = var.load %64 ;
453     %76 : float = var.load %64 ;
454     %77 : float = mul %75 %76 ;
455     %78 : float = var.load %66 ;
456     %79 : float = var.load %66 ;
457     %80 : float = mul %78 %79 ;
458     %81 : float = add %77 %80 ;
459     %82 : float = constant @"4.0" ;
460     %83 : boolean = lt %81 %82 ;
461     branch ^block_4(%83);
462 
463   ^block_4(%84 : boolean):
464     cbranch %84 ^block_5 ^block_6;
465 
466   ^block_5:
467     %85 : float = var.load %64 ;
468     %86 : float = var.load %64 ;
469     %87 : float = mul %85 %86 ;
470     %88 : float = var.load %66 ;
471     %89 : float = var.load %66 ;
472     %90 : float = mul %88 %89 ;
473     %91 : float = sub %87 %90 ;
474     %92 : float = var.load %43 ;
475     %93 : float = add %91 %92 ;
476     var.store %68 %93 ;
477     %94 : float = constant @"2.0" ;
478     %95 : float = var.load %64 ;
479     %96 : float = mul %94 %95 ;
480     %97 : float = var.load %66 ;
481     %98 : float = mul %96 %97 ;
482     %99 : float = var.load %62 ;
483     %100 : float = add %98 %99 ;
484     var.store %66 %100 ;
485     %101 : float = var.load %68 ;
486     var.store %64 %101 ;
487     %102 : int = var.load %70 ;
488     %103 : int = constant @"1" ;
489     %104 : int = add %102 %103 ;
490     var.store %70 %104 ;
491     branch ^block_2;
492 
493   ^block_6:
494     %105 : int = var.load %70 ;
495     %106 : hat.buffer.S32Array = var.load %8 ;
496     %107 : int = invoke %106 @"hat.buffer.S32Array::length()int" ;
497     %108 : boolean = lt %105 %107 ;
498     cbranch %108 ^block_7 ^block_8;
499 
500   ^block_7:
501     %109 : hat.buffer.S32Array = var.load %8 ;
502     %110 : int = var.load %70 ;
503     %111 : long = conv %110 ;
504     %112 : int = invoke %109 %111 @"hat.buffer.S32Array::array(long)int" ;
505     branch ^block_9(%112);
506 
507   ^block_8:
508     %113 : int = constant @"0" ;
509     branch ^block_9(%113);
510 
511   ^block_9(%114 : int):
512     %115 : Var<int> = var %114 @"color" ;
513     %116 : hat.buffer.S32Array2D = var.load %7 ;
514     %117 : hat.KernelContext = var.load %6 ;
515     %118 : int = field.load %117 @"hat.KernelContext::x()int" ;
516     %119 : long = conv %118 ;
517     %120 : int = var.load %115 ;
518     invoke %116 %119 %120 @"hat.buffer.S32Array2D::array(long, int)void" ;
519     branch ^block_10;
520 
521   ^block_1:
522     branch ^block_10;
523 
524   ^block_10:
525     return ;
526 };
527 ```
528