1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 #pragma once
 27 
#include <sys/time.h>
#include <unistd.h>

#include <bitset>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <map>
#include <stack>
#include <vector>

#include "strutil.h"
#include "config.h"
 42 
 // Platform shim. SNPRINTF expands to the bounded formatter available on the
 // target: _snprintf when _WIN32 is defined, snprintf everywhere else.
 // Non-Apple builds additionally include <malloc.h>; Windows builds also
 // include "windows.h".
 // NOTE(review): MSVC's _snprintf does not guarantee NUL termination on
 // truncation, unlike snprintf - confirm callers do not rely on it.
 43 #ifdef __APPLE__
 44 #define SNPRINTF snprintf
 45 #else
 46 #include <malloc.h>
 47 #if defined (_WIN32)
 48 #include "windows.h"
 49 #define SNPRINTF _snprintf
 50 #else
 51 #define SNPRINTF  snprintf
 52 #endif
 53 #endif
 54 
 // Short scalar aliases used by the structures below.
 // These are aliases of built-in types, so their widths follow the platform
 // ABI rather than the number in the name. s8_t, byte, boolean and z1_t are
 // all plain char, whose signedness is implementation-defined.
 // NOTE(review): s64_t/u64_t are long/unsigned long, which are 64 bits wide
 // only on LP64 targets; on LLP64 (Windows) they would be 32 bits - confirm
 // whether Windows is a supported target.
 55 typedef char s8_t;
 56 typedef char byte;
 57 typedef char boolean;
 58 typedef char z1_t;
 59 typedef unsigned char u8_t;
 60 typedef short s16_t;
 61 typedef unsigned short u16_t;
 62 typedef unsigned int u32_t;
 63 typedef int s32_t;
 64 typedef float f32_t;
 65 typedef double f64_t;
 66 typedef long s64_t;
 67 typedef unsigned long u64_t;
 68 
    // Declared here, defined in another translation unit. Takes a start
    // address and a length; presumably prints buflen bytes as hex - confirm
    // against the definition.
 69 extern void hexdump(void *ptr, int buflen);
 70 
 // A character buffer paired with its length, with helpers to write it to and
 // read it from a named file. All member functions are only declared here;
 // their definitions live elsewhere.
 // NOTE(review): isCopy presumably records whether this object owns `text`
 // and must release it in the destructor - confirm against the definition.
 // If it does, copying a Text would need care (no copy operations are
 // declared here).
 71 class Text {
 72 public:
 73     size_t len;    // length associated with `text` (units not shown here; presumably bytes)
 74     char *text;    // the character data
 75     bool isCopy;   // see NOTE(review) above
 76 
        // Wraps an existing buffer of the given length.
 77     Text(size_t len, char *text, bool isCopy);
 78 
        // Wraps an existing buffer; how `len` is derived is decided by the definition.
 79     Text(char *text, bool isCopy);
 80 
        // Constructs from a length only (explicit, so size_t does not convert implicitly).
 81     explicit Text(size_t len);
 82 
        // Writes this text to `filename`; does not modify the object.
 83     void write(const std::string &filename) const;
 84 
        // Reads from `filename` into this object.
 85     void read(const std::string &filename);
 86 
        // Virtual so that derived classes (e.g. Log) can be destroyed through a Text*.
 87     virtual ~Text();
 88 };
 89 
 // A Text specialisation used for log output. It adds no members of its own;
 // both constructors are defined elsewhere and the destructor is defaulted.
 90 class Log : public Text {
 91 public:
        // Constructs a log from a length only.
 92     explicit Log(size_t len);
 93 
        // Constructs a log from an existing character buffer.
 94     explicit Log(char *text);
 95 
 96     ~Log() override = default;
 97 };
 98 
 // Access-mode flags carried in Buffer_s::access. The values must stay in
 // step with hat/buffer/ArgArray.java. RW_BYTE is the bitwise OR of the
 // read-only and write-only bits; bit 0 is not used by any flag.
 99 #define UNKNOWN_BYTE 0
100 #define RO_BYTE (1<<1)
101 #define WO_BYTE (1<<2)
102 #define RW_BYTE (RO_BYTE|WO_BYTE)
103 
// Describes one buffer argument: where it lives, how large it is and how the
// kernel is allowed to access it. Used as the '&' member of Value_u.
// NOTE(review): field order and types presumably mirror a layout defined on
// the Java side - do not reorder without checking hat/buffer/ArgArray.java.
104 struct Buffer_s {
105     void *memorySegment; // Address of a Buffer/MemorySegment
106     long sizeInBytes;    // The size of the memory segment in bytes
107     u8_t access;         // see hat/buffer/ArgArray.java  UNKNOWN_BYTE=0, RO_BYTE =1<<1,WO_BYTE =1<<2,RW_BYTE =RO_BYTE|WO_BYTE;
108 };
109 
// Payload of a KernelArg. Exactly one member is meaningful at a time; which
// one is selected by KernelArg::variant, using the tag character shown next
// to each member.
// NOTE(review): the member named s8 is declared with the unsigned type u8_t.
110 union Value_u {
111     boolean z1; // 'Z'
112     u8_t s8; // 'B'
113     u16_t u16; // 'C'
114     s16_t s16; // 'S'
115     u16_t x16; // 'C' or 'S'   // this is never used
116     s32_t s32; // 'I'
117     s32_t x32; // 'I' or 'F'   // this is never used
118     f32_t f32; // 'F'
119     f64_t f64; // 'D'
120     s64_t s64; // 'J'
121     s64_t x64; // 'D' or 'J'   // this is never used
122     Buffer_s buffer; // '&'
123 };
124 
125 struct KernelArg {
126     u32_t idx; // 0..argc
127     u8_t variant; // which variant 'I','Z','S','J','F', '&' implies Buffer/MemorySegment
128     u8_t pad8[8];
129     Value_u value;
130     u8_t pad6[6];
131 
132     size_t size() const {
133         size_t sz;
134         switch (variant) {
135             case 'I':
136             case 'F':
137                 sz = sizeof(u32_t);
138                 break;
139             case 'S':
140             case 'C':
141                 sz = sizeof(u16_t);
142                 break;
143             case 'D':
144             case 'J':
145                 return sizeof(u64_t);
146             case 'B':
147                 return sizeof(u8_t);
148             default:
149                 std::cerr << "Bad variant " << variant << "arg::size" << std::endl;
150                 exit(1);
151         }
152         return sz;
153     }
154 };
155 
156 struct BufferState {
157     static constexpr long MAGIC = 0x4a71facebffab175;   // This magic number is a delimiter to
158                                                         // check the length of the buffer as follows:
159                                                         // *(bufferStart+(bufferLen - sizeof(bufferState)) == MAGIC
160     static constexpr int NO_STATE = 0;
161     static constexpr int NEW_STATE = 1;
162     static constexpr int HOST_OWNED = 2;
163     static constexpr int DEVICE_OWNED = 3;
164     static constexpr int DEVICE_VALID_HOST_HAS_COPY = 4;
165     const static char *stateNames[]; // See below for out of line definition
166 
167     long magic1;
168     void *ptr;
169     long length;
170     int bits;
171     mutable int state;
172     void *vendorPtr;
173     long magic2;
174 
175     bool ok() const {
176         return ((magic1 == MAGIC) && (magic2 == MAGIC));
177     }
178 
179     void setState(int newState) {
180         state = newState;
181     }
182 
183     int getState() const {
184         return state;
185     }
186 
187     void dump(const char *msg) const {
188         if (ok()) {
189             printf("{%s,ptr:%016lx,length: %016lx,  state:%08x, vendorPtr:%016lx}\n", msg, (long) ptr, length, state,
190                    (long) vendorPtr);
191         } else {
192             printf("%s bad magic \n", msg);
193             printf("(magic1:%016lx,", magic1);
194             printf("{%s, ptr:%016lx, length: %016lx,  state:%08x, vendorPtr:%016lx}", msg, (long) ptr, length, state,
195                    (long) vendorPtr);
196             printf("magic2:%016lx)\n", magic2);
197         }
198     }
199 
200     static BufferState *of(void *ptr, size_t sizeInBytes) {
201         return reinterpret_cast<BufferState *>(static_cast<char *>(ptr) + sizeInBytes - sizeof(BufferState));
202     }
203 
204     static BufferState *of(const KernelArg *arg) {
205         // access?
206         BufferState *bufferState = of(
207             arg->value.buffer.memorySegment,
208             arg->value.buffer.sizeInBytes
209         );
210 
211         // Sanity check the buffers
212         // These sanity check finds errors passing memory segments which are not Buffers
213         if (bufferState->ptr != arg->value.buffer.memorySegment) {
214             std::cerr << "bufferState->ptr !=  arg->value.buffer.memorySegment" << std::endl;
215 
216             // A bit brutal to stop the VM? We can throw an exception and handle it in the Java side?
217             std::exit(1);
218         }
219 
220         if ((bufferState->vendorPtr == nullptr) && (bufferState->state != NEW_STATE)) {
221             std::cerr << "Warning:  Unexpected initial state for buffer "
222                     << " state=" << bufferState->state << " '"
223                     << stateNames[bufferState->state] << "'"
224                     << " vendorPtr" << bufferState->vendorPtr << std::endl;
225         }
226         // End of sanity checks
227         return bufferState;
228     }
229 };
230 
// Out-of-line definition of BufferState::stateNames. It is compiled only when
// shared_cpp is defined, so that the array is defined once even though this
// header is included from several files.
// Entries are indexed by the BufferState state constants (NO_STATE = 0 ...
// DEVICE_VALID_HOST_HAS_COPY = 4); keep the order and count in step with them.
231 #ifdef shared_cpp
232 const char *BufferState::stateNames[] = {
233     "NO_STATE",
234     "NEW_STATE",
235     "HOST_OWNED",
236     "DEVICE_OWNED",
237     "DEVICE_VALID_HOST_HAS_COPY"
238 };
239 #endif
240 
// Header of the argument array: a count followed directly by `argc`
// KernelArg records.
// argv is declared with length 0 (a compiler extension) and is indexed past
// its declared bound, so sizeof(ArgArray_s) covers the header only.
// pad12 is explicit padding between the count and the first argument.
// NOTE(review): the layout presumably mirrors hat/buffer/ArgArray.java - confirm before changing.
241 struct ArgArray_s {
242     u32_t argc;
243     u8_t pad12[12];
244     KernelArg argv[0/*argc*/];
245 };
246 
247 class ArgSled {
248 private:
249     ArgArray_s *argArray;
250 
251 public:
252     int argc() const {
253         return argArray->argc;
254     }
255 
256     KernelArg *arg(int n) const {
257         KernelArg *a = (argArray->argv + n);
258         return a;
259     }
260 
261     void hexdumpArg(int n) const {
262         hexdump(arg(n), sizeof(KernelArg));
263     }
264 
265     void dumpArg(int n) const {
266         KernelArg *a = arg(n);
267         int idx = (int) a->idx;
268         std::cout << "arg[" << idx << "]";
269         char variant = (char) a->variant;
270         switch (variant) {
271             case 'F':
272                 std::cout << " f32 " << a->value.f32 << std::endl;
273                 break;
274             case 'I':
275                 std::cout << " s32 " << a->value.s32 << std::endl;
276                 break;
277             case 'D':
278                 std::cout << " f64 " << a->value.f64 << std::endl;
279                 break;
280             case 'J':
281                 std::cout << " s64 " << a->value.s64 << std::endl;
282                 break;
283             case 'C':
284                 std::cout << " u16 " << a->value.u16 << std::endl;
285                 break;
286             case 'S':
287                 std::cout << " s16 " << a->value.s32 << std::endl;
288                 break;
289             case 'Z':
290                 std::cout << " z1 " << a->value.z1 << std::endl;
291                 break;
292             case '&':
293                 std::cout << " buffer {"
294                         << " void *address = 0x" << std::hex << (long) a->value.buffer.memorySegment << std::dec
295                         << ", long bytesSize= 0x" << std::hex << (long) a->value.buffer.sizeInBytes << std::dec
296                         << ", char access= 0x" << std::hex << (unsigned char) a->value.buffer.access << std::dec
297                         << "}" << std::endl;
298                 break;
299             default:
300                 std::cout << (char) variant << std::endl;
301                 break;
302         }
303     }
304 
305     void *afterArgsPtrPtr() const {
306         KernelArg *a = arg(argc());
307         return (void *) a;
308     }
309 
310     int *schemaLenPtr() const {
311         int *schemaLenP = (int *) ((char *) afterArgsPtrPtr() /*+ sizeof(void *) */);
312         return schemaLenP;
313     }
314 
315     int schemaLen() const {
316         return *schemaLenPtr();
317     }
318 
319     char *schema() const {
320         int *schemaLenP = ((int *) ((char *) afterArgsPtrPtr() /*+ sizeof(void *)*/) + 1);
321         return (char *) schemaLenP;
322     }
323 
324     explicit ArgSled(ArgArray_s *argArray)
325         : argArray(argArray) {
326     }
327 };
328 
329 
// Simple stopwatch with microsecond resolution.
//
// Call start(), do the work, then call end(); the elapsed time is returned and
// also left in elapsed_us. Time is taken from a monotonic clock, so wall-clock
// adjustments (NTP, manual changes) between start() and end() cannot produce a
// negative - and, in an unsigned field, enormous - reading.
class Timer {
    using Clock = std::chrono::steady_clock;

    Clock::time_point startTime{};
    Clock::time_point endTime{};

public:
    // Microseconds measured by the most recent end() call; 0 before the first one.
    unsigned long elapsed_us{};

    Timer() = default;

    // Records the starting instant.
    void start() {
        startTime = Clock::now();
    }

    // Records the ending instant and returns the microseconds elapsed since
    // the last start().
    unsigned long end() {
        endTime = Clock::now();
        const auto delta = std::chrono::duration_cast<std::chrono::microseconds>(endTime - startTime);
        elapsed_us = static_cast<unsigned long>(delta.count());
        return elapsed_us;
    }
};
350 
351 
352 //extern void hexdump(void *ptr, int buflen);
353 
// Namespace-like holder for argument-array diagnostics.
354 class Sled {
355 public:
        // Writes a description of the argument array at `argArray` to `out`.
        // Defined elsewhere; presumably `argArray` points at an ArgArray_s - confirm.
356     static void show(std::ostream &out, void *argArray);
357 };
358 
// Plain record describing the iteration space of a kernel dispatch: the number
// of dimensions plus, for each of x/y/z, a global index and size, a local
// index and size, and a group index.
// No member has an initializer, so a default-constructed KernelContext holds
// indeterminate values until every field is assigned.
// NOTE(review): member order is presumably shared with another layout
// (Java side / device code) - confirm before reordering.
359 class KernelContext {
360 public:
361 
362     // Dimensions of the kernel (1D, 2D or 3D)
363     int dimensions;
364 
365     // global index
366     int gix;
367     int giy;
368     int giz;
369 
370     // global sizes
371     int gsx;
372     int gsy;
373     int gsz;
374 
375     // local index
376     int lix;
377     int liy;
378     int liz;
379 
380     // local size
381     int lsx;
382     int lsy;
383     int lsz;
384 
385     // Group index
386     int bix;
387     int biy;
388     int biz;
389 };
390 
391 class Backend {
392 public:
393     class Config final : public BasicConfig {
394     public:
395         explicit Config(int mode);
396 
397         ~Config() override;
398     };
399 
400     class Buffer {
401     public:
402         Backend *backend;
403         BufferState *bufferState;
404 
405         Buffer(Backend *backend, BufferState *bufferState)
406             : backend(backend), bufferState(bufferState) {
407         }
408 
409         virtual ~Buffer() = default;
410     };
411 
412     class CompilationUnit {
413     public:
414         class Kernel {
415         public:
416             char *name;
417 
418             CompilationUnit *compilationUnit;
419 
420             virtual bool setArg(KernelArg *arg, Buffer *openCLBuffer) = 0;
421 
422             virtual bool setArg(KernelArg *arg) = 0;
423 
424             virtual long ndrange(void *argArray) final;
425 
426             Kernel(CompilationUnit *compilationUnit, char *name)
427                 : name(strutil::clone(name)), compilationUnit(compilationUnit) {
428             }
429 
430             virtual ~Kernel() {
431                 delete[] name;
432             }
433         };
434 
435     public:
436         Backend *backend;
437         char *src;
438         char *log;
439         bool ok;
440 
441         virtual Kernel *getKernel(int nameLen, char *name) = 0;
442 
443         virtual bool compilationUnitOK() final {
444             return ok;
445         }
446 
447         CompilationUnit(Backend *backend, char *src, char *log, bool ok)
448             : backend(backend), src(src), log(log), ok(ok) {
449         }
450 
451         virtual ~CompilationUnit() {
452             delete[] src;
453             delete[] log;
454         };
455     };
456 
457     class Queue {
458     public:
459         Backend *backend;
460 
461         explicit Queue(Backend *backend);
462 
463         virtual void wait() = 0;
464 
465         virtual void release() = 0;
466 
467         virtual void computeStart() = 0;
468 
469         virtual void computeEnd() = 0;
470 
471         virtual void copyToDevice(Buffer *buffer) =0;
472 
473         virtual void copyFromDevice(Buffer *buffer) =0;
474 
475         virtual void dispatch(KernelContext *kernelContext, CompilationUnit::Kernel *kernel) = 0;
476 
477         virtual ~Queue();
478     };
479 
480     class ProfilableQueue : public Queue {
481     public:
482         static constexpr int START_BIT_IDX = 20;
483         static constexpr int CopyToDeviceBits = 1 << START_BIT_IDX;
484         static constexpr int CopyFromDeviceBits = 1 << 21;
485         static constexpr int NDRangeBits = 1 << 22;
486         static constexpr int StartComputeBits = 1 << 23;
487         static constexpr int EndComputeBits = 1 << 24;
488         static constexpr int EnterKernelDispatchBits = 1 << 25;
489         static constexpr int LeaveKernelDispatchBits = 1 << 26;
490         static constexpr int HasConstCharPtrArgBits = 1 << 27;
491         static constexpr int hasIntArgBits = 1 << 28;
492         static constexpr int END_BIT_IDX = 27;
493 
494         size_t eventMax;
495         size_t eventc;
496         int *eventInfoBits;
497         const char **eventInfoConstCharPtrArgs;
498 
499         virtual void showEvents(int width) = 0;
500 
501         virtual void inc(int bits) = 0;
502 
503         virtual void inc(int bits, const char *arg) = 0;
504 
505         virtual void marker(int bits) = 0;
506 
507         virtual void marker(int bits, const char *arg) = 0;
508 
509 
510         virtual void markAsStartComputeAndInc() = 0;
511 
512         virtual void markAsEndComputeAndInc() = 0;
513 
514         virtual void markAsEnterKernelDispatchAndInc() = 0;
515 
516         virtual void markAsLeaveKernelDispatchAndInc() = 0;
517 
518         ProfilableQueue(Backend *backend, int eventMax)
519             : Queue(backend),
520               eventMax(eventMax),
521               eventInfoBits(new int[eventMax]),
522               eventInfoConstCharPtrArgs(new const char *[eventMax]),
523               eventc(0) {
524         }
525 
526         ~ProfilableQueue() override {
527             delete[]eventInfoBits;
528             delete[]eventInfoConstCharPtrArgs;
529         }
530     };
531 
532     Config *config;
533     Queue *queue;
534 
535     Backend(Config *config, Queue *queue)
536         : config(config), queue(queue) {
537     }
538 
539     virtual Buffer *getOrCreateBuffer(BufferState *bufferState) = 0;
540 
541     virtual void showDeviceInfo() = 0;
542 
543     virtual void computeStart() = 0;
544 
545     virtual void computeEnd() = 0;
546 
547     virtual CompilationUnit *compile(int len, char *source) = 0;
548 
549     virtual bool getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength) = 0;
550 
551     virtual ~Backend() = default;
552 };
553 
554 template<typename T>
555 T *bufferOf(const char *name) {
556     size_t lenIncludingBufferState = sizeof(T);
557     size_t lenExcludingBufferState = lenIncludingBufferState - sizeof(BufferState);
558     T *buffer = reinterpret_cast<T *>(new unsigned char[lenIncludingBufferState]);
559     auto *bufferState = reinterpret_cast<BufferState *>(reinterpret_cast<char *>(buffer) + lenExcludingBufferState);
560     bufferState->magic1 = bufferState->magic2 = BufferState::MAGIC;
561     bufferState->ptr = buffer;
562     bufferState->length = sizeof(T) - sizeof(BufferState);
563     bufferState->state = BufferState::NEW_STATE;
564     bufferState->vendorPtr = nullptr;
565     bufferState->dump(name);
566     return buffer;
567 }