1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 #include "opencl_backend.h"
 26 
 27 /*
 28 While based on OpenCL's event list, I think we need to use a MOD eventMax queue.
 29 
 30 So
 31 */
 32 OpenCLBackend::OpenCLQueue::OpenCLQueue(Backend *backend)
 33     : ProfilableQueue(backend, 10000),
 34       command_queue(),
 35       events(new cl_event[eventMax]) {
 36 }
 37 
 38 cl_event *OpenCLBackend::OpenCLQueue::eventListPtr() const {
 39     return (eventc == 0) ? nullptr : events;
 40 }
 41 
 42 cl_event *OpenCLBackend::OpenCLQueue::nextEventPtr() const {
 43     return &events[eventc];
 44 }
 45 
 46 void OpenCLBackend::OpenCLQueue::showEvents(const int width) {
 47     constexpr int SAMPLE_TYPES = 4;
 48     auto *samples = new cl_ulong[SAMPLE_TYPES * eventc]; // queued, submit, start, end, complete
 49     int sample = 0;
 50     cl_ulong min = CL_LONG_MAX;
 51     cl_ulong max = CL_LONG_MIN;
 52 
 53     for (int event = 0; event < eventc; event++) {
 54         for (int type = 0; type < SAMPLE_TYPES; type++) {
 55             cl_profiling_info profiling_info_arr[] = {
 56                 CL_PROFILING_COMMAND_QUEUED,CL_PROFILING_COMMAND_SUBMIT,CL_PROFILING_COMMAND_START,
 57                 CL_PROFILING_COMMAND_END
 58             };
 59             if ((clGetEventProfilingInfo(events[event], profiling_info_arr[type], sizeof(samples[sample]),
 60                                          &samples[sample], NULL)) !=
 61                 CL_SUCCESS) {
 62                 const char *profiling_info_name_arr[] = {
 63                     "CL_PROFILING_COMMAND_QUEUED", "CL_PROFILING_COMMAND_SUBMIT", "CL_PROFILING_COMMAND_START",
 64                     "CL_PROFILING_COMMAND_END"
 65                 };
 66                 std::cerr << "failed to get profile info " << profiling_info_name_arr[type] << std::endl;
 67             }
 68             if (sample == 0) {
 69                 if (type == 0) {
 70                     min = max = samples[sample];
 71                 }
 72             } else {
 73                 if (samples[sample] < min) {
 74                     min = samples[sample];
 75                 }
 76                 if (samples[sample] > max) {
 77                     max = samples[sample];
 78                 }
 79             }
 80             sample++;
 81         }
 82     }
 83     sample = 0;
 84     const cl_ulong range = (max - min);
 85     const cl_ulong scale = range / width; // range per char
 86     std::cout << "Range: " << min << "-" << max << "(" << range << "ns)"
 87             << "  (" << scale << "ns) per char"
 88             << " +:submitted, .:started, =:end  " << std::endl;
 89 
 90     for (int event = 0; event < eventc; event++) {
 91         /*  cl_command_type command_type;
 92           clGetEventInfo(events[event],CL_EVENT_COMMAND_TYPE,sizeof(command_type), &command_type, nullptr);
 93           switch (command_type){
 94             case CL_COMMAND_MARKER:         std::cout <<   "marker "; break;
 95             case CL_COMMAND_USER:           std::cout <<   "  user "; break;
 96             case CL_COMMAND_NDRANGE_KERNEL: std::cout <<   "kernel "; break;
 97             case CL_COMMAND_READ_BUFFER:    std::cout <<   "  read "; break;
 98             case CL_COMMAND_WRITE_BUFFER:   std::cout <<   " write "; break;
 99             default: std::cout <<                          " other "; break;
100           } */
101         const int bits = eventInfoBits[event];
102         if ((bits & CopyToDeviceBits) == CopyToDeviceBits) {
103             std::cout << "  write " << (bits & 0xffff) << " ";
104         }
105         if ((bits & CopyFromDeviceBits) == CopyFromDeviceBits) {
106             std::cout << "   read " << (bits & 0xffff) << " ";
107         }
108         if ((bits & StartComputeBits) == StartComputeBits) {
109             std::cout << "  start    ";
110         }
111         if ((bits & EndComputeBits) == EndComputeBits) {
112             std::cout << "    end    ";
113         }
114         if ((bits & NDRangeBits) == NDRangeBits) {
115             std::cout << " kernel    ";
116         }
117         if ((bits & EnterKernelDispatchBits) == EnterKernelDispatchBits) {
118             if ((bits & HasConstCharPtrArgBits) == HasConstCharPtrArgBits) {
119                 std::cout << eventInfoConstCharPtrArgs[event] << std::endl;
120             }
121             std::cout << "  enter{   ";
122         }
123         if ((bits & LeaveKernelDispatchBits) == LeaveKernelDispatchBits) {
124             // std::cout <<   "  leave    ";
125             if ((bits & HasConstCharPtrArgBits) == HasConstCharPtrArgBits) {
126                 std::cout << eventInfoConstCharPtrArgs[event] << std::endl;
127             }
128             std::cout << " }leave    ";
129         }
130 
131 
132         const cl_ulong queue = (samples[sample++] - min) / scale;
133         const cl_ulong submit = (samples[sample++] - min) / scale;
134         const cl_ulong start = (samples[sample++] - min) / scale;
135         const cl_ulong end = (samples[sample++] - min) / scale;
136 
137         std::cout << std::setw(20) << (queue - end) << "(ns) ";
138         for (int c = 0; c < width; c++) {
139             char ch = ' ';
140             if (c >= queue && c <= submit) {
141                 ch = '+';
142             } else if (c > submit && c < start) {
143                 ch = '.';
144             } else if (c >= start && c < end) {
145                 ch = '=';
146             }
147             std::cout << ch;
148         }
149         std::cout << std::endl;
150     }
151     delete[] samples;
152 }
153 
154 void OpenCLBackend::OpenCLQueue::wait() {
155     if (eventc > 0) {
156         OPENCL_CHECK(clWaitForEvents(eventc, events), "clWaitForEvents");
157     }
158 }
159 
160 void OpenCLBackend::OpenCLQueue::marker(int bits) {
161     cl_int status = clEnqueueMarkerWithWaitList(
162         command_queue,
163         this->eventc,
164         this->eventListPtr(),
165         this->nextEventPtr()
166     );
167     if (status != CL_SUCCESS) {
168         std::cerr << "failed to clEnqueueMarkerWithWaitList " << errorMsg(status) << std::endl;
169         std::exit(1);
170     }
171     inc(bits);
172 }
173 
174 void OpenCLBackend::OpenCLQueue::marker(int bits, const char *arg) {
175     OPENCL_CHECK(clEnqueueMarkerWithWaitList(
176                      command_queue,
177                      this->eventc,
178                      this->eventListPtr(),
179                      this->nextEventPtr()),
180                  "clEnqueueMarkerWithWaitList");
181 
182     inc(bits, arg);
183 }
184 
185 void OpenCLBackend::OpenCLQueue::computeStart() {
186     wait(); // should be no-op
187     release(); // also ;
188     marker(StartComputeBits);
189 }
190 
191 void OpenCLBackend::OpenCLQueue::computeEnd() {
192     marker(EndComputeBits);
193 }
194 
195 void OpenCLBackend::OpenCLQueue::inc(const int bits) {
196     if (eventc + 1 >= eventMax) {
197         std::cerr << "OpenCLBackend::OpenCLQueue event list overflowed!!" << std::endl;
198     } else {
199         eventInfoBits[eventc] = bits;
200     }
201     eventc++;
202 }
203 
204 void OpenCLBackend::OpenCLQueue::inc(const int bits, const char *arg) {
205     if (eventc + 1 >= eventMax) {
206         std::cerr << "OpenCLBackend::OpenCLQueue event list overflowed!!" << std::endl;
207     } else {
208         eventInfoBits[eventc] = bits | HasConstCharPtrArgBits;
209         eventInfoConstCharPtrArgs[eventc] = arg;
210     }
211     eventc++;
212 }
213 
214 void OpenCLBackend::OpenCLQueue::markAsEndComputeAndInc() {
215     inc(EndComputeBits);
216 }
217 
218 void OpenCLBackend::OpenCLQueue::markAsStartComputeAndInc() {
219     inc(StartComputeBits);
220 }
221 
222 void OpenCLBackend::OpenCLQueue::markAsEnterKernelDispatchAndInc() {
223     inc(EnterKernelDispatchBits);
224 }
225 
226 void OpenCLBackend::OpenCLQueue::markAsLeaveKernelDispatchAndInc() {
227     inc(LeaveKernelDispatchBits);
228 }
229 
230 void OpenCLBackend::OpenCLQueue::release() {
231     // TODO: possible check ALL events before return from the macro
232     for (int i = 0; i < eventc; i++) {
233         OPENCL_CHECK(clReleaseEvent(events[i]), "clReleaseEvent");
234     }
235     eventc = 0;
236 }
237 
238 OpenCLBackend::OpenCLQueue::~OpenCLQueue() {
239     OPENCL_CHECK(clReleaseCommandQueue(command_queue), "clReleaseCommandQueue");
240     delete []events;
241 }
242 
243 void printWarningLocalGroupResized(const size_t local_work_size[]) {
244     std::cout << "[Warning] Thread-Block size got automatically resized: [" << local_work_size[0] << "," << local_work_size[1] << "," << local_work_size[2] << "]" << std::endl;
245 }
246 
247 void checkThreadBlockFits(OpenCLBackend *backend, const KernelContext *kernelContext, const size_t global_work_size[], size_t *local_work_size) {
248     const PlatformInfo platformInfo(backend);
249     size_t max_group_size = platformInfo.deviceInfo.maxWorkGroupSize;
250     size_t totalThreads = kernelContext->lsx * kernelContext->lsy * kernelContext->lsz;
251 
252     // Adjust depending on the total number of threads in the local-work-group
253     while (totalThreads > max_group_size) {
254         // Here just a simple heuristic, starting with the first dimension, 16, 4, 2 local group size
255         if (local_work_size[0] >= 16) {
256             local_work_size[0] /= 2;
257         } else if (local_work_size[1] >= 4) {
258             local_work_size[1] /= 2;
259         } else if (local_work_size[2] >= 2) {
260             local_work_size[2] /= 2;
261         }
262         totalThreads = local_work_size[0] * local_work_size[1] * local_work_size[2];
263         if (backend->config->info) {
264             printWarningLocalGroupResized(local_work_size);
265         }
266     }
267 
268     // Adjust also depending on the global size. We can't launch more threads as local work than global work for
269     // each dimension
270     for (int i = 0; i < 3; i++) {
271         while (local_work_size[i] > global_work_size[i]) {
272             local_work_size[i] /= 2;
273             if (backend->config->info) {
274                 printWarningLocalGroupResized(local_work_size);
275             }
276         }
277     }
278 }
279 
280 void OpenCLBackend::OpenCLQueue::dispatch(KernelContext *kernelContext, CompilationUnit::Kernel *kernel) {
281     size_t numDimensions = kernelContext->dimensions;
282 
283     size_t global_work_size[] {
284         static_cast<size_t>(kernelContext->gsx),
285         static_cast<size_t>(kernelContext->gsy),
286         static_cast<size_t>(kernelContext->gsz)
287     };
288 
289     size_t local_work_size[] = {
290         static_cast<size_t>(kernelContext->lsx),
291         static_cast<size_t>(kernelContext->lsy),
292         static_cast<size_t>(kernelContext->lsz),
293     };
294 
295     if (kernelContext->tlx > 0) {
296         global_work_size[0] = ceil_div(global_work_size[0], kernelContext->tlx);
297     }
298     if (kernelContext->tly > 0) {
299         global_work_size[1] = ceil_div(global_work_size[1], kernelContext->tly);
300     }
301     if (kernelContext->tlz > 0) {
302         global_work_size[2] = ceil_div(global_work_size[2], kernelContext->tlz);
303     }
304 
305     // In the OpenCL backend, we don't currently support warp-sizes to be able to run with OpenCL 1.2 (Apple)
306     // The CUDA backend supports warp-sizes
307 
308     // Check the local-sizes fit
309     auto backendInstance = dynamic_cast<OpenCLBackend *>(this->backend);
310     checkThreadBlockFits(backendInstance, kernelContext, global_work_size, local_work_size);
311 
312     if (backend->config->info) {
313         backend->shortDeviceInfo();
314         std::cout << "[INFO] OpenCLBackend::OpenCLQueue::dispatch" << std::endl;
315         std::cout << "[INFO] numDimensions: " << numDimensions << std::endl;
316         std::cout << "[INFO] GLOBAL [" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "]" << std::endl;
317         if (kernelContext->lsx > 0) {
318             std::cout << "[INFO] LOCAL  [" << local_work_size[0] << "," << local_work_size[1] << "," << local_work_size[2] << "]" << std::endl;
319         } else {
320             std::cout << "[INFO] LOCAL  [ nullptr ] // The driver will setup a default value" << std::endl;
321         }
322     }
323 
324     const cl_int status = clEnqueueNDRangeKernel(
325         command_queue,
326         dynamic_cast<OpenCLProgram::OpenCLKernel *>(kernel)->kernel,
327         numDimensions,
328         nullptr,
329         global_work_size,
330         kernelContext->lsx > 0 ? local_work_size : nullptr,
331         eventc,
332         eventListPtr(),
333         nextEventPtr());
334 
335     inc(NDRangeBits);
336     // markAsNDRangeAndInc();
337 
338     OPENCL_CHECK(status, "clEnqueueNDRangeKernel");
339     if (backend->config->trace | backend->config->traceEnqueues) {
340         std::cout << "enqueued kernel dispatch \"" << kernel->name << "\" globalSize=" << kernelContext->gsx <<
341                 std::endl;
342     }
343 }
344 
345 void OpenCLBackend::OpenCLQueue::copyToDevice(Buffer *buffer) {
346     auto openclBuffer = dynamic_cast<OpenCLBuffer *>(buffer);
347     cl_int status = clEnqueueWriteBuffer(
348         command_queue,
349         openclBuffer->clMem,
350         CL_FALSE,
351         0,
352         buffer->bufferState->length,
353         buffer->bufferState->ptr,
354         eventc,
355         eventListPtr(),
356         nextEventPtr()
357     );
358 
359     OPENCL_CHECK(status, "clEnqueueWriteBuffer");
360 
361     inc(CopyToDeviceBits);
362     //  markAsCopyToDeviceAndInc();
363 }
364 
365 void OpenCLBackend::OpenCLQueue::copyFromDevice(Buffer *buffer) {
366     auto openclBuffer = dynamic_cast<OpenCLBuffer *>(buffer);
367     cl_int status = clEnqueueReadBuffer(
368         command_queue,
369         openclBuffer->clMem,
370         CL_FALSE,
371         0,
372         buffer->bufferState->length,
373         buffer->bufferState->ptr,
374         eventc,
375         eventListPtr(),
376         nextEventPtr()
377     );
378     OPENCL_CHECK(status, "clEnqueueReadBuffer");
379     inc(CopyFromDeviceBits);
380     //markAsCopyFromDeviceAndInc();
381 }