1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 #include "opencl_backend.h"
 26 /*
 27 While based on OpenCL's event list, I think we need to use a MOD eventMax queue.
 28 
 29 So
 30 */
 31 OpenCLBackend::OpenCLQueue::OpenCLQueue(Backend *backend)
 32     : ProfilableQueue(backend, 10000),
 33       command_queue(),
 34       events(new cl_event[eventMax]) {
 35 }
 36 
 37 cl_event *OpenCLBackend::OpenCLQueue::eventListPtr() const {
 38     return (eventc == 0) ? nullptr : events;
 39 }
 40 
 41 cl_event *OpenCLBackend::OpenCLQueue::nextEventPtr() const {
 42     return &events[eventc];
 43 }
 44 
 45 void OpenCLBackend::OpenCLQueue::showEvents(const int width) {
 46     constexpr int SAMPLE_TYPES = 4;
 47     auto *samples = new cl_ulong[SAMPLE_TYPES * eventc]; // queued, submit, start, end, complete
 48     int sample = 0;
 49     cl_ulong min = CL_LONG_MAX;
 50     cl_ulong max = CL_LONG_MIN;
 51 
 52     for (int event = 0; event < eventc; event++) {
 53         for (int type = 0; type < SAMPLE_TYPES; type++) {
 54             cl_profiling_info profiling_info_arr[] = {
 55                 CL_PROFILING_COMMAND_QUEUED,CL_PROFILING_COMMAND_SUBMIT,CL_PROFILING_COMMAND_START,
 56                 CL_PROFILING_COMMAND_END
 57             };
 58             if ((clGetEventProfilingInfo(events[event], profiling_info_arr[type], sizeof(samples[sample]),
 59                                          &samples[sample], NULL)) !=
 60                 CL_SUCCESS) {
 61                 const char *profiling_info_name_arr[] = {
 62                     "CL_PROFILING_COMMAND_QUEUED", "CL_PROFILING_COMMAND_SUBMIT", "CL_PROFILING_COMMAND_START",
 63                     "CL_PROFILING_COMMAND_END"
 64                 };
 65                 std::cerr << "failed to get profile info " << profiling_info_name_arr[type] << std::endl;
 66             }
 67             if (sample == 0) {
 68                 if (type == 0) {
 69                     min = max = samples[sample];
 70                 }
 71             } else {
 72                 if (samples[sample] < min) {
 73                     min = samples[sample];
 74                 }
 75                 if (samples[sample] > max) {
 76                     max = samples[sample];
 77                 }
 78             }
 79             sample++;
 80         }
 81     }
 82     sample = 0;
 83     const cl_ulong range = (max - min);
 84     const cl_ulong scale = range / width; // range per char
 85     std::cout << "Range: " << min << "-" << max << "(" << range << "ns)"
 86             << "  (" << scale << "ns) per char"
 87             << " +:submitted, .:started, =:end  " << std::endl;
 88 
 89     for (int event = 0; event < eventc; event++) {
 90         /*  cl_command_type command_type;
 91           clGetEventInfo(events[event],CL_EVENT_COMMAND_TYPE,sizeof(command_type), &command_type, nullptr);
 92           switch (command_type){
 93             case CL_COMMAND_MARKER:         std::cout <<   "marker "; break;
 94             case CL_COMMAND_USER:           std::cout <<   "  user "; break;
 95             case CL_COMMAND_NDRANGE_KERNEL: std::cout <<   "kernel "; break;
 96             case CL_COMMAND_READ_BUFFER:    std::cout <<   "  read "; break;
 97             case CL_COMMAND_WRITE_BUFFER:   std::cout <<   " write "; break;
 98             default: std::cout <<                          " other "; break;
 99           } */
100         const int bits = eventInfoBits[event];
101         if ((bits & CopyToDeviceBits) == CopyToDeviceBits) {
102             std::cout << "  write " << (bits & 0xffff) << " ";
103         }
104         if ((bits & CopyFromDeviceBits) == CopyFromDeviceBits) {
105             std::cout << "   read " << (bits & 0xffff) << " ";
106         }
107         if ((bits & StartComputeBits) == StartComputeBits) {
108             std::cout << "  start    ";
109         }
110         if ((bits & EndComputeBits) == EndComputeBits) {
111             std::cout << "    end    ";
112         }
113         if ((bits & NDRangeBits) == NDRangeBits) {
114             std::cout << " kernel    ";
115         }
116         if ((bits & EnterKernelDispatchBits) == EnterKernelDispatchBits) {
117             if ((bits & HasConstCharPtrArgBits) == HasConstCharPtrArgBits) {
118                 std::cout << eventInfoConstCharPtrArgs[event] << std::endl;
119             }
120             std::cout << "  enter{   ";
121         }
122         if ((bits & LeaveKernelDispatchBits) == LeaveKernelDispatchBits) {
123             // std::cout <<   "  leave    ";
124             if ((bits & HasConstCharPtrArgBits) == HasConstCharPtrArgBits) {
125                 std::cout << eventInfoConstCharPtrArgs[event] << std::endl;
126             }
127             std::cout << " }leave    ";
128         }
129 
130 
131         const cl_ulong queue = (samples[sample++] - min) / scale;
132         const cl_ulong submit = (samples[sample++] - min) / scale;
133         const cl_ulong start = (samples[sample++] - min) / scale;
134         const cl_ulong end = (samples[sample++] - min) / scale;
135 
136         std::cout << std::setw(20) << (queue - end) << "(ns) ";
137         for (int c = 0; c < width; c++) {
138             char ch = ' ';
139             if (c >= queue && c <= submit) {
140                 ch = '+';
141             } else if (c > submit && c < start) {
142                 ch = '.';
143             } else if (c >= start && c < end) {
144                 ch = '=';
145             }
146             std::cout << ch;
147         }
148         std::cout << std::endl;
149     }
150     delete[] samples;
151 }
152 
153 void OpenCLBackend::OpenCLQueue::wait() {
154     if (eventc > 0) {
155         OPENCL_CHECK(clWaitForEvents(eventc, events), "clWaitForEvents");
156     }
157 }
158 
159 void OpenCLBackend::OpenCLQueue::marker(int bits) {
160     cl_int status = clEnqueueMarkerWithWaitList(
161         command_queue,
162         this->eventc,
163         this->eventListPtr(),
164         this->nextEventPtr()
165     );
166     if (status != CL_SUCCESS) {
167         std::cerr << "failed to clEnqueueMarkerWithWaitList " << errorMsg(status) << std::endl;
168         std::exit(1);
169     }
170     inc(bits);
171 }
172 
173 void OpenCLBackend::OpenCLQueue::marker(int bits, const char *arg) {
174     OPENCL_CHECK(clEnqueueMarkerWithWaitList(
175                      command_queue,
176                      this->eventc,
177                      this->eventListPtr(),
178                      this->nextEventPtr()),
179                  "clEnqueueMarkerWithWaitList");
180 
181     inc(bits, arg);
182 }
183 
184 void OpenCLBackend::OpenCLQueue::computeStart() {
185     wait(); // should be no-op
186     release(); // also ;
187     marker(StartComputeBits);
188 }
189 
190 void OpenCLBackend::OpenCLQueue::computeEnd() {
191     marker(EndComputeBits);
192 }
193 
194 void OpenCLBackend::OpenCLQueue::inc(const int bits) {
195     if (eventc + 1 >= eventMax) {
196         std::cerr << "OpenCLBackend::OpenCLQueue event list overflowed!!" << std::endl;
197     } else {
198         eventInfoBits[eventc] = bits;
199     }
200     eventc++;
201 }
202 
203 void OpenCLBackend::OpenCLQueue::inc(const int bits, const char *arg) {
204     if (eventc + 1 >= eventMax) {
205         std::cerr << "OpenCLBackend::OpenCLQueue event list overflowed!!" << std::endl;
206     } else {
207         eventInfoBits[eventc] = bits | HasConstCharPtrArgBits;
208         eventInfoConstCharPtrArgs[eventc] = arg;
209     }
210     eventc++;
211 }
212 
213 void OpenCLBackend::OpenCLQueue::markAsEndComputeAndInc() {
214     inc(EndComputeBits);
215 }
216 
217 void OpenCLBackend::OpenCLQueue::markAsStartComputeAndInc() {
218     inc(StartComputeBits);
219 }
220 
221 void OpenCLBackend::OpenCLQueue::markAsEnterKernelDispatchAndInc() {
222     inc(EnterKernelDispatchBits);
223 }
224 
225 void OpenCLBackend::OpenCLQueue::markAsLeaveKernelDispatchAndInc() {
226     inc(LeaveKernelDispatchBits);
227 }
228 
229 void OpenCLBackend::OpenCLQueue::release() {
230     // TODO: possible check ALL events before return from the macro
231     for (int i = 0; i < eventc; i++) {
232         OPENCL_CHECK(clReleaseEvent(events[i]), "clReleaseEvent");
233     }
234     eventc = 0;
235 }
236 
237 OpenCLBackend::OpenCLQueue::~OpenCLQueue() {
238     OPENCL_CHECK(clReleaseCommandQueue(command_queue), "clReleaseCommandQueue");
239     delete []events;
240 }
241 
242 void OpenCLBackend::OpenCLQueue::dispatch(KernelContext *kernelContext, Backend::CompilationUnit::Kernel *kernel) {
243     size_t numDimensions = kernelContext->globalMesh.dimensions;
244 
245     size_t global_work_size[] {
246         static_cast<size_t>(kernelContext->globalMesh.maxX),
247         static_cast<size_t>(kernelContext->globalMesh.maxY),
248         static_cast<size_t>(kernelContext->globalMesh.maxZ)
249     };
250 
251     size_t local_work_size[] = {
252         static_cast<size_t>(kernelContext->localMesh.maxX),
253         static_cast<size_t>(kernelContext->localMesh.maxY),
254         static_cast<size_t>(kernelContext->localMesh.maxZ),
255     };
256 
257     if (backend->config->info) {
258         std::cout << "[INFO] OpenCLBackend::OpenCLQueue::dispatch" << std::endl;
259         std::cout << "[INFO] numDimensions: " << numDimensions << std::endl;
260         std::cout << "[INFO] GLOBAL [" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "]" << std::endl;
261         if (kernelContext->localMesh.maxX > 0) {
262             std::cout << "[INFO] LOCAL  [" << local_work_size[0] << "," << local_work_size[1] << "," << local_work_size[2] << "]" << std::endl;
263         } else {
264             std::cout << "[INFO] LOCAL  [ nullptr ] // The driver will setup a default value" << std::endl;
265         }
266     }
267 
268     cl_int status = clEnqueueNDRangeKernel(
269         command_queue,
270         dynamic_cast<OpenCLProgram::OpenCLKernel *>(kernel)->kernel,
271         numDimensions,
272         nullptr,
273         global_work_size,
274         kernelContext->localMesh.maxX > 0 ? local_work_size : nullptr,
275         eventc,
276         eventListPtr(),
277         nextEventPtr());
278 
279     inc(NDRangeBits);
280     // markAsNDRangeAndInc();
281 
282     OPENCL_CHECK(status, "clEnqueueNDRangeKernel");
283     if (backend->config->trace | backend->config->traceEnqueues) {
284         std::cout << "enqueued kernel dispatch \"" << kernel->name << "\" globalSize=" << kernelContext->globalMesh.maxX <<
285                 std::endl;
286     }
287 }
288 
289 void OpenCLBackend::OpenCLQueue::copyToDevice(Buffer *buffer) {
290     auto openclBuffer = dynamic_cast<OpenCLBuffer *>(buffer);
291     cl_int status = clEnqueueWriteBuffer(
292         command_queue,
293         openclBuffer->clMem,
294         CL_FALSE,
295         0,
296         buffer->bufferState->length,
297         buffer->bufferState->ptr,
298         eventc,
299         eventListPtr(),
300         nextEventPtr()
301     );
302 
303     OPENCL_CHECK(status, "clEnqueueWriteBuffer");
304 
305     inc(CopyToDeviceBits);
306     //  markAsCopyToDeviceAndInc();
307 }
308 
309 void OpenCLBackend::OpenCLQueue::copyFromDevice(Buffer *buffer) {
310     auto openclBuffer = dynamic_cast<OpenCLBuffer *>(buffer);
311     cl_int status = clEnqueueReadBuffer(
312         command_queue,
313         openclBuffer->clMem,
314         CL_FALSE,
315         0,
316         buffer->bufferState->length,
317         buffer->bufferState->ptr,
318         eventc,
319         eventListPtr(),
320         nextEventPtr()
321     );
322     OPENCL_CHECK(status, "clEnqueueReadBuffer");
323     inc(CopyFromDeviceBits);
324     //markAsCopyFromDeviceAndInc();
325 }