1 /* 2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 #include "opencl_backend.h" 26 /* 27 While based on OpenCL's event list, I think we need to use a MOD eventMax queue. 28 29 So 30 */ 31 OpenCLBackend::OpenCLQueue::OpenCLQueue(Backend *backend) 32 : ProfilableQueue(backend, 10000), 33 command_queue(), 34 events(new cl_event[eventMax]) { 35 } 36 37 cl_event *OpenCLBackend::OpenCLQueue::eventListPtr() const { 38 return (eventc == 0) ? nullptr : events; 39 } 40 41 cl_event *OpenCLBackend::OpenCLQueue::nextEventPtr() const { 42 return &events[eventc]; 43 } 44 45 void OpenCLBackend::OpenCLQueue::showEvents(const int width) { 46 constexpr int SAMPLE_TYPES = 4; 47 auto *samples = new cl_ulong[SAMPLE_TYPES * eventc]; // queued, submit, start, end, complete 48 int sample = 0; 49 cl_ulong min = CL_LONG_MAX; 50 cl_ulong max = CL_LONG_MIN; 51 52 for (int event = 0; event < eventc; event++) { 53 for (int type = 0; type < SAMPLE_TYPES; type++) { 54 cl_profiling_info profiling_info_arr[] = { 55 CL_PROFILING_COMMAND_QUEUED,CL_PROFILING_COMMAND_SUBMIT,CL_PROFILING_COMMAND_START, 56 CL_PROFILING_COMMAND_END 57 }; 58 if ((clGetEventProfilingInfo(events[event], profiling_info_arr[type], sizeof(samples[sample]), 59 &samples[sample], NULL)) != 60 CL_SUCCESS) { 61 const char *profiling_info_name_arr[] = { 62 "CL_PROFILING_COMMAND_QUEUED", "CL_PROFILING_COMMAND_SUBMIT", "CL_PROFILING_COMMAND_START", 63 "CL_PROFILING_COMMAND_END" 64 }; 65 std::cerr << "failed to get profile info " << profiling_info_name_arr[type] << std::endl; 66 } 67 if (sample == 0) { 68 if (type == 0) { 69 min = max = samples[sample]; 70 } 71 } else { 72 if (samples[sample] < min) { 73 min = samples[sample]; 74 } 75 if (samples[sample] > max) { 76 max = samples[sample]; 77 } 78 } 79 sample++; 80 } 81 } 82 sample = 0; 83 const cl_ulong range = (max - min); 84 const cl_ulong scale = range / width; // range per char 85 std::cout << "Range: " << min << "-" << max << "(" << range << "ns)" 86 << " (" << scale << "ns) per char" 87 << " +:submitted, .:started, =:end " << std::endl; 88 89 for (int event = 0; event < eventc; event++) { 90 /* cl_command_type command_type; 91 clGetEventInfo(events[event],CL_EVENT_COMMAND_TYPE,sizeof(command_type), &command_type, nullptr); 92 switch (command_type){ 93 case CL_COMMAND_MARKER: std::cout << "marker "; break; 94 case CL_COMMAND_USER: std::cout << " user "; break; 95 case CL_COMMAND_NDRANGE_KERNEL: std::cout << "kernel "; break; 96 case CL_COMMAND_READ_BUFFER: std::cout << " read "; break; 97 case CL_COMMAND_WRITE_BUFFER: std::cout << " write "; break; 98 default: std::cout << " other "; break; 99 } */ 100 const int bits = eventInfoBits[event]; 101 if ((bits & CopyToDeviceBits) == CopyToDeviceBits) { 102 std::cout << " write " << (bits & 0xffff) << " "; 103 } 104 if ((bits & CopyFromDeviceBits) == CopyFromDeviceBits) { 105 std::cout << " read " << (bits & 0xffff) << " "; 106 } 107 if ((bits & StartComputeBits) == StartComputeBits) { 108 std::cout << " start "; 109 } 110 if ((bits & EndComputeBits) == EndComputeBits) { 111 std::cout << " end "; 112 } 113 if ((bits & NDRangeBits) == NDRangeBits) { 114 std::cout << " kernel "; 115 } 116 if ((bits & EnterKernelDispatchBits) == EnterKernelDispatchBits) { 117 if ((bits & HasConstCharPtrArgBits) == HasConstCharPtrArgBits) { 118 std::cout << eventInfoConstCharPtrArgs[event] << std::endl; 119 } 120 std::cout << " enter{ "; 121 } 122 if ((bits & LeaveKernelDispatchBits) == LeaveKernelDispatchBits) { 123 // std::cout << " leave "; 124 if ((bits & HasConstCharPtrArgBits) == HasConstCharPtrArgBits) { 125 std::cout << eventInfoConstCharPtrArgs[event] << std::endl; 126 } 127 std::cout << " }leave "; 128 } 129 130 131 const cl_ulong queue = (samples[sample++] - min) / scale; 132 const cl_ulong submit = (samples[sample++] - min) / scale; 133 const cl_ulong start = (samples[sample++] - min) / scale; 134 const cl_ulong end = (samples[sample++] - min) / scale; 135 136 std::cout << std::setw(20) << (queue - end) << "(ns) "; 137 for (int c = 0; c < width; c++) { 138 char ch = ' '; 139 if (c >= queue && c <= submit) { 140 ch = '+'; 141 } else if (c > submit && c < start) { 142 ch = '.'; 143 } else if (c >= start && c < end) { 144 ch = '='; 145 } 146 std::cout << ch; 147 } 148 std::cout << std::endl; 149 } 150 delete[] samples; 151 } 152 153 void OpenCLBackend::OpenCLQueue::wait() { 154 if (eventc > 0) { 155 OPENCL_CHECK(clWaitForEvents(eventc, events), "clWaitForEvents"); 156 } 157 } 158 159 void OpenCLBackend::OpenCLQueue::marker(int bits) { 160 cl_int status = clEnqueueMarkerWithWaitList( 161 command_queue, 162 this->eventc, 163 this->eventListPtr(), 164 this->nextEventPtr() 165 ); 166 if (status != CL_SUCCESS) { 167 std::cerr << "failed to clEnqueueMarkerWithWaitList " << errorMsg(status) << std::endl; 168 std::exit(1); 169 } 170 inc(bits); 171 } 172 173 void OpenCLBackend::OpenCLQueue::marker(int bits, const char *arg) { 174 OPENCL_CHECK(clEnqueueMarkerWithWaitList( 175 command_queue, 176 this->eventc, 177 this->eventListPtr(), 178 this->nextEventPtr()), 179 "clEnqueueMarkerWithWaitList"); 180 181 inc(bits, arg); 182 } 183 184 void OpenCLBackend::OpenCLQueue::computeStart() { 185 wait(); // should be no-op 186 release(); // also ; 187 marker(StartComputeBits); 188 } 189 190 void OpenCLBackend::OpenCLQueue::computeEnd() { 191 marker(EndComputeBits); 192 } 193 194 void OpenCLBackend::OpenCLQueue::inc(const int bits) { 195 if (eventc + 1 >= eventMax) { 196 std::cerr << "OpenCLBackend::OpenCLQueue event list overflowed!!" << std::endl; 197 } else { 198 eventInfoBits[eventc] = bits; 199 } 200 eventc++; 201 } 202 203 void OpenCLBackend::OpenCLQueue::inc(const int bits, const char *arg) { 204 if (eventc + 1 >= eventMax) { 205 std::cerr << "OpenCLBackend::OpenCLQueue event list overflowed!!" << std::endl; 206 } else { 207 eventInfoBits[eventc] = bits | HasConstCharPtrArgBits; 208 eventInfoConstCharPtrArgs[eventc] = arg; 209 } 210 eventc++; 211 } 212 213 void OpenCLBackend::OpenCLQueue::markAsEndComputeAndInc() { 214 inc(EndComputeBits); 215 } 216 217 void OpenCLBackend::OpenCLQueue::markAsStartComputeAndInc() { 218 inc(StartComputeBits); 219 } 220 221 void OpenCLBackend::OpenCLQueue::markAsEnterKernelDispatchAndInc() { 222 inc(EnterKernelDispatchBits); 223 } 224 225 void OpenCLBackend::OpenCLQueue::markAsLeaveKernelDispatchAndInc() { 226 inc(LeaveKernelDispatchBits); 227 } 228 229 void OpenCLBackend::OpenCLQueue::release() { 230 // TODO: possible check ALL events before return from the macro 231 for (int i = 0; i < eventc; i++) { 232 OPENCL_CHECK(clReleaseEvent(events[i]), "clReleaseEvent"); 233 } 234 eventc = 0; 235 } 236 237 OpenCLBackend::OpenCLQueue::~OpenCLQueue() { 238 OPENCL_CHECK(clReleaseCommandQueue(command_queue), "clReleaseCommandQueue"); 239 delete []events; 240 } 241 242 void OpenCLBackend::OpenCLQueue::dispatch(KernelContext *kernelContext, Backend::CompilationUnit::Kernel *kernel) { 243 size_t numDimensions = kernelContext->globalMesh.dimensions; 244 245 size_t global_work_size[] { 246 static_cast<size_t>(kernelContext->globalMesh.maxX), 247 static_cast<size_t>(kernelContext->globalMesh.maxY), 248 static_cast<size_t>(kernelContext->globalMesh.maxZ) 249 }; 250 251 size_t local_work_size[] = { 252 static_cast<size_t>(kernelContext->localMesh.maxX), 253 static_cast<size_t>(kernelContext->localMesh.maxY), 254 static_cast<size_t>(kernelContext->localMesh.maxZ), 255 }; 256 257 if (backend->config->info) { 258 std::cout << "[INFO] OpenCLBackend::OpenCLQueue::dispatch" << std::endl; 259 std::cout << "[INFO] numDimensions: " << numDimensions << std::endl; 260 std::cout << "[INFO] GLOBAL [" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "]" << std::endl; 261 if (kernelContext->localMesh.maxX > 0) { 262 std::cout << "[INFO] LOCAL [" << local_work_size[0] << "," << local_work_size[1] << "," << local_work_size[2] << "]" << std::endl; 263 } else { 264 std::cout << "[INFO] LOCAL [ nullptr ] // The driver will setup a default value" << std::endl; 265 } 266 } 267 268 cl_int status = clEnqueueNDRangeKernel( 269 command_queue, 270 dynamic_cast<OpenCLProgram::OpenCLKernel *>(kernel)->kernel, 271 numDimensions, 272 nullptr, 273 global_work_size, 274 kernelContext->localMesh.maxX > 0 ? local_work_size : nullptr, 275 eventc, 276 eventListPtr(), 277 nextEventPtr()); 278 279 inc(NDRangeBits); 280 // markAsNDRangeAndInc(); 281 282 OPENCL_CHECK(status, "clEnqueueNDRangeKernel"); 283 if (backend->config->trace | backend->config->traceEnqueues) { 284 std::cout << "enqueued kernel dispatch \"" << kernel->name << "\" globalSize=" << kernelContext->globalMesh.maxX << 285 std::endl; 286 } 287 } 288 289 void OpenCLBackend::OpenCLQueue::copyToDevice(Buffer *buffer) { 290 auto openclBuffer = dynamic_cast<OpenCLBuffer *>(buffer); 291 cl_int status = clEnqueueWriteBuffer( 292 command_queue, 293 openclBuffer->clMem, 294 CL_FALSE, 295 0, 296 buffer->bufferState->length, 297 buffer->bufferState->ptr, 298 eventc, 299 eventListPtr(), 300 nextEventPtr() 301 ); 302 303 OPENCL_CHECK(status, "clEnqueueWriteBuffer"); 304 305 inc(CopyToDeviceBits); 306 // markAsCopyToDeviceAndInc(); 307 } 308 309 void OpenCLBackend::OpenCLQueue::copyFromDevice(Buffer *buffer) { 310 auto openclBuffer = dynamic_cast<OpenCLBuffer *>(buffer); 311 cl_int status = clEnqueueReadBuffer( 312 command_queue, 313 openclBuffer->clMem, 314 CL_FALSE, 315 0, 316 buffer->bufferState->length, 317 buffer->bufferState->ptr, 318 eventc, 319 eventListPtr(), 320 nextEventPtr() 321 ); 322 OPENCL_CHECK(status, "clEnqueueReadBuffer"); 323 inc(CopyFromDeviceBits); 324 //markAsCopyFromDeviceAndInc(); 325 }