1 /*
2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25 #include "opencl_backend.h"
26
27 /*
28 While based on OpenCL's event list, I think we need to use a MOD eventMax queue.
29
30 So
31 */
32 OpenCLBackend::OpenCLQueue::OpenCLQueue(Backend *backend)
33 : ProfilableQueue(backend, 10000),
34 command_queue(),
35 events(new cl_event[eventMax]) {
36 }
37
38 cl_event *OpenCLBackend::OpenCLQueue::eventListPtr() const {
39 return (eventc == 0) ? nullptr : events;
40 }
41
42 cl_event *OpenCLBackend::OpenCLQueue::nextEventPtr() const {
43 return &events[eventc];
44 }
45
46 void OpenCLBackend::OpenCLQueue::showEvents(const int width) {
47 constexpr int SAMPLE_TYPES = 4;
48 auto *samples = new cl_ulong[SAMPLE_TYPES * eventc]; // queued, submit, start, end, complete
49 int sample = 0;
50 cl_ulong min = CL_LONG_MAX;
51 cl_ulong max = CL_LONG_MIN;
52
53 for (int event = 0; event < eventc; event++) {
54 for (int type = 0; type < SAMPLE_TYPES; type++) {
55 cl_profiling_info profiling_info_arr[] = {
56 CL_PROFILING_COMMAND_QUEUED,CL_PROFILING_COMMAND_SUBMIT,CL_PROFILING_COMMAND_START,
57 CL_PROFILING_COMMAND_END
58 };
59 if ((clGetEventProfilingInfo(events[event], profiling_info_arr[type], sizeof(samples[sample]),
60 &samples[sample], NULL)) !=
61 CL_SUCCESS) {
62 const char *profiling_info_name_arr[] = {
63 "CL_PROFILING_COMMAND_QUEUED", "CL_PROFILING_COMMAND_SUBMIT", "CL_PROFILING_COMMAND_START",
64 "CL_PROFILING_COMMAND_END"
65 };
66 std::cerr << "failed to get profile info " << profiling_info_name_arr[type] << std::endl;
67 }
68 if (sample == 0) {
69 if (type == 0) {
70 min = max = samples[sample];
71 }
72 } else {
73 if (samples[sample] < min) {
74 min = samples[sample];
75 }
76 if (samples[sample] > max) {
77 max = samples[sample];
78 }
79 }
80 sample++;
81 }
82 }
83 sample = 0;
84 const cl_ulong range = (max - min);
85 const cl_ulong scale = range / width; // range per char
86 std::cout << "Range: " << min << "-" << max << "(" << range << "ns)"
87 << " (" << scale << "ns) per char"
88 << " +:submitted, .:started, =:end " << std::endl;
89
90 for (int event = 0; event < eventc; event++) {
91 /* cl_command_type command_type;
92 clGetEventInfo(events[event],CL_EVENT_COMMAND_TYPE,sizeof(command_type), &command_type, nullptr);
93 switch (command_type){
94 case CL_COMMAND_MARKER: std::cout << "marker "; break;
95 case CL_COMMAND_USER: std::cout << " user "; break;
96 case CL_COMMAND_NDRANGE_KERNEL: std::cout << "kernel "; break;
97 case CL_COMMAND_READ_BUFFER: std::cout << " read "; break;
98 case CL_COMMAND_WRITE_BUFFER: std::cout << " write "; break;
99 default: std::cout << " other "; break;
100 } */
101 const int bits = eventInfoBits[event];
102 if ((bits & CopyToDeviceBits) == CopyToDeviceBits) {
103 std::cout << " write " << (bits & 0xffff) << " ";
104 }
105 if ((bits & CopyFromDeviceBits) == CopyFromDeviceBits) {
106 std::cout << " read " << (bits & 0xffff) << " ";
107 }
108 if ((bits & StartComputeBits) == StartComputeBits) {
109 std::cout << " start ";
110 }
111 if ((bits & EndComputeBits) == EndComputeBits) {
112 std::cout << " end ";
113 }
114 if ((bits & NDRangeBits) == NDRangeBits) {
115 std::cout << " kernel ";
116 }
117 if ((bits & EnterKernelDispatchBits) == EnterKernelDispatchBits) {
118 if ((bits & HasConstCharPtrArgBits) == HasConstCharPtrArgBits) {
119 std::cout << eventInfoConstCharPtrArgs[event] << std::endl;
120 }
121 std::cout << " enter{ ";
122 }
123 if ((bits & LeaveKernelDispatchBits) == LeaveKernelDispatchBits) {
124 // std::cout << " leave ";
125 if ((bits & HasConstCharPtrArgBits) == HasConstCharPtrArgBits) {
126 std::cout << eventInfoConstCharPtrArgs[event] << std::endl;
127 }
128 std::cout << " }leave ";
129 }
130
131
132 const cl_ulong queue = (samples[sample++] - min) / scale;
133 const cl_ulong submit = (samples[sample++] - min) / scale;
134 const cl_ulong start = (samples[sample++] - min) / scale;
135 const cl_ulong end = (samples[sample++] - min) / scale;
136
137 std::cout << std::setw(20) << (queue - end) << "(ns) ";
138 for (int c = 0; c < width; c++) {
139 char ch = ' ';
140 if (c >= queue && c <= submit) {
141 ch = '+';
142 } else if (c > submit && c < start) {
143 ch = '.';
144 } else if (c >= start && c < end) {
145 ch = '=';
146 }
147 std::cout << ch;
148 }
149 std::cout << std::endl;
150 }
151 delete[] samples;
152 }
153
154 void OpenCLBackend::OpenCLQueue::wait() {
155 if (eventc > 0) {
156 OPENCL_CHECK(clWaitForEvents(eventc, events), "clWaitForEvents");
157 }
158 }
159
160 void OpenCLBackend::OpenCLQueue::marker(int bits) {
161 cl_int status = clEnqueueMarkerWithWaitList(
162 command_queue,
163 this->eventc,
164 this->eventListPtr(),
165 this->nextEventPtr()
166 );
167 if (status != CL_SUCCESS) {
168 std::cerr << "failed to clEnqueueMarkerWithWaitList " << errorMsg(status) << std::endl;
169 std::exit(1);
170 }
171 inc(bits);
172 }
173
174 void OpenCLBackend::OpenCLQueue::marker(int bits, const char *arg) {
175 OPENCL_CHECK(clEnqueueMarkerWithWaitList(
176 command_queue,
177 this->eventc,
178 this->eventListPtr(),
179 this->nextEventPtr()),
180 "clEnqueueMarkerWithWaitList");
181
182 inc(bits, arg);
183 }
184
185 void OpenCLBackend::OpenCLQueue::computeStart() {
186 wait(); // should be no-op
187 release(); // also ;
188 marker(StartComputeBits);
189 }
190
191 void OpenCLBackend::OpenCLQueue::computeEnd() {
192 marker(EndComputeBits);
193 }
194
195 void OpenCLBackend::OpenCLQueue::inc(const int bits) {
196 if (eventc + 1 >= eventMax) {
197 std::cerr << "OpenCLBackend::OpenCLQueue event list overflowed!!" << std::endl;
198 } else {
199 eventInfoBits[eventc] = bits;
200 }
201 eventc++;
202 }
203
204 void OpenCLBackend::OpenCLQueue::inc(const int bits, const char *arg) {
205 if (eventc + 1 >= eventMax) {
206 std::cerr << "OpenCLBackend::OpenCLQueue event list overflowed!!" << std::endl;
207 } else {
208 eventInfoBits[eventc] = bits | HasConstCharPtrArgBits;
209 eventInfoConstCharPtrArgs[eventc] = arg;
210 }
211 eventc++;
212 }
213
214 void OpenCLBackend::OpenCLQueue::markAsEndComputeAndInc() {
215 inc(EndComputeBits);
216 }
217
218 void OpenCLBackend::OpenCLQueue::markAsStartComputeAndInc() {
219 inc(StartComputeBits);
220 }
221
222 void OpenCLBackend::OpenCLQueue::markAsEnterKernelDispatchAndInc() {
223 inc(EnterKernelDispatchBits);
224 }
225
226 void OpenCLBackend::OpenCLQueue::markAsLeaveKernelDispatchAndInc() {
227 inc(LeaveKernelDispatchBits);
228 }
229
230 void OpenCLBackend::OpenCLQueue::release() {
231 // TODO: possible check ALL events before return from the macro
232 for (int i = 0; i < eventc; i++) {
233 OPENCL_CHECK(clReleaseEvent(events[i]), "clReleaseEvent");
234 }
235 eventc = 0;
236 }
237
238 OpenCLBackend::OpenCLQueue::~OpenCLQueue() {
239 OPENCL_CHECK(clReleaseCommandQueue(command_queue), "clReleaseCommandQueue");
240 delete []events;
241 }
242
243 void printWarningLocalGroupResized(const size_t local_work_size[]) {
244 std::cout << "[Warning] Thread-Block size got automatically resized: [" << local_work_size[0] << "," << local_work_size[1] << "," << local_work_size[2] << "]" << std::endl;
245 }
246
247 void checkThreadBlockFits(OpenCLBackend *backend, const KernelContext *kernelContext, const size_t global_work_size[], size_t *local_work_size) {
248 const PlatformInfo platformInfo(backend);
249 size_t max_group_size = platformInfo.deviceInfo.maxWorkGroupSize;
250 size_t totalThreads = kernelContext->lsx * kernelContext->lsy * kernelContext->lsz;
251
252 // Adjust depending on the total number of threads in the local-work-group
253 while (totalThreads > max_group_size) {
254 // Here just a simple heuristic, starting with the first dimension, 16, 4, 2 local group size
255 if (local_work_size[0] >= 16) {
256 local_work_size[0] /= 2;
257 } else if (local_work_size[1] >= 4) {
258 local_work_size[1] /= 2;
259 } else if (local_work_size[2] >= 2) {
260 local_work_size[2] /= 2;
261 }
262 totalThreads = local_work_size[0] * local_work_size[1] * local_work_size[2];
263 if (backend->config->info) {
264 printWarningLocalGroupResized(local_work_size);
265 }
266 }
267
268 // Adjust also depending on the global size. We can't launch more threads as local work than global work for
269 // each dimension
270 for (int i = 0; i < 3; i++) {
271 while (local_work_size[i] > global_work_size[i]) {
272 local_work_size[i] /= 2;
273 if (backend->config->info) {
274 printWarningLocalGroupResized(local_work_size);
275 }
276 }
277 }
278 }
279
280 void OpenCLBackend::OpenCLQueue::dispatch(KernelContext *kernelContext, CompilationUnit::Kernel *kernel) {
281 size_t numDimensions = kernelContext->dimensions;
282
283 size_t global_work_size[] {
284 static_cast<size_t>(kernelContext->gsx),
285 static_cast<size_t>(kernelContext->gsy),
286 static_cast<size_t>(kernelContext->gsz)
287 };
288
289 size_t local_work_size[] = {
290 static_cast<size_t>(kernelContext->lsx),
291 static_cast<size_t>(kernelContext->lsy),
292 static_cast<size_t>(kernelContext->lsz),
293 };
294
295 if (kernelContext->tlx > 0) {
296 global_work_size[0] = ceil_div(global_work_size[0], kernelContext->tlx);
297 }
298 if (kernelContext->tly > 0) {
299 global_work_size[1] = ceil_div(global_work_size[1], kernelContext->tly);
300 }
301 if (kernelContext->tlz > 0) {
302 global_work_size[2] = ceil_div(global_work_size[2], kernelContext->tlz);
303 }
304
305 // In the OpenCL backend, we don't currently support warp-sizes to be able to run with OpenCL 1.2 (Apple)
306 // The CUDA backend supports warp-sizes
307
308 // Check the local-sizes fit
309 auto backendInstance = dynamic_cast<OpenCLBackend *>(this->backend);
310 checkThreadBlockFits(backendInstance, kernelContext, global_work_size, local_work_size);
311
312 if (backend->config->info) {
313 backend->shortDeviceInfo();
314 std::cout << "[INFO] OpenCLBackend::OpenCLQueue::dispatch" << std::endl;
315 std::cout << "[INFO] numDimensions: " << numDimensions << std::endl;
316 std::cout << "[INFO] GLOBAL [" << global_work_size[0] << "," << global_work_size[1] << "," << global_work_size[2] << "]" << std::endl;
317 if (kernelContext->lsx > 0) {
318 std::cout << "[INFO] LOCAL [" << local_work_size[0] << "," << local_work_size[1] << "," << local_work_size[2] << "]" << std::endl;
319 } else {
320 std::cout << "[INFO] LOCAL [ nullptr ] // The driver will setup a default value" << std::endl;
321 }
322 }
323
324 const cl_int status = clEnqueueNDRangeKernel(
325 command_queue,
326 dynamic_cast<OpenCLProgram::OpenCLKernel *>(kernel)->kernel,
327 numDimensions,
328 nullptr,
329 global_work_size,
330 kernelContext->lsx > 0 ? local_work_size : nullptr,
331 eventc,
332 eventListPtr(),
333 nextEventPtr());
334
335 inc(NDRangeBits);
336 // markAsNDRangeAndInc();
337
338 OPENCL_CHECK(status, "clEnqueueNDRangeKernel");
339 if (backend->config->trace | backend->config->traceEnqueues) {
340 std::cout << "enqueued kernel dispatch \"" << kernel->name << "\" globalSize=" << kernelContext->gsx <<
341 std::endl;
342 }
343 }
344
345 void OpenCLBackend::OpenCLQueue::copyToDevice(Buffer *buffer) {
346 auto openclBuffer = dynamic_cast<OpenCLBuffer *>(buffer);
347 cl_int status = clEnqueueWriteBuffer(
348 command_queue,
349 openclBuffer->clMem,
350 CL_FALSE,
351 0,
352 buffer->bufferState->length,
353 buffer->bufferState->ptr,
354 eventc,
355 eventListPtr(),
356 nextEventPtr()
357 );
358
359 OPENCL_CHECK(status, "clEnqueueWriteBuffer");
360
361 inc(CopyToDeviceBits);
362 // markAsCopyToDeviceAndInc();
363 }
364
365 void OpenCLBackend::OpenCLQueue::copyFromDevice(Buffer *buffer) {
366 auto openclBuffer = dynamic_cast<OpenCLBuffer *>(buffer);
367 cl_int status = clEnqueueReadBuffer(
368 command_queue,
369 openclBuffer->clMem,
370 CL_FALSE,
371 0,
372 buffer->bufferState->length,
373 buffer->bufferState->ptr,
374 eventc,
375 eventListPtr(),
376 nextEventPtr()
377 );
378 OPENCL_CHECK(status, "clEnqueueReadBuffer");
379 inc(CopyFromDeviceBits);
380 //markAsCopyFromDeviceAndInc();
381 }