1 /*
2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 #include <sys/wait.h>
27 #include <chrono>
28 #include <thread>
29 #include "cuda_backend.h"
30
31 CudaBackend::CudaQueue::CudaQueue(Backend *backend)
32 : Backend::Queue(backend),cuStream(),streamCreationThread() {
33 }
34 void CudaBackend::CudaQueue::init(){
35 streamCreationThread = std::this_thread::get_id();
36 if (backend->config->traceCalls){
37 std::cout << "init() 0x"
38 << " thread=" <<streamCreationThread
39 << std::endl;
40 }
41
42 WHERE{.f=__FILE__ , .l=__LINE__,
43 .e=cuStreamCreate(&cuStream,CU_STREAM_DEFAULT),
44 .t= "cuStreamCreate"
45 }.report();
46
47 if (backend->config->traceCalls){
48 std::cout << "exiting init() 0x"
49 << " custream=" <<std::hex<<streamCreationThread <<std::dec
50 << std::endl;
51 }
52 }
53
54 void CudaBackend::CudaQueue::wait(){
55 CUDA_CHECK(cuStreamSynchronize(cuStream), "cuStreamSynchronize");
56 }
57
58
59 void CudaBackend::CudaQueue::computeStart() {
60 wait(); // should be no-op
61 release(); // also ;
62 }
63
64 void CudaBackend::CudaQueue::computeEnd() {
65
66 }
67
68 void CudaBackend::CudaQueue::release() {
69
70 }
71
72 CudaBackend::CudaQueue::~CudaQueue() {
73 CUDA_CHECK(cuStreamDestroy(cuStream), "cuStreamDestroy");
74 }
75
76 void CudaBackend::CudaQueue::copyToDevice(Buffer *buffer) {
77 const auto *cudaBuffer = dynamic_cast<CudaBuffer *>(buffer);
78 const std::thread::id thread_id = std::this_thread::get_id();
79 if (thread_id != streamCreationThread){
80 std::cout << "copyToDevice() thread=" <<thread_id<< " != "<< streamCreationThread<< std::endl;
81 }
82 if (backend->config->traceCalls) {
83
84 std::cout << "copyToDevice() 0x"
85 << std::hex<<cudaBuffer->bufferState->length<<std::dec << "/"
86 << cudaBuffer->bufferState->length << " "
87 << "devptr=" << std::hex<< static_cast<long>(cudaBuffer->devicePtr) <<std::dec
88 << " thread=" <<thread_id
89 << std::endl;
90 }
91
92 CUDA_CHECK(cuMemcpyHtoDAsync(cudaBuffer->devicePtr,
93 cudaBuffer->bufferState->ptr,
94 cudaBuffer->bufferState->length,
95 dynamic_cast<CudaQueue*>(backend->queue)->cuStream), "cuMemcpyHtoDAsync");
96 }
97
98 void CudaBackend::CudaQueue::copyFromDevice(Buffer *buffer) {
99 const auto *cudaBuffer = dynamic_cast<CudaBuffer *>(buffer);
100 const std::thread::id thread_id = std::this_thread::get_id();
101 if (thread_id != streamCreationThread){
102 std::cout << "copyFromDevice() thread=" <<thread_id<< " != "<< streamCreationThread<< std::endl;
103 }
104 if (backend->config->traceCalls) {
105
106 std::cout << "copyFromDevice() 0x"
107 << std::hex<<cudaBuffer->bufferState->length<<std::dec << "/"
108 << cudaBuffer->bufferState->length << " "
109 << "devptr=" << std::hex<< static_cast<long>(cudaBuffer->devicePtr) <<std::dec
110 << " thread=" <<thread_id
111 << std::endl;
112 }
113
114 CUDA_CHECK(cuMemcpyDtoHAsync(cudaBuffer->bufferState->ptr,
115 cudaBuffer->devicePtr,
116 cudaBuffer->bufferState->length,
117 dynamic_cast<CudaQueue*>(backend->queue)->cuStream),
118 "cuMemcpyDtoHAsync");
119
120 }
121
122 // TODO: Improve heuristics to decide a better block size, if possible.
123 // The following is just a rough number to fit into a modern NVIDIA GPU.
124 int CudaBackend::CudaQueue::estimateThreadsPerBlock(int dimensions) {
125 switch (dimensions) {
126 case 1: return 256;
127 case 2: return 16;
128 case 3: return 16;
129 default: return 1;
130 }
131 }
132
133 int CudaBackend::CudaQueue::estimateThreadsPerBlock(int dimensions, int globalSizePerDimension, int localSize) {
134 int threadsPerBlock = 1;
135 if (localSize > 0) {
136 threadsPerBlock = localSize;
137 } else if (globalSizePerDimension > 1) {
138 threadsPerBlock = estimateThreadsPerBlock(dimensions);
139 // Check if we are running a small range
140 while (globalSizePerDimension < threadsPerBlock) {
141 threadsPerBlock /= 2;
142 }
143 }
144 return threadsPerBlock;
145 }
146
147 void CudaBackend::CudaQueue::dispatch(KernelContext *kernelContext, CompilationUnit::Kernel *kernel) {
148
149 const auto cudaKernel = dynamic_cast<CudaModule::CudaKernel *>(kernel);
150
151 int threadsPerBlockX = estimateThreadsPerBlock(kernelContext->dimensions, kernelContext->gsx, kernelContext->lsx);
152 int threadsPerBlockY = estimateThreadsPerBlock(kernelContext->dimensions, kernelContext->gsy, kernelContext->lsy);
153 int threadsPerBlockZ = estimateThreadsPerBlock(kernelContext->dimensions, kernelContext->gsz, kernelContext->lsz);
154
155 int warpFactor[3] = { 1, 1, 1 };
156 if (kernelContext->wsx) {
157 warpFactor[0] = 32;
158 }
159 if (kernelContext->wsy) {
160 warpFactor[1] = 32;
161 }
162 if (kernelContext->wsz) {
163 warpFactor[2] = 32;
164 }
165
166 int globalSize[3] = { kernelContext->gsx, kernelContext->gsy, kernelContext->gsz };
167 globalSize[0] = kernelContext->tlx? ceil_div(kernelContext->gsx, kernelContext->tlx) * warpFactor[0]: kernelContext->gsx;
168 globalSize[1] = kernelContext->tly? ceil_div(kernelContext->gsy, kernelContext->tly) * warpFactor[1]: kernelContext->gsy;
169 globalSize[2] = kernelContext->tlz? ceil_div(kernelContext->gsz, kernelContext->tlz) * warpFactor[2]: kernelContext->gsz;
170
171 int blocksPerGridX = ceil_div(globalSize[0], threadsPerBlockX);
172 int blocksPerGridY = 1;
173 int blocksPerGridZ = 1;
174 if (kernelContext->dimensions > 1) {
175 blocksPerGridY = ceil_div(globalSize[1], threadsPerBlockY);
176 }
177 if (kernelContext->dimensions > 2) {
178 blocksPerGridZ = ceil_div(globalSize[2], threadsPerBlockZ);
179 }
180
181 // Enable debug information with info: HAT=INFO
182 if (backend->config->info) {
183 backend->shortDeviceInfo();
184 std::cout << "[INFO] Dispatching the CUDA kernel" << std::endl;
185 std::cout << " \\_ BlocksPerGrid = [" << blocksPerGridX << "," << blocksPerGridY << "," << blocksPerGridZ << "]" << std::endl;
186 std::cout << " \\_ ThreadsPerBlock = [" << threadsPerBlockX << "," << threadsPerBlockY << "," << threadsPerBlockZ << "]" << std::endl;
187 }
188
189 const std::thread::id thread_id = std::this_thread::get_id();
190 if (thread_id != streamCreationThread) {
191 std::cout << "dispatch() thread=" <<thread_id<< " != "<< streamCreationThread<< std::endl;
192 }
193
194 // // CUDA events for timing
195 // cudaEvent_t start, stop;
196 // cuEventCreate(&start, cudaEventDefault);
197 // cuEventCreate(&stop, cudaEventDefault);
198 // cuEventRecord(start, 0);
199
200 const auto status = cuLaunchKernel(cudaKernel->function, //
201 blocksPerGridX, blocksPerGridY, blocksPerGridZ, //
202 threadsPerBlockX, threadsPerBlockY, threadsPerBlockZ, //
203 0, //
204 cuStream, //
205 cudaKernel->argslist, //
206 nullptr);
207 // cuEventRecord(stop, 0);
208 // cuEventSynchronize(stop);
209 // float elapsedTimeMs = 0.0f;
210 // cuEventElapsedTime(&elapsedTimeMs, start, stop);
211 // std::cout << "Kernel Elapsed Time: " << elapsedTimeMs << " ms\n";
212
213 CUDA_CHECK(status, "cuLaunchKernel");
214 }