/*
 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

#include <sys/wait.h>
#include <chrono>
#include <thread>
#include "cuda_backend.h"

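// A CudaQueue wraps a single CUDA stream (cuStream), created in init() and
// destroyed in the destructor; all work for this queue is submitted to that stream.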
CudaBackend::CudaQueue::CudaQueue(Backend *backend)
    : Backend::Queue(backend), cuStream(), streamCreationThread() {
}
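
// Create the CUDA stream for this queue and remember which thread created it.
// Later operations compare against this thread id and warn on cross-thread use.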
void CudaBackend::CudaQueue::init() {
    streamCreationThread = std::this_thread::get_id();
    if (backend->config->traceCalls) {
        std::cout << "init() thread=" << streamCreationThread << std::endl;
    }

    WHERE{.f=__FILE__, .l=__LINE__,
          .e=cuStreamCreate(&cuStream, CU_STREAM_DEFAULT),
          .t="cuStreamCreate"
    }.report();

    if (backend->config->traceCalls) {
        std::cout << "exiting init() cuStream=" << cuStream
                  << " thread=" << streamCreationThread
                  << std::endl;
    }
}

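// Block the host until all work previously submitted to this queue's stream has completed.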
void CudaBackend::CudaQueue::wait() {
    CUDA_CHECK(cuStreamSynchronize(cuStream), "cuStreamSynchronize");
}

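// Marks the start of a compute phase; per the comments below, both calls are
// expected to be no-ops for this backend.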
void CudaBackend::CudaQueue::computeStart() {
    wait();    // should be a no-op
    release(); // also a no-op
}

void CudaBackend::CudaQueue::computeEnd() {
    // Nothing to do at the end of a compute phase for this backend.
}

void CudaBackend::CudaQueue::release() {
    // Nothing to release for this backend.
}

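// Destroy the stream that was created in init().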
CudaBackend::CudaQueue::~CudaQueue() {
    CUDA_CHECK(cuStreamDestroy(cuStream), "cuStreamDestroy");
}

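// Asynchronously copy a buffer's host backing store to its device allocation on this
// queue's stream. A warning is printed if the call arrives on a different thread than
// the one that created the stream.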
void CudaBackend::CudaQueue::copyToDevice(Buffer *buffer) {
    const auto *cudaBuffer = dynamic_cast<CudaBuffer *>(buffer);
    const std::thread::id thread_id = std::this_thread::get_id();
    if (thread_id != streamCreationThread) {
        std::cout << "copyToDevice() thread=" << thread_id << " != " << streamCreationThread << std::endl;
    }
    if (backend->config->traceCalls) {
        std::cout << "copyToDevice() 0x"
                  << std::hex << cudaBuffer->bufferState->length << std::dec << "/"
                  << cudaBuffer->bufferState->length << " "
                  << "devptr=" << std::hex << static_cast<long>(cudaBuffer->devicePtr) << std::dec
                  << " thread=" << thread_id
                  << std::endl;
    }

    CUDA_CHECK(cuMemcpyHtoDAsync(cudaBuffer->devicePtr,
                                 cudaBuffer->bufferState->ptr,
                                 cudaBuffer->bufferState->length,
                                 dynamic_cast<CudaQueue *>(backend->queue)->cuStream), "cuMemcpyHtoDAsync");
}

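// Asynchronously copy a buffer's device allocation back to its host backing store on
// this queue's stream, with the same cross-thread warning as copyToDevice().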
void CudaBackend::CudaQueue::copyFromDevice(Buffer *buffer) {
    const auto *cudaBuffer = dynamic_cast<CudaBuffer *>(buffer);
    const std::thread::id thread_id = std::this_thread::get_id();
    if (thread_id != streamCreationThread) {
        std::cout << "copyFromDevice() thread=" << thread_id << " != " << streamCreationThread << std::endl;
    }
    if (backend->config->traceCalls) {
        std::cout << "copyFromDevice() 0x"
                  << std::hex << cudaBuffer->bufferState->length << std::dec << "/"
                  << cudaBuffer->bufferState->length << " "
                  << "devptr=" << std::hex << static_cast<long>(cudaBuffer->devicePtr) << std::dec
                  << " thread=" << thread_id
                  << std::endl;
    }

    CUDA_CHECK(cuMemcpyDtoHAsync(cudaBuffer->bufferState->ptr,
                                 cudaBuffer->devicePtr,
                                 cudaBuffer->bufferState->length,
                                 dynamic_cast<CudaQueue *>(backend->queue)->cuStream),
               "cuMemcpyDtoHAsync");
}

// TODO: Improve the heuristics to pick a better block size where possible.
// The values below are just rough defaults that fit comfortably on a modern NVIDIA GPU.
int CudaBackend::CudaQueue::estimateThreadsPerBlock(int dimensions) {
    switch (dimensions) {
        case 1: return 256;
        case 2: return 16;
        case 3: return 16;
        default: return 1;
    }
}

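// Choose the threads-per-block for one dimension: honour an explicit local size when
// given, otherwise start from the per-dimension default and halve it until it no longer
// exceeds the global size. For example, dimensions=1, globalSizePerDimension=100 and
// localSize=0 gives 256 -> 128 -> 64, the first value that does not exceed 100.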
int CudaBackend::CudaQueue::estimateThreadsPerBlock(int dimensions, int globalSizePerDimension, int localSize) {
    int threadsPerBlock = 1;
    if (localSize > 0) {
        threadsPerBlock = localSize;
    } else if (globalSizePerDimension > 1) {
        threadsPerBlock = estimateThreadsPerBlock(dimensions);
        // Check if we are running a small range
        while (globalSizePerDimension < threadsPerBlock) {
            threadsPerBlock /= 2;
        }
    }
    return threadsPerBlock;
}

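// Launch the kernel: derive threads-per-block and blocks-per-grid from the kernel
// context, warn if the launch happens on a thread other than the stream's creator,
// then submit the launch to this queue's stream.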
void CudaBackend::CudaQueue::dispatch(KernelContext *kernelContext, CompilationUnit::Kernel *kernel) {
    const auto cudaKernel = dynamic_cast<CudaModule::CudaKernel *>(kernel);

    int threadsPerBlockX = estimateThreadsPerBlock(kernelContext->dimensions, kernelContext->gsx, kernelContext->lsx);
    int threadsPerBlockY = estimateThreadsPerBlock(kernelContext->dimensions, kernelContext->gsy, kernelContext->lsy);
    int threadsPerBlockZ = estimateThreadsPerBlock(kernelContext->dimensions, kernelContext->gsz, kernelContext->lsz);

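    // Ceiling division so the grid covers the whole global range in each active dimension:
    // blocksPerGrid = (globalSize + threadsPerBlock - 1) / threadsPerBlock.
    // For example, gsx=1000000 with 256 threads per block gives (1000000 + 255) / 256 = 3907 blocks.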
    int blocksPerGridX = (kernelContext->gsx + threadsPerBlockX - 1) / threadsPerBlockX;
    int blocksPerGridY = 1;
    int blocksPerGridZ = 1;
    if (kernelContext->dimensions > 1) {
        blocksPerGridY = (kernelContext->gsy + threadsPerBlockY - 1) / threadsPerBlockY;
    }
    if (kernelContext->dimensions > 2) {
        blocksPerGridZ = (kernelContext->gsz + threadsPerBlockZ - 1) / threadsPerBlockZ;
    }

    // Print dispatch details when info logging is enabled (HAT=INFO).
    if (backend->config->info) {
        std::cout << "Dispatching the CUDA kernel" << std::endl;
        std::cout << "  \\_ BlocksPerGrid   = [" << blocksPerGridX << "," << blocksPerGridY << "," << blocksPerGridZ << "]" << std::endl;
        std::cout << "  \\_ ThreadsPerBlock = [" << threadsPerBlockX << "," << threadsPerBlockY << "," << threadsPerBlockZ << "]" << std::endl;
    }

    const std::thread::id thread_id = std::this_thread::get_id();
    if (thread_id != streamCreationThread) {
        std::cout << "dispatch() thread=" << thread_id << " != " << streamCreationThread << std::endl;
    }

    // CUDA driver API events for kernel timing (disabled).
    // CUevent start, stop;
    // cuEventCreate(&start, CU_EVENT_DEFAULT);
    // cuEventCreate(&stop, CU_EVENT_DEFAULT);
    // cuEventRecord(start, cuStream);

    const auto status = cuLaunchKernel(cudaKernel->function,
                                       blocksPerGridX, blocksPerGridY, blocksPerGridZ,
                                       threadsPerBlockX, threadsPerBlockY, threadsPerBlockZ,
                                       0, // dynamic shared memory bytes
                                       cuStream,
                                       cudaKernel->argslist,
                                       nullptr);
    // cuEventRecord(stop, cuStream);
    // cuEventSynchronize(stop);
    // float elapsedTimeMs = 0.0f;
    // cuEventElapsedTime(&elapsedTimeMs, start, stop);
    // cuEventDestroy(start);
    // cuEventDestroy(stop);
    // std::cout << "Kernel Elapsed Time: " << elapsedTimeMs << " ms\n";

    CUDA_CHECK(status, "cuLaunchKernel");
}