1 /*
2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
#include "cuda_backend.h"

#include <sys/wait.h>
#include <unistd.h>

#include <cerrno>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
32
// Empty PTX container; the text is filled in later (e.g. via read(), as
// done in CudaBackend::nvcc).
PtxSource::PtxSource()
    : Text(0L) {
}
36
// Sized PTX container; buffer semantics are those of Text(len).
PtxSource::PtxSource(size_t len)
    : Text(len) {
}
40
// Wraps existing PTX text. The second Text argument is presumably isCopy
// (matching the 3-arg overload below), i.e. the buffer is NOT copied and
// stays owned by the caller -- TODO confirm against Text's declaration.
PtxSource::PtxSource(char *text)
    : Text(text, false) {
}
44
// Wraps len bytes of PTX text, copying the buffer (isCopy=true). Note this
// differs from the char*-only overload above, which does not copy.
PtxSource::PtxSource(size_t len, char *text)
    : Text(len, text, true) {
}
// Wraps len bytes of PTX text; isCopy selects whether Text copies the buffer.
PtxSource::PtxSource(size_t len, char *text, bool isCopy)
    : Text(len, text, isCopy) {
}
51
// Sized CUDA C container; buffer semantics are those of Text(len).
CudaSource::CudaSource(size_t len)
    : Text(len) {
}
55
// Wraps existing CUDA C text without copying (second Text argument is
// presumably isCopy=false, mirroring PtxSource(char*) -- TODO confirm).
CudaSource::CudaSource(char *text)
    : Text(text, false) {
}
59
60 CudaSource::CudaSource(size_t len, char *text, bool isCopy, bool lineinfo)
61 : Text(len, text, isCopy) {
62 _lineInfo = lineinfo;
63 }
64
// Empty CUDA C container; the text is filled in later.
CudaSource::CudaSource()
    : Text(0) {
}
68
// Whether -lineinfo should be added to the nvcc command line for this source
// (consumed by CudaBackend::nvcc).
bool CudaSource::lineInfo() const {
    return _lineInfo;
}
72
// Milliseconds elapsed since the Unix epoch, used to timestamp temp files.
uint64_t timeSinceEpochMillisec() {
    const auto sinceEpoch = std::chrono::system_clock::now().time_since_epoch();
    return static_cast<uint64_t>(
        std::chrono::duration_cast<std::chrono::milliseconds>(sinceEpoch).count());
}
77
// Builds "<directoryName>/tmp_<time><suffix>", e.g. "./var/cuda/tmp_123.ptx".
std::string tmpFileName(uint64_t time, const std::string directoryName, const std::string &suffix) {
    return directoryName + "/tmp_" + std::to_string(time) + suffix;
}
83
/**
 * Builds the backend: configures the base Backend with a Config and a
 * CudaQueue, initializes the CUDA driver API, then opens device 0 and
 * creates a context on it.
 *
 * cuInit(0) runs in the member-initializer list so initStatus is known
 * before the body executes; every subsequent driver call is guarded on
 * that status.
 */
CudaBackend::CudaBackend(int configBits)
    : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(), context() {
    int deviceCount = 0;

    if (initStatus == CUDA_SUCCESS) {
        CUDA_CHECK(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount");
        if (config->info) {
            std::cout << "CudaBackend device count = " << deviceCount << std::endl;
        }
        // Always selects device ordinal 0.
        CUDA_CHECK(cuDeviceGet(&device, 0), "cuDeviceGet");
#if defined(CUDA_VERSION) && CUDA_VERSION >= 12080
        // CUDA 12.8+: use cuCtxCreate_v4 with default create params.
        CUctxCreateParams ctxCreateParams = {};
        CUDA_CHECK(cuCtxCreate_v4(&context, &ctxCreateParams, 0, device), "cuCtxCreate");
#else
        // Invoke previous implementation with 3 parameters
        CUDA_CHECK(cuCtxCreate(&context, 0, device), "cuCtxCreate");
#endif
        if (config->info) {
            std::cout << "CudaBackend context created ok (id=" << context << ")" << std::endl;
        }
        // Initialize the queue now that a context exists.
        dynamic_cast<CudaQueue *>(queue)->init();
    } else {
        CUDA_CHECK(initStatus, "cuInit() failed we seem to have the runtime library but no device");
    }
}
109
110 CudaBackend::~CudaBackend() {
111 std::cout << "freeing context" << std::endl;
112 CUDA_CHECK(cuCtxDestroy(context), "cuCtxDestroy");
113 }
114
115 void CudaBackend::shortDeviceInfo() {
116 char name[100];
117 CUDA_CHECK(cuDeviceGetName(name, sizeof(name), device), "cuDeviceGetName");
118 std::cout << "[INFO] Using NVIDIA GPU: " << name << std::endl;
119 }
120
121 void CudaBackend::showDeviceInfo() {
122 char name[100];
123 CUDA_CHECK(cuDeviceGetName(name, sizeof(name), device), "cuDeviceGetName");
124
125 std::cout << "> Using device 0: " << name << std::endl;
126
127 // get compute capabilities and the device name
128 int major = 0, minor = 0;
129 CUDA_CHECK(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), "cuDeviceGetAttribute");
130 CUDA_CHECK(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device), "cuDeviceGetAttribute");
131 std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl;
132
133 int warpSize;
134 CUDA_CHECK(cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device), "cuDeviceGetAttribute");
135 std::cout << "> GPU Device has warpSize " << warpSize << std::endl;
136
137 int threadsPerBlock;
138 CUDA_CHECK(cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device), "cuDeviceGetAttribute");
139 std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl;
140
141 int cores;
142 CUDA_CHECK(cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device), "cuDeviceGetAttribute");
143 std::cout << "> GPU Cores " << cores << std::endl;
144
145 size_t totalGlobalMem;
146 CUDA_CHECK(cuDeviceTotalMem(&totalGlobalMem, device), "cuDeviceTotalMem");
147 std::cout << " Total amount of global memory: " << (unsigned long long) totalGlobalMem << std::endl;
148 std::cout << " 64-bit Memory Address: " <<
149 ((totalGlobalMem > static_cast<unsigned long long>(4) * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl;
150 }
151
152 PtxSource *CudaBackend::nvcc(const CudaSource *cudaSource) {
153
154 // create var/cuda directory
155 std::string localDirectory = "./var/cuda";
156 std::filesystem::create_directories(localDirectory);
157 // create temp file for cuda generarated code
158 const uint64_t time = timeSinceEpochMillisec();
159 const std::string ptxPath = tmpFileName(time, localDirectory, ".ptx");
160 const std::string cudaPath = tmpFileName(time, localDirectory, ".cu");
161
162 // compile the generated code
163 int pid;
164 cudaSource->write(cudaPath);
165 if ((pid = fork()) == 0) { //child
166 const auto path = "nvcc";
167 std::vector<std::string> command;
168 command.push_back(path);
169 command.push_back("-ptx");
170 command.push_back("-Wno-deprecated-gpu-targets");
171 command.push_back(cudaPath);
172 if (cudaSource->lineInfo()) {
173 command.push_back("-lineinfo");
174 }
175 command.push_back("-o");
176 command.push_back(ptxPath);
177
178 // conver to char*[]
179 const char* args[command.size() + 1];
180 for (int i = 0; i < command.size(); i++) {
181 args[i] = command[i].c_str();
182 }
183 args[command.size()] = nullptr;
184 const int stat = execvp(path, (char *const *) args);
185 std::cerr << " nvcc stat = " << stat << " errno=" << errno << " '" << std::strerror(errno) << "'" << std::endl;
186 std::exit(errno);
187 } else if (pid < 0) {// fork failed.
188 std::cerr << "fork of nvcc failed" << std::endl;
189 std::exit(1);
190 } else { //parent
191 int status;
192 pid_t result = wait(&status);
193 auto *ptx = new PtxSource();
194 ptx->read(ptxPath);
195 return ptx;
196 }
197 }
198
// Convenience overload: forwards to the pointer-based CUDA-source compile.
CudaBackend::CudaModule *CudaBackend::compile(const CudaSource &cudaSource) {
    return compile(&cudaSource);
}
202
// Compiles CUDA C by first lowering it to PTX via nvcc, then JIT-loading it.
// NOTE(review): the PtxSource returned by nvcc() is heap-allocated and never
// freed here; its text buffer ends up referenced by the returned CudaModule
// (see compile(const PtxSource*)), so a naive delete would dangle that
// pointer -- confirm ownership before adding one.
CudaBackend::CudaModule *CudaBackend::compile(const CudaSource *cudaSource) {
    const PtxSource *ptxSource = nvcc(cudaSource);
    return compile(ptxSource);
}
207
// Convenience overload: forwards to the pointer-based PTX compile.
CudaBackend::CudaModule *CudaBackend::compile(const PtxSource &ptxSource) {
    return compile(&ptxSource);
}
211
212 CudaBackend::CudaModule *CudaBackend::compile(const PtxSource *ptx) {
213 CUmodule module;
214 if (ptx->text != nullptr) {
215 const Log *infLog = new Log(8192);
216 const Log *errLog = new Log(8192);
217 constexpr unsigned int optc = 5;
218 const auto jitOptions = new CUjit_option[optc];
219 auto jitOptVals = new void *[optc];
220
221 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
222 jitOptVals[0] = reinterpret_cast<void *>(infLog->len);
223 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
224 jitOptVals[1] = infLog->text;
225 jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
226 jitOptVals[2] = reinterpret_cast<void *>(errLog->len);
227 jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER;
228 jitOptVals[3] = errLog->text;
229 jitOptions[4] = CU_JIT_GENERATE_LINE_INFO;
230 jitOptVals[4] = reinterpret_cast<void *>(1);
231
232 CUDA_CHECK(cuCtxSetCurrent(context), "cuCtxSetCurrent");
233 CUDA_CHECK(cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals), "cuModuleLoadDataEx");
234
235 if (*infLog->text!='\0'){
236 std::cout << "> PTX JIT inflog:" << std::endl << infLog->text << std::endl;
237 }
238 if (*errLog->text!='\0'){
239 std::cout << "> PTX JIT errlog:" << std::endl << errLog->text << std::endl;
240 }
241 return new CudaModule(this, ptx->text, infLog->text, true, module);
242
243 //delete ptx;
244 } else {
245 std::cout << "no ptx content!" << std::endl;
246 exit(1);
247 }
248 }
249
250 //Entry point from HAT. We use the config PTX bit to determine which Source type
251
252 Backend::CompilationUnit *CudaBackend::compile(const int len, char *source) {
253 if (config->traceCalls) {
254 std::cout << "inside compileProgram" << std::endl;
255 }
256
257 if (config->ptx){
258 if (config->trace) {
259 std::cout << "compiling from provided ptx " << std::endl;
260 }
261 PtxSource ptxSource(len, source, false);
262 return compile(ptxSource);
263 }else{
264 if (config->trace) {
265 std::cout << "compiling from provided cuda " << std::endl;
266 }
267 CudaSource cudaSource(len , source, false, config->profileCudaKernel);
268 return compile(cudaSource);
269 }
270 }
271
272 /*
273
274 if (config->ptx) {
275
276 } else {
277 if (config->trace) {
278 std::cout << "compiling from cuda c99 " << std::endl;
279 }
280 if (config->showCode) {
281 std::cout << "cuda " << source << std::endl;
282 }
283 auto* cuda = new CudaSource(len, source, false);
284 ptx = nvcc(cuda);
285 }
286 if (config->showCode) {
287 std::cout << "ptx " << ptx->text << std::endl;
288 }
289 CUmodule module;
290
291
292 if (ptx->text != nullptr) {
293 constexpr unsigned int jitNumOptions = 2;
294 const auto jitOptions = new CUjit_option[jitNumOptions];
295 const auto jitOptVals = new void *[jitNumOptions];
296
297 // set up size of compilation log buffer
298 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
299 constexpr int jitLogBufferSize = 8192;
300 jitOptVals[0] = reinterpret_cast<void *>(jitLogBufferSize);
301
302 // set up pointer to the compilation log buffer
303 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
304 auto jitLogBuffer = new char[jitLogBufferSize];
305 jitOptVals[1] = jitLogBuffer;
306 cuCtxSetCurrent(context);
307
308 WHERE{
309 .f = __FILE__, .l = __LINE__,
310 .e = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, jitOptVals),
311 .t = "cuModuleLoadDataEx"
312 }.report();
313 if (jitLogBuffer != nullptr && *jitLogBuffer!='\0'){
314 std::cout << "PTX log:" << jitLogBuffer << std::endl;
315 }
316 return new CudaModule(this, ptx->text, jitLogBuffer, true, module);
317 } else {
318 std::cout << "no ptx content!" << std::endl;
319 exit(1);
320 }
321 } */
322
323 extern "C" long getBackend(int mode) {
324 long backendHandle = reinterpret_cast<long>(new CudaBackend(mode));
325 // std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl;
326 return backendHandle;
327 }
328
// Logs the start of a compute. NOTE(review): nothing in this file registers
// it, and the "cl" prefix suggests it was carried over from the OpenCL
// backend -- confirm whether it is still referenced.
void clCallback(void *) {
    std::cerr << "start of compute" << std::endl;
}
332
// Delegates end-of-compute bookkeeping to the backend's queue.
void CudaBackend::computeEnd() {
    queue->computeEnd();
}
336
// Delegates start-of-compute bookkeeping to the backend's queue.
void CudaBackend::computeStart() {
    queue->computeStart();
}
340
341 bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength) {
342 if (config->traceCalls) {
343 std::cout << "getBufferFromDeviceIfDirty(" << std::hex << reinterpret_cast<long>(memorySegment) << "," <<
344 std::dec << memorySegmentLength << "){" << std::endl;
345 }
346 if (config->minimizeCopies) {
347 const BufferState *bufferState = BufferState::of(memorySegment, memorySegmentLength);
348 if (bufferState->state == BufferState::DEVICE_OWNED) {
349 queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr));
350 if (config->traceEnqueues | config->traceCopies) {
351 std::cout << "copying buffer from device (from java access) " << std::endl;
352 }
353 queue->wait();
354 queue->release();
355 } else {
356 std::cout << "HOW DID WE GET HERE 1 attempting to get buffer but buffer is not device dirty" << std::endl;
357 std::exit(1);
358 }
359 } else {
360 std::cerr <<
361 "HOW DID WE GET HERE ? java side should avoid calling getBufferFromDeviceIfDirty as we are not minimising buffers!"
362 << std::endl;
363 std::exit(1);
364 }
365 if (config->traceCalls) {
366 std::cout << "}getBufferFromDeviceIfDirty()" << std::endl;
367 }
368 return true;
369 }
370
// Recovers the CudaBackend* from a handle previously produced by getBackend().
CudaBackend *CudaBackend::of(const long backendHandle) {
    return reinterpret_cast<CudaBackend *>(backendHandle);
}
374
// Checked downcast from the generic Backend; yields nullptr when the backend
// is not actually a CudaBackend.
CudaBackend *CudaBackend::of(Backend *backend) {
    return dynamic_cast<CudaBackend *>(backend);
}
378
379 CudaBackend::CudaBuffer *CudaBackend::getOrCreateBuffer(BufferState *bufferState) {
380 CudaBuffer *cudaBuffer = nullptr;
381 if (bufferState->vendorPtr == nullptr || bufferState->state == BufferState::NEW_STATE) {
382 cudaBuffer = new CudaBuffer(this, bufferState);
383 if (config->trace) {
384 std::cout << "We allocated arg buffer " << std::endl;
385 }
386 bufferState->state = BufferState::NEW_STATE;
387 } else {
388 if (config->trace) {
389 std::cout << "Were reusing buffer buffer " << std::endl;
390 }
391 cudaBuffer = static_cast<CudaBuffer *>(bufferState->vendorPtr);
392 }
393 return cudaBuffer;
394 }