1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
#include <sys/wait.h>

#include <chrono>
#include <cstdint>
#include <iostream>
#include <thread>

#include "cuda_backend.h"
 30 
 31 CudaBackend::CudaQueue::CudaQueue(Backend *backend)
 32         : Backend::Queue(backend),cuStream(),streamCreationThread() {
 33 }
 34 void CudaBackend::CudaQueue::init(){
 35     streamCreationThread = std::this_thread::get_id();
 36     if (backend->config->traceCalls){
 37         std::cout << "init() 0x"
 38                   << " thread=" <<streamCreationThread
 39                   << std::endl;
 40     }
 41 
 42     WHERE{.f=__FILE__ , .l=__LINE__,
 43           .e=cuStreamCreate(&cuStream,CU_STREAM_DEFAULT),
 44           .t= "cuStreamCreate"
 45     }.report();
 46 
 47     if (backend->config->traceCalls){
 48         std::cout << "exiting init() 0x"
 49                   << " custream=" <<std::hex<<streamCreationThread <<std::dec
 50                   << std::endl;
 51     }
 52     }
 53 
 54 void CudaBackend::CudaQueue::wait(){
 55     WHERE{.f=__FILE__, .l=__LINE__,
 56           .e=cuStreamSynchronize(cuStream),
 57           .t= "cuStreamSynchronize"
 58     }.report();
 59 }
 60 
 61 
 62 void CudaBackend::CudaQueue::computeStart(){
 63     wait(); // should be no-op
 64     release(); // also ;
 65 }
 66 
 67 
 68 
 69 void CudaBackend::CudaQueue::computeEnd(){
 70 
 71 }
 72 
 73 
 74 
 75 
 76 void CudaBackend::CudaQueue::release(){
 77 
 78 }
 79 
 80 CudaBackend::CudaQueue::~CudaQueue(){
 81     WHERE{.f=__FILE__, .l=__LINE__,
 82             .e=cuStreamDestroy(cuStream),
 83             .t= "cuStreamDestroy"
 84     }.report();
 85 }
 86 
 87 void CudaBackend::CudaQueue::copyToDevice(Buffer *buffer) {
 88     const auto *cudaBuffer = dynamic_cast<CudaBuffer *>(buffer);
 89     const std::thread::id thread_id = std::this_thread::get_id();
 90     if (thread_id != streamCreationThread){
 91         std::cout << "copyToDevice()  thread=" <<thread_id<< " != "<< streamCreationThread<< std::endl;
 92     }
 93     if (backend->config->traceCalls) {
 94 
 95         std::cout << "copyToDevice() 0x"
 96                 << std::hex<<cudaBuffer->bufferState->length<<std::dec << "/"
 97                 << cudaBuffer->bufferState->length << " "
 98                 << "devptr=" << std::hex<<  static_cast<long>(cudaBuffer->devicePtr) <<std::dec
 99                 << " thread=" <<thread_id
100                   << std::endl;
101     }
102     WHERE{.f=__FILE__, .l=__LINE__,
103             .e=cuMemcpyHtoDAsync(
104                     cudaBuffer->devicePtr,
105                     cudaBuffer->bufferState->ptr,
106                     cudaBuffer->bufferState->length,
107                     dynamic_cast<CudaQueue*>(backend->queue)->cuStream),
108             .t="cuMemcpyHtoDAsync"
109     }.report();
110 
111 }
112 
113 void CudaBackend::CudaQueue::copyFromDevice(Buffer *buffer) {
114     const auto *cudaBuffer = dynamic_cast<CudaBuffer *>(buffer);
115     const std::thread::id thread_id = std::this_thread::get_id();
116     if (thread_id != streamCreationThread){
117         std::cout << "copyFromDevice()  thread=" <<thread_id<< " != "<< streamCreationThread<< std::endl;
118     }
119     if (backend->config->traceCalls) {
120 
121         std::cout << "copyFromDevice() 0x"
122                   << std::hex<<cudaBuffer->bufferState->length<<std::dec << "/"
123                   << cudaBuffer->bufferState->length << " "
124                   << "devptr=" << std::hex<<  static_cast<long>(cudaBuffer->devicePtr) <<std::dec
125                 << " thread=" <<thread_id
126                   << std::endl;
127     }
128 
129 
130     WHERE{.f=__FILE__, .l=__LINE__,
131             .e=cuMemcpyDtoHAsync(
132                     cudaBuffer->bufferState->ptr,
133                     cudaBuffer->devicePtr,
134                     cudaBuffer->bufferState->length,
135                                  dynamic_cast<CudaQueue*>(backend->queue)->cuStream),
136             .t="cuMemcpyDtoHAsync"
137     }.report();
138 
139 }
140 
141 void CudaBackend::CudaQueue::dispatch(KernelContext *kernelContext, CompilationUnit::Kernel *kernel) {
142     const auto cudaKernel = dynamic_cast<CudaModule::CudaKernel *>(kernel);
143 
144     const int range = kernelContext->maxX;
145     int rangediv1024 = range / 1024;
146     int rangemod1024 = range % 1024;
147     if (rangemod1024 > 0) {
148         rangediv1024++;
149     }
150 // std::cout << "Running the kernel..." << std::endl;
151 // std::cout << "   Requested range   = " << range << std::endl;
152 // std::cout << "   Range mod 1024    = " << rangemod1024 << std::endl;
153 // std::cout << "   Actual range 1024 = " << (rangediv1024 * 1024) << std::endl;
154 //  auto status= static_cast<CUresult>(cudaStreamSynchronize(cudaBackend->cudaQueue.cuStream));
155 
156 //  cudaBackend->cudaQueue.wait();
157     const std::thread::id thread_id = std::this_thread::get_id();
158     if (thread_id != streamCreationThread){
159         std::cout << "dispatch()  thread=" <<thread_id<< " != "<< streamCreationThread<< std::endl;
160     }
161 
162     const auto status = cuLaunchKernel(cudaKernel->function,
163                                  rangediv1024, 1, 1,
164                                  1024, 1, 1,
165                                  0, cuStream,
166                                  cudaKernel->argslist, nullptr);
167 
168     WHERE{.f=__FILE__, .l=__LINE__, .e=status, .t="cuLaunchKernel"}.report();
169 }