1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 #include <sys/wait.h>
 27 #include <chrono>
 28 #include <thread>
 29 #include "cuda_backend.h"
 30 
 31 CudaBackend::CudaQueue::CudaQueue(Backend *backend)
 32         : Backend::Queue(backend),cuStream(),streamCreationThread() {
 33 }
 34 void CudaBackend::CudaQueue::init(){
 35     streamCreationThread = std::this_thread::get_id();
 36     if (backend->config->traceCalls){
 37         std::cout << "init() 0x"
 38                   << " thread=" <<streamCreationThread
 39                   << std::endl;
 40     }
 41 
 42     WHERE{.f=__FILE__ , .l=__LINE__,
 43           .e=cuStreamCreate(&cuStream,CU_STREAM_DEFAULT),
 44           .t= "cuStreamCreate"
 45     }.report();
 46 
 47     if (backend->config->traceCalls){
 48         std::cout << "exiting init() 0x"
 49                   << " custream=" <<std::hex<<streamCreationThread <<std::dec
 50                   << std::endl;
 51     }
 52     }
 53 
 54 //void CudaBackend::CudaQueue::sync(const char *file, int line) const {
 55 
 56 //}
 57 
 58 
 59 void CudaBackend::CudaQueue::wait(){
 60     WHERE{.f=__FILE__, .l=__LINE__,
 61           .e=cuStreamSynchronize(cuStream),
 62           .t= "cuStreamSynchronize"
 63     }.report();
 64 }
 65 
 66 
 67 void CudaBackend::CudaQueue::computeStart(){
 68     wait(); // should be no-op
 69     release(); // also ;
 70 }
 71 
 72 
 73 
// Mark the end of a compute pass. Intentionally empty: nothing is flushed or
// synchronized here — presumably callers drive synchronization explicitly via
// wait(); TODO(review) confirm against call sites.
void CudaBackend::CudaQueue::computeEnd(){

}
 77 
 78 
 79 
 80 
// Release queue-held resources. Currently a no-op (nothing is retained between
// computeStart()/computeEnd()); kept as an extension point — TODO(review)
// confirm nothing (e.g. events) is expected to be freed here.
void CudaBackend::CudaQueue::release(){

}
 84 
 85 CudaBackend::CudaQueue::~CudaQueue(){
 86    // delete []events;
 87     WHERE{.f=__FILE__, .l=__LINE__,
 88             .e=cuStreamDestroy(cuStream),
 89             .t= "cuStreamDestroy"
 90     }.report();
 91 }
 92 
 93 void CudaBackend::CudaQueue::copyToDevice(Buffer *buffer) {
 94     //auto cudaBackend = dynamic_cast<CudaBackend*>(backend);
 95     auto *cudaBuffer = dynamic_cast<CudaBuffer *>(buffer);
 96     std::thread::id thread_id = std::this_thread::get_id();
 97     if (thread_id != streamCreationThread){
 98         std::cout << "copyToDevice()  thread=" <<thread_id<< " != "<< streamCreationThread<< std::endl;
 99     }
100     if (backend->config->traceCalls) {
101 
102         std::cout << "copyToDevice() 0x"
103                 << std::hex<<cudaBuffer->bufferState->length<<std::dec << "/"
104                 << cudaBuffer->bufferState->length << " "
105                 << "devptr=" << std::hex<<  (long)cudaBuffer->devicePtr <<std::dec
106                 << " thread=" <<thread_id
107                   << std::endl;
108     }
109     WHERE{.f=__FILE__, .l=__LINE__,
110             .e=cuMemcpyHtoDAsync(
111                     cudaBuffer->devicePtr,
112                     cudaBuffer->bufferState->ptr,
113                     cudaBuffer->bufferState->length,
114                     dynamic_cast<CudaQueue*>(backend->queue)->cuStream),
115             .t="cuMemcpyHtoDAsync"
116     }.report();
117 
118 }
119 
120 void CudaBackend::CudaQueue::copyFromDevice(Buffer *buffer) {
121     auto *cudaBuffer = dynamic_cast<CudaBuffer *>(buffer);
122     //auto cudaBackend = dynamic_cast<CudaBackend*>(backend);
123     std::thread::id thread_id = std::this_thread::get_id();
124     if (thread_id != streamCreationThread){
125         std::cout << "copyFromDevice()  thread=" <<thread_id<< " != "<< streamCreationThread<< std::endl;
126     }
127     if (backend->config->traceCalls) {
128 
129         std::cout << "copyFromDevice() 0x"
130                   << std::hex<<cudaBuffer->bufferState->length<<std::dec << "/"
131                   << cudaBuffer->bufferState->length << " "
132                   << "devptr=" << std::hex<<  (long)cudaBuffer->devicePtr <<std::dec
133                 << " thread=" <<thread_id
134                   << std::endl;
135     }
136 
137 
138     WHERE{.f=__FILE__, .l=__LINE__,
139             .e=cuMemcpyDtoHAsync(
140                     cudaBuffer->bufferState->ptr,
141                     cudaBuffer->devicePtr,
142                     cudaBuffer->bufferState->length,
143                                  dynamic_cast<CudaQueue*>(backend->queue)->cuStream),
144             .t="cuMemcpyDtoHAsync"
145     }.report();
146 
147 }
148 
149 void CudaBackend::CudaQueue::dispatch(KernelContext *kernelContext, CompilationUnit::Kernel *kernel) {
150     auto cudaKernel = dynamic_cast<CudaModule::CudaKernel *>(kernel);
151 
152     int range = kernelContext->maxX;
153     int rangediv1024 = range / 1024;
154     int rangemod1024 = range % 1024;
155     if (rangemod1024 > 0) {
156         rangediv1024++;
157     }
158 // std::cout << "Running the kernel..." << std::endl;
159 // std::cout << "   Requested range   = " << range << std::endl;
160 // std::cout << "   Range mod 1024    = " << rangemod1024 << std::endl;
161 // std::cout << "   Actual range 1024 = " << (rangediv1024 * 1024) << std::endl;
162 //  auto status= static_cast<CUresult>(cudaStreamSynchronize(cudaBackend->cudaQueue.cuStream));
163 
164 //  cudaBackend->cudaQueue.wait();
165     std::thread::id thread_id = std::this_thread::get_id();
166     if (thread_id != streamCreationThread){
167         std::cout << "dispatch()  thread=" <<thread_id<< " != "<< streamCreationThread<< std::endl;
168     }
169 
170     auto status = cuLaunchKernel(cudaKernel->function,
171                                  rangediv1024, 1, 1,
172                                  1024, 1, 1,
173                                  0, cuStream,
174                                  cudaKernel->argslist, nullptr);
175 
176     WHERE{.f=__FILE__, .l=__LINE__, .e=status, .t="cuLaunchKernel"}.report();
177 }