# Minimizing Buffer Transfers

----

* [Contents](hat-00.md)
* House Keeping
    * [Project Layout](hat-01-01-project-layout.md)
    * [Building Babylon](hat-01-02-building-babylon.md)
    * [Building HAT](hat-01-03-building-hat.md)
* Programming Model
    * [Programming Model](hat-03-programming-model.md)
* Interface Mapping
    * [Interface Mapping Overview](hat-04-01-interface-mapping.md)
    * [Cascade Interface Mapping](hat-04-02-cascade-interface-mapping.md)
* Implementation Detail
    * [Walkthrough Of Accelerator.compute()](hat-accelerator-compute.md)
    * [How we minimize buffer transfers](hat-minimizing-buffer-transfers.md)

----

## Using buffer marking to minimize data transfers

### The naive approach
The default execution model is that at each kernel
dispatch the backend just copies all arg buffers to
the device, and after the dispatch it copies all arg
buffers back.

### Using kernel arg buffer access patterns
If we knew how each kernel accesses its args (via static analysis of the code model or
by marking the args RO, RW or WO with annotations) we could avoid some copies by only
copying in if the kernel reads the arg buffer and only copying out if the
kernel writes to the arg buffer.

Let's use the game of life as an example.

We assume that the UI only needs updating at some 'rate' (say 5 fps), but the kernels can generate
generations faster than 5 generations per second.

So not every generation needs to be copied back from the device.

We'll ignore the detail regarding the `life` kernel, and we will assume the kernel args
are appropriately annotated as RO, RW or WO.

```java
@CodeReflection
public static void life(@RO KernelContext kc, @RO Control control, @RW CellGrid cellGrid) {
    if (kc.x < kc.maxX) {
        Compute.lifePerIdx(kc.x, control, cellGrid);
    }
}

@CodeReflection
static public void compute(final @RO ComputeContext cc,
                           Viewer viewer, @RO Control control, @RW CellGrid cellGrid) {
    var timeOfLastUIUpdate = System.currentTimeMillis();
    var msPerFrame = 1000/5; // we want 5 fps
    while (viewer.state.generation < viewer.state.maxGenerations) {
        long now = System.currentTimeMillis();
        var msSinceLastUpdate = (now - timeOfLastUIUpdate);
        var updateNeeded = (msSinceLastUpdate > msPerFrame);

        cc.dispatchKernel(cellGrid.width() * cellGrid.height(),
                kc -> Compute.life(kc, control, cellGrid)
        );

        // Here we are swapping from<->to on the control buffer
        int to = control.from();
        control.from(control.to());
        control.to(to);

        if (updateNeeded) {
            viewer.update(now, to, cellGrid);
            timeOfLastUIUpdate = now;
        }
    }
}
```

First, let's assume there were no automatic transfers and we had to define them ourselves.
We would have to control the transfers explicitly by inserting the copy code by hand.

What would our code look like?

```java
@CodeReflection
public static void life(@RO KernelContext kc, @RO Control control, @RW CellGrid cellGrid) {
    if (kc.x < kc.maxX) {
        Compute.lifePerIdx(kc.x, control, cellGrid);
    }
}

@CodeReflection
static public void compute(final @RO ComputeContext cc,
                           Viewer viewer, @RO Control control, @RW CellGrid cellGrid) {
    var timeOfLastUIUpdate = System.currentTimeMillis();
    var msPerFrame = 1000/5; // we want 5 fps
    var cellGridIsJavaDirty = true;
    var controlIsJavaDirty = true;
    var cellGridIsDeviceDirty = true;
    var controlIsDeviceDirty = true;
    while (true) {
        long now = System.currentTimeMillis();
        var msSinceLastUpdate = (now - timeOfLastUIUpdate);
        var updateNeeded = (msSinceLastUpdate > msPerFrame);

        if (cellGridIsJavaDirty){
            cc.copyToDevice(cellGrid);
        }
        if (controlIsJavaDirty){
            cc.copyToDevice(control);
        }
        cc.dispatchKernel(cellGrid.width() * cellGrid.height(),
                kc -> Compute.life(kc, control, cellGrid)
        );
        controlIsDeviceDirty = false; // Compute.life marked control as @RO
        cellGridIsDeviceDirty = true; // Compute.life marked cellGrid as @RW

        // Here we are swapping from<->to on the control buffer
        if (controlIsDeviceDirty){
            cc.copyFromDevice(control);
        }
        int to = control.from();
        control.from(control.to());
        control.to(to);
        controlIsJavaDirty = true;

        if (updateNeeded) {
            if (cellGridIsDeviceDirty){
                cc.copyFromDevice(cellGrid);
            }
            viewer.update(now, to, cellGrid);
            timeOfLastUIUpdate = now;
        }
    }
}
```

Alternatively, what if the buffers themselves could hold the `DeviceDirty` and `JavaDirty` flags?


```java
@CodeReflection
public static void life(@RO KernelContext kc, @RO Control control, @RW CellGrid cellGrid) {
    if (kc.x < kc.maxX) {
        Compute.lifePerIdx(kc.x, control, cellGrid);
    }
}

@CodeReflection
static public void compute(final @RO ComputeContext cc,
                           Viewer viewer, @RO Control control, @RW CellGrid cellGrid) {
    control.flags =JavaDirty; // not ideal but necessary
    cellGrid.flags = JavaDirty; // not ideal but necessary

    var timeOfLastUIUpdate = System.currentTimeMillis();
    var msPerFrame = 1000/5; // we want 5 fps

    while (true) {
        long now = System.currentTimeMillis();
        var msSinceLastUpdate = (now - timeOfLastUIUpdate);
        var updateNeeded = (msSinceLastUpdate > msPerFrame);

        if ((cellGrid.flags & JavaDirty) == JavaDirty){
            cc.copyToDevice(cellGrid);
        }
        if ((control.flags & JavaDirty) == JavaDirty){
            cc.copyToDevice(control);
        }
        cc.dispatchKernel(cellGrid.width() * cellGrid.height(),
                kc -> Compute.life(kc, control, cellGrid)
        );
        control.flags = JavaDirty; // Compute.life marked control as @RO
        cellGrid.flags = DeviceDirty; // Compute.life marked cellGrid as @RW

        // Here we are swapping from<->to on the control buffer
        if ((control.flags & DeviceDirty)==DeviceDirty){
            cc.copyFromDevice(control);
        }
        int to = control.from();
        control.from(control.to());
        control.to(to);
        control.flags = JavaDirty;

        if (updateNeeded) {
            if ((cellGrid.flags & DeviceDirty)==DeviceDirty){
                cc.copyFromDevice(cellGrid);
            }
            viewer.update(now, to, cellGrid);
            // update does not mutate cellGrid so cellGrid.flags = DeviceDirty
            timeOfLastUIUpdate = now;
        }
    }
}
```

Essentially, we defer to the kernel dispatch to determine whether buffers are
copied to the device, and to mark buffers accordingly if the dispatch mutated the buffer.

Pseudo-code for dispatch is
essentially
```java

void dispatchKernel(Kernel kernel, KernelContext kc, Arg ... args) {
    for (int argn = 0; argn<args.length; argn++){
        Arg arg = args[argn];
        if (((arg.flags &JavaDirty)==JavaDirty) && kernel.readsFrom(arg)) {
            enqueueCopyToDevice(arg);
        }
    }
    enqueueKernel(kernel);
    for (int argn = 0; argn<args.length; argn++){
        Arg arg = args[argn];
        if (kernel.writesTo(arg)) {
            arg.flags = DeviceDirty;
        }
    }
}
```
We rely on Babylon to mark each buffer passed to the compute method as `JavaDirty`.

```java

@CodeReflection
static public void compute(final @RO ComputeContext cc,
                           Viewer viewer, @RO Control control, @RW CellGrid cellGrid) {
    control.flags = JavaDirty;
    cellGrid.flags = JavaDirty;
    // yada yada
}
```

We also rely on Babylon to inject code before each buffer access from Java in the compute code.

So the injected code would look like this.

```java

@CodeReflection
static public void compute(final @RO ComputeContext cc,
                           Viewer viewer, @RO Control control, @RW CellGrid cellGrid) {
    control.flags =JavaDirty; // injected by babylon
    cellGrid.flags = JavaDirty; // injected by babylon

    var timeOfLastUIUpdate = System.currentTimeMillis();
    var msPerFrame = 1000/5; // we want 5 fps
    while (true) {
        long now = System.currentTimeMillis();
        var msSinceLastUpdate = (now - timeOfLastUIUpdate);
        var updateNeeded = (msSinceLastUpdate > msPerFrame);

        // See the pseudo-code above to see how dispatchKernel
        // only copies buffers that need copying, and marks
        // buffers it has mutated as dirty
        cc.dispatchKernel(cellGrid.width() * cellGrid.height(),
                kc -> Compute.life(kc, control, cellGrid)
        );

        // injected by babylon
        if ((control.flags & DeviceDirty)==DeviceDirty){
            cc.copyFromDevice(control);
        }
        // Here we are swapping from<->to on the control buffer
        int to = control.from();

        control.from(control.to());
        control.flags = JavaDirty; // injected
        control.to(to);
        control.flags = JavaDirty; // injected, but can be avoided

        if (updateNeeded) {
            // Injected by babylon because cellGrid escapes compute
            // and because viewer.update marks cellGrid as @RO
            if ((cellGrid.flags & DeviceDirty)==DeviceDirty){
                cc.copyFromDevice(cellGrid);
            }
            viewer.update(now, to, cellGrid);
            // We don't copy cellGrid back after escape because
            // viewer.update annotates cellGrid access as RO
            timeOfLastUIUpdate = now;
        }
    }
}
```