1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 package violajones.attic;
 26 
 27 
 28 import hat.Accelerator;
 29 import hat.NDRange;
 30 import hat.backend.java.WorkStealer;
 31 import hat.buffer.F32Array2D;
 32 import org.xml.sax.SAXException;
 33 import violajones.Viewer;
 34 import violajones.XMLHaarCascadeModel;
 35 import hat.buffer.S08x3RGBImage;
 36 import violajones.ifaces.Cascade;
 37 import violajones.ifaces.ResultTable;
 38 import violajones.ifaces.ScaleTable;
 39 
 40 import javax.imageio.ImageIO;
 41 import javax.xml.parsers.ParserConfigurationException;
 42 import java.awt.image.BufferedImage;
 43 import java.io.IOException;
 44 import java.lang.invoke.MethodHandles;
 45 import java.util.Objects;
 46 
 47 public class ViolaJones {
 48 
 49     public static void main(String[] _args) throws IOException, ParserConfigurationException, SAXException {
 50         Accelerator accelerator = new Accelerator(MethodHandles.lookup());
 51 
 52 
 53         BufferedImage nasa = ImageIO.read(Objects.requireNonNull(ViolaJones.class.getResourceAsStream("/images/Nasa1996.jpg")));
 54         XMLHaarCascadeModel xmlCascade = XMLHaarCascadeModel.load(ViolaJonesRaw.class.getResourceAsStream("/cascades/haarcascade_frontalface_default.xml"));
 55    //     Cascade cascade = Cascade.create(accelerator, xmlHaarCascade);
 56         var cascade = Cascade.createFrom(accelerator,xmlCascade);
 57         var width = nasa.getWidth();
 58         var height = nasa.getHeight();
 59         S08x3RGBImage rgbImage = S08x3RGBImage.create(accelerator,width,height);
 60 
 61 
 62         // harViz.showIntegrals();
 63 
 64         var scaleTable = ScaleTable.createFrom(accelerator,new ScaleTable.Constraints(cascade,width,height));
 65 
 66 
 67         var greyImageF32 = F32Array2D.create(accelerator, width, height);
 68         var integralImageF32 = F32Array2D.create(accelerator, width, height);
 69         var integralSqImageF32 = F32Array2D.create(accelerator, width, height);
 70         var resultTable = ResultTable.create(accelerator, 1000);
 71         CoreJavaViolaJones.rgbToGreyScale(rgbImage, greyImageF32);
 72         CoreJavaViolaJones.createIntegralImage(greyImageF32, integralImageF32, integralSqImageF32);
 73 
 74         Viewer harViz = new Viewer(accelerator, nasa, rgbImage, cascade, integralImageF32, integralSqImageF32);
 75 
 76         harViz.showIntegrals();
 77 
 78 
 79         //   long floatToShortKernel = accelerator.bridge.getKernel(progHandle, "floatToShortKernel");
 80         //   long integralColKernel = accelerator.bridge.getKernel(progHandle, "integralColKernel");
 81         //   long integralRowKernel = accelerator.bridge.getKernel(progHandle, "integralRowKernel");
 82         //   long singlePassCascadeKernel = accelerator.bridge.getKernel(progHandle, "singlePassCascadeKernel");
 83 
 84 
 85         // openCLBridge.dump(cascadeMemorySegment, ((OpenCLStructLayout.Tools)cascadeInterface).getLayout());
 86         // openCLBridge.dump(scaleTable,scaleTableLayout.layout);
 87 
 88 
 89         //   openCLBridge.dump(treeTable, treeTableLayout.layout);
 90         // openCLBridge.dump(stageTable, stageTableLayout.layout);
 91         //   FloatBuffer integralImage = FloatBuffer.create(accelerator, rgbImageLayout.getElementCount());
 92         //   FloatBuffer integralSqImage = FloatBuffer.create(accelerator, rgbImageLayout.getElementCount());
 93 
 94         // MemorySegment integralImageMemorySegment = arena.allocateArray(JAVA_FLOAT, rgbImageLayout.getElementCount());
 95         // MemorySegment integralSqImageMemorySegment = arena.allocateArray(JAVA_FLOAT, rgbImageLayout.getElementCount());
 96 
 97         // openCLBridge.dump(cascadeMemorySegment, cascadeLayout.layout);
 98 
 99 
100         int groupSize = 256;
101         int rangeModGroupSize = scaleTable.rangeModGroupSize(groupSize);
102         //(scaleTable.multiScaleAccumulativeRange() / groupSize) + ((scaleTable.multiScaleAccumulativeRange() % groupSize) == 0 ? 0 : 1)) * groupSize;
103 
104 /*
105         OpenCLCodeBuilder c99 = (OpenCLCodeBuilder) accelerator.getCodeBuilder();
106         c99
107                 .typedef(FeatureTable.Feature.RectTable.Rect.class)
108                 .typedef(FeatureTable.Feature.LinkOrValue.class)
109                 .typedef(FeatureTable.Feature.class)
110                 .typedef(ScaleTable.Scale.class)
111                 .typedef(StageTable.Stage.class)
112                 .typedef(TreeTable.Tree.class)
113                 .typedef(ResultTable.Result.class)
114                 .typedef(Cascade.class)
115                 .append("""
116 
117                 #define SCOPE_START  ndrange_t ndrange;ndrange.id.x=get_global_id(0);ndrange.id.maxX=get_global_size(0);
118                 #ifdef NDRANGE_CUDA
119                 #define atomicInc(p) atomicAdd(p, 1)
120                 #else
121                 #define atomicInc(p) atom_add(p, 1)
122                 #endif
123 
124                 int b2i(i4 v){
125                    return v < 0 ? 256 + v : v;
126                 }
127                 int rgbToGrey(i4 r, i4 g, i4 b){
128                    return (29 * b2i(r) + 60 * b2i(g) + 11 * b2i(b)) / 100;
129                 }
130                 void integralColById(i4 id, __global cascade_t *cascadeContext, __global b1 *rgb, __global f4 *integral, __global f4 *integralSq){
131                    integralSq[id] = integral[id] = 0.0f;
132                    for (s32_t y = 1; y < cascadeContext->imageHeight; y++) {
133                        s32_t monoOffset = (y * cascadeContext->imageWidth) + id;
134                        f32_t lastSq = integralSq[monoOffset - cascadeContext->imageWidth];
135                        f32_t last = integral[monoOffset - cascadeContext->imageWidth];
136                        char r = rgb[monoOffset * 3 + 0];
137                        char g = rgb[monoOffset * 3 + 1];
138                        char b = rgb[monoOffset * 3 + 2];
139                        f32_t greyValue = rgbToGrey(r, g, b);
140                        f32_t greyValueSq = greyValue * greyValue;
141                        integralSq[monoOffset] = greyValueSq + lastSq;
142                        integral[monoOffset] = greyValue + last;
143                    }
144                 }
145                 __kernel void integralColKernel(__global cascade_t *cascadeContext, __global b1 *rgb, __global f4 *integral, __global f4 *integralSq){
146                      SCOPE_START
147                      integralColById(ndrange.id.x, cascadeContext, rgb,  integral, integralSq);
148                 }
149                 void integralRowById(i4 id, __global cascade_t *cascadeContext, __global f4 *integral, __global f4 *integralSq){
150                      for (s32_t x = 1; x < cascadeContext->imageWidth; x++) {
151                         s32_t monoOffset = (id * cascadeContext->imageWidth) + x;
152                         integral[monoOffset] = integral[monoOffset] + integral[monoOffset - 1];
153                      }
154                      for (s32_t x = 1; x < cascadeContext->imageWidth; x++) {
155                         s32_t monoOffset = (id * cascadeContext->imageWidth) + x;
156                         integralSq[monoOffset] = integralSq[monoOffset] + integralSq[monoOffset - 1];
157                      }
158                 }
159                 __kernel void integralRowKernel(__global cascade_t *cascadeContext, __global f4 *integral, __global f4 *integralSq){
160                    SCOPE_START
161                    integralRowById(ndrange.id.x,  cascadeContext, integral, integralSq);
162                 }
163                 __kernel void floatToShortKernel(__global cascade_t *cascadeContext, __global f4 *fromIntegral, __global s2 *toIntegral, __global f4 *fromIntegralSq, __global s2 *toIntegralSq){
164                    SCOPE_START
165                    toIntegral[ndrange.id.x] = (s16_t)(fromIntegral[ndrange.id.x]*(65536/fromIntegral[ndrange.id.maxX-1]));
166                    toIntegralSq[ndrange.id.x] = (s16_t)(fromIntegralSq[ndrange.id.x]*(65536/fromIntegralSq[ndrange.id.maxX-1]));
167                 }
168 
169 
170                 /
171                       A +-------+ B
172                         |       |       D-B-C+A
173                       C +-------+ D
174                 /
175                 float gradient(__global f4 *image, i4 imageWidth, i4 x, i4 y, i4 width, i4 height){
176                    f32_t A = image[(y * imageWidth) + x];
177                    f32_t D = image[((y + height) * imageWidth) + x + width];
178                    f32_t C = image[((y + height) * imageWidth) + x];
179                    f32_t B = image[(y * imageWidth) + x + width];
180                    return D-B-C+A;
181                 }
182                 boolean isAFaceStage(__global cascade_t *cascadeContext, __global scale_t *scale, i4 x, i4 y, f4 vnorm, __global f4 *integral, __global stage_t *stagePtr, __global tree_t *treeTable, __global feature_t *featureTable){
183                    f32_t sumOfThisStage = 0;
184                    for (s32_t treeId = stagePtr->firstTreeId; treeId < (stagePtr->firstTreeId+stagePtr->treeCount); treeId++) {
185                        // featureId from 0 to how many roots there are.... we use -1 for none! hence s32_t
186                        const __global tree_t *treePtr = &treeTable[treeId];
187                        s32_t featureId = treePtr->firstFeatureId;
188                        while (featureId >= 0) {
189                            const __global feature_t *featurePtr = &featureTable[featureId];
190                            f32_t featureGradientSum = .0f;
191                            for (s32_t i = 0; i < 3; i++) {
192                                const __global rect_t  *rect = &featurePtr->rects[i];
193                                featureGradientSum +=   featurePtr->rects[i].weight *
194                                    gradient(integral,  cascadeContext->imageWidth,
195                                        x + (int) (rect->x * scale->scaleValue),
196                                        y + (int) (rect->y * scale->scaleValue),
197                                        (int) (rect->width * scale->scaleValue),
198                                        (int) (rect->height * scale->scaleValue)
199                                    ) ;
200                            }
201                            if ((featureGradientSum * scale->invArea) < (featurePtr->threshold * vnorm)) {//left
202                               if (featurePtr->left.hasValue) {
203                                   sumOfThisStage += featurePtr->left.anon.value;
204                                   featureId = -1;
205                               } else {
206                                   featureId = treePtr->firstFeatureId+featurePtr->left.anon.featureId;
207                               }
208                            }else{ // right
209                               if (featurePtr->right.hasValue) {
210                                   sumOfThisStage += featurePtr->right.anon.value;
211                                   featureId = -1;
212                               } else {
213                                   featureId = treePtr->firstFeatureId+featurePtr->right.anon.featureId;
214                               }
215                            }
216                        }
217                    }
218                    return sumOfThisStage > stagePtr->threshold;
219                 }
220                 __kernel void singlePassCascadeKernel(__global cascade_t *cascadeContext, __global f4 *integral, __global f4 *integralSq, __global scale_t *scaleTable, __global result_t *resultTable, __global stage_t *stageTable, __global tree_t *treeTable, __global feature_t *featureTable){
221                    SCOPE_START
222 
223                    size_t gid = ndrange.id.x;
224                    if (gid < cascadeContext->multiScaleAccumulativeRange){
225                       s32_t i;
226                       // This is where we select the scale to use.
227                       for (i=0; gid >=scaleTable[i].accumGridSizeMax; i++)
228                          ;
229 
230                       __global scale_t *scale = &scaleTable[i];
231 
232                       s16_t x = (s16_t)(((gid-scale->accumGridSizeMin) % scale->gridWidth) * scale->scaledXInc);
233                       s16_t y = (s16_t)(((gid-scale->accumGridSizeMin) / scale->gridWidth) * scale->scaledYInc);
234 
235                       f32_t integralGradient = gradient(integral, cascadeContext->imageWidth, x, y, scale->scaledFeatureWidth, scale->scaledFeatureHeight) * scale->invArea;
236                       f32_t integralSqGradient = gradient(integralSq, cascadeContext->imageWidth, x, y, scale->scaledFeatureWidth, scale->scaledFeatureHeight) * scale->invArea;
237 
238                       f32_t vnorm = integralSqGradient - integralGradient * integralGradient;
239                       vnorm =  (vnorm > 1) ? sqrt(vnorm) : 1;
240 
241                       bool stillLooksLikeAFace = true;
242 
243                       for (s32_t stageId = 0; stillLooksLikeAFace && (stageId < cascadeContext->stageCount); stageId++) {
244                          __global stage_t *stagePtr = &stageTable[stageId];
245                          stillLooksLikeAFace =isAFaceStage(cascadeContext, scale,  x, y,  vnorm,  integral,  stagePtr, treeTable, featureTable);
246                       }
247                       if (stillLooksLikeAFace) {
248                          s32_t index = atomicInc(&cascadeContext->atomicResultTableCount);
249                          if (index<cascadeContext->maxResults){
250                             resultTable[index].x = x;
251                             resultTable[index].y = y;
252                             resultTable[index].width = scale->scaledFeatureWidth;
253                             resultTable[index].height = scale->scaledFeatureHeight;
254                          }
255                        }
256                    }
257                 }
258                 """
259              );
260 
261 
262 
263         long progHandle = accelerator.bridge.compileProgram(c99.toString());
264         if (accelerator.bridge.programOK(progHandle)) {
265             long floatToShortKernel = accelerator.bridge.getKernel(progHandle, "floatToShortKernel");
266             long integralColKernel = accelerator.bridge.getKernel(progHandle, "integralColKernel");
267             long integralRowKernel = accelerator.bridge.getKernel(progHandle, "integralRowKernel");
268             long singlePassCascadeKernel = accelerator.bridge.getKernel(progHandle, "singlePassCascadeKernel");
269 
270 
271             // openCLBridge.dump(cascadeMemorySegment, ((OpenCLStructLayout.Tools)cascadeInterface).getLayout());
272             // openCLBridge.dump(scaleTable,scaleTableLayout.layout);
273 
274 
275             //   openCLBridge.dump(treeTable, treeTableLayout.layout);
276             // openCLBridge.dump(stageTable, stageTableLayout.layout);
277             MemorySegment integralImageMemorySegment = arena.allocateArray(JAVA_FLOAT, rgbImageLayout.getElementCount());
278             MemorySegment integralSqImageMemorySegment = arena.allocateArray(JAVA_FLOAT, rgbImageLayout.getElementCount());
279 
280             // openCLBridge.dump(cascadeMemorySegment, cascadeLayout.layout);
281 
282             int groupSize = 256;
283             int range = ((multiScaleTable.multiScaleAccumulativeRange / groupSize) + ((multiScaleTable.multiScaleAccumulativeRange % groupSize) == 0 ? 0 : 1)) * groupSize;
284 
285             ImageLayout integralImage = new ImageLayout(new BufferedImage(rgbImageLayout.getWidth(), rgbImageLayout.getHeight(), BufferedImage.TYPE_USHORT_GRAY));
286             ImageLayout integralSqImage = new ImageLayout(new BufferedImage(rgbImageLayout.getWidth(), rgbImageLayout.getHeight(), BufferedImage.TYPE_USHORT_GRAY));
287             ImageLayout.Instance integralImageInstance = integralImage.instance(arena);
288             ImageLayout.Instance integralSqImageInstance = integralSqImage.instance(arena);
289 
290             ImageLayout.Instance rgbImageLayoutInstance = rgbImageLayout.instance(arena);
291 
292             HaarVisualizer harViz = new HaarVisualizer(rgbImageLayoutInstance, haarCascade, integralImageInstance, integralSqImageInstance);
293 
294             accelerator.bridge.ndrange(integralColKernel, rgbImageLayout.getWidth(),
295                     DeviceArgs.of()
296                             .s08_1dRO(cascade.segment())
297                             .s08_1dRO(rgbImageLayoutInstance.memorySegment)
298                             .f32_1dWO(integralImageMemorySegment)
299                             .f32_1dWO(integralSqImageMemorySegment)
300             );
301 
302             accelerator.bridge.ndrange(integralRowKernel, rgbImageLayout.getHeight(),
303                     DeviceArgs.of()
304                             .s08_1dRO(cascade.segment())
305                             .f32_1dRW(integralImageMemorySegment)
306                             .f32_1dRW(integralSqImageMemorySegment)
307             );
308 
309             // This allows us to visualize the integral or integralSq image.
310             // We map the integral + integralSq floats to a grey image
311             accelerator.bridge.ndrange(floatToShortKernel,
312                     rgbImageLayout.getElementCount(),
313                     DeviceArgs.of()
314                             .s08_1dRO(cascade.segment())
315                             .f32_1dRO(integralImageMemorySegment)
316                             .u16_1dWO(integralImageInstance.memorySegment)
317                             .f32_1dRO(integralSqImageMemorySegment)
318                             .u16_1dWO(integralSqImageInstance.memorySegment)
319             );
320             harViz.showIntegrals();
321 
322 
323             String mode = System.getProperty("mode", "bridge");
324             System.out.println("Mode =" + mode);
325 
326             long start = System.currentTimeMillis();
327 */
328 
329         if (true) {
330             long start = System.currentTimeMillis();
331             WorkStealer.usingAllProcessors()
332                     .forEachInRange(accelerator.range(NDRange.of1D(scaleTable.multiScaleAccumulativeRange())), kc -> {
333                         ReferenceJavaViolaJones.findFeatures(
334                                 kc.gix,
335                                 xmlCascade,//cascade,//haarCascade, //or cascade
336                                 integralImageF32,
337                                 integralSqImageF32,
338                                 scaleTable,
339                                 resultTable);
340                     });
341             long ms = (System.currentTimeMillis() - start);
342             System.out.println("done " + ms + "ms");
343             harViz.showResults(resultTable, null, null, ms);
344         }
345         //   } else if (mode.equals("javaSegments")) {
346 
347       /*  WorkStealer.of(1)
348                 .forEachInRange(multiScaleTable.multiScaleAccumulativeRange, gid -> {
349                     ReferenceJavaViolaJones.findFeatures(
350                             gid,
351                             cascade,
352                             harViz,
353                             null,
354                             integralImageInstance.memorySegment,
355                             integralSqImageInstance.memorySegment,
356                             scaleTable,
357                             resultTable,
358                             stageTable,
359                             treeTable,
360                             featureTable);
361                 }); */
362 
363 
364                 /*
365             } else {
366                 accelerator.bridge.ndrange(singlePassCascadeKernel, range,
367                         DeviceArgs.of()
368                                 .s08_1dRW(cascade.segment()) // RW only  for atomicResult counter
369                                 .f32_1dRO(integralImageMemorySegment)
370                                 .f32_1dRO(integralSqImageMemorySegment)
371                                 .s08_1dRO(scaleTable.segment())
372                                 .s08_1dRW(resultTable.segment())
373                                 .s08_1dRO(stageTable.segment())
374                                 .s08_1dRO(treeTable.segment())
375                                 .s08_1dRO(featureTable.segment())
376 
377                 );
378             }
379             System.out.println("ms = " + (System.currentTimeMillis() - start));
380             //  openCLBridge.dump(cascadeMemorySegment, cascadeLayout.layout);
381             //  harViz.showResults(cascadeInstance.getAtomicResultTableCount(),cascadeInstance.getMaxResults(), resultTable, resultTableLayout);
382             harViz.showResults(cascade.getAtomicResultTableCount(), cascade.getMaxResults(), resultTable);
383 
384             accelerator.bridge.releaseKernel(integralColKernel);
385             accelerator.bridge.releaseKernel(integralRowKernel);
386             accelerator.bridge.releaseKernel(floatToShortKernel);
387             accelerator.bridge.releaseKernel(singlePassCascadeKernel);
388 
389             accelerator.bridge.releaseProgram(progHandle);
390         }
391         accelerator.bridge.release();
392 */
393 
394     }
395 }