1 /*
2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25 package violajones.attic;
26
27
28 import hat.Accelerator;
29 import hat.NDRange;
30 import hat.backend.java.WorkStealer;
31 import hat.buffer.F32Array2D;
32 import org.xml.sax.SAXException;
33 import violajones.Viewer;
34 import violajones.XMLHaarCascadeModel;
35 import hat.buffer.S08x3RGBImage;
36 import violajones.ifaces.Cascade;
37 import violajones.ifaces.ResultTable;
38 import violajones.ifaces.ScaleTable;
39
40 import javax.imageio.ImageIO;
41 import javax.xml.parsers.ParserConfigurationException;
42 import java.awt.image.BufferedImage;
43 import java.io.IOException;
44 import java.lang.invoke.MethodHandles;
45 import java.util.Objects;
46
47 public class ViolaJones {
48
49 public static void main(String[] _args) throws IOException, ParserConfigurationException, SAXException {
50 Accelerator accelerator = new Accelerator(MethodHandles.lookup());
51
52
53 BufferedImage nasa = ImageIO.read(Objects.requireNonNull(ViolaJones.class.getResourceAsStream("/images/Nasa1996.jpg")));
54 XMLHaarCascadeModel xmlCascade = XMLHaarCascadeModel.load(ViolaJonesRaw.class.getResourceAsStream("/cascades/haarcascade_frontalface_default.xml"));
55 // Cascade cascade = Cascade.create(accelerator, xmlHaarCascade);
56 var cascade = Cascade.createFrom(accelerator,xmlCascade);
57 var width = nasa.getWidth();
58 var height = nasa.getHeight();
59 S08x3RGBImage rgbImage = S08x3RGBImage.create(accelerator,width,height);
60
61
62 // harViz.showIntegrals();
63
64 var scaleTable = ScaleTable.createFrom(accelerator,new ScaleTable.Constraints(cascade,width,height));
65
66
67 var greyImageF32 = F32Array2D.create(accelerator, width, height);
68 var integralImageF32 = F32Array2D.create(accelerator, width, height);
69 var integralSqImageF32 = F32Array2D.create(accelerator, width, height);
70 var resultTable = ResultTable.create(accelerator, 1000);
71 CoreJavaViolaJones.rgbToGreyScale(rgbImage, greyImageF32);
72 CoreJavaViolaJones.createIntegralImage(greyImageF32, integralImageF32, integralSqImageF32);
73
74 Viewer harViz = new Viewer(accelerator, nasa, rgbImage, cascade, integralImageF32, integralSqImageF32);
75
76 harViz.showIntegrals();
77
78
79 // long floatToShortKernel = accelerator.bridge.getKernel(progHandle, "floatToShortKernel");
80 // long integralColKernel = accelerator.bridge.getKernel(progHandle, "integralColKernel");
81 // long integralRowKernel = accelerator.bridge.getKernel(progHandle, "integralRowKernel");
82 // long singlePassCascadeKernel = accelerator.bridge.getKernel(progHandle, "singlePassCascadeKernel");
83
84
85 // openCLBridge.dump(cascadeMemorySegment, ((OpenCLStructLayout.Tools)cascadeInterface).getLayout());
86 // openCLBridge.dump(scaleTable,scaleTableLayout.layout);
87
88
89 // openCLBridge.dump(treeTable, treeTableLayout.layout);
90 // openCLBridge.dump(stageTable, stageTableLayout.layout);
91 // FloatBuffer integralImage = FloatBuffer.create(accelerator, rgbImageLayout.getElementCount());
92 // FloatBuffer integralSqImage = FloatBuffer.create(accelerator, rgbImageLayout.getElementCount());
93
94 // MemorySegment integralImageMemorySegment = arena.allocateArray(JAVA_FLOAT, rgbImageLayout.getElementCount());
95 // MemorySegment integralSqImageMemorySegment = arena.allocateArray(JAVA_FLOAT, rgbImageLayout.getElementCount());
96
97 // openCLBridge.dump(cascadeMemorySegment, cascadeLayout.layout);
98
99
100 int groupSize = 256;
101 int rangeModGroupSize = scaleTable.rangeModGroupSize(groupSize);
102 //(scaleTable.multiScaleAccumulativeRange() / groupSize) + ((scaleTable.multiScaleAccumulativeRange() % groupSize) == 0 ? 0 : 1)) * groupSize;
103
104 /*
105 OpenCLCodeBuilder c99 = (OpenCLCodeBuilder) accelerator.getCodeBuilder();
106 c99
107 .typedef(FeatureTable.Feature.RectTable.Rect.class)
108 .typedef(FeatureTable.Feature.LinkOrValue.class)
109 .typedef(FeatureTable.Feature.class)
110 .typedef(ScaleTable.Scale.class)
111 .typedef(StageTable.Stage.class)
112 .typedef(TreeTable.Tree.class)
113 .typedef(ResultTable.Result.class)
114 .typedef(Cascade.class)
115 .append("""
116
117 #define SCOPE_START ndrange_t ndrange;ndrange.id.x=get_global_id(0);ndrange.id.maxX=get_global_size(0);
118 #ifdef NDRANGE_CUDA
119 #define atomicInc(p) atomicAdd(p, 1)
120 #else
121 #define atomicInc(p) atom_add(p, 1)
122 #endif
123
124 int b2i(i4 v){
125 return v < 0 ? 256 + v : v;
126 }
127 int rgbToGrey(i4 r, i4 g, i4 b){
128 return (29 * b2i(r) + 60 * b2i(g) + 11 * b2i(b)) / 100;
129 }
130 void integralColById(i4 id, __global cascade_t *cascadeContext, __global b1 *rgb, __global f4 *integral, __global f4 *integralSq){
131 integralSq[id] = integral[id] = 0.0f;
132 for (s32_t y = 1; y < cascadeContext->imageHeight; y++) {
133 s32_t monoOffset = (y * cascadeContext->imageWidth) + id;
134 f32_t lastSq = integralSq[monoOffset - cascadeContext->imageWidth];
135 f32_t last = integral[monoOffset - cascadeContext->imageWidth];
136 char r = rgb[monoOffset * 3 + 0];
137 char g = rgb[monoOffset * 3 + 1];
138 char b = rgb[monoOffset * 3 + 2];
139 f32_t greyValue = rgbToGrey(r, g, b);
140 f32_t greyValueSq = greyValue * greyValue;
141 integralSq[monoOffset] = greyValueSq + lastSq;
142 integral[monoOffset] = greyValue + last;
143 }
144 }
145 __kernel void integralColKernel(__global cascade_t *cascadeContext, __global b1 *rgb, __global f4 *integral, __global f4 *integralSq){
146 SCOPE_START
147 integralColById(ndrange.id.x, cascadeContext, rgb, integral, integralSq);
148 }
149 void integralRowById(i4 id, __global cascade_t *cascadeContext, __global f4 *integral, __global f4 *integralSq){
150 for (s32_t x = 1; x < cascadeContext->imageWidth; x++) {
151 s32_t monoOffset = (id * cascadeContext->imageWidth) + x;
152 integral[monoOffset] = integral[monoOffset] + integral[monoOffset - 1];
153 }
154 for (s32_t x = 1; x < cascadeContext->imageWidth; x++) {
155 s32_t monoOffset = (id * cascadeContext->imageWidth) + x;
156 integralSq[monoOffset] = integralSq[monoOffset] + integralSq[monoOffset - 1];
157 }
158 }
159 __kernel void integralRowKernel(__global cascade_t *cascadeContext, __global f4 *integral, __global f4 *integralSq){
160 SCOPE_START
161 integralRowById(ndrange.id.x, cascadeContext, integral, integralSq);
162 }
163 __kernel void floatToShortKernel(__global cascade_t *cascadeContext, __global f4 *fromIntegral, __global s2 *toIntegral, __global f4 *fromIntegralSq, __global s2 *toIntegralSq){
164 SCOPE_START
165 toIntegral[ndrange.id.x] = (s16_t)(fromIntegral[ndrange.id.x]*(65536/fromIntegral[ndrange.id.maxX-1]));
166 toIntegralSq[ndrange.id.x] = (s16_t)(fromIntegralSq[ndrange.id.x]*(65536/fromIntegralSq[ndrange.id.maxX-1]));
167 }
168
169
170 /
171 A +-------+ B
172 | | D-B-C+A
173 C +-------+ D
174 /
175 float gradient(__global f4 *image, i4 imageWidth, i4 x, i4 y, i4 width, i4 height){
176 f32_t A = image[(y * imageWidth) + x];
177 f32_t D = image[((y + height) * imageWidth) + x + width];
178 f32_t C = image[((y + height) * imageWidth) + x];
179 f32_t B = image[(y * imageWidth) + x + width];
180 return D-B-C+A;
181 }
182 boolean isAFaceStage(__global cascade_t *cascadeContext, __global scale_t *scale, i4 x, i4 y, f4 vnorm, __global f4 *integral, __global stage_t *stagePtr, __global tree_t *treeTable, __global feature_t *featureTable){
183 f32_t sumOfThisStage = 0;
184 for (s32_t treeId = stagePtr->firstTreeId; treeId < (stagePtr->firstTreeId+stagePtr->treeCount); treeId++) {
185 // featureId from 0 to how many roots there are.... we use -1 for none! hence s32_t
186 const __global tree_t *treePtr = &treeTable[treeId];
187 s32_t featureId = treePtr->firstFeatureId;
188 while (featureId >= 0) {
189 const __global feature_t *featurePtr = &featureTable[featureId];
190 f32_t featureGradientSum = .0f;
191 for (s32_t i = 0; i < 3; i++) {
192 const __global rect_t *rect = &featurePtr->rects[i];
193 featureGradientSum += featurePtr->rects[i].weight *
194 gradient(integral, cascadeContext->imageWidth,
195 x + (int) (rect->x * scale->scaleValue),
196 y + (int) (rect->y * scale->scaleValue),
197 (int) (rect->width * scale->scaleValue),
198 (int) (rect->height * scale->scaleValue)
199 ) ;
200 }
201 if ((featureGradientSum * scale->invArea) < (featurePtr->threshold * vnorm)) {//left
202 if (featurePtr->left.hasValue) {
203 sumOfThisStage += featurePtr->left.anon.value;
204 featureId = -1;
205 } else {
206 featureId = treePtr->firstFeatureId+featurePtr->left.anon.featureId;
207 }
208 }else{ // right
209 if (featurePtr->right.hasValue) {
210 sumOfThisStage += featurePtr->right.anon.value;
211 featureId = -1;
212 } else {
213 featureId = treePtr->firstFeatureId+featurePtr->right.anon.featureId;
214 }
215 }
216 }
217 }
218 return sumOfThisStage > stagePtr->threshold;
219 }
220 __kernel void singlePassCascadeKernel(__global cascade_t *cascadeContext, __global f4 *integral, __global f4 *integralSq, __global scale_t *scaleTable, __global result_t *resultTable, __global stage_t *stageTable, __global tree_t *treeTable, __global feature_t *featureTable){
221 SCOPE_START
222
223 size_t gid = ndrange.id.x;
224 if (gid < cascadeContext->multiScaleAccumulativeRange){
225 s32_t i;
226 // This is where we select the scale to use.
227 for (i=0; gid >=scaleTable[i].accumGridSizeMax; i++)
228 ;
229
230 __global scale_t *scale = &scaleTable[i];
231
232 s16_t x = (s16_t)(((gid-scale->accumGridSizeMin) % scale->gridWidth) * scale->scaledXInc);
233 s16_t y = (s16_t)(((gid-scale->accumGridSizeMin) / scale->gridWidth) * scale->scaledYInc);
234
235 f32_t integralGradient = gradient(integral, cascadeContext->imageWidth, x, y, scale->scaledFeatureWidth, scale->scaledFeatureHeight) * scale->invArea;
236 f32_t integralSqGradient = gradient(integralSq, cascadeContext->imageWidth, x, y, scale->scaledFeatureWidth, scale->scaledFeatureHeight) * scale->invArea;
237
238 f32_t vnorm = integralSqGradient - integralGradient * integralGradient;
239 vnorm = (vnorm > 1) ? sqrt(vnorm) : 1;
240
241 bool stillLooksLikeAFace = true;
242
243 for (s32_t stageId = 0; stillLooksLikeAFace && (stageId < cascadeContext->stageCount); stageId++) {
244 __global stage_t *stagePtr = &stageTable[stageId];
245 stillLooksLikeAFace =isAFaceStage(cascadeContext, scale, x, y, vnorm, integral, stagePtr, treeTable, featureTable);
246 }
247 if (stillLooksLikeAFace) {
248 s32_t index = atomicInc(&cascadeContext->atomicResultTableCount);
249 if (index<cascadeContext->maxResults){
250 resultTable[index].x = x;
251 resultTable[index].y = y;
252 resultTable[index].width = scale->scaledFeatureWidth;
253 resultTable[index].height = scale->scaledFeatureHeight;
254 }
255 }
256 }
257 }
258 """
259 );
260
261
262
263 long progHandle = accelerator.bridge.compileProgram(c99.toString());
264 if (accelerator.bridge.programOK(progHandle)) {
265 long floatToShortKernel = accelerator.bridge.getKernel(progHandle, "floatToShortKernel");
266 long integralColKernel = accelerator.bridge.getKernel(progHandle, "integralColKernel");
267 long integralRowKernel = accelerator.bridge.getKernel(progHandle, "integralRowKernel");
268 long singlePassCascadeKernel = accelerator.bridge.getKernel(progHandle, "singlePassCascadeKernel");
269
270
271 // openCLBridge.dump(cascadeMemorySegment, ((OpenCLStructLayout.Tools)cascadeInterface).getLayout());
272 // openCLBridge.dump(scaleTable,scaleTableLayout.layout);
273
274
275 // openCLBridge.dump(treeTable, treeTableLayout.layout);
276 // openCLBridge.dump(stageTable, stageTableLayout.layout);
277 MemorySegment integralImageMemorySegment = arena.allocateArray(JAVA_FLOAT, rgbImageLayout.getElementCount());
278 MemorySegment integralSqImageMemorySegment = arena.allocateArray(JAVA_FLOAT, rgbImageLayout.getElementCount());
279
280 // openCLBridge.dump(cascadeMemorySegment, cascadeLayout.layout);
281
282 int groupSize = 256;
283 int range = ((multiScaleTable.multiScaleAccumulativeRange / groupSize) + ((multiScaleTable.multiScaleAccumulativeRange % groupSize) == 0 ? 0 : 1)) * groupSize;
284
285 ImageLayout integralImage = new ImageLayout(new BufferedImage(rgbImageLayout.getWidth(), rgbImageLayout.getHeight(), BufferedImage.TYPE_USHORT_GRAY));
286 ImageLayout integralSqImage = new ImageLayout(new BufferedImage(rgbImageLayout.getWidth(), rgbImageLayout.getHeight(), BufferedImage.TYPE_USHORT_GRAY));
287 ImageLayout.Instance integralImageInstance = integralImage.instance(arena);
288 ImageLayout.Instance integralSqImageInstance = integralSqImage.instance(arena);
289
290 ImageLayout.Instance rgbImageLayoutInstance = rgbImageLayout.instance(arena);
291
292 HaarVisualizer harViz = new HaarVisualizer(rgbImageLayoutInstance, haarCascade, integralImageInstance, integralSqImageInstance);
293
294 accelerator.bridge.ndrange(integralColKernel, rgbImageLayout.getWidth(),
295 DeviceArgs.of()
296 .s08_1dRO(cascade.segment())
297 .s08_1dRO(rgbImageLayoutInstance.memorySegment)
298 .f32_1dWO(integralImageMemorySegment)
299 .f32_1dWO(integralSqImageMemorySegment)
300 );
301
302 accelerator.bridge.ndrange(integralRowKernel, rgbImageLayout.getHeight(),
303 DeviceArgs.of()
304 .s08_1dRO(cascade.segment())
305 .f32_1dRW(integralImageMemorySegment)
306 .f32_1dRW(integralSqImageMemorySegment)
307 );
308
309 // This allows us to visualize the integral or integralSq image.
310 // We map the integral + integralSq floats to a grey image
311 accelerator.bridge.ndrange(floatToShortKernel,
312 rgbImageLayout.getElementCount(),
313 DeviceArgs.of()
314 .s08_1dRO(cascade.segment())
315 .f32_1dRO(integralImageMemorySegment)
316 .u16_1dWO(integralImageInstance.memorySegment)
317 .f32_1dRO(integralSqImageMemorySegment)
318 .u16_1dWO(integralSqImageInstance.memorySegment)
319 );
320 harViz.showIntegrals();
321
322
323 String mode = System.getProperty("mode", "bridge");
324 System.out.println("Mode =" + mode);
325
326 long start = System.currentTimeMillis();
327 */
328
329 if (true) {
330 long start = System.currentTimeMillis();
331 WorkStealer.usingAllProcessors()
332 .forEachInRange(accelerator.range(NDRange.of1D(scaleTable.multiScaleAccumulativeRange())), kc -> {
333 ReferenceJavaViolaJones.findFeatures(
334 kc.gix,
335 xmlCascade,//cascade,//haarCascade, //or cascade
336 integralImageF32,
337 integralSqImageF32,
338 scaleTable,
339 resultTable);
340 });
341 long ms = (System.currentTimeMillis() - start);
342 System.out.println("done " + ms + "ms");
343 harViz.showResults(resultTable, null, null, ms);
344 }
345 // } else if (mode.equals("javaSegments")) {
346
347 /* WorkStealer.of(1)
348 .forEachInRange(multiScaleTable.multiScaleAccumulativeRange, gid -> {
349 ReferenceJavaViolaJones.findFeatures(
350 gid,
351 cascade,
352 harViz,
353 null,
354 integralImageInstance.memorySegment,
355 integralSqImageInstance.memorySegment,
356 scaleTable,
357 resultTable,
358 stageTable,
359 treeTable,
360 featureTable);
361 }); */
362
363
364 /*
365 } else {
366 accelerator.bridge.ndrange(singlePassCascadeKernel, range,
367 DeviceArgs.of()
368 .s08_1dRW(cascade.segment()) // RW only for atomicResult counter
369 .f32_1dRO(integralImageMemorySegment)
370 .f32_1dRO(integralSqImageMemorySegment)
371 .s08_1dRO(scaleTable.segment())
372 .s08_1dRW(resultTable.segment())
373 .s08_1dRO(stageTable.segment())
374 .s08_1dRO(treeTable.segment())
375 .s08_1dRO(featureTable.segment())
376
377 );
378 }
379 System.out.println("ms = " + (System.currentTimeMillis() - start));
380 // openCLBridge.dump(cascadeMemorySegment, cascadeLayout.layout);
381 // harViz.showResults(cascadeInstance.getAtomicResultTableCount(),cascadeInstance.getMaxResults(), resultTable, resultTableLayout);
382 harViz.showResults(cascade.getAtomicResultTableCount(), cascade.getMaxResults(), resultTable);
383
384 accelerator.bridge.releaseKernel(integralColKernel);
385 accelerator.bridge.releaseKernel(integralRowKernel);
386 accelerator.bridge.releaseKernel(floatToShortKernel);
387 accelerator.bridge.releaseKernel(singlePassCascadeKernel);
388
389 accelerator.bridge.releaseProgram(progHandle);
390 }
391 accelerator.bridge.release();
392 */
393
394 }
395 }