root / rgbdslam / external / siftgpu / src / SiftGPU / ProgramCU.cu @ 9240aaa3
History | View | Annotate | Download (56.6 KB)
1 | 9240aaa3 | Alex | ////////////////////////////////////////////////////////////////////////////
|
---|---|---|---|
2 | // File: ProgramCU.cu
|
||
3 | // Author: Changchang Wu
|
||
4 | // Description : implementation of ProgramCU and all CUDA kernels
|
||
5 | //
|
||
6 | // Copyright (c) 2007 University of North Carolina at Chapel Hill
|
||
7 | // All Rights Reserved
|
||
8 | //
|
||
9 | // Permission to use, copy, modify and distribute this software and its
|
||
10 | // documentation for educational, research and non-profit purposes, without
|
||
11 | // fee, and without a written agreement is hereby granted, provided that the
|
||
12 | // above copyright notice and the following paragraph appear in all copies.
|
||
13 | //
|
||
14 | // The University of North Carolina at Chapel Hill make no representations
|
||
15 | // about the suitability of this software for any purpose. It is provided
|
||
16 | // 'as is' without express or implied warranty.
|
||
17 | //
|
||
18 | // Please send BUG REPORTS to ccwu@cs.unc.edu
|
||
19 | //
|
||
20 | ////////////////////////////////////////////////////////////////////////////
|
||
21 | |||
22 | #if defined(CUDA_SIFTGPU_ENABLED)
|
||
23 | |||
24 | #include "GL/glew.h" |
||
25 | #include "stdio.h" |
||
26 | |||
27 | #include "CuTexImage.h" |
||
28 | #include "ProgramCU.h" |
||
29 | #include "GlobalUtil.h" |
||
30 | |||
31 | //----------------------------------------------------------------
|
||
32 | //Begin SiftGPU setting section.
|
||
33 | //////////////////////////////////////////////////////////
|
||
34 | #define IMUL(X,Y) __mul24(X,Y)
|
||
35 | //#define FDIV(X,Y) ((X)/(Y))
|
||
36 | #define FDIV(X,Y) __fdividef(X,Y)
|
||
37 | |||
38 | /////////////////////////////////////////////////////////
|
||
39 | //filter kernel width range (don't change this)
|
||
40 | #define KERNEL_MAX_WIDTH 33 |
||
41 | #define KERNEL_MIN_WIDTH 5 |
||
42 | |||
43 | //////////////////////////////////////////////////////////
|
||
44 | //horizontal filter block size (32, 64, 128, 256, 512)
|
||
45 | #define FILTERH_TILE_WIDTH 128 |
||
46 | //thread block for vertical filter. FILTERV_BLOCK_WIDTH can be (4, 8 or 16)
|
||
47 | #define FILTERV_BLOCK_WIDTH 16 |
||
48 | #define FILTERV_BLOCK_HEIGHT 32 |
||
49 | //The corresponding image patch for a thread block
|
||
50 | #define FILTERV_PIXEL_PER_THREAD 4 |
||
51 | #define FILTERV_TILE_WIDTH FILTERV_BLOCK_WIDTH
|
||
52 | #define FILTERV_TILE_HEIGHT (FILTERV_PIXEL_PER_THREAD * FILTERV_BLOCK_HEIGHT)
|
||
53 | |||
54 | |||
55 | //////////////////////////////////////////////////////////
|
||
56 | //thread block size for computing Difference of Gaussian
|
||
57 | #define DOG_BLOCK_LOG_DIMX 7 |
||
58 | #define DOG_BLOCK_LOG_DIMY 0 |
||
59 | #define DOG_BLOCK_DIMX (1 << DOG_BLOCK_LOG_DIMX) |
||
60 | #define DOG_BLOCK_DIMY (1 << DOG_BLOCK_LOG_DIMY) |
||
61 | |||
62 | //////////////////////////////////////////////////////////
|
||
63 | //thread block size for keypoint detection
|
||
64 | #define KEY_BLOCK_LOG_DIMX 3 |
||
65 | #define KEY_BLOCK_LOG_DIMY 3 |
||
66 | #define KEY_BLOCK_DIMX (1<<KEY_BLOCK_LOG_DIMX) |
||
67 | #define KEY_BLOCK_DIMY (1<<KEY_BLOCK_LOG_DIMY) |
||
68 | //#define KEY_OFFSET_ONE
|
||
69 | //make KEY_BLOCK_LOG_DIMX 4 will make the write coalesced..
|
||
70 | //but it seems uncoalesced writes don't affect the speed
|
||
71 | |||
72 | //////////////////////////////////////////////////////////
|
||
73 | //thread block size for initializing list generation (64, 128, 256, 512 ...)
|
||
74 | #define HIST_INIT_WIDTH 128 |
||
75 | //thread block size for generating feature list (32, 64, 128, 256, 512, ...)
|
||
76 | #define LISTGEN_BLOCK_DIM 128 |
||
77 | |||
78 | |||
79 | /////////////////////////////////////////////////////////
|
||
80 | //how many keypoint orientations to compute in a block
|
||
81 | #define ORIENTATION_COMPUTE_PER_BLOCK 64 |
||
82 | //how many keypoint descriptor to compute in a block (2, 4, 8, 16, 32)
|
||
83 | #define DESCRIPTOR_COMPUTE_PER_BLOCK 4 |
||
84 | #define DESCRIPTOR_COMPUTE_BLOCK_SIZE (16 * DESCRIPTOR_COMPUTE_PER_BLOCK) |
||
85 | //how many keypoint descriptor to normalized in a block (32, ...)
|
||
86 | #define DESCRIPTOR_NORMALIZ_PER_BLOCK 32 |
||
87 | |||
88 | |||
89 | |||
90 | ///////////////////////////////////////////
|
||
91 | //Thread block size for visualization
|
||
92 | //(This doesn't affect the speed of computation)
|
||
93 | #define BLOCK_LOG_DIM 4 |
||
94 | #define BLOCK_DIM (1 << BLOCK_LOG_DIM) |
||
95 | |||
96 | //End SiftGPU setting section.
|
||
97 | //----------------------------------------------------------------
|
||
98 | |||
99 | |||
100 | __device__ __constant__ float d_kernel[KERNEL_MAX_WIDTH];
|
||
101 | texture<float, 1, cudaReadModeElementType> texData; |
||
102 | texture<unsigned char, 1, cudaReadModeNormalizedFloat> texDataB; |
||
103 | texture<float2, 2, cudaReadModeElementType> texDataF2;
|
||
104 | texture<float4, 1, cudaReadModeElementType> texDataF4;
|
||
105 | texture<int4, 1, cudaReadModeElementType> texDataI4;
|
||
106 | texture<int4, 1, cudaReadModeElementType> texDataList;
|
||
107 | |||
108 | //template<int i> __device__ float Conv(float *data) { return Conv<i-1>(data) + data[i]*d_kernel[i];}
|
||
109 | //template<> __device__ float Conv<0>(float *data) { return data[0] * d_kernel[0]; }
|
||
110 | |||
111 | |||
112 | //////////////////////////////////////////////////////////////
|
||
113 | template<int FW> __global__ void FilterH( float* d_result, int width) |
||
114 | { |
||
115 | |||
116 | const int HALF_WIDTH = FW >> 1; |
||
117 | const int CACHE_WIDTH = FILTERH_TILE_WIDTH + FW -1; |
||
118 | const int CACHE_COUNT = 2 + (CACHE_WIDTH - 2)/ FILTERH_TILE_WIDTH; |
||
119 | __shared__ float data[CACHE_WIDTH];
|
||
120 | const int bcol = IMUL(blockIdx.x, FILTERH_TILE_WIDTH); |
||
121 | const int col = bcol + threadIdx.x; |
||
122 | const int index_min = IMUL(blockIdx.y, width); |
||
123 | const int index_max = index_min + width - 1; |
||
124 | int src_index = index_min + bcol - HALF_WIDTH + threadIdx.x;
|
||
125 | int cache_index = threadIdx.x;
|
||
126 | float value = 0; |
||
127 | #pragma unroll
|
||
128 | for(int j = 0; j < CACHE_COUNT; ++j) |
||
129 | { |
||
130 | if(cache_index < CACHE_WIDTH)
|
||
131 | { |
||
132 | int fetch_index = src_index < index_min? index_min : (src_index > index_max ? index_max : src_index);
|
||
133 | data[cache_index] = tex1Dfetch(texData,fetch_index); |
||
134 | src_index += FILTERH_TILE_WIDTH; |
||
135 | cache_index += FILTERH_TILE_WIDTH; |
||
136 | } |
||
137 | } |
||
138 | __syncthreads(); |
||
139 | if(col >= width) return; |
||
140 | #pragma unroll
|
||
141 | for(int i = 0; i < FW; ++i) |
||
142 | { |
||
143 | value += (data[threadIdx.x + i]* d_kernel[i]); |
||
144 | } |
||
145 | // value = Conv<FW-1>(data + threadIdx.x);
|
||
146 | d_result[index_min + col] = value; |
||
147 | } |
||
148 | |||
149 | |||
150 | |||
151 | ////////////////////////////////////////////////////////////////////
|
||
152 | template<int FW> __global__ void FilterV(float* d_result, int width, int height) |
||
153 | { |
||
154 | const int HALF_WIDTH = FW >> 1; |
||
155 | const int CACHE_WIDTH = FW + FILTERV_TILE_HEIGHT - 1; |
||
156 | const int TEMP = CACHE_WIDTH & 0xf; |
||
157 | //add some extra space to avoid bank conflict
|
||
158 | #if FILTERV_TILE_WIDTH == 16 |
||
159 | //make the stride 16 * n +/- 1
|
||
160 | const int EXTRA = (TEMP == 1 || TEMP == 0) ? 1 - TEMP : 15 - TEMP; |
||
161 | #elif FILTERV_TILE_WIDTH == 8 |
||
162 | //make the stride 16 * n +/- 2
|
||
163 | const int EXTRA = (TEMP == 2 || TEMP == 1 || TEMP == 0) ? 2 - TEMP : (TEMP == 15? 3 : 14 - TEMP); |
||
164 | #elif FILTERV_TILE_WIDTH == 4 |
||
165 | //make the stride 16 * n +/- 4
|
||
166 | const int EXTRA = (TEMP >=0 && TEMP <=4) ? 4 - TEMP : (TEMP > 12? 20 - TEMP : 12 - TEMP); |
||
167 | #else
|
||
168 | #error
|
||
169 | #endif
|
||
170 | const int CACHE_TRUE_WIDTH = CACHE_WIDTH + EXTRA; |
||
171 | const int CACHE_COUNT = (CACHE_WIDTH + FILTERV_BLOCK_HEIGHT - 1) / FILTERV_BLOCK_HEIGHT; |
||
172 | const int WRITE_COUNT = (FILTERV_TILE_HEIGHT + FILTERV_BLOCK_HEIGHT -1) / FILTERV_BLOCK_HEIGHT; |
||
173 | __shared__ float data[CACHE_TRUE_WIDTH * FILTERV_TILE_WIDTH];
|
||
174 | const int row_block_first = IMUL(blockIdx.y, FILTERV_TILE_HEIGHT); |
||
175 | const int col = IMUL(blockIdx.x, FILTERV_TILE_WIDTH) + threadIdx.x; |
||
176 | const int row_first = row_block_first - HALF_WIDTH; |
||
177 | const int data_index_max = IMUL(height - 1, width) + col; |
||
178 | const int cache_col_start = threadIdx.y; |
||
179 | const int cache_row_start = IMUL(threadIdx.x, CACHE_TRUE_WIDTH); |
||
180 | int cache_index = cache_col_start + cache_row_start;
|
||
181 | int data_index = IMUL(row_first + cache_col_start, width) + col;
|
||
182 | |||
183 | if(col < width)
|
||
184 | { |
||
185 | #pragma unroll
|
||
186 | for(int i = 0; i < CACHE_COUNT; ++i) |
||
187 | { |
||
188 | if(cache_col_start < CACHE_WIDTH - i * FILTERV_BLOCK_HEIGHT)
|
||
189 | { |
||
190 | int fetch_index = data_index < col ? col : (data_index > data_index_max? data_index_max : data_index);
|
||
191 | data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch(texData,fetch_index); |
||
192 | data_index += IMUL(FILTERV_BLOCK_HEIGHT, width); |
||
193 | } |
||
194 | } |
||
195 | } |
||
196 | __syncthreads(); |
||
197 | |||
198 | if(col >= width) return; |
||
199 | |||
200 | int row = row_block_first + threadIdx.y;
|
||
201 | int index_start = cache_row_start + threadIdx.y;
|
||
202 | #pragma unroll
|
||
203 | for(int i = 0; i < WRITE_COUNT; ++i, |
||
204 | row += FILTERV_BLOCK_HEIGHT, index_start += FILTERV_BLOCK_HEIGHT) |
||
205 | { |
||
206 | if(row < height)
|
||
207 | { |
||
208 | int index_dest = IMUL(row, width) + col;
|
||
209 | float value = 0; |
||
210 | #pragma unroll
|
||
211 | for(int i = 0; i < FW; ++i) |
||
212 | { |
||
213 | value += (data[index_start + i] * d_kernel[i]); |
||
214 | } |
||
215 | d_result[index_dest] = value; |
||
216 | } |
||
217 | } |
||
218 | } |
||
219 | |||
220 | |||
221 | template<int LOG_SCALE> __global__ void UpsampleKernel(float* d_result, int width) |
||
222 | { |
||
223 | const int SCALE = (1 << LOG_SCALE), SCALE_MASK = (SCALE - 1); |
||
224 | const float INV_SCALE = 1.0f / (float(SCALE)); |
||
225 | int col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
|
||
226 | if(col >= width) return; |
||
227 | |||
228 | int row = blockIdx.y >> LOG_SCALE;
|
||
229 | int index = row * width + col;
|
||
230 | int dst_row = blockIdx.y;
|
||
231 | int dst_idx= (width * dst_row + col) * SCALE;
|
||
232 | int helper = blockIdx.y & SCALE_MASK;
|
||
233 | if (helper)
|
||
234 | { |
||
235 | float v11 = tex1Dfetch(texData, index);
|
||
236 | float v12 = tex1Dfetch(texData, index + 1); |
||
237 | index += width; |
||
238 | float v21 = tex1Dfetch(texData, index);
|
||
239 | float v22 = tex1Dfetch(texData, index + 1); |
||
240 | float w1 = INV_SCALE * helper, w2 = 1.0 - w1; |
||
241 | float v1 = (v21 * w1 + w2 * v11);
|
||
242 | float v2 = (v22 * w1 + w2 * v12);
|
||
243 | d_result[dst_idx] = v1; |
||
244 | #pragma unroll
|
||
245 | for(int i = 1; i < SCALE; ++i) |
||
246 | { |
||
247 | const float r2 = i * INV_SCALE; |
||
248 | const float r1 = 1.0f - r2; |
||
249 | d_result[dst_idx +i] = v1 * r1 + v2 * r2; |
||
250 | } |
||
251 | }else
|
||
252 | { |
||
253 | float v1 = tex1Dfetch(texData, index);
|
||
254 | float v2 = tex1Dfetch(texData, index + 1); |
||
255 | d_result[dst_idx] = v1; |
||
256 | #pragma unroll
|
||
257 | for(int i = 1; i < SCALE; ++i) |
||
258 | { |
||
259 | const float r2 = i * INV_SCALE; |
||
260 | const float r1 = 1.0f - r2; |
||
261 | d_result[dst_idx +i] = v1 * r1 + v2 * r2; |
||
262 | } |
||
263 | } |
||
264 | |||
265 | } |
||
266 | |||
267 | ////////////////////////////////////////////////////////////////////////////////////////
|
||
268 | void ProgramCU::SampleImageU(CuTexImage *dst, CuTexImage *src, int log_scale) |
||
269 | { |
||
270 | int width = src->GetImgWidth(), height = src->GetImgHeight();
|
||
271 | src->BindTexture(texData); |
||
272 | dim3 grid((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height << log_scale);
|
||
273 | dim3 block(FILTERH_TILE_WIDTH); |
||
274 | switch(log_scale)
|
||
275 | { |
||
276 | case 1 : UpsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, width); break; |
||
277 | case 2 : UpsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, width); break; |
||
278 | case 3 : UpsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, width); break; |
||
279 | default: break; |
||
280 | } |
||
281 | } |
||
282 | |||
283 | template<int LOG_SCALE> __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width) |
||
284 | { |
||
285 | const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x; |
||
286 | if(dst_col >= dst_width) return; |
||
287 | const int src_col = min((dst_col << LOG_SCALE), (src_width - 1)); |
||
288 | const int dst_row = blockIdx.y; |
||
289 | const int src_row = blockIdx.y << LOG_SCALE; |
||
290 | const int src_idx = IMUL(src_row, src_width) + src_col; |
||
291 | const int dst_idx = IMUL(dst_width, dst_row) + dst_col; |
||
292 | d_result[dst_idx] = tex1Dfetch(texData, src_idx); |
||
293 | |||
294 | } |
||
295 | |||
296 | __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width, const int log_scale) |
||
297 | { |
||
298 | const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x; |
||
299 | if(dst_col >= dst_width) return; |
||
300 | const int src_col = min((dst_col << log_scale), (src_width - 1)); |
||
301 | const int dst_row = blockIdx.y; |
||
302 | const int src_row = blockIdx.y << log_scale; |
||
303 | const int src_idx = IMUL(src_row, src_width) + src_col; |
||
304 | const int dst_idx = IMUL(dst_width, dst_row) + dst_col; |
||
305 | d_result[dst_idx] = tex1Dfetch(texData, src_idx); |
||
306 | |||
307 | } |
||
308 | |||
309 | void ProgramCU::SampleImageD(CuTexImage *dst, CuTexImage *src, int log_scale) |
||
310 | { |
||
311 | int src_width = src->GetImgWidth(), dst_width = dst->GetImgWidth() ;
|
||
312 | |||
313 | src->BindTexture(texData); |
||
314 | dim3 grid((dst_width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, dst->GetImgHeight());
|
||
315 | dim3 block(FILTERH_TILE_WIDTH); |
||
316 | switch(log_scale)
|
||
317 | { |
||
318 | case 1 : DownsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break; |
||
319 | case 2 : DownsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break; |
||
320 | case 3 : DownsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break; |
||
321 | default: DownsampleKernel <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width, log_scale); |
||
322 | } |
||
323 | } |
||
324 | |||
325 | __global__ void ChannelReduce_Kernel(float* d_result) |
||
326 | { |
||
327 | int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
|
||
328 | d_result[index] = tex1Dfetch(texData, index*4);
|
||
329 | } |
||
330 | |||
331 | __global__ void ChannelReduce_Convert_Kernel(float* d_result) |
||
332 | { |
||
333 | int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
|
||
334 | float4 rgba = tex1Dfetch(texDataF4, index); |
||
335 | d_result[index] = 0.299f * rgba.x + 0.587f* rgba.y + 0.114f * rgba.z; |
||
336 | } |
||
337 | |||
338 | void ProgramCU::ReduceToSingleChannel(CuTexImage* dst, CuTexImage* src, int convert_rgb) |
||
339 | { |
||
340 | int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
|
||
341 | |||
342 | dim3 grid((width * height + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
|
||
343 | dim3 block(FILTERH_TILE_WIDTH); |
||
344 | if(convert_rgb)
|
||
345 | { |
||
346 | src->BindTexture(texDataF4); |
||
347 | ChannelReduce_Convert_Kernel<<<grid, block>>>((float*)dst->_cuData);
|
||
348 | }else
|
||
349 | { |
||
350 | src->BindTexture(texData); |
||
351 | ChannelReduce_Kernel<<<grid, block>>>((float*)dst->_cuData);
|
||
352 | } |
||
353 | } |
||
354 | |||
355 | __global__ void ConvertByteToFloat_Kernel(float* d_result) |
||
356 | { |
||
357 | int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
|
||
358 | d_result[index] = tex1Dfetch(texDataB, index); |
||
359 | } |
||
360 | |||
361 | void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst)
|
||
362 | { |
||
363 | int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
|
||
364 | dim3 grid((width * height + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
|
||
365 | dim3 block(FILTERH_TILE_WIDTH); |
||
366 | src->BindTexture(texDataB); |
||
367 | ConvertByteToFloat_Kernel<<<grid, block>>>((float*)dst->_cuData);
|
||
368 | } |
||
369 | |||
370 | void ProgramCU::CreateFilterKernel(float sigma, float* kernel, int& width) |
||
371 | { |
||
372 | int i, sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;// |
||
373 | width = 2*sz + 1; |
||
374 | |||
375 | if(width > KERNEL_MAX_WIDTH)
|
||
376 | { |
||
377 | //filter size truncation
|
||
378 | sz = KERNEL_MAX_WIDTH >> 1;
|
||
379 | width =KERNEL_MAX_WIDTH; |
||
380 | }else if(width < KERNEL_MIN_WIDTH) |
||
381 | { |
||
382 | sz = KERNEL_MIN_WIDTH >> 1;
|
||
383 | width =KERNEL_MIN_WIDTH; |
||
384 | } |
||
385 | |||
386 | float rv = 1.0f/(sigma*sigma), v, ksum =0; |
||
387 | |||
388 | // pre-compute filter
|
||
389 | for( i = -sz ; i <= sz ; ++i)
|
||
390 | { |
||
391 | kernel[i+sz] = v = exp(-0.5f * i * i *rv) ; |
||
392 | ksum += v; |
||
393 | } |
||
394 | |||
395 | //normalize the kernel
|
||
396 | rv = 1.0f/ksum; |
||
397 | for(i = 0; i< width ;i++) kernel[i]*=rv; |
||
398 | } |
||
399 | |||
400 | |||
401 | template<int FW> void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf) |
||
402 | { |
||
403 | int width = src->GetImgWidth(), height = src->GetImgHeight();
|
||
404 | |||
405 | //horizontal filtering
|
||
406 | src->BindTexture(texData); |
||
407 | dim3 gridh((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height);
|
||
408 | dim3 blockh(FILTERH_TILE_WIDTH); |
||
409 | FilterH<FW><<<gridh, blockh>>>((float*)buf->_cuData, width);
|
||
410 | CheckErrorCUDA("FilterH");
|
||
411 | |||
412 | ///vertical filtering
|
||
413 | buf->BindTexture(texData); |
||
414 | dim3 gridv((width + FILTERV_TILE_WIDTH - 1)/ FILTERV_TILE_WIDTH, (height + FILTERV_TILE_HEIGHT - 1)/FILTERV_TILE_HEIGHT); |
||
415 | dim3 blockv(FILTERV_TILE_WIDTH, FILTERV_BLOCK_HEIGHT); |
||
416 | FilterV<FW><<<gridv, blockv>>>((float*)dst->_cuData, width, height);
|
||
417 | CheckErrorCUDA("FilterV");
|
||
418 | } |
||
419 | |||
420 | //////////////////////////////////////////////////////////////////////
|
||
421 | // tested on 2048x1500 image, the time on pyramid construction is
|
||
422 | // OpenGL version : 18ms
|
||
423 | // CUDA version: 28 ms
|
||
424 | void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf, float sigma) |
||
425 | { |
||
426 | float filter_kernel[KERNEL_MAX_WIDTH]; int width; |
||
427 | CreateFilterKernel(sigma, filter_kernel, width); |
||
428 | cudaMemcpyToSymbol(d_kernel, filter_kernel, width * sizeof(float), 0, cudaMemcpyHostToDevice); |
||
429 | |||
430 | switch(width)
|
||
431 | { |
||
432 | case 5: FilterImage< 5>(dst, src, buf); break; |
||
433 | case 7: FilterImage< 7>(dst, src, buf); break; |
||
434 | case 9: FilterImage< 9>(dst, src, buf); break; |
||
435 | case 11: FilterImage<11>(dst, src, buf); break; |
||
436 | case 13: FilterImage<13>(dst, src, buf); break; |
||
437 | case 15: FilterImage<15>(dst, src, buf); break; |
||
438 | case 17: FilterImage<17>(dst, src, buf); break; |
||
439 | case 19: FilterImage<19>(dst, src, buf); break; |
||
440 | case 21: FilterImage<21>(dst, src, buf); break; |
||
441 | case 23: FilterImage<23>(dst, src, buf); break; |
||
442 | case 25: FilterImage<25>(dst, src, buf); break; |
||
443 | case 27: FilterImage<27>(dst, src, buf); break; |
||
444 | case 29: FilterImage<29>(dst, src, buf); break; |
||
445 | case 31: FilterImage<31>(dst, src, buf); break; |
||
446 | case 33: FilterImage<33>(dst, src, buf); break; |
||
447 | default: break; |
||
448 | } |
||
449 | |||
450 | } |
||
451 | |||
452 | |||
453 | texture<float, 1, cudaReadModeElementType> texC; |
||
454 | texture<float, 1, cudaReadModeElementType> texP; |
||
455 | texture<float, 1, cudaReadModeElementType> texN; |
||
456 | |||
457 | void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int height) |
||
458 | { |
||
459 | int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
|
||
460 | int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
|
||
461 | if(col < width && row < height)
|
||
462 | { |
||
463 | int index = IMUL(row, width) + col;
|
||
464 | float vp = tex1Dfetch(texP, index);
|
||
465 | float v = tex1Dfetch(texC, index);
|
||
466 | d_dog[index] = v - vp; |
||
467 | float vxn = tex1Dfetch(texC, index + 1); |
||
468 | float vxp = tex1Dfetch(texC, index - 1); |
||
469 | float vyp = tex1Dfetch(texC, index - width);
|
||
470 | float vyn = tex1Dfetch(texC, index + width);
|
||
471 | float dx = vxn - vxp, dy = vyn - vyp;
|
||
472 | float grd = 0.5f * sqrt(dx * dx + dy * dy); |
||
473 | float rot = (grd == 0.0f? 0.0f : atan2(dy, dx)); |
||
474 | d_got[index] = make_float2(grd, rot); |
||
475 | } |
||
476 | } |
||
477 | |||
478 | void __global__ ComputeDOG_Kernel(float* d_dog, int width, int height) |
||
479 | { |
||
480 | int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
|
||
481 | int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
|
||
482 | if(col < width && row < height)
|
||
483 | { |
||
484 | int index = IMUL(row, width) + col;
|
||
485 | float vp = tex1Dfetch(texP, index);
|
||
486 | float v = tex1Dfetch(texC, index);
|
||
487 | d_dog[index] = v - vp; |
||
488 | } |
||
489 | } |
||
490 | |||
491 | void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got)
|
||
492 | { |
||
493 | int width = gus->GetImgWidth(), height = gus->GetImgHeight();
|
||
494 | dim3 grid((width + DOG_BLOCK_DIMX - 1)/ DOG_BLOCK_DIMX, (height + DOG_BLOCK_DIMY - 1)/DOG_BLOCK_DIMY); |
||
495 | dim3 block(DOG_BLOCK_DIMX, DOG_BLOCK_DIMY); |
||
496 | gus->BindTexture(texC); |
||
497 | (gus -1)->BindTexture(texP);
|
||
498 | if(got->_cuData)
|
||
499 | ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, (float2*) got->_cuData, width, height);
|
||
500 | else
|
||
501 | ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, width, height);
|
||
502 | } |
||
503 | |||
504 | |||
505 | #define READ_CMP_DOG_DATA(datai, tex, idx) \
|
||
506 | datai[0] = tex1Dfetch(tex, idx - 1);\ |
||
507 | datai[1] = tex1Dfetch(tex, idx);\
|
||
508 | datai[2] = tex1Dfetch(tex, idx + 1);\ |
||
509 | if(v > nmax)\
|
||
510 | {\ |
||
511 | nmax = max(nmax, datai[0]);\
|
||
512 | nmax = max(nmax, datai[1]);\
|
||
513 | nmax = max(nmax, datai[2]);\
|
||
514 | if(v < nmax) goto key_finish;\ |
||
515 | }else\
|
||
516 | {\ |
||
517 | nmin = min(nmin, datai[0]);\
|
||
518 | nmin = min(nmin, datai[1]);\
|
||
519 | nmin = min(nmin, datai[2]);\
|
||
520 | if(v > nmin) goto key_finish;\ |
||
521 | } |
||
522 | |||
523 | |||
524 | void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowmax, |
||
525 | float dog_threshold0, float dog_threshold, float edge_threshold, int subpixel_localization) |
||
526 | { |
||
527 | float data[3][3], v; |
||
528 | float datap[3][3], datan[3][3]; |
||
529 | #ifdef KEY_OFFSET_ONE
|
||
530 | int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y + 1; |
||
531 | int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x + 1; |
||
532 | #else
|
||
533 | int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y;
|
||
534 | int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x;
|
||
535 | #endif
|
||
536 | int index = IMUL(row, width) + col;
|
||
537 | int idx[3] ={index - width, index, index + width}; |
||
538 | int in_image =0; |
||
539 | float nmax, nmin, result = 0.0f; |
||
540 | float dx = 0, dy = 0, ds = 0; |
||
541 | bool offset_test_passed = true; |
||
542 | #ifdef KEY_OFFSET_ONE
|
||
543 | if(row < rowmax && col < colmax)
|
||
544 | #else
|
||
545 | if(row > 0 && col > 0 && row < rowmax && col < colmax) |
||
546 | #endif
|
||
547 | { |
||
548 | in_image = 1;
|
||
549 | data[1][1] = v = tex1Dfetch(texC, idx[1]); |
||
550 | if(fabs(v) <= dog_threshold0) goto key_finish; |
||
551 | |||
552 | data[1][0] = tex1Dfetch(texC, idx[1] - 1); |
||
553 | data[1][2] = tex1Dfetch(texC, idx[1] + 1); |
||
554 | nmax = max(data[1][0], data[1][2]); |
||
555 | nmin = min(data[1][0], data[1][2]); |
||
556 | |||
557 | if(v <=nmax && v >= nmin) goto key_finish; |
||
558 | //if((v > nmax && v < 0 )|| (v < nmin && v > 0)) goto key_finish;
|
||
559 | READ_CMP_DOG_DATA(data[0], texC, idx[0]); |
||
560 | READ_CMP_DOG_DATA(data[2], texC, idx[2]); |
||
561 | |||
562 | //edge supression
|
||
563 | float vx2 = v * 2.0f; |
||
564 | float fxx = data[1][0] + data[1][2] - vx2; |
||
565 | float fyy = data[0][1] + data[2][1] - vx2; |
||
566 | float fxy = 0.25f * (data[2][2] + data[0][0] - data[2][0] - data[0][2]); |
||
567 | float temp1 = fxx * fyy - fxy * fxy;
|
||
568 | float temp2 = (fxx + fyy) * (fxx + fyy);
|
||
569 | if(temp1 <=0 || temp2 > edge_threshold * temp1) goto key_finish; |
||
570 | |||
571 | |||
572 | //read the previous level
|
||
573 | READ_CMP_DOG_DATA(datap[0], texP, idx[0]); |
||
574 | READ_CMP_DOG_DATA(datap[1], texP, idx[1]); |
||
575 | READ_CMP_DOG_DATA(datap[2], texP, idx[2]); |
||
576 | |||
577 | |||
578 | //read the next level
|
||
579 | READ_CMP_DOG_DATA(datan[0], texN, idx[0]); |
||
580 | READ_CMP_DOG_DATA(datan[1], texN, idx[1]); |
||
581 | READ_CMP_DOG_DATA(datan[2], texN, idx[2]); |
||
582 | |||
583 | if(subpixel_localization)
|
||
584 | { |
||
585 | //subpixel localization
|
||
586 | float fx = 0.5f * (data[1][2] - data[1][0]); |
||
587 | float fy = 0.5f * (data[2][1] - data[0][1]); |
||
588 | float fs = 0.5f * (datan[1][1] - datap[1][1]); |
||
589 | |||
590 | float fss = (datan[1][1] + datap[1][1] - vx2); |
||
591 | float fxs = 0.25f* (datan[1][2] + datap[1][0] - datan[1][0] - datap[1][2]); |
||
592 | float fys = 0.25f* (datan[2][1] + datap[0][1] - datan[0][1] - datap[2][1]); |
||
593 | |||
594 | //need to solve dx, dy, ds;
|
||
595 | // |-fx| | fxx fxy fxs | |dx|
|
||
596 | // |-fy| = | fxy fyy fys | * |dy|
|
||
597 | // |-fs| | fxs fys fss | |ds|
|
||
598 | float4 A0 = fxx > 0? make_float4(fxx, fxy, fxs, -fx) : make_float4(-fxx, -fxy, -fxs, fx);
|
||
599 | float4 A1 = fxy > 0? make_float4(fxy, fyy, fys, -fy) : make_float4(-fxy, -fyy, -fys, fy);
|
||
600 | float4 A2 = fxs > 0? make_float4(fxs, fys, fss, -fs) : make_float4(-fxs, -fys, -fss, fs);
|
||
601 | float maxa = max(max(A0.x, A1.x), A2.x);
|
||
602 | if(maxa >= 1e-10) |
||
603 | { |
||
604 | if(maxa == A1.x)
|
||
605 | { |
||
606 | float4 TEMP = A1; A1 = A0; A0 = TEMP; |
||
607 | }else if(maxa == A2.x) |
||
608 | { |
||
609 | float4 TEMP = A2; A2 = A0; A0 = TEMP; |
||
610 | } |
||
611 | A0.y /= A0.x; A0.z /= A0.x; A0.w/= A0.x; |
||
612 | A1.y -= A1.x * A0.y; A1.z -= A1.x * A0.z; A1.w -= A1.x * A0.w; |
||
613 | A2.y -= A2.x * A0.y; A2.z -= A2.x * A0.z; A2.w -= A2.x * A0.w; |
||
614 | if(abs(A2.y) > abs(A1.y))
|
||
615 | { |
||
616 | float4 TEMP = A2; A2 = A1; A1 = TEMP; |
||
617 | } |
||
618 | if(abs(A1.y) >= 1e-10) |
||
619 | { |
||
620 | A1.z /= A1.y; A1.w /= A1.y; |
||
621 | A2.z -= A2.y * A1.z; A2.w -= A2.y * A1.w; |
||
622 | if(abs(A2.z) >= 1e-10) |
||
623 | { |
||
624 | ds = A2.w / A2.z; |
||
625 | dy = A1.w - ds * A1.z; |
||
626 | dx = A0.w - ds * A0.z - dy * A0.y; |
||
627 | |||
628 | offset_test_passed = |
||
629 | fabs(data[1][1] + 0.5f * (dx * fx + dy * fy + ds * fs)) > dog_threshold |
||
630 | &&fabs(ds) < 1.0f && fabs(dx) < 1.0f && fabs(dy) < 1.0f; |
||
631 | } |
||
632 | } |
||
633 | } |
||
634 | } |
||
635 | if(offset_test_passed) result = v > nmax ? 1.0 : -1.0; |
||
636 | } |
||
637 | key_finish:
|
||
638 | if(in_image) d_key[index] = make_float4(result, dx, dy, ds);
|
||
639 | } |
||
640 | |||
641 | |||
642 | void ProgramCU::ComputeKEY(CuTexImage* dog, CuTexImage* key, float Tdog, float Tedge) |
||
643 | { |
||
644 | int width = dog->GetImgWidth(), height = dog->GetImgHeight();
|
||
645 | float Tdog1 = (GlobalUtil::_SubpixelLocalization? 0.8f : 1.0f) * Tdog; |
||
646 | CuTexImage* dogp = dog - 1;
|
||
647 | CuTexImage* dogn = dog + 1;
|
||
648 | #ifdef KEY_OFFSET_ONE
|
||
649 | dim3 grid((width - 1 + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX, (height - 1 + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY); |
||
650 | #else
|
||
651 | dim3 grid((width + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX, (height + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY); |
||
652 | #endif
|
||
653 | dim3 block(KEY_BLOCK_DIMX, KEY_BLOCK_DIMY); |
||
654 | dogp->BindTexture(texP); |
||
655 | dog ->BindTexture(texC); |
||
656 | dogn->BindTexture(texN); |
||
657 | Tedge = (Tedge+1)*(Tedge+1)/Tedge; |
||
658 | ComputeKEY_Kernel<<<grid, block>>>((float4*) key->_cuData, width, |
||
659 | width -1, height -1, Tdog1, Tdog, Tedge, GlobalUtil::_SubpixelLocalization); |
||
660 | |||
661 | } |
||
662 | |||
663 | |||
664 | |||
665 | void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height) |
||
666 | { |
||
667 | int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
|
||
668 | int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
||
669 | if(row < height && col < wd)
|
||
670 | { |
||
671 | int hidx = IMUL(row, wd) + col;
|
||
672 | int scol = col << 2; |
||
673 | int sidx = IMUL(row, ws) + scol;
|
||
674 | int v[4] = {0, 0, 0, 0}; |
||
675 | if(row > 0 && row < height -1) |
||
676 | { |
||
677 | #pragma unroll
|
||
678 | for(int i = 0; i < 4 ; ++i, ++scol) |
||
679 | { |
||
680 | float4 temp = tex1Dfetch(texDataF4, sidx +i); |
||
681 | v[i] = (scol < ws -1 && scol > 0 && temp.x!=0) ? 1 : 0; |
||
682 | } |
||
683 | } |
||
684 | hist[hidx] = make_int4(v[0], v[1], v[2], v[3]); |
||
685 | |||
686 | } |
||
687 | } |
||
688 | |||
689 | |||
690 | |||
691 | void ProgramCU::InitHistogram(CuTexImage* key, CuTexImage* hist)
|
||
692 | { |
||
693 | int ws = key->GetImgWidth(), hs = key->GetImgHeight();
|
||
694 | int wd = hist->GetImgWidth(), hd = hist->GetImgHeight();
|
||
695 | dim3 grid((wd + HIST_INIT_WIDTH - 1)/ HIST_INIT_WIDTH, hd);
|
||
696 | dim3 block(HIST_INIT_WIDTH, 1);
|
||
697 | key->BindTexture(texDataF4); |
||
698 | InitHist_Kernel<<<grid, block>>>((int4*) hist->_cuData, ws, wd, hd); |
||
699 | } |
||
700 | |||
701 | |||
702 | |||
703 | void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height) |
||
704 | { |
||
705 | int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
|
||
706 | int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
||
707 | if(row < height && col < wd)
|
||
708 | { |
||
709 | int hidx = IMUL(row, wd) + col;
|
||
710 | int scol = col << 2; |
||
711 | int sidx = IMUL(row, ws) + scol;
|
||
712 | int v[4] = {0, 0, 0, 0}; |
||
713 | #pragma unroll
|
||
714 | for(int i = 0; i < 4 && scol < ws; ++i, ++scol) |
||
715 | { |
||
716 | int4 temp = tex1Dfetch(texDataI4, sidx + i); |
||
717 | v[i] = temp.x + temp.y + temp.z + temp.w; |
||
718 | } |
||
719 | d_hist[hidx] = make_int4(v[0], v[1], v[2], v[3]); |
||
720 | } |
||
721 | } |
||
722 | |||
723 | void ProgramCU::ReduceHistogram(CuTexImage*hist1, CuTexImage* hist2)
|
||
724 | { |
||
725 | int ws = hist1->GetImgWidth(), hs = hist1->GetImgHeight();
|
||
726 | int wd = hist2->GetImgWidth(), hd = hist2->GetImgHeight();
|
||
727 | int temp = (int)floor(logf(float(wd * 2/ 3)) / logf(2.0f)); |
||
728 | const int wi = min(7, max(temp , 0)); |
||
729 | hist1->BindTexture(texDataI4); |
||
730 | |||
731 | const int BW = 1 << wi, BH = 1 << (7 - wi); |
||
732 | dim3 grid((wd + BW - 1)/ BW, (hd + BH -1) / BH); |
||
733 | dim3 block(BW, BH); |
||
734 | ReduceHist_Kernel<<<grid, block>>>((int4*)hist2->_cuData, ws, wd, hd); |
||
735 | } |
||
736 | |||
737 | |||
738 | void __global__ ListGen_Kernel(int4* d_list, int width) |
||
739 | { |
||
740 | int idx1 = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
||
741 | int4 pos = tex1Dfetch(texDataList, idx1); |
||
742 | int idx2 = IMUL(pos.y, width) + pos.x;
|
||
743 | int4 temp = tex1Dfetch(texDataI4, idx2); |
||
744 | int sum1 = temp.x + temp.y;
|
||
745 | int sum2 = sum1 + temp.z;
|
||
746 | pos.x <<= 2;
|
||
747 | if(pos.z >= sum2)
|
||
748 | { |
||
749 | pos.x += 3;
|
||
750 | pos.z -= sum2; |
||
751 | }else if(pos.z >= sum1) |
||
752 | { |
||
753 | pos.x += 2;
|
||
754 | pos.z -= sum1; |
||
755 | }else if(pos.z >= temp.x) |
||
756 | { |
||
757 | pos.x += 1;
|
||
758 | pos.z -= temp.x; |
||
759 | } |
||
760 | d_list[idx1] = pos; |
||
761 | } |
||
762 | |||
763 | //input list (x, y) (x, y) ....
|
||
764 | void ProgramCU::GenerateList(CuTexImage* list, CuTexImage* hist)
|
||
765 | { |
||
766 | int len = list->GetImgWidth();
|
||
767 | list->BindTexture(texDataList); |
||
768 | hist->BindTexture(texDataI4); |
||
769 | dim3 grid((len + LISTGEN_BLOCK_DIM -1) /LISTGEN_BLOCK_DIM);
|
||
770 | dim3 block(LISTGEN_BLOCK_DIM); |
||
771 | ListGen_Kernel<<<grid, block>>>((int4*) list->_cuData, hist->GetImgWidth()); |
||
772 | } |
||
773 | |||
774 | void __global__ ComputeOrientation_Kernel(float4* d_list,
|
||
775 | int list_len,
|
||
776 | int width, int height, |
||
777 | float sigma, float sigma_step, |
||
778 | float gaussian_factor, float sample_factor, |
||
779 | int num_orientation,
|
||
780 | int existing_keypoint,
|
||
781 | int subpixel,
|
||
782 | int keepsign)
|
||
783 | { |
||
784 | const float ten_degree_per_radius = 5.7295779513082320876798154814105; |
||
785 | const float radius_per_ten_degrees = 1.0 / 5.7295779513082320876798154814105; |
||
786 | int idx = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
|
||
787 | if(idx >= list_len) return; |
||
788 | float4 key; |
||
789 | if(existing_keypoint)
|
||
790 | { |
||
791 | key = tex1Dfetch(texDataF4, idx); |
||
792 | }else
|
||
793 | { |
||
794 | int4 ikey = tex1Dfetch(texDataList, idx); |
||
795 | key.x = ikey.x + 0.5f; |
||
796 | key.y = ikey.y + 0.5f; |
||
797 | key.z = sigma; |
||
798 | if(subpixel || keepsign)
|
||
799 | { |
||
800 | float4 offset = tex1Dfetch(texDataF4, IMUL(width, ikey.y) + ikey.x); |
||
801 | if(subpixel)
|
||
802 | { |
||
803 | key.x += offset.y; |
||
804 | key.y += offset.z; |
||
805 | key.z *= pow(sigma_step, offset.w); |
||
806 | } |
||
807 | if(keepsign) key.z *= offset.x;
|
||
808 | } |
||
809 | } |
||
810 | if(num_orientation == 0) |
||
811 | { |
||
812 | key.w = 0;
|
||
813 | d_list[idx] = key; |
||
814 | return;
|
||
815 | } |
||
816 | float vote[37]; |
||
817 | float gsigma = key.z * gaussian_factor;
|
||
818 | float win = fabs(key.z) * sample_factor;
|
||
819 | float dist_threshold = win * win + 0.5; |
||
820 | float factor = -0.5f / (gsigma * gsigma); |
||
821 | float xmin = max(1.5f, floor(key.x - win) + 0.5f); |
||
822 | float ymin = max(1.5f, floor(key.y - win) + 0.5f); |
||
823 | float xmax = min(width - 1.5f, floor(key.x + win) + 0.5f); |
||
824 | float ymax = min(height -1.5f, floor(key.y + win) + 0.5f); |
||
825 | #pragma unroll
|
||
826 | for(int i = 0; i < 36; ++i) vote[i] = 0.0f; |
||
827 | for(float y = ymin; y <= ymax; y += 1.0f) |
||
828 | { |
||
829 | for(float x = xmin; x <= xmax; x += 1.0f) |
||
830 | { |
||
831 | float dx = x - key.x;
|
||
832 | float dy = y - key.y;
|
||
833 | float sq_dist = dx * dx + dy * dy;
|
||
834 | if(sq_dist >= dist_threshold) continue; |
||
835 | float2 got = tex2D(texDataF2, x, y); |
||
836 | float weight = got.x * exp(sq_dist * factor);
|
||
837 | float fidx = floor(got.y * ten_degree_per_radius);
|
||
838 | int oidx = fidx;
|
||
839 | if(oidx < 0) oidx += 36; |
||
840 | vote[oidx] += weight; |
||
841 | } |
||
842 | } |
||
843 | |||
844 | //filter the vote
|
||
845 | |||
846 | const float one_third = 1.0 /3.0; |
||
847 | #pragma unroll
|
||
848 | for(int i = 0; i < 6; ++i) |
||
849 | { |
||
850 | vote[36] = vote[0]; |
||
851 | float pre = vote[35]; |
||
852 | #pragma unroll
|
||
853 | for(int j = 0; j < 36; ++j) |
||
854 | { |
||
855 | float temp = one_third * (pre + vote[j] + vote[j + 1]); |
||
856 | pre = vote[j]; vote[j] = temp; |
||
857 | } |
||
858 | } |
||
859 | |||
860 | vote[36] = vote[0]; |
||
861 | if(num_orientation == 1 || existing_keypoint) |
||
862 | { |
||
863 | int index_max = 0; |
||
864 | float max_vote = vote[0]; |
||
865 | #pragma unroll
|
||
866 | for(int i = 1; i < 36; ++i) |
||
867 | { |
||
868 | index_max = vote[i] > max_vote? i : index_max; |
||
869 | max_vote = max(max_vote, vote[i]); |
||
870 | } |
||
871 | float pre = vote[index_max == 0? 35 : index_max -1]; |
||
872 | float next = vote[index_max + 1]; |
||
873 | float weight = max_vote;
|
||
874 | float off = 0.5f * FDIV(next - pre, weight + weight - next - pre); |
||
875 | key.w = radius_per_ten_degrees * (index_max + 0.5f + off); |
||
876 | d_list[idx] = key; |
||
877 | |||
878 | }else
|
||
879 | { |
||
880 | float max_vote = vote[0]; |
||
881 | #pragma unroll
|
||
882 | for(int i = 1; i < 36; ++i) max_vote = max(max_vote, vote[i]); |
||
883 | |||
884 | float vote_threshold = max_vote * 0.8f; |
||
885 | float pre = vote[35]; |
||
886 | float max_rot[2], max_vot[2] = {0, 0}; |
||
887 | int ocount = 0; |
||
888 | #pragma unroll
|
||
889 | for(int i =0; i < 36; ++i) |
||
890 | { |
||
891 | float next = vote[i + 1]; |
||
892 | if(vote[i] > vote_threshold && vote[i] > pre && vote[i] > next)
|
||
893 | { |
||
894 | float di = 0.5f * FDIV(next - pre, vote[i] + vote[i] - next - pre); |
||
895 | float rot = i + di + 0.5f; |
||
896 | float weight = vote[i];
|
||
897 | ///
|
||
898 | if(weight > max_vot[1]) |
||
899 | { |
||
900 | if(weight > max_vot[0]) |
||
901 | { |
||
902 | max_vot[1] = max_vot[0]; |
||
903 | max_rot[1] = max_rot[0]; |
||
904 | max_vot[0] = weight;
|
||
905 | max_rot[0] = rot;
|
||
906 | } |
||
907 | else
|
||
908 | { |
||
909 | max_vot[1] = weight;
|
||
910 | max_rot[1] = rot;
|
||
911 | } |
||
912 | ocount ++; |
||
913 | } |
||
914 | } |
||
915 | pre = vote[i]; |
||
916 | } |
||
917 | float fr1 = max_rot[0] / 36.0f; |
||
918 | if(fr1 < 0) fr1 += 1.0f; |
||
919 | unsigned short us1 = ocount == 0? 65535 : ((unsigned short )floor(fr1 * 65535.0f)); |
||
920 | unsigned short us2 = 65535; |
||
921 | if(ocount > 1) |
||
922 | { |
||
923 | float fr2 = max_rot[1] / 36.0f; |
||
924 | if(fr2 < 0) fr2 += 1.0f; |
||
925 | us2 = (unsigned short ) floor(fr2 * 65535.0f); |
||
926 | } |
||
927 | unsigned int uspack = (us2 << 16) | us1; |
||
928 | key.w = __int_as_float(uspack); |
||
929 | d_list[idx] = key; |
||
930 | } |
||
931 | |||
932 | } |
||
933 | |||
934 | |||
935 | |||
936 | |||
937 | void ProgramCU::ComputeOrientation(CuTexImage* list, CuTexImage* got, CuTexImage*key,
|
||
938 | float sigma, float sigma_step, int existing_keypoint) |
||
939 | { |
||
940 | int len = list->GetImgWidth();
|
||
941 | if(len <= 0) return; |
||
942 | int width = got->GetImgWidth(), height = got->GetImgHeight();
|
||
943 | if(existing_keypoint)
|
||
944 | { |
||
945 | list->BindTexture(texDataF4); |
||
946 | }else
|
||
947 | { |
||
948 | list->BindTexture(texDataList); |
||
949 | if(GlobalUtil::_SubpixelLocalization) key->BindTexture(texDataF4);
|
||
950 | } |
||
951 | got->BindTexture2D(texDataF2); |
||
952 | |||
953 | const int block_width = len < ORIENTATION_COMPUTE_PER_BLOCK ? 16 : ORIENTATION_COMPUTE_PER_BLOCK; |
||
954 | dim3 grid((len + block_width -1) / block_width);
|
||
955 | dim3 block(block_width); |
||
956 | |||
957 | ComputeOrientation_Kernel<<<grid, block>>>((float4*) list->_cuData, |
||
958 | len, width, height, sigma, sigma_step, |
||
959 | GlobalUtil::_OrientationGaussianFactor, |
||
960 | GlobalUtil::_OrientationGaussianFactor * GlobalUtil::_OrientationWindowFactor, |
||
961 | GlobalUtil::_FixedOrientation? 0 : GlobalUtil::_MaxOrientation,
|
||
962 | existing_keypoint, GlobalUtil::_SubpixelLocalization, GlobalUtil::_KeepExtremumSign); |
||
963 | |||
964 | ProgramCU::CheckErrorCUDA("ComputeOrientation");
|
||
965 | } |
||
966 | |||
967 | template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptor_Kernel(float4* d_des, int num, |
||
968 | int width, int height, float window_factor) |
||
969 | { |
||
970 | const float rpi = 4.0/ 3.14159265358979323846; |
||
971 | int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
||
972 | int fidx = idx >> 4; |
||
973 | if(fidx >= num) return; |
||
974 | float4 key = tex1Dfetch(texDataF4, fidx); |
||
975 | int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2; |
||
976 | float spt = fabs(key.z * window_factor);
|
||
977 | float s, c; __sincosf(key.w, &s, &c);
|
||
978 | float anglef = key.w > 3.14159265358979323846? key.w - (2.0 * 3.14159265358979323846) : key.w ; |
||
979 | float cspt = c * spt, sspt = s * spt;
|
||
980 | float crspt = c / spt, srspt = s / spt;
|
||
981 | float2 offsetpt, pt; |
||
982 | float xmin, ymin, xmax, ymax, bsz;
|
||
983 | offsetpt.x = ix - 1.5f; |
||
984 | offsetpt.y = iy - 1.5f; |
||
985 | pt.x = cspt * offsetpt.x - sspt * offsetpt.y + key.x; |
||
986 | pt.y = cspt * offsetpt.y + sspt * offsetpt.x + key.y; |
||
987 | bsz = fabs(cspt) + fabs(sspt); |
||
988 | xmin = max(1.5f, floor(pt.x - bsz) + 0.5f); |
||
989 | ymin = max(1.5f, floor(pt.y - bsz) + 0.5f); |
||
990 | xmax = min(width - 1.5f, floor(pt.x + bsz) + 0.5f); |
||
991 | ymax = min(height - 1.5f, floor(pt.y + bsz) + 0.5f); |
||
992 | float des[9]; |
||
993 | #pragma unroll
|
||
994 | for(int i =0; i < 9; ++i) des[i] = 0.0f; |
||
995 | for(float y = ymin; y <= ymax; y += 1.0f) |
||
996 | { |
||
997 | for(float x = xmin; x <= xmax; x += 1.0f) |
||
998 | { |
||
999 | float dx = x - pt.x;
|
||
1000 | float dy = y - pt.y;
|
||
1001 | float nx = crspt * dx + srspt * dy;
|
||
1002 | float ny = crspt * dy - srspt * dx;
|
||
1003 | float nxn = fabs(nx);
|
||
1004 | float nyn = fabs(ny);
|
||
1005 | if(nxn < 1.0f && nyn < 1.0f) |
||
1006 | { |
||
1007 | float2 cc = tex2D(texDataF2, x, y); |
||
1008 | float dnx = nx + offsetpt.x;
|
||
1009 | float dny = ny + offsetpt.y;
|
||
1010 | float ww = exp(-0.125f * (dnx * dnx + dny * dny)); |
||
1011 | float wx = 1.0 - nxn; |
||
1012 | float wy = 1.0 - nyn; |
||
1013 | float weight = ww * wx * wy * cc.x;
|
||
1014 | float theta = (anglef - cc.y) * rpi;
|
||
1015 | if(theta < 0) theta += 8.0f; |
||
1016 | float fo = floor(theta);
|
||
1017 | int fidx = fo;
|
||
1018 | float weight1 = fo + 1.0f - theta; |
||
1019 | float weight2 = theta - fo;
|
||
1020 | if(DYNAMIC_INDEXING)
|
||
1021 | { |
||
1022 | des[fidx] += (weight1 * weight); |
||
1023 | des[fidx + 1] += (weight2 * weight);
|
||
1024 | //this dynamic indexing part might be slow
|
||
1025 | }else
|
||
1026 | { |
||
1027 | #pragma unroll
|
||
1028 | for(int k = 0; k < 8; ++k) |
||
1029 | { |
||
1030 | if(k == fidx)
|
||
1031 | { |
||
1032 | des[k] += (weight1 * weight); |
||
1033 | des[k+1] += (weight2 * weight);
|
||
1034 | } |
||
1035 | } |
||
1036 | } |
||
1037 | } |
||
1038 | } |
||
1039 | } |
||
1040 | des[0] += des[8]; |
||
1041 | |||
1042 | int didx = idx << 1; |
||
1043 | d_des[didx] = make_float4(des[0], des[1], des[2], des[3]); |
||
1044 | d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]); |
||
1045 | } |
||
1046 | |||
1047 | |||
1048 | template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptorRECT_Kernel(float4* d_des, int num, |
||
1049 | int width, int height, float window_factor) |
||
1050 | { |
||
1051 | const float rpi = 4.0/ 3.14159265358979323846; |
||
1052 | int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
||
1053 | int fidx = idx >> 4; |
||
1054 | if(fidx >= num) return; |
||
1055 | float4 key = tex1Dfetch(texDataF4, fidx); |
||
1056 | int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2; |
||
1057 | //float aspect_ratio = key.w / key.z;
|
||
1058 | //float aspect_sq = aspect_ratio * aspect_ratio;
|
||
1059 | float sptx = key.z * 0.25, spty = key.w * 0.25; |
||
1060 | float xmin, ymin, xmax, ymax; float2 pt;
|
||
1061 | pt.x = sptx * (ix + 0.5f) + key.x; |
||
1062 | pt.y = spty * (iy + 0.5f) + key.y; |
||
1063 | xmin = max(1.5f, floor(pt.x - sptx) + 0.5f); |
||
1064 | ymin = max(1.5f, floor(pt.y - spty) + 0.5f); |
||
1065 | xmax = min(width - 1.5f, floor(pt.x + sptx) + 0.5f); |
||
1066 | ymax = min(height - 1.5f, floor(pt.y + spty) + 0.5f); |
||
1067 | float des[9]; |
||
1068 | #pragma unroll
|
||
1069 | for(int i =0; i < 9; ++i) des[i] = 0.0f; |
||
1070 | for(float y = ymin; y <= ymax; y += 1.0f) |
||
1071 | { |
||
1072 | for(float x = xmin; x <= xmax; x += 1.0f) |
||
1073 | { |
||
1074 | float nx = (x - pt.x) / sptx;
|
||
1075 | float ny = (y - pt.y) / spty;
|
||
1076 | float nxn = fabs(nx);
|
||
1077 | float nyn = fabs(ny);
|
||
1078 | if(nxn < 1.0f && nyn < 1.0f) |
||
1079 | { |
||
1080 | float2 cc = tex2D(texDataF2, x, y); |
||
1081 | float wx = 1.0 - nxn; |
||
1082 | float wy = 1.0 - nyn; |
||
1083 | float weight = wx * wy * cc.x;
|
||
1084 | float theta = (- cc.y) * rpi;
|
||
1085 | if(theta < 0) theta += 8.0f; |
||
1086 | float fo = floor(theta);
|
||
1087 | int fidx = fo;
|
||
1088 | float weight1 = fo + 1.0f - theta; |
||
1089 | float weight2 = theta - fo;
|
||
1090 | if(DYNAMIC_INDEXING)
|
||
1091 | { |
||
1092 | des[fidx] += (weight1 * weight); |
||
1093 | des[fidx + 1] += (weight2 * weight);
|
||
1094 | //this dynamic indexing part might be slow
|
||
1095 | }else
|
||
1096 | { |
||
1097 | #pragma unroll
|
||
1098 | for(int k = 0; k < 8; ++k) |
||
1099 | { |
||
1100 | if(k == fidx)
|
||
1101 | { |
||
1102 | des[k] += (weight1 * weight); |
||
1103 | des[k+1] += (weight2 * weight);
|
||
1104 | } |
||
1105 | } |
||
1106 | } |
||
1107 | } |
||
1108 | } |
||
1109 | } |
||
1110 | des[0] += des[8]; |
||
1111 | |||
1112 | int didx = idx << 1; |
||
1113 | d_des[didx] = make_float4(des[0], des[1], des[2], des[3]); |
||
1114 | d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]); |
||
1115 | } |
||
1116 | |||
1117 | void __global__ NormalizeDescriptor_Kernel(float4* d_des, int num) |
||
1118 | { |
||
1119 | float4 temp[32];
|
||
1120 | int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
||
1121 | if(idx >= num) return; |
||
1122 | int sidx = idx << 5; |
||
1123 | float norm1 = 0, norm2 = 0; |
||
1124 | #pragma unroll
|
||
1125 | for(int i = 0; i < 32; ++i) |
||
1126 | { |
||
1127 | temp[i] = tex1Dfetch(texDataF4, sidx +i); |
||
1128 | norm1 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y + |
||
1129 | temp[i].z * temp[i].z + temp[i].w * temp[i].w); |
||
1130 | } |
||
1131 | norm1 = rsqrt(norm1); |
||
1132 | |||
1133 | #pragma unroll
|
||
1134 | for(int i = 0; i < 32; ++i) |
||
1135 | { |
||
1136 | temp[i].x = min(0.2f, temp[i].x * norm1); |
||
1137 | temp[i].y = min(0.2f, temp[i].y * norm1); |
||
1138 | temp[i].z = min(0.2f, temp[i].z * norm1); |
||
1139 | temp[i].w = min(0.2f, temp[i].w * norm1); |
||
1140 | norm2 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y + |
||
1141 | temp[i].z * temp[i].z + temp[i].w * temp[i].w); |
||
1142 | } |
||
1143 | |||
1144 | norm2 = rsqrt(norm2); |
||
1145 | #pragma unroll
|
||
1146 | for(int i = 0; i < 32; ++i) |
||
1147 | { |
||
1148 | temp[i].x *= norm2; temp[i].y *= norm2; |
||
1149 | temp[i].z *= norm2; temp[i].w *= norm2; |
||
1150 | d_des[sidx + i] = temp[i]; |
||
1151 | } |
||
1152 | } |
||
1153 | |||
1154 | void ProgramCU::ComputeDescriptor(CuTexImage*list, CuTexImage* got, CuTexImage* dtex, int rect, int stream) |
||
1155 | { |
||
1156 | int num = list->GetImgWidth();
|
||
1157 | int width = got->GetImgWidth();
|
||
1158 | int height = got->GetImgHeight();
|
||
1159 | |||
1160 | dtex->InitTexture(num * 128, 1, 1); |
||
1161 | got->BindTexture2D(texDataF2); |
||
1162 | list->BindTexture(texDataF4); |
||
1163 | int block_width = DESCRIPTOR_COMPUTE_BLOCK_SIZE;
|
||
1164 | dim3 grid((num * 16 + block_width -1) / block_width); |
||
1165 | dim3 block(block_width); |
||
1166 | |||
1167 | if(rect)
|
||
1168 | { |
||
1169 | if(GlobalUtil::_UseDynamicIndexing)
|
||
1170 | ComputeDescriptorRECT_Kernel<true><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
|
||
1171 | else
|
||
1172 | ComputeDescriptorRECT_Kernel<false><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
|
||
1173 | |||
1174 | }else
|
||
1175 | { |
||
1176 | if(GlobalUtil::_UseDynamicIndexing)
|
||
1177 | ComputeDescriptor_Kernel<true><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
|
||
1178 | else
|
||
1179 | ComputeDescriptor_Kernel<false><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
|
||
1180 | } |
||
1181 | if(GlobalUtil::_NormalizedSIFT)
|
||
1182 | { |
||
1183 | dtex->BindTexture(texDataF4); |
||
1184 | const int block_width = DESCRIPTOR_NORMALIZ_PER_BLOCK; |
||
1185 | dim3 grid((num + block_width -1) / block_width);
|
||
1186 | dim3 block(block_width); |
||
1187 | NormalizeDescriptor_Kernel<<<grid, block>>>((float4*) dtex->_cuData, num); |
||
1188 | } |
||
1189 | CheckErrorCUDA("ComputeDescriptor");
|
||
1190 | } |
||
1191 | |||
1192 | //////////////////////////////////////////////////////
|
||
1193 | void ProgramCU::FinishCUDA()
|
||
1194 | { |
||
1195 | cudaThreadSynchronize(); |
||
1196 | } |
||
1197 | |||
1198 | int ProgramCU::CheckErrorCUDA(const char* location) |
||
1199 | { |
||
1200 | cudaError_t e = cudaGetLastError(); |
||
1201 | if(e)
|
||
1202 | { |
||
1203 | if(location) fprintf(stderr, "%s:\t", location); |
||
1204 | fprintf(stderr, "%s\n", cudaGetErrorString(e));
|
||
1205 | //assert(0);
|
||
1206 | return 1; |
||
1207 | }else
|
||
1208 | { |
||
1209 | return 0; |
||
1210 | } |
||
1211 | } |
||
1212 | |||
1213 | void __global__ ConvertDOG_Kernel(float* d_result, int width, int height) |
||
1214 | { |
||
1215 | int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
|
||
1216 | int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
|
||
1217 | if(col < width && row < height)
|
||
1218 | { |
||
1219 | int index = row * width + col;
|
||
1220 | float v = tex1Dfetch(texData, index);
|
||
1221 | d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)? |
||
1222 | 0.5 : saturate(0.5+20.0*v); |
||
1223 | } |
||
1224 | } |
||
1225 | ///
|
||
1226 | void ProgramCU::DisplayConvertDOG(CuTexImage* dog, CuTexImage* out)
|
||
1227 | { |
||
1228 | if(out->_cuData == NULL) return; |
||
1229 | int width = dog->GetImgWidth(), height = dog ->GetImgHeight();
|
||
1230 | dog->BindTexture(texData); |
||
1231 | dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM); |
||
1232 | dim3 block(BLOCK_DIM, BLOCK_DIM); |
||
1233 | ConvertDOG_Kernel<<<grid, block>>>((float*) out->_cuData, width, height);
|
||
1234 | ProgramCU::CheckErrorCUDA("DisplayConvertDOG");
|
||
1235 | } |
||
1236 | |||
1237 | void __global__ ConvertGRD_Kernel(float* d_result, int width, int height) |
||
1238 | { |
||
1239 | int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
|
||
1240 | int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
|
||
1241 | if(col < width && row < height)
|
||
1242 | { |
||
1243 | int index = row * width + col;
|
||
1244 | float v = tex1Dfetch(texData, index << 1); |
||
1245 | d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)? |
||
1246 | 0 : saturate(5 * v); |
||
1247 | |||
1248 | } |
||
1249 | } |
||
1250 | |||
1251 | |||
1252 | void ProgramCU::DisplayConvertGRD(CuTexImage* got, CuTexImage* out)
|
||
1253 | { |
||
1254 | if(out->_cuData == NULL) return; |
||
1255 | int width = got->GetImgWidth(), height = got ->GetImgHeight();
|
||
1256 | got->BindTexture(texData); |
||
1257 | dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM); |
||
1258 | dim3 block(BLOCK_DIM, BLOCK_DIM); |
||
1259 | ConvertGRD_Kernel<<<grid, block>>>((float*) out->_cuData, width, height);
|
||
1260 | ProgramCU::CheckErrorCUDA("DisplayConvertGRD");
|
||
1261 | } |
||
1262 | |||
1263 | void __global__ ConvertKEY_Kernel(float4* d_result, int width, int height) |
||
1264 | { |
||
1265 | |||
1266 | int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
|
||
1267 | int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
|
||
1268 | if(col < width && row < height)
|
||
1269 | { |
||
1270 | int index = row * width + col;
|
||
1271 | float4 keyv = tex1Dfetch(texDataF4, index); |
||
1272 | int is_key = (keyv.x == 1.0f || keyv.x == -1.0f); |
||
1273 | int inside = col > 0 && row > 0 && row < height -1 && col < width - 1; |
||
1274 | float v = inside? saturate(0.5 + 20 * tex1Dfetch(texData, index)) : 0.5; |
||
1275 | d_result[index] = is_key && inside ? |
||
1276 | (keyv.x > 0? make_float4(1.0f, 0, 0, 1.0f) : make_float4(0.0f, 1.0f, 0.0f, 1.0f)): |
||
1277 | make_float4(v, v, v, 1.0f) ; |
||
1278 | } |
||
1279 | } |
||
1280 | void ProgramCU::DisplayConvertKEY(CuTexImage* key, CuTexImage* dog, CuTexImage* out)
|
||
1281 | { |
||
1282 | if(out->_cuData == NULL) return; |
||
1283 | int width = key->GetImgWidth(), height = key ->GetImgHeight();
|
||
1284 | dog->BindTexture(texData); |
||
1285 | key->BindTexture(texDataF4); |
||
1286 | dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM); |
||
1287 | dim3 block(BLOCK_DIM, BLOCK_DIM); |
||
1288 | ConvertKEY_Kernel<<<grid, block>>>((float4*) out->_cuData, width, height); |
||
1289 | } |
||
1290 | |||
1291 | |||
1292 | void __global__ DisplayKeyPoint_Kernel(float4 * d_result, int num) |
||
1293 | { |
||
1294 | int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
||
1295 | if(idx >= num) return; |
||
1296 | float4 v = tex1Dfetch(texDataF4, idx); |
||
1297 | d_result[idx] = make_float4(v.x, v.y, 0, 1.0f); |
||
1298 | } |
||
1299 | |||
1300 | void ProgramCU::DisplayKeyPoint(CuTexImage* ftex, CuTexImage* out)
|
||
1301 | { |
||
1302 | int num = ftex->GetImgWidth();
|
||
1303 | int block_width = 64; |
||
1304 | dim3 grid((num + block_width -1) /block_width);
|
||
1305 | dim3 block(block_width); |
||
1306 | ftex->BindTexture(texDataF4); |
||
1307 | DisplayKeyPoint_Kernel<<<grid, block>>>((float4*) out->_cuData, num); |
||
1308 | ProgramCU::CheckErrorCUDA("DisplayKeyPoint");
|
||
1309 | } |
||
1310 | |||
1311 | void __global__ DisplayKeyBox_Kernel(float4* d_result, int num) |
||
1312 | { |
||
1313 | int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
||
1314 | if(idx >= num) return; |
||
1315 | int kidx = idx / 10, vidx = idx - IMUL(kidx , 10); |
||
1316 | float4 v = tex1Dfetch(texDataF4, kidx); |
||
1317 | float sz = fabs(v.z * 3.0f); |
||
1318 | ///////////////////////
|
||
1319 | float s, c; __sincosf(v.w, &s, &c);
|
||
1320 | ///////////////////////
|
||
1321 | float dx = vidx == 0? 0 : ((vidx <= 4 || vidx >= 9)? sz : -sz); |
||
1322 | float dy = vidx <= 1? 0 : ((vidx <= 2 || vidx >= 7)? -sz : sz); |
||
1323 | float4 pos; |
||
1324 | pos.x = v.x + c * dx - s * dy; |
||
1325 | pos.y = v.y + c * dy + s * dx; |
||
1326 | pos.z = 0; pos.w = 1.0f; |
||
1327 | d_result[idx] = pos; |
||
1328 | } |
||
1329 | |||
1330 | void ProgramCU::DisplayKeyBox(CuTexImage* ftex, CuTexImage* out)
|
||
1331 | { |
||
1332 | int len = ftex->GetImgWidth();
|
||
1333 | int block_width = 32; |
||
1334 | dim3 grid((len * 10 + block_width -1) / block_width); |
||
1335 | dim3 block(block_width); |
||
1336 | ftex->BindTexture(texDataF4); |
||
1337 | DisplayKeyBox_Kernel<<<grid, block>>>((float4*) out->_cuData, len * 10);
|
||
1338 | } |
||
1339 | ///////////////////////////////////////////////////////////////////
|
||
1340 | inline void CuTexImage:: BindTexture(textureReference& texRef) |
||
1341 | { |
||
1342 | cudaBindTexture(NULL, &texRef, _cuData, &texRef.channelDesc, _numBytes);
|
||
1343 | } |
||
1344 | |||
1345 | inline void CuTexImage::BindTexture2D(textureReference& texRef) |
||
1346 | { |
||
1347 | #if defined(SIFTGPU_ENABLE_LINEAR_TEX2D)
|
||
1348 | cudaBindTexture2D(0, &texRef, _cuData, &texRef.channelDesc, _imgWidth, _imgHeight, _imgWidth* _numChannel* sizeof(float)); |
||
1349 | #else
|
||
1350 | cudaChannelFormatDesc desc; |
||
1351 | cudaGetChannelDesc(&desc, _cuData2D); |
||
1352 | cudaBindTextureToArray(&texRef, _cuData2D, &desc); |
||
1353 | #endif
|
||
1354 | } |
||
1355 | |||
1356 | int ProgramCU::CheckCudaDevice(int device) |
||
1357 | { |
||
1358 | int count = 0, device_used; |
||
1359 | if(cudaGetDeviceCount(&count) || count <= 0) |
||
1360 | { |
||
1361 | ProgramCU::CheckErrorCUDA("CheckCudaDevice");
|
||
1362 | return 0; |
||
1363 | }else if(count == 1) |
||
1364 | { |
||
1365 | cudaDeviceProp deviceProp; |
||
1366 | cudaGetDeviceProperties(&deviceProp, 0);
|
||
1367 | if (deviceProp.major == 9999 && deviceProp.minor == 9999) |
||
1368 | { |
||
1369 | fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n");
|
||
1370 | return 0; |
||
1371 | } |
||
1372 | } |
||
1373 | if(device >0 && device < count) |
||
1374 | { |
||
1375 | cudaSetDevice(device); |
||
1376 | CheckErrorCUDA("cudaSetDevice\n");
|
||
1377 | } |
||
1378 | cudaGetDevice(&device_used); |
||
1379 | if(device != device_used)
|
||
1380 | fprintf(stderr, "\nERROR: Cannot set device to %d\n"
|
||
1381 | "\nWARNING: Use # %d device instead (out of %d)\n", device, device_used, count);
|
||
1382 | return 1; |
||
1383 | } |
||
1384 | |||
1385 | ////////////////////////////////////////////////////////////////////////////////////////
|
||
1386 | // siftmatch funtions
|
||
1387 | //////////////////////////////////////////////////////////////////////////////////////////
|
||
1388 | |||
1389 | #define MULT_TBLOCK_DIMX 128 |
||
1390 | #define MULT_TBLOCK_DIMY 1 |
||
1391 | #define MULT_BLOCK_DIMX (MULT_TBLOCK_DIMX)
|
||
1392 | #define MULT_BLOCK_DIMY (8 * MULT_TBLOCK_DIMY) |
||
1393 | |||
1394 | |||
1395 | texture<uint4, 1, cudaReadModeElementType> texDes1;
|
||
1396 | texture<uint4, 1, cudaReadModeElementType> texDes2;
|
||
1397 | |||
1398 | void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int3* d_temp) |
||
1399 | { |
||
1400 | int idx01 = (blockIdx.y * MULT_BLOCK_DIMY), idx02 = (blockIdx.x * MULT_BLOCK_DIMX);
|
||
1401 | |||
1402 | int idx1 = idx01 + threadIdx.y, idx2 = idx02 + threadIdx.x;
|
||
1403 | __shared__ int data1[17 * 2 * MULT_BLOCK_DIMY]; |
||
1404 | int read_idx1 = idx01 * 8 + threadIdx.x, read_idx2 = idx2 * 8; |
||
1405 | int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2; |
||
1406 | int cache_idx1 = IMUL(row4, 17) + (col4 << 2); |
||
1407 | |||
1408 | ///////////////////////////////////////////////////////////////
|
||
1409 | //Load feature descriptors
|
||
1410 | ///////////////////////////////////////////////////////////////
|
||
1411 | #if MULT_BLOCK_DIMY == 16 |
||
1412 | uint4 v = tex1Dfetch(texDes1, read_idx1); |
||
1413 | data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y;
|
||
1414 | data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w; |
||
1415 | #elif MULT_BLOCK_DIMY == 8 |
||
1416 | if(threadIdx.x < 64) |
||
1417 | { |
||
1418 | uint4 v = tex1Dfetch(texDes1, read_idx1); |
||
1419 | data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y;
|
||
1420 | data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w; |
||
1421 | } |
||
1422 | #else
|
||
1423 | #error
|
||
1424 | #endif
|
||
1425 | __syncthreads(); |
||
1426 | |||
1427 | ///
|
||
1428 | if(idx2 >= num2) return; |
||
1429 | ///////////////////////////////////////////////////////////////////////////
|
||
1430 | //compare descriptors
|
||
1431 | |||
1432 | int results[MULT_BLOCK_DIMY];
|
||
1433 | #pragma unroll
|
||
1434 | for(int i = 0; i < MULT_BLOCK_DIMY; ++i) results[i] = 0; |
||
1435 | |||
1436 | #pragma unroll
|
||
1437 | for(int i = 0; i < 8; ++i) |
||
1438 | { |
||
1439 | uint4 v = tex1Dfetch(texDes2, read_idx2 + i); |
||
1440 | unsigned char* p2 = (unsigned char*)(&v); |
||
1441 | #pragma unroll
|
||
1442 | for(int k = 0; k < MULT_BLOCK_DIMY; ++k) |
||
1443 | { |
||
1444 | unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i * 4 + (i/4)); |
||
1445 | results[k] += ( IMUL(p1[0], p2[0]) + IMUL(p1[1], p2[1]) |
||
1446 | + IMUL(p1[2], p2[2]) + IMUL(p1[3], p2[3]) |
||
1447 | + IMUL(p1[4], p2[4]) + IMUL(p1[5], p2[5]) |
||
1448 | + IMUL(p1[6], p2[6]) + IMUL(p1[7], p2[7]) |
||
1449 | + IMUL(p1[8], p2[8]) + IMUL(p1[9], p2[9]) |
||
1450 | + IMUL(p1[10], p2[10]) + IMUL(p1[11], p2[11]) |
||
1451 | + IMUL(p1[12], p2[12]) + IMUL(p1[13], p2[13]) |
||
1452 | + IMUL(p1[14], p2[14]) + IMUL(p1[15], p2[15])); |
||
1453 | } |
||
1454 | } |
||
1455 | |||
1456 | int dst_idx = IMUL(idx1, num2) + idx2;
|
||
1457 | if(d_temp)
|
||
1458 | { |
||
1459 | int3 cmp_result = make_int3(0, -1, 0); |
||
1460 | |||
1461 | #pragma unroll
|
||
1462 | for(int i = 0; i < MULT_BLOCK_DIMY; ++i) |
||
1463 | { |
||
1464 | if(idx1 + i < num1)
|
||
1465 | { |
||
1466 | cmp_result = results[i] > cmp_result.x? |
||
1467 | make_int3(results[i], idx1 + i, cmp_result.x) : |
||
1468 | make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i])); |
||
1469 | d_result[dst_idx + IMUL(i, num2)] = results[i]; |
||
1470 | } |
||
1471 | } |
||
1472 | d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result; |
||
1473 | }else
|
||
1474 | { |
||
1475 | #pragma unroll
|
||
1476 | for(int i = 0; i < MULT_BLOCK_DIMY; ++i) |
||
1477 | { |
||
1478 | if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = results[i];
|
||
1479 | } |
||
1480 | } |
||
1481 | |||
1482 | } |
||
1483 | |||
1484 | |||
1485 | void ProgramCU::MultiplyDescriptor(CuTexImage* des1, CuTexImage* des2, CuTexImage* texDot, CuTexImage* texCRT)
|
||
1486 | { |
||
1487 | int num1 = des1->GetImgWidth() / 8; |
||
1488 | int num2 = des2->GetImgWidth() / 8; |
||
1489 | dim3 grid( (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
|
||
1490 | (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
|
||
1491 | dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY); |
||
1492 | texDot->InitTexture( num2,num1); |
||
1493 | if(texCRT) texCRT->InitTexture(num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 32); |
||
1494 | des1->BindTexture(texDes1); |
||
1495 | des2->BindTexture(texDes2); |
||
1496 | |||
1497 | MultiplyDescriptor_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2,
|
||
1498 | (texCRT? (int3*)texCRT->_cuData : NULL));
|
||
1499 | ProgramCU::CheckErrorCUDA("MultiplyDescriptor");
|
||
1500 | } |
||
1501 | |||
1502 | texture<float, 1, cudaReadModeElementType> texLoc1; |
||
1503 | texture<float2, 1, cudaReadModeElementType> texLoc2;
|
||
1504 | struct Matrix33{float mat[3][3];}; |
||
1505 | |||
1506 | |||
1507 | |||
1508 | void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, int3* d_temp, |
||
1509 | Matrix33 H, float hdistmax, Matrix33 F, float fdistmax) |
||
1510 | { |
||
1511 | int idx01 = (blockIdx.y * MULT_BLOCK_DIMY);
|
||
1512 | int idx02 = (blockIdx.x * MULT_BLOCK_DIMX);
|
||
1513 | |||
1514 | int idx1 = idx01 + threadIdx.y;
|
||
1515 | int idx2 = idx02 + threadIdx.x;
|
||
1516 | __shared__ int data1[17 * 2 * MULT_BLOCK_DIMY]; |
||
1517 | __shared__ float loc1[MULT_BLOCK_DIMY * 2]; |
||
1518 | int read_idx1 = idx01 * 8 + threadIdx.x ; |
||
1519 | int read_idx2 = idx2 * 8; |
||
1520 | int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2; |
||
1521 | int cache_idx1 = IMUL(row4, 17) + (col4 << 2); |
||
1522 | #if MULT_BLOCK_DIMY == 16 |
||
1523 | uint4 v = tex1Dfetch(texDes1, read_idx1); |
||
1524 | data1[cache_idx1] = v.x; |
||
1525 | data1[cache_idx1+1] = v.y;
|
||
1526 | data1[cache_idx1+2] = v.z;
|
||
1527 | data1[cache_idx1+3] = v.w;
|
||
1528 | #elif MULT_BLOCK_DIMY == 8 |
||
1529 | if(threadIdx.x < 64) |
||
1530 | { |
||
1531 | uint4 v = tex1Dfetch(texDes1, read_idx1); |
||
1532 | data1[cache_idx1] = v.x; |
||
1533 | data1[cache_idx1+1] = v.y;
|
||
1534 | data1[cache_idx1+2] = v.z;
|
||
1535 | data1[cache_idx1+3] = v.w;
|
||
1536 | } |
||
1537 | #else
|
||
1538 | #error
|
||
1539 | #endif
|
||
1540 | __syncthreads(); |
||
1541 | if(threadIdx.x < MULT_BLOCK_DIMY * 2) |
||
1542 | { |
||
1543 | loc1[threadIdx.x] = tex1Dfetch(texLoc1, 2 * idx01 + threadIdx.x);
|
||
1544 | } |
||
1545 | __syncthreads(); |
||
1546 | if(idx2 >= num2) return; |
||
1547 | int results[MULT_BLOCK_DIMY];
|
||
1548 | /////////////////////////////////////////////////////////////////////////////////////////////
|
||
1549 | //geometric verification
|
||
1550 | /////////////////////////////////////////////////////////////////////////////////////////////
|
||
1551 | int good_count = 0; |
||
1552 | float2 loc2 = tex1Dfetch(texLoc2, idx2); |
||
1553 | #pragma unroll
|
||
1554 | for(int i = 0; i < MULT_BLOCK_DIMY; ++i) |
||
1555 | { |
||
1556 | |||
1557 | if(idx1 + i < num1)
|
||
1558 | { |
||
1559 | float* loci = loc1 + i * 2; |
||
1560 | float locx = loci[0], locy = loci[1]; |
||
1561 | //homography
|
||
1562 | float x[3], diff[2]; |
||
1563 | x[0] = H.mat[0][0] * locx + H.mat[0][1] * locy + H.mat[0][2]; |
||
1564 | x[1] = H.mat[1][0] * locx + H.mat[1][1] * locy + H.mat[1][2]; |
||
1565 | x[2] = H.mat[2][0] * locx + H.mat[2][1] * locy + H.mat[2][2]; |
||
1566 | diff[0] = fabs(FDIV(x[0], x[2]) - loc2.x); |
||
1567 | diff[1] = fabs(FDIV(x[1], x[2]) - loc2.y); |
||
1568 | if(diff[0] < hdistmax && diff[1] < hdistmax) |
||
1569 | { |
||
1570 | //check fundamental matrix
|
||
1571 | float fx1[3], ftx2[3], x2fx1, se; |
||
1572 | fx1[0] = F.mat[0][0] * locx + F.mat[0][1] * locy + F.mat[0][2]; |
||
1573 | fx1[1] = F.mat[1][0] * locx + F.mat[1][1] * locy + F.mat[1][2]; |
||
1574 | //fx1[2] = F.mat[2][0] * locx + F.mat[2][1] * locy + F.mat[2][2];
|
||
1575 | |||
1576 | ftx2[0] = F.mat[0][0] * locx + F.mat[1][0] * locy + F.mat[2][0]; |
||
1577 | ftx2[1] = F.mat[0][1] * locx + F.mat[1][1] * locy + F.mat[2][1]; |
||
1578 | ftx2[2] = F.mat[0][2] * locx + F.mat[1][2] * locy + F.mat[2][2]; |
||
1579 | |||
1580 | x2fx1 = locx * ftx2[0] + locy * ftx2[1] + ftx2[2]; |
||
1581 | se = FDIV(x2fx1 * x2fx1, fx1[0] * fx1[0] + fx1[1] * fx1[1] + ftx2[0] * ftx2[0] + ftx2[1] * ftx2[1]); |
||
1582 | results[i] = se < fdistmax? 0: -262144; |
||
1583 | }else
|
||
1584 | { |
||
1585 | results[i] = -262144;
|
||
1586 | } |
||
1587 | }else
|
||
1588 | { |
||
1589 | results[i] = -262144;
|
||
1590 | } |
||
1591 | good_count += (results[i] >=0);
|
||
1592 | } |
||
1593 | /////////////////////////////////////////////////////////////////////////////////////////////
|
||
1594 | ///compare feature descriptors anyway
|
||
1595 | /////////////////////////////////////////////////////////////////////////////////////////////
|
||
1596 | if(good_count > 0) |
||
1597 | { |
||
1598 | #pragma unroll
|
||
1599 | for(int i = 0; i < 8; ++i) |
||
1600 | { |
||
1601 | uint4 v = tex1Dfetch(texDes2, read_idx2 + i); |
||
1602 | unsigned char* p2 = (unsigned char*)(&v); |
||
1603 | #pragma unroll
|
||
1604 | for(int k = 0; k < MULT_BLOCK_DIMY; ++k) |
||
1605 | { |
||
1606 | unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i * 4 + (i/4)); |
||
1607 | results[k] += ( IMUL(p1[0], p2[0]) + IMUL(p1[1], p2[1]) |
||
1608 | + IMUL(p1[2], p2[2]) + IMUL(p1[3], p2[3]) |
||
1609 | + IMUL(p1[4], p2[4]) + IMUL(p1[5], p2[5]) |
||
1610 | + IMUL(p1[6], p2[6]) + IMUL(p1[7], p2[7]) |
||
1611 | + IMUL(p1[8], p2[8]) + IMUL(p1[9], p2[9]) |
||
1612 | + IMUL(p1[10], p2[10]) + IMUL(p1[11], p2[11]) |
||
1613 | + IMUL(p1[12], p2[12]) + IMUL(p1[13], p2[13]) |
||
1614 | + IMUL(p1[14], p2[14]) + IMUL(p1[15], p2[15])); |
||
1615 | } |
||
1616 | } |
||
1617 | } |
||
1618 | int dst_idx = IMUL(idx1, num2) + idx2;
|
||
1619 | if(d_temp)
|
||
1620 | { |
||
1621 | int3 cmp_result = make_int3(0, -1, 0); |
||
1622 | #pragma unroll
|
||
1623 | for(int i= 0; i < MULT_BLOCK_DIMY; ++i) |
||
1624 | { |
||
1625 | if(idx1 + i < num1)
|
||
1626 | { |
||
1627 | cmp_result = results[i] > cmp_result.x? |
||
1628 | make_int3(results[i], idx1 + i, cmp_result.x) : |
||
1629 | make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i])); |
||
1630 | d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
|
||
1631 | }else
|
||
1632 | { |
||
1633 | break;
|
||
1634 | } |
||
1635 | } |
||
1636 | d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result; |
||
1637 | }else
|
||
1638 | { |
||
1639 | #pragma unroll
|
||
1640 | for(int i = 0; i < MULT_BLOCK_DIMY; ++i) |
||
1641 | { |
||
1642 | if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0); |
||
1643 | else break; |
||
1644 | } |
||
1645 | } |
||
1646 | |||
1647 | } |
||
1648 | |||
1649 | |||
1650 | void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2,
|
||
1651 | CuTexImage* loc1, CuTexImage* loc2, CuTexImage* texDot, CuTexImage* texCRT, |
||
1652 | float H[3][3], float hdistmax, float F[3][3], float fdistmax) |
||
1653 | { |
||
1654 | int num1 = des1->GetImgWidth() / 8; |
||
1655 | int num2 = des2->GetImgWidth() / 8; |
||
1656 | Matrix33 MatF, MatH; |
||
1657 | //copy the matrix
|
||
1658 | memcpy(MatF.mat, F, 9 * sizeof(float)); |
||
1659 | memcpy(MatH.mat, H, 9 * sizeof(float)); |
||
1660 | //thread blocks
|
||
1661 | dim3 grid( (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
|
||
1662 | (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
|
||
1663 | dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY); |
||
1664 | //intermediate results
|
||
1665 | texDot->InitTexture( num2,num1); |
||
1666 | if(texCRT) texCRT->InitTexture( num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 3); |
||
1667 | loc1->BindTexture(texLoc1); |
||
1668 | loc2->BindTexture(texLoc2); |
||
1669 | des1->BindTexture(texDes1); |
||
1670 | des2->BindTexture(texDes2); |
||
1671 | MultiplyDescriptorG_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2,
|
||
1672 | (texCRT? (int3*)texCRT->_cuData : NULL),
|
||
1673 | MatH, hdistmax, MatF, fdistmax); |
||
1674 | } |
||
1675 | |||
1676 | |||
1677 | texture<int, 1, cudaReadModeElementType> texDOT; |
||
1678 | |||
1679 | #define ROWMATCH_BLOCK_WIDTH 32 |
||
1680 | #define ROWMATCH_BLOCK_HEIGHT 1 |
||
1681 | |||
1682 | void __global__ RowMatch_Kernel(int*d_dot, int* d_result, int num2, float distmax, float ratiomax) |
||
1683 | { |
||
1684 | #if ROWMATCH_BLOCK_HEIGHT == 1 |
||
1685 | __shared__ int dotmax[ROWMATCH_BLOCK_WIDTH];
|
||
1686 | __shared__ int dotnxt[ROWMATCH_BLOCK_WIDTH];
|
||
1687 | __shared__ int dotidx[ROWMATCH_BLOCK_WIDTH];
|
||
1688 | int row = blockIdx.y;
|
||
1689 | #else
|
||
1690 | __shared__ int x_dotmax[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
|
||
1691 | __shared__ int x_dotnxt[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
|
||
1692 | __shared__ int x_dotidx[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
|
||
1693 | int* dotmax = x_dotmax[threadIdx.y];
|
||
1694 | int* dotnxt = x_dotnxt[threadIdx.y];
|
||
1695 | int* dotidx = x_dotidx[threadIdx.y];
|
||
1696 | int row = IMUL(blockIdx.y, ROWMATCH_BLOCK_HEIGHT) + threadIdx.y;
|
||
1697 | #endif
|
||
1698 | |||
1699 | int base_address = IMUL(row , num2);
|
||
1700 | int t_dotmax = 0, t_dotnxt = 0, t_dotidx = -1; |
||
1701 | for(int i = 0; i < num2; i += ROWMATCH_BLOCK_WIDTH) |
||
1702 | { |
||
1703 | if(threadIdx.x + i < num2)
|
||
1704 | { |
||
1705 | int v = tex1Dfetch(texDOT, base_address + threadIdx.x + i);//d_dot[base_address + threadIdx.x + i];// |
||
1706 | bool test = v > t_dotmax;
|
||
1707 | t_dotnxt = test? t_dotmax : max(t_dotnxt, v); |
||
1708 | t_dotidx = test? (threadIdx.x + i) : t_dotidx; |
||
1709 | t_dotmax = test? v: t_dotmax; |
||
1710 | } |
||
1711 | __syncthreads(); |
||
1712 | } |
||
1713 | dotmax[threadIdx.x] = t_dotmax; |
||
1714 | dotnxt[threadIdx.x] = t_dotnxt; |
||
1715 | dotidx[threadIdx.x] = t_dotidx; |
||
1716 | __syncthreads(); |
||
1717 | |||
1718 | #pragma unroll
|
||
1719 | for(int step = ROWMATCH_BLOCK_WIDTH/2; step >0; step /= 2) |
||
1720 | { |
||
1721 | if(threadIdx.x < step)
|
||
1722 | { |
||
1723 | int v1 = dotmax[threadIdx.x], v2 = dotmax[threadIdx.x + step];
|
||
1724 | bool test = v2 > v1;
|
||
1725 | dotnxt[threadIdx.x] = test? max(v1, dotnxt[threadIdx.x + step]) :max(dotnxt[threadIdx.x], v2); |
||
1726 | dotidx[threadIdx.x] = test? dotidx[threadIdx.x + step] : dotidx[threadIdx.x]; |
||
1727 | dotmax[threadIdx.x] = test? v2 : v1; |
||
1728 | } |
||
1729 | __syncthreads(); |
||
1730 | } |
||
1731 | if(threadIdx.x == 0) |
||
1732 | { |
||
1733 | float dist = acos(min(dotmax[0] * 0.000003814697265625f, 1.0)); |
||
1734 | float distn = acos(min(dotnxt[0] * 0.000003814697265625f, 1.0)); |
||
1735 | //float ratio = dist / distn;
|
||
1736 | d_result[row] = (dist < distmax) && (dist < distn * ratiomax) ? dotidx[0] : -1;//? : -1; |
||
1737 | } |
||
1738 | |||
1739 | } |
||
1740 | |||
1741 | |||
1742 | void ProgramCU::GetRowMatch(CuTexImage* texDot, CuTexImage* texMatch, float distmax, float ratiomax) |
||
1743 | { |
||
1744 | int num1 = texDot->GetImgHeight();
|
||
1745 | int num2 = texDot->GetImgWidth();
|
||
1746 | dim3 grid(1, num1/ROWMATCH_BLOCK_HEIGHT);
|
||
1747 | dim3 block(ROWMATCH_BLOCK_WIDTH, ROWMATCH_BLOCK_HEIGHT); |
||
1748 | texDot->BindTexture(texDOT); |
||
1749 | RowMatch_Kernel<<<grid, block>>>((int*)texDot->_cuData,
|
||
1750 | (int*)texMatch->_cuData, num2, distmax, ratiomax);
|
||
1751 | } |
||
1752 | |||
1753 | #define COLMATCH_BLOCK_WIDTH 32 |
||
1754 | |||
1755 | //texture<int3, 1, cudaReadModeElementType> texCT;
|
||
1756 | |||
1757 | void __global__ ColMatch_Kernel(int3*d_crt, int* d_result, int height, int num2, float distmax, float ratiomax) |
||
1758 | { |
||
1759 | int col = COLMATCH_BLOCK_WIDTH * blockIdx.x + threadIdx.x;
|
||
1760 | if(col >= num2) return; |
||
1761 | int3 result = d_crt[col];//tex1Dfetch(texCT, col);
|
||
1762 | int read_idx = col + num2;
|
||
1763 | for(int i = 1; i < height; ++i, read_idx += num2) |
||
1764 | { |
||
1765 | int3 temp = d_crt[read_idx];//tex1Dfetch(texCT, read_idx);
|
||
1766 | result = result.x < temp.x? |
||
1767 | make_int3(temp.x, temp.y, max(result.x, temp.z)) : |
||
1768 | make_int3(result.x, result.y, max(result.z, temp.x)); |
||
1769 | } |
||
1770 | |||
1771 | float dist = acos(min(result.x * 0.000003814697265625f, 1.0)); |
||
1772 | float distn = acos(min(result.z * 0.000003814697265625f, 1.0)); |
||
1773 | //float ratio = dist / distn;
|
||
1774 | d_result[col] = (dist < distmax) && (dist < distn * ratiomax) ? result.y : -1;//? : -1; |
||
1775 | |||
1776 | } |
||
1777 | |||
1778 | void ProgramCU::GetColMatch(CuTexImage* texCRT, CuTexImage* texMatch, float distmax, float ratiomax) |
||
1779 | { |
||
1780 | int height = texCRT->GetImgHeight();
|
||
1781 | int num2 = texCRT->GetImgWidth();
|
||
1782 | //texCRT->BindTexture(texCT);
|
||
1783 | dim3 grid((num2 + COLMATCH_BLOCK_WIDTH -1) / COLMATCH_BLOCK_WIDTH);
|
||
1784 | dim3 block(COLMATCH_BLOCK_WIDTH); |
||
1785 | ColMatch_Kernel<<<grid, block>>>((int3*)texCRT->_cuData, (int*) texMatch->_cuData, height, num2, distmax, ratiomax);
|
||
1786 | } |
||
1787 | |||
1788 | #endif |