root / rgbdslam / external / siftgpu / src / SiftGPU / ProgramCU.cu @ 9240aaa3
History | View | Annotate | Download (56.6 KB)
1 |
////////////////////////////////////////////////////////////////////////////
|
---|---|
2 |
// File: ProgramCU.cu
|
3 |
// Author: Changchang Wu
|
4 |
// Description : implementation of ProgramCU and all CUDA kernels
|
5 |
//
|
6 |
// Copyright (c) 2007 University of North Carolina at Chapel Hill
|
7 |
// All Rights Reserved
|
8 |
//
|
9 |
// Permission to use, copy, modify and distribute this software and its
|
10 |
// documentation for educational, research and non-profit purposes, without
|
11 |
// fee, and without a written agreement is hereby granted, provided that the
|
12 |
// above copyright notice and the following paragraph appear in all copies.
|
13 |
//
|
14 |
// The University of North Carolina at Chapel Hill make no representations
|
15 |
// about the suitability of this software for any purpose. It is provided
|
16 |
// 'as is' without express or implied warranty.
|
17 |
//
|
18 |
// Please send BUG REPORTS to ccwu@cs.unc.edu
|
19 |
//
|
20 |
////////////////////////////////////////////////////////////////////////////
|
21 |
|
22 |
#if defined(CUDA_SIFTGPU_ENABLED)
|
23 |
|
24 |
#include "GL/glew.h" |
25 |
#include "stdio.h" |
26 |
|
27 |
#include "CuTexImage.h" |
28 |
#include "ProgramCU.h" |
29 |
#include "GlobalUtil.h" |
30 |
|
31 |
//----------------------------------------------------------------
|
32 |
//Begin SiftGPU setting section.
|
33 |
//////////////////////////////////////////////////////////
|
34 |
#define IMUL(X,Y) __mul24(X,Y)
|
35 |
//#define FDIV(X,Y) ((X)/(Y))
|
36 |
#define FDIV(X,Y) __fdividef(X,Y)
|
37 |
|
38 |
/////////////////////////////////////////////////////////
|
39 |
//filter kernel width range (don't change this)
|
40 |
#define KERNEL_MAX_WIDTH 33 |
41 |
#define KERNEL_MIN_WIDTH 5 |
42 |
|
43 |
//////////////////////////////////////////////////////////
|
44 |
//horizontal filter block size (32, 64, 128, 256, 512)
|
45 |
#define FILTERH_TILE_WIDTH 128 |
46 |
//thread block for vertical filter. FILTERV_BLOCK_WIDTH can be (4, 8 or 16)
|
47 |
#define FILTERV_BLOCK_WIDTH 16 |
48 |
#define FILTERV_BLOCK_HEIGHT 32 |
49 |
//The corresponding image patch for a thread block
|
50 |
#define FILTERV_PIXEL_PER_THREAD 4 |
51 |
#define FILTERV_TILE_WIDTH FILTERV_BLOCK_WIDTH
|
52 |
#define FILTERV_TILE_HEIGHT (FILTERV_PIXEL_PER_THREAD * FILTERV_BLOCK_HEIGHT)
|
53 |
|
54 |
|
55 |
//////////////////////////////////////////////////////////
|
56 |
//thread block size for computing Difference of Gaussian
|
57 |
#define DOG_BLOCK_LOG_DIMX 7 |
58 |
#define DOG_BLOCK_LOG_DIMY 0 |
59 |
#define DOG_BLOCK_DIMX (1 << DOG_BLOCK_LOG_DIMX) |
60 |
#define DOG_BLOCK_DIMY (1 << DOG_BLOCK_LOG_DIMY) |
61 |
|
62 |
//////////////////////////////////////////////////////////
|
63 |
//thread block size for keypoint detection
|
64 |
#define KEY_BLOCK_LOG_DIMX 3 |
65 |
#define KEY_BLOCK_LOG_DIMY 3 |
66 |
#define KEY_BLOCK_DIMX (1<<KEY_BLOCK_LOG_DIMX) |
67 |
#define KEY_BLOCK_DIMY (1<<KEY_BLOCK_LOG_DIMY) |
68 |
//#define KEY_OFFSET_ONE
|
69 |
//make KEY_BLOCK_LOG_DIMX 4 will make the write coalesced..
|
70 |
//but it seems uncoalesced writes don't affect the speed
|
71 |
|
72 |
//////////////////////////////////////////////////////////
|
73 |
//thread block size for initializing list generation (64, 128, 256, 512 ...)
|
74 |
#define HIST_INIT_WIDTH 128 |
75 |
//thread block size for generating feature list (32, 64, 128, 256, 512, ...)
|
76 |
#define LISTGEN_BLOCK_DIM 128 |
77 |
|
78 |
|
79 |
/////////////////////////////////////////////////////////
|
80 |
//how many keypoint orientations to compute in a block
|
81 |
#define ORIENTATION_COMPUTE_PER_BLOCK 64 |
82 |
//how many keypoint descriptor to compute in a block (2, 4, 8, 16, 32)
|
83 |
#define DESCRIPTOR_COMPUTE_PER_BLOCK 4 |
84 |
#define DESCRIPTOR_COMPUTE_BLOCK_SIZE (16 * DESCRIPTOR_COMPUTE_PER_BLOCK) |
85 |
//how many keypoint descriptor to normalized in a block (32, ...)
|
86 |
#define DESCRIPTOR_NORMALIZ_PER_BLOCK 32 |
87 |
|
88 |
|
89 |
|
90 |
///////////////////////////////////////////
|
91 |
//Thread block size for visualization
|
92 |
//(This doesn't affect the speed of computation)
|
93 |
#define BLOCK_LOG_DIM 4 |
94 |
#define BLOCK_DIM (1 << BLOCK_LOG_DIM) |
95 |
|
96 |
//End SiftGPU setting section.
|
97 |
//----------------------------------------------------------------
|
98 |
|
99 |
|
100 |
__device__ __constant__ float d_kernel[KERNEL_MAX_WIDTH];
|
101 |
texture<float, 1, cudaReadModeElementType> texData; |
102 |
texture<unsigned char, 1, cudaReadModeNormalizedFloat> texDataB; |
103 |
texture<float2, 2, cudaReadModeElementType> texDataF2;
|
104 |
texture<float4, 1, cudaReadModeElementType> texDataF4;
|
105 |
texture<int4, 1, cudaReadModeElementType> texDataI4;
|
106 |
texture<int4, 1, cudaReadModeElementType> texDataList;
|
107 |
|
108 |
//template<int i> __device__ float Conv(float *data) { return Conv<i-1>(data) + data[i]*d_kernel[i];}
|
109 |
//template<> __device__ float Conv<0>(float *data) { return data[0] * d_kernel[0]; }
|
110 |
|
111 |
|
112 |
//////////////////////////////////////////////////////////////
|
113 |
template<int FW> __global__ void FilterH( float* d_result, int width) |
114 |
{ |
115 |
|
116 |
const int HALF_WIDTH = FW >> 1; |
117 |
const int CACHE_WIDTH = FILTERH_TILE_WIDTH + FW -1; |
118 |
const int CACHE_COUNT = 2 + (CACHE_WIDTH - 2)/ FILTERH_TILE_WIDTH; |
119 |
__shared__ float data[CACHE_WIDTH];
|
120 |
const int bcol = IMUL(blockIdx.x, FILTERH_TILE_WIDTH); |
121 |
const int col = bcol + threadIdx.x; |
122 |
const int index_min = IMUL(blockIdx.y, width); |
123 |
const int index_max = index_min + width - 1; |
124 |
int src_index = index_min + bcol - HALF_WIDTH + threadIdx.x;
|
125 |
int cache_index = threadIdx.x;
|
126 |
float value = 0; |
127 |
#pragma unroll
|
128 |
for(int j = 0; j < CACHE_COUNT; ++j) |
129 |
{ |
130 |
if(cache_index < CACHE_WIDTH)
|
131 |
{ |
132 |
int fetch_index = src_index < index_min? index_min : (src_index > index_max ? index_max : src_index);
|
133 |
data[cache_index] = tex1Dfetch(texData,fetch_index); |
134 |
src_index += FILTERH_TILE_WIDTH; |
135 |
cache_index += FILTERH_TILE_WIDTH; |
136 |
} |
137 |
} |
138 |
__syncthreads(); |
139 |
if(col >= width) return; |
140 |
#pragma unroll
|
141 |
for(int i = 0; i < FW; ++i) |
142 |
{ |
143 |
value += (data[threadIdx.x + i]* d_kernel[i]); |
144 |
} |
145 |
// value = Conv<FW-1>(data + threadIdx.x);
|
146 |
d_result[index_min + col] = value; |
147 |
} |
148 |
|
149 |
|
150 |
|
151 |
////////////////////////////////////////////////////////////////////
|
152 |
template<int FW> __global__ void FilterV(float* d_result, int width, int height) |
153 |
{ |
154 |
const int HALF_WIDTH = FW >> 1; |
155 |
const int CACHE_WIDTH = FW + FILTERV_TILE_HEIGHT - 1; |
156 |
const int TEMP = CACHE_WIDTH & 0xf; |
157 |
//add some extra space to avoid bank conflict
|
158 |
#if FILTERV_TILE_WIDTH == 16 |
159 |
//make the stride 16 * n +/- 1
|
160 |
const int EXTRA = (TEMP == 1 || TEMP == 0) ? 1 - TEMP : 15 - TEMP; |
161 |
#elif FILTERV_TILE_WIDTH == 8 |
162 |
//make the stride 16 * n +/- 2
|
163 |
const int EXTRA = (TEMP == 2 || TEMP == 1 || TEMP == 0) ? 2 - TEMP : (TEMP == 15? 3 : 14 - TEMP); |
164 |
#elif FILTERV_TILE_WIDTH == 4 |
165 |
//make the stride 16 * n +/- 4
|
166 |
const int EXTRA = (TEMP >=0 && TEMP <=4) ? 4 - TEMP : (TEMP > 12? 20 - TEMP : 12 - TEMP); |
167 |
#else
|
168 |
#error
|
169 |
#endif
|
170 |
const int CACHE_TRUE_WIDTH = CACHE_WIDTH + EXTRA; |
171 |
const int CACHE_COUNT = (CACHE_WIDTH + FILTERV_BLOCK_HEIGHT - 1) / FILTERV_BLOCK_HEIGHT; |
172 |
const int WRITE_COUNT = (FILTERV_TILE_HEIGHT + FILTERV_BLOCK_HEIGHT -1) / FILTERV_BLOCK_HEIGHT; |
173 |
__shared__ float data[CACHE_TRUE_WIDTH * FILTERV_TILE_WIDTH];
|
174 |
const int row_block_first = IMUL(blockIdx.y, FILTERV_TILE_HEIGHT); |
175 |
const int col = IMUL(blockIdx.x, FILTERV_TILE_WIDTH) + threadIdx.x; |
176 |
const int row_first = row_block_first - HALF_WIDTH; |
177 |
const int data_index_max = IMUL(height - 1, width) + col; |
178 |
const int cache_col_start = threadIdx.y; |
179 |
const int cache_row_start = IMUL(threadIdx.x, CACHE_TRUE_WIDTH); |
180 |
int cache_index = cache_col_start + cache_row_start;
|
181 |
int data_index = IMUL(row_first + cache_col_start, width) + col;
|
182 |
|
183 |
if(col < width)
|
184 |
{ |
185 |
#pragma unroll
|
186 |
for(int i = 0; i < CACHE_COUNT; ++i) |
187 |
{ |
188 |
if(cache_col_start < CACHE_WIDTH - i * FILTERV_BLOCK_HEIGHT)
|
189 |
{ |
190 |
int fetch_index = data_index < col ? col : (data_index > data_index_max? data_index_max : data_index);
|
191 |
data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch(texData,fetch_index); |
192 |
data_index += IMUL(FILTERV_BLOCK_HEIGHT, width); |
193 |
} |
194 |
} |
195 |
} |
196 |
__syncthreads(); |
197 |
|
198 |
if(col >= width) return; |
199 |
|
200 |
int row = row_block_first + threadIdx.y;
|
201 |
int index_start = cache_row_start + threadIdx.y;
|
202 |
#pragma unroll
|
203 |
for(int i = 0; i < WRITE_COUNT; ++i, |
204 |
row += FILTERV_BLOCK_HEIGHT, index_start += FILTERV_BLOCK_HEIGHT) |
205 |
{ |
206 |
if(row < height)
|
207 |
{ |
208 |
int index_dest = IMUL(row, width) + col;
|
209 |
float value = 0; |
210 |
#pragma unroll
|
211 |
for(int i = 0; i < FW; ++i) |
212 |
{ |
213 |
value += (data[index_start + i] * d_kernel[i]); |
214 |
} |
215 |
d_result[index_dest] = value; |
216 |
} |
217 |
} |
218 |
} |
219 |
|
220 |
|
221 |
template<int LOG_SCALE> __global__ void UpsampleKernel(float* d_result, int width) |
222 |
{ |
223 |
const int SCALE = (1 << LOG_SCALE), SCALE_MASK = (SCALE - 1); |
224 |
const float INV_SCALE = 1.0f / (float(SCALE)); |
225 |
int col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
|
226 |
if(col >= width) return; |
227 |
|
228 |
int row = blockIdx.y >> LOG_SCALE;
|
229 |
int index = row * width + col;
|
230 |
int dst_row = blockIdx.y;
|
231 |
int dst_idx= (width * dst_row + col) * SCALE;
|
232 |
int helper = blockIdx.y & SCALE_MASK;
|
233 |
if (helper)
|
234 |
{ |
235 |
float v11 = tex1Dfetch(texData, index);
|
236 |
float v12 = tex1Dfetch(texData, index + 1); |
237 |
index += width; |
238 |
float v21 = tex1Dfetch(texData, index);
|
239 |
float v22 = tex1Dfetch(texData, index + 1); |
240 |
float w1 = INV_SCALE * helper, w2 = 1.0 - w1; |
241 |
float v1 = (v21 * w1 + w2 * v11);
|
242 |
float v2 = (v22 * w1 + w2 * v12);
|
243 |
d_result[dst_idx] = v1; |
244 |
#pragma unroll
|
245 |
for(int i = 1; i < SCALE; ++i) |
246 |
{ |
247 |
const float r2 = i * INV_SCALE; |
248 |
const float r1 = 1.0f - r2; |
249 |
d_result[dst_idx +i] = v1 * r1 + v2 * r2; |
250 |
} |
251 |
}else
|
252 |
{ |
253 |
float v1 = tex1Dfetch(texData, index);
|
254 |
float v2 = tex1Dfetch(texData, index + 1); |
255 |
d_result[dst_idx] = v1; |
256 |
#pragma unroll
|
257 |
for(int i = 1; i < SCALE; ++i) |
258 |
{ |
259 |
const float r2 = i * INV_SCALE; |
260 |
const float r1 = 1.0f - r2; |
261 |
d_result[dst_idx +i] = v1 * r1 + v2 * r2; |
262 |
} |
263 |
} |
264 |
|
265 |
} |
266 |
|
267 |
////////////////////////////////////////////////////////////////////////////////////////
|
268 |
void ProgramCU::SampleImageU(CuTexImage *dst, CuTexImage *src, int log_scale) |
269 |
{ |
270 |
int width = src->GetImgWidth(), height = src->GetImgHeight();
|
271 |
src->BindTexture(texData); |
272 |
dim3 grid((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height << log_scale);
|
273 |
dim3 block(FILTERH_TILE_WIDTH); |
274 |
switch(log_scale)
|
275 |
{ |
276 |
case 1 : UpsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, width); break; |
277 |
case 2 : UpsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, width); break; |
278 |
case 3 : UpsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, width); break; |
279 |
default: break; |
280 |
} |
281 |
} |
282 |
|
283 |
template<int LOG_SCALE> __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width) |
284 |
{ |
285 |
const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x; |
286 |
if(dst_col >= dst_width) return; |
287 |
const int src_col = min((dst_col << LOG_SCALE), (src_width - 1)); |
288 |
const int dst_row = blockIdx.y; |
289 |
const int src_row = blockIdx.y << LOG_SCALE; |
290 |
const int src_idx = IMUL(src_row, src_width) + src_col; |
291 |
const int dst_idx = IMUL(dst_width, dst_row) + dst_col; |
292 |
d_result[dst_idx] = tex1Dfetch(texData, src_idx); |
293 |
|
294 |
} |
295 |
|
296 |
__global__ void DownsampleKernel(float* d_result, int src_width, int dst_width, const int log_scale) |
297 |
{ |
298 |
const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x; |
299 |
if(dst_col >= dst_width) return; |
300 |
const int src_col = min((dst_col << log_scale), (src_width - 1)); |
301 |
const int dst_row = blockIdx.y; |
302 |
const int src_row = blockIdx.y << log_scale; |
303 |
const int src_idx = IMUL(src_row, src_width) + src_col; |
304 |
const int dst_idx = IMUL(dst_width, dst_row) + dst_col; |
305 |
d_result[dst_idx] = tex1Dfetch(texData, src_idx); |
306 |
|
307 |
} |
308 |
|
309 |
void ProgramCU::SampleImageD(CuTexImage *dst, CuTexImage *src, int log_scale) |
310 |
{ |
311 |
int src_width = src->GetImgWidth(), dst_width = dst->GetImgWidth() ;
|
312 |
|
313 |
src->BindTexture(texData); |
314 |
dim3 grid((dst_width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, dst->GetImgHeight());
|
315 |
dim3 block(FILTERH_TILE_WIDTH); |
316 |
switch(log_scale)
|
317 |
{ |
318 |
case 1 : DownsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break; |
319 |
case 2 : DownsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break; |
320 |
case 3 : DownsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width); break; |
321 |
default: DownsampleKernel <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width, log_scale); |
322 |
} |
323 |
} |
324 |
|
325 |
__global__ void ChannelReduce_Kernel(float* d_result) |
326 |
{ |
327 |
int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
|
328 |
d_result[index] = tex1Dfetch(texData, index*4);
|
329 |
} |
330 |
|
331 |
__global__ void ChannelReduce_Convert_Kernel(float* d_result) |
332 |
{ |
333 |
int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
|
334 |
float4 rgba = tex1Dfetch(texDataF4, index); |
335 |
d_result[index] = 0.299f * rgba.x + 0.587f* rgba.y + 0.114f * rgba.z; |
336 |
} |
337 |
|
338 |
void ProgramCU::ReduceToSingleChannel(CuTexImage* dst, CuTexImage* src, int convert_rgb) |
339 |
{ |
340 |
int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
|
341 |
|
342 |
dim3 grid((width * height + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
|
343 |
dim3 block(FILTERH_TILE_WIDTH); |
344 |
if(convert_rgb)
|
345 |
{ |
346 |
src->BindTexture(texDataF4); |
347 |
ChannelReduce_Convert_Kernel<<<grid, block>>>((float*)dst->_cuData);
|
348 |
}else
|
349 |
{ |
350 |
src->BindTexture(texData); |
351 |
ChannelReduce_Kernel<<<grid, block>>>((float*)dst->_cuData);
|
352 |
} |
353 |
} |
354 |
|
355 |
__global__ void ConvertByteToFloat_Kernel(float* d_result) |
356 |
{ |
357 |
int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
|
358 |
d_result[index] = tex1Dfetch(texDataB, index); |
359 |
} |
360 |
|
361 |
void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst)
|
362 |
{ |
363 |
int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
|
364 |
dim3 grid((width * height + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
|
365 |
dim3 block(FILTERH_TILE_WIDTH); |
366 |
src->BindTexture(texDataB); |
367 |
ConvertByteToFloat_Kernel<<<grid, block>>>((float*)dst->_cuData);
|
368 |
} |
369 |
|
370 |
void ProgramCU::CreateFilterKernel(float sigma, float* kernel, int& width) |
371 |
{ |
372 |
int i, sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;// |
373 |
width = 2*sz + 1; |
374 |
|
375 |
if(width > KERNEL_MAX_WIDTH)
|
376 |
{ |
377 |
//filter size truncation
|
378 |
sz = KERNEL_MAX_WIDTH >> 1;
|
379 |
width =KERNEL_MAX_WIDTH; |
380 |
}else if(width < KERNEL_MIN_WIDTH) |
381 |
{ |
382 |
sz = KERNEL_MIN_WIDTH >> 1;
|
383 |
width =KERNEL_MIN_WIDTH; |
384 |
} |
385 |
|
386 |
float rv = 1.0f/(sigma*sigma), v, ksum =0; |
387 |
|
388 |
// pre-compute filter
|
389 |
for( i = -sz ; i <= sz ; ++i)
|
390 |
{ |
391 |
kernel[i+sz] = v = exp(-0.5f * i * i *rv) ; |
392 |
ksum += v; |
393 |
} |
394 |
|
395 |
//normalize the kernel
|
396 |
rv = 1.0f/ksum; |
397 |
for(i = 0; i< width ;i++) kernel[i]*=rv; |
398 |
} |
399 |
|
400 |
|
401 |
template<int FW> void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf) |
402 |
{ |
403 |
int width = src->GetImgWidth(), height = src->GetImgHeight();
|
404 |
|
405 |
//horizontal filtering
|
406 |
src->BindTexture(texData); |
407 |
dim3 gridh((width + FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height);
|
408 |
dim3 blockh(FILTERH_TILE_WIDTH); |
409 |
FilterH<FW><<<gridh, blockh>>>((float*)buf->_cuData, width);
|
410 |
CheckErrorCUDA("FilterH");
|
411 |
|
412 |
///vertical filtering
|
413 |
buf->BindTexture(texData); |
414 |
dim3 gridv((width + FILTERV_TILE_WIDTH - 1)/ FILTERV_TILE_WIDTH, (height + FILTERV_TILE_HEIGHT - 1)/FILTERV_TILE_HEIGHT); |
415 |
dim3 blockv(FILTERV_TILE_WIDTH, FILTERV_BLOCK_HEIGHT); |
416 |
FilterV<FW><<<gridv, blockv>>>((float*)dst->_cuData, width, height);
|
417 |
CheckErrorCUDA("FilterV");
|
418 |
} |
419 |
|
420 |
//////////////////////////////////////////////////////////////////////
|
421 |
// tested on 2048x1500 image, the time on pyramid construction is
|
422 |
// OpenGL version : 18ms
|
423 |
// CUDA version: 28 ms
|
424 |
void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf, float sigma) |
425 |
{ |
426 |
float filter_kernel[KERNEL_MAX_WIDTH]; int width; |
427 |
CreateFilterKernel(sigma, filter_kernel, width); |
428 |
cudaMemcpyToSymbol(d_kernel, filter_kernel, width * sizeof(float), 0, cudaMemcpyHostToDevice); |
429 |
|
430 |
switch(width)
|
431 |
{ |
432 |
case 5: FilterImage< 5>(dst, src, buf); break; |
433 |
case 7: FilterImage< 7>(dst, src, buf); break; |
434 |
case 9: FilterImage< 9>(dst, src, buf); break; |
435 |
case 11: FilterImage<11>(dst, src, buf); break; |
436 |
case 13: FilterImage<13>(dst, src, buf); break; |
437 |
case 15: FilterImage<15>(dst, src, buf); break; |
438 |
case 17: FilterImage<17>(dst, src, buf); break; |
439 |
case 19: FilterImage<19>(dst, src, buf); break; |
440 |
case 21: FilterImage<21>(dst, src, buf); break; |
441 |
case 23: FilterImage<23>(dst, src, buf); break; |
442 |
case 25: FilterImage<25>(dst, src, buf); break; |
443 |
case 27: FilterImage<27>(dst, src, buf); break; |
444 |
case 29: FilterImage<29>(dst, src, buf); break; |
445 |
case 31: FilterImage<31>(dst, src, buf); break; |
446 |
case 33: FilterImage<33>(dst, src, buf); break; |
447 |
default: break; |
448 |
} |
449 |
|
450 |
} |
451 |
|
452 |
|
453 |
texture<float, 1, cudaReadModeElementType> texC; |
454 |
texture<float, 1, cudaReadModeElementType> texP; |
455 |
texture<float, 1, cudaReadModeElementType> texN; |
456 |
|
457 |
void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int height) |
458 |
{ |
459 |
int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
|
460 |
int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
|
461 |
if(col < width && row < height)
|
462 |
{ |
463 |
int index = IMUL(row, width) + col;
|
464 |
float vp = tex1Dfetch(texP, index);
|
465 |
float v = tex1Dfetch(texC, index);
|
466 |
d_dog[index] = v - vp; |
467 |
float vxn = tex1Dfetch(texC, index + 1); |
468 |
float vxp = tex1Dfetch(texC, index - 1); |
469 |
float vyp = tex1Dfetch(texC, index - width);
|
470 |
float vyn = tex1Dfetch(texC, index + width);
|
471 |
float dx = vxn - vxp, dy = vyn - vyp;
|
472 |
float grd = 0.5f * sqrt(dx * dx + dy * dy); |
473 |
float rot = (grd == 0.0f? 0.0f : atan2(dy, dx)); |
474 |
d_got[index] = make_float2(grd, rot); |
475 |
} |
476 |
} |
477 |
|
478 |
void __global__ ComputeDOG_Kernel(float* d_dog, int width, int height) |
479 |
{ |
480 |
int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
|
481 |
int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
|
482 |
if(col < width && row < height)
|
483 |
{ |
484 |
int index = IMUL(row, width) + col;
|
485 |
float vp = tex1Dfetch(texP, index);
|
486 |
float v = tex1Dfetch(texC, index);
|
487 |
d_dog[index] = v - vp; |
488 |
} |
489 |
} |
490 |
|
491 |
void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got)
|
492 |
{ |
493 |
int width = gus->GetImgWidth(), height = gus->GetImgHeight();
|
494 |
dim3 grid((width + DOG_BLOCK_DIMX - 1)/ DOG_BLOCK_DIMX, (height + DOG_BLOCK_DIMY - 1)/DOG_BLOCK_DIMY); |
495 |
dim3 block(DOG_BLOCK_DIMX, DOG_BLOCK_DIMY); |
496 |
gus->BindTexture(texC); |
497 |
(gus -1)->BindTexture(texP);
|
498 |
if(got->_cuData)
|
499 |
ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, (float2*) got->_cuData, width, height);
|
500 |
else
|
501 |
ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, width, height);
|
502 |
} |
503 |
|
504 |
|
505 |
#define READ_CMP_DOG_DATA(datai, tex, idx) \
|
506 |
datai[0] = tex1Dfetch(tex, idx - 1);\ |
507 |
datai[1] = tex1Dfetch(tex, idx);\
|
508 |
datai[2] = tex1Dfetch(tex, idx + 1);\ |
509 |
if(v > nmax)\
|
510 |
{\ |
511 |
nmax = max(nmax, datai[0]);\
|
512 |
nmax = max(nmax, datai[1]);\
|
513 |
nmax = max(nmax, datai[2]);\
|
514 |
if(v < nmax) goto key_finish;\ |
515 |
}else\
|
516 |
{\ |
517 |
nmin = min(nmin, datai[0]);\
|
518 |
nmin = min(nmin, datai[1]);\
|
519 |
nmin = min(nmin, datai[2]);\
|
520 |
if(v > nmin) goto key_finish;\ |
521 |
} |
522 |
|
523 |
|
524 |
void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowmax, |
525 |
float dog_threshold0, float dog_threshold, float edge_threshold, int subpixel_localization) |
526 |
{ |
527 |
float data[3][3], v; |
528 |
float datap[3][3], datan[3][3]; |
529 |
#ifdef KEY_OFFSET_ONE
|
530 |
int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y + 1; |
531 |
int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x + 1; |
532 |
#else
|
533 |
int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y;
|
534 |
int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x;
|
535 |
#endif
|
536 |
int index = IMUL(row, width) + col;
|
537 |
int idx[3] ={index - width, index, index + width}; |
538 |
int in_image =0; |
539 |
float nmax, nmin, result = 0.0f; |
540 |
float dx = 0, dy = 0, ds = 0; |
541 |
bool offset_test_passed = true; |
542 |
#ifdef KEY_OFFSET_ONE
|
543 |
if(row < rowmax && col < colmax)
|
544 |
#else
|
545 |
if(row > 0 && col > 0 && row < rowmax && col < colmax) |
546 |
#endif
|
547 |
{ |
548 |
in_image = 1;
|
549 |
data[1][1] = v = tex1Dfetch(texC, idx[1]); |
550 |
if(fabs(v) <= dog_threshold0) goto key_finish; |
551 |
|
552 |
data[1][0] = tex1Dfetch(texC, idx[1] - 1); |
553 |
data[1][2] = tex1Dfetch(texC, idx[1] + 1); |
554 |
nmax = max(data[1][0], data[1][2]); |
555 |
nmin = min(data[1][0], data[1][2]); |
556 |
|
557 |
if(v <=nmax && v >= nmin) goto key_finish; |
558 |
//if((v > nmax && v < 0 )|| (v < nmin && v > 0)) goto key_finish;
|
559 |
READ_CMP_DOG_DATA(data[0], texC, idx[0]); |
560 |
READ_CMP_DOG_DATA(data[2], texC, idx[2]); |
561 |
|
562 |
//edge supression
|
563 |
float vx2 = v * 2.0f; |
564 |
float fxx = data[1][0] + data[1][2] - vx2; |
565 |
float fyy = data[0][1] + data[2][1] - vx2; |
566 |
float fxy = 0.25f * (data[2][2] + data[0][0] - data[2][0] - data[0][2]); |
567 |
float temp1 = fxx * fyy - fxy * fxy;
|
568 |
float temp2 = (fxx + fyy) * (fxx + fyy);
|
569 |
if(temp1 <=0 || temp2 > edge_threshold * temp1) goto key_finish; |
570 |
|
571 |
|
572 |
//read the previous level
|
573 |
READ_CMP_DOG_DATA(datap[0], texP, idx[0]); |
574 |
READ_CMP_DOG_DATA(datap[1], texP, idx[1]); |
575 |
READ_CMP_DOG_DATA(datap[2], texP, idx[2]); |
576 |
|
577 |
|
578 |
//read the next level
|
579 |
READ_CMP_DOG_DATA(datan[0], texN, idx[0]); |
580 |
READ_CMP_DOG_DATA(datan[1], texN, idx[1]); |
581 |
READ_CMP_DOG_DATA(datan[2], texN, idx[2]); |
582 |
|
583 |
if(subpixel_localization)
|
584 |
{ |
585 |
//subpixel localization
|
586 |
float fx = 0.5f * (data[1][2] - data[1][0]); |
587 |
float fy = 0.5f * (data[2][1] - data[0][1]); |
588 |
float fs = 0.5f * (datan[1][1] - datap[1][1]); |
589 |
|
590 |
float fss = (datan[1][1] + datap[1][1] - vx2); |
591 |
float fxs = 0.25f* (datan[1][2] + datap[1][0] - datan[1][0] - datap[1][2]); |
592 |
float fys = 0.25f* (datan[2][1] + datap[0][1] - datan[0][1] - datap[2][1]); |
593 |
|
594 |
//need to solve dx, dy, ds;
|
595 |
// |-fx| | fxx fxy fxs | |dx|
|
596 |
// |-fy| = | fxy fyy fys | * |dy|
|
597 |
// |-fs| | fxs fys fss | |ds|
|
598 |
float4 A0 = fxx > 0? make_float4(fxx, fxy, fxs, -fx) : make_float4(-fxx, -fxy, -fxs, fx);
|
599 |
float4 A1 = fxy > 0? make_float4(fxy, fyy, fys, -fy) : make_float4(-fxy, -fyy, -fys, fy);
|
600 |
float4 A2 = fxs > 0? make_float4(fxs, fys, fss, -fs) : make_float4(-fxs, -fys, -fss, fs);
|
601 |
float maxa = max(max(A0.x, A1.x), A2.x);
|
602 |
if(maxa >= 1e-10) |
603 |
{ |
604 |
if(maxa == A1.x)
|
605 |
{ |
606 |
float4 TEMP = A1; A1 = A0; A0 = TEMP; |
607 |
}else if(maxa == A2.x) |
608 |
{ |
609 |
float4 TEMP = A2; A2 = A0; A0 = TEMP; |
610 |
} |
611 |
A0.y /= A0.x; A0.z /= A0.x; A0.w/= A0.x; |
612 |
A1.y -= A1.x * A0.y; A1.z -= A1.x * A0.z; A1.w -= A1.x * A0.w; |
613 |
A2.y -= A2.x * A0.y; A2.z -= A2.x * A0.z; A2.w -= A2.x * A0.w; |
614 |
if(abs(A2.y) > abs(A1.y))
|
615 |
{ |
616 |
float4 TEMP = A2; A2 = A1; A1 = TEMP; |
617 |
} |
618 |
if(abs(A1.y) >= 1e-10) |
619 |
{ |
620 |
A1.z /= A1.y; A1.w /= A1.y; |
621 |
A2.z -= A2.y * A1.z; A2.w -= A2.y * A1.w; |
622 |
if(abs(A2.z) >= 1e-10) |
623 |
{ |
624 |
ds = A2.w / A2.z; |
625 |
dy = A1.w - ds * A1.z; |
626 |
dx = A0.w - ds * A0.z - dy * A0.y; |
627 |
|
628 |
offset_test_passed = |
629 |
fabs(data[1][1] + 0.5f * (dx * fx + dy * fy + ds * fs)) > dog_threshold |
630 |
&&fabs(ds) < 1.0f && fabs(dx) < 1.0f && fabs(dy) < 1.0f; |
631 |
} |
632 |
} |
633 |
} |
634 |
} |
635 |
if(offset_test_passed) result = v > nmax ? 1.0 : -1.0; |
636 |
} |
637 |
key_finish:
|
638 |
if(in_image) d_key[index] = make_float4(result, dx, dy, ds);
|
639 |
} |
640 |
|
641 |
|
642 |
void ProgramCU::ComputeKEY(CuTexImage* dog, CuTexImage* key, float Tdog, float Tedge) |
643 |
{ |
644 |
int width = dog->GetImgWidth(), height = dog->GetImgHeight();
|
645 |
float Tdog1 = (GlobalUtil::_SubpixelLocalization? 0.8f : 1.0f) * Tdog; |
646 |
CuTexImage* dogp = dog - 1;
|
647 |
CuTexImage* dogn = dog + 1;
|
648 |
#ifdef KEY_OFFSET_ONE
|
649 |
dim3 grid((width - 1 + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX, (height - 1 + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY); |
650 |
#else
|
651 |
dim3 grid((width + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX, (height + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY); |
652 |
#endif
|
653 |
dim3 block(KEY_BLOCK_DIMX, KEY_BLOCK_DIMY); |
654 |
dogp->BindTexture(texP); |
655 |
dog ->BindTexture(texC); |
656 |
dogn->BindTexture(texN); |
657 |
Tedge = (Tedge+1)*(Tedge+1)/Tedge; |
658 |
ComputeKEY_Kernel<<<grid, block>>>((float4*) key->_cuData, width, |
659 |
width -1, height -1, Tdog1, Tdog, Tedge, GlobalUtil::_SubpixelLocalization); |
660 |
|
661 |
} |
662 |
|
663 |
|
664 |
|
665 |
void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height) |
666 |
{ |
667 |
int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
|
668 |
int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
669 |
if(row < height && col < wd)
|
670 |
{ |
671 |
int hidx = IMUL(row, wd) + col;
|
672 |
int scol = col << 2; |
673 |
int sidx = IMUL(row, ws) + scol;
|
674 |
int v[4] = {0, 0, 0, 0}; |
675 |
if(row > 0 && row < height -1) |
676 |
{ |
677 |
#pragma unroll
|
678 |
for(int i = 0; i < 4 ; ++i, ++scol) |
679 |
{ |
680 |
float4 temp = tex1Dfetch(texDataF4, sidx +i); |
681 |
v[i] = (scol < ws -1 && scol > 0 && temp.x!=0) ? 1 : 0; |
682 |
} |
683 |
} |
684 |
hist[hidx] = make_int4(v[0], v[1], v[2], v[3]); |
685 |
|
686 |
} |
687 |
} |
688 |
|
689 |
|
690 |
|
691 |
void ProgramCU::InitHistogram(CuTexImage* key, CuTexImage* hist)
|
692 |
{ |
693 |
int ws = key->GetImgWidth(), hs = key->GetImgHeight();
|
694 |
int wd = hist->GetImgWidth(), hd = hist->GetImgHeight();
|
695 |
dim3 grid((wd + HIST_INIT_WIDTH - 1)/ HIST_INIT_WIDTH, hd);
|
696 |
dim3 block(HIST_INIT_WIDTH, 1);
|
697 |
key->BindTexture(texDataF4); |
698 |
InitHist_Kernel<<<grid, block>>>((int4*) hist->_cuData, ws, wd, hd); |
699 |
} |
700 |
|
701 |
|
702 |
|
703 |
void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height) |
704 |
{ |
705 |
int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
|
706 |
int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
707 |
if(row < height && col < wd)
|
708 |
{ |
709 |
int hidx = IMUL(row, wd) + col;
|
710 |
int scol = col << 2; |
711 |
int sidx = IMUL(row, ws) + scol;
|
712 |
int v[4] = {0, 0, 0, 0}; |
713 |
#pragma unroll
|
714 |
for(int i = 0; i < 4 && scol < ws; ++i, ++scol) |
715 |
{ |
716 |
int4 temp = tex1Dfetch(texDataI4, sidx + i); |
717 |
v[i] = temp.x + temp.y + temp.z + temp.w; |
718 |
} |
719 |
d_hist[hidx] = make_int4(v[0], v[1], v[2], v[3]); |
720 |
} |
721 |
} |
722 |
|
723 |
void ProgramCU::ReduceHistogram(CuTexImage*hist1, CuTexImage* hist2)
|
724 |
{ |
725 |
int ws = hist1->GetImgWidth(), hs = hist1->GetImgHeight();
|
726 |
int wd = hist2->GetImgWidth(), hd = hist2->GetImgHeight();
|
727 |
int temp = (int)floor(logf(float(wd * 2/ 3)) / logf(2.0f)); |
728 |
const int wi = min(7, max(temp , 0)); |
729 |
hist1->BindTexture(texDataI4); |
730 |
|
731 |
const int BW = 1 << wi, BH = 1 << (7 - wi); |
732 |
dim3 grid((wd + BW - 1)/ BW, (hd + BH -1) / BH); |
733 |
dim3 block(BW, BH); |
734 |
ReduceHist_Kernel<<<grid, block>>>((int4*)hist2->_cuData, ws, wd, hd); |
735 |
} |
736 |
|
737 |
|
738 |
void __global__ ListGen_Kernel(int4* d_list, int width) |
739 |
{ |
740 |
int idx1 = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
741 |
int4 pos = tex1Dfetch(texDataList, idx1); |
742 |
int idx2 = IMUL(pos.y, width) + pos.x;
|
743 |
int4 temp = tex1Dfetch(texDataI4, idx2); |
744 |
int sum1 = temp.x + temp.y;
|
745 |
int sum2 = sum1 + temp.z;
|
746 |
pos.x <<= 2;
|
747 |
if(pos.z >= sum2)
|
748 |
{ |
749 |
pos.x += 3;
|
750 |
pos.z -= sum2; |
751 |
}else if(pos.z >= sum1) |
752 |
{ |
753 |
pos.x += 2;
|
754 |
pos.z -= sum1; |
755 |
}else if(pos.z >= temp.x) |
756 |
{ |
757 |
pos.x += 1;
|
758 |
pos.z -= temp.x; |
759 |
} |
760 |
d_list[idx1] = pos; |
761 |
} |
762 |
|
763 |
//input list (x, y) (x, y) ....
|
764 |
void ProgramCU::GenerateList(CuTexImage* list, CuTexImage* hist)
|
765 |
{ |
766 |
int len = list->GetImgWidth();
|
767 |
list->BindTexture(texDataList); |
768 |
hist->BindTexture(texDataI4); |
769 |
dim3 grid((len + LISTGEN_BLOCK_DIM -1) /LISTGEN_BLOCK_DIM);
|
770 |
dim3 block(LISTGEN_BLOCK_DIM); |
771 |
ListGen_Kernel<<<grid, block>>>((int4*) list->_cuData, hist->GetImgWidth()); |
772 |
} |
773 |
|
774 |
void __global__ ComputeOrientation_Kernel(float4* d_list,
|
775 |
int list_len,
|
776 |
int width, int height, |
777 |
float sigma, float sigma_step, |
778 |
float gaussian_factor, float sample_factor, |
779 |
int num_orientation,
|
780 |
int existing_keypoint,
|
781 |
int subpixel,
|
782 |
int keepsign)
|
783 |
{ |
784 |
const float ten_degree_per_radius = 5.7295779513082320876798154814105; |
785 |
const float radius_per_ten_degrees = 1.0 / 5.7295779513082320876798154814105; |
786 |
int idx = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
|
787 |
if(idx >= list_len) return; |
788 |
float4 key; |
789 |
if(existing_keypoint)
|
790 |
{ |
791 |
key = tex1Dfetch(texDataF4, idx); |
792 |
}else
|
793 |
{ |
794 |
int4 ikey = tex1Dfetch(texDataList, idx); |
795 |
key.x = ikey.x + 0.5f; |
796 |
key.y = ikey.y + 0.5f; |
797 |
key.z = sigma; |
798 |
if(subpixel || keepsign)
|
799 |
{ |
800 |
float4 offset = tex1Dfetch(texDataF4, IMUL(width, ikey.y) + ikey.x); |
801 |
if(subpixel)
|
802 |
{ |
803 |
key.x += offset.y; |
804 |
key.y += offset.z; |
805 |
key.z *= pow(sigma_step, offset.w); |
806 |
} |
807 |
if(keepsign) key.z *= offset.x;
|
808 |
} |
809 |
} |
810 |
if(num_orientation == 0) |
811 |
{ |
812 |
key.w = 0;
|
813 |
d_list[idx] = key; |
814 |
return;
|
815 |
} |
816 |
float vote[37]; |
817 |
float gsigma = key.z * gaussian_factor;
|
818 |
float win = fabs(key.z) * sample_factor;
|
819 |
float dist_threshold = win * win + 0.5; |
820 |
float factor = -0.5f / (gsigma * gsigma); |
821 |
float xmin = max(1.5f, floor(key.x - win) + 0.5f); |
822 |
float ymin = max(1.5f, floor(key.y - win) + 0.5f); |
823 |
float xmax = min(width - 1.5f, floor(key.x + win) + 0.5f); |
824 |
float ymax = min(height -1.5f, floor(key.y + win) + 0.5f); |
825 |
#pragma unroll
|
826 |
for(int i = 0; i < 36; ++i) vote[i] = 0.0f; |
827 |
for(float y = ymin; y <= ymax; y += 1.0f) |
828 |
{ |
829 |
for(float x = xmin; x <= xmax; x += 1.0f) |
830 |
{ |
831 |
float dx = x - key.x;
|
832 |
float dy = y - key.y;
|
833 |
float sq_dist = dx * dx + dy * dy;
|
834 |
if(sq_dist >= dist_threshold) continue; |
835 |
float2 got = tex2D(texDataF2, x, y); |
836 |
float weight = got.x * exp(sq_dist * factor);
|
837 |
float fidx = floor(got.y * ten_degree_per_radius);
|
838 |
int oidx = fidx;
|
839 |
if(oidx < 0) oidx += 36; |
840 |
vote[oidx] += weight; |
841 |
} |
842 |
} |
843 |
|
844 |
//filter the vote
|
845 |
|
846 |
const float one_third = 1.0 /3.0; |
847 |
#pragma unroll
|
848 |
for(int i = 0; i < 6; ++i) |
849 |
{ |
850 |
vote[36] = vote[0]; |
851 |
float pre = vote[35]; |
852 |
#pragma unroll
|
853 |
for(int j = 0; j < 36; ++j) |
854 |
{ |
855 |
float temp = one_third * (pre + vote[j] + vote[j + 1]); |
856 |
pre = vote[j]; vote[j] = temp; |
857 |
} |
858 |
} |
859 |
|
860 |
vote[36] = vote[0]; |
861 |
if(num_orientation == 1 || existing_keypoint) |
862 |
{ |
863 |
int index_max = 0; |
864 |
float max_vote = vote[0]; |
865 |
#pragma unroll
|
866 |
for(int i = 1; i < 36; ++i) |
867 |
{ |
868 |
index_max = vote[i] > max_vote? i : index_max; |
869 |
max_vote = max(max_vote, vote[i]); |
870 |
} |
871 |
float pre = vote[index_max == 0? 35 : index_max -1]; |
872 |
float next = vote[index_max + 1]; |
873 |
float weight = max_vote;
|
874 |
float off = 0.5f * FDIV(next - pre, weight + weight - next - pre); |
875 |
key.w = radius_per_ten_degrees * (index_max + 0.5f + off); |
876 |
d_list[idx] = key; |
877 |
|
878 |
}else
|
879 |
{ |
880 |
float max_vote = vote[0]; |
881 |
#pragma unroll
|
882 |
for(int i = 1; i < 36; ++i) max_vote = max(max_vote, vote[i]); |
883 |
|
884 |
float vote_threshold = max_vote * 0.8f; |
885 |
float pre = vote[35]; |
886 |
float max_rot[2], max_vot[2] = {0, 0}; |
887 |
int ocount = 0; |
888 |
#pragma unroll
|
889 |
for(int i =0; i < 36; ++i) |
890 |
{ |
891 |
float next = vote[i + 1]; |
892 |
if(vote[i] > vote_threshold && vote[i] > pre && vote[i] > next)
|
893 |
{ |
894 |
float di = 0.5f * FDIV(next - pre, vote[i] + vote[i] - next - pre); |
895 |
float rot = i + di + 0.5f; |
896 |
float weight = vote[i];
|
897 |
///
|
898 |
if(weight > max_vot[1]) |
899 |
{ |
900 |
if(weight > max_vot[0]) |
901 |
{ |
902 |
max_vot[1] = max_vot[0]; |
903 |
max_rot[1] = max_rot[0]; |
904 |
max_vot[0] = weight;
|
905 |
max_rot[0] = rot;
|
906 |
} |
907 |
else
|
908 |
{ |
909 |
max_vot[1] = weight;
|
910 |
max_rot[1] = rot;
|
911 |
} |
912 |
ocount ++; |
913 |
} |
914 |
} |
915 |
pre = vote[i]; |
916 |
} |
917 |
float fr1 = max_rot[0] / 36.0f; |
918 |
if(fr1 < 0) fr1 += 1.0f; |
919 |
unsigned short us1 = ocount == 0? 65535 : ((unsigned short )floor(fr1 * 65535.0f)); |
920 |
unsigned short us2 = 65535; |
921 |
if(ocount > 1) |
922 |
{ |
923 |
float fr2 = max_rot[1] / 36.0f; |
924 |
if(fr2 < 0) fr2 += 1.0f; |
925 |
us2 = (unsigned short ) floor(fr2 * 65535.0f); |
926 |
} |
927 |
unsigned int uspack = (us2 << 16) | us1; |
928 |
key.w = __int_as_float(uspack); |
929 |
d_list[idx] = key; |
930 |
} |
931 |
|
932 |
} |
933 |
|
934 |
|
935 |
|
936 |
|
937 |
void ProgramCU::ComputeOrientation(CuTexImage* list, CuTexImage* got, CuTexImage*key,
|
938 |
float sigma, float sigma_step, int existing_keypoint) |
939 |
{ |
940 |
int len = list->GetImgWidth();
|
941 |
if(len <= 0) return; |
942 |
int width = got->GetImgWidth(), height = got->GetImgHeight();
|
943 |
if(existing_keypoint)
|
944 |
{ |
945 |
list->BindTexture(texDataF4); |
946 |
}else
|
947 |
{ |
948 |
list->BindTexture(texDataList); |
949 |
if(GlobalUtil::_SubpixelLocalization) key->BindTexture(texDataF4);
|
950 |
} |
951 |
got->BindTexture2D(texDataF2); |
952 |
|
953 |
const int block_width = len < ORIENTATION_COMPUTE_PER_BLOCK ? 16 : ORIENTATION_COMPUTE_PER_BLOCK; |
954 |
dim3 grid((len + block_width -1) / block_width);
|
955 |
dim3 block(block_width); |
956 |
|
957 |
ComputeOrientation_Kernel<<<grid, block>>>((float4*) list->_cuData, |
958 |
len, width, height, sigma, sigma_step, |
959 |
GlobalUtil::_OrientationGaussianFactor, |
960 |
GlobalUtil::_OrientationGaussianFactor * GlobalUtil::_OrientationWindowFactor, |
961 |
GlobalUtil::_FixedOrientation? 0 : GlobalUtil::_MaxOrientation,
|
962 |
existing_keypoint, GlobalUtil::_SubpixelLocalization, GlobalUtil::_KeepExtremumSign); |
963 |
|
964 |
ProgramCU::CheckErrorCUDA("ComputeOrientation");
|
965 |
} |
966 |
|
967 |
template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptor_Kernel(float4* d_des, int num, |
968 |
int width, int height, float window_factor) |
969 |
{ |
970 |
const float rpi = 4.0/ 3.14159265358979323846; |
971 |
int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
972 |
int fidx = idx >> 4; |
973 |
if(fidx >= num) return; |
974 |
float4 key = tex1Dfetch(texDataF4, fidx); |
975 |
int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2; |
976 |
float spt = fabs(key.z * window_factor);
|
977 |
float s, c; __sincosf(key.w, &s, &c);
|
978 |
float anglef = key.w > 3.14159265358979323846? key.w - (2.0 * 3.14159265358979323846) : key.w ; |
979 |
float cspt = c * spt, sspt = s * spt;
|
980 |
float crspt = c / spt, srspt = s / spt;
|
981 |
float2 offsetpt, pt; |
982 |
float xmin, ymin, xmax, ymax, bsz;
|
983 |
offsetpt.x = ix - 1.5f; |
984 |
offsetpt.y = iy - 1.5f; |
985 |
pt.x = cspt * offsetpt.x - sspt * offsetpt.y + key.x; |
986 |
pt.y = cspt * offsetpt.y + sspt * offsetpt.x + key.y; |
987 |
bsz = fabs(cspt) + fabs(sspt); |
988 |
xmin = max(1.5f, floor(pt.x - bsz) + 0.5f); |
989 |
ymin = max(1.5f, floor(pt.y - bsz) + 0.5f); |
990 |
xmax = min(width - 1.5f, floor(pt.x + bsz) + 0.5f); |
991 |
ymax = min(height - 1.5f, floor(pt.y + bsz) + 0.5f); |
992 |
float des[9]; |
993 |
#pragma unroll
|
994 |
for(int i =0; i < 9; ++i) des[i] = 0.0f; |
995 |
for(float y = ymin; y <= ymax; y += 1.0f) |
996 |
{ |
997 |
for(float x = xmin; x <= xmax; x += 1.0f) |
998 |
{ |
999 |
float dx = x - pt.x;
|
1000 |
float dy = y - pt.y;
|
1001 |
float nx = crspt * dx + srspt * dy;
|
1002 |
float ny = crspt * dy - srspt * dx;
|
1003 |
float nxn = fabs(nx);
|
1004 |
float nyn = fabs(ny);
|
1005 |
if(nxn < 1.0f && nyn < 1.0f) |
1006 |
{ |
1007 |
float2 cc = tex2D(texDataF2, x, y); |
1008 |
float dnx = nx + offsetpt.x;
|
1009 |
float dny = ny + offsetpt.y;
|
1010 |
float ww = exp(-0.125f * (dnx * dnx + dny * dny)); |
1011 |
float wx = 1.0 - nxn; |
1012 |
float wy = 1.0 - nyn; |
1013 |
float weight = ww * wx * wy * cc.x;
|
1014 |
float theta = (anglef - cc.y) * rpi;
|
1015 |
if(theta < 0) theta += 8.0f; |
1016 |
float fo = floor(theta);
|
1017 |
int fidx = fo;
|
1018 |
float weight1 = fo + 1.0f - theta; |
1019 |
float weight2 = theta - fo;
|
1020 |
if(DYNAMIC_INDEXING)
|
1021 |
{ |
1022 |
des[fidx] += (weight1 * weight); |
1023 |
des[fidx + 1] += (weight2 * weight);
|
1024 |
//this dynamic indexing part might be slow
|
1025 |
}else
|
1026 |
{ |
1027 |
#pragma unroll
|
1028 |
for(int k = 0; k < 8; ++k) |
1029 |
{ |
1030 |
if(k == fidx)
|
1031 |
{ |
1032 |
des[k] += (weight1 * weight); |
1033 |
des[k+1] += (weight2 * weight);
|
1034 |
} |
1035 |
} |
1036 |
} |
1037 |
} |
1038 |
} |
1039 |
} |
1040 |
des[0] += des[8]; |
1041 |
|
1042 |
int didx = idx << 1; |
1043 |
d_des[didx] = make_float4(des[0], des[1], des[2], des[3]); |
1044 |
d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]); |
1045 |
} |
1046 |
|
1047 |
|
1048 |
template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptorRECT_Kernel(float4* d_des, int num, |
1049 |
int width, int height, float window_factor) |
1050 |
{ |
1051 |
const float rpi = 4.0/ 3.14159265358979323846; |
1052 |
int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
1053 |
int fidx = idx >> 4; |
1054 |
if(fidx >= num) return; |
1055 |
float4 key = tex1Dfetch(texDataF4, fidx); |
1056 |
int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2; |
1057 |
//float aspect_ratio = key.w / key.z;
|
1058 |
//float aspect_sq = aspect_ratio * aspect_ratio;
|
1059 |
float sptx = key.z * 0.25, spty = key.w * 0.25; |
1060 |
float xmin, ymin, xmax, ymax; float2 pt;
|
1061 |
pt.x = sptx * (ix + 0.5f) + key.x; |
1062 |
pt.y = spty * (iy + 0.5f) + key.y; |
1063 |
xmin = max(1.5f, floor(pt.x - sptx) + 0.5f); |
1064 |
ymin = max(1.5f, floor(pt.y - spty) + 0.5f); |
1065 |
xmax = min(width - 1.5f, floor(pt.x + sptx) + 0.5f); |
1066 |
ymax = min(height - 1.5f, floor(pt.y + spty) + 0.5f); |
1067 |
float des[9]; |
1068 |
#pragma unroll
|
1069 |
for(int i =0; i < 9; ++i) des[i] = 0.0f; |
1070 |
for(float y = ymin; y <= ymax; y += 1.0f) |
1071 |
{ |
1072 |
for(float x = xmin; x <= xmax; x += 1.0f) |
1073 |
{ |
1074 |
float nx = (x - pt.x) / sptx;
|
1075 |
float ny = (y - pt.y) / spty;
|
1076 |
float nxn = fabs(nx);
|
1077 |
float nyn = fabs(ny);
|
1078 |
if(nxn < 1.0f && nyn < 1.0f) |
1079 |
{ |
1080 |
float2 cc = tex2D(texDataF2, x, y); |
1081 |
float wx = 1.0 - nxn; |
1082 |
float wy = 1.0 - nyn; |
1083 |
float weight = wx * wy * cc.x;
|
1084 |
float theta = (- cc.y) * rpi;
|
1085 |
if(theta < 0) theta += 8.0f; |
1086 |
float fo = floor(theta);
|
1087 |
int fidx = fo;
|
1088 |
float weight1 = fo + 1.0f - theta; |
1089 |
float weight2 = theta - fo;
|
1090 |
if(DYNAMIC_INDEXING)
|
1091 |
{ |
1092 |
des[fidx] += (weight1 * weight); |
1093 |
des[fidx + 1] += (weight2 * weight);
|
1094 |
//this dynamic indexing part might be slow
|
1095 |
}else
|
1096 |
{ |
1097 |
#pragma unroll
|
1098 |
for(int k = 0; k < 8; ++k) |
1099 |
{ |
1100 |
if(k == fidx)
|
1101 |
{ |
1102 |
des[k] += (weight1 * weight); |
1103 |
des[k+1] += (weight2 * weight);
|
1104 |
} |
1105 |
} |
1106 |
} |
1107 |
} |
1108 |
} |
1109 |
} |
1110 |
des[0] += des[8]; |
1111 |
|
1112 |
int didx = idx << 1; |
1113 |
d_des[didx] = make_float4(des[0], des[1], des[2], des[3]); |
1114 |
d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]); |
1115 |
} |
1116 |
|
1117 |
void __global__ NormalizeDescriptor_Kernel(float4* d_des, int num) |
1118 |
{ |
1119 |
float4 temp[32];
|
1120 |
int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
1121 |
if(idx >= num) return; |
1122 |
int sidx = idx << 5; |
1123 |
float norm1 = 0, norm2 = 0; |
1124 |
#pragma unroll
|
1125 |
for(int i = 0; i < 32; ++i) |
1126 |
{ |
1127 |
temp[i] = tex1Dfetch(texDataF4, sidx +i); |
1128 |
norm1 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y + |
1129 |
temp[i].z * temp[i].z + temp[i].w * temp[i].w); |
1130 |
} |
1131 |
norm1 = rsqrt(norm1); |
1132 |
|
1133 |
#pragma unroll
|
1134 |
for(int i = 0; i < 32; ++i) |
1135 |
{ |
1136 |
temp[i].x = min(0.2f, temp[i].x * norm1); |
1137 |
temp[i].y = min(0.2f, temp[i].y * norm1); |
1138 |
temp[i].z = min(0.2f, temp[i].z * norm1); |
1139 |
temp[i].w = min(0.2f, temp[i].w * norm1); |
1140 |
norm2 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y + |
1141 |
temp[i].z * temp[i].z + temp[i].w * temp[i].w); |
1142 |
} |
1143 |
|
1144 |
norm2 = rsqrt(norm2); |
1145 |
#pragma unroll
|
1146 |
for(int i = 0; i < 32; ++i) |
1147 |
{ |
1148 |
temp[i].x *= norm2; temp[i].y *= norm2; |
1149 |
temp[i].z *= norm2; temp[i].w *= norm2; |
1150 |
d_des[sidx + i] = temp[i]; |
1151 |
} |
1152 |
} |
1153 |
|
1154 |
void ProgramCU::ComputeDescriptor(CuTexImage*list, CuTexImage* got, CuTexImage* dtex, int rect, int stream) |
1155 |
{ |
1156 |
int num = list->GetImgWidth();
|
1157 |
int width = got->GetImgWidth();
|
1158 |
int height = got->GetImgHeight();
|
1159 |
|
1160 |
dtex->InitTexture(num * 128, 1, 1); |
1161 |
got->BindTexture2D(texDataF2); |
1162 |
list->BindTexture(texDataF4); |
1163 |
int block_width = DESCRIPTOR_COMPUTE_BLOCK_SIZE;
|
1164 |
dim3 grid((num * 16 + block_width -1) / block_width); |
1165 |
dim3 block(block_width); |
1166 |
|
1167 |
if(rect)
|
1168 |
{ |
1169 |
if(GlobalUtil::_UseDynamicIndexing)
|
1170 |
ComputeDescriptorRECT_Kernel<true><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
|
1171 |
else
|
1172 |
ComputeDescriptorRECT_Kernel<false><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
|
1173 |
|
1174 |
}else
|
1175 |
{ |
1176 |
if(GlobalUtil::_UseDynamicIndexing)
|
1177 |
ComputeDescriptor_Kernel<true><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
|
1178 |
else
|
1179 |
ComputeDescriptor_Kernel<false><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
|
1180 |
} |
1181 |
if(GlobalUtil::_NormalizedSIFT)
|
1182 |
{ |
1183 |
dtex->BindTexture(texDataF4); |
1184 |
const int block_width = DESCRIPTOR_NORMALIZ_PER_BLOCK; |
1185 |
dim3 grid((num + block_width -1) / block_width);
|
1186 |
dim3 block(block_width); |
1187 |
NormalizeDescriptor_Kernel<<<grid, block>>>((float4*) dtex->_cuData, num); |
1188 |
} |
1189 |
CheckErrorCUDA("ComputeDescriptor");
|
1190 |
} |
1191 |
|
1192 |
//////////////////////////////////////////////////////
|
1193 |
void ProgramCU::FinishCUDA()
|
1194 |
{ |
1195 |
cudaThreadSynchronize(); |
1196 |
} |
1197 |
|
1198 |
int ProgramCU::CheckErrorCUDA(const char* location) |
1199 |
{ |
1200 |
cudaError_t e = cudaGetLastError(); |
1201 |
if(e)
|
1202 |
{ |
1203 |
if(location) fprintf(stderr, "%s:\t", location); |
1204 |
fprintf(stderr, "%s\n", cudaGetErrorString(e));
|
1205 |
//assert(0);
|
1206 |
return 1; |
1207 |
}else
|
1208 |
{ |
1209 |
return 0; |
1210 |
} |
1211 |
} |
1212 |
|
1213 |
void __global__ ConvertDOG_Kernel(float* d_result, int width, int height) |
1214 |
{ |
1215 |
int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
|
1216 |
int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
|
1217 |
if(col < width && row < height)
|
1218 |
{ |
1219 |
int index = row * width + col;
|
1220 |
float v = tex1Dfetch(texData, index);
|
1221 |
d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)? |
1222 |
0.5 : saturate(0.5+20.0*v); |
1223 |
} |
1224 |
} |
1225 |
///
|
1226 |
void ProgramCU::DisplayConvertDOG(CuTexImage* dog, CuTexImage* out)
|
1227 |
{ |
1228 |
if(out->_cuData == NULL) return; |
1229 |
int width = dog->GetImgWidth(), height = dog ->GetImgHeight();
|
1230 |
dog->BindTexture(texData); |
1231 |
dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM); |
1232 |
dim3 block(BLOCK_DIM, BLOCK_DIM); |
1233 |
ConvertDOG_Kernel<<<grid, block>>>((float*) out->_cuData, width, height);
|
1234 |
ProgramCU::CheckErrorCUDA("DisplayConvertDOG");
|
1235 |
} |
1236 |
|
1237 |
void __global__ ConvertGRD_Kernel(float* d_result, int width, int height) |
1238 |
{ |
1239 |
int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
|
1240 |
int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
|
1241 |
if(col < width && row < height)
|
1242 |
{ |
1243 |
int index = row * width + col;
|
1244 |
float v = tex1Dfetch(texData, index << 1); |
1245 |
d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)? |
1246 |
0 : saturate(5 * v); |
1247 |
|
1248 |
} |
1249 |
} |
1250 |
|
1251 |
|
1252 |
void ProgramCU::DisplayConvertGRD(CuTexImage* got, CuTexImage* out)
|
1253 |
{ |
1254 |
if(out->_cuData == NULL) return; |
1255 |
int width = got->GetImgWidth(), height = got ->GetImgHeight();
|
1256 |
got->BindTexture(texData); |
1257 |
dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM); |
1258 |
dim3 block(BLOCK_DIM, BLOCK_DIM); |
1259 |
ConvertGRD_Kernel<<<grid, block>>>((float*) out->_cuData, width, height);
|
1260 |
ProgramCU::CheckErrorCUDA("DisplayConvertGRD");
|
1261 |
} |
1262 |
|
1263 |
void __global__ ConvertKEY_Kernel(float4* d_result, int width, int height) |
1264 |
{ |
1265 |
|
1266 |
int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
|
1267 |
int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
|
1268 |
if(col < width && row < height)
|
1269 |
{ |
1270 |
int index = row * width + col;
|
1271 |
float4 keyv = tex1Dfetch(texDataF4, index); |
1272 |
int is_key = (keyv.x == 1.0f || keyv.x == -1.0f); |
1273 |
int inside = col > 0 && row > 0 && row < height -1 && col < width - 1; |
1274 |
float v = inside? saturate(0.5 + 20 * tex1Dfetch(texData, index)) : 0.5; |
1275 |
d_result[index] = is_key && inside ? |
1276 |
(keyv.x > 0? make_float4(1.0f, 0, 0, 1.0f) : make_float4(0.0f, 1.0f, 0.0f, 1.0f)): |
1277 |
make_float4(v, v, v, 1.0f) ; |
1278 |
} |
1279 |
} |
1280 |
void ProgramCU::DisplayConvertKEY(CuTexImage* key, CuTexImage* dog, CuTexImage* out)
|
1281 |
{ |
1282 |
if(out->_cuData == NULL) return; |
1283 |
int width = key->GetImgWidth(), height = key ->GetImgHeight();
|
1284 |
dog->BindTexture(texData); |
1285 |
key->BindTexture(texDataF4); |
1286 |
dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM, (height + BLOCK_DIM - 1)/BLOCK_DIM); |
1287 |
dim3 block(BLOCK_DIM, BLOCK_DIM); |
1288 |
ConvertKEY_Kernel<<<grid, block>>>((float4*) out->_cuData, width, height); |
1289 |
} |
1290 |
|
1291 |
|
1292 |
void __global__ DisplayKeyPoint_Kernel(float4 * d_result, int num) |
1293 |
{ |
1294 |
int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
1295 |
if(idx >= num) return; |
1296 |
float4 v = tex1Dfetch(texDataF4, idx); |
1297 |
d_result[idx] = make_float4(v.x, v.y, 0, 1.0f); |
1298 |
} |
1299 |
|
1300 |
void ProgramCU::DisplayKeyPoint(CuTexImage* ftex, CuTexImage* out)
|
1301 |
{ |
1302 |
int num = ftex->GetImgWidth();
|
1303 |
int block_width = 64; |
1304 |
dim3 grid((num + block_width -1) /block_width);
|
1305 |
dim3 block(block_width); |
1306 |
ftex->BindTexture(texDataF4); |
1307 |
DisplayKeyPoint_Kernel<<<grid, block>>>((float4*) out->_cuData, num); |
1308 |
ProgramCU::CheckErrorCUDA("DisplayKeyPoint");
|
1309 |
} |
1310 |
|
1311 |
void __global__ DisplayKeyBox_Kernel(float4* d_result, int num) |
1312 |
{ |
1313 |
int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
|
1314 |
if(idx >= num) return; |
1315 |
int kidx = idx / 10, vidx = idx - IMUL(kidx , 10); |
1316 |
float4 v = tex1Dfetch(texDataF4, kidx); |
1317 |
float sz = fabs(v.z * 3.0f); |
1318 |
///////////////////////
|
1319 |
float s, c; __sincosf(v.w, &s, &c);
|
1320 |
///////////////////////
|
1321 |
float dx = vidx == 0? 0 : ((vidx <= 4 || vidx >= 9)? sz : -sz); |
1322 |
float dy = vidx <= 1? 0 : ((vidx <= 2 || vidx >= 7)? -sz : sz); |
1323 |
float4 pos; |
1324 |
pos.x = v.x + c * dx - s * dy; |
1325 |
pos.y = v.y + c * dy + s * dx; |
1326 |
pos.z = 0; pos.w = 1.0f; |
1327 |
d_result[idx] = pos; |
1328 |
} |
1329 |
|
1330 |
void ProgramCU::DisplayKeyBox(CuTexImage* ftex, CuTexImage* out)
|
1331 |
{ |
1332 |
int len = ftex->GetImgWidth();
|
1333 |
int block_width = 32; |
1334 |
dim3 grid((len * 10 + block_width -1) / block_width); |
1335 |
dim3 block(block_width); |
1336 |
ftex->BindTexture(texDataF4); |
1337 |
DisplayKeyBox_Kernel<<<grid, block>>>((float4*) out->_cuData, len * 10);
|
1338 |
} |
1339 |
///////////////////////////////////////////////////////////////////
|
1340 |
inline void CuTexImage:: BindTexture(textureReference& texRef) |
1341 |
{ |
1342 |
cudaBindTexture(NULL, &texRef, _cuData, &texRef.channelDesc, _numBytes);
|
1343 |
} |
1344 |
|
1345 |
inline void CuTexImage::BindTexture2D(textureReference& texRef) |
1346 |
{ |
1347 |
#if defined(SIFTGPU_ENABLE_LINEAR_TEX2D)
|
1348 |
cudaBindTexture2D(0, &texRef, _cuData, &texRef.channelDesc, _imgWidth, _imgHeight, _imgWidth* _numChannel* sizeof(float)); |
1349 |
#else
|
1350 |
cudaChannelFormatDesc desc; |
1351 |
cudaGetChannelDesc(&desc, _cuData2D); |
1352 |
cudaBindTextureToArray(&texRef, _cuData2D, &desc); |
1353 |
#endif
|
1354 |
} |
1355 |
|
1356 |
int ProgramCU::CheckCudaDevice(int device) |
1357 |
{ |
1358 |
int count = 0, device_used; |
1359 |
if(cudaGetDeviceCount(&count) || count <= 0) |
1360 |
{ |
1361 |
ProgramCU::CheckErrorCUDA("CheckCudaDevice");
|
1362 |
return 0; |
1363 |
}else if(count == 1) |
1364 |
{ |
1365 |
cudaDeviceProp deviceProp; |
1366 |
cudaGetDeviceProperties(&deviceProp, 0);
|
1367 |
if (deviceProp.major == 9999 && deviceProp.minor == 9999) |
1368 |
{ |
1369 |
fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n");
|
1370 |
return 0; |
1371 |
} |
1372 |
} |
1373 |
if(device >0 && device < count) |
1374 |
{ |
1375 |
cudaSetDevice(device); |
1376 |
CheckErrorCUDA("cudaSetDevice\n");
|
1377 |
} |
1378 |
cudaGetDevice(&device_used); |
1379 |
if(device != device_used)
|
1380 |
fprintf(stderr, "\nERROR: Cannot set device to %d\n"
|
1381 |
"\nWARNING: Use # %d device instead (out of %d)\n", device, device_used, count);
|
1382 |
return 1; |
1383 |
} |
1384 |
|
1385 |
////////////////////////////////////////////////////////////////////////////////////////
|
1386 |
// siftmatch funtions
|
1387 |
//////////////////////////////////////////////////////////////////////////////////////////
|
1388 |
|
1389 |
#define MULT_TBLOCK_DIMX 128 |
1390 |
#define MULT_TBLOCK_DIMY 1 |
1391 |
#define MULT_BLOCK_DIMX (MULT_TBLOCK_DIMX)
|
1392 |
#define MULT_BLOCK_DIMY (8 * MULT_TBLOCK_DIMY) |
1393 |
|
1394 |
|
1395 |
texture<uint4, 1, cudaReadModeElementType> texDes1;
|
1396 |
texture<uint4, 1, cudaReadModeElementType> texDes2;
|
1397 |
|
1398 |
void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int3* d_temp) |
1399 |
{ |
1400 |
int idx01 = (blockIdx.y * MULT_BLOCK_DIMY), idx02 = (blockIdx.x * MULT_BLOCK_DIMX);
|
1401 |
|
1402 |
int idx1 = idx01 + threadIdx.y, idx2 = idx02 + threadIdx.x;
|
1403 |
__shared__ int data1[17 * 2 * MULT_BLOCK_DIMY]; |
1404 |
int read_idx1 = idx01 * 8 + threadIdx.x, read_idx2 = idx2 * 8; |
1405 |
int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2; |
1406 |
int cache_idx1 = IMUL(row4, 17) + (col4 << 2); |
1407 |
|
1408 |
///////////////////////////////////////////////////////////////
|
1409 |
//Load feature descriptors
|
1410 |
///////////////////////////////////////////////////////////////
|
1411 |
#if MULT_BLOCK_DIMY == 16 |
1412 |
uint4 v = tex1Dfetch(texDes1, read_idx1); |
1413 |
data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y;
|
1414 |
data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w; |
1415 |
#elif MULT_BLOCK_DIMY == 8 |
1416 |
if(threadIdx.x < 64) |
1417 |
{ |
1418 |
uint4 v = tex1Dfetch(texDes1, read_idx1); |
1419 |
data1[cache_idx1] = v.x; data1[cache_idx1+1] = v.y;
|
1420 |
data1[cache_idx1+2] = v.z; data1[cache_idx1+3] = v.w; |
1421 |
} |
1422 |
#else
|
1423 |
#error
|
1424 |
#endif
|
1425 |
__syncthreads(); |
1426 |
|
1427 |
///
|
1428 |
if(idx2 >= num2) return; |
1429 |
///////////////////////////////////////////////////////////////////////////
|
1430 |
//compare descriptors
|
1431 |
|
1432 |
int results[MULT_BLOCK_DIMY];
|
1433 |
#pragma unroll
|
1434 |
for(int i = 0; i < MULT_BLOCK_DIMY; ++i) results[i] = 0; |
1435 |
|
1436 |
#pragma unroll
|
1437 |
for(int i = 0; i < 8; ++i) |
1438 |
{ |
1439 |
uint4 v = tex1Dfetch(texDes2, read_idx2 + i); |
1440 |
unsigned char* p2 = (unsigned char*)(&v); |
1441 |
#pragma unroll
|
1442 |
for(int k = 0; k < MULT_BLOCK_DIMY; ++k) |
1443 |
{ |
1444 |
unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i * 4 + (i/4)); |
1445 |
results[k] += ( IMUL(p1[0], p2[0]) + IMUL(p1[1], p2[1]) |
1446 |
+ IMUL(p1[2], p2[2]) + IMUL(p1[3], p2[3]) |
1447 |
+ IMUL(p1[4], p2[4]) + IMUL(p1[5], p2[5]) |
1448 |
+ IMUL(p1[6], p2[6]) + IMUL(p1[7], p2[7]) |
1449 |
+ IMUL(p1[8], p2[8]) + IMUL(p1[9], p2[9]) |
1450 |
+ IMUL(p1[10], p2[10]) + IMUL(p1[11], p2[11]) |
1451 |
+ IMUL(p1[12], p2[12]) + IMUL(p1[13], p2[13]) |
1452 |
+ IMUL(p1[14], p2[14]) + IMUL(p1[15], p2[15])); |
1453 |
} |
1454 |
} |
1455 |
|
1456 |
int dst_idx = IMUL(idx1, num2) + idx2;
|
1457 |
if(d_temp)
|
1458 |
{ |
1459 |
int3 cmp_result = make_int3(0, -1, 0); |
1460 |
|
1461 |
#pragma unroll
|
1462 |
for(int i = 0; i < MULT_BLOCK_DIMY; ++i) |
1463 |
{ |
1464 |
if(idx1 + i < num1)
|
1465 |
{ |
1466 |
cmp_result = results[i] > cmp_result.x? |
1467 |
make_int3(results[i], idx1 + i, cmp_result.x) : |
1468 |
make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i])); |
1469 |
d_result[dst_idx + IMUL(i, num2)] = results[i]; |
1470 |
} |
1471 |
} |
1472 |
d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result; |
1473 |
}else
|
1474 |
{ |
1475 |
#pragma unroll
|
1476 |
for(int i = 0; i < MULT_BLOCK_DIMY; ++i) |
1477 |
{ |
1478 |
if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = results[i];
|
1479 |
} |
1480 |
} |
1481 |
|
1482 |
} |
1483 |
|
1484 |
|
1485 |
void ProgramCU::MultiplyDescriptor(CuTexImage* des1, CuTexImage* des2, CuTexImage* texDot, CuTexImage* texCRT)
|
1486 |
{ |
1487 |
int num1 = des1->GetImgWidth() / 8; |
1488 |
int num2 = des2->GetImgWidth() / 8; |
1489 |
dim3 grid( (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
|
1490 |
(num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
|
1491 |
dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY); |
1492 |
texDot->InitTexture( num2,num1); |
1493 |
if(texCRT) texCRT->InitTexture(num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 32); |
1494 |
des1->BindTexture(texDes1); |
1495 |
des2->BindTexture(texDes2); |
1496 |
|
1497 |
MultiplyDescriptor_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2,
|
1498 |
(texCRT? (int3*)texCRT->_cuData : NULL));
|
1499 |
ProgramCU::CheckErrorCUDA("MultiplyDescriptor");
|
1500 |
} |
1501 |
|
1502 |
texture<float, 1, cudaReadModeElementType> texLoc1; |
1503 |
texture<float2, 1, cudaReadModeElementType> texLoc2;
|
1504 |
struct Matrix33{float mat[3][3];}; |
1505 |
|
1506 |
|
1507 |
|
1508 |
void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, int3* d_temp, |
1509 |
Matrix33 H, float hdistmax, Matrix33 F, float fdistmax) |
1510 |
{ |
1511 |
int idx01 = (blockIdx.y * MULT_BLOCK_DIMY);
|
1512 |
int idx02 = (blockIdx.x * MULT_BLOCK_DIMX);
|
1513 |
|
1514 |
int idx1 = idx01 + threadIdx.y;
|
1515 |
int idx2 = idx02 + threadIdx.x;
|
1516 |
__shared__ int data1[17 * 2 * MULT_BLOCK_DIMY]; |
1517 |
__shared__ float loc1[MULT_BLOCK_DIMY * 2]; |
1518 |
int read_idx1 = idx01 * 8 + threadIdx.x ; |
1519 |
int read_idx2 = idx2 * 8; |
1520 |
int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2; |
1521 |
int cache_idx1 = IMUL(row4, 17) + (col4 << 2); |
1522 |
#if MULT_BLOCK_DIMY == 16 |
1523 |
uint4 v = tex1Dfetch(texDes1, read_idx1); |
1524 |
data1[cache_idx1] = v.x; |
1525 |
data1[cache_idx1+1] = v.y;
|
1526 |
data1[cache_idx1+2] = v.z;
|
1527 |
data1[cache_idx1+3] = v.w;
|
1528 |
#elif MULT_BLOCK_DIMY == 8 |
1529 |
if(threadIdx.x < 64) |
1530 |
{ |
1531 |
uint4 v = tex1Dfetch(texDes1, read_idx1); |
1532 |
data1[cache_idx1] = v.x; |
1533 |
data1[cache_idx1+1] = v.y;
|
1534 |
data1[cache_idx1+2] = v.z;
|
1535 |
data1[cache_idx1+3] = v.w;
|
1536 |
} |
1537 |
#else
|
1538 |
#error
|
1539 |
#endif
|
1540 |
__syncthreads(); |
1541 |
if(threadIdx.x < MULT_BLOCK_DIMY * 2) |
1542 |
{ |
1543 |
loc1[threadIdx.x] = tex1Dfetch(texLoc1, 2 * idx01 + threadIdx.x);
|
1544 |
} |
1545 |
__syncthreads(); |
1546 |
if(idx2 >= num2) return; |
1547 |
int results[MULT_BLOCK_DIMY];
|
1548 |
/////////////////////////////////////////////////////////////////////////////////////////////
|
1549 |
//geometric verification
|
1550 |
/////////////////////////////////////////////////////////////////////////////////////////////
|
1551 |
int good_count = 0; |
1552 |
float2 loc2 = tex1Dfetch(texLoc2, idx2); |
1553 |
#pragma unroll
|
1554 |
for(int i = 0; i < MULT_BLOCK_DIMY; ++i) |
1555 |
{ |
1556 |
|
1557 |
if(idx1 + i < num1)
|
1558 |
{ |
1559 |
float* loci = loc1 + i * 2; |
1560 |
float locx = loci[0], locy = loci[1]; |
1561 |
//homography
|
1562 |
float x[3], diff[2]; |
1563 |
x[0] = H.mat[0][0] * locx + H.mat[0][1] * locy + H.mat[0][2]; |
1564 |
x[1] = H.mat[1][0] * locx + H.mat[1][1] * locy + H.mat[1][2]; |
1565 |
x[2] = H.mat[2][0] * locx + H.mat[2][1] * locy + H.mat[2][2]; |
1566 |
diff[0] = fabs(FDIV(x[0], x[2]) - loc2.x); |
1567 |
diff[1] = fabs(FDIV(x[1], x[2]) - loc2.y); |
1568 |
if(diff[0] < hdistmax && diff[1] < hdistmax) |
1569 |
{ |
1570 |
//check fundamental matrix
|
1571 |
float fx1[3], ftx2[3], x2fx1, se; |
1572 |
fx1[0] = F.mat[0][0] * locx + F.mat[0][1] * locy + F.mat[0][2]; |
1573 |
fx1[1] = F.mat[1][0] * locx + F.mat[1][1] * locy + F.mat[1][2]; |
1574 |
//fx1[2] = F.mat[2][0] * locx + F.mat[2][1] * locy + F.mat[2][2];
|
1575 |
|
1576 |
ftx2[0] = F.mat[0][0] * locx + F.mat[1][0] * locy + F.mat[2][0]; |
1577 |
ftx2[1] = F.mat[0][1] * locx + F.mat[1][1] * locy + F.mat[2][1]; |
1578 |
ftx2[2] = F.mat[0][2] * locx + F.mat[1][2] * locy + F.mat[2][2]; |
1579 |
|
1580 |
x2fx1 = locx * ftx2[0] + locy * ftx2[1] + ftx2[2]; |
1581 |
se = FDIV(x2fx1 * x2fx1, fx1[0] * fx1[0] + fx1[1] * fx1[1] + ftx2[0] * ftx2[0] + ftx2[1] * ftx2[1]); |
1582 |
results[i] = se < fdistmax? 0: -262144; |
1583 |
}else
|
1584 |
{ |
1585 |
results[i] = -262144;
|
1586 |
} |
1587 |
}else
|
1588 |
{ |
1589 |
results[i] = -262144;
|
1590 |
} |
1591 |
good_count += (results[i] >=0);
|
1592 |
} |
1593 |
/////////////////////////////////////////////////////////////////////////////////////////////
|
1594 |
///compare feature descriptors anyway
|
1595 |
/////////////////////////////////////////////////////////////////////////////////////////////
|
1596 |
if(good_count > 0) |
1597 |
{ |
1598 |
#pragma unroll
|
1599 |
for(int i = 0; i < 8; ++i) |
1600 |
{ |
1601 |
uint4 v = tex1Dfetch(texDes2, read_idx2 + i); |
1602 |
unsigned char* p2 = (unsigned char*)(&v); |
1603 |
#pragma unroll
|
1604 |
for(int k = 0; k < MULT_BLOCK_DIMY; ++k) |
1605 |
{ |
1606 |
unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i * 4 + (i/4)); |
1607 |
results[k] += ( IMUL(p1[0], p2[0]) + IMUL(p1[1], p2[1]) |
1608 |
+ IMUL(p1[2], p2[2]) + IMUL(p1[3], p2[3]) |
1609 |
+ IMUL(p1[4], p2[4]) + IMUL(p1[5], p2[5]) |
1610 |
+ IMUL(p1[6], p2[6]) + IMUL(p1[7], p2[7]) |
1611 |
+ IMUL(p1[8], p2[8]) + IMUL(p1[9], p2[9]) |
1612 |
+ IMUL(p1[10], p2[10]) + IMUL(p1[11], p2[11]) |
1613 |
+ IMUL(p1[12], p2[12]) + IMUL(p1[13], p2[13]) |
1614 |
+ IMUL(p1[14], p2[14]) + IMUL(p1[15], p2[15])); |
1615 |
} |
1616 |
} |
1617 |
} |
1618 |
int dst_idx = IMUL(idx1, num2) + idx2;
|
1619 |
if(d_temp)
|
1620 |
{ |
1621 |
int3 cmp_result = make_int3(0, -1, 0); |
1622 |
#pragma unroll
|
1623 |
for(int i= 0; i < MULT_BLOCK_DIMY; ++i) |
1624 |
{ |
1625 |
if(idx1 + i < num1)
|
1626 |
{ |
1627 |
cmp_result = results[i] > cmp_result.x? |
1628 |
make_int3(results[i], idx1 + i, cmp_result.x) : |
1629 |
make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i])); |
1630 |
d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
|
1631 |
}else
|
1632 |
{ |
1633 |
break;
|
1634 |
} |
1635 |
} |
1636 |
d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result; |
1637 |
}else
|
1638 |
{ |
1639 |
#pragma unroll
|
1640 |
for(int i = 0; i < MULT_BLOCK_DIMY; ++i) |
1641 |
{ |
1642 |
if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0); |
1643 |
else break; |
1644 |
} |
1645 |
} |
1646 |
|
1647 |
} |
1648 |
|
1649 |
|
1650 |
void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2,
|
1651 |
CuTexImage* loc1, CuTexImage* loc2, CuTexImage* texDot, CuTexImage* texCRT, |
1652 |
float H[3][3], float hdistmax, float F[3][3], float fdistmax) |
1653 |
{ |
1654 |
int num1 = des1->GetImgWidth() / 8; |
1655 |
int num2 = des2->GetImgWidth() / 8; |
1656 |
Matrix33 MatF, MatH; |
1657 |
//copy the matrix
|
1658 |
memcpy(MatF.mat, F, 9 * sizeof(float)); |
1659 |
memcpy(MatH.mat, H, 9 * sizeof(float)); |
1660 |
//thread blocks
|
1661 |
dim3 grid( (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
|
1662 |
(num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
|
1663 |
dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY); |
1664 |
//intermediate results
|
1665 |
texDot->InitTexture( num2,num1); |
1666 |
if(texCRT) texCRT->InitTexture( num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 3); |
1667 |
loc1->BindTexture(texLoc1); |
1668 |
loc2->BindTexture(texLoc2); |
1669 |
des1->BindTexture(texDes1); |
1670 |
des2->BindTexture(texDes2); |
1671 |
MultiplyDescriptorG_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2,
|
1672 |
(texCRT? (int3*)texCRT->_cuData : NULL),
|
1673 |
MatH, hdistmax, MatF, fdistmax); |
1674 |
} |
1675 |
|
1676 |
|
1677 |
texture<int, 1, cudaReadModeElementType> texDOT; |
1678 |
|
1679 |
#define ROWMATCH_BLOCK_WIDTH 32 |
1680 |
#define ROWMATCH_BLOCK_HEIGHT 1 |
1681 |
|
1682 |
void __global__ RowMatch_Kernel(int*d_dot, int* d_result, int num2, float distmax, float ratiomax) |
1683 |
{ |
1684 |
#if ROWMATCH_BLOCK_HEIGHT == 1 |
1685 |
__shared__ int dotmax[ROWMATCH_BLOCK_WIDTH];
|
1686 |
__shared__ int dotnxt[ROWMATCH_BLOCK_WIDTH];
|
1687 |
__shared__ int dotidx[ROWMATCH_BLOCK_WIDTH];
|
1688 |
int row = blockIdx.y;
|
1689 |
#else
|
1690 |
__shared__ int x_dotmax[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
|
1691 |
__shared__ int x_dotnxt[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
|
1692 |
__shared__ int x_dotidx[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
|
1693 |
int* dotmax = x_dotmax[threadIdx.y];
|
1694 |
int* dotnxt = x_dotnxt[threadIdx.y];
|
1695 |
int* dotidx = x_dotidx[threadIdx.y];
|
1696 |
int row = IMUL(blockIdx.y, ROWMATCH_BLOCK_HEIGHT) + threadIdx.y;
|
1697 |
#endif
|
1698 |
|
1699 |
int base_address = IMUL(row , num2);
|
1700 |
int t_dotmax = 0, t_dotnxt = 0, t_dotidx = -1; |
1701 |
for(int i = 0; i < num2; i += ROWMATCH_BLOCK_WIDTH) |
1702 |
{ |
1703 |
if(threadIdx.x + i < num2)
|
1704 |
{ |
1705 |
int v = tex1Dfetch(texDOT, base_address + threadIdx.x + i);//d_dot[base_address + threadIdx.x + i];// |
1706 |
bool test = v > t_dotmax;
|
1707 |
t_dotnxt = test? t_dotmax : max(t_dotnxt, v); |
1708 |
t_dotidx = test? (threadIdx.x + i) : t_dotidx; |
1709 |
t_dotmax = test? v: t_dotmax; |
1710 |
} |
1711 |
__syncthreads(); |
1712 |
} |
1713 |
dotmax[threadIdx.x] = t_dotmax; |
1714 |
dotnxt[threadIdx.x] = t_dotnxt; |
1715 |
dotidx[threadIdx.x] = t_dotidx; |
1716 |
__syncthreads(); |
1717 |
|
1718 |
#pragma unroll
|
1719 |
for(int step = ROWMATCH_BLOCK_WIDTH/2; step >0; step /= 2) |
1720 |
{ |
1721 |
if(threadIdx.x < step)
|
1722 |
{ |
1723 |
int v1 = dotmax[threadIdx.x], v2 = dotmax[threadIdx.x + step];
|
1724 |
bool test = v2 > v1;
|
1725 |
dotnxt[threadIdx.x] = test? max(v1, dotnxt[threadIdx.x + step]) :max(dotnxt[threadIdx.x], v2); |
1726 |
dotidx[threadIdx.x] = test? dotidx[threadIdx.x + step] : dotidx[threadIdx.x]; |
1727 |
dotmax[threadIdx.x] = test? v2 : v1; |
1728 |
} |
1729 |
__syncthreads(); |
1730 |
} |
1731 |
if(threadIdx.x == 0) |
1732 |
{ |
1733 |
float dist = acos(min(dotmax[0] * 0.000003814697265625f, 1.0)); |
1734 |
float distn = acos(min(dotnxt[0] * 0.000003814697265625f, 1.0)); |
1735 |
//float ratio = dist / distn;
|
1736 |
d_result[row] = (dist < distmax) && (dist < distn * ratiomax) ? dotidx[0] : -1;//? : -1; |
1737 |
} |
1738 |
|
1739 |
} |
1740 |
|
1741 |
|
1742 |
void ProgramCU::GetRowMatch(CuTexImage* texDot, CuTexImage* texMatch, float distmax, float ratiomax) |
1743 |
{ |
1744 |
int num1 = texDot->GetImgHeight();
|
1745 |
int num2 = texDot->GetImgWidth();
|
1746 |
dim3 grid(1, num1/ROWMATCH_BLOCK_HEIGHT);
|
1747 |
dim3 block(ROWMATCH_BLOCK_WIDTH, ROWMATCH_BLOCK_HEIGHT); |
1748 |
texDot->BindTexture(texDOT); |
1749 |
RowMatch_Kernel<<<grid, block>>>((int*)texDot->_cuData,
|
1750 |
(int*)texMatch->_cuData, num2, distmax, ratiomax);
|
1751 |
} |
1752 |
|
1753 |
#define COLMATCH_BLOCK_WIDTH 32 |
1754 |
|
1755 |
//texture<int3, 1, cudaReadModeElementType> texCT;
|
1756 |
|
1757 |
void __global__ ColMatch_Kernel(int3*d_crt, int* d_result, int height, int num2, float distmax, float ratiomax) |
1758 |
{ |
1759 |
int col = COLMATCH_BLOCK_WIDTH * blockIdx.x + threadIdx.x;
|
1760 |
if(col >= num2) return; |
1761 |
int3 result = d_crt[col];//tex1Dfetch(texCT, col);
|
1762 |
int read_idx = col + num2;
|
1763 |
for(int i = 1; i < height; ++i, read_idx += num2) |
1764 |
{ |
1765 |
int3 temp = d_crt[read_idx];//tex1Dfetch(texCT, read_idx);
|
1766 |
result = result.x < temp.x? |
1767 |
make_int3(temp.x, temp.y, max(result.x, temp.z)) : |
1768 |
make_int3(result.x, result.y, max(result.z, temp.x)); |
1769 |
} |
1770 |
|
1771 |
float dist = acos(min(result.x * 0.000003814697265625f, 1.0)); |
1772 |
float distn = acos(min(result.z * 0.000003814697265625f, 1.0)); |
1773 |
//float ratio = dist / distn;
|
1774 |
d_result[col] = (dist < distmax) && (dist < distn * ratiomax) ? result.y : -1;//? : -1; |
1775 |
|
1776 |
} |
1777 |
|
1778 |
void ProgramCU::GetColMatch(CuTexImage* texCRT, CuTexImage* texMatch, float distmax, float ratiomax) |
1779 |
{ |
1780 |
int height = texCRT->GetImgHeight();
|
1781 |
int num2 = texCRT->GetImgWidth();
|
1782 |
//texCRT->BindTexture(texCT);
|
1783 |
dim3 grid((num2 + COLMATCH_BLOCK_WIDTH -1) / COLMATCH_BLOCK_WIDTH);
|
1784 |
dim3 block(COLMATCH_BLOCK_WIDTH); |
1785 |
ColMatch_Kernel<<<grid, block>>>((int3*)texCRT->_cuData, (int*) texMatch->_cuData, height, num2, distmax, ratiomax);
|
1786 |
} |
1787 |
|
1788 |
#endif
|