||
Dynamic parallelism is a new feature of CUDA 5.0 for GPUs with compute capability 3.5 that allows kernels to be launched directly from other kernels.
It promises to further speed up applications by handling computing workloads at runtime directly on the GPU; this avoids CPU/GPU interactions and benefits mechanisms such as recursion.
To use dynamic parallelism in Visual Studio 2010, do the following:
1) View -> Property Pages
2) Configuration Properties -> CUDA C/C++ -> Common -> Generate Relocatable Device Code -> Yes (-rdc=true)
3) Configuration Properties -> CUDA C/C++ -> Device -> Code Generation -> compute_35,sm_35
4) Configuration Properties -> Linker -> Input -> Additional Dependencies -> cudadevrt.lib
Thrust: https://github.com/thrust/thrust/wiki/Quick-Start-Guide
http://thrust.github.io/doc/modules.html
CUDPP: http://cudpp.github.io/
转自:http://download.csdn.net/download/anson2004110/5912747
VS2010 CUDA 5.5 Win7 64位配置
VS2010+CUDA+5.5+Win7+64位配置以及项目创建配置.docx
错误:thrust::system::system_error in transform_reduce
I'm using VS2010 and when it breaks at the errors it points to the following in the dbgheap.c file.
__finally {
/* unlock the heap
*/
_munlock(_HEAP_LOCK);
}
Cause: I forgot to adjust the project's properties to match my CUDA card's compute capability.
Fix: under Configuration Properties > CUDA C/C++ > Device > Code Generation, change compute_10,sm_10 to your GPU's compute capability.
For an NVIDIA card with compute capability 2.1, use compute_20,sm_21.
CUDA Thread Indexing
Sample code
1D grid of 1D blocks
}
1D grid of 2D blocks
}
1D grid of 3D blocks
// Flat global thread index for a launch with a 1D grid of 3D blocks.
// Blocks are numbered by blockIdx.x; inside a block, threads are numbered
// with x varying fastest, then y, then z.
// All arithmetic is done in unsigned int (the type of the built-in index
// components) and converted to int on return.
__device__ int getGlobalIdx_1D_3D()
{
    // Number of threads contained in one block.
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;

    // Position of this thread inside its own block.
    const unsigned int localIdx =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    return static_cast<int>(blockIdx.x * threadsPerBlock + localIdx);
}
}
2D grid of 1D blocks
// Flat global thread index for a launch with a 2D grid of 1D blocks.
// Blocks are linearized row by row (blockIdx.x varies fastest), and each
// block contributes blockDim.x consecutive thread indices.
__device__ int getGlobalIdx_2D_1D()
{
    // Row-major position of this block inside the grid.
    const unsigned int linearBlock = gridDim.x * blockIdx.y + blockIdx.x;

    return static_cast<int>(linearBlock * blockDim.x + threadIdx.x);
}
}
2D grid of 2D blocks
// Flat global thread index for a launch with a 2D grid of 2D blocks.
// Blocks are linearized with blockIdx.x varying fastest; threads inside a
// block are linearized with threadIdx.x varying fastest.
__device__ int getGlobalIdx_2D_2D()
{
    // Row-major position of this block inside the grid.
    const unsigned int linearBlock = gridDim.x * blockIdx.y + blockIdx.x;

    // Number of threads contained in one block.
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y;

    // Position of this thread inside its own block.
    const unsigned int localIdx = threadIdx.y * blockDim.x + threadIdx.x;

    return static_cast<int>(linearBlock * threadsPerBlock + localIdx);
}
2D grid of 3D blocks
// Flat global thread index for a launch with a 2D grid of 3D blocks.
// Blocks are linearized with blockIdx.x varying fastest; threads inside a
// block are linearized with x varying fastest, then y, then z.
__device__ int getGlobalIdx_2D_3D()
{
    // Row-major position of this block inside the grid.
    const unsigned int linearBlock = gridDim.x * blockIdx.y + blockIdx.x;

    // Number of threads contained in one block.
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;

    // Position of this thread inside its own block.
    const unsigned int localIdx =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    return static_cast<int>(linearBlock * threadsPerBlock + localIdx);
}
3D grid of 1D blocks
// Flat global thread index for a launch with a 3D grid of 1D blocks.
// Blocks are linearized with blockIdx.x varying fastest, then blockIdx.y,
// then blockIdx.z; each block contributes blockDim.x consecutive indices.
__device__ int getGlobalIdx_3D_1D()
{
    // Position of this block inside the grid (x fastest, z slowest).
    const unsigned int linearBlock =
        (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;

    return static_cast<int>(linearBlock * blockDim.x + threadIdx.x);
}
3D grid of 2D blocks
// Flat global thread index for a launch with a 3D grid of 2D blocks.
// Blocks are linearized with blockIdx.x varying fastest, then blockIdx.y,
// then blockIdx.z; threads inside a block are linearized with threadIdx.x
// varying fastest.
__device__ int getGlobalIdx_3D_2D()
{
    // Position of this block inside the grid (x fastest, z slowest).
    const unsigned int linearBlock =
        (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;

    // Number of threads contained in one block.
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y;

    // Position of this thread inside its own block.
    const unsigned int localIdx = threadIdx.y * blockDim.x + threadIdx.x;

    return static_cast<int>(linearBlock * threadsPerBlock + localIdx);
}
3D grid of 3D blocks
// Flat global thread index for a launch with a 3D grid of 3D blocks.
// Blocks are linearized with blockIdx.x varying fastest, then blockIdx.y,
// then blockIdx.z; threads inside a block follow the same x-fastest,
// z-slowest ordering using threadIdx and blockDim.
__device__ int getGlobalIdx_3D_3D()
{
    // Position of this block inside the grid (x fastest, z slowest).
    const unsigned int linearBlock =
        (blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;

    // Number of threads contained in one block.
    const unsigned int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;

    // Position of this thread inside its own block.
    const unsigned int localIdx =
        (threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    return static_cast<int>(linearBlock * threadsPerBlock + localIdx);
}
Archiver|手机版|科学网 ( 京ICP备07017567号-12 )
GMT+8, 2024-9-27 10:29
Powered by ScienceNet.cn
Copyright © 2007- 中国科学报社