//----2011-08-18--lqlu@home.ipe.ac.cn---20:38:33--
//copy array of structures-
// N: number of Foo structures; L: length of each per-struct float array.
const int N = 3;
const int L = 2;
// A structure that owns a small float array via a raw pointer.
// NOTE(review): `bp` holds a HOST address in the host copy and a DEVICE
// address in the staging/device copies (see the deep-copy code below), so a
// Foo must never be dereferenced on the wrong side.
struct Foo
{
int number; // number of valid elements behind bp (set to L below)
float * bp; // pointer to the float array (host or device, depending on copy)
};
// Report-and-abort wrapper: every CUDA runtime call returns a cudaError_t
// that the original snippet silently discarded.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

Foo* dFoo = NULL;          // device array of Foo
Foo* hFoo = new Foo[N];    // host array (owns the host-side bp buffers)
Foo* tFoo = new Foo[N];    // staging array whose bp members are DEVICE pointers

for (int i = 0; i < N; ++i) // init host data
{
    hFoo[i].number = L;
    hFoo[i].bp = (float*)malloc(sizeof(float) * L);
}

// Deep copy: each per-struct float array gets its own device buffer, and the
// DEVICE pointer is stored in the staging struct, because the pointers inside
// dFoo must be valid device addresses when the kernel dereferences them.
for (size_t i = 0; i != N; ++i)
{
    float* d_bp = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_bp, sizeof(float) * L));
    CUDA_CHECK(cudaMemcpy(d_bp, hFoo[i].bp, sizeof(float) * L, cudaMemcpyHostToDevice));
    tFoo[i].bp = d_bp;     // this is already a device pointer!
    tFoo[i].number = hFoo[i].number;
}
CUDA_CHECK(cudaMalloc((void**)&dFoo, sizeof(Foo) * N)); // device mem. alloc
CUDA_CHECK(cudaMemcpy(dFoo, tFoo, sizeof(Foo) * N, cudaMemcpyHostToDevice));

MyFunc<<<1, N>>>(dFoo);
CUDA_CHECK(cudaGetLastError()); // catch launch-configuration errors

// Copy back: first the structs (to recover the device bp pointers), then each
// per-struct array into the original host buffers. The blocking cudaMemcpy
// also synchronizes with the kernel.
CUDA_CHECK(cudaMemcpy(tFoo, dFoo, sizeof(Foo) * N, cudaMemcpyDeviceToHost));
for (size_t i = 0; i != N; ++i)
    CUDA_CHECK(cudaMemcpy(hFoo[i].bp, tFoo[i].bp, sizeof(float) * L, cudaMemcpyDeviceToHost));

// Release device and staging resources (the original leaked all of these).
// hFoo and its bp buffers stay alive: they now hold the results.
for (size_t i = 0; i != N; ++i)
    CUDA_CHECK(cudaFree(tFoo[i].bp));
CUDA_CHECK(cudaFree(dFoo));
delete[] tFoo;
//========================================================================================================
//mapping a 2D thread grid to a 1D index
// Template kernel: map a 2D grid of 2D blocks onto a flat 1D array.
// Launch with dim3 grid/block (see the host code below); each thread handles
// at most one element of `data`.
// The original was not valid CUDA: it lacked the `void` return type and had an
// unnamed parameter list with a trailing comma, and `length` was never declared.
__global__ void kernel_(double* data, int length)
{
    int thx = threadIdx.x + blockIdx.x * blockDim.x; // global x index
    int thy = threadIdx.y + blockIdx.y * blockDim.y; // global y index
    int offset = thx + thy * blockDim.x * gridDim.x; // flattened 1D index
    if (offset >= length) return; // guard: the grid rarely divides data evenly
    // ... operate on data[offset] here ...
}
// Host-side launch configuration for kernel_.
// The original note left the initializers blank and dropped several
// semicolons; fill in the problem size where indicated.
int ttx = 1024; // total threads needed along x (placeholder: set to your width)
int tty = 1024; // total threads needed along y (placeholder: set to your height)
dim3 threads(32, 32); // 1024 threads per block
// Integer ceil-division: the original round-tripped through double and ceil(),
// which is both slower and risks rounding surprises for large counts.
int bnx = (ttx + (int)threads.x - 1) / (int)threads.x;
int bny = (tty + (int)threads.y - 1) / (int)threads.y;
//printf("tnx=%d,tny=%d,threads/block=%d\nbnx=%d,bny=%d,totalBlocks=%d\n",threads.x,threads.y,threads.x*threads.y,bnx,bny,bnx*bny);
dim3 blocks(bnx, bny);
kernel_<<<blocks, threads>>>(d_data, ttx * tty); // d_data: device buffer to process
cudaGetLastError(); // kernel launches do not return errors directly
cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated
//---------------------------------------------------------------------------------------------
//002: timing a kernel with CUDA events.
// Events are recorded into the same (default) stream as the kernel, so the
// elapsed time brackets exactly the kernel execution.
cudaEvent_t start, stop;
float time = 0.0f; // elapsed time in MILLISECONDS (cudaEventElapsedTime's unit)
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
kernel<<<grid,threads>>> ( d_odata, d_idata, size_x, size_y, NUM_REPS);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop); // block until the stop event has actually completed
cudaEventElapsedTime(&time, start, stop);
// The original printed `stop-start` — subtracting two opaque event handles —
// with a %d format and a missing semicolon; print the measured time instead.
printf("kernel spent: %f ms\n", time);
cudaEventDestroy(start);
cudaEventDestroy(stop);
//-------------------use--shared--mem------------------------------------------------------------------------
// Tiled multiply where only `a` is staged through shared memory: c = a * b.
// Assumes blockDim == (TILE_DIM, TILE_DIM), `a` has row stride TILE_DIM
// (i.e. a is M x TILE_DIM), and `b`/`c` have row stride N — TODO confirm
// against the launch code, which is not visible here.
__global__ void coalescedMultiply(float *a, float* b, float *c, int N)
{
    __shared__ float aTile[TILE_DIM][TILE_DIM];
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    // Coalesced load of one tile row of `a` into shared memory.
    aTile[threadIdx.y][threadIdx.x] = a[row*TILE_DIM+threadIdx.x];
    // The dot-product loop reads tile elements written by OTHER threads, so a
    // barrier is required before consuming the tile. The original omitted it,
    // relying on implicit warp synchrony, which is unsafe under independent
    // thread scheduling (Volta+).
    __syncthreads();
    for (int i = 0; i < TILE_DIM; i++)
        sum += aTile[threadIdx.y][i]* b[i*N+col];
    c[row*N+col] = sum;
}
//---------------------------------------------------------------------------------------------
// Tiled multiply with BOTH operands staged through shared memory: c = a * b.
// Expects blockDim == (TILE_DIM, TILE_DIM); `a` is read with row stride
// TILE_DIM, `b` and `c` with row stride N.
__global__ void sharedABMultiply(float *a, float* b, float *c, int N)
{
    __shared__ float tileA[TILE_DIM][TILE_DIM], tileB[TILE_DIM][TILE_DIM];
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;
    // Stage one tile of each operand; both loads are coalesced across the warp.
    tileA[threadIdx.y][threadIdx.x] = a[row * TILE_DIM + threadIdx.x];
    tileB[threadIdx.y][threadIdx.x] = b[threadIdx.y * N + col];
    // Barrier: the dot product below reads tile entries written by other warps.
    __syncthreads();
    float acc = 0.0f;
    for (int k = 0; k != TILE_DIM; ++k)
        acc += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
    c[row * N + col] = acc;
}
//----------------pitched---memory--------------------------------------------------------------------------
// Pitched 2D allocation: cudaMallocPitch pads each row to an aligned pitch so
// row starts satisfy coalescing/alignment requirements.
float *cpu_A, *gpu_A;
int height_A = M;   // matrix height (number of rows)
int width_A = DIM;  // matrix width (number of columns)
int pitch_A_num;    // row pitch measured in float ELEMENTS (for kernel indexing)
size_t pitch_A, mem_size_A, mem_size_row_A; // pitch_A is in BYTES (set by cudaMallocPitch)
mem_size_row_A = sizeof(float)*width_A;     // logical (unpadded) row size in bytes
mem_size_A = mem_size_row_A * height_A;
cpu_A = (float*)malloc(mem_size_A);
// The original line had a stray extra ')' that made it a syntax error.
cutilSafeCall( cudaMallocPitch((void**) &gpu_A, &pitch_A, mem_size_row_A, height_A));
pitch_A_num = pitch_A/sizeof(float); // element stride to pass to kernels
//cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind) — pitches and width are in BYTES
cutilSafeCall( cudaMemcpy2D(gpu_A, pitch_A, cpu_A, mem_size_row_A, mem_size_row_A, height_A, cudaMemcpyHostToDevice));
// Copy back: pitch_C returned by cudaMallocPitch is ALREADY in bytes; the
// original multiplied it by sizeof(float) again, over-stepping each device row.
cutilSafeCall( cudaMemcpy2D(cpu_C, mem_size_row_C, gpu_C, pitch_C, mem_size_row_C, height_C, cudaMemcpyDeviceToHost));
// Kernel-body fragment: one thread computes one element c[row][col] of
// C = A * B for matrices stored with pitched rows.
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if(row>=M||col>=N) return; // guard threads that fall outside the M x N result
float sum = 0.0f;
for(int i=0; i<DIM; i++){
// NOTE(review): pitch_A/pitch_B/pitch_C are used here as ELEMENT strides, so
// they must be the pitch_*_num values (byte pitch / sizeof(float)) computed on
// the host above, NOT the raw byte pitches from cudaMallocPitch — confirm
// against the kernel's actual parameter list, which is not visible here.
sum += a[row*pitch_A+i]*b[i*pitch_B+col];
}
c[row*pitch_C+col] = sum;
// Source: https://blog.sciencenet.cn/blog-325165-484290.html
// Previous post: CFD demo source code (FTCS / BTCS / CNCS), RealTime version
// Next post: A half-started project: granular matter physics