CUDA (Compute Unified Device Architecture) is a programming model for GPUs; its goal is to make SIMD hardware usable by a much broader range of applications.
GPUs exploit Moore's law: the growing transistor budget goes into more parallel execution units rather than into complex control logic.
A CUDA GPU is a collection of streaming multiprocessors (SMs); each SM is a set of SIMD execution pipelines (scalar processors) that share control logic, a register file, and an L1 cache.
Evolution of Nvidia GPU architectures.
Note: WSL does not support GPU access yet!
Because the GPU generally serves as a coprocessor, the CPU and GPU form a heterogeneous system: the CPU is the host and the GPU is the device.
A typical CUDA program executes as follows:
```cpp
#include <iostream>
#include <math.h>

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
  for (int i = 0; i < n; i++)
    y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20;
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 1>>>(N, x, y);

  // Wait for GPU to finish before accessing on host
  // (kernel launches are asynchronous by default, unlike OpenMP)
  cudaDeviceSynchronize();

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}
```
- `__global__`: declares a kernel, a function that runs in parallel across device threads and is launched from the host (some GPUs also allow launching from the device). It must return `void`, does not support variadic arguments, cannot be a class member function, and can only access device memory. Note that a kernel defined with `__global__` is asynchronous: the host does not wait for it to finish before executing the next statement.
- `__device__`: runs on the device and can only be called from device code; it cannot be combined with `__global__`.
- `__host__`: runs on the host and can only be called from host code; it is usually omitted. It cannot be combined with `__global__`, but it can be combined with `__device__`, in which case the function is compiled for both host and device (see the sketch below).
Memory operations:

- `cudaError_t cudaMalloc(void** devPtr, size_t size);` allocates device memory.
- `cudaFree(void*)` releases it.
- `cudaError_t cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)`: `kind` selects the direction of the copy: `cudaMemcpyHostToHost`, `cudaMemcpyHostToDevice`, `cudaMemcpyDeviceToHost`, or `cudaMemcpyDeviceToDevice` (a usage sketch follows the list).
- `cudaError_t cudaMallocManaged(void **devPtr, size_t size, unsigned int flag=0)`: since CUDA 6.0, allocates unified (managed) memory; data transfers between host and device are performed automatically.
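For contrast with the unified-memory version above, a minimal sketch of the explicit allocate/copy/compute/copy-back workflow (the `h_`/`d_` naming is just a convention for this example):

```cpp
#include <cstdlib>
#include <cuda_runtime.h>

int main() {
  int n = 1 << 20;
  size_t bytes = n * sizeof(float);

  float *h_x = (float*)malloc(bytes);   // host buffer
  float *d_x = nullptr;                 // device buffer
  cudaMalloc((void**)&d_x, bytes);

  cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice);  // host -> device
  // ... launch kernels that read/write d_x here ...
  cudaMemcpy(h_x, d_x, bytes, cudaMemcpyDeviceToHost);  // device -> host

  cudaFree(d_x);
  free(h_x);
  return 0;
}
```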
The CUDA programming model is organized as a four-level thread hierarchy (which can also be mapped onto other hardware).
Each CUDA thread has its own control flow, PC, registers, and stack, and can access any global memory address on the GPU.
Built-in index variables: `threadIdx.{x,y,z}` and `blockIdx.{x,y,z}`.

A kernel's two-level thread organization (2-dim case): a thread is uniquely identified by the pair of built-in coordinate variables (`blockIdx`, `threadIdx`), both `uint3` values. The launch configuration `<<<grid, block>>>` specifies the number of blocks in the grid and the number of threads per block, as `dim3` values.
About the `dim3` type:

- `dim3` is a vector type based on `uint3`, essentially a struct of three `unsigned int` fields; `uint3` has the data members `unsigned int x; unsigned int y; unsigned int z;`.
- `threadIdx` is, as the name suggests, the index of a thread within its block: a 1-D block uses `threadIdx.x`, a 2-D block adds `threadIdx.y`, and a 3-D block adds `threadIdx.z`.
- `blockIdx` is the index of a block within the grid, likewise with `blockIdx.x`, `blockIdx.y`, `blockIdx.z`.
- `blockDim` gives the dimensions of a block: `blockDim.x`, `blockDim.y`, `blockDim.z`.
- `gridDim` gives the dimensions of the grid: `gridDim.x`, `gridDim.y`, `gridDim.z`.

Computing a thread's ID:

- In a 1-D block, `threadID = threadIdx.x`.
- In a 2-D block of shape `(blockDim.x, blockDim.y)`, `threadID = threadIdx.x + threadIdx.y * blockDim.x`; the global coordinates are `int i = blockIdx.x * blockDim.x + threadIdx.x;` and `int j = blockIdx.y * blockDim.y + threadIdx.y;`.
- In a 3-D block of shape `(blockDim.x, blockDim.y, blockDim.z)`, `threadID = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.x*blockDim.y`.
When there are more elements than threads, each thread processes several elements with a grid-stride loop: the stride is the total number of threads, `stride = blockDim.x * gridDim.x`, and each thread advances by `threadId += stride` per iteration, as in the sketch below.
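A minimal grid-stride rewrite of the `add` kernel from the opening example (this variant is an assumption of mine, not code from the original text):

```cpp
__global__ void add(int n, float *x, float *y) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;  // total number of threads in the grid
  // Each thread handles elements index, index+stride, index+2*stride, ...
  for (int i = index; i < n; i += stride)
    y[i] = x[i] + y[i];
}
```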
```cpp
dim3 grid(3, 2);
dim3 block(5, 3);
kernel_fun<<< grid, block >>>(prams...);

// For the thread at Thread(1,1) within Block(1,1):
// threadIdx.x = 1
// threadIdx.y = 1
// blockIdx.x = 1
// blockIdx.y = 1
```
| Variable declaration | Memory | Scope | Lifetime |
|---|---|---|---|
| `int var` | register | thread | thread |
| `int array_var[100]` | register/local | thread | thread |
| `__shared__ int shared_var` | shared | block | block |
| `__device__ int global_var` | global | global | application |
| `__constant__ int constant_var` | constant | global | application |
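A sketch showing where each kind of declaration from the table lives (the names are illustrative):

```cpp
__device__ int global_var;       // global memory, lives for the whole application
__constant__ int constant_var;   // constant memory, read-only from device code

__global__ void kernel() {
  int var;                       // register, private to each thread
  int array_var[100];            // registers or local memory, per thread
  __shared__ int shared_var;     // shared memory, one copy per block
}
```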
Data layout: instead of giving each `dataElem` its own string pointer, pool all strings into a single `all_names` array so the strings (`name`) are contiguous in memory; each struct stores only the position and length of its name:
```cpp
struct dataElem {
  int prop_0;
  int prop_1;
  int name_pos, name_len;  // offset and length of this element's name in all_names
};
char* all_names;
```
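A sketch of the corresponding allocations with unified memory (`n` and `total_name_bytes` are hypothetical sizes):

```cpp
// One managed allocation for the structs and one for the pooled characters,
// so both are reachable from host and device without per-element deep copies.
dataElem *elems;
cudaMallocManaged(&elems, n * sizeof(dataElem));
cudaMallocManaged(&all_names, total_name_bytes);

// On the device, element i's name is the contiguous range
// all_names[elems[i].name_pos .. elems[i].name_pos + elems[i].name_len).
```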
Best access pattern for constant memory: every thread in a warp reads the same address (here all threads of a block read `const_var[blockIdx.x % 16]`), so one fetch is broadcast to the whole warp:

```cpp
__constant__ int const_var[16];

__global__ void kernel() {
  int i = blockIdx.x;
  int value = const_var[i % 16];  // same address for every thread in the block
}
```

Worst access pattern for constant memory: threads in a warp read different addresses, and the reads are serialized:

```cpp
__constant__ int const_var[16];

__global__ void kernel() {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int value = const_var[i % 16];  // neighboring threads hit different addresses
}
```
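Constant memory is written from the host with `cudaMemcpyToSymbol`; a minimal sketch:

```cpp
int host_data[16];
for (int i = 0; i < 16; i++)
  host_data[i] = i;

// Copy host_data into the device-side __constant__ array declared above.
cudaMemcpyToSymbol(const_var, host_data, sizeof(host_data));
```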
Texture memory (legacy texture-reference API):

- declare a texture reference: `texture<Datatype, Type, ReadMode> tex`
- bind device memory to it with `cudaBindTexture()` / `cudaBindTexture2D()`
- release the binding with `cudaUnbindTexture()`
```cpp
#define N 1024

texture<float, 1, cudaReadModeElementType> tex;

__global__ void kernel() {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  float x = tex1Dfetch(tex, i);  // read through the texture cache
}

int main() {
  float *buffer;
  cudaMalloc(&buffer, N * sizeof(float));
  cudaBindTexture(0, tex, buffer, N * sizeof(float));

  kernel<<<N / 256, 256>>>();

  cudaUnbindTexture(tex);
  cudaFree(buffer);
  return 0;
}
```
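Texture references are deprecated in recent CUDA toolkits; the equivalent setup with the texture *object* API, as a sketch (the launch configuration is illustrative):

```cpp
#define N 1024  // as above

__global__ void kernel(cudaTextureObject_t t) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  float x = tex1Dfetch<float>(t, i);  // read via the texture object
}

int main() {
  float *buffer;
  cudaMalloc(&buffer, N * sizeof(float));

  // Describe the resource (a linear buffer of N floats)...
  cudaResourceDesc resDesc{};
  resDesc.resType = cudaResourceTypeLinear;
  resDesc.res.linear.devPtr = buffer;
  resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
  resDesc.res.linear.sizeInBytes = N * sizeof(float);

  // ...and how it is sampled.
  cudaTextureDesc texDesc{};
  texDesc.readMode = cudaReadModeElementType;

  cudaTextureObject_t tex = 0;
  cudaCreateTextureObject(&tex, &resDesc, &texDesc, nullptr);

  kernel<<<N / 256, 256>>>(tex);

  cudaDestroyTextureObject(tex);
  cudaFree(buffer);
  return 0;
}
```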
Read-only cache: `__ldg()` replaces a plain pointer dereference and forces the load to go through the read-only data cache.
```cpp
__global__ void kernel(int *buffer) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int x = __ldg(&buffer[i]);  // load through the read-only data cache
}

int main() {
  int *buffer;
  cudaMalloc(&buffer, sizeof(int) * N);  // N as defined above
  kernel<<<N / 256, 256>>>(buffer);
  cudaFree(buffer);
  return 0;
}
```
`const __restrict__` on a kernel pointer parameter likewise tells the compiler the data can be accessed through the read-only cache:
```cpp
__global__ void kernel(const int* __restrict__ buffer) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int x = buffer[i];  // the compiler may route this through the read-only cache
}

int main() {
  int *buffer;
  cudaMalloc(&buffer, sizeof(int) * N);
  kernel<<<N / 256, 256>>>(buffer);
  cudaFree(buffer);
  return 0;
}
```
Shared memory: reads and writes are very fast, with bandwidth > 1 TB/s.

Memory banks and access patterns: shared memory is divided into banks, and threads of a warp that hit different addresses in the same bank conflict. Bank conflicts can be eliminated by changing the stride, e.g. padding arrays (at the cost of extra memory), as sketched below.
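A common padding trick, shown in a tile-transpose sketch (the 32x32 tile is illustrative; assumes a square n x n matrix with n a multiple of TILE, launched with `dim3 block(TILE, TILE)` and `dim3 grid(n/TILE, n/TILE)`):

```cpp
#define TILE 32

__global__ void transpose(const float *in, float *out, int n) {
  // The extra +1 column shifts each row of the tile by one bank, so reading
  // a column of the tile no longer maps all threads of a warp to one bank.
  __shared__ float tile[TILE][TILE + 1];

  int x = blockIdx.x * TILE + threadIdx.x;
  int y = blockIdx.y * TILE + threadIdx.y;
  tile[threadIdx.y][threadIdx.x] = in[y * n + x];
  __syncthreads();

  // Write the transposed tile; the shared-memory read is a column access.
  x = blockIdx.y * TILE + threadIdx.x;
  y = blockIdx.x * TILE + threadIdx.y;
  out[y * n + x] = tile[threadIdx.x][threadIdx.y];
}
```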
Tools:

- `nvcc`: the CUDA compiler driver
- `nvprof`: command-line profiler
- `nvidia-smi`: device management and monitoring
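Typical invocations (the file name and architecture flag are illustrative):

```bash
nvcc -O3 -arch=sm_70 add.cu -o add   # compile for a Volta-class GPU
nvprof ./add                         # profile kernel and memcpy times
nvidia-smi                           # show utilization, memory, temperature
```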
Error checking: wrap CUDA API calls in a macro that reports failures:

```cpp
#define CHECK(call)                                  \
{                                                    \
  const cudaError_t error = call;                    \
  if (error != cudaSuccess) {                        \
    printf("Error: %s:%d, ", __FILE__, __LINE__);    \
    printf("code:%d, reason: %s \n",                 \
           error, cudaGetErrorString(error));        \
    exit(1);                                         \
  }                                                  \
}

// Usage:
CHECK(cudaMalloc((void**)&a, n_bytes));
```
```cpp
__global__ void vector_add(int *a, int *b, int *c, int n) {
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid < n) {
    c[tid] = a[tid] + b[tid];
  }
}

// Round n/m up, so that a partial final block still covers all n elements.
int divup(int n, int m) {
  return (n % m) ? (n / m + 1) : (n / m);
}

vector_add<<<divup(n, m), m>>>(a, b, c, n);
```
```cpp
__global__ void matrix_add(int *A, int *B, int *C, int n, int m) {
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid < n * m) {  // treat the n x m matrix as one flat array
    C[tid] = A[tid] + B[tid];
  }
}

matrix_add<<<divup(n * m, block_size), block_size>>>(A, B, C, n, m);
```
```cpp
// 1-D stencil using shared memory. Assumes BDIM (block size) and RADIUS are
// compile-time constants and c[] is a __constant__ array of coefficients;
// interior blocks only (the halo loads below index into in[tid - RADIUS]).
__global__ void stencil(float *in, float *out) {
  __shared__ float smem[BDIM + 2 * RADIUS];

  // thread index into global memory
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  // index into shared memory, shifted past the left halo
  int sid = threadIdx.x + RADIUS;

  // copy this block's elements to shared memory
  smem[sid] = in[tid];
  // the first RADIUS threads also load the left and right halos
  if (threadIdx.x < RADIUS) {
    smem[sid - RADIUS] = in[tid - RADIUS];
    smem[sid + BDIM] = in[tid + BDIM];
  }
  __syncthreads();  // all loads must finish before any thread reads smem

  float tmp = 0.0f;
  for (int i = 1; i <= RADIUS; ++i) {
    tmp += c[i] * (smem[sid + i] - smem[sid - i]);
  }
  out[tid] = tmp;
}
```
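A sketch of the host-side setup this kernel expects (BDIM, RADIUS, and the coefficient values are assumptions for the example; real code must also handle the boundary blocks, which this kernel ignores):

```cpp
#define BDIM 256
#define RADIUS 4

__constant__ float c[RADIUS + 1];  // stencil coefficients; c[0] is unused here

int main() {
  const int n = 1 << 20;
  float *in, *out;
  cudaMalloc(&in, n * sizeof(float));
  cudaMalloc(&out, n * sizeof(float));

  // Illustrative coefficient values only.
  float h_c[RADIUS + 1] = {0.0f, 0.8f, -0.2f, 0.04f, -0.005f};
  cudaMemcpyToSymbol(c, h_c, sizeof(h_c));

  stencil<<<n / BDIM, BDIM>>>(in, out);
  cudaDeviceSynchronize();

  cudaFree(in);
  cudaFree(out);
  return 0;
}
```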