CUDA Programming
Recap
__global__
__global__voidvoiddkernel(unsigned
dkernel(unsigned*vector,
*vector,unsigned
unsignedvectorsize)
vectorsize){{
int
intidid==blockIdx.x
blockIdx.x**blockDim.x
blockDim.x++threadIdx.x;
threadIdx.x; S0
ifif(id
(id%%2)2)vector[id] id; S1
vector[id]==id;
else
elsevector[id]
vector[id]==vectorsize
vectorsize**vectorsize;
vectorsize; S2
vector[id]++;
vector[id]++; S3
}}
0 1 2 3 4 5 6 7
S0 S0 S0 S0 S0 S0 S0 S0 NOP
S1 S1 S1 S1
Time
S2 S2 S2 S2
S3 S3 S3 S3 S3 S3 S3 S3 2
Classwork
●
Rewrite the following program fragment to
remove thread-divergence.
assert(x
assert(x== ==yy||||xx==
==zz||||xx==
==ww););
ifif(x
(x==
==y)y)xx==zz++w;
w;
else
elseif(
if(xx==
==zz))xx==ww++y;y;
else
elsexx==yy++z;z;
assert(x
assert(x====yy||||xx==
==zz||||xx==
==ww););
xx==yy++zz++ww––x;x;
3
Classwork
●
How many steps does warp threads take to
execute?
__global__
__global__voidvoiddkernel(unsigned
dkernel(unsigned*vector,
*vector,unsigned
unsignedvectorsize)
vectorsize){{
int
intid id==blockIdx.x
blockIdx.x**blockDim.x
blockDim.x++threadIdx.x;
threadIdx.x;
ifif((id
id<=
<=00 ){){
vector[id]
vector[id]==0;0;
for
for(int
(inti=1;i<=100;i++){
i=1;i<=100;i++){
vector[id]
vector[id]+=
+=i;i;
}}
}}
else{
else{
vector[id]
vector[id]==1;1;
}}
}} 4
Classwork
●
How many steps does warp threads take to
execute?
__global__
__global__voidvoiddkernel(unsigned
dkernel(unsigned*vector,
*vector,unsigned
unsignedvectorsize)
vectorsize){{
int
intid id==blockIdx.x
blockIdx.x**blockDim.x
blockDim.x++threadIdx.x;
threadIdx.x;
ifif((id
id<=
<=00 ){){
vector[id]
vector[id]==(101*100)
(101*100)//2;2;
}}
else{
else{
vector[id]
vector[id]==1;1;
}}
}}
5
Classwork
●
How many steps does warp threads take to
execute?
__global__
__global__void
voiddkernel(unsigned
dkernel(unsigned*vector,
*vector,unsigned
unsignedvectorsize)
vectorsize){{
int
intid
id==blockIdx.x
blockIdx.x**blockDim.x
blockDim.x++threadIdx.x;
threadIdx.x;
vector[id]
vector[id]==((11++(((-id)>>31)
(-id)>>31)))**((((101*100)
((101*100)//2)
2)--11))++11; ;
}}
6
Thread-Divergence
__global__
__global__void
voiddkernel(unsigned
dkernel(unsigned*vector,
*vector,unsigned
unsignedvectorsize)
vectorsize){{
unsigned
unsignedid id==blockIdx.x
blockIdx.x**blockDim.x
blockDim.x++threadIdx.x;
threadIdx.x;
switch
switch(id)
(id){{
case
case0:0:vector[id]
vector[id]==0;0; break;
break;
case
case1:1:vector[id]
vector[id]==vector[id];
vector[id]; break;
break;
case
case2:2:vector[id]
vector[id]==vector[id
vector[id--2];
2]; break;
break;
case
case3:3:vector[id]
vector[id]==vector[id
vector[id++3];
3]; break;
break;
case
case4:4:vector[id]
vector[id]==44++44++vector[id];
vector[id]; break;
break;
case
case5:5:vector[id]
vector[id]==55--vector[id];
vector[id]; break;
break;
case
case6:6:vector[id]
vector[id]==vector[6];
vector[6]; break;
break;
case
case7:7:vector[id]
vector[id]==77++7;7; break;
break;
case
case8:8:vector[id]
vector[id]==vector[id]
vector[id]++8;8; break;
break;
case
case9:9:vector[id]
vector[id]==vector[id]
vector[id]**9;9; break;
break;
}}
}}
How
How many
many steps
steps will
will the
the warp
warp threads
threads take?
take?
7
Thread-Divergence
__global__
__global__voidvoiddkernel()
dkernel()
{{
ifif(threadidx.x
(threadidx.x<16)
<16)
{{
printf(“Inside
printf(“InsideIf”);
If”);
Global_Barrier();
Global_Barrier();
}}
else
elseifif(threadidx
(threadidx>=16)
>=16)
{{
printf(“Inside
printf(“Insideelse”);
else”);
Global_Barrier();
Global_Barrier();
}}
}}
What
What is
is the
the Output?
Output?
Deadlock!!
Deadlock!! 8
Memory
Agenda
●
Computation
●
Memory
●
Synchronization
●
Functions
●
Support
●
Topics
10
CUDA Memory Model Overview
• Global / Video memory
– Main means of communicating data
Grid
between host and device
Block (0, 0) Block (1, 0)
– Contents visible to all GPU threads
– Long latency access (400-800 cycles) Shared Memory Shared Memory
– Throughput ~200 GBPS Registers Registers Registers Registers
• Texture Memory Thread (0, 0) Thread (1, 0) Thread (0, 0) Thread (1, 0)
– Read-only (12 KB)
– ~800 GBPS Host Global Memory
– Optimized for 2D spatial locality
• Constant Memory
– Read-only (64 KB)
1111
The numbers are typical values.