Bring CUDA support to Tracking.jl
#33
Changes from 101 commits
@@ -0,0 +1,16 @@
env:
  SECRET_CODECOV_TOKEN: "Q3fuMdJjaQy9h/uk43rwSqz8M6ulvlCedU2Ir0S3QLP4t9F8cf7pzrTkX+nVhkGycZ/r5FRtTOwPr445R3wK5v9mEAsJN5GMOgI5w/L8m2XDwLmW3PN8RMno+fm2JVxZyPMNNmIQqbYEmmQcBS6Q3nywW3xi0Cl5umJuwDB+NdOFbpq3wc2wrnbOAbwlBJoCJmlH+F4ncuVY6EMmsgNKAf9RqUNWQxIthG616X1cNwuYEpL4dO/PWY2GMXWXTQ8ndO/713p4b5yIlzDP0mr2MrO+1A5fhgPc7Vr+f9mUlIAx+9AsWQYPrqPTkr2L5+mfaTodVE3u2Cop877WJZQD7w==;U2FsdGVkX1/wk2jzfWlRZ66IWgionQK/5Fu0pg3u0b26hhmmMjAjOklyi7QZKhJHjjt4KjK/dJzhd3eK28S0qQ=="

steps:
  - label: "Julia v1.6"
    plugins:
      - JuliaCI/julia#v1:
          version: "1.6"
      - JuliaCI/julia-test#v1: ~
      - JuliaCI/julia-coverage#v1:
          codecov: true
    agents:
      queue: "juliagpu"
      cuda: "*"
    if: build.message !~ /\[skip tests\]/
    timeout_in_minutes: 60
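
Not part of this diff: the Buildkite job above runs on the "juliagpu" queue with a CUDA agent, so the test suite can assume a GPU there. For local runs without a GPU, one common pattern is to gate the GPU-specific tests on CUDA.functional(). A sketch only; the file names below are hypothetical and not taken from this PR:

using Test
using CUDA  # assumed to be a test dependency

@testset "Tracking.jl" begin
    include("tracking_tests.jl")          # hypothetical CPU-only tests
    if CUDA.functional()
        include("cuda_tracking_tests.jl") # hypothetical GPU-only tests
    else
        @info "CUDA not functional, skipping GPU tests"
    end
end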
@@ -124,7 +124,7 @@ Get prompt correlator
function get_prompt(correlator::AbstractCorrelator, correlator_sample_shifts)
    correlator.accumulators[get_prompt_index(correlator_sample_shifts)]
end

CUDA.dot

Review comment: What's that?

"""
$(SIGNATURES)
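
For context, a minimal self-contained sketch (toy types, not the Tracking.jl implementation) of what get_prompt does: it picks the accumulator whose sample shift is zero.

struct ToyCorrelator
    accumulators::Vector{ComplexF64}
end

# hypothetical stand-in for get_prompt_index: the prompt is the zero shift
toy_prompt_index(correlator_sample_shifts) = findfirst(iszero, correlator_sample_shifts)

toy_get_prompt(c::ToyCorrelator, shifts) = c.accumulators[toy_prompt_index(shifts)]

# early / prompt / late accumulators with sample shifts [-2, 0, 2]
c = ToyCorrelator([1.0 + 0.0im, 5.0 + 1.0im, 1.2 + 0.1im])
toy_get_prompt(c, [-2, 0, 2])  # -> 5.0 + 1.0im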
@@ -29,4 +29,130 @@ function downconvert_and_correlate(
    accumulators_result = complex.(a_re, a_im)
    C(map(+, get_accumulators(correlator), accumulators_result))
end
=#
=#
# CUDA kernel: downconvert the signal with the carrier replica and correlate with the code replica
function downconvert_and_correlate_kernel(
    res_re,
    res_im,
    signal_re,
    signal_im,
    carrier_re,
    carrier_im,
    codes,
    code_frequency,
    correlator_sample_shifts,
    carrier_frequency,
    sampling_frequency,
    start_code_phase,
    carrier_phase,
    code_length,
    prn,
    num_samples,
    num_ants,
    num_corrs
)
    cache = @cuDynamicSharedMem(Float32, (2 * blockDim().x, num_ants, num_corrs))
    sample_idx = 1 + ((blockIdx().x - 1) * blockDim().x + (threadIdx().x - 1))
    antenna_idx = 1 + ((blockIdx().y - 1) * blockDim().y + (threadIdx().y - 1))
    corr_idx = 1 + ((blockIdx().z - 1) * blockDim().z + (threadIdx().z - 1))
    iq_offset = blockDim().x
    cache_index = threadIdx().x - 1

    code_phase = accum_re = accum_im = dw_re = dw_im = 0.0f0
    mod_floor_code_phase = Int(0)

    if sample_idx <= num_samples && antenna_idx <= num_ants && corr_idx <= num_corrs
        # generate the carrier
        carrier_im[sample_idx], carrier_re[sample_idx] = CUDA.sincos(2π * ((sample_idx - 1) * carrier_frequency / sampling_frequency + carrier_phase))

Review comment: For what purpose do you save the sincos result in a vector? Is it for performance improvements in the future? If this is the case, let's dismiss that here to have a clean baseline.

        # downconvert with the conjugate of the carrier
        dw_re = signal_re[sample_idx, antenna_idx] * carrier_re[sample_idx] + signal_im[sample_idx, antenna_idx] * carrier_im[sample_idx]
        dw_im = signal_im[sample_idx, antenna_idx] * carrier_re[sample_idx] - signal_re[sample_idx, antenna_idx] * carrier_im[sample_idx]

        # calculate the code phase
        code_phase = code_frequency / sampling_frequency * ((sample_idx - 1) + correlator_sample_shifts[corr_idx]) + start_code_phase

        # wrap the code phase around the code length, e.g. code_phase = 1024 -> mod_floor_code_phase = 1
        mod_floor_code_phase = 1 + mod(floor(Int32, code_phase), code_length)

        # multiply elementwise with the code
        accum_re += codes[mod_floor_code_phase, prn] * dw_re
        accum_im += codes[mod_floor_code_phase, prn] * dw_im
    end

    cache[1 + cache_index + 0 * iq_offset, antenna_idx, corr_idx] = accum_re
    cache[1 + cache_index + 1 * iq_offset, antenna_idx, corr_idx] = accum_im

    ## Reduction
    # wait until all threads have finished writing their results to the cache
    sync_threads()

    i::Int = blockDim().x ÷ 2
    @inbounds while i != 0
        if cache_index < i
            cache[1 + cache_index + 0 * iq_offset, antenna_idx, corr_idx] += cache[1 + cache_index + 0 * iq_offset + i, antenna_idx, corr_idx]
            cache[1 + cache_index + 1 * iq_offset, antenna_idx, corr_idx] += cache[1 + cache_index + 1 * iq_offset + i, antenna_idx, corr_idx]
        end
        sync_threads()
        i ÷= 2
    end

    if (threadIdx().x - 1) == 0
        res_re[blockIdx().x, antenna_idx, corr_idx] += cache[1 + 0 * iq_offset, antenna_idx, corr_idx]
        res_im[blockIdx().x, antenna_idx, corr_idx] += cache[1 + 1 * iq_offset, antenna_idx, corr_idx]
    end
    return nothing
end
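
Side note, not part of the diff: the tree reduction above is the standard shared-memory pattern, halving the number of active threads each step and adding the upper half of the cache onto the lower half. A minimal stand-alone sketch of the same pattern for a plain vector sum, with illustrative names only:

using CUDA

function block_sum_kernel(out, x, n)
    # one Float32 slot per thread in dynamic shared memory
    cache = @cuDynamicSharedMem(Float32, blockDim().x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    cache[threadIdx().x] = i <= n ? x[i] : 0.0f0
    sync_threads()
    # tree reduction: fold the upper half onto the lower half each step
    s = blockDim().x ÷ 2
    while s != 0
        if threadIdx().x <= s
            cache[threadIdx().x] += cache[threadIdx().x + s]
        end
        sync_threads()
        s ÷= 2
    end
    if threadIdx().x == 1
        out[blockIdx().x] = cache[1]  # one partial sum per block
    end
    return nothing
end

x = CUDA.rand(Float32, 10_000)
threads = 256
blocks = cld(length(x), threads)
out = CUDA.zeros(Float32, blocks)
@cuda threads=threads blocks=blocks shmem=threads * sizeof(Float32) block_sum_kernel(out, x, length(x))
sum(out) ≈ sum(x)  # true up to Float32 rounding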

function downconvert_and_correlate_kernel_wrapper(
    system,
    signal,
    correlator,
    code_replica,
    code_phase,
    carrier_replica,
    carrier_phase,
    downconverted_signal,
    code_frequency,
    correlator_sample_shifts,
    carrier_frequency,
    sampling_frequency,
    signal_start_sample,
    num_samples_left,
    prn
)
    num_corrs = length(correlator_sample_shifts)
    num_ants = size(signal, 2)
    num_samples = size(signal, 1)
    block_dim_z = num_corrs
    block_dim_y = num_ants
    # keep num_corrs and num_ants in separate dimensions, truncate num_samples accordingly to fit
    block_dim_x = prevpow(2, 1024 ÷ block_dim_y ÷ block_dim_z)
    threads = (block_dim_x, block_dim_y, block_dim_z)
    blocks = cld(size(signal, 1), block_dim_x)
    res_re = CUDA.zeros(Float32, blocks, block_dim_y, block_dim_z)
    res_im = CUDA.zeros(Float32, blocks, block_dim_y, block_dim_z)
    shmem_size = sizeof(ComplexF32) * block_dim_x * block_dim_y * block_dim_z
    @cuda threads=threads blocks=blocks shmem=shmem_size downconvert_and_correlate_kernel(
        res_re,
        res_im,
        signal.re,
        signal.im,
        carrier_replica.carrier.re,
        carrier_replica.carrier.im,
        system.codes,
        Float32(code_frequency),
        correlator_sample_shifts,
        Float32(carrier_frequency),
        Float32(sampling_frequency),
        Float32(code_phase),
        Float32(carrier_phase),
        size(system.codes, 1),
        prn,
        num_samples,
        num_ants,
        num_corrs
    )
    return sum(res_re .+ 1im * res_im, dims=1)
end
Review comment: I guess this is a leftover from previous tests. I think this can be removed.
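
For reference, a hypothetical example of the launch configuration this wrapper computes, assuming 4 antenna channels, 3 correlator taps (early, prompt, late) and 2500 samples; the numbers are chosen for illustration and are not taken from this PR:

num_ants    = 4
num_corrs   = 3
block_dim_x = prevpow(2, 1024 ÷ num_ants ÷ num_corrs)  # prevpow(2, 85) == 64
threads     = (block_dim_x, num_ants, num_corrs)       # (64, 4, 3) -> 768 threads per block
blocks      = cld(2500, block_dim_x)                   # 40 blocks along the sample dimension
shmem       = sizeof(ComplexF32) * prod(threads)       # 8 * 768 == 6144 bytes of shared memory per block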