__global__ void DIRECT_decimator(
    uint single_tone_length,
    size_t total_length,
    float2* __restrict input,
    float2* __restrict output
);

// Direct demodulation kernel: multiplies the raw SDR input by a per-channel
// local oscillator and writes one demodulated stream per tone.
__global__ void direct_demodulator_fp64(
    double* __restrict tone_frequencies,
    size_t index_counter,
    uint single_tone_length,
    size_t total_length,
    float2* __restrict input,
    float2* __restrict output
){
    double tone_calculated_phase;
    uint input_index;
    double _i, _q;
    for(uint i = blockIdx.x * blockDim.x + threadIdx.x;
        i < total_length;
        i += gridDim.x * blockDim.x
    ){
        input_index = i % single_tone_length;
        // phase argument in units of pi: 2 * f * (absolute sample index)
        tone_calculated_phase = 2. * tone_frequencies[i / single_tone_length] * (index_counter + input_index);
        sincospi(tone_calculated_phase, &_q, &_i);
        output[i].x = input[input_index].x * _i + input[input_index].y * _q;
        output[i].y = input[input_index].y * _i - input[input_index].x * _q;
    }
}

// Integer version of the direct demodulation kernel (numerically more stable):
// the phase is accumulated in integer wavetable units and converted to
// floating point only once per sample.
__global__ void direct_demodulator_integer(
    int* __restrict tone_frequencies,
    int* __restrict tone_phases,
    int wavetablelen,
    size_t index_counter,
    size_t single_tone_length,
    size_t total_length,
    float2* __restrict input,
    float2* __restrict output
){
    double tone_calculated_phase;
    size_t input_index;
    double _i, _q;
    for(size_t i = blockIdx.x * blockDim.x + threadIdx.x;
        i < total_length;
        i += gridDim.x * blockDim.x
    ){
        input_index = i % single_tone_length;
        size_t ch = i / single_tone_length;
        long long int tf = tone_frequencies[ch];
        long long int ii = (input_index + index_counter) % wavetablelen;
        long long int tp = tone_phases[ch];
        long long int my_phase = (tp + (tf * ii) % wavetablelen);
        tone_calculated_phase = 2. * (my_phase / (double)wavetablelen);
        sincospi(tone_calculated_phase, &_q, &_i);
        output[i].y = input[input_index].y * _i - input[input_index].x * _q;
        output[i].x = input[input_index].x * _i + input[input_index].y * _q;
    }
}

void direct_demodulator_wrapper(
    int* __restrict tone_frequencies,
    int* __restrict tone_phases,
    int wavetablelen,
    size_t index_counter,
    size_t single_tone_length,
    size_t total_length,
    float2* __restrict input,
    float2* __restrict output,
    cudaStream_t internal_stream){
    direct_demodulator_integer<<<1024, 32, 0, internal_stream>>>(
        tone_frequencies, tone_phases, wavetablelen, index_counter,
        single_tone_length, total_length, input, output);
}

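// Hypothetical usage sketch for direct_demodulator_wrapper (not part of the
// original file): demodulate 2 tones from a 4096-sample RX buffer. All sizes,
// tone values, and the buffer names d_rx/d_demod are made up for illustration.
void demo_direct_demodulation(cudaStream_t stream){
    const int n_tones = 2;
    const size_t buf_len = 4096;             // samples per tone channel
    const int wavetablelen = 1 << 20;        // integer phase wraps at this value

    int h_freqs[n_tones]  = {12345, 67890};  // tone frequencies in wavetable units
    int h_phases[n_tones] = {0, 0};

    int *d_freqs, *d_phases;
    float2 *d_rx, *d_demod;
    cudaMalloc(&d_freqs,  n_tones * sizeof(int));
    cudaMalloc(&d_phases, n_tones * sizeof(int));
    cudaMalloc(&d_rx,     buf_len * sizeof(float2));            // raw SDR samples
    cudaMalloc(&d_demod,  n_tones * buf_len * sizeof(float2));  // one stream per tone
    cudaMemcpy(d_freqs,  h_freqs,  n_tones * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_phases, h_phases, n_tones * sizeof(int), cudaMemcpyHostToDevice);

    // index_counter tracks the absolute sample count across successive buffers
    size_t index_counter = 0;
    direct_demodulator_wrapper(d_freqs, d_phases, wavetablelen, index_counter,
                               buf_len, n_tones * buf_len, d_rx, d_demod, stream);
    cudaStreamSynchronize(stream);
}
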
// Hamming window with `side` zeroed guard samples on each end, normalized to
// unit DC gain. (pi_f is the single-precision pi constant defined elsewhere
// in the project.)
template <typename T>
T* make_hamming_window(int length, int side, bool diagnostic, bool host_ret){
    T *d_win, *h_win = (T*)malloc(length * sizeof(T));
    cudaMalloc((void **)&d_win, length * sizeof(T));
    float scale = 0;
    // zero the guard samples on both sides
    for(int i = 0; i < side; i++){
        h_win[i].x = 0;
        h_win[i].y = 0;
    }
    for(int i = length - side; i < length; i++){
        h_win[i].x = 0;
        h_win[i].y = 0;
    }
    // hamming coefficients in the central region
    for(int i = 0; i < length - side; i++){
        h_win[i+side].x = (0.54 - 0.46 * cos(2.f * pi_f * i / ((length - side) - 1)));
        h_win[i+side].y = 0;
        scale += h_win[i+side].x;
    }
    // normalize to unit DC gain
    for(int i = 0; i < length; i++) h_win[i].x /= scale;
    cudaMemcpy(d_win, h_win, length * sizeof(T), cudaMemcpyHostToDevice);
    if(diagnostic){
        FILE* window_diagnostic = fopen("USRP_hamming_filter_window.dat", "wb");
        fwrite(static_cast<void*>(h_win), length, sizeof(T), window_diagnostic);
        fclose(window_diagnostic);
    }
    cudaDeviceSynchronize();
    // hand back the host copy when requested, the device copy otherwise
    if(host_ret) return h_win;
    free(h_win);
    return d_win;
}

// float2 specialization of the hamming window builder.
template <>
float2* make_hamming_window<float2>(int length, int side, bool diagnostic, bool host_ret){
    float2 *ret, *d_win, *h_win = (float2*)malloc(length * sizeof(float2));
    float scale = 0;
    // zero the guard samples on both sides
    for(int i = 0; i < side; i++){
        h_win[i].x = 0;
        h_win[i].y = 0;
    }
    for(int i = length - side; i < length; i++){
        h_win[i].x = 0;
        h_win[i].y = 0;
    }
    // hamming coefficients in the central region
    for(int i = 0; i < length - side; i++){
        h_win[i+side].x = (0.54 - 0.46 * cos(2.f * pi_f * i / ((length - side) - 1)));
        h_win[i+side].y = 0;
        scale += h_win[i+side].x;
    }
    // normalize to unit DC gain
    for(int i = 0; i < length; i++) h_win[i].x /= scale;
    if(diagnostic){
        FILE* window_diagnostic = fopen("USRP_hamming_filter_window.dat", "wb");
        fwrite(static_cast<void*>(h_win), length, sizeof(float2), window_diagnostic);
        fclose(window_diagnostic);
    }
    cudaMalloc((void **)&d_win, length * sizeof(float2));
    cudaMemcpy(d_win, h_win, length * sizeof(float2), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    // hand back the host copy when requested, the device copy otherwise
    if(host_ret){
        ret = h_win;
    }else{
        free(h_win);
        ret = d_win;
    }
    return ret;
}

// Creates a flattop window in GPU memory: unit coefficients in the center,
// `side` zeros on each end, normalized to unit DC gain.
float2* make_flat_window(int length, int side, bool diagnostic){
    float2 *d_win, *h_win = (float2*)malloc(length * sizeof(float2));
    cudaMalloc((void **)&d_win, length * sizeof(float2));
    float scale = 0;
    // zero the guard samples on both sides
    for(int i = 0; i < side; i++){
        h_win[i].x = 0;
        h_win[i].y = 0;
    }
    for(int i = length - side; i < length; i++){
        h_win[i].x = 0;
        h_win[i].y = 0;
    }
    // flat central region
    for(int i = 0; i < length - side; i++){
        h_win[i+side].x = 1.;
        h_win[i+side].y = 0.;
        scale += h_win[i+side].x;
    }
    // normalize to unit DC gain
    for(int i = 0; i < length; i++) h_win[i].x /= scale;
    cudaMemcpy(d_win, h_win, length * sizeof(float2), cudaMemcpyHostToDevice);
    if(diagnostic){
        FILE* window_diagnostic = fopen("USRP_hamming_filter_window.dat", "wb");
        fwrite(static_cast<void*>(h_win), length, sizeof(float2), window_diagnostic);
        fclose(window_diagnostic);
    }
    free(h_win);
    cudaDeviceSynchronize();
    return d_win;
}

// Windowed-sinc low-pass prototype for the polyphase filter bank; fc is the
// normalized cutoff frequency.
float2* make_sinc_window(int length, float fc, bool diagnostic = false, bool host_ret = false){
    float2 *ret, *d_win, *h_win = (float2*)malloc(length * sizeof(float2));
    float scale = 0;
    int sinc_index;
    for(int i = 0; i < length; i++){
        sinc_index = i - (length - 1) / 2;
        // sinc core; the center tap takes the limit value 2*fc
        h_win[i].x = sinc_index != 0 ?
            (2.f * fc) * sin(2.f * pi_f * fc * sinc_index) / (2.f * pi_f * fc * sinc_index) :
            (2.f * fc);
        // hamming taper on top of the sinc
        h_win[i].x *= (0.54 - 0.46 * cos(2.f * pi_f * i / (length - 1)));
        h_win[i].y = 0;
        scale += h_win[i].x;
    }
    // normalize to unit DC gain
    for(int i = 0; i < length; i++) h_win[i].x /= scale;
    if(diagnostic){
        FILE* window_diagnostic = fopen("USRP_polyphase_filter_window.dat", "wb");
        fwrite(static_cast<void*>(h_win), length, sizeof(float2), window_diagnostic);
        fclose(window_diagnostic);
    }
    cudaMalloc((void **)&d_win, length * sizeof(float2));
    cudaMemcpy(d_win, h_win, length * sizeof(float2), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    // hand back the host copy when requested, the device copy otherwise
    if(host_ret){
        ret = h_win;
    }else{
        free(h_win);
        ret = d_win;
    }
    return ret;
}

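// Hypothetical usage sketch (not in the original file): build the prototype
// low-pass for a polyphase filter bank with n_tones channels and
// average_buffer taps per channel. The cutoff fc = 1/n_tones keeps the
// passband at one FFT bin wide; the exact choice in the real code may differ.
float2* demo_make_pfb_window(int n_tones, int average_buffer){
    int ntaps = n_tones * average_buffer;   // total window length
    float fc = 1.f / n_tones;               // normalized cutoff (assumption)
    // diagnostic=true also dumps USRP_polyphase_filter_window.dat for inspection
    return make_sinc_window(ntaps, fc, true, false);
}
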
// Initialize one cuRAND state per thread slot.
__global__ void init_states(curandState *state, int twice_vector_len){
    for(int i = blockIdx.x * blockDim.x + threadIdx.x;
        i < twice_vector_len;
        i += gridDim.x * blockDim.x
    ){
        curand_init(1337, i, 0, &state[i]);
    }
}

// Fill a complex vector with uniform noise in [-scale, scale]; each sample
// consumes two generator states (one per component).
__global__ void make_rand(curandState *state, float2 *vector, int len, float scale = 1){
    for(int i = blockIdx.x * blockDim.x + threadIdx.x;
        i < len;
        i += gridDim.x * blockDim.x
    ){
        vector[i].x = scale * (2.f * curand_uniform(&state[2*i]) - 1.f);
        vector[i].y = scale * (2.f * curand_uniform(&state[2*i + 1]) - 1.f);
    }
}

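// Hypothetical pairing of init_states and make_rand (not in the original
// file): the state vector is allocated with twice the signal length because
// each complex sample draws from two states.
void demo_make_noise(float2* d_noise, int len, float scale){
    curandState *d_states;
    cudaMalloc(&d_states, 2 * len * sizeof(curandState));
    init_states<<<1024, 32>>>(d_states, 2 * len);
    make_rand<<<1024, 32>>>(d_states, d_noise, len, scale);
    cudaDeviceSynchronize();
    cudaFree(d_states);
}
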
// Floating-point remainder via round-toward-negative-infinity division.
__device__ float modulus(float number, float modulus){
    return number - __float2int_rd(number / modulus) * modulus;
}

// Wrap an absolute sample position onto the chirp period
// (num_f frequency steps of f_len samples each).
__device__ unsigned int round_index(
    unsigned int last_index,
    unsigned int offset,
    unsigned int num_f,
    unsigned int f_len){
    unsigned long int pos = last_index + offset;
    unsigned long int chirp_len = f_len * num_f;
    return (pos - ((pos / chirp_len) * chirp_len));
}

__global__ void chirp_gen(
    float2* __restrict__ output,
    unsigned int output_size,
    chirp_parameter* __restrict__ info,
    unsigned long int last_index,
    float scale = 1
){
    unsigned long int effective_index;
    unsigned long int frequency_index;
    unsigned long int phase_correction;
    unsigned int index;
    for(unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < output_size;
        offset += gridDim.x * blockDim.x
    ){
        // position inside the periodic chirp
        effective_index = (last_index + offset) % (info->num_steps * info->length);
        // which frequency step we are on
        frequency_index = effective_index / info->length;
        // accumulated quadratic phase of the stepped chirp
        unsigned long int q_phase = (frequency_index/2)*(frequency_index + 1) + (frequency_index % 2)*((frequency_index + 1)/2);
        phase_correction = ( info->chirpness * (info->length * q_phase) );
        index = (effective_index * (info->f0 + frequency_index * info->chirpness) - phase_correction);
        // the 32-bit index maps the full phase circle: 2147483647.5 = (2^32 - 1)/2
        output[offset].x = sinpi(((double)(index) / 2147483647.5)) * scale;
        output[offset].y = -cospi(((double)(index) / 2147483647.5)) * scale;
    }
}

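// The q_phase expression above is an overflow-safe integer form of the
// triangular number n*(n+1)/2 (the accumulated step count of the stepped
// chirp): one factor is always even, so dividing it first never truncates.
// A quick host-side check of the identity (illustrative; not in the original file):
bool check_q_phase_identity(){
    for(unsigned long int n = 0; n < 1000; n++){
        unsigned long int q = (n/2)*(n + 1) + (n % 2)*((n + 1)/2);
        if(q != n*(n + 1)/2) return false;
    }
    return true;
}
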
void chirp_gen_wrapper(
    float2* __restrict__ output,
    unsigned int output_size,
    chirp_parameter* __restrict__ info,
    unsigned long int last_index,
    cudaStream_t internal_stream,
    float scale = 1
){
    chirp_gen<<<1024, 32, 0, internal_stream>>>(output, output_size, info, last_index, scale);
}

__global__ void chirp_demodulator(
    float2* __restrict__ input,
    float2* __restrict__ output,
    unsigned int output_size,
    unsigned long int last_index,
    chirp_parameter* __restrict__ info
){
    unsigned long int effective_index;
    unsigned long int frequency_index;
    unsigned long int phase_correction;
    unsigned int index;
    float2 chirp;
    for(unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < output_size;
        offset += gridDim.x * blockDim.x
    ){
        effective_index = (last_index + offset) % (info->num_steps * info->length);
        frequency_index = effective_index / info->length;
        unsigned long int q_phase = (frequency_index/2)*(frequency_index + 1) + (frequency_index % 2)*((frequency_index + 1)/2);
        phase_correction = ( info->chirpness * (info->length * q_phase) );
        // regenerate the chirp sample for this position
        index = (effective_index * (info->f0 + frequency_index * info->chirpness) - phase_correction);
        chirp.x = sinpi(((double)(index) / 2147483647.5));
        chirp.y = -cospi(((double)(index) / 2147483647.5));
        // multiply the input by the conjugated chirp
        output[offset].x = chirp.x * input[offset].x + chirp.y * input[offset].y;
        output[offset].y = chirp.x * input[offset].y - chirp.y * input[offset].x;
    }
}

__device__ float absolute(float2 number){
    return sqrt(number.x*number.x + number.y*number.y);
}

void chirp_demodulator_wrapper(
    float2* __restrict__ input,
    float2* __restrict__ output,
    unsigned int output_size,
    unsigned long int last_index,
    chirp_parameter* __restrict__ info,
    cudaStream_t internal_stream
){
    chirp_demodulator<<<1024, 32, 0, internal_stream>>>(input, output, output_size, last_index, info);
}

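// Illustrative self-test (not in the original file): demodulating the chirp
// against itself should give (sin^2 + cos^2, 0) = (1, 0) in every sample,
// since chirp_demodulator multiplies by the conjugated chirp. d_info must be
// a device-resident chirp_parameter; n is any buffer length.
void demo_chirp_roundtrip(chirp_parameter* d_info, unsigned int n, cudaStream_t stream){
    float2 *d_chirp, *d_out;
    cudaMalloc(&d_chirp, n * sizeof(float2));
    cudaMalloc(&d_out,   n * sizeof(float2));
    chirp_gen_wrapper(d_chirp, n, d_info, 0, stream, 1.f);
    chirp_demodulator_wrapper(d_chirp, d_out, n, 0, d_info, stream);
    cudaStreamSynchronize(stream);   // d_out should now hold (1, 0) everywhere
    cudaFree(d_chirp);
    cudaFree(d_out);
}
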
// Copy `size` samples between (possibly offset) device buffers.
__global__ void move_buffer(
    float2* __restrict__ from,
    float2* __restrict__ to,
    int size,
    int from_offset,
    int to_offset
){
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < size;
        offset += gridDim.x * blockDim.x
    ){
        to[offset + to_offset] = from[offset + from_offset];
    }
}

void move_buffer_wrapper(float2* __restrict__ from, float2* __restrict__ to,
                         int size, int from_offset, int to_offset,
                         cudaStream_t internal_stream){
    move_buffer<<<1024, 64, 0, internal_stream>>>(from, to, size, from_offset, to_offset);
}

__global__ void polyphase_filter(
    float2* __restrict__ input,
    float2* __restrict__ output,
    filter_param* __restrict__ filter_info
){
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < filter_info->batching * filter_info->n_tones;
        offset += gridDim.x * blockDim.x
    ){
        // only accumulate where a full average_buffer of samples is available
        if(offset + filter_info->n_tones * (filter_info->average_buffer) < filter_info->batching * filter_info->n_tones){
            float2 acc;
            acc.x = 0;
            acc.y = 0;
            for(int i = 0; i < (filter_info->average_buffer); i++){
                // sample to accumulate
                int sample_index = offset + i * (filter_info->n_tones);
                // window coefficient for this tap
                int win_index = offset % filter_info->n_tones + i * filter_info->n_tones;
                acc.x += input[sample_index].x * (filter_info->window)[win_index].x;
                acc.y += input[sample_index].y * (filter_info->window)[win_index].x;
            }
            output[offset] = acc;
        }
    }
}

void polyphase_filter_wrapper(float2* __restrict__ input, float2* __restrict__ output,
                              filter_param* __restrict__ filter_info,
                              cudaStream_t internal_stream){
    polyphase_filter<<<896*2, 64, 0, internal_stream>>>(input, output, filter_info);
}

__global__ void tone_select(
    float2* __restrict__ input,
    float2* __restrict__ output,
    filter_param* __restrict__ filter_info,
    int effective_batching
){
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < effective_batching * filter_info->eff_n_tones;
        offset += gridDim.x * blockDim.x
    ){
        // pick the requested bins out of each batch of n_tones bins
        int index = (offset / filter_info->eff_n_tones) * filter_info->n_tones
                    + filter_info->tones[offset % filter_info->eff_n_tones];
        output[offset] = input[index];
    }
}

void tone_select_wrapper(float2* __restrict__ input, float2* __restrict__ output,
                         filter_param* __restrict__ filter_info,
                         int effective_batching,
                         cudaStream_t internal_stream){
    tone_select<<<1024, 64, 0, internal_stream>>>(input, output, filter_info, effective_batching);
}

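// Hypothetical end-to-end sketch of the analysis chain (not in the original
// file): polyphase filter, then an FFT stage (omitted here), then tone
// selection. d_info must be a device-resident filter_param whose window field
// points to a device window of length n_tones * average_buffer, e.g. from
// make_sinc_window(); all buffer names are illustrative.
void demo_pfb_chain(float2* d_in, float2* d_filtered, float2* d_selected,
                    filter_param* d_info, int effective_batching,
                    cudaStream_t stream){
    polyphase_filter_wrapper(d_in, d_filtered, d_info, stream);
    // ... a batched FFT of size n_tones over d_filtered would run here ...
    tone_select_wrapper(d_filtered, d_selected, d_info, effective_batching, stream);
    cudaStreamSynchronize(stream);
}
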
// In-place rescaling of a complex buffer.
__global__ void scale_buffer(
    float2* __restrict__ input,
    int input_size,
    float scale
){
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < input_size;
        offset += gridDim.x * blockDim.x
    ){
        input[offset].x = input[offset].x * scale;
        input[offset].y = input[offset].y * scale;
    }
}

// Multitone generation: build the spectrum on the host, inverse-transform it
// on the GPU, and return the time-domain buffer.
float2* tone_gen(tone_parameters* info, int sampling_rate, float scale, bool device){
    float2* base_vector = (float2*)malloc(sampling_rate * sizeof(float2));
    float2* device_base;
    cufftHandle plan;
    if (cufftPlan1d(&plan, sampling_rate, CUFFT_C2C, 1) != CUFFT_SUCCESS){
        std::cout << "CUDA ERROR IN cufftHandle" << std::endl;
    }
    cudaMalloc((void **)&device_base, sampling_rate * sizeof(float2));
    float normalization = 0;
    // start from an empty spectrum
    for(int i = 0; i < sampling_rate; i++){
        base_vector[i].x = 0;
        base_vector[i].y = 0;
    }
    // Each requested tone writes its amplitude into the real part of its
    // frequency bin `freq` and contributes to `normalization`; the imaginary
    // part stays zero (the amplitude assignment and the use of the `device`
    // flag are elided in this excerpt):
    //     base_vector[freq].x = <tone amplitude>;
    //     base_vector[freq].y = 0;
    cudaMemcpy(device_base, base_vector, sampling_rate * sizeof(float2), cudaMemcpyHostToDevice);
    if (cufftExecC2C(plan, device_base, device_base, CUFFT_INVERSE) != CUFFT_SUCCESS){
        std::cout << "CUDA ERROR: Cannot execute fft transform for tone generation." << std::endl;
    }
    if(scale > 1.) std::cout << "CUDA WARNING: Maximum amplitude of the TX buffer is > 1." << std::endl;
    if(scale != 1.) scale_buffer<<<1024, 32>>>(device_base, sampling_rate, scale);
    cudaMemcpy(base_vector, device_base, sampling_rate * sizeof(float2), cudaMemcpyDeviceToHost);
    cudaFree(device_base);
    return base_vector;
}

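// Stand-alone illustration of the recipe used by tone_gen (hypothetical, not
// in the original file): write a single unit bin into an otherwise empty
// spectrum and inverse-transform it. cuFFT's inverse is unnormalized, so a
// unit bin k becomes a unit-amplitude complex exponential at frequency
// bin_index * rate / N.
float2* demo_single_tone(int sampling_rate, int bin_index){
    float2* h_buf = (float2*)malloc(sampling_rate * sizeof(float2));
    float2* d_buf;
    cudaMalloc((void **)&d_buf, sampling_rate * sizeof(float2));
    for(int i = 0; i < sampling_rate; i++){ h_buf[i].x = 0; h_buf[i].y = 0; }
    h_buf[bin_index].x = 1.f;   // unit bin -> unit-amplitude tone
    cudaMemcpy(d_buf, h_buf, sampling_rate * sizeof(float2), cudaMemcpyHostToDevice);
    cufftHandle plan;
    cufftPlan1d(&plan, sampling_rate, CUFFT_C2C, 1);
    cufftExecC2C(plan, (cufftComplex*)d_buf, (cufftComplex*)d_buf, CUFFT_INVERSE);
    cudaMemcpy(h_buf, d_buf, sampling_rate * sizeof(float2), cudaMemcpyDeviceToHost);
    cufftDestroy(plan);
    cudaFree(d_buf);
    return h_buf;   // caller frees
}
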
// Element-wise sum of two complex buffers.
__global__ void mix_buffers(
    float2* __restrict__ buffer1,
    float2* __restrict__ buffer2,
    float2* __restrict__ output,
    int length
){
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < length;
        offset += gridDim.x * blockDim.x
    ){
        output[offset].x = buffer1[offset].x + buffer2[offset].x;
        output[offset].y = buffer1[offset].y + buffer2[offset].y;
    }
}

__global__ void average_spectra(
    float2* __restrict__ input,
    float2* __restrict__ output,
    int decim,
    int nfft,
    int input_len
){
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < input_len;
        offset += gridDim.x * blockDim.x
    ){
        // fold each group of decim consecutive spectra onto one output spectrum
        int output_offset = offset % nfft + nfft * int(offset / (nfft * decim));
        atomicAdd(&output[output_offset].x, input[offset].x);
        atomicAdd(&output[output_offset].y, input[offset].y);
    }
}

void decimate_spectra(float2* __restrict__ input, float2* __restrict__ output,
                      int decim, int nfft, int input_len, int output_len,
                      cudaStream_t stram_f){
    // accumulate groups of decim spectra, then rescale the sums to averages
    average_spectra<<<1024, 32, 0, stram_f>>>(input, output, decim, nfft, input_len);
    scale_buffer<<<1024, 32>>>(output, output_len, 1./decim);
}

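// Worked example of the folding index above (illustrative): with nfft = 4 and
// decim = 2, input offsets map to output bins as
//   offset 0..3  (spectrum 0) -> output 0..3
//   offset 4..7  (spectrum 1) -> output 0..3   (same bins, accumulated)
//   offset 8..11 (spectrum 2) -> output 4..7
// so every group of decim consecutive spectra is summed bin-by-bin into one
// output spectrum, and the final scale_buffer pass turns the sums into means.
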
__global__ void accumulate_ffts(
    float2* __restrict__ input,
    float2* __restrict__ output,
    int decim,
    int nfft,
    int output_length
){
    extern __shared__ float2 shared_buffer[];
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < output_length;
        offset += gridDim.x * blockDim.x
    ){
        int input_index;
        shared_buffer[threadIdx.x].x = 0;
        shared_buffer[threadIdx.x].y = 0;
        // accumulate decim spectra in shared memory
        for(int j = 0; j < decim; j++){
            input_index = j * (offset % nfft);
            shared_buffer[threadIdx.x].x += input[input_index].x;
            shared_buffer[threadIdx.x].y += input[input_index].y;
        }
        // write back the average
        output[offset].x = shared_buffer[threadIdx.x].x / decim;
        output[offset].y = shared_buffer[threadIdx.x].y / decim;
    }
}

// Fill a complex buffer with a constant value (typically 0 to reset
// accumulators before atomicAdd-based kernels).
__global__ void zero_mem(float2* __restrict__ input, int input_len, float value){
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < input_len;
        offset += gridDim.x * blockDim.x
    ){
        input[offset].x = value;
        input[offset].y = value;
    }
}

__device__ float magnitude(float2 sample){
    return sqrt(sample.x*sample.x + sample.y*sample.y);
}

// Print a human-readable message for a cuBLAS status code.
void _cudaGetErrorEnum(cublasStatus_t error){
    switch (error){
        case CUBLAS_STATUS_NOT_INITIALIZED:
            std::cout << "CUBLAS ERROR: CUBLAS_STATUS_NOT_INITIALIZED" << std::endl;
            break;
        case CUBLAS_STATUS_ALLOC_FAILED:
            std::cout << "CUBLAS ERROR: CUBLAS_STATUS_ALLOC_FAILED" << std::endl;
            break;
        case CUBLAS_STATUS_INVALID_VALUE:
            std::cout << "CUBLAS ERROR: CUBLAS_STATUS_INVALID_VALUE" << std::endl;
            break;
        case CUBLAS_STATUS_ARCH_MISMATCH:
            std::cout << "CUBLAS ERROR: CUBLAS_STATUS_ARCH_MISMATCH" << std::endl;
            break;
        case CUBLAS_STATUS_MAPPING_ERROR:
            std::cout << "CUBLAS ERROR: CUBLAS_STATUS_MAPPING_ERROR" << std::endl;
            break;
        case CUBLAS_STATUS_EXECUTION_FAILED:
            std::cout << "CUBLAS ERROR: CUBLAS_STATUS_EXECUTION_FAILED" << std::endl;
            break;
        case CUBLAS_STATUS_INTERNAL_ERROR:
            std::cout << "CUBLAS ERROR: CUBLAS_STATUS_INTERNAL_ERROR" << std::endl;
            break;
        default:
            break;
    }
}

void cublas_decim(
    float2* __restrict__ input,
    float2* __restrict__ output,
    float2* __restrict__ profile,
    cuComplex* __restrict__ zero,
    cuComplex* __restrict__ one,
    int ppt,
    int n_freqs,
    cublasHandle_t* __restrict__ handle
){
    cublasStatus_t err = cublasCgemv(*handle, CUBLAS_OP_T,
        /* m, n, alpha, A, lda, x, incx, beta, y, incy: elided in this excerpt */);
    _cudaGetErrorEnum(err);
}

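// One plausible way to fill the elided cublasCgemv arguments (an assumption,
// not the original call): treat the input as a column-major ppt x n_freqs
// matrix whose column j holds the ppt samples of frequency channel j. With
// CUBLAS_OP_T, y = alpha * A^T * x + beta * y reduces each column against the
// decimation profile in a single call.
void demo_cgemv_decim(float2* __restrict__ input, float2* __restrict__ output,
                      float2* __restrict__ profile, cuComplex* zero, cuComplex* one,
                      int ppt, int n_freqs, cublasHandle_t* handle){
    cublasStatus_t err = cublasCgemv(*handle, CUBLAS_OP_T,
        ppt, n_freqs,               // m, n: A is ppt x n_freqs, column-major
        one,                        // alpha = 1
        (cuComplex*)input, ppt,     // A, lda
        (cuComplex*)profile, 1,     // x = profile, length ppt
        zero,                       // beta = 0
        (cuComplex*)output, 1);     // y, length n_freqs
    if(err != CUBLAS_STATUS_SUCCESS) _cudaGetErrorEnum(err);
}
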
void decimate_pfb(float2* __restrict__ input, float2* __restrict__ output,
                  int decim, int nfft, int output_length, cudaStream_t stram_f){
    // block count assumed (not in this excerpt): one thread per output sample
    int blocks = (output_length + PFB_DECIM_TPB - 1) / PFB_DECIM_TPB;
    accumulate_ffts<<<blocks, PFB_DECIM_TPB, PFB_DECIM_TPB * sizeof(float2), stram_f>>>(
        input, output, decim, nfft, output_length);
}

void D_cublas_decim(
    double2* __restrict__ input,
    double2* __restrict__ output,
    double2* __restrict__ profile,
    cuDoubleComplex* __restrict__ zero,
    cuDoubleComplex* __restrict__ one,
    int ppt,
    int n_freqs,
    cublasHandle_t* __restrict__ handle
){
    cublasStatus_t err = cublasZgemv(*handle, CUBLAS_OP_T,
        /* m, n, alpha, A, lda, x, incx, beta, y, incy: elided in this excerpt */);
    _cudaGetErrorEnum(err);
}

// Narrowing copy: double-precision complex to single-precision complex.
__global__ void double2float(
    double2* __restrict__ input,
    float2* __restrict__ output,
    int length
){
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < length;
        offset += gridDim.x * blockDim.x
    ){
        output[offset].x = input[offset].x;
        output[offset].y = input[offset].y;
    }
}

// Widening copy: single-precision complex to double-precision complex.
__global__ void float2double(
    float2* __restrict__ input,
    double2* __restrict__ output,
    int length
){
    for(int offset = blockIdx.x * blockDim.x + threadIdx.x;
        offset < length;
        offset += gridDim.x * blockDim.x
    ){
        output[offset].x = input[offset].x;
        output[offset].y = input[offset].y;
    }
}

Function index (from the generated documentation):

__device__ unsigned int round_index(unsigned int last_index, unsigned int offset, unsigned int num_f, unsigned int f_len)
__global__ void scale_buffer(float2 *__restrict__ input, int input_size, float scale)
__global__ void float2double(float2 *__restrict__ input, double2 *__restrict__ output, int length)
__global__ void make_rand(curandState *state, float2 *vector, int len, float scale=1)
void chirp_demodulator_wrapper(float2 *__restrict__ input, float2 *__restrict__ output, unsigned int output_size, unsigned long int last_index, chirp_parameter *__restrict__ info, cudaStream_t internal_stream)
void D_cublas_decim(double2 *__restrict__ input, double2 *__restrict__ output, double2 *__restrict__ profile, cuDoubleComplex *__restrict__ zero, cuDoubleComplex *__restrict__ one, int ppt, int n_freqs, cublasHandle_t *__restrict__ handle)
void decimate_pfb(float2 *__restrict__ input, float2 *__restrict__ output, int decim, int nfft, int output_length, cudaStream_t stram_f)
void tone_select_wrapper(float2 *__restrict__ input, float2 *__restrict__ output, filter_param *__restrict__ filter_info, int effective_batching, cudaStream_t internal_stream)
struct tone_parameters
    Descriptor of the multitone generation.
__global__ void direct_demodulator_fp64(double *__restrict tone_frequencies, size_t index_counter, uint single_tone_length, size_t total_length, float2 *__restrict input, float2 *__restrict output)
    Direct demodulation kernel. This kernel takes the raw input from the SDR and separates channels...
__global__ void double2float(double2 *__restrict__ input, float2 *__restrict__ output, int length)
__device__ float modulus(float number, float modulus)
float2 *make_flat_window(int length, int side, bool diagnostic)
    Creates a flattop window in the GPU memory.
__global__ void direct_demodulator_integer(int *__restrict tone_frequencies, int *__restrict tone_phases, int wavetablelen, size_t index_counter, size_t single_tone_length, size_t total_length, float2 *__restrict input, float2 *__restrict output)
    Integer version of the direct demodulation kernel (numerically more stable). This kernel takes the ra...
__global__ void tone_select(float2 *__restrict__ input, float2 *__restrict__ output, filter_param *__restrict__ filter_info, int effective_batching)
float2 *make_sinc_window(int length, float fc, bool diagnostic=false, bool host_ret=false)
__global__ void zero_mem(float2 *__restrict__ input, int input_len, float value)
float2 *tone_gen(tone_parameters *info, int sampling_rate, float scale, bool device)
__global__ void DIRECT_decimator(uint single_tone_length, size_t total_length, float2 *__restrict input, float2 *__restrict output)
__global__ void accumulate_ffts(float2 *__restrict__ input, float2 *__restrict__ output, int decim, int nfft, int output_length)
#define PFB_DECIM_TPB
    Tunes the threads per block used in certain functions.
void move_buffer_wrapper(float2 *__restrict__ from, float2 *__restrict__ to, int size, int from_offset, int to_offset, cudaStream_t internal_stream)
void polyphase_filter_wrapper(float2 *__restrict__ input, float2 *__restrict__ output, filter_param *__restrict__ filter_info, cudaStream_t internal_stream)
T *make_hamming_window(int length, int side, bool diagnostic, bool host_ret)
void print_chirp_params(std::string comment, chirp_parameter cp)
__global__ void move_buffer(float2 *__restrict__ from, float2 *__restrict__ to, int size, int from_offset, int to_offset)
void chirp_gen_wrapper(float2 *__restrict__ output, unsigned int output_size, chirp_parameter *__restrict__ info, unsigned long int last_index, cudaStream_t internal_stream, float scale=1)
__device__ float magnitude(float2 sample)
__global__ void init_states(curandState *state, int twice_vector_len)
float2 *make_hamming_window<float2>(int length, int side, bool diagnostic, bool host_ret)
__global__ void chirp_demodulator(float2 *__restrict__ input, float2 *__restrict__ output, unsigned int output_size, unsigned long int last_index, chirp_parameter *__restrict__ info)
__global__ void mix_buffers(float2 *__restrict__ buffer1, float2 *__restrict__ buffer2, float2 *__restrict__ output, int length)
__global__ void chirp_gen(float2 *__restrict__ output, unsigned int output_size, chirp_parameter *__restrict__ info, unsigned long int last_index, float scale=1)
void direct_demodulator_wrapper(int *__restrict tone_frequencies, int *__restrict tone_phases, int wavetablelen, size_t index_counter, size_t single_tone_length, size_t total_length, float2 *__restrict input, float2 *__restrict output, cudaStream_t internal_stream)
__global__ void average_spectra(float2 *__restrict__ input, float2 *__restrict__ output, int decim, int nfft, int input_len)
__global__ void polyphase_filter(float2 *__restrict__ input, float2 *__restrict__ output, filter_param *__restrict__ filter_info)
__device__ float absolute(float2 number)
void decimate_spectra(float2 *__restrict__ input, float2 *__restrict__ output, int decim, int nfft, int input_len, int output_len, cudaStream_t stram_f)
void cublas_decim(float2 *__restrict__ input, float2 *__restrict__ output, float2 *__restrict__ profile, cuComplex *__restrict__ zero, cuComplex *__restrict__ one, int ppt, int n_freqs, cublasHandle_t *__restrict__ handle)